fix: fix ingestors import error, add color to log output

Sosokker 2025-05-12 22:42:49 +07:00
parent 701c2b0ae7
commit b7aa490316
4 changed files with 105 additions and 304 deletions

View File

@@ -19,7 +19,7 @@ class AppSettings(BaseSettings):
     # Application settings
     APP_NAME: str = "PipelineRunnerApp"
-    LOG_LEVEL: str = "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING)
+    LOG_LEVEL: str = "DEBUG"  # Logging level (e.g., DEBUG, INFO, WARNING)
     LOG_ENABLE_SSE: bool = True  # Flag to enable/disable SSE log streaming sink
 
     # Store configuration
@@ -56,7 +56,8 @@ logger.remove()
 logger.add(
     sys.stderr,
     level=settings.LOG_LEVEL.upper(),
-    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    # format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    colorize=True,
 )
 
 # File Sink
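
Note on the colorize change above: with the explicit format= commented out, Loguru falls back to its built-in colored format, and colorize=True forces ANSI colors even when the sink is not detected as a terminal. A minimal standalone sketch of the resulting behavior (the settings object is omitted and values are hardcoded purely for illustration):

import sys
from loguru import logger

logger.remove()  # drop Loguru's default handler before adding the configured sink
logger.add(
    sys.stderr,
    level="DEBUG",
    colorize=True,  # force ANSI colors; with no format= Loguru uses its built-in colored format
)
logger.debug("colored debug output")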

View File

@@ -1,4 +1,8 @@
-from ingestors import IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
+from ingestion.ingestors import (
+    IngestionMethod,
+    SimpleIngestionStrategy,
+    MLIngestionStrategy,
+)
 
 from models.ingestion import IngestSourceConfig, OutputData
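
The hunk above qualifies the import with its package. Assuming a layout like the following (hypothetical, not shown in this diff), the old bare import fails with ModuleNotFoundError when the application is launched from the project root, because ingestors is not a top-level module:

# Assumed layout (illustrative only):
#   ingestion/
#       __init__.py
#       ingestors.py   # defines IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
#   ingestion_service.py
#
# from ingestors import IngestionMethod  # old form: ModuleNotFoundError
from ingestion.ingestors import (  # new form: resolves via the package
    IngestionMethod,
    SimpleIngestionStrategy,
    MLIngestionStrategy,
)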

View File

@@ -1,186 +0,0 @@
"""
Pydantic models for pipelines and runs.
"""
from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl, field_validator, ValidationInfo
class RunCreate(BaseModel):
"""
Model for creating a new run. (Empty)
"""
pass
class Run(BaseModel):
"""
Status of a pipeline run.
"""
id: UUID
pipeline_id: UUID
status: Literal["PENDING", "RUNNING", "COMPLETED", "FAILED"]
started_at: datetime
finished_at: Optional[datetime] = None
class RunResult(Run):
"""
Extended run model including results or error.
"""
results: Optional[List[Dict[str, Any]]] = None
error: Optional[str] = None
class ApiConfig(BaseModel):
"""
Configuration for an API source.
"""
url: HttpUrl = Field(..., description="API endpoint URL")
token: Optional[str] = Field(
None,
description="Optional bearer token for API authentication",
)
class ScrapeConfig(BaseModel):
"""
Configuration for a web-scraping source.
"""
urls: List[HttpUrl] = Field(
...,
description="List of URLs to scrape",
)
schema_file: Optional[str] = Field(
None,
description="Path to a JSON file containing CSS extraction schema",
)
prompt: Optional[str] = Field(
None,
description="Prompt string for LLM-based extraction",
)
class FileConfig(BaseModel):
"""
Configuration for a file-based source. Supports either a file path or an uploaded file.
"""
path: Optional[str] = Field(
None,
description="Path to the input file (optional if upload is provided)",
)
upload: Optional[Any] = Field(
None,
description="Uploaded file object or metadata (optional if path is provided)",
)
upload_filename: Optional[str] = Field(
None,
description="Original filename of the uploaded file (for validation)",
)
format: Literal["csv", "json", "sqlite"] = Field(
"json", description="Format of the file"
)
@field_validator("path", mode="before")
def require_path_or_upload(cls, v, info: ValidationInfo):
data = info.data
if not v and not data.get("upload"):
raise ValueError("Either 'path' or 'upload' must be provided.")
return v
@field_validator("upload_filename", mode="before")
def filename_extension_matches_format(cls, v, info: ValidationInfo):
fmt = info.data.get("format")
if v and fmt and not v.lower().endswith(f".{fmt}"):
raise ValueError(f"Uploaded file extension must match format '{fmt}'")
return v
@field_validator("path", mode="after")
def path_or_upload_extension_matches_format(cls, v, info: ValidationInfo):
fmt = info.data.get("format")
upload_filename = info.data.get("upload_filename")
if v and fmt and not v.lower().endswith(f".{fmt}"):
raise ValueError(f"File extension must match format '{fmt}'")
if upload_filename and fmt and not upload_filename.lower().endswith(f".{fmt}"):
raise ValueError(f"Uploaded file extension must match format '{fmt}'")
return v
class ApiSource(BaseModel):
"""
An API-based data source.
"""
type: Literal["api"] = Field(
"api",
description="Discriminator for API source", # Removed const=True
)
config: ApiConfig
class ScrapeSource(BaseModel):
"""
A web-scraping data source.
"""
type: Literal["scrape"] = Field(
"scrape",
description="Discriminator for scrape source", # Removed const=True
)
config: ScrapeConfig
class FileSource(BaseModel):
"""
A file-based data source.
"""
type: Literal["file"] = Field(
"file",
description="Discriminator for file source", # Removed const=True
)
config: FileConfig
Source = Annotated[
Union[ApiSource, ScrapeSource, FileSource],
Field(discriminator="type", description="Union of all source types"),
]
class PipelineCreate(BaseModel):
"""
Payload for creating a new pipeline.
"""
name: Optional[str] = Field(
default=None,
description="Optional human-readable name for the pipeline",
)
sources: List[Source] = Field(
..., description="List of data sources for this pipeline"
)
class Pipeline(BaseModel):
"""
Representation of a pipeline.
"""
id: UUID = Field(..., description="Unique identifier for the pipeline")
name: Optional[str] = Field(
None, description="Optional human-readable name for the pipeline"
)
sources: List[Source] = Field(..., description="List of configured data sources")
created_at: datetime = Field(
..., description="UTC timestamp when the pipeline was created"
)

View File

@@ -8,6 +8,8 @@ from uuid import UUID, uuid4
 from typing import Optional, List, TYPE_CHECKING
 
 from loguru import logger
+
+from ingestion import Ingestor
 from models.pipeline import (
     Pipeline,
     PipelineCreate,
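
This hunk promotes Ingestor to a module-level import; the last hunk below shows the old code importing it lazily inside _execute_ingestion with an "Avoid circular import" note. If that cycle ever resurfaces, a deferred import is the usual fallback. A hedged sketch, with the function name and signature chosen only for illustration:

def execute_ingestion(config):
    # Deferred import: resolving Ingestor only at call time avoids a
    # module-level import cycle between this module and the ingestion package.
    from ingestion import Ingestor

    return Ingestor.run(config.sources)
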
@@ -240,142 +242,122 @@ class PipelineService:
     async def run_pipeline(self, pipeline_id: UUID) -> None:
         """
         Executes the pipeline logic, updating status and run times.
-        This is called by the scheduler job or manual trigger.
+        Logs associated with this run will include the pipeline_id.
         """
-        logger.info(f"Attempting run execution for pipeline: id={pipeline_id}")
-        pipeline = await self.store.get(pipeline_id)
-
-        if not pipeline:
-            logger.error(f"Cannot run pipeline: Pipeline not found (id={pipeline_id})")
-            return
-
-        # NOTE: lock mechanism
-        if pipeline.status == PipelineStatus.ACTIVE:
-            logger.warning(
-                f"Pipeline id={pipeline_id} is already ACTIVE. Skipping run."
-            )
-            return
-
-        # --- Mark as ACTIVE ---
-        try:
-            pipeline.status = PipelineStatus.ACTIVE
-            pipeline.updated_at = datetime.now(UTC)  # Update timestamp
-            await self.store.save(pipeline)
-            logger.info(f"Pipeline {pipeline_id} marked as ACTIVE.")
-        except Exception as e:
-            logger.error(
-                f"Failed to mark pipeline {pipeline_id} as ACTIVE: {e}. Aborting run.",
-                exc_info=True,
-            )
-            # Attempt to restore status? Depends on store guarantees.
-            # pipeline.status = original_status  # Potentially try rollback
-            return  # Abort run
-
-        # --- Execute Pipeline Logic ---
-        run_successful = False
-        try:
-            logger.info(f"Executing core logic for pipeline id={pipeline_id}...")
-            # ---------------------------------------------------
-            # Ensure _execute_ingestion is awaited if it's async
-            await self._execute_ingestion(pipeline.config.ingestor_config)
-            # ---------------------------------------------------
-            logger.info(f"Core logic finished successfully for id={pipeline_id}.")
-            run_successful = True
-        except Exception as e:
-            logger.error(
-                f"Core logic failed during pipeline run id={pipeline_id}: {e}",
-                exc_info=True,
-            )
-            # run_successful remains False
-
-        # --- Update Final State ---
-        try:
-            # Fetch the latest state again to minimize race conditions, though the ACTIVE lock helps
-            final_pipeline_state = await self.store.get(pipeline_id)
-            if not final_pipeline_state:
-                logger.warning(
-                    f"Pipeline {pipeline_id} disappeared during run. Cannot update final state."
-                )
-                # The pipeline might have been deleted externally while running.
-                # Scheduler might need cleanup if the job still exists.
-                if self.scheduler_manager:
-                    logger.warning(
-                        f"Attempting to unschedule potentially orphaned job for {pipeline_id}"
-                    )
-                    asyncio.create_task(
-                        self.scheduler_manager.unschedule_pipeline(pipeline_id)
-                    )
-                return
-
-            # Avoid modifying the object fetched directly if store uses caching/references
-            final_pipeline_state = final_pipeline_state.model_copy(deep=True)
-
-            now = datetime.now(UTC)
-            final_pipeline_state.status = (
-                PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
-            )
-
-            if run_successful:
-                final_pipeline_state.config.last_run = (
-                    now  # Mark completion time on success
-                )
-
-            # Calculate and store the *next* run time based on the outcome
-            # Use the *updated* last_run if the run was successful
-            current_last_run = (
-                final_pipeline_state.config.last_run
-            )  # This is 'now' if successful, else original last_run
-            final_pipeline_state.config.next_run = calculate_next_run(
-                frequency=final_pipeline_state.config.run_frequency,
-                last_run=current_last_run,  # Use the relevant last_run for calculation
-                start_reference_time=now,  # Use current time as reference for calculation
-            )
-
-            final_pipeline_state.updated_at = (
-                now  # Update timestamp for this final save
-            )
-
-            await self.store.save(final_pipeline_state)
-            logger.info(
-                f"Pipeline {pipeline_id} run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
-            )
-
-            # Notify scheduler about the *new* next run time so it can reschedule accurately
-            if self.scheduler_manager:
-                logger.debug(
-                    f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion with next run {final_pipeline_state.config.next_run}."
-                )
-                asyncio.create_task(
-                    self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
-                )
-
-        except Exception as e:
-            logger.error(
-                f"Failed to update pipeline {pipeline_id} state after run execution: {e}",
-                exc_info=True,
-            )
-            # The pipeline might be left in ACTIVE or an inconsistent state.
-            # Consider adding monitoring or retry logic here.
+        # Use contextualize to tag logs originating from this specific run
+        with logger.contextualize(
+            pipeline_id=str(pipeline_id)
+        ):  # Ensure it's a string for context
+            logger.info(
+                "Attempting run execution for pipeline"
+            )  # Log context takes effect here
+            pipeline = await self.store.get(pipeline_id)
+
+            if not pipeline:
+                logger.error("Cannot run pipeline: Pipeline not found")
+                return
+
+            if pipeline.status == PipelineStatus.ACTIVE:
+                logger.warning("Pipeline is already ACTIVE. Skipping run.")
+                return
+
+            # --- Mark as ACTIVE ---
+            # original_status = pipeline.status  # Store original status for potential rollback
+            try:
+                pipeline.status = PipelineStatus.ACTIVE
+                pipeline.updated_at = datetime.now(UTC)
+                await self.store.save(pipeline)
+                logger.info("Pipeline marked as ACTIVE.")
+            except Exception as e:
+                logger.error(
+                    f"Failed to mark pipeline as ACTIVE: {e}. Aborting run.",
+                    exc_info=True,
+                )
+                # Attempt to restore status? Requires careful thought on atomicity
+                # pipeline.status = original_status
+                # await self.store.save(pipeline)  # Potential race condition/overwrite here
+                return
+
+            # --- Execute Pipeline Logic ---
+            run_successful = False
+            try:
+                logger.info("Executing core logic...")
+                # This call and anything within it will inherit the pipeline_id context
+                await self._execute_ingestion(pipeline.config.ingestor_config)
+                logger.info("Core logic finished successfully.")
+                run_successful = True
+            except Exception as e:
+                logger.error(
+                    f"Core logic failed during pipeline run: {e}", exc_info=True
+                )
+                # run_successful remains False
+
+            # --- Update Final State ---
+            try:
+                # Fetch latest state again (important if external changes possible)
+                final_pipeline_state = await self.store.get(pipeline_id)
+                if not final_pipeline_state:
+                    logger.warning(
+                        "Pipeline disappeared during run. Cannot update final state."
+                    )
+                    # Handle potential deletion during run (e.g., unschedule if needed)
+                    if self.scheduler_manager:
+                        logger.warning(
+                            "Attempting to unschedule potentially orphaned job"
+                        )
+                        asyncio.create_task(
+                            self.scheduler_manager.unschedule_pipeline(pipeline_id)
+                        )
+                    return
+
+                final_pipeline_state = final_pipeline_state.model_copy(deep=True)
+
+                now = datetime.now(UTC)
+                final_pipeline_state.status = (
+                    PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
+                )
+
+                if run_successful:
+                    final_pipeline_state.config.last_run = now
+
+                current_last_run = final_pipeline_state.config.last_run
+                final_pipeline_state.config.next_run = calculate_next_run(
+                    frequency=final_pipeline_state.config.run_frequency,
+                    last_run=current_last_run,
+                    start_reference_time=now,
+                )
+
+                final_pipeline_state.updated_at = now
+
+                await self.store.save(final_pipeline_state)
+                logger.info(
+                    f"Pipeline run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
+                )
+
+                if self.scheduler_manager:
+                    logger.debug(
+                        "Notifying scheduler to reschedule pipeline after run completion"
+                    )
+                    asyncio.create_task(
+                        self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to update pipeline state after run execution: {e}",
+                    exc_info=True,
+                )
+                # Pipeline might be left ACTIVE or FAILED state might not be saved. Needs monitoring.
 
     async def _execute_ingestion(self, config: IngestorInput):
         """
         Executes the ingestion process for a pipeline using the provided IngestorInput config.
         Returns the ingestion results or raises an exception on failure.
         """
+        # Ensure Ingestor is imported locally or globally if needed
+        # from ingestion.core import Ingestor  # Example import if needed
+        # Check if Ingestor is already available (e.g., imported at module level)
+        # If not, uncomment the import above or ensure it's accessible.
+        # Assuming Ingestor is available in the scope:
         try:
-            # Avoid circular import
-            from ingestion.core import Ingestor
+            # from ..ingestion import Ingestor
             logger.info(f"Executing ingestion with config: {config}")
-            # NOTE: Can be async
             results = Ingestor.run(config.sources)
-            logger.info(f"Ingestion completed successfully. Results: {results}")
+            logger.info(
+                f"Ingestion completed successfully. Results count: {len(results.records)}"
+            )
             return results
         except ImportError:
             logger.error("Failed to import Ingestor. Cannot execute ingestion.")