fix: fix ingestors import error, add color to log output

Sosokker 2025-05-12 22:42:49 +07:00
parent 701c2b0ae7
commit b7aa490316
4 changed files with 105 additions and 304 deletions

View File

@@ -19,7 +19,7 @@ class AppSettings(BaseSettings):
     # Application settings
     APP_NAME: str = "PipelineRunnerApp"
-    LOG_LEVEL: str = "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING)
+    LOG_LEVEL: str = "DEBUG"  # Logging level (e.g., DEBUG, INFO, WARNING)
     LOG_ENABLE_SSE: bool = True  # Flag to enable/disable SSE log streaming sink
 
     # Store configuration
@@ -56,7 +56,8 @@ logger.remove()
 logger.add(
     sys.stderr,
     level=settings.LOG_LEVEL.upper(),
-    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    # format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    colorize=True,
 )
 
 # File Sink
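
Note on the colorize change above: with the explicit format= commented out, Loguru falls back to its built-in colored format, and colorize=True forces ANSI colors even when the sink is not detected as a terminal. A minimal standalone sketch of the resulting behavior (the settings object is omitted and values are hardcoded purely for illustration):

import sys
from loguru import logger

logger.remove()  # drop Loguru's default handler before adding the configured sink
logger.add(
    sys.stderr,
    level="DEBUG",
    colorize=True,  # force ANSI colors; with no format= Loguru uses its built-in colored format
)
logger.debug("colored debug output")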

View File

@@ -1,4 +1,8 @@
-from ingestors import IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
+from ingestion.ingestors import (
+    IngestionMethod,
+    SimpleIngestionStrategy,
+    MLIngestionStrategy,
+)
 
 from models.ingestion import IngestSourceConfig, OutputData
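
The hunk above qualifies the import with its package. Assuming a layout like the following (hypothetical, not shown in this diff), the old bare import fails with ModuleNotFoundError when the application is launched from the project root, because ingestors is not a top-level module:

# Assumed layout (illustrative only):
#   ingestion/
#       __init__.py
#       ingestors.py   # defines IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
#   ingestion_service.py
#
# from ingestors import IngestionMethod  # old form: ModuleNotFoundError
from ingestion.ingestors import (  # new form: resolves via the package
    IngestionMethod,
    SimpleIngestionStrategy,
    MLIngestionStrategy,
)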

View File

@@ -1,186 +0,0 @@
"""
Pydantic models for pipelines and runs.
"""
from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl, field_validator, ValidationInfo
class RunCreate(BaseModel):
"""
Model for creating a new run. (Empty)
"""
pass
class Run(BaseModel):
"""
Status of a pipeline run.
"""
id: UUID
pipeline_id: UUID
status: Literal["PENDING", "RUNNING", "COMPLETED", "FAILED"]
started_at: datetime
finished_at: Optional[datetime] = None
class RunResult(Run):
"""
Extended run model including results or error.
"""
results: Optional[List[Dict[str, Any]]] = None
error: Optional[str] = None
class ApiConfig(BaseModel):
"""
Configuration for an API source.
"""
url: HttpUrl = Field(..., description="API endpoint URL")
token: Optional[str] = Field(
None,
description="Optional bearer token for API authentication",
)
class ScrapeConfig(BaseModel):
"""
Configuration for a web-scraping source.
"""
urls: List[HttpUrl] = Field(
...,
description="List of URLs to scrape",
)
schema_file: Optional[str] = Field(
None,
description="Path to a JSON file containing CSS extraction schema",
)
prompt: Optional[str] = Field(
None,
description="Prompt string for LLM-based extraction",
)
class FileConfig(BaseModel):
"""
Configuration for a file-based source. Supports either a file path or an uploaded file.
"""
path: Optional[str] = Field(
None,
description="Path to the input file (optional if upload is provided)",
)
upload: Optional[Any] = Field(
None,
description="Uploaded file object or metadata (optional if path is provided)",
)
upload_filename: Optional[str] = Field(
None,
description="Original filename of the uploaded file (for validation)",
)
format: Literal["csv", "json", "sqlite"] = Field(
"json", description="Format of the file"
)
@field_validator("path", mode="before")
def require_path_or_upload(cls, v, info: ValidationInfo):
data = info.data
if not v and not data.get("upload"):
raise ValueError("Either 'path' or 'upload' must be provided.")
return v
@field_validator("upload_filename", mode="before")
def filename_extension_matches_format(cls, v, info: ValidationInfo):
fmt = info.data.get("format")
if v and fmt and not v.lower().endswith(f".{fmt}"):
raise ValueError(f"Uploaded file extension must match format '{fmt}'")
return v
@field_validator("path", mode="after")
def path_or_upload_extension_matches_format(cls, v, info: ValidationInfo):
fmt = info.data.get("format")
upload_filename = info.data.get("upload_filename")
if v and fmt and not v.lower().endswith(f".{fmt}"):
raise ValueError(f"File extension must match format '{fmt}'")
if upload_filename and fmt and not upload_filename.lower().endswith(f".{fmt}"):
raise ValueError(f"Uploaded file extension must match format '{fmt}'")
return v
class ApiSource(BaseModel):
"""
An API-based data source.
"""
type: Literal["api"] = Field(
"api",
description="Discriminator for API source", # Removed const=True
)
config: ApiConfig
class ScrapeSource(BaseModel):
"""
A web-scraping data source.
"""
type: Literal["scrape"] = Field(
"scrape",
description="Discriminator for scrape source", # Removed const=True
)
config: ScrapeConfig
class FileSource(BaseModel):
"""
A file-based data source.
"""
type: Literal["file"] = Field(
"file",
description="Discriminator for file source", # Removed const=True
)
config: FileConfig
Source = Annotated[
Union[ApiSource, ScrapeSource, FileSource],
Field(discriminator="type", description="Union of all source types"),
]
class PipelineCreate(BaseModel):
"""
Payload for creating a new pipeline.
"""
name: Optional[str] = Field(
default=None,
description="Optional human-readable name for the pipeline",
)
sources: List[Source] = Field(
..., description="List of data sources for this pipeline"
)
class Pipeline(BaseModel):
"""
Representation of a pipeline.
"""
id: UUID = Field(..., description="Unique identifier for the pipeline")
name: Optional[str] = Field(
None, description="Optional human-readable name for the pipeline"
)
sources: List[Source] = Field(..., description="List of configured data sources")
created_at: datetime = Field(
..., description="UTC timestamp when the pipeline was created"
)

View File

@@ -8,6 +8,8 @@ from uuid import UUID, uuid4
 from typing import Optional, List, TYPE_CHECKING
 
 from loguru import logger
+
+from ingestion import Ingestor
 from models.pipeline import (
     Pipeline,
     PipelineCreate,
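
This hunk promotes Ingestor to a module-level import; the last hunk below shows the old code importing it lazily inside _execute_ingestion with an "Avoid circular import" note. If that cycle ever resurfaces, a deferred import is the usual fallback. A hedged sketch, with the function name and signature chosen only for illustration:

def execute_ingestion(config):
    # Deferred import: resolving Ingestor only at call time avoids a
    # module-level import cycle between this module and the ingestion package.
    from ingestion import Ingestor

    return Ingestor.run(config.sources)
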
@@ -240,142 +242,122 @@ class PipelineService:
     async def run_pipeline(self, pipeline_id: UUID) -> None:
         """
         Executes the pipeline logic, updating status and run times.
-        This is called by the scheduler job or manual trigger.
+        Logs associated with this run will include the pipeline_id.
         """
-        logger.info(f"Attempting run execution for pipeline: id={pipeline_id}")
-        pipeline = await self.store.get(pipeline_id)
-
-        if not pipeline:
-            logger.error(f"Cannot run pipeline: Pipeline not found (id={pipeline_id})")
-            return
-
-        # NOTE: lock mechanism
-        if pipeline.status == PipelineStatus.ACTIVE:
-            logger.warning(
-                f"Pipeline id={pipeline_id} is already ACTIVE. Skipping run."
-            )
-            return
-
-        # --- Mark as ACTIVE ---
-        try:
-            pipeline.status = PipelineStatus.ACTIVE
-            pipeline.updated_at = datetime.now(UTC)  # Update timestamp
-            await self.store.save(pipeline)
-            logger.info(f"Pipeline {pipeline_id} marked as ACTIVE.")
-        except Exception as e:
-            logger.error(
-                f"Failed to mark pipeline {pipeline_id} as ACTIVE: {e}. Aborting run.",
-                exc_info=True,
-            )
-            # Attempt to restore status? Depends on store guarantees.
-            # pipeline.status = original_status  # Potentially try rollback
-            return  # Abort run
-
-        # --- Execute Pipeline Logic ---
-        run_successful = False
-        try:
-            logger.info(f"Executing core logic for pipeline id={pipeline_id}...")
-            # ---------------------------------------------------
-            # Ensure _execute_ingestion is awaited if it's async
-            await self._execute_ingestion(pipeline.config.ingestor_config)
-            # ---------------------------------------------------
-            logger.info(f"Core logic finished successfully for id={pipeline_id}.")
-            run_successful = True
-        except Exception as e:
-            logger.error(
-                f"Core logic failed during pipeline run id={pipeline_id}: {e}",
-                exc_info=True,
-            )
-            # run_successful remains False
-
-        # --- Update Final State ---
-        try:
-            # Fetch the latest state again to minimize race conditions, though the ACTIVE lock helps
-            final_pipeline_state = await self.store.get(pipeline_id)
-            if not final_pipeline_state:
-                logger.warning(
-                    f"Pipeline {pipeline_id} disappeared during run. Cannot update final state."
-                )
-                # The pipeline might have been deleted externally while running.
-                # Scheduler might need cleanup if the job still exists.
-                if self.scheduler_manager:
-                    logger.warning(
-                        f"Attempting to unschedule potentially orphaned job for {pipeline_id}"
-                    )
-                    asyncio.create_task(
-                        self.scheduler_manager.unschedule_pipeline(pipeline_id)
-                    )
-                return
-
-            # Avoid modifying the object fetched directly if store uses caching/references
-            final_pipeline_state = final_pipeline_state.model_copy(deep=True)
-
-            now = datetime.now(UTC)
-            final_pipeline_state.status = (
-                PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
-            )
-
-            if run_successful:
-                final_pipeline_state.config.last_run = (
-                    now  # Mark completion time on success
-                )
-
-            # Calculate and store the *next* run time based on the outcome
-            # Use the *updated* last_run if the run was successful
-            current_last_run = (
-                final_pipeline_state.config.last_run
-            )  # This is 'now' if successful, else original last_run
-            final_pipeline_state.config.next_run = calculate_next_run(
-                frequency=final_pipeline_state.config.run_frequency,
-                last_run=current_last_run,  # Use the relevant last_run for calculation
-                start_reference_time=now,  # Use current time as reference for calculation
-            )
-
-            final_pipeline_state.updated_at = (
-                now  # Update timestamp for this final save
-            )
-
-            await self.store.save(final_pipeline_state)
-            logger.info(
-                f"Pipeline {pipeline_id} run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
-            )
-
-            # Notify scheduler about the *new* next run time so it can reschedule accurately
-            if self.scheduler_manager:
-                logger.debug(
-                    f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion with next run {final_pipeline_state.config.next_run}."
-                )
-                asyncio.create_task(
-                    self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
-                )
-
-        except Exception as e:
-            logger.error(
-                f"Failed to update pipeline {pipeline_id} state after run execution: {e}",
-                exc_info=True,
-            )
-            # The pipeline might be left in ACTIVE or an inconsistent state.
-            # Consider adding monitoring or retry logic here.
+        # Use contextualize to tag logs originating from this specific run
+        with logger.contextualize(
+            pipeline_id=str(pipeline_id)
+        ):  # Ensure it's a string for context
+            logger.info(
+                "Attempting run execution for pipeline"
+            )  # Log context takes effect here
+            pipeline = await self.store.get(pipeline_id)
+
+            if not pipeline:
+                logger.error("Cannot run pipeline: Pipeline not found")
+                return
+
+            if pipeline.status == PipelineStatus.ACTIVE:
+                logger.warning("Pipeline is already ACTIVE. Skipping run.")
+                return
+
+            # --- Mark as ACTIVE ---
+            # original_status = pipeline.status  # Store original status for potential rollback
+            try:
+                pipeline.status = PipelineStatus.ACTIVE
+                pipeline.updated_at = datetime.now(UTC)
+                await self.store.save(pipeline)
+                logger.info("Pipeline marked as ACTIVE.")
+            except Exception as e:
+                logger.error(
+                    f"Failed to mark pipeline as ACTIVE: {e}. Aborting run.",
+                    exc_info=True,
+                )
+                # Attempt to restore status? Requires careful thought on atomicity
+                # pipeline.status = original_status
+                # await self.store.save(pipeline)  # Potential race condition/overwrite here
+                return
+
+            # --- Execute Pipeline Logic ---
+            run_successful = False
+            try:
+                logger.info("Executing core logic...")
+                # This call and anything within it will inherit the pipeline_id context
+                await self._execute_ingestion(pipeline.config.ingestor_config)
+                logger.info("Core logic finished successfully.")
+                run_successful = True
+            except Exception as e:
+                logger.error(
+                    f"Core logic failed during pipeline run: {e}", exc_info=True
+                )
+                # run_successful remains False
+
+            # --- Update Final State ---
+            try:
+                # Fetch latest state again (important if external changes possible)
+                final_pipeline_state = await self.store.get(pipeline_id)
+                if not final_pipeline_state:
+                    logger.warning(
+                        "Pipeline disappeared during run. Cannot update final state."
+                    )
+                    # Handle potential deletion during run (e.g., unschedule if needed)
+                    if self.scheduler_manager:
+                        logger.warning(
+                            "Attempting to unschedule potentially orphaned job"
+                        )
+                        asyncio.create_task(
+                            self.scheduler_manager.unschedule_pipeline(pipeline_id)
+                        )
+                    return
+
+                final_pipeline_state = final_pipeline_state.model_copy(deep=True)
+
+                now = datetime.now(UTC)
+                final_pipeline_state.status = (
+                    PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
+                )
+
+                if run_successful:
+                    final_pipeline_state.config.last_run = now
+
+                current_last_run = final_pipeline_state.config.last_run
+                final_pipeline_state.config.next_run = calculate_next_run(
+                    frequency=final_pipeline_state.config.run_frequency,
+                    last_run=current_last_run,
+                    start_reference_time=now,
+                )
+
+                final_pipeline_state.updated_at = now
+
+                await self.store.save(final_pipeline_state)
+                logger.info(
+                    f"Pipeline run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
+                )
+
+                if self.scheduler_manager:
+                    logger.debug(
+                        "Notifying scheduler to reschedule pipeline after run completion"
+                    )
+                    asyncio.create_task(
+                        self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to update pipeline state after run execution: {e}",
+                    exc_info=True,
+                )
+                # Pipeline might be left ACTIVE or FAILED state might not be saved. Needs monitoring.
 
     async def _execute_ingestion(self, config: IngestorInput):
         """
         Executes the ingestion process for a pipeline using the provided IngestorInput config.
         Returns the ingestion results or raises an exception on failure.
         """
+        # Ensure Ingestor is imported locally or globally if needed
+        # from ingestion.core import Ingestor  # Example import if needed
+        # Check if Ingestor is already available (e.g., imported at module level)
+        # If not, uncomment the import above or ensure it's accessible.
+        # Assuming Ingestor is available in the scope:
         try:
-            # Avoid circular import
-            from ingestion.core import Ingestor
+            # from ..ingestion import Ingestor
             logger.info(f"Executing ingestion with config: {config}")
-            # NOTE: Can be async
             results = Ingestor.run(config.sources)
-            logger.info(f"Ingestion completed successfully. Results: {results}")
+            logger.info(
+                f"Ingestion completed successfully. Results count: {len(results.records)}"
+            )
             return results
         except ImportError:
             logger.error("Failed to import Ingestor. Cannot execute ingestion.")