fix: fix ingestors import error, add color to logs
This commit is contained in:
commit b7aa490316 (parent 701c2b0ae7)
@@ -19,7 +19,7 @@ class AppSettings(BaseSettings):
 
     # Application settings
     APP_NAME: str = "PipelineRunnerApp"
-    LOG_LEVEL: str = "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING)
+    LOG_LEVEL: str = "DEBUG"  # Logging level (e.g., DEBUG, INFO, WARNING)
     LOG_ENABLE_SSE: bool = True  # Flag to enable/disable SSE log streaming sink
 
     # Store configuration
@@ -56,7 +56,8 @@ logger.remove()
 logger.add(
     sys.stderr,
     level=settings.LOG_LEVEL.upper(),
-    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    # format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
+    colorize=True,
 )
 
 # File Sink
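With the explicit format string commented out, Loguru falls back to its built-in default format, and colorize=True asks the sink to render that format's color markup. A minimal standalone sketch of the resulting console sink (the literal "DEBUG" is a stand-in for settings.LOG_LEVEL.upper(), which is what the real code passes):

import sys
from loguru import logger

logger.remove()  # drop Loguru's pre-installed default handler
# "DEBUG" stands in for settings.LOG_LEVEL.upper(); with no format= given, Loguru's default colorized format applies
logger.add(sys.stderr, level="DEBUG", colorize=True)
logger.debug("colorized debug output")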
@@ -1,4 +1,8 @@
-from ingestors import IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
+from ingestion.ingestors import (
+    IngestionMethod,
+    SimpleIngestionStrategy,
+    MLIngestionStrategy,
+)
 from models.ingestion import IngestSourceConfig, OutputData
 
 
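The bare `from ingestors import ...` only resolves if the package's own directory is on sys.path; importing through the package fixes the error when the app is launched from the repository root. A rough sketch of the layout this implies (directory and file names are inferred from the import paths, not shown in this diff):

# Assumed layout (hypothetical, inferred from the import paths):
#   backend-api/
#       ingestion/
#           __init__.py    # re-exports Ingestor (see the later `from ingestion import Ingestor`)
#           ingestors.py   # IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy
#       models/
#           ingestion.py   # IngestSourceConfig, OutputData
from ingestion.ingestors import IngestionMethod, SimpleIngestionStrategy, MLIngestionStrategy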
@@ -1,186 +0,0 @@
-"""
-Pydantic models for pipelines and runs.
-"""
-
-from typing import List, Union, Annotated, Optional, Literal, Dict, Any
-from uuid import UUID
-from datetime import datetime
-
-from pydantic import BaseModel, Field, HttpUrl, field_validator, ValidationInfo
-
-
-class RunCreate(BaseModel):
-    """
-    Model for creating a new run. (Empty)
-    """
-
-    pass
-
-
-class Run(BaseModel):
-    """
-    Status of a pipeline run.
-    """
-
-    id: UUID
-    pipeline_id: UUID
-    status: Literal["PENDING", "RUNNING", "COMPLETED", "FAILED"]
-    started_at: datetime
-    finished_at: Optional[datetime] = None
-
-
-class RunResult(Run):
-    """
-    Extended run model including results or error.
-    """
-
-    results: Optional[List[Dict[str, Any]]] = None
-    error: Optional[str] = None
-
-
-class ApiConfig(BaseModel):
-    """
-    Configuration for an API source.
-    """
-
-    url: HttpUrl = Field(..., description="API endpoint URL")
-    token: Optional[str] = Field(
-        None,
-        description="Optional bearer token for API authentication",
-    )
-
-
-class ScrapeConfig(BaseModel):
-    """
-    Configuration for a web-scraping source.
-    """
-
-    urls: List[HttpUrl] = Field(
-        ...,
-        description="List of URLs to scrape",
-    )
-    schema_file: Optional[str] = Field(
-        None,
-        description="Path to a JSON file containing CSS extraction schema",
-    )
-    prompt: Optional[str] = Field(
-        None,
-        description="Prompt string for LLM-based extraction",
-    )
-
-
-class FileConfig(BaseModel):
-    """
-    Configuration for a file-based source. Supports either a file path or an uploaded file.
-    """
-
-    path: Optional[str] = Field(
-        None,
-        description="Path to the input file (optional if upload is provided)",
-    )
-    upload: Optional[Any] = Field(
-        None,
-        description="Uploaded file object or metadata (optional if path is provided)",
-    )
-    upload_filename: Optional[str] = Field(
-        None,
-        description="Original filename of the uploaded file (for validation)",
-    )
-    format: Literal["csv", "json", "sqlite"] = Field(
-        "json", description="Format of the file"
-    )
-
-    @field_validator("path", mode="before")
-    def require_path_or_upload(cls, v, info: ValidationInfo):
-        data = info.data
-        if not v and not data.get("upload"):
-            raise ValueError("Either 'path' or 'upload' must be provided.")
-        return v
-
-    @field_validator("upload_filename", mode="before")
-    def filename_extension_matches_format(cls, v, info: ValidationInfo):
-        fmt = info.data.get("format")
-        if v and fmt and not v.lower().endswith(f".{fmt}"):
-            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
-        return v
-
-    @field_validator("path", mode="after")
-    def path_or_upload_extension_matches_format(cls, v, info: ValidationInfo):
-        fmt = info.data.get("format")
-        upload_filename = info.data.get("upload_filename")
-        if v and fmt and not v.lower().endswith(f".{fmt}"):
-            raise ValueError(f"File extension must match format '{fmt}'")
-        if upload_filename and fmt and not upload_filename.lower().endswith(f".{fmt}"):
-            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
-        return v
-
-
-class ApiSource(BaseModel):
-    """
-    An API-based data source.
-    """
-
-    type: Literal["api"] = Field(
-        "api",
-        description="Discriminator for API source",  # Removed const=True
-    )
-    config: ApiConfig
-
-
-class ScrapeSource(BaseModel):
-    """
-    A web-scraping data source.
-    """
-
-    type: Literal["scrape"] = Field(
-        "scrape",
-        description="Discriminator for scrape source",  # Removed const=True
-    )
-    config: ScrapeConfig
-
-
-class FileSource(BaseModel):
-    """
-    A file-based data source.
-    """
-
-    type: Literal["file"] = Field(
-        "file",
-        description="Discriminator for file source",  # Removed const=True
-    )
-    config: FileConfig
-
-
-Source = Annotated[
-    Union[ApiSource, ScrapeSource, FileSource],
-    Field(discriminator="type", description="Union of all source types"),
-]
-
-
-class PipelineCreate(BaseModel):
-    """
-    Payload for creating a new pipeline.
-    """
-
-    name: Optional[str] = Field(
-        default=None,
-        description="Optional human-readable name for the pipeline",
-    )
-    sources: List[Source] = Field(
-        ..., description="List of data sources for this pipeline"
-    )
-
-
-class Pipeline(BaseModel):
-    """
-    Representation of a pipeline.
-    """
-
-    id: UUID = Field(..., description="Unique identifier for the pipeline")
-    name: Optional[str] = Field(
-        None, description="Optional human-readable name for the pipeline"
-    )
-    sources: List[Source] = Field(..., description="List of configured data sources")
-    created_at: datetime = Field(
-        ..., description="UTC timestamp when the pipeline was created"
-    )
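For context on what is being removed: the module's central construct is a Pydantic discriminated union keyed on the `type` field. A minimal toy sketch of that pattern (simplified stand-in models, not the deleted ones):

# Toy models illustrating the discriminated-union pattern the deleted module used (Pydantic v2)
from typing import Annotated, Literal, Union
from pydantic import BaseModel, Field, TypeAdapter

class ApiSource(BaseModel):
    type: Literal["api"] = "api"
    url: str

class FileSource(BaseModel):
    type: Literal["file"] = "file"
    path: str

Source = Annotated[Union[ApiSource, FileSource], Field(discriminator="type")]

# The "type" key selects which model validates the payload
src = TypeAdapter(Source).validate_python({"type": "file", "path": "data.json"})
print(type(src).__name__)  # -> FileSource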
@@ -8,6 +8,8 @@ from uuid import UUID, uuid4
 from typing import Optional, List, TYPE_CHECKING
 from loguru import logger
 
+from ingestion import Ingestor
+
 from models.pipeline import (
     Pipeline,
     PipelineCreate,
@@ -240,142 +242,122 @@ class PipelineService:
     async def run_pipeline(self, pipeline_id: UUID) -> None:
         """
         Executes the pipeline logic, updating status and run times.
-        This is called by the scheduler job or manual trigger.
+        Logs associated with this run will include the pipeline_id.
         """
-        logger.info(f"Attempting run execution for pipeline: id={pipeline_id}")
-        pipeline = await self.store.get(pipeline_id)
-
-        if not pipeline:
-            logger.error(f"Cannot run pipeline: Pipeline not found (id={pipeline_id})")
-            return
-        # NOTE: lock mechanism
-        if pipeline.status == PipelineStatus.ACTIVE:
-            logger.warning(
-                f"Pipeline id={pipeline_id} is already ACTIVE. Skipping run."
-            )
-            return
-
-        # --- Mark as ACTIVE ---
-        try:
-            pipeline.status = PipelineStatus.ACTIVE
-            pipeline.updated_at = datetime.now(UTC)  # Update timestamp
-            await self.store.save(pipeline)
-            logger.info(f"Pipeline {pipeline_id} marked as ACTIVE.")
-        except Exception as e:
-            logger.error(
-                f"Failed to mark pipeline {pipeline_id} as ACTIVE: {e}. Aborting run.",
-                exc_info=True,
-            )
-            # Attempt to restore status? Depends on store guarantees.
-            # pipeline.status = original_status # Potentially try rollback
-            return  # Abort run
-
-        # --- Execute Pipeline Logic ---
-        run_successful = False
-        try:
-            logger.info(f"Executing core logic for pipeline id={pipeline_id}...")
-            # ---------------------------------------------------
-            # Ensure _execute_ingestion is awaited if it's async
-            await self._execute_ingestion(pipeline.config.ingestor_config)
-            # ---------------------------------------------------
-            logger.info(f"Core logic finished successfully for id={pipeline_id}.")
-            run_successful = True
-
-        except Exception as e:
-            logger.error(
-                f"Core logic failed during pipeline run id={pipeline_id}: {e}",
-                exc_info=True,
-            )
-            # run_successful remains False
-
-        # --- Update Final State ---
-        try:
-            # Fetch the latest state again to minimize race conditions, though the ACTIVE lock helps
-            final_pipeline_state = await self.store.get(pipeline_id)
-            if not final_pipeline_state:
-                logger.warning(
-                    f"Pipeline {pipeline_id} disappeared during run. Cannot update final state."
-                )
-                # The pipeline might have been deleted externally while running.
-                # Scheduler might need cleanup if the job still exists.
-                if self.scheduler_manager:
-                    logger.warning(
-                        f"Attempting to unschedule potentially orphaned job for {pipeline_id}"
-                    )
-                    asyncio.create_task(
-                        self.scheduler_manager.unschedule_pipeline(pipeline_id)
-                    )
-                return
-
-            # Avoid modifying the object fetched directly if store uses caching/references
-            final_pipeline_state = final_pipeline_state.model_copy(deep=True)
-
-            now = datetime.now(UTC)
-            final_pipeline_state.status = (
-                PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
-            )
-
-            if run_successful:
-                final_pipeline_state.config.last_run = (
-                    now  # Mark completion time on success
-                )
-
-            # Calculate and store the *next* run time based on the outcome
-            # Use the *updated* last_run if the run was successful
-            current_last_run = (
-                final_pipeline_state.config.last_run
-            )  # This is 'now' if successful, else original last_run
-            final_pipeline_state.config.next_run = calculate_next_run(
-                frequency=final_pipeline_state.config.run_frequency,
-                last_run=current_last_run,  # Use the relevant last_run for calculation
-                start_reference_time=now,  # Use current time as reference for calculation
-            )
-
-            final_pipeline_state.updated_at = (
-                now  # Update timestamp for this final save
-            )
-            await self.store.save(final_pipeline_state)
-            logger.info(
-                f"Pipeline {pipeline_id} run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
-            )
-
-            # Notify scheduler about the *new* next run time so it can reschedule accurately
-            if self.scheduler_manager:
-                logger.debug(
-                    f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion with next run {final_pipeline_state.config.next_run}."
-                )
-                asyncio.create_task(
-                    self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
-                )
-
-        except Exception as e:
-            logger.error(
-                f"Failed to update pipeline {pipeline_id} state after run execution: {e}",
-                exc_info=True,
-            )
-            # The pipeline might be left in ACTIVE or an inconsistent state.
-            # Consider adding monitoring or retry logic here.
+        # Use contextualize to tag logs originating from this specific run
+        with logger.contextualize(
+            pipeline_id=str(pipeline_id)
+        ):  # Ensure it's a string for context
+            logger.info(
+                "Attempting run execution for pipeline"
+            )  # Log context takes effect here
+            pipeline = await self.store.get(pipeline_id)
+
+            if not pipeline:
+                logger.error("Cannot run pipeline: Pipeline not found")
+                return
+            if pipeline.status == PipelineStatus.ACTIVE:
+                logger.warning("Pipeline is already ACTIVE. Skipping run.")
+                return
+
+            # --- Mark as ACTIVE ---
+            # original_status = pipeline.status # Store original status for potential rollback
+            try:
+                pipeline.status = PipelineStatus.ACTIVE
+                pipeline.updated_at = datetime.now(UTC)
+                await self.store.save(pipeline)
+                logger.info("Pipeline marked as ACTIVE.")
+            except Exception as e:
+                logger.error(
+                    f"Failed to mark pipeline as ACTIVE: {e}. Aborting run.",
+                    exc_info=True,
+                )
+                # Attempt to restore status? Requires careful thought on atomicity
+                # pipeline.status = original_status
+                # await self.store.save(pipeline) # Potential race condition/overwrite here
+                return
+
+            # --- Execute Pipeline Logic ---
+            run_successful = False
+            try:
+                logger.info("Executing core logic...")
+                # This call and anything within it will inherit the pipeline_id context
+                await self._execute_ingestion(pipeline.config.ingestor_config)
+                logger.info("Core logic finished successfully.")
+                run_successful = True
+            except Exception as e:
+                logger.error(
+                    f"Core logic failed during pipeline run: {e}", exc_info=True
+                )
+                # run_successful remains False
+
+            # --- Update Final State ---
+            try:
+                # Fetch latest state again (important if external changes possible)
+                final_pipeline_state = await self.store.get(pipeline_id)
+                if not final_pipeline_state:
+                    logger.warning(
+                        "Pipeline disappeared during run. Cannot update final state."
+                    )
+                    # Handle potential deletion during run (e.g., unschedule if needed)
+                    if self.scheduler_manager:
+                        logger.warning(
+                            "Attempting to unschedule potentially orphaned job"
+                        )
+                        asyncio.create_task(
+                            self.scheduler_manager.unschedule_pipeline(pipeline_id)
+                        )
+                    return
+
+                final_pipeline_state = final_pipeline_state.model_copy(deep=True)
+                now = datetime.now(UTC)
+                final_pipeline_state.status = (
+                    PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
+                )
+
+                if run_successful:
+                    final_pipeline_state.config.last_run = now
+                current_last_run = final_pipeline_state.config.last_run
+                final_pipeline_state.config.next_run = calculate_next_run(
+                    frequency=final_pipeline_state.config.run_frequency,
+                    last_run=current_last_run,
+                    start_reference_time=now,
+                )
+                final_pipeline_state.updated_at = now
+                await self.store.save(final_pipeline_state)
+                logger.info(
+                    f"Pipeline run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
+                )
+
+                if self.scheduler_manager:
+                    logger.debug(
+                        "Notifying scheduler to reschedule pipeline after run completion"
+                    )
+                    asyncio.create_task(
+                        self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to update pipeline state after run execution: {e}",
+                    exc_info=True,
+                )
+                # Pipeline might be left ACTIVE or FAILED state might not be saved. Needs monitoring.
 
     async def _execute_ingestion(self, config: IngestorInput):
         """
         Executes the ingestion process for a pipeline using the provided IngestorInput config.
         Returns the ingestion results or raises an exception on failure.
         """
-        # Ensure Ingestor is imported locally or globally if needed
-        # from ingestion.core import Ingestor # Example import if needed
-
-        # Check if Ingestor is already available (e.g., imported at module level)
-        # If not, uncomment the import above or ensure it's accessible.
-        # Assuming Ingestor is available in the scope:
         try:
-            # Avoid circular import
-            from ingestion.core import Ingestor
-
+            # from ..ingestion import Ingestor
             logger.info(f"Executing ingestion with config: {config}")
-            # NOTE: Can be async
             results = Ingestor.run(config.sources)
-            logger.info(f"Ingestion completed successfully. Results: {results}")
+            logger.info(
+                f"Ingestion completed successfully. Results count: {len(results.records)}"
+            )
             return results
         except ImportError:
             logger.error("Failed to import Ingestor. Cannot execute ingestion.")
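The behavioral core of the run_pipeline change is logger.contextualize, which binds extra fields to every record emitted inside the with block, including records produced by code it calls (here, _execute_ingestion). A minimal sketch, assuming a throwaway sink whose format surfaces the bound extras and a placeholder id (the project's real sinks are configured in the settings hunk above):

import sys
from loguru import logger

logger.remove()
# {extra} renders whatever fields contextualize() has bound to the current records
logger.add(sys.stderr, colorize=True,
           format="{time:HH:mm:ss} | {level: <8} | {extra} | {message}")

with logger.contextualize(pipeline_id="3fa85f64"):  # placeholder id
    logger.info("Attempting run execution for pipeline")  # record carries extra={'pipeline_id': '3fa85f64'}
logger.info("outside the block")  # no pipeline_id bound here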