backend-api/services.py
2025-05-11 16:01:49 +07:00

68 lines
2.5 KiB
Python

"""
Background service to execute pipelines: ingestion → normalization.
"""
from datetime import datetime, timezone
from typing import Any, Dict, List
from uuid import UUID

import models
import stores
from ingestion.ingestor import Ingestor
from log.logging_utils import cleanup_run_logging, pipeline_log, setup_run_logging
from normalization.normalizer import Normalizer
def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
    """
    Execute a pipeline: ingest data, normalize it, and update run status.

    Drives the RunResult through RUNNING -> COMPLETED (storing the
    normalized records) or RUNNING -> FAILED (storing the error message).
    Per-run structured logging is set up on entry and always torn down.

    Args:
        pipeline: The Pipeline model to run.
        run_id: UUID of the RunResult to update.
    """
    run = stores.runs.get(run_id)
    if not run:
        # No run record to update — nothing to do (and no log context yet).
        return

    # Stringify once; these identifiers are repeated in every log call below.
    pid = str(pipeline.id)
    rid = str(run_id)

    # Setup structured per-run logging
    setup_run_logging(pid, rid)
    pipeline_log("INFO", "Pipeline run starting", pid, rid, status="RUNNING")

    # Mark as running. Timezone-aware timestamps: datetime.utcnow() is
    # deprecated since Python 3.12 and produces naive datetimes.
    run.status = 'RUNNING'
    run.started_at = datetime.now(timezone.utc)
    try:
        # Ingest raw records
        pipeline_log("INFO", "Ingesting raw records", pid, rid)
        raw_records: List[Dict[str, Any]] = Ingestor.run(pipeline.sources)
        pipeline_log("INFO", f"Ingested {len(raw_records)} records", pid, rid)

        # Normalize record-by-record: source_type/source can differ per record.
        normalizer = Normalizer()
        canonical: List[Dict[str, Any]] = []
        for raw in raw_records:
            source_type = raw.get('source_type')
            source = raw.get('source')
            if not source_type or not source:
                # Raised here, logged exactly once by the except handler below.
                raise ValueError("Record missing 'source_type' or 'source'.")
            canonical.extend(normalizer.normalize([raw], source_type, source))

        # Success
        run.status = 'COMPLETED'
        run.finished_at = datetime.now(timezone.utc)
        run.results = canonical
        pipeline_log("SUCCESS", f"Pipeline run completed with {len(canonical)} records", pid, rid, status="COMPLETED")
    except Exception as e:
        # Broad catch is deliberate: this is the top-level run boundary, and
        # any failure must be recorded on the RunResult rather than escape.
        pipeline_log("ERROR", f"Pipeline run failed: {e}", pid, rid, status="FAILED", error=str(e))
        run.status = 'FAILED'
        run.finished_at = datetime.now(timezone.utc)
        run.error = str(e)
    finally:
        # run.status here is whichever terminal state the branches above set.
        pipeline_log("INFO", "Pipeline run finished", pid, rid, status=run.status)
        cleanup_run_logging(rid)