add logging

Sosokker 2025-05-11 16:01:49 +07:00
parent 10856f6cdf
commit 73654a402c
14 changed files with 358 additions and 25 deletions

data/sample.json (new file)

@ -0,0 +1,38 @@
{
    "quiz": {
        "sport": {
            "q1": {
                "question": "Which one is correct team name in NBA?",
                "options": [
                    "New York Bulls",
                    "Los Angeles Kings",
                    "Golden State Warriros",
                    "Huston Rocket"
                ],
                "answer": "Huston Rocket"
            }
        },
        "maths": {
            "q1": {
                "question": "5 + 7 = ?",
                "options": [
                    "10",
                    "11",
                    "12",
                    "13"
                ],
                "answer": "12"
            },
            "q2": {
                "question": "12 - 8 = ?",
                "options": [
                    "1",
                    "2",
                    "3",
                    "4"
                ],
                "answer": "4"
            }
        }
    }
}

ingestion/adapters/api_adapter.py

@ -9,6 +9,7 @@ from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .base import DataSourceAdapter
from loguru import logger

class ApiAdapter(DataSourceAdapter):
@ -20,7 +21,8 @@ class ApiAdapter(DataSourceAdapter):
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: float = 30,
        token: Optional[str] = None,
    ):
        """
        Initialize the API adapter.
@ -29,10 +31,14 @@ class ApiAdapter(DataSourceAdapter):
            url: Endpoint URL to fetch.
            headers: Optional HTTP headers.
            timeout: Timeout in seconds for the request.
            token: Optional bearer token for Authorization header.
        """
        self.url = url
        self.headers = headers or {}
        if token:
            self.headers["Authorization"] = f"Bearer {token}"
        self.timeout = timeout
        logger.info(f"Initializing ApiAdapter for URL: {url}")
        self.session = self._init_session()

    def _init_session(self) -> requests.Session:
@ -49,6 +55,7 @@ class ApiAdapter(DataSourceAdapter):
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        logger.debug("HTTP session initialized with retry strategy.")
        return session

    def fetch(self) -> List[Dict[str, Any]]:
@ -61,21 +68,27 @@ class ApiAdapter(DataSourceAdapter):
        Raises:
            RuntimeError: On network error, HTTP error, or JSON parse error.
        """
        logger.info(f"Fetching data from API: {self.url}")
        try:
            response = self.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()
            logger.debug(f"Received response with status code: {response.status_code}")
        except requests.exceptions.RequestException as e:
            logger.error(f"API request failed: {e}")
            raise RuntimeError(f"API request failed: {e}")

        try:
            data = response.json()
            logger.debug(f"Successfully parsed JSON response from {self.url}")
        except ValueError as e:
            logger.error(f"Failed to parse JSON response: {e}")
            raise RuntimeError(f"Failed to parse JSON response: {e}")

        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return [data]
        logger.error("Unexpected JSON structure: expected list or dict.")
        raise RuntimeError("Unexpected JSON structure: expected list or dict.")

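A minimal usage sketch of the adapter as changed above; the endpoint URL and token are placeholders, not values from this repository:

from ingestion.adapters.api_adapter import ApiAdapter

adapter = ApiAdapter(
    url="https://api.example.com/items",   # placeholder endpoint
    timeout=10,
    token="example-token",                 # sent as "Authorization: Bearer example-token"
)
records = adapter.fetch()                  # always a list of dicts, even for a single JSON object
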
ingestion/adapters/file_adapter.py

@ -6,27 +6,36 @@ from typing import List, Dict, Any
import json
import pandas as pd
from loguru import logger

from .base import DataSourceAdapter

class FileAdapter(DataSourceAdapter):
    """
    Adapter for reading data from local files (CSV or JSON), or from uploaded file-like objects.
    """

    def __init__(self, path: str = None, format: str = None, upload=None, upload_filename: str = None):
        """
        Initialize the file adapter.

        Args:
            path: Path to the input file (.csv or .json), optional if upload is provided.
            format: Optional file format (e.g., 'csv', 'json').
            upload: Optional file-like object (e.g., from upload).
            upload_filename: Optional original filename for validation/logging.
        """
        self.path = path
        self.format = format
        self.upload = upload
        self.upload_filename = upload_filename
        logger.info(f"Initialized FileAdapter for path: {path}, upload: {upload_filename}, format: {format}")

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Read and parse the file, returning a list of records.
        Supports both path-based and uploaded file-like inputs.

        Returns:
            List of dicts from the file contents.
@ -35,26 +44,68 @@ class FileAdapter(DataSourceAdapter):
            RuntimeError: On read or parse errors.
            ValueError: If file extension is unsupported.
        """
        if self.upload is not None:
            # Handle uploaded file-like object
            logger.info(f"Fetching data from uploaded file: {self.upload_filename or '[no filename]'}")
            if self.format == "csv" or (self.upload_filename and self.upload_filename.lower().endswith(".csv")):
                try:
                    self.upload.seek(0)
                    df = pd.read_csv(self.upload)
                    logger.debug(f"Successfully read uploaded CSV file: {self.upload_filename}")
                    return df.to_dict(orient="records")
                except Exception as e:
                    logger.error(f"Failed to read uploaded CSV '{self.upload_filename}': {e}")
                    raise RuntimeError(f"Failed to read uploaded CSV '{self.upload_filename}': {e}")
            elif self.format == "json" or (self.upload_filename and self.upload_filename.lower().endswith(".json")):
                try:
                    self.upload.seek(0)
                    data = json.load(self.upload)
                    logger.debug(f"Successfully read uploaded JSON file: {self.upload_filename}")
                    if isinstance(data, list):
                        return data
                    if isinstance(data, dict):
                        return [data]
                    logger.error(f"Uploaded JSON file '{self.upload_filename}' does not contain a list or dict.")
                    raise RuntimeError(
                        f"Uploaded JSON file '{self.upload_filename}' does not contain a list or dict."
                    )
                except Exception as e:
                    logger.error(f"Failed to read uploaded JSON '{self.upload_filename}': {e}")
                    raise RuntimeError(f"Failed to read uploaded JSON '{self.upload_filename}': {e}")
            else:
                logger.error(f"Unsupported uploaded file extension for '{self.upload_filename}'. Only .csv and .json are supported.")
                raise ValueError(
                    f"Unsupported uploaded file extension for '{self.upload_filename}'. "
                    "Only .csv and .json are supported."
                )

        # Fallback to path-based loading
        p = (self.path or "").lower()
        logger.info(f"Attempting to fetch data from file: {self.path}")
        if p.endswith(".csv"):
            try:
                df = pd.read_csv(self.path)
                logger.debug(f"Successfully read CSV file: {self.path}")
                return df.to_dict(orient="records")
            except Exception as e:
                logger.error(f"Failed to read CSV '{self.path}': {e}")
                raise RuntimeError(f"Failed to read CSV '{self.path}': {e}")
        if p.endswith(".json"):
            try:
                with open(self.path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                logger.debug(f"Successfully read JSON file: {self.path}")
                if isinstance(data, list):
                    return data
                if isinstance(data, dict):
                    return [data]
                logger.error(f"JSON file '{self.path}' does not contain a list or dict.")
                raise RuntimeError(
                    f"JSON file '{self.path}' does not contain a list or dict."
                )
            except Exception as e:
                logger.error(f"Failed to read JSON '{self.path}': {e}")
                raise RuntimeError(f"Failed to read JSON '{self.path}': {e}")
        logger.error(f"Unsupported file extension for '{self.path}'. Only .csv and .json are supported.")
        raise ValueError(
            f"Unsupported file extension for '{self.path}'. "
            "Only .csv and .json are supported."

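A usage sketch of both code paths; the CSV content and filename below are made up for illustration, while data/sample.json is the file added in this commit:

from io import StringIO
from ingestion.adapters.file_adapter import FileAdapter

# Path-based: the top-level JSON object is wrapped in a single-element list.
records = FileAdapter(path="data/sample.json").fetch()

# Upload-based: a file-like object plus the original filename used for format detection.
csv_upload = StringIO("name,score\nalice,10\nbob,7\n")
rows = FileAdapter(upload=csv_upload, upload_filename="scores.csv", format="csv").fetch()
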
ingestion/adapters/web_scraper_adapter.py

@ -21,6 +21,7 @@ from crawl4ai.extraction_strategy import (
)

from .base import DataSourceAdapter
from loguru import logger

class WebScraperAdapter(DataSourceAdapter):
@ -60,6 +61,7 @@ class WebScraperAdapter(DataSourceAdapter):
        self.output_format = output_format
        self.verbose = verbose
        self.cache_mode = cache_mode
        logger.info(f"Initialized WebScraperAdapter for URLs: {urls}")

    def fetch(self) -> List[Dict[str, Any]]:
        """
@ -71,15 +73,18 @@ class WebScraperAdapter(DataSourceAdapter):
        Raises:
            RuntimeError: On failure during crawling or extraction.
        """
        logger.info("Starting synchronous fetch for web scraping.")
        try:
            return asyncio.run(self._fetch_async())
        except Exception as e:
            logger.error(f"Web scraping failed: {e}")
            raise RuntimeError(f"Web scraping failed: {e}")

    async def _fetch_async(self) -> List[Dict[str, Any]]:
        """
        Internal async method to perform crawling and extraction.
        """
        logger.info("Starting async web scraping fetch.")
        # Initialize crawler
        browser_cfg = BrowserConfig(headless=True, verbose=self.verbose)
        crawler = AsyncWebCrawler(config=browser_cfg)
@ -96,7 +101,9 @@ class WebScraperAdapter(DataSourceAdapter):
                extraction_strategy = JsonCssExtractionStrategy(
                    schema=schema, verbose=self.verbose
                )
                logger.debug(f"Loaded schema file: {self.schema_file}")
            except Exception as e:
                logger.error(f"Failed to load schema file '{self.schema_file}': {e}")
                await crawler.close()
                raise RuntimeError(
                    f"Failed to load schema file '{self.schema_file}': {e}"
@ -109,7 +116,9 @@ class WebScraperAdapter(DataSourceAdapter):
                apply_chunking=True,
                verbose=self.verbose,
            )
            logger.debug("Using LLM extraction strategy.")
        else:
            logger.error("Either 'schema_file' or 'prompt' must be provided.")
            await crawler.close()
            raise ValueError("Either 'schema_file' or 'prompt' must be provided.")
@ -117,6 +126,7 @@ class WebScraperAdapter(DataSourceAdapter):
        try:
            cache_enum = getattr(CacheMode, self.cache_mode.upper())
        except AttributeError:
            logger.warning(f"Invalid cache mode '{self.cache_mode}', defaulting to ENABLED.")
            cache_enum = CacheMode.ENABLED

        run_cfg = CrawlerRunConfig(
@ -127,9 +137,11 @@ class WebScraperAdapter(DataSourceAdapter):
        # Execute crawl
        try:
            logger.info(f"Crawling URLs: {self.urls}")
            results: List[CrawlResult] = await crawler.arun_many(
                urls=self.urls, config=run_cfg
            )
            logger.debug(f"Crawling completed. Results: {results}")
        finally:
            await crawler.close()
@ -137,10 +149,16 @@ class WebScraperAdapter(DataSourceAdapter):
        records: List[Dict[str, Any]] = []
        for res in results:
            if not res.success or not res.extracted_content:
                logger.warning(f"Skipping failed or empty result for URL: {getattr(res, 'url', None)}")
                continue
            try:
                content = json.loads(res.extracted_content)
                logger.debug(f"Parsed extracted content for URL: {res.url}")
            except Exception:
                logger.error(f"Failed to parse extracted content for URL: {res.url}")
                continue
            if content is None:
                logger.warning(f"Extracted content is None for URL: {res.url}")
                continue
            if isinstance(content, list):
                for item in content:
@ -150,5 +168,8 @@ class WebScraperAdapter(DataSourceAdapter):
            elif isinstance(content, dict):
                content["source_url"] = res.url
                records.append(content)
            else:
                logger.warning(f"Extracted content for URL {res.url} is not a list or dict: {type(content)}")

        logger.info(f"Web scraping completed. Extracted {len(records)} records.")
        return records

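A hedged usage sketch: the constructor keywords below are inferred from the attributes referenced in this diff (urls, schema_file, prompt, output_format, verbose, cache_mode), and the target URL and schema path are placeholders:

from ingestion.adapters.web_scraper_adapter import WebScraperAdapter

adapter = WebScraperAdapter(
    urls=["https://example.com/products"],  # placeholder target
    schema_file="schema.json",              # CSS-extraction schema; a prompt can be given instead
    cache_mode="bypass",                    # invalid names fall back to CacheMode.ENABLED
    verbose=True,
)
records = adapter.fetch()                   # each record carries a "source_url" key
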
ingestion/ingestor.py

@ -7,6 +7,8 @@ from typing import List, Dict, Any
from ingestion.adapters.api_adapter import ApiAdapter
from ingestion.adapters.file_adapter import FileAdapter
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
from pydantic import BaseModel
from loguru import logger

class Ingestor:
@ -31,11 +33,24 @@ class Ingestor:
            ValueError: For unknown source types.
            RuntimeError: If an adapter fails during fetch.
        """
        from log.logging_utils import pipeline_log

        aggregated: List[Dict[str, Any]] = []
        logger.info("Starting ingestion run for sources.")
        for src in sources:
            # accept Pydantic models or raw dicts
            if isinstance(src, BaseModel):
                src_item = src.dict()
            else:
                src_item = src
            src_type = src_item.get("type")
            config = src_item.get("config", {})
            # convert BaseModel config to dict if needed
            if not isinstance(config, dict) and hasattr(config, "dict"):
                config = config.dict(exclude_unset=True)
            pipeline_id = config.get("pipeline_id") or src_item.get("pipeline_id")
            run_id = config.get("run_id") or src_item.get("run_id")
            logger.info(f"Processing source type: {src_type} with config: {config}")
            if src_type == "api":
                adapter = ApiAdapter(**config)
            elif src_type == "scrape":
@ -43,16 +58,22 @@ class Ingestor:
            elif src_type == "file":
                adapter = FileAdapter(**config)
            else:
                logger.error(f"Unknown source type: {src_type}")
                pipeline_log("ERROR", f"Unknown source type: {src_type}", pipeline_id, run_id, status="FAILED")
                raise ValueError(f"Unknown source type: {src_type}")
            try:
                logger.info(f"Fetching records using {src_type} adapter.")
                records = adapter.fetch()
                logger.info(f"Fetched {len(records)} records from {src_type} source.")
                aggregated.extend(records)
                pipeline_log("SUCCESS", f"Fetched {len(records)} records.", pipeline_id, run_id, status="COMPLETED")
            except Exception as e:
                logger.error(f"Fetch failed for source {src_type}: {e}")
                pipeline_log("ERROR", f"Fetch failed: {e}", pipeline_id, run_id, status="FAILED")
                raise RuntimeError(f"Fetch failed for source {src_type}: {e}")

        logger.info(f"Ingestion run completed. Total records aggregated: {len(aggregated)}")
        return aggregated

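A sketch of how sources reach the adapters; Ingestor.run is invoked this way from services.py (Ingestor.run(pipeline.sources)), and the API URL is a placeholder:

from ingestion.ingestor import Ingestor

sources = [
    {"type": "file", "config": {"path": "data/sample.json"}},
    {"type": "api", "config": {"url": "https://api.example.com/items", "timeout": 10}},  # placeholder URL
]
records = Ingestor.run(sources)  # Pydantic source models are accepted too and converted via .dict()
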
log/__init__.py (new, empty file)

log/log_stream.py (new file)

@ -0,0 +1,26 @@
from fastapi import APIRouter, Request, HTTPException
from fastapi.responses import StreamingResponse
from log.logging_utils import RUN_LOG_QUEUES
from queue import Empty
import asyncio
router = APIRouter()
@router.get("/pipelines/{pipeline_id}/runs/{run_id}/logs/stream")
async def stream_logs(request: Request, pipeline_id: str, run_id: str):
    log_queue = RUN_LOG_QUEUES.get(run_id)
    if not log_queue:
        raise HTTPException(status_code=404, detail="No logs for this run.")

    async def event_generator():
        while True:
            if await request.is_disconnected():
                break
            try:
                # Non-blocking read so the event loop is not stalled while waiting for logs.
                log_line = log_queue.get_nowait()
                yield f"data: {log_line}\n\n"
            except Empty:
                await asyncio.sleep(0.2)
                continue

    return StreamingResponse(event_generator(), media_type="text/event-stream")

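A sketch of a client consuming the stream; the host, port, and IDs are placeholders, and each "data:" line carries the JSON record produced by log/logging_utils.py:

import json
import requests

url = "http://localhost:8000/pipelines/<pipeline_id>/runs/<run_id>/logs/stream"  # placeholder host and IDs
with requests.get(url, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            entry = json.loads(line[len("data: "):])
            print(entry["level"], entry["message"])
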
log/logging_utils.py (new file)

@ -0,0 +1,69 @@
from loguru import logger
from queue import Queue
from typing import Dict, Optional
import json
# Per-run log queues (thread-safe)
RUN_LOG_QUEUES: Dict[str, Queue] = {}
RUN_LOG_HANDLERS: Dict[str, int] = {}
# Structured log format
def make_log_record(level: str, message: str, pipeline_id: Optional[str], run_id: Optional[str], status: Optional[str] = None, error: Optional[str] = None, extra: Optional[dict] = None) -> dict:
    record = {
        "level": level,
        "message": message,
        "pipeline_id": pipeline_id,
        "run_id": run_id,
        "status": status,
        "error": error,
        "extra": extra or {},
    }
    return record

# Custom loguru sink for per-run logging
def log_sink(message):
    record = message.record
    run_id = record["extra"].get("run_id")
    pipeline_id = record["extra"].get("pipeline_id")
    if run_id and run_id in RUN_LOG_QUEUES:
        # Structure the log as JSON for frontend parsing
        log_entry = make_log_record(
            level=record["level"].name,
            message=record["message"],
            pipeline_id=pipeline_id,
            run_id=run_id,
            status=record["extra"].get("status"),
            error=record["extra"].get("error"),
            extra=record["extra"]
        )
        RUN_LOG_QUEUES[run_id].put(json.dumps(log_entry))

# Setup per-run logging sink
def setup_run_logging(pipeline_id: str, run_id: str):
    log_queue = Queue()
    RUN_LOG_QUEUES[run_id] = log_queue
    handler_id = logger.add(
        log_sink,
        filter=lambda record: record["extra"].get("run_id") == run_id,
        enqueue=True
    )
    RUN_LOG_HANDLERS[run_id] = handler_id
    return log_queue

# Remove per-run logging sink and clean up
def cleanup_run_logging(run_id: str):
    if run_id in RUN_LOG_HANDLERS:
        logger.remove(RUN_LOG_HANDLERS[run_id])
        del RUN_LOG_HANDLERS[run_id]
    if run_id in RUN_LOG_QUEUES:
        del RUN_LOG_QUEUES[run_id]
# Helper for logging with context
def pipeline_log(level: str, message: str, pipeline_id: str, run_id: str, status: Optional[str] = None, error: Optional[str] = None, extra: Optional[dict] = None):
    # bind() places the context in record["extra"], which the per-run sink filter and log_sink read.
    logger.bind(pipeline_id=pipeline_id, run_id=run_id, status=status, error=error, **(extra or {})).log(level, message)
# Example usage:
# setup_run_logging(pipeline_id, run_id)
# pipeline_log("INFO", "Pipeline started", pipeline_id, run_id, status="RUNNING")
# pipeline_log("ERROR", "Pipeline failed", pipeline_id, run_id, status="FAILED", error="Some error")
# cleanup_run_logging(run_id)

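Expanding the usage comments above into a runnable sketch (the pipeline and run IDs are placeholders); because the sink is added with enqueue=True, records arrive on the queue asynchronously:

import json
from log.logging_utils import setup_run_logging, pipeline_log, cleanup_run_logging

queue = setup_run_logging("pipeline-1", "run-1")
pipeline_log("INFO", "Pipeline started", "pipeline-1", "run-1", status="RUNNING")
entry = json.loads(queue.get(timeout=5))   # structured record emitted by log_sink
print(entry["run_id"], entry["status"], entry["message"])
cleanup_run_logging("run-1")
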
main.py

@ -7,11 +7,19 @@ from uuid import UUID
from fastapi import FastAPI, HTTPException, BackgroundTasks

import platform
import asyncio

if platform.system() == "Windows":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

import models
import stores
import services
from log.log_stream import router as log_stream_router

app = FastAPI(title="Data Integration Pipeline API")
app.include_router(log_stream_router)

@app.post(
@ -137,4 +145,23 @@ def get_run_results(pipeline_id: UUID, run_id: UUID) -> List[Dict[str, Any]]:
            detail="Run not completed or has failed"
        )
    return run.results or []

# Dedicated endpoint to retrieve the error message for a failed run
@app.get(
    "/pipelines/{pipeline_id}/runs/{run_id}/error",
    response_model=str,
    summary="Get run error message"
)
def get_run_error(pipeline_id: UUID, run_id: UUID) -> str:
    """
    Retrieve the error message for a run that failed.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")
    run = stores.get_run(run_id)
    if not run or run.pipeline_id != pipeline_id:
        raise HTTPException(status_code=404, detail="Run not found")
    return run.error or ""

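A sketch of calling the new endpoint; the host, port, and IDs are placeholders:

import requests

resp = requests.get("http://localhost:8000/pipelines/<pipeline_id>/runs/<run_id>/error")  # placeholder host and IDs
resp.raise_for_status()
print(resp.json() or "(no error recorded)")  # response_model=str, so the body is a plain JSON string
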
models.py

@ -6,7 +6,7 @@ from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl, field_validator, ValidationInfo

class RunCreate(BaseModel):
@ -75,26 +75,54 @@ class ScrapeConfig(BaseModel):
class FileConfig(BaseModel):
    """
    Configuration for a file-based source. Supports either a file path or an uploaded file.
    """
    path: Optional[str] = Field(
        None,
        description="Path to the input file (optional if upload is provided)",
        example="/data/myfile.json"
    )
    upload: Optional[Any] = Field(
        None,
        description="Uploaded file object or metadata (optional if path is provided)",
        example=None
    )
    upload_filename: Optional[str] = Field(
        None,
        description="Original filename of the uploaded file (for validation)",
        example="myfile.json"
    )
    format: Literal["csv", "json", "sqlite"] = Field(
        "json",
        description="Format of the file",
        example="csv"
    )

    @field_validator("path", mode="before")
    def require_path_or_upload(cls, v, info: ValidationInfo):
        data = info.data
        if not v and not data.get("upload"):
            raise ValueError("Either 'path' or 'upload' must be provided.")
        return v

    @field_validator("upload_filename", mode="before")
    def filename_extension_matches_format(cls, v, info: ValidationInfo):
        fmt = info.data.get("format")
        if v and fmt and not v.lower().endswith(f".{fmt}"):
            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
        return v

    @field_validator("path", mode="after")
    def path_or_upload_extension_matches_format(cls, v, info: ValidationInfo):
        fmt = info.data.get("format")
        upload_filename = info.data.get("upload_filename")
        if v and fmt and not v.lower().endswith(f".{fmt}"):
            raise ValueError(f"File extension must match format '{fmt}'")
        if upload_filename and fmt and not upload_filename.lower().endswith(f".{fmt}"):
            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
        return v

class ApiSource(BaseModel):
    """

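A sketch showing how a FileConfig feeds the ingestor, assuming FileConfig lives in models.py as main.py's "import models" suggests:

from models import FileConfig

cfg = FileConfig(path="/data/myfile.json", format="json")
source = {"type": "file", "config": cfg.dict(exclude_unset=True)}  # the shape consumed by Ingestor.run
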
pyproject.toml

@ -8,6 +8,7 @@ dependencies = [
"crawl4ai>=0.5.0.post8", "crawl4ai>=0.5.0.post8",
"fastapi[standard]>=0.115.12", "fastapi[standard]>=0.115.12",
"inquirer>=3.4.0", "inquirer>=3.4.0",
"loguru>=0.7.3",
"pandas>=2.2.3", "pandas>=2.2.3",
"python-dotenv>=1.1.0", "python-dotenv>=1.1.0",
"rich>=14.0.0", "rich>=14.0.0",

schema.yaml (new file)

File diff suppressed because one or more lines are too long

services.py

@ -10,6 +10,7 @@ import stores
import models
from ingestion.ingestor import Ingestor
from normalization.normalizer import Normalizer
from log.logging_utils import setup_run_logging, cleanup_run_logging, pipeline_log

def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
@ -24,13 +25,19 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
    if not run:
        return

    # Setup structured per-run logging
    setup_run_logging(str(pipeline.id), str(run_id))
    pipeline_log("INFO", "Pipeline run starting", str(pipeline.id), str(run_id), status="RUNNING")

    # Mark as running
    run.status = 'RUNNING'
    run.started_at = datetime.utcnow()

    try:
        # Ingest raw records
        pipeline_log("INFO", "Ingesting raw records", str(pipeline.id), str(run_id))
        raw_records: List[Dict[str, Any]] = Ingestor.run(pipeline.sources)
        pipeline_log("INFO", f"Ingested {len(raw_records)} records", str(pipeline.id), str(run_id))

        # Normalize records
        normalizer = Normalizer()
@ -39,6 +46,7 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
            source_type = raw.get('source_type')
            source = raw.get('source')
            if not source_type or not source:
                pipeline_log("ERROR", "Record missing 'source_type' or 'source'", str(pipeline.id), str(run_id), status="FAILED")
                raise ValueError("Record missing 'source_type' or 'source'.")
            norm = normalizer.normalize([raw], source_type, source)
            canonical.extend(norm)
@ -47,9 +55,14 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
        run.status = 'COMPLETED'
        run.finished_at = datetime.utcnow()
        run.results = canonical
        pipeline_log("SUCCESS", f"Pipeline run completed with {len(canonical)} records", str(pipeline.id), str(run_id), status="COMPLETED")
    except Exception as e:
        # Log failure
        pipeline_log("ERROR", f"Pipeline run failed: {e}", str(pipeline.id), str(run_id), status="FAILED", error=str(e))
        run.status = 'FAILED'
        run.finished_at = datetime.utcnow()
        run.error = str(e)
    finally:
        pipeline_log("INFO", "Pipeline run finished", str(pipeline.id), str(run_id), status=run.status)
        cleanup_run_logging(str(run_id))

uv.lock

@ -305,6 +305,7 @@ dependencies = [
{ name = "crawl4ai" }, { name = "crawl4ai" },
{ name = "fastapi", extra = ["standard"] }, { name = "fastapi", extra = ["standard"] },
{ name = "inquirer" }, { name = "inquirer" },
{ name = "loguru" },
{ name = "pandas" }, { name = "pandas" },
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "rich" }, { name = "rich" },
@ -315,6 +316,7 @@ requires-dist = [
{ name = "crawl4ai", specifier = ">=0.5.0.post8" }, { name = "crawl4ai", specifier = ">=0.5.0.post8" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "inquirer", specifier = ">=3.4.0" }, { name = "inquirer", specifier = ">=3.4.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "pandas", specifier = ">=2.2.3" }, { name = "pandas", specifier = ">=2.2.3" },
{ name = "python-dotenv", specifier = ">=1.1.0" }, { name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "rich", specifier = ">=14.0.0" }, { name = "rich", specifier = ">=14.0.0" },
@ -813,6 +815,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cf/f4/25a25c75ec02fcec729cee95288635367f2cdf8add0416494d0c42842ccc/litellm-1.65.6-py3-none-any.whl", hash = "sha256:c65ec7676f251c4f28cfb7446a542d15f091fe0fb71d6d6e630d8c8849f9a76d", size = 7518562 }, { url = "https://files.pythonhosted.org/packages/cf/f4/25a25c75ec02fcec729cee95288635367f2cdf8add0416494d0c42842ccc/litellm-1.65.6-py3-none-any.whl", hash = "sha256:c65ec7676f251c4f28cfb7446a542d15f091fe0fb71d6d6e630d8c8849f9a76d", size = 7518562 },
] ]
[[package]]
name = "loguru"
version = "0.7.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "win32-setctime", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595 },
]
[[package]]
name = "lxml"
version = "5.3.2"
@ -1859,6 +1874,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743 },
]
[[package]]
name = "win32-setctime"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083 },
]
[[package]]
name = "xmod"
version = "1.8.1"