add logging

Sosokker 2025-05-11 16:01:49 +07:00
parent 10856f6cdf
commit 73654a402c
14 changed files with 358 additions and 25 deletions

data/sample.json (new file)

@ -0,0 +1,38 @@
{
    "quiz": {
        "sport": {
            "q1": {
                "question": "Which one is correct team name in NBA?",
                "options": [
                    "New York Bulls",
                    "Los Angeles Kings",
                    "Golden State Warriros",
                    "Huston Rocket"
                ],
                "answer": "Huston Rocket"
            }
        },
        "maths": {
            "q1": {
                "question": "5 + 7 = ?",
                "options": [
                    "10",
                    "11",
                    "12",
                    "13"
                ],
                "answer": "12"
            },
            "q2": {
                "question": "12 - 8 = ?",
                "options": [
                    "1",
                    "2",
                    "3",
                    "4"
                ],
                "answer": "4"
            }
        }
    }
}

ingestion/adapters/api_adapter.py

@ -9,6 +9,7 @@ from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .base import DataSourceAdapter
from loguru import logger

class ApiAdapter(DataSourceAdapter):
@ -20,7 +21,8 @@ class ApiAdapter(DataSourceAdapter):
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: float = 30,
        token: Optional[str] = None,
    ):
        """
        Initialize the API adapter.
@ -29,10 +31,14 @@ class ApiAdapter(DataSourceAdapter):
            url: Endpoint URL to fetch.
            headers: Optional HTTP headers.
            timeout: Timeout in seconds for the request.
            token: Optional bearer token for Authorization header.
        """
        self.url = url
        self.headers = headers or {}
        if token:
            self.headers["Authorization"] = f"Bearer {token}"
        self.timeout = timeout
        logger.info(f"Initializing ApiAdapter for URL: {url}")
        self.session = self._init_session()

    def _init_session(self) -> requests.Session:
@ -49,6 +55,7 @@ class ApiAdapter(DataSourceAdapter):
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        logger.debug("HTTP session initialized with retry strategy.")
        return session

    def fetch(self) -> List[Dict[str, Any]]:
@ -61,21 +68,27 @@ class ApiAdapter(DataSourceAdapter):
        Raises:
            RuntimeError: On network error, HTTP error, or JSON parse error.
        """
        logger.info(f"Fetching data from API: {self.url}")
        try:
            response = self.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()
            logger.debug(f"Received response with status code: {response.status_code}")
        except requests.exceptions.RequestException as e:
            logger.error(f"API request failed: {e}")
            raise RuntimeError(f"API request failed: {e}")

        try:
            data = response.json()
            logger.debug(f"Successfully parsed JSON response from {self.url}")
        except ValueError as e:
            logger.error(f"Failed to parse JSON response: {e}")
            raise RuntimeError(f"Failed to parse JSON response: {e}")

        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return [data]
        logger.error("Unexpected JSON structure: expected list or dict.")
        raise RuntimeError("Unexpected JSON structure: expected list or dict.")

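A minimal usage sketch of the adapter as changed above; the endpoint URL and token are placeholders, not values from this repository:

from ingestion.adapters.api_adapter import ApiAdapter

adapter = ApiAdapter(
    url="https://api.example.com/items",   # placeholder endpoint
    timeout=10,
    token="example-token",                 # sent as "Authorization: Bearer example-token"
)
records = adapter.fetch()                  # always a list of dicts, even for a single JSON object
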
ingestion/adapters/file_adapter.py

@ -6,27 +6,36 @@ from typing import List, Dict, Any
import json
import pandas as pd
from loguru import logger

from .base import DataSourceAdapter

class FileAdapter(DataSourceAdapter):
    """
    Adapter for reading data from local files (CSV or JSON), or from uploaded file-like objects.
    """

    def __init__(self, path: str = None, format: str = None, upload=None, upload_filename: str = None):
        """
        Initialize the file adapter.

        Args:
            path: Path to the input file (.csv or .json), optional if upload is provided.
            format: Optional file format (e.g., 'csv', 'json').
            upload: Optional file-like object (e.g., from upload).
            upload_filename: Optional original filename for validation/logging.
        """
        self.path = path
        self.format = format
        self.upload = upload
        self.upload_filename = upload_filename
        logger.info(f"Initialized FileAdapter for path: {path}, upload: {upload_filename}, format: {format}")

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Read and parse the file, returning a list of records.
        Supports both path-based and uploaded file-like inputs.

        Returns:
            List of dicts from the file contents.
@ -35,26 +44,68 @@ class FileAdapter(DataSourceAdapter):
            RuntimeError: On read or parse errors.
            ValueError: If file extension is unsupported.
        """
        if self.upload is not None:
            # Handle uploaded file-like object
            logger.info(f"Fetching data from uploaded file: {self.upload_filename or '[no filename]'}")
            if self.format == "csv" or (self.upload_filename and self.upload_filename.lower().endswith(".csv")):
                try:
                    self.upload.seek(0)
                    df = pd.read_csv(self.upload)
                    logger.debug(f"Successfully read uploaded CSV file: {self.upload_filename}")
                    return df.to_dict(orient="records")
                except Exception as e:
                    logger.error(f"Failed to read uploaded CSV '{self.upload_filename}': {e}")
                    raise RuntimeError(f"Failed to read uploaded CSV '{self.upload_filename}': {e}")
            elif self.format == "json" or (self.upload_filename and self.upload_filename.lower().endswith(".json")):
                try:
                    self.upload.seek(0)
                    data = json.load(self.upload)
                    logger.debug(f"Successfully read uploaded JSON file: {self.upload_filename}")
                    if isinstance(data, list):
                        return data
                    if isinstance(data, dict):
                        return [data]
                    logger.error(f"Uploaded JSON file '{self.upload_filename}' does not contain a list or dict.")
                    raise RuntimeError(
                        f"Uploaded JSON file '{self.upload_filename}' does not contain a list or dict."
                    )
                except Exception as e:
                    logger.error(f"Failed to read uploaded JSON '{self.upload_filename}': {e}")
                    raise RuntimeError(f"Failed to read uploaded JSON '{self.upload_filename}': {e}")
            else:
                logger.error(f"Unsupported uploaded file extension for '{self.upload_filename}'. Only .csv and .json are supported.")
                raise ValueError(
                    f"Unsupported uploaded file extension for '{self.upload_filename}'. "
                    "Only .csv and .json are supported."
                )

        # Fallback to path-based loading
        p = (self.path or "").lower()
        logger.info(f"Attempting to fetch data from file: {self.path}")
        if p.endswith(".csv"):
            try:
                df = pd.read_csv(self.path)
                logger.debug(f"Successfully read CSV file: {self.path}")
                return df.to_dict(orient="records")
            except Exception as e:
                logger.error(f"Failed to read CSV '{self.path}': {e}")
                raise RuntimeError(f"Failed to read CSV '{self.path}': {e}")
        if p.endswith(".json"):
            try:
                with open(self.path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                logger.debug(f"Successfully read JSON file: {self.path}")
                if isinstance(data, list):
                    return data
                if isinstance(data, dict):
                    return [data]
                logger.error(f"JSON file '{self.path}' does not contain a list or dict.")
                raise RuntimeError(
                    f"JSON file '{self.path}' does not contain a list or dict."
                )
            except Exception as e:
                logger.error(f"Failed to read JSON '{self.path}': {e}")
                raise RuntimeError(f"Failed to read JSON '{self.path}': {e}")
        logger.error(f"Unsupported file extension for '{self.path}'. Only .csv and .json are supported.")
        raise ValueError(
            f"Unsupported file extension for '{self.path}'. "
            "Only .csv and .json are supported."

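A usage sketch of both code paths; the CSV content and filename below are made up for illustration, while data/sample.json is the file added in this commit:

from io import StringIO
from ingestion.adapters.file_adapter import FileAdapter

# Path-based: the top-level JSON object is wrapped in a single-element list.
records = FileAdapter(path="data/sample.json").fetch()

# Upload-based: a file-like object plus the original filename used for format detection.
csv_upload = StringIO("name,score\nalice,10\nbob,7\n")
rows = FileAdapter(upload=csv_upload, upload_filename="scores.csv", format="csv").fetch()
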
ingestion/adapters/web_scraper_adapter.py

@ -21,6 +21,7 @@ from crawl4ai.extraction_strategy import (
)

from .base import DataSourceAdapter
from loguru import logger

class WebScraperAdapter(DataSourceAdapter):
@ -60,6 +61,7 @@ class WebScraperAdapter(DataSourceAdapter):
        self.output_format = output_format
        self.verbose = verbose
        self.cache_mode = cache_mode
        logger.info(f"Initialized WebScraperAdapter for URLs: {urls}")

    def fetch(self) -> List[Dict[str, Any]]:
        """
@ -71,15 +73,18 @@ class WebScraperAdapter(DataSourceAdapter):
        Raises:
            RuntimeError: On failure during crawling or extraction.
        """
        logger.info("Starting synchronous fetch for web scraping.")
        try:
            return asyncio.run(self._fetch_async())
        except Exception as e:
            logger.error(f"Web scraping failed: {e}")
            raise RuntimeError(f"Web scraping failed: {e}")

    async def _fetch_async(self) -> List[Dict[str, Any]]:
        """
        Internal async method to perform crawling and extraction.
        """
        logger.info("Starting async web scraping fetch.")
        # Initialize crawler
        browser_cfg = BrowserConfig(headless=True, verbose=self.verbose)
        crawler = AsyncWebCrawler(config=browser_cfg)
@ -96,7 +101,9 @@ class WebScraperAdapter(DataSourceAdapter):
                extraction_strategy = JsonCssExtractionStrategy(
                    schema=schema, verbose=self.verbose
                )
                logger.debug(f"Loaded schema file: {self.schema_file}")
            except Exception as e:
                logger.error(f"Failed to load schema file '{self.schema_file}': {e}")
                await crawler.close()
                raise RuntimeError(
                    f"Failed to load schema file '{self.schema_file}': {e}"
@ -109,7 +116,9 @@ class WebScraperAdapter(DataSourceAdapter):
                apply_chunking=True,
                verbose=self.verbose,
            )
            logger.debug("Using LLM extraction strategy.")
        else:
            logger.error("Either 'schema_file' or 'prompt' must be provided.")
            await crawler.close()
            raise ValueError("Either 'schema_file' or 'prompt' must be provided.")
@ -117,6 +126,7 @@ class WebScraperAdapter(DataSourceAdapter):
        try:
            cache_enum = getattr(CacheMode, self.cache_mode.upper())
        except AttributeError:
            logger.warning(f"Invalid cache mode '{self.cache_mode}', defaulting to ENABLED.")
            cache_enum = CacheMode.ENABLED

        run_cfg = CrawlerRunConfig(
@ -127,9 +137,11 @@ class WebScraperAdapter(DataSourceAdapter):
        # Execute crawl
        try:
            logger.info(f"Crawling URLs: {self.urls}")
            results: List[CrawlResult] = await crawler.arun_many(
                urls=self.urls, config=run_cfg
            )
            logger.debug(f"Crawling completed. Results: {results}")
        finally:
            await crawler.close()
@ -137,10 +149,16 @@ class WebScraperAdapter(DataSourceAdapter):
        records: List[Dict[str, Any]] = []
        for res in results:
            if not res.success or not res.extracted_content:
                logger.warning(f"Skipping failed or empty result for URL: {getattr(res, 'url', None)}")
                continue
            try:
                content = json.loads(res.extracted_content)
                logger.debug(f"Parsed extracted content for URL: {res.url}")
            except Exception:
                logger.error(f"Failed to parse extracted content for URL: {res.url}")
                continue
            if content is None:
                logger.warning(f"Extracted content is None for URL: {res.url}")
                continue
            if isinstance(content, list):
                for item in content:
@ -150,5 +168,8 @@ class WebScraperAdapter(DataSourceAdapter):
            elif isinstance(content, dict):
                content["source_url"] = res.url
                records.append(content)
            else:
                logger.warning(f"Extracted content for URL {res.url} is not a list or dict: {type(content)}")

        logger.info(f"Web scraping completed. Extracted {len(records)} records.")
        return records

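A hedged usage sketch: the constructor keywords below are inferred from the attributes referenced in this diff (urls, schema_file, prompt, output_format, verbose, cache_mode), and the target URL and schema path are placeholders:

from ingestion.adapters.web_scraper_adapter import WebScraperAdapter

adapter = WebScraperAdapter(
    urls=["https://example.com/products"],  # placeholder target
    schema_file="schema.json",              # CSS-extraction schema; a prompt can be given instead
    cache_mode="bypass",                    # invalid names fall back to CacheMode.ENABLED
    verbose=True,
)
records = adapter.fetch()                   # each record carries a "source_url" key
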
ingestion/ingestor.py

@ -7,6 +7,8 @@ from typing import List, Dict, Any
from ingestion.adapters.api_adapter import ApiAdapter
from ingestion.adapters.file_adapter import FileAdapter
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
from pydantic import BaseModel
from loguru import logger

class Ingestor:
@ -31,11 +33,24 @@ class Ingestor:
            ValueError: For unknown source types.
            RuntimeError: If an adapter fails during fetch.
        """
        from log.logging_utils import pipeline_log

        aggregated: List[Dict[str, Any]] = []
        logger.info("Starting ingestion run for sources.")
        for src in sources:
            # accept Pydantic models or raw dicts
            if isinstance(src, BaseModel):
                src_item = src.dict()
            else:
                src_item = src
            src_type = src_item.get("type")
            config = src_item.get("config", {})
            # convert BaseModel config to dict if needed
            if not isinstance(config, dict) and hasattr(config, "dict"):
                config = config.dict(exclude_unset=True)
            pipeline_id = config.get("pipeline_id") or src_item.get("pipeline_id")
            run_id = config.get("run_id") or src_item.get("run_id")
            logger.info(f"Processing source type: {src_type} with config: {config}")
            if src_type == "api":
                adapter = ApiAdapter(**config)
            elif src_type == "scrape":
@ -43,16 +58,22 @@ class Ingestor:
            elif src_type == "file":
                adapter = FileAdapter(**config)
            else:
                logger.error(f"Unknown source type: {src_type}")
                pipeline_log("ERROR", f"Unknown source type: {src_type}", pipeline_id, run_id, status="FAILED")
                raise ValueError(f"Unknown source type: {src_type}")
            try:
                logger.info(f"Fetching records using {src_type} adapter.")
                records = adapter.fetch()
                logger.info(f"Fetched {len(records)} records from {src_type} source.")
                aggregated.extend(records)
                pipeline_log("SUCCESS", f"Fetched {len(records)} records.", pipeline_id, run_id, status="COMPLETED")
            except Exception as e:
                logger.error(f"Fetch failed for source {src_type}: {e}")
                pipeline_log("ERROR", f"Fetch failed: {e}", pipeline_id, run_id, status="FAILED")
                raise RuntimeError(f"Fetch failed for source {src_type}: {e}")

        logger.info(f"Ingestion run completed. Total records aggregated: {len(aggregated)}")
        return aggregated

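A sketch of how sources reach the adapters; Ingestor.run is invoked this way from services.py (Ingestor.run(pipeline.sources)), and the API URL is a placeholder:

from ingestion.ingestor import Ingestor

sources = [
    {"type": "file", "config": {"path": "data/sample.json"}},
    {"type": "api", "config": {"url": "https://api.example.com/items", "timeout": 10}},  # placeholder URL
]
records = Ingestor.run(sources)  # Pydantic source models are accepted too and converted via .dict()
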
log/__init__.py (new, empty file)

log/log_stream.py (new file)

@ -0,0 +1,26 @@
from fastapi import APIRouter, Request, HTTPException
from fastapi.responses import StreamingResponse
from log.logging_utils import RUN_LOG_QUEUES
from queue import Empty
import asyncio
router = APIRouter()
@router.get("/pipelines/{pipeline_id}/runs/{run_id}/logs/stream")
async def stream_logs(request: Request, pipeline_id: str, run_id: str):
    log_queue = RUN_LOG_QUEUES.get(run_id)
    if not log_queue:
        raise HTTPException(status_code=404, detail="No logs for this run.")

    async def event_generator():
        while True:
            if await request.is_disconnected():
                break
            try:
                # Non-blocking read so the event loop is not stalled while waiting for logs.
                log_line = log_queue.get_nowait()
                yield f"data: {log_line}\n\n"
            except Empty:
                await asyncio.sleep(0.2)
                continue

    return StreamingResponse(event_generator(), media_type="text/event-stream")

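A sketch of a client consuming the stream; the host, port, and IDs are placeholders, and each "data:" line carries the JSON record produced by log/logging_utils.py:

import json
import requests

url = "http://localhost:8000/pipelines/<pipeline_id>/runs/<run_id>/logs/stream"  # placeholder host and IDs
with requests.get(url, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            entry = json.loads(line[len("data: "):])
            print(entry["level"], entry["message"])
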
log/logging_utils.py (new file)

@ -0,0 +1,69 @@
from loguru import logger
from queue import Queue
from typing import Dict, Optional
import json
# Per-run log queues (thread-safe)
RUN_LOG_QUEUES: Dict[str, Queue] = {}
RUN_LOG_HANDLERS: Dict[str, int] = {}
# Structured log format
def make_log_record(level: str, message: str, pipeline_id: Optional[str], run_id: Optional[str], status: Optional[str] = None, error: Optional[str] = None, extra: Optional[dict] = None) -> dict:
    record = {
        "level": level,
        "message": message,
        "pipeline_id": pipeline_id,
        "run_id": run_id,
        "status": status,
        "error": error,
        "extra": extra or {},
    }
    return record

# Custom loguru sink for per-run logging
def log_sink(message):
    record = message.record
    run_id = record["extra"].get("run_id")
    pipeline_id = record["extra"].get("pipeline_id")
    if run_id and run_id in RUN_LOG_QUEUES:
        # Structure the log as JSON for frontend parsing
        log_entry = make_log_record(
            level=record["level"].name,
            message=record["message"],
            pipeline_id=pipeline_id,
            run_id=run_id,
            status=record["extra"].get("status"),
            error=record["extra"].get("error"),
            extra=record["extra"]
        )
        RUN_LOG_QUEUES[run_id].put(json.dumps(log_entry))

# Setup per-run logging sink
def setup_run_logging(pipeline_id: str, run_id: str):
    log_queue = Queue()
    RUN_LOG_QUEUES[run_id] = log_queue
    handler_id = logger.add(
        log_sink,
        filter=lambda record: record["extra"].get("run_id") == run_id,
        enqueue=True
    )
    RUN_LOG_HANDLERS[run_id] = handler_id
    return log_queue

# Remove per-run logging sink and clean up
def cleanup_run_logging(run_id: str):
    if run_id in RUN_LOG_HANDLERS:
        logger.remove(RUN_LOG_HANDLERS[run_id])
        del RUN_LOG_HANDLERS[run_id]
    if run_id in RUN_LOG_QUEUES:
        del RUN_LOG_QUEUES[run_id]
# Helper for logging with context
def pipeline_log(level: str, message: str, pipeline_id: str, run_id: str, status: Optional[str] = None, error: Optional[str] = None, extra: Optional[dict] = None):
    # bind() places the context in record["extra"], which the per-run sink filter and log_sink read.
    logger.bind(pipeline_id=pipeline_id, run_id=run_id, status=status, error=error, **(extra or {})).log(level, message)
# Example usage:
# setup_run_logging(pipeline_id, run_id)
# pipeline_log("INFO", "Pipeline started", pipeline_id, run_id, status="RUNNING")
# pipeline_log("ERROR", "Pipeline failed", pipeline_id, run_id, status="FAILED", error="Some error")
# cleanup_run_logging(run_id)

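Expanding the usage comments above into a runnable sketch (the pipeline and run IDs are placeholders); because the sink is added with enqueue=True, records arrive on the queue asynchronously:

import json
from log.logging_utils import setup_run_logging, pipeline_log, cleanup_run_logging

queue = setup_run_logging("pipeline-1", "run-1")
pipeline_log("INFO", "Pipeline started", "pipeline-1", "run-1", status="RUNNING")
entry = json.loads(queue.get(timeout=5))   # structured record emitted by log_sink
print(entry["run_id"], entry["status"], entry["message"])
cleanup_run_logging("run-1")
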
main.py

@ -7,11 +7,19 @@ from uuid import UUID
from fastapi import FastAPI, HTTPException, BackgroundTasks

import platform
import asyncio

if platform.system() == "Windows":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

import models
import stores
import services
from log.log_stream import router as log_stream_router

app = FastAPI(title="Data Integration Pipeline API")
app.include_router(log_stream_router)

@app.post(
@ -137,4 +145,23 @@ def get_run_results(pipeline_id: UUID, run_id: UUID) -> List[Dict[str, Any]]:
            detail="Run not completed or has failed"
        )
    return run.results or []

# Dedicated endpoint to retrieve the error message for a failed run
@app.get(
    "/pipelines/{pipeline_id}/runs/{run_id}/error",
    response_model=str,
    summary="Get run error message"
)
def get_run_error(pipeline_id: UUID, run_id: UUID) -> str:
    """
    Retrieve the error message for a run that failed.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")
    run = stores.get_run(run_id)
    if not run or run.pipeline_id != pipeline_id:
        raise HTTPException(status_code=404, detail="Run not found")
    return run.error or ""

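A sketch of calling the new endpoint; the host, port, and IDs are placeholders:

import requests

resp = requests.get("http://localhost:8000/pipelines/<pipeline_id>/runs/<run_id>/error")  # placeholder host and IDs
resp.raise_for_status()
print(resp.json() or "(no error recorded)")  # response_model=str, so the body is a plain JSON string
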
models.py

@ -6,7 +6,7 @@ from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl, field_validator, ValidationInfo

class RunCreate(BaseModel):
@ -75,26 +75,54 @@ class ScrapeConfig(BaseModel):
class FileConfig(BaseModel):
    """
    Configuration for a file-based source. Supports either a file path or an uploaded file.
    """
    path: Optional[str] = Field(
        None,
        description="Path to the input file (optional if upload is provided)",
        example="/data/myfile.json"
    )
    upload: Optional[Any] = Field(
        None,
        description="Uploaded file object or metadata (optional if path is provided)",
        example=None
    )
    upload_filename: Optional[str] = Field(
        None,
        description="Original filename of the uploaded file (for validation)",
        example="myfile.json"
    )
    format: Literal["csv", "json", "sqlite"] = Field(
        "json",
        description="Format of the file",
        example="csv"
    )

    @field_validator("path", mode="before")
    def require_path_or_upload(cls, v, info: ValidationInfo):
        data = info.data
        if not v and not data.get("upload"):
            raise ValueError("Either 'path' or 'upload' must be provided.")
        return v

    @field_validator("upload_filename", mode="before")
    def filename_extension_matches_format(cls, v, info: ValidationInfo):
        fmt = info.data.get("format")
        if v and fmt and not v.lower().endswith(f".{fmt}"):
            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
        return v

    @field_validator("path", mode="after")
    def path_or_upload_extension_matches_format(cls, v, info: ValidationInfo):
        fmt = info.data.get("format")
        upload_filename = info.data.get("upload_filename")
        if v and fmt and not v.lower().endswith(f".{fmt}"):
            raise ValueError(f"File extension must match format '{fmt}'")
        if upload_filename and fmt and not upload_filename.lower().endswith(f".{fmt}"):
            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
        return v

class ApiSource(BaseModel):
    """

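A sketch showing how a FileConfig feeds the ingestor, assuming FileConfig lives in models.py as main.py's "import models" suggests:

from models import FileConfig

cfg = FileConfig(path="/data/myfile.json", format="json")
source = {"type": "file", "config": cfg.dict(exclude_unset=True)}  # the shape consumed by Ingestor.run
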
pyproject.toml

@ -8,6 +8,7 @@ dependencies = [
"crawl4ai>=0.5.0.post8", "crawl4ai>=0.5.0.post8",
"fastapi[standard]>=0.115.12", "fastapi[standard]>=0.115.12",
"inquirer>=3.4.0", "inquirer>=3.4.0",
"loguru>=0.7.3",
"pandas>=2.2.3", "pandas>=2.2.3",
"python-dotenv>=1.1.0", "python-dotenv>=1.1.0",
"rich>=14.0.0", "rich>=14.0.0",

schema.yaml (new file)

File diff suppressed because one or more lines are too long

services.py

@ -10,6 +10,7 @@ import stores
import models
from ingestion.ingestor import Ingestor
from normalization.normalizer import Normalizer
from log.logging_utils import setup_run_logging, cleanup_run_logging, pipeline_log

def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
@ -24,13 +25,19 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
    if not run:
        return

    # Setup structured per-run logging
    setup_run_logging(str(pipeline.id), str(run_id))
    pipeline_log("INFO", "Pipeline run starting", str(pipeline.id), str(run_id), status="RUNNING")

    # Mark as running
    run.status = 'RUNNING'
    run.started_at = datetime.utcnow()

    try:
        # Ingest raw records
        pipeline_log("INFO", "Ingesting raw records", str(pipeline.id), str(run_id))
        raw_records: List[Dict[str, Any]] = Ingestor.run(pipeline.sources)
        pipeline_log("INFO", f"Ingested {len(raw_records)} records", str(pipeline.id), str(run_id))

        # Normalize records
        normalizer = Normalizer()
@ -39,6 +46,7 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
            source_type = raw.get('source_type')
            source = raw.get('source')
            if not source_type or not source:
                pipeline_log("ERROR", "Record missing 'source_type' or 'source'", str(pipeline.id), str(run_id), status="FAILED")
                raise ValueError("Record missing 'source_type' or 'source'.")
            norm = normalizer.normalize([raw], source_type, source)
            canonical.extend(norm)
@ -47,9 +55,14 @@ def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
        run.status = 'COMPLETED'
        run.finished_at = datetime.utcnow()
        run.results = canonical
        pipeline_log("SUCCESS", f"Pipeline run completed with {len(canonical)} records", str(pipeline.id), str(run_id), status="COMPLETED")
    except Exception as e:
        # Log failure
        pipeline_log("ERROR", f"Pipeline run failed: {e}", str(pipeline.id), str(run_id), status="FAILED", error=str(e))
        run.status = 'FAILED'
        run.finished_at = datetime.utcnow()
        run.error = str(e)
    finally:
        pipeline_log("INFO", "Pipeline run finished", str(pipeline.id), str(run_id), status=run.status)
        cleanup_run_logging(str(run_id))

uv.lock

@ -305,6 +305,7 @@ dependencies = [
{ name = "crawl4ai" }, { name = "crawl4ai" },
{ name = "fastapi", extra = ["standard"] }, { name = "fastapi", extra = ["standard"] },
{ name = "inquirer" }, { name = "inquirer" },
{ name = "loguru" },
{ name = "pandas" }, { name = "pandas" },
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "rich" }, { name = "rich" },
@ -315,6 +316,7 @@ requires-dist = [
{ name = "crawl4ai", specifier = ">=0.5.0.post8" }, { name = "crawl4ai", specifier = ">=0.5.0.post8" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "inquirer", specifier = ">=3.4.0" }, { name = "inquirer", specifier = ">=3.4.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "pandas", specifier = ">=2.2.3" }, { name = "pandas", specifier = ">=2.2.3" },
{ name = "python-dotenv", specifier = ">=1.1.0" }, { name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "rich", specifier = ">=14.0.0" }, { name = "rich", specifier = ">=14.0.0" },
@ -813,6 +815,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cf/f4/25a25c75ec02fcec729cee95288635367f2cdf8add0416494d0c42842ccc/litellm-1.65.6-py3-none-any.whl", hash = "sha256:c65ec7676f251c4f28cfb7446a542d15f091fe0fb71d6d6e630d8c8849f9a76d", size = 7518562 }, { url = "https://files.pythonhosted.org/packages/cf/f4/25a25c75ec02fcec729cee95288635367f2cdf8add0416494d0c42842ccc/litellm-1.65.6-py3-none-any.whl", hash = "sha256:c65ec7676f251c4f28cfb7446a542d15f091fe0fb71d6d6e630d8c8849f9a76d", size = 7518562 },
] ]
[[package]]
name = "loguru"
version = "0.7.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "win32-setctime", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595 },
]
[[package]]
name = "lxml"
version = "5.3.2"
@ -1859,6 +1874,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743 },
]
[[package]]
name = "win32-setctime"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083 },
]
[[package]]
name = "xmod"
version = "1.8.1"