mirror of https://github.com/borbann-platform/backend-api.git
synced 2025-12-18 12:14:05 +01:00

initial commit
commit 10856f6cdf

.gitignore (vendored, new file, +14)
@@ -0,0 +1,14 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

.env

/ingestion/data

.python-version (new file, +1)
@@ -0,0 +1 @@
3.12

config.py (new file, +85)
@@ -0,0 +1,85 @@
# config.py
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from .env file located in the script's directory
# Make sure .env is in the *same directory* as config.py
dotenv_path = Path(__file__).parent / '.env'
if dotenv_path.is_file():
    load_dotenv(dotenv_path=dotenv_path)
else:
    print(f"Warning: .env file not found at {dotenv_path}")


# --- Default Settings ---
DEFAULT_OUTPUT_FILE = "extracted_data.json"
DEFAULT_OUTPUT_FORMAT = "json"  # csv, json, sqlite
DEFAULT_CACHE_MODE = "ENABLED"  # ENABLED, BYPASS, DISABLED, READ_ONLY, WRITE_ONLY
DEFAULT_VERBOSE = False
DEFAULT_LLM_PROVIDER = "openai/gpt-4o-mini"  # Default LLM

# --- LLM Provider Configuration ---
PROVIDER_ENV_MAP = {
    "openai": "OPENAI_API_KEY",
    "gemini": "GEMINI_API_KEY",
    "groq": "GROQ_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "ollama": None,  # Ollama typically doesn't require an API key
    # Add other providers and their corresponding env variable names here
}


def get_api_key_env_name(provider: str) -> str | None:
    """Gets the expected environment variable name for the given provider."""
    provider_prefix = provider.split('/')[0].lower()
    return PROVIDER_ENV_MAP.get(provider_prefix)


def get_api_key(provider: str, direct_key: str | None = None, env_var_name: str | None = None) -> str | None:
    """
    Retrieves the API key for a given provider.
    Priority: direct_key > env_var_name > default env var from PROVIDER_ENV_MAP.
    """
    if direct_key:
        print(f"INFO: Using direct API key provided via --api-key for provider '{provider}'.")
        return direct_key

    if env_var_name:
        key = os.getenv(env_var_name)
        if key:
            print(
                f"INFO: Using API key from specified environment variable '{env_var_name}' for provider '{provider}'."
            )
            return key
        else:
            print(f"Warning: Specified environment variable '{env_var_name}' not found.")

    default_env_name = get_api_key_env_name(provider)
    if default_env_name:
        key = os.getenv(default_env_name)
        if key:
            print(
                f"INFO: Using API key from default environment variable '{default_env_name}' for provider '{provider}'."
            )
            return key
        else:
            if default_env_name is not None:  # Don't warn if provider like Ollama has None mapping
                print(
                    f"Warning: Default environment variable '{default_env_name}' for provider '{provider}' not found."
                )
            return None

    # If provider is not in map and no key was provided
    # Allow providers like 'ollama' to proceed without a key
    if provider.split('/')[0].lower() != "ollama":
        print(f"Warning: No API key found or specified for provider '{provider}'. LLM features might fail.")
    return None


# --- Exportable Configuration Variables ---
LLM_PROVIDER = os.getenv("DEFAULT_LLM_PROVIDER", DEFAULT_LLM_PROVIDER)
OUTPUT_FILE = os.getenv("DEFAULT_OUTPUT_FILE", DEFAULT_OUTPUT_FILE)
OUTPUT_FORMAT = os.getenv("DEFAULT_OUTPUT_FORMAT", DEFAULT_OUTPUT_FORMAT)
CACHE_MODE = os.getenv("DEFAULT_CACHE_MODE", DEFAULT_CACHE_MODE)
VERBOSE = os.getenv("DEFAULT_VERBOSE", str(DEFAULT_VERBOSE)).lower() in ('true', '1', 't')
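
As a quick illustration of the key-resolution priority above, the sketch below (not part of the commit; provider and model names are illustrative) calls get_api_key with and without a direct key. Provider prefixes map to the environment variables listed in PROVIDER_ENV_MAP.

# Illustrative only: assumes the relevant keys are set in .env or the environment.
from config import get_api_key

# Resolved from the default mapping: "gemini" -> GEMINI_API_KEY.
gemini_key = get_api_key("gemini/gemini-2.0-flash")

# A direct key always wins over any environment variable.
cli_key = get_api_key("openai/gpt-4o-mini", direct_key="sk-not-a-real-key")

# Ollama has no mapping (None), so no key is required and no warning is printed.
ollama_key = get_api_key("ollama/llama3")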

ingestion/adapters/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
"""
Adapters package for the ingestion layer.
"""

ingestion/adapters/api_adapter.py (new file, +81)
@@ -0,0 +1,81 @@
"""
API adapter to fetch JSON data from HTTP endpoints.
"""

from typing import List, Dict, Any, Optional

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from .base import DataSourceAdapter


class ApiAdapter(DataSourceAdapter):
    """
    Adapter for fetching data from a REST API endpoint.
    """

    def __init__(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: float = 30
    ):
        """
        Initialize the API adapter.

        Args:
            url: Endpoint URL to fetch.
            headers: Optional HTTP headers.
            timeout: Timeout in seconds for the request.
        """
        self.url = url
        self.headers = headers or {}
        self.timeout = timeout
        self.session = self._init_session()

    def _init_session(self) -> requests.Session:
        """
        Initialize a requests.Session with retry logic.
        """
        session = requests.Session()
        retries = Retry(
            total=3,
            backoff_factor=0.3,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"]
        )
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Perform a GET request and return JSON data as a list of records.

        Returns:
            List of dicts from the JSON response.

        Raises:
            RuntimeError: On network error, HTTP error, or JSON parse error.
        """
        try:
            response = self.session.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"API request failed: {e}")

        try:
            data = response.json()
        except ValueError as e:
            raise RuntimeError(f"Failed to parse JSON response: {e}")

        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return [data]
        raise RuntimeError("Unexpected JSON structure: expected list or dict.")
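
A minimal usage sketch of the adapter above (illustrative, not part of the commit); the retry behaviour comes from the Retry policy in _init_session, and the endpoint is the same placeholder used in the ingestor example further down.

from ingestion.adapters.api_adapter import ApiAdapter

# GETs the endpoint with up to 3 retries on 5xx responses, then
# normalizes the JSON body to a list of dicts.
adapter = ApiAdapter(
    url="https://dummyjson.com/products",
    headers={"Accept": "application/json"},
    timeout=10,
)
records = adapter.fetch()
print(f"Fetched {len(records)} record(s)")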

ingestion/adapters/base.py (new file, +20)
@@ -0,0 +1,20 @@
"""
Define the DataSourceAdapter protocol for ingestion adapters.
"""

from typing import Protocol, List, Dict, Any


class DataSourceAdapter(Protocol):
    """
    Protocol for data source adapters.
    """

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Fetch data from the source.

        Returns:
            A list of records, each represented as a dict.
        """
        ...
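
Because DataSourceAdapter is a typing.Protocol, any class with a matching fetch() method satisfies it without inheriting from it. The hypothetical in-memory adapter below (not part of the commit) illustrates the shape.

from typing import List, Dict, Any

class StaticAdapter:
    """Hypothetical adapter that returns canned records; it satisfies
    DataSourceAdapter structurally because it defines fetch() -> List[Dict[str, Any]]."""

    def __init__(self, records: List[Dict[str, Any]]):
        self.records = records

    def fetch(self) -> List[Dict[str, Any]]:
        return list(self.records)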

ingestion/adapters/file_adapter.py (new file, +61)
@@ -0,0 +1,61 @@
"""
File adapter to load data from CSV or JSON files.
"""

from typing import List, Dict, Any
import json

import pandas as pd

from .base import DataSourceAdapter


class FileAdapter(DataSourceAdapter):
    """
    Adapter for reading data from local files (CSV or JSON).
    """

    def __init__(self, path: str):
        """
        Initialize the file adapter.

        Args:
            path: Path to the input file (.csv or .json).
        """
        self.path = path

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Read and parse the file, returning a list of records.

        Returns:
            List of dicts from the file contents.

        Raises:
            RuntimeError: On read or parse errors.
            ValueError: If file extension is unsupported.
        """
        p = self.path.lower()
        if p.endswith(".csv"):
            try:
                df = pd.read_csv(self.path)
                return df.to_dict(orient="records")
            except Exception as e:
                raise RuntimeError(f"Failed to read CSV '{self.path}': {e}")
        if p.endswith(".json"):
            try:
                with open(self.path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    return data
                if isinstance(data, dict):
                    return [data]
                raise RuntimeError(
                    f"JSON file '{self.path}' does not contain a list or dict."
                )
            except Exception as e:
                raise RuntimeError(f"Failed to read JSON '{self.path}': {e}")
        raise ValueError(
            f"Unsupported file extension for '{self.path}'. "
            "Only .csv and .json are supported."
        )
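
Usage sketch (illustrative paths): CSV files go through pandas, JSON files through json.load, and anything else raises ValueError.

from ingestion.adapters.file_adapter import FileAdapter

csv_records = FileAdapter("data/sample.csv").fetch()    # list of row dicts via pandas
json_records = FileAdapter("data/sample.json").fetch()  # a list, or a single dict wrapped in a list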

ingestion/adapters/web_scraper_adapter.py (new file, +154)
@@ -0,0 +1,154 @@
"""
Web scraper adapter using crawl4ai to extract structured data.
"""

import asyncio
import json
from typing import List, Dict, Any, Optional

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMConfig,
    CrawlResult,
)
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
    ExtractionStrategy,
)

from .base import DataSourceAdapter


class WebScraperAdapter(DataSourceAdapter):
    """
    Adapter for web scraping using crawl4ai.
    """

    def __init__(
        self,
        urls: List[str],
        schema_file: Optional[str] = None,
        prompt: Optional[str] = None,
        llm_provider: str = "openai/gpt-4",
        api_key: Optional[str] = None,
        output_format: str = "json",
        verbose: bool = False,
        cache_mode: str = "ENABLED",
    ):
        """
        Initialize the scraper adapter.

        Args:
            urls: List of URLs to scrape.
            schema_file: Path to a JSON file with CSS extraction schema.
            prompt: Prompt for LLM-based extraction.
            llm_provider: LLM provider identifier.
            api_key: API key for the LLM provider.
            output_format: Desired format for the extracted data.
            verbose: Enable verbose logging.
            cache_mode: Crawl cache mode (e.g., 'ENABLED').
        """
        self.urls = urls
        self.schema_file = schema_file
        self.prompt = prompt
        self.llm_provider = llm_provider
        self.api_key = api_key
        self.output_format = output_format
        self.verbose = verbose
        self.cache_mode = cache_mode

    def fetch(self) -> List[Dict[str, Any]]:
        """
        Synchronously fetch data by running the async crawler.

        Returns:
            List of extracted records.

        Raises:
            RuntimeError: On failure during crawling or extraction.
        """
        try:
            return asyncio.run(self._fetch_async())
        except Exception as e:
            raise RuntimeError(f"Web scraping failed: {e}")

    async def _fetch_async(self) -> List[Dict[str, Any]]:
        """
        Internal async method to perform crawling and extraction.
        """
        # Initialize crawler
        browser_cfg = BrowserConfig(headless=True, verbose=self.verbose)
        crawler = AsyncWebCrawler(config=browser_cfg)
        await crawler.start()

        # Prepare extraction strategy
        llm_cfg = LLMConfig(provider=self.llm_provider, api_token=self.api_key)
        extraction_strategy: Optional[ExtractionStrategy] = None

        if self.schema_file:
            try:
                with open(self.schema_file, "r", encoding="utf-8") as f:
                    schema = json.load(f)
                extraction_strategy = JsonCssExtractionStrategy(
                    schema=schema, verbose=self.verbose
                )
            except Exception as e:
                await crawler.close()
                raise RuntimeError(
                    f"Failed to load schema file '{self.schema_file}': {e}"
                )
        elif self.prompt:
            extraction_strategy = LLMExtractionStrategy(
                llm_config=llm_cfg,
                instruction=self.prompt,
                extraction_type="schema",
                apply_chunking=True,
                verbose=self.verbose,
            )
        else:
            await crawler.close()
            raise ValueError("Either 'schema_file' or 'prompt' must be provided.")

        # Configure cache mode
        try:
            cache_enum = getattr(CacheMode, self.cache_mode.upper())
        except AttributeError:
            cache_enum = CacheMode.ENABLED

        run_cfg = CrawlerRunConfig(
            cache_mode=cache_enum,
            extraction_strategy=extraction_strategy,
            verbose=self.verbose,
        )

        # Execute crawl
        try:
            results: List[CrawlResult] = await crawler.arun_many(
                urls=self.urls, config=run_cfg
            )
        finally:
            await crawler.close()

        # Process crawl results
        records: List[Dict[str, Any]] = []
        for res in results:
            if not res.success or not res.extracted_content:
                continue
            try:
                content = json.loads(res.extracted_content)
            except Exception:
                continue
            if isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        item["source_url"] = res.url
                records.extend(content)
            elif isinstance(content, dict):
                content["source_url"] = res.url
                records.append(content)

        return records
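
A prompt-driven usage sketch (illustrative, not part of the commit); the URL, prompt, and provider are taken from the ingestor example further down, and the API key is assumed to come from the environment via config.get_api_key rather than being hard-coded.

from config import get_api_key
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter

adapter = WebScraperAdapter(
    urls=["https://www.hipflat.co.th/en"],
    prompt="Extract all listings",                      # LLM-based extraction path
    llm_provider="gemini/gemini-2.0-flash",
    api_key=get_api_key("gemini/gemini-2.0-flash"),     # resolved from GEMINI_API_KEY
    verbose=True,
)
records = adapter.fetch()  # each extracted record carries a 'source_url' field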

ingestion/ingestor.py (new file, +91)
@@ -0,0 +1,91 @@
"""
Ingestor module to orchestrate data ingestion from multiple adapters.
"""

from typing import List, Dict, Any

from ingestion.adapters.api_adapter import ApiAdapter
from ingestion.adapters.file_adapter import FileAdapter
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter


class Ingestor:
    """
    Ingestor for aggregating data from various sources.
    """

    @staticmethod
    def run(sources: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Run ingestion for a list of sources.

        Args:
            sources: List of dicts, each with:
                - type: 'api', 'scrape', or 'file'
                - config: kwargs for the adapter constructor

        Returns:
            Flat list of all records fetched from sources.

        Raises:
            ValueError: For unknown source types.
            RuntimeError: If an adapter fails during fetch.
        """
        aggregated: List[Dict[str, Any]] = []

        for src in sources:
            src_type = src.get("type")
            config = src.get("config", {})
            if src_type == "api":
                adapter = ApiAdapter(**config)
            elif src_type == "scrape":
                adapter = WebScraperAdapter(**config)
            elif src_type == "file":
                adapter = FileAdapter(**config)
            else:
                raise ValueError(f"Unknown source type: {src_type}")

            try:
                data = adapter.fetch()
                aggregated.extend(data)
            except Exception as e:
                raise RuntimeError(
                    f"Ingestion failed for source '{src_type}' with config {config}: {e}"
                )

        return aggregated


if __name__ == "__main__":
    # Example usage of the Ingestor.
    example_sources = [
        {
            "type": "api",
            "config": {
                "url": "https://dummyjson.com/products",
                "headers": {"Accept": "application/json"},
            },
        },
        {
            "type": "file",
            "config": {"path": "data/sample.json"},
        },
        {
            "type": "scrape",
            "config": {
                "urls": ["https://www.hipflat.co.th/en"],
                "schema_file": None,
                "prompt": "Extract all listings",
                "llm_provider": "gemini/gemini-2.0-flash",
                "api_key": "YOUR_GEMINI_API_KEY",  # placeholder; do not commit real keys, load them from the environment
                "output_format": "json",
                "verbose": False,
                "cache_mode": "ENABLED",
            },
        },
    ]

    records = Ingestor.run(example_sources)
    print(f"Total records ingested: {len(records)}")
    for record in records:
        print(record)

main.py (new file, +140)
@@ -0,0 +1,140 @@
"""
FastAPI service for managing and running data integration pipelines.
"""

from typing import List, Dict, Any
from uuid import UUID

from fastapi import FastAPI, HTTPException, BackgroundTasks

import models
import stores
import services

app = FastAPI(title="Data Integration Pipeline API")


@app.post(
    "/pipelines",
    response_model=models.Pipeline,
    status_code=201,
    summary="Create a new pipeline"
)
def create_pipeline(pipeline_in: models.PipelineCreate) -> models.Pipeline:
    """
    Register a new pipeline with sources configuration.
    """
    return stores.create_pipeline(pipeline_in)


@app.get(
    "/pipelines",
    response_model=List[models.Pipeline],
    summary="List all pipelines"
)
def list_pipelines() -> List[models.Pipeline]:
    """
    Retrieve all registered pipelines.
    """
    return stores.list_pipelines()


@app.get(
    "/pipelines/{pipeline_id}",
    response_model=models.Pipeline,
    summary="Get a pipeline by ID"
)
def get_pipeline(pipeline_id: UUID) -> models.Pipeline:
    """
    Fetch details of a specific pipeline.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")
    return pipeline


@app.post(
    "/pipelines/{pipeline_id}/run",
    response_model=models.Run,
    status_code=201,
    summary="Trigger a pipeline run"
)
def run_pipeline(
    pipeline_id: UUID,
    background_tasks: BackgroundTasks
) -> models.Run:
    """
    Start a new run for the given pipeline. Runs asynchronously.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    run = stores.create_run(pipeline_id)
    background_tasks.add_task(services.execute_pipeline, pipeline, run.id)
    return run


@app.get(
    "/pipelines/{pipeline_id}/runs",
    response_model=List[models.Run],
    summary="List runs for a pipeline"
)
def list_runs(pipeline_id: UUID) -> List[models.Run]:
    """
    List all runs associated with a pipeline.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    runs = stores.list_runs_for_pipeline(pipeline_id)
    # Return only the Run fields (omit results/error)
    return [models.Run(**r.model_dump()) for r in runs]


@app.get(
    "/pipelines/{pipeline_id}/runs/{run_id}",
    response_model=models.Run,
    summary="Get run status"
)
def get_run(pipeline_id: UUID, run_id: UUID) -> models.Run:
    """
    Retrieve the status of a specific run.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    run = stores.get_run(run_id)
    if not run or run.pipeline_id != pipeline_id:
        raise HTTPException(status_code=404, detail="Run not found")

    return models.Run(**run.model_dump())


@app.get(
    "/pipelines/{pipeline_id}/runs/{run_id}/results",
    response_model=List[Dict[str, Any]],
    summary="Get run results"
)
def get_run_results(pipeline_id: UUID, run_id: UUID) -> List[Dict[str, Any]]:
    """
    Retrieve normalized results of a completed run.
    """
    pipeline = stores.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    run = stores.get_run(run_id)
    if not run or run.pipeline_id != pipeline_id:
        raise HTTPException(status_code=404, detail="Run not found")

    if run.status != 'COMPLETED':
        raise HTTPException(
            status_code=409,
            detail="Run not completed or has failed"
        )

    return run.results or []
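
For reference, a client-side sketch of the endpoints above using requests (not part of the commit; it assumes the app is running locally, e.g. via uvicorn, on port 8000). The payload shape follows PipelineCreate in models.py.

import requests

BASE = "http://127.0.0.1:8000"  # assumed local dev server

# Create a pipeline with one file source (payload shape from models.PipelineCreate).
payload = {
    "name": "My Data Pipeline",
    "sources": [
        {"type": "file", "config": {"path": "/data/myfile.json", "format": "json"}},
    ],
}
pipeline = requests.post(f"{BASE}/pipelines", json=payload).json()

# Trigger a run and poll its status.
run = requests.post(f"{BASE}/pipelines/{pipeline['id']}/run").json()
status = requests.get(f"{BASE}/pipelines/{pipeline['id']}/runs/{run['id']}").json()
print(status["status"])  # PENDING / RUNNING / COMPLETED / FAILED

# Once the run is COMPLETED, fetch the normalized results.
results = requests.get(
    f"{BASE}/pipelines/{pipeline['id']}/runs/{run['id']}/results"
).json()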

models.py (new file, +167)
@@ -0,0 +1,167 @@
"""
Pydantic models for pipelines and runs.
"""

from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID
from datetime import datetime

from pydantic import BaseModel, Field, HttpUrl, model_validator


class RunCreate(BaseModel):
    """
    Model for creating a new run. (Empty)
    """
    pass


class Run(BaseModel):
    """
    Status of a pipeline run.
    """
    id: UUID
    pipeline_id: UUID
    status: Literal['PENDING', 'RUNNING', 'COMPLETED', 'FAILED']
    started_at: datetime
    finished_at: Optional[datetime] = None


class RunResult(Run):
    """
    Extended run model including results or error.
    """
    results: Optional[List[Dict[str, Any]]] = None
    error: Optional[str] = None


class ApiConfig(BaseModel):
    """
    Configuration for an API source.
    """
    url: HttpUrl = Field(
        ...,
        description="API endpoint URL",
        example="https://api.example.com/data"
    )
    token: Optional[str] = Field(
        None,
        description="Optional bearer token for API authentication",
        example="abcdef123456"
    )


class ScrapeConfig(BaseModel):
    """
    Configuration for a web-scraping source.
    """
    urls: List[HttpUrl] = Field(
        ...,
        description="List of URLs to scrape",
        example=["https://example.com/page1", "https://example.com/page2"]
    )
    schema_file: Optional[str] = Field(
        None,
        description="Path to a JSON file containing CSS extraction schema",
        example="schemas/page_schema.json"
    )
    prompt: Optional[str] = Field(
        None,
        description="Prompt string for LLM-based extraction",
        example="Extract product titles and prices"
    )


class FileConfig(BaseModel):
    """
    Configuration for a file-based source.
    """
    path: str = Field(
        ...,
        description="Path to the input file",
        example="/data/myfile.json"
    )
    format: Literal["csv", "json", "sqlite"] = Field(
        "json",
        description="Format of the file",
        example="csv"
    )

    # Pydantic v2: validate across fields after the model is constructed.
    @model_validator(mode="after")
    def path_extension_matches_format(self) -> "FileConfig":
        if self.format and not self.path.lower().endswith(f".{self.format}"):
            raise ValueError(f"File extension must match format '{self.format}'")
        return self


class ApiSource(BaseModel):
    """
    An API-based data source.
    """
    type: Literal["api"] = Field(
        "api", description="Discriminator for API source"  # Removed const=True
    )
    config: ApiConfig


class ScrapeSource(BaseModel):
    """
    A web-scraping data source.
    """
    type: Literal["scrape"] = Field(
        "scrape", description="Discriminator for scrape source"  # Removed const=True
    )
    config: ScrapeConfig


class FileSource(BaseModel):
    """
    A file-based data source.
    """
    type: Literal["file"] = Field(
        "file", description="Discriminator for file source"  # Removed const=True
    )
    config: FileConfig


Source = Annotated[
    Union[ApiSource, ScrapeSource, FileSource],
    Field(discriminator="type", description="Union of all source types")
]


class PipelineCreate(BaseModel):
    """
    Payload for creating a new pipeline.
    """
    name: Optional[str] = Field(
        None,
        description="Optional human-readable name for the pipeline",
        example="My Data Pipeline"
    )
    sources: List[Source] = Field(
        ...,
        description="List of data sources for this pipeline"
    )


class Pipeline(BaseModel):
    """
    Representation of a pipeline.
    """
    id: UUID = Field(
        ...,
        description="Unique identifier for the pipeline"
    )
    name: Optional[str] = Field(
        None,
        description="Optional human-readable name for the pipeline"
    )
    sources: List[Source] = Field(
        ...,
        description="List of configured data sources"
    )
    created_at: datetime = Field(
        ...,
        description="UTC timestamp when the pipeline was created"
    )
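
The Source union is discriminated on the `type` field, so Pydantic picks the matching config model from the payload. A small validation sketch (not part of the commit; values reuse the field examples above):

from models import PipelineCreate

payload = {
    "name": "My Data Pipeline",
    "sources": [
        {"type": "api", "config": {"url": "https://api.example.com/data"}},
        {"type": "scrape", "config": {"urls": ["https://example.com/page1"],
                                      "prompt": "Extract product titles and prices"}},
        {"type": "file", "config": {"path": "/data/myfile.json", "format": "json"}},
    ],
}
pipeline_in = PipelineCreate.model_validate(payload)
print(type(pipeline_in.sources[0]).__name__)  # ApiSource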

normalization/__init__.py (new file, +6)
@@ -0,0 +1,6 @@
"""
Normalization package for data integration service.

Provides utilities and classes to normalize raw records
into a canonical schema.
"""

normalization/base.py (new file, +23)
@@ -0,0 +1,23 @@
"""
Base module defining protocols for the normalization layer.
"""

from typing import Protocol, Dict, Any


class TextExtractor(Protocol):
    """
    Protocol for text extraction strategies.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """
        Extract and return text from a flattened record.

        Args:
            record: A flattened record dict.

        Returns:
            A string containing the extracted text.
        """
        ...
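
Like DataSourceAdapter, TextExtractor is a structural Protocol: any object with a matching extract() method satisfies it. The hypothetical extractor below (not part of the commit) could be passed to Normalizer(extractor=...) defined in normalizer.py further down.

from typing import Dict, Any

class TitleOnlyExtractor:
    """Hypothetical extractor: satisfies TextExtractor by defining extract();
    it keeps only values whose flattened key ends in 'title'."""

    def extract(self, record: Dict[str, Any]) -> str:
        return " ".join(
            str(v) for k, v in record.items() if k.lower().endswith("title")
        )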

normalization/normalizer.py (new file, +86)
@@ -0,0 +1,86 @@
"""
Normalizer module to transform raw records into a canonical schema.
"""

from typing import List, Dict, Any, Optional

from .base import TextExtractor
from .utils import flatten_dict, generate_id, extract_all_text


class _DefaultTextExtractor:
    """
    Default text extractor using the extract_all_text utility.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """
        Extract text from the record.

        Args:
            record: A flattened record dict.

        Returns:
            A string containing concatenated text values.
        """
        return extract_all_text(record)


class Normalizer:
    """
    Class to normalize raw records into a canonical format.
    """

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """
        Initialize the Normalizer.

        Args:
            extractor: Optional custom TextExtractor strategy.
        """
        self.extractor: TextExtractor = extractor or _DefaultTextExtractor()

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """
        Normalize a list of raw records.

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records matching the schema.
        """
        normalized: List[Dict[str, Any]] = []

        for raw in records:
            flat = flatten_dict(raw)
            text = self.extractor.extract(flat)
            rec_id = generate_id(source, flat)
            metadata = {k: v for k, v in flat.items() if not isinstance(v, str)}

            canonical = {
                "id": rec_id,
                "source_type": source_type,
                "source": source,
                "raw": raw,
                "metadata": metadata,
                "text": text,
            }
            normalized.append(canonical)

        return normalized


if __name__ == "__main__":
    # Example usage
    sample = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    norm = Normalizer()
    records = norm.normalize(sample, source_type="api", source="https://example.com")
    print(records)

normalization/utils.py (new file, +64)
@@ -0,0 +1,64 @@
"""
Utility functions for the normalization layer.
"""

import json
import uuid
from typing import Dict, Any


def flatten_dict(
    d: Dict[str, Any],
    parent_key: str = "",
    sep: str = "."
) -> Dict[str, Any]:
    """
    Recursively flatten a nested dictionary.

    Args:
        d: The dictionary to flatten.
        parent_key: The base key string for recursion.
        sep: Separator between keys.

    Returns:
        A flattened dictionary with compound keys.
    """
    items: Dict[str, Any] = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items


def generate_id(source: str, record: Dict[str, Any]) -> str:
    """
    Generate a stable UUID based on source and record content.

    Args:
        source: Identifier for the data source (URL or file path).
        record: The flattened record dict.

    Returns:
        A string representation of a UUID.
    """
    record_json = json.dumps(record, sort_keys=True)
    namespace = uuid.NAMESPACE_URL
    uid = uuid.uuid5(namespace, f"{source}-{record_json}")
    return str(uid)


def extract_all_text(record: Dict[str, Any]) -> str:
    """
    Extract all string values from the record and concatenate them.

    Args:
        record: A flattened record dict.

    Returns:
        A single string containing all text values separated by spaces.
    """
    texts = [v for v in record.values() if isinstance(v, str)]
    return " ".join(texts)
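
A quick worked example of the three helpers (input reuses the sample from normalizer.py): flatten_dict produces dotted compound keys, extract_all_text keeps only string values, and generate_id is deterministic for the same source and flattened record.

from normalization.utils import flatten_dict, generate_id, extract_all_text

flat = flatten_dict({"title": "Hello", "details": {"body": "World", "count": 5}})
# {'title': 'Hello', 'details.body': 'World', 'details.count': 5}

print(extract_all_text(flat))                    # "Hello World"
print(generate_id("https://example.com", flat))  # same UUID every time for this input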

pyproject.toml (new file, +14)
@@ -0,0 +1,14 @@
[project]
name = "crawler-ai"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "crawl4ai>=0.5.0.post8",
    "fastapi[standard]>=0.115.12",
    "inquirer>=3.4.0",
    "pandas>=2.2.3",
    "python-dotenv>=1.1.0",
    "rich>=14.0.0",
]

services.py (new file, +55)
@@ -0,0 +1,55 @@
"""
Background service to execute pipelines: ingestion → normalization.
"""

from typing import List, Dict, Any
from uuid import UUID
from datetime import datetime

import stores
import models
from ingestion.ingestor import Ingestor
from normalization.normalizer import Normalizer


def execute_pipeline(pipeline: models.Pipeline, run_id: UUID) -> None:
    """
    Execute a pipeline: ingest data, normalize it, and update run status.

    Args:
        pipeline: The Pipeline model to run.
        run_id: UUID of the RunResult to update.
    """
    run = stores.runs.get(run_id)
    if not run:
        return

    # Mark as running
    run.status = 'RUNNING'
    run.started_at = datetime.utcnow()

    try:
        # Ingest raw records (the Ingestor expects plain dicts, not Pydantic models)
        source_dicts = [src.model_dump(mode="json") for src in pipeline.sources]
        raw_records: List[Dict[str, Any]] = Ingestor.run(source_dicts)

        # Normalize records
        normalizer = Normalizer()
        canonical: List[Dict[str, Any]] = []
        for raw in raw_records:
            source_type = raw.get('source_type')
            source = raw.get('source')
            if not source_type or not source:
                raise ValueError("Record missing 'source_type' or 'source'.")
            norm = normalizer.normalize([raw], source_type, source)
            canonical.extend(norm)

        # Success
        run.status = 'COMPLETED'
        run.finished_at = datetime.utcnow()
        run.results = canonical

    except Exception as e:
        # Failure
        run.status = 'FAILED'
        run.finished_at = datetime.utcnow()
        run.error = str(e)

stores.py (new file, +76)
@@ -0,0 +1,76 @@
"""
In-memory stores for pipelines and runs.
"""

from typing import Dict, List, Optional
from uuid import UUID, uuid4
from datetime import datetime

import models

# In-memory storage
pipelines: Dict[UUID, models.Pipeline] = {}
runs: Dict[UUID, models.RunResult] = {}


def create_pipeline(pipeline_in: models.PipelineCreate) -> models.Pipeline:
    """
    Create and store a new pipeline.
    """
    pipeline_id = uuid4()
    now = datetime.utcnow()
    pipeline = models.Pipeline(
        id=pipeline_id,
        name=pipeline_in.name,
        sources=pipeline_in.sources,
        created_at=now,
    )
    pipelines[pipeline_id] = pipeline
    return pipeline


def get_pipeline(pipeline_id: UUID) -> Optional[models.Pipeline]:
    """
    Retrieve a pipeline by its ID.
    """
    return pipelines.get(pipeline_id)


def list_pipelines() -> List[models.Pipeline]:
    """
    List all registered pipelines.
    """
    return list(pipelines.values())


def create_run(pipeline_id: UUID) -> models.RunResult:
    """
    Create and store a new run for a given pipeline.
    """
    run_id = uuid4()
    now = datetime.utcnow()
    run = models.RunResult(
        id=run_id,
        pipeline_id=pipeline_id,
        status='PENDING',
        started_at=now,
        finished_at=None,
        results=None,
        error=None,
    )
    runs[run_id] = run
    return run


def get_run(run_id: UUID) -> Optional[models.RunResult]:
    """
    Retrieve a run by its ID.
    """
    return runs.get(run_id)


def list_runs_for_pipeline(pipeline_id: UUID) -> List[models.RunResult]:
    """
    List all runs for a specific pipeline.
    """
    return [r for r in runs.values() if r.pipeline_id == pipeline_id]