# backend-api/models.py
# 2025-05-11 16:01:49 +07:00
#
# 195 lines
# 5.2 KiB
# Python

"""
Pydantic models for pipelines and runs.
"""
from datetime import datetime
from typing import List, Union, Annotated, Optional, Literal, Dict, Any
from uuid import UUID

from pydantic import (
    BaseModel,
    Field,
    HttpUrl,
    ValidationInfo,
    field_validator,
    model_validator,
)
class RunCreate(BaseModel):
    """Model for creating a new run. (Empty)"""

    # No fields are declared: the model validates an empty payload.
class Run(BaseModel):
    """Status of a pipeline run."""

    # Both identifiers are required UUIDs.
    id: UUID
    pipeline_id: UUID

    # Only these four literal values pass validation.
    status: Literal["PENDING", "RUNNING", "COMPLETED", "FAILED"]

    # `started_at` is required; `finished_at` defaults to None when omitted.
    started_at: datetime
    finished_at: Optional[datetime] = None
class RunResult(Run):
    """Extended run model including results or error."""

    # Inherits every field of `Run`; both additions default to None.
    # `results` is a list of free-form string-keyed dictionaries.
    results: Optional[List[Dict[str, Any]]] = None
    error: Optional[str] = None
class ApiConfig(BaseModel):
    """Configuration for an API source."""

    # Sample values are supplied through `json_schema_extra` rather than as a
    # bare `example=` keyword: pydantic v2 deprecates arbitrary extra keyword
    # arguments on `Field` (DeprecationWarning). The generated JSON schema
    # still carries the same "example" key for each field.

    # Required; must parse as an HTTP(S) URL.
    url: HttpUrl = Field(
        ...,
        description="API endpoint URL",
        json_schema_extra={"example": "https://api.example.com/data"},
    )

    # Optional; defaults to None when no token is supplied.
    token: Optional[str] = Field(
        None,
        description="Optional bearer token for API authentication",
        json_schema_extra={"example": "abcdef123456"},
    )
class ScrapeConfig(BaseModel):
    """Configuration for a web-scraping source."""

    # Sample values are supplied through `json_schema_extra` rather than as a
    # bare `example=` keyword: pydantic v2 deprecates arbitrary extra keyword
    # arguments on `Field` (DeprecationWarning). The generated JSON schema
    # still carries the same "example" key for each field.

    # Required; every entry must parse as an HTTP(S) URL.
    urls: List[HttpUrl] = Field(
        ...,
        description="List of URLs to scrape",
        json_schema_extra={
            "example": ["https://example.com/page1", "https://example.com/page2"]
        },
    )

    # Optional; defaults to None.
    schema_file: Optional[str] = Field(
        None,
        description="Path to a JSON file containing CSS extraction schema",
        json_schema_extra={"example": "schemas/page_schema.json"},
    )

    # Optional; defaults to None.
    prompt: Optional[str] = Field(
        None,
        description="Prompt string for LLM-based extraction",
        json_schema_extra={"example": "Extract product titles and prices"},
    )
class FileConfig(BaseModel):
    """
    Configuration for a file-based source. Supports either a file path or an uploaded file.
    """

    # Sample values are supplied through `json_schema_extra` rather than as a
    # bare `example=` keyword: pydantic v2 deprecates arbitrary extra keyword
    # arguments on `Field` (DeprecationWarning). The generated JSON schema
    # still carries the same "example" key for each field.
    path: Optional[str] = Field(
        None,
        description="Path to the input file (optional if upload is provided)",
        json_schema_extra={"example": "/data/myfile.json"},
    )
    upload: Optional[Any] = Field(
        None,
        description="Uploaded file object or metadata (optional if path is provided)",
        json_schema_extra={"example": None},
    )
    upload_filename: Optional[str] = Field(
        None,
        description="Original filename of the uploaded file (for validation)",
        json_schema_extra={"example": "myfile.json"},
    )
    format: Literal["csv", "json", "sqlite"] = Field(
        "json",
        description="Format of the file",
        json_schema_extra={"example": "csv"},
    )

    # Cross-field rules are model-level "after" validators so that every
    # field, including defaulted ones, is populated when they run. A field
    # validator only sees the fields declared before its own in `info.data`
    # and is skipped entirely for fields left at their default, which makes
    # it unsuitable for checks that span `path`, `upload` and `format`.

    @model_validator(mode="after")
    def require_path_or_upload(self) -> "FileConfig":
        """Reject a configuration that names neither a path nor an upload.

        Raises:
            ValueError: if both `path` and `upload` are missing or falsy.
        """
        if not self.path and not self.upload:
            raise ValueError("Either 'path' or 'upload' must be provided.")
        return self

    @model_validator(mode="after")
    def path_or_upload_extension_matches_format(self) -> "FileConfig":
        """Check that any supplied file name ends with ``.<format>``.

        The comparison is case-insensitive and applies to `path` and to
        `upload_filename` independently; a name that is not set is skipped.

        Raises:
            ValueError: if `path` or `upload_filename` has another extension.
        """
        fmt = self.format
        suffix = f".{fmt}"
        if self.path and not self.path.lower().endswith(suffix):
            raise ValueError(f"File extension must match format '{fmt}'")
        if self.upload_filename and not self.upload_filename.lower().endswith(suffix):
            raise ValueError(f"Uploaded file extension must match format '{fmt}'")
        return self
class ApiSource(BaseModel):
    """An API-based data source."""

    # Tag field: "api" is both the only accepted value and the default.
    type: Literal["api"] = Field(
        default="api",
        description="Discriminator for API source",
    )
    # Required settings for this source.
    config: ApiConfig
class ScrapeSource(BaseModel):
    """A web-scraping data source."""

    # Tag field: "scrape" is both the only accepted value and the default.
    type: Literal["scrape"] = Field(
        default="scrape",
        description="Discriminator for scrape source",
    )
    # Required settings for this source.
    config: ScrapeConfig
class FileSource(BaseModel):
    """A file-based data source."""

    # Tag field: "file" is both the only accepted value and the default.
    type: Literal["file"] = Field(
        default="file",
        description="Discriminator for file source",
    )
    # Required settings for this source.
    config: FileConfig
# Tagged union of the three source models; the value of each member's
# `type` field selects which model a payload is validated against.
Source = Annotated[
    Union[ApiSource, ScrapeSource, FileSource],
    Field(description="Union of all source types", discriminator="type"),
]
class PipelineCreate(BaseModel):
    """Payload for creating a new pipeline."""

    # Optional; defaults to None. The sample value goes through
    # `json_schema_extra` because pydantic v2 deprecates arbitrary extra
    # keyword arguments (such as `example=`) on `Field`; the generated JSON
    # schema still carries the same "example" key.
    name: Optional[str] = Field(
        None,
        description="Optional human-readable name for the pipeline",
        json_schema_extra={"example": "My Data Pipeline"},
    )

    # Required; each entry is validated as one of the `Source` variants.
    sources: List[Source] = Field(
        ...,
        description="List of data sources for this pipeline",
    )
class Pipeline(BaseModel):
    """Representation of a pipeline."""

    # Required fields use `...` as the default; only `name` may be omitted.
    id: UUID = Field(..., description="Unique identifier for the pipeline")
    name: Optional[str] = Field(
        None, description="Optional human-readable name for the pipeline"
    )
    # Each entry is validated as one of the `Source` variants.
    sources: List[Source] = Field(..., description="List of configured data sources")
    created_at: datetime = Field(
        ..., description="UTC timestamp when the pipeline was created"
    )