backend-api/models.py
2025-04-20 19:46:54 +07:00

167 lines
4.0 KiB
Python

"""
Pydantic models for pipelines and runs.
"""
from datetime import datetime
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
from uuid import UUID

from pydantic import BaseModel, Field, HttpUrl, field_validator, model_validator
class RunCreate(BaseModel):
    """Model for creating a new run. (Empty)"""

    # Intentionally declares no fields: the docstring alone is a valid
    # class body, so no explicit `pass` statement is required.
class Run(BaseModel):
    """Status of a pipeline run."""

    # UUID identifying this run.
    id: UUID
    # UUID of the pipeline associated with this run.
    pipeline_id: UUID
    # Run state; only these four literal values are accepted.
    status: Literal["PENDING", "RUNNING", "COMPLETED", "FAILED"]
    # Required start timestamp.
    started_at: datetime
    # Finish timestamp; defaults to None
    # (presumably until the run has ended -- confirm against the caller).
    finished_at: Optional[datetime] = None
class RunResult(Run):
    """Extended run model including results or error."""

    # Result records as a list of string-keyed dicts with arbitrary values;
    # defaults to None.
    results: Optional[List[Dict[str, Any]]] = None
    # Error text; defaults to None.
    error: Optional[str] = None
class ApiConfig(BaseModel):
    """Configuration for an API source."""

    # Required endpoint; the HttpUrl type makes Pydantic validate it as a URL.
    url: HttpUrl = Field(
        ...,
        description="API endpoint URL",
        # Pydantic v2 deprecates arbitrary keyword arguments on Field(), so
        # the schema example is supplied through json_schema_extra instead.
        # The generated schema keeps the same "example" key.
        json_schema_extra={"example": "https://api.example.com/data"},
    )
    # Optional credential; defaults to None when omitted.
    token: Optional[str] = Field(
        None,
        description="Optional bearer token for API authentication",
        json_schema_extra={"example": "abcdef123456"},
    )
class ScrapeConfig(BaseModel):
    """Configuration for a web-scraping source."""

    # Required list of pages; each entry is validated as an HttpUrl.
    urls: List[HttpUrl] = Field(
        ...,
        description="List of URLs to scrape",
        # Pydantic v2 deprecates arbitrary keyword arguments on Field(), so
        # the schema example is supplied through json_schema_extra instead.
        # The generated schema keeps the same "example" key.
        json_schema_extra={
            "example": ["https://example.com/page1", "https://example.com/page2"],
        },
    )
    # Optional; defaults to None when omitted.
    schema_file: Optional[str] = Field(
        None,
        description="Path to a JSON file containing CSS extraction schema",
        json_schema_extra={"example": "schemas/page_schema.json"},
    )
    # Optional; defaults to None when omitted.
    prompt: Optional[str] = Field(
        None,
        description="Prompt string for LLM-based extraction",
        json_schema_extra={"example": "Extract product titles and prices"},
    )
class FileConfig(BaseModel):
    """Configuration for a file-based source."""

    # Required location of the input file.
    path: str = Field(
        ...,
        description="Path to the input file",
        # Pydantic v2 deprecates arbitrary keyword arguments on Field(), so
        # the schema example is supplied through json_schema_extra instead.
        # The generated schema keeps the same "example" key.
        json_schema_extra={"example": "/data/myfile.json"},
    )
    # One of the three supported formats; defaults to "json".
    format: Literal["csv", "json", "sqlite"] = Field(
        "json",
        description="Format of the file",
        json_schema_extra={"example": "csv"},
    )

    # This is a cross-field check, so it runs as an "after" model validator:
    # by then both `path` and `format` (including its default) are populated.
    # The previous field validator on `path` used the Pydantic v1 signature
    # (cls, v, values); under v2 the third argument is a ValidationInfo
    # object without `.get`, and `format` is validated after `path` anyway.
    @model_validator(mode="after")
    def path_extension_matches_format(self) -> "FileConfig":
        """
        Ensure the file extension of ``path`` agrees with ``format``.

        The comparison is case-insensitive on the path side: the lowercased
        path must end with ``.<format>``.

        Returns:
            The validated model instance, unchanged.

        Raises:
            ValueError: If the path does not end with the format's extension.
        """
        expected_suffix = f".{self.format}"
        if not self.path.lower().endswith(expected_suffix):
            raise ValueError(f"File extension must match format '{self.format}'")
        return self
class ApiSource(BaseModel):
    """An API-based data source."""

    # Discriminator tag. The Literal annotation already restricts the value
    # to "api", which is why no const=True is passed.
    type: Literal["api"] = Field(
        default="api",
        description="Discriminator for API source",
    )
    # Settings specific to the API source.
    config: ApiConfig
class ScrapeSource(BaseModel):
    """A web-scraping data source."""

    # Discriminator tag. The Literal annotation already restricts the value
    # to "scrape", which is why no const=True is passed.
    type: Literal["scrape"] = Field(
        default="scrape",
        description="Discriminator for scrape source",
    )
    # Settings specific to the scraping source.
    config: ScrapeConfig
class FileSource(BaseModel):
    """A file-based data source."""

    # Discriminator tag. The Literal annotation already restricts the value
    # to "file", which is why no const=True is passed.
    type: Literal["file"] = Field(
        default="file",
        description="Discriminator for file source",
    )
    # Settings specific to the file source.
    config: FileConfig
# Tagged union over every source model. The "type" field of each member is
# the discriminator used to select which member model applies.
Source = Annotated[
    Union[ApiSource, ScrapeSource, FileSource],
    Field(description="Union of all source types", discriminator="type"),
]
class PipelineCreate(BaseModel):
    """Payload for creating a new pipeline."""

    # Optional display name; defaults to None when omitted.
    name: Optional[str] = Field(
        None,
        description="Optional human-readable name for the pipeline",
        # Pydantic v2 deprecates arbitrary keyword arguments on Field(), so
        # the schema example is supplied through json_schema_extra instead.
        # The generated schema keeps the same "example" key.
        json_schema_extra={"example": "My Data Pipeline"},
    )
    # Required list of sources; each item is one member of the Source union.
    sources: List[Source] = Field(
        ...,
        description="List of data sources for this pipeline",
    )
class Pipeline(BaseModel):
    """Representation of a pipeline."""

    # Required.
    id: UUID = Field(..., description="Unique identifier for the pipeline")
    # Optional; defaults to None when omitted.
    name: Optional[str] = Field(
        None, description="Optional human-readable name for the pipeline"
    )
    # Required; each item is one member of the Source union.
    sources: List[Source] = Field(..., description="List of configured data sources")
    # Required.
    created_at: datetime = Field(
        ..., description="UTC timestamp when the pipeline was created"
    )