backend-api/pipeline/ingestion/ingestors/simple_ingest.py
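"""Simple, sequential ingestion strategy.

Dispatches each IngestSourceConfig to the adapter matching its source type
(API, file upload, or web scrape) and pools the fetched records into a
single OutputData payload.
"""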

from loguru import logger

from ingestion.adapters.api_adapter import ApiAdapter
from ingestion.adapters.file_adapter import FileAdapter
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
from models.ingestion import (
    AdapterRecord,
    ApiConfig,
    FileConfig,
    IngestSourceConfig,
    OutputData,
    ScrapeConfig,
    SourceType,
)

from .base import IngestionMethod


class SimpleIngestionStrategy(IngestionMethod):
    """Sequential strategy: ingest every source in one pass, pooling records."""

    def run(self, sources: list[IngestSourceConfig]) -> OutputData:
        """Fetch records from each source via its matching adapter.

        A source that raises is logged and skipped, so one bad source
        cannot abort the whole batch.
        """
        results: list[AdapterRecord] = []
        for source in sources:
            try:
                # Build the adapter that matches the declared source type.
                # The asserts narrow source.config from its union type to the
                # concrete config class each adapter expects.
                match source.type:
                    case SourceType.API:
                        config = source.config
                        assert isinstance(config, ApiConfig)
                        adapter = ApiAdapter(
                            url=config.url,
                            headers=config.headers,
                            timeout=config.timeout or 30,
                            token=config.token,
                        )
                    case SourceType.FILE:
                        config = source.config
                        assert isinstance(config, FileConfig)
                        adapter = FileAdapter(upload=config.upload)
                    case SourceType.SCRAPE:
                        config = source.config
                        assert isinstance(config, ScrapeConfig)
                        adapter = WebScraperAdapter(
                            urls=config.urls,
                            api_key=config.api_key,
                            schema_file=config.schema_file,
                            prompt=config.prompt or WebScraperAdapter.DEFAULT_PROMPT,
                            llm_provider=config.llm_provider or "openai/gpt-4o-mini",
                            output_format=config.output_format or "json",
                            verbose=config.verbose or False,
                            cache_mode=config.cache_mode or "ENABLED",
                        )
                    case _:
                        # Unknown source type: skip it rather than falling
                        # through with an unbound (or stale) adapter.
                        logger.warning(f"Unsupported source type: {source.type}")
                        continue
                results.extend(adapter.fetch())
            except Exception as e:
                logger.error(f"Failed to ingest from source {source.type}: {e}")
        return OutputData(
            records=results,
            unified=False,
            metadata={"source_count": len(sources), "record_count": len(results)},
        )
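# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the pipeline). The keyword arguments
# to IngestSourceConfig and ApiConfig below are assumptions inferred from how
# this module reads those objects; check models.ingestion for the
# authoritative field names and defaults before running.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_sources = [
        IngestSourceConfig(
            type=SourceType.API,  # hypothetical kwargs; see models.ingestion
            config=ApiConfig(
                url="https://api.example.com/items",  # placeholder endpoint
                headers={"Accept": "application/json"},
            ),
        )
    ]
    output = SimpleIngestionStrategy().run(demo_sources)
    logger.info(f"Ingested {output.metadata['record_count']} records")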