mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-18 20:24:05 +01:00
66 lines
2.5 KiB
Python
66 lines
2.5 KiB
Python
from ingestion.adapters.api_adapter import ApiAdapter
|
|
from ingestion.adapters.file_adapter import FileAdapter
|
|
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
|
|
from .base import IngestionMethod
|
|
from models.ingestion import (
|
|
AdapterRecord,
|
|
IngestSourceConfig,
|
|
SourceType,
|
|
ApiConfig,
|
|
FileConfig,
|
|
ScrapeConfig,
|
|
OutputData,
|
|
)
|
|
from loguru import logger
|
|
|
|
|
|
class SimpleIngestionStrategy(IngestionMethod):
|
|
def run(self, sources: list[IngestSourceConfig]) -> OutputData:
|
|
results: list[AdapterRecord] = []
|
|
|
|
for source in sources:
|
|
try:
|
|
match source.type:
|
|
case SourceType.API:
|
|
config = source.config
|
|
assert isinstance(config, ApiConfig)
|
|
adapter = ApiAdapter(
|
|
url=config.url,
|
|
headers=config.headers,
|
|
timeout=config.timeout or 30,
|
|
token=config.token,
|
|
)
|
|
records = adapter.fetch()
|
|
|
|
case SourceType.FILE:
|
|
config = source.config
|
|
assert isinstance(config, FileConfig)
|
|
adapter = FileAdapter(upload=config.upload)
|
|
records = adapter.fetch()
|
|
|
|
case SourceType.SCRAPE:
|
|
config = source.config
|
|
assert isinstance(config, ScrapeConfig)
|
|
adapter = WebScraperAdapter(
|
|
urls=config.urls,
|
|
api_key=config.api_key,
|
|
schema_file=config.schema_file,
|
|
prompt=config.prompt or WebScraperAdapter.DEFAULT_PROMPT,
|
|
llm_provider=config.llm_provider or "openai/gpt-4o-mini",
|
|
output_format=config.output_format or "json",
|
|
verbose=config.verbose or False,
|
|
cache_mode=config.cache_mode or "ENABLED",
|
|
)
|
|
records = adapter.fetch()
|
|
|
|
results.extend(records)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to ingest from source {source.type}: {e}")
|
|
|
|
return OutputData(
|
|
records=results,
|
|
unified=False,
|
|
metadata={"source_count": len(sources), "record_count": len(results)},
|
|
)
|