backend-api/ingestion/ingestor.py
2025-04-20 19:46:54 +07:00

91 lines
2.7 KiB
Python

"""
Ingestor module to orchestrate data ingestion from multiple adapters.
"""
from typing import List, Dict, Any
from ingestion.adapters.api_adapter import ApiAdapter
from ingestion.adapters.file_adapter import FileAdapter
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
class Ingestor:
    """
    Ingestor for aggregating data from various sources.

    Each source description is mapped to one adapter (``ApiAdapter``,
    ``WebScraperAdapter`` or ``FileAdapter``) and whatever ``adapter.fetch()``
    returns is appended to a single flat list.
    """

    @staticmethod
    def run(sources: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Run ingestion for a list of sources.

        Args:
            sources: List of dicts, each with:
                - type: 'api', 'scrape', or 'file'
                - config: kwargs for the adapter constructor; a missing or
                  ``None`` config is treated as "no kwargs"

        Returns:
            Flat list of all records fetched from sources, in the order the
            sources were given.

        Raises:
            ValueError: For unknown source types.
            RuntimeError: If an adapter fails during fetch. The underlying
                exception is chained as ``__cause__``.
        """
        aggregated: List[Dict[str, Any]] = []
        for src in sources:
            src_type = src.get("type")
            # `or {}` also covers an explicit "config": None, which would
            # otherwise blow up on the ** unpacking below.
            config = src.get("config") or {}

            if src_type == "api":
                adapter = ApiAdapter(**config)
            elif src_type == "scrape":
                adapter = WebScraperAdapter(**config)
            elif src_type == "file":
                adapter = FileAdapter(**config)
            else:
                raise ValueError(f"Unknown source type: {src_type}")

            try:
                data = adapter.fetch()
                aggregated.extend(data)
            except Exception as e:
                # NOTE(review): the message embeds the full config, which may
                # contain credentials (e.g. an api_key) - consider redacting
                # before this reaches logs.
                # `from e` records the adapter's failure as the explicit cause
                # so the original traceback is preserved for debugging.
                raise RuntimeError(
                    f"Ingestion failed for source '{src_type}' with config {config}: {e}"
                ) from e
        return aggregated
if __name__ == "__main__":
    # Example usage of the Ingestor.
    import os

    example_sources = [
        {
            "type": "api",
            "config": {
                "url": "https://dummyjson.com/products",
                "headers": {"Accept": "application/json"},
            },
        },
        {
            "type": "file",
            "config": {"path": "data/sample.json"},
        },
    ]

    # The LLM provider key is read from the environment instead of being
    # committed to source control. NOTE(review): a literal key used to live
    # here; it should be treated as leaked and rotated.
    gemini_api_key = os.environ.get("GEMINI_API_KEY")
    if gemini_api_key:
        example_sources.append(
            {
                "type": "scrape",
                "config": {
                    "urls": ["https://www.hipflat.co.th/en"],
                    "schema_file": None,
                    "prompt": "Extract all listings",
                    "llm_provider": "gemini/gemini-2.0-flash",
                    "api_key": gemini_api_key,
                    "output_format": "json",
                    "verbose": False,
                    "cache_mode": "ENABLED",
                },
            }
        )
    else:
        # Without a key the scrape example cannot run; the other two
        # sources are still exercised.
        print("GEMINI_API_KEY is not set; skipping the scrape example source.")

    records = Ingestor.run(example_sources)
    print(f"Total records ingested: {len(records)}")
    for record in records:
        print(record)