mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-18 20:24:05 +01:00
91 lines
2.7 KiB
Python
91 lines
2.7 KiB
Python
"""
|
|
Ingestor module to orchestrate data ingestion from multiple adapters.
|
|
"""
|
|
|
|
from typing import List, Dict, Any
|
|
|
|
from ingestion.adapters.api_adapter import ApiAdapter
|
|
from ingestion.adapters.file_adapter import FileAdapter
|
|
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
|
|
|
|
|
|
class Ingestor:
    """
    Ingestor for aggregating data from various sources.

    Each source is described by a dict naming an adapter type and the keyword
    arguments used to construct that adapter. Records returned by every
    adapter's ``fetch()`` are concatenated into a single flat list.
    """

    # Supported source types mapped to adapter factories. Lambdas are used so
    # the adapter class is only looked up when a source of that type is built.
    _ADAPTER_FACTORIES = {
        "api": lambda config: ApiAdapter(**config),
        "scrape": lambda config: WebScraperAdapter(**config),
        "file": lambda config: FileAdapter(**config),
    }

    # Substrings of config keys whose values must never appear in error text.
    _SENSITIVE_MARKERS = ("api_key", "apikey", "token", "secret", "password")

    @staticmethod
    def _redact_config(config: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of ``config`` with credential-like values masked.

        Only top-level keys are inspected; nested mappings are left as-is.
        """
        markers = Ingestor._SENSITIVE_MARKERS
        return {
            key: "***" if any(m in str(key).lower() for m in markers) else value
            for key, value in config.items()
        }

    @staticmethod
    def run(sources: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Run ingestion for a list of sources.

        All sources are validated before any adapter is constructed, so a
        misspelled type fails immediately instead of after earlier sources
        have already been fetched.

        Args:
            sources: List of dicts, each with:
                - type: 'api', 'scrape', or 'file'
                - config: kwargs for the adapter constructor (a missing or
                  ``None`` config is treated as no kwargs)

        Returns:
            Flat list of all records fetched from sources, in source order.

        Raises:
            ValueError: For unknown source types.
            RuntimeError: If an adapter fails during fetch. The original
                exception is chained as ``__cause__``; credential-like config
                values are masked in the message.
        """
        factories = Ingestor._ADAPTER_FACTORIES

        # Pass 1: validate everything up front (and iterate `sources` once).
        planned = []
        for src in sources:
            src_type = src.get("type")
            # isinstance guard keeps unhashable/odd types on the ValueError path.
            if not (isinstance(src_type, str) and src_type in factories):
                raise ValueError(f"Unknown source type: {src_type}")
            planned.append((src_type, src.get("config") or {}))

        # Pass 2: build each adapter and collect its records.
        aggregated: List[Dict[str, Any]] = []
        for src_type, config in planned:
            # Constructor errors propagate unchanged; only fetch is wrapped.
            adapter = factories[src_type](config)
            try:
                data = adapter.fetch()
                aggregated.extend(data)
            except Exception as e:
                safe_config = Ingestor._redact_config(config)
                raise RuntimeError(
                    f"Ingestion failed for source '{src_type}' with config {safe_config}: {e}"
                ) from e

        return aggregated
|
|
|
|
|
|
if __name__ == "__main__":
    # Example usage of the Ingestor.
    #
    # SECURITY: the LLM provider key is read from the environment instead of
    # being committed to source control. Any key that was previously
    # hard-coded here must be considered leaked and should be revoked.
    import os

    gemini_api_key = os.environ.get("GEMINI_API_KEY")
    if not gemini_api_key:
        raise SystemExit(
            "Set the GEMINI_API_KEY environment variable to run this example."
        )

    example_sources = [
        {
            "type": "api",
            "config": {
                "url": "https://dummyjson.com/products",
                "headers": {"Accept": "application/json"},
            },
        },
        {
            "type": "file",
            "config": {"path": "data/sample.json"},
        },
        {
            "type": "scrape",
            "config": {
                "urls": ["https://www.hipflat.co.th/en"],
                "schema_file": None,
                "prompt": "Extract all listings",
                "llm_provider": "gemini/gemini-2.0-flash",
                "api_key": gemini_api_key,
                "output_format": "json",
                "verbose": False,
                "cache_mode": "ENABLED",
            },
        },
    ]

    records = Ingestor.run(example_sources)
    print(f"Total records ingested: {len(records)}")
    for record in records:
        print(record)