mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-18 20:24:05 +01:00
83 lines
2.6 KiB
Python
83 lines
2.6 KiB
Python
import pytest
|
|
import json
|
|
from unittest.mock import patch, AsyncMock, MagicMock, mock_open
|
|
|
|
from ingestion.adapters.web_scraper_adapter import WebScraperAdapter
|
|
from models.ingestion import AdapterRecord
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_fetch_with_llm_extraction():
|
|
"""
|
|
Test fetching data using LLM extraction.
|
|
"""
|
|
mock_result = MagicMock()
|
|
mock_result.success = True
|
|
mock_result.url = "http://example.com"
|
|
mock_result.extracted_content = json.dumps({"title": "Example"})
|
|
|
|
with patch(
|
|
"ingestion.adapters.web_scraper_adapter.AsyncWebCrawler"
|
|
) as mock_crawler_cls:
|
|
mock_crawler = AsyncMock()
|
|
mock_crawler_cls.return_value = mock_crawler
|
|
mock_crawler.arun_many.return_value = [mock_result]
|
|
|
|
adapter = WebScraperAdapter(
|
|
urls=["http://example.com"],
|
|
api_key="fake-key",
|
|
schema_file=None,
|
|
prompt="Extract data",
|
|
)
|
|
|
|
records = await adapter._fetch_async()
|
|
|
|
assert isinstance(records, list)
|
|
assert isinstance(records[0], AdapterRecord)
|
|
assert records[0].data["title"] == "Example"
|
|
assert records[0].data["source_url"] == "http://example.com"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_fetch_with_schema_file():
|
|
"""
|
|
Test fetching data using schema file.
|
|
"""
|
|
schema = {"title": {"selector": "h1"}}
|
|
mock_result = MagicMock()
|
|
mock_result.success = True
|
|
mock_result.url = "http://example.com"
|
|
mock_result.extracted_content = json.dumps({"title": "Example"})
|
|
|
|
with patch("builtins.open", mock_open(read_data=json.dumps(schema))):
|
|
with patch(
|
|
"ingestion.adapters.web_scraper_adapter.AsyncWebCrawler"
|
|
) as mock_crawler_cls:
|
|
mock_crawler = AsyncMock()
|
|
mock_crawler_cls.return_value = mock_crawler
|
|
mock_crawler.arun_many.return_value = [mock_result]
|
|
|
|
adapter = WebScraperAdapter(
|
|
urls=["http://example.com"],
|
|
api_key="fake-key",
|
|
schema_file="schema.json",
|
|
)
|
|
|
|
records = await adapter._fetch_async()
|
|
|
|
assert len(records) == 1
|
|
assert records[0].data["title"] == "Example"
|
|
assert records[0].data["source_url"] == "http://example.com"
|
|
|
|
|
|
def test_fetch_sync_calls_async():
|
|
"""
|
|
Test that the sync fetch method calls the async fetch method.
|
|
"""
|
|
adapter = WebScraperAdapter(
|
|
urls=["http://example.com"], api_key="fake-key", prompt="Extract data"
|
|
)
|
|
with patch.object(adapter, "_fetch_async", new=AsyncMock(return_value=[])):
|
|
result = adapter.fetch()
|
|
assert result == []
|