fix: add pipeline test, add FAILED status and fix pydantic scheme error

This commit is contained in:
Sosokker 2025-05-12 21:12:50 +07:00
parent f24886afdc
commit 5cd5fcfeb3
5 changed files with 718 additions and 54 deletions

View File

@ -9,6 +9,7 @@ from models.ingestion import IngestorInput
class PipelineStatus(str, enum.Enum):
ACTIVE = "active"
INACTIVE = "inactive"
FAILED = "failed"
class RunFrequency(str, enum.Enum):

View File

@ -8,12 +8,14 @@ dependencies = [
"apscheduler>=3.11.0",
"crawl4ai>=0.5.0.post8",
"fastapi[standard]>=0.115.12",
"freezegun>=1.5.1",
"inquirer>=3.4.0",
"loguru>=0.7.3",
"pandas>=2.2.3",
"pydantic-settings>=2.9.1",
"pytest>=8.3.5",
"pytest-asyncio>=0.26.0",
"pytest-mock>=3.14.0",
"python-dotenv>=1.1.0",
"responses>=0.25.7",
"rich>=14.0.0",
@ -22,3 +24,5 @@ dependencies = [
[tool.pytest.ini_options]
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "function"
log_cli = true
log_cli_level = "INFO"

View File

@ -17,7 +17,7 @@ from models.pipeline import (
)
from models.ingestion import IngestorInput
from stores.base import PipelineStore
from scheduler.utils import calculate_next_run, UTC # Import the utility and UTC
from scheduler.utils import calculate_next_run, UTC
# !use TYPE_CHECKING to avoid circular imports at runtime
# the SchedulerManager needs PipelineService, and PipelineService now needs SchedulerManager
@ -67,12 +67,12 @@ class PipelineService:
)
try:
pipeline_id = uuid4()
now = datetime.now(UTC) # Use UTC consistently
now = datetime.now(UTC)
# Calculate the initial next_run time
initial_next_run = calculate_next_run(
frequency=run_frequency,
last_run=None, # No last run yet
last_run=None,
start_reference_time=now,
)
@ -84,9 +84,9 @@ class PipelineService:
ingestor_config=ingestor_config,
run_frequency=run_frequency,
last_run=None,
next_run=initial_next_run, # Store the calculated next run
next_run=initial_next_run,
),
status=PipelineStatus.INACTIVE, # Start as inactive
status=PipelineStatus.INACTIVE,
created_at=now,
updated_at=now,
)
@ -121,50 +121,72 @@ class PipelineService:
return None
try:
update_data = pipeline_in.model_dump(exclude_unset=True)
# Use model_copy for a cleaner update merge
updated_pipeline = existing_pipeline.model_copy(
deep=True, update=update_data
)
# 1. Create a deep copy to modify
updated_pipeline = existing_pipeline.model_copy(deep=True)
# Check if frequency changed, if so, recalculate next_run
config_changed = "config" in update_data
# 2. Update top-level fields directly from the input model
updated_pipeline.name = pipeline_in.name
updated_pipeline.description = pipeline_in.description
# 3. Handle config update carefully
config_changed = False
frequency_changed = False
if (
config_changed
and updated_pipeline.config.run_frequency
!= existing_pipeline.config.run_frequency
):
frequency_changed = True
original_frequency = (
updated_pipeline.config.run_frequency
) # Store before potential change
# Check if the input payload actually provided config data
if pipeline_in.config:
config_changed = True
# Update the fields *within* the existing config object
# Ensure the nested ingestor_config is also handled correctly (assuming assignment works or potentially use model_copy/re-init if complex)
updated_pipeline.config.ingestor_config = (
pipeline_in.config.ingestor_config.model_copy(deep=True)
) # Use model_copy for safety
updated_pipeline.config.run_frequency = pipeline_in.config.run_frequency
# Check if the frequency actually changed after the update
if updated_pipeline.config.run_frequency != original_frequency:
frequency_changed = True
# 4. Recalculate next_run ONLY if frequency changed
if frequency_changed:
logger.info(
f"Run frequency changed for pipeline {pipeline_id}. Recalculating next run."
f"Run frequency changed for pipeline {pipeline_id} from {original_frequency} to {updated_pipeline.config.run_frequency}. Recalculating next run."
)
now = datetime.now(UTC)
# Use the existing last_run from the copied object
updated_pipeline.config.next_run = calculate_next_run(
frequency=updated_pipeline.config.run_frequency,
last_run=existing_pipeline.config.last_run, # Base on last run
last_run=updated_pipeline.config.last_run,
start_reference_time=now,
)
logger.info(
f"Recalculated next_run for {pipeline_id}: {updated_pipeline.config.next_run}"
)
# Save the updated pipeline (store's save method handles updated_at)
await self.store.save(updated_pipeline)
logger.info(f"Pipeline updated: id={updated_pipeline.id}")
# 5. Update the timestamp before saving
updated_pipeline.updated_at = datetime.now(UTC)
# Notify the scheduler if relevant config changed
# We notify on any config change or if frequency specifically changed
if self.scheduler_manager and (config_changed or frequency_changed):
# 6. Save the updated pipeline
await self.store.save(updated_pipeline)
logger.info(f"Pipeline updated successfully: id={updated_pipeline.id}")
# 7. Notify the scheduler if config changed (including frequency)
# Scheduler needs the *final* state of the updated pipeline for rescheduling.
if self.scheduler_manager and config_changed:
logger.debug(
f"Notifying scheduler to reschedule pipeline {updated_pipeline.id}"
f"Notifying scheduler to reschedule pipeline {updated_pipeline.id} due to config change."
)
# Pass the fully updated pipeline object
asyncio.create_task(
self.scheduler_manager.reschedule_pipeline(updated_pipeline)
)
elif self.scheduler_manager:
logger.debug(
f"Pipeline {updated_pipeline.id} updated, but no schedule change needed."
f"Pipeline {updated_pipeline.id} updated (non-config fields), no reschedule needed based on config."
# NOTE: might still want to reschedule if other non-config updates could affect execution,
# but based on current logic, only config changes trigger rescheduling.
)
return updated_pipeline
@ -226,7 +248,7 @@ class PipelineService:
if not pipeline:
logger.error(f"Cannot run pipeline: Pipeline not found (id={pipeline_id})")
return
# Simple lock mechanism using status
# NOTE: lock mechanism
if pipeline.status == PipelineStatus.ACTIVE:
logger.warning(
f"Pipeline id={pipeline_id} is already ACTIVE. Skipping run."
@ -236,8 +258,7 @@ class PipelineService:
# --- Mark as ACTIVE ---
try:
pipeline.status = PipelineStatus.ACTIVE
# Optionally mark start time here if needed, but last_run usually marks completion
# pipeline.config.last_run = datetime.now(UTC)
pipeline.updated_at = datetime.now(UTC) # Update timestamp
await self.store.save(pipeline)
logger.info(f"Pipeline {pipeline_id} marked as ACTIVE.")
except Exception as e:
@ -245,20 +266,19 @@ class PipelineService:
f"Failed to mark pipeline {pipeline_id} as ACTIVE: {e}. Aborting run.",
exc_info=True,
)
# Restore original status if possible? Depends on store implementation.
return # Abort run if we can't even update status
# Attempt to restore status? Depends on store guarantees.
# pipeline.status = original_status # Potentially try rollback
return # Abort run
# --- Execute Pipeline Logic ---
run_successful = False
try:
logger.info(f"Executing core logic for pipeline id={pipeline_id}...")
# ---------------------------------------------------
# TODO: replace with actual pipeline execution call
# Example: await self._execute_ingestion(pipeline.config.ingestor_config)
# Example: await self._process_data(...)
await asyncio.sleep(5) # Simulate work
logger.info(f"Core logic finished successfully for id={pipeline_id}.")
# Ensure _execute_ingestion is awaited if it's async
await self._execute_ingestion(pipeline.config.ingestor_config)
# ---------------------------------------------------
logger.info(f"Core logic finished successfully for id={pipeline_id}.")
run_successful = True
except Exception as e:
@ -270,29 +290,49 @@ class PipelineService:
# --- Update Final State ---
try:
# Fetch the latest state again in case of external changes (though unlikely with ACTIVE status lock)
# Fetch the latest state again to minimize race conditions, though the ACTIVE lock helps
final_pipeline_state = await self.store.get(pipeline_id)
if not final_pipeline_state:
logger.warning(
f"Pipeline {pipeline_id} disappeared during run. Cannot update final state."
)
# The pipeline might have been deleted externally while running.
# Scheduler might need cleanup if the job still exists.
if self.scheduler_manager:
logger.warning(
f"Attempting to unschedule potentially orphaned job for {pipeline_id}"
)
asyncio.create_task(
self.scheduler_manager.unschedule_pipeline(pipeline_id)
)
return
# Avoid modifying the object fetched directly if store uses caching/references
final_pipeline_state = final_pipeline_state.model_copy(deep=True)
now = datetime.now(UTC)
final_pipeline_state.status = PipelineStatus.INACTIVE # Reset status
# TODO: Add a FAILED status?
# final_pipeline_state.status = PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
final_pipeline_state.status = (
PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
)
if run_successful:
final_pipeline_state.config.last_run = (
now # Mark completion time on success
)
# Calculate and store the *next* run time after this one
# Calculate and store the *next* run time based on the outcome
# Use the *updated* last_run if the run was successful
current_last_run = (
final_pipeline_state.config.last_run
) # This is 'now' if successful, else original last_run
final_pipeline_state.config.next_run = calculate_next_run(
frequency=final_pipeline_state.config.run_frequency,
last_run=final_pipeline_state.config.last_run, # Use the updated last_run
start_reference_time=now,
last_run=current_last_run, # Use the relevant last_run for calculation
start_reference_time=now, # Use current time as reference for calculation
)
final_pipeline_state.updated_at = (
now # Update timestamp for this final save
)
await self.store.save(final_pipeline_state)
@ -300,10 +340,10 @@ class PipelineService:
f"Pipeline {pipeline_id} run finished. Status: {final_pipeline_state.status}, Last Run: {final_pipeline_state.config.last_run}, Next Run: {final_pipeline_state.config.next_run}"
)
# Notify scheduler about the *new* next run time
# Notify scheduler about the *new* next run time so it can reschedule accurately
if self.scheduler_manager:
logger.debug(
f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion."
f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion with next run {final_pipeline_state.config.next_run}."
)
asyncio.create_task(
self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
@ -314,12 +354,32 @@ class PipelineService:
f"Failed to update pipeline {pipeline_id} state after run execution: {e}",
exc_info=True,
)
# The pipeline might be left in ACTIVE state if this fails. Requires manual intervention or recovery logic.
# The pipeline might be left in ACTIVE or an inconsistent state.
# Consider adding monitoring or retry logic here.
# TODO: Complete this method
# --- Placeholder for actual execution ---
async def _execute_ingestion(self, config: IngestorInput):
# Replace with your actual ingestion logic
logger.info(f"Simulating ingestion with config: {config}")
await asyncio.sleep(2) # Simulate I/O
logger.info("Ingestion simulation complete.")
"""
Executes the ingestion process for a pipeline using the provided IngestorInput config.
Returns the ingestion results or raises an exception on failure.
"""
# Ensure Ingestor is imported locally or globally if needed
# from ingestion.core import Ingestor # Example import if needed
# Check if Ingestor is already available (e.g., imported at module level)
# If not, uncomment the import above or ensure it's accessible.
# Assuming Ingestor is available in the scope:
try:
# Avoid circular import
from ingestion.core import Ingestor
logger.info(f"Executing ingestion with config: {config}")
# NOTE: Can be async
results = Ingestor.run(config.sources)
logger.info(f"Ingestion completed successfully. Results: {results}")
return results
except ImportError:
logger.error("Failed to import Ingestor. Cannot execute ingestion.")
raise RuntimeError("Ingestion module not found")
except Exception as e:
logger.error(f"Ingestion execution failed: {e}", exc_info=True)
raise

View File

@ -0,0 +1,571 @@
import pytest
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from uuid import UUID, uuid4
from datetime import datetime, timedelta, timezone
from freezegun import freeze_time
from models.pipeline import (
Pipeline,
PipelineCreate,
PipelineConfig,
RunFrequency,
PipelineStatus,
)
from models.ingestion import (
IngestorInput,
IngestSourceConfig,
SourceType,
ApiConfig,
)
from services.pipeline_service import PipelineService
from stores.base import PipelineStore
from scheduler.manager import SchedulerManager
from scheduler.utils import calculate_next_run
pytestmark = pytest.mark.asyncio
# --- Fixtures ---
@pytest.fixture
def mock_store(mocker) -> AsyncMock:
"""Fixture for a mocked PipelineStore."""
mock = mocker.patch("stores.base.PipelineStore", spec=PipelineStore)
mock.save = AsyncMock(return_value=None)
mock.get = AsyncMock(return_value=None)
mock.get_all = AsyncMock(return_value=[])
mock.delete = AsyncMock(return_value=False)
return mock
@pytest.fixture
def mock_scheduler(mocker) -> AsyncMock:
"""Fixture for a mocked SchedulerManager."""
mock = mocker.patch("scheduler.manager.SchedulerManager", spec=SchedulerManager)
mock.schedule_pipeline = AsyncMock(return_value=None)
mock.reschedule_pipeline = AsyncMock(return_value=None)
mock.unschedule_pipeline = AsyncMock(return_value=None)
# NOTE: add other methods if the service starts calling them
return mock
@pytest.fixture
def pipeline_service(mock_store, mock_scheduler) -> PipelineService:
"""Fixture for PipelineService instance with mocked dependencies."""
service = PipelineService(store=mock_store, scheduler_manager=mock_scheduler)
# NOTE: if has internal methods to mock (like actual execution logic)
# NOTE: you might patch them here or within specific tests using mocker.patch.object
return service
@pytest.fixture
def sample_ingestor_input() -> IngestorInput:
"""Sample IngestorInput for creating pipelines."""
return IngestorInput(
sources=[
IngestSourceConfig(
type=SourceType.API, config=ApiConfig(url="http://example.com/api")
)
]
)
@pytest.fixture
def sample_pipeline_config(sample_ingestor_input) -> PipelineConfig:
"""Sample PipelineConfig."""
return PipelineConfig(
ingestor_config=sample_ingestor_input,
run_frequency=RunFrequency.DAILY,
last_run=None,
next_run=None,
)
@pytest.fixture
def sample_pipeline(sample_pipeline_config) -> Pipeline:
"""Sample Pipeline object."""
now = datetime.now(timezone.utc)
pid = uuid4()
next_run = calculate_next_run(sample_pipeline_config.run_frequency, None, now)
return Pipeline(
id=pid,
name="Test Pipeline",
description="A pipeline for testing",
config=sample_pipeline_config.model_copy(
update={"next_run": next_run}
), # Use updated config
status=PipelineStatus.INACTIVE,
created_at=now - timedelta(hours=1),
updated_at=now - timedelta(minutes=30),
)
@pytest.fixture
def sample_pipeline_create(sample_pipeline_config) -> PipelineCreate:
"""Sample PipelineCreate object for updates/creations."""
return PipelineCreate(
name="New Test Pipeline",
description="Creating a pipeline",
config=sample_pipeline_config,
)
# --- Test Cases ---
FROZEN_TIME = datetime(2025, 5, 12, 12, 30, 0, tzinfo=timezone.utc) # NOTE: 7:30 PM +07
@freeze_time(FROZEN_TIME)
async def test_create_pipeline_success(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_ingestor_input: IngestorInput,
):
"""Test successful pipeline creation."""
name = "My New Pipeline"
description = "Test description"
frequency = RunFrequency.WEEKLY
expected_next_run = calculate_next_run(frequency, None, FROZEN_TIME)
created_pipeline = await pipeline_service.create_pipeline(
name=name,
description=description,
ingestor_config=sample_ingestor_input,
run_frequency=frequency,
)
assert created_pipeline is not None
assert created_pipeline.name == name
assert created_pipeline.description == description
assert created_pipeline.config.run_frequency == frequency
assert created_pipeline.config.ingestor_config == sample_ingestor_input
assert created_pipeline.status == PipelineStatus.INACTIVE
assert created_pipeline.created_at == FROZEN_TIME
assert created_pipeline.updated_at == FROZEN_TIME
assert created_pipeline.config.last_run is None
assert created_pipeline.config.next_run == expected_next_run
assert isinstance(created_pipeline.id, UUID)
mock_store.save.assert_awaited_once()
saved_pipeline_arg = mock_store.save.call_args[0][0]
assert isinstance(saved_pipeline_arg, Pipeline)
assert saved_pipeline_arg.id == created_pipeline.id
assert saved_pipeline_arg.config.next_run == expected_next_run
# Verify scheduler interaction (using create_task, so check if called)
# We check if schedule_pipeline was called with the created pipeline object
await asyncio.sleep(0) # Allow create_task to potentially run
mock_scheduler.schedule_pipeline.assert_awaited_once_with(created_pipeline)
async def test_create_pipeline_store_error(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_ingestor_input: IngestorInput,
):
"""Test pipeline creation when store save fails."""
mock_store.save.side_effect = Exception("Database connection error")
with pytest.raises(Exception, match="Database connection error"):
await pipeline_service.create_pipeline(
name="Fail Pipeline",
description="This should fail",
ingestor_config=sample_ingestor_input,
run_frequency=RunFrequency.DAILY,
)
# Ensure scheduler was NOT called if store save failed
mock_scheduler.schedule_pipeline.assert_not_awaited()
@freeze_time(FROZEN_TIME)
async def test_update_pipeline_success_no_freq_change(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline: Pipeline,
):
"""Test successful update without changing run frequency."""
mock_store.get.return_value = sample_pipeline
update_payload = PipelineCreate(
name="Updated Name",
description="Updated Description",
config=sample_pipeline.config,
)
updated_pipeline = await pipeline_service.update_pipeline(
sample_pipeline.id, update_payload
)
assert updated_pipeline is not None
assert updated_pipeline.id == sample_pipeline.id
assert updated_pipeline.name == "Updated Name"
assert updated_pipeline.description == "Updated Description"
assert updated_pipeline.config.run_frequency == sample_pipeline.config.run_frequency
assert updated_pipeline.config.next_run == sample_pipeline.config.next_run
assert updated_pipeline.created_at == sample_pipeline.created_at
mock_store.get.assert_awaited_once_with(sample_pipeline.id)
mock_store.save.assert_awaited_once()
saved_pipeline_arg = mock_store.save.call_args[0][0]
assert saved_pipeline_arg.id == sample_pipeline.id
assert saved_pipeline_arg.name == "Updated Name"
assert saved_pipeline_arg.config.next_run == sample_pipeline.config.next_run
# Verify scheduler interaction (should still be called if config changes, even if freq doesn't)
await asyncio.sleep(0)
mock_scheduler.reschedule_pipeline.assert_awaited_once_with(updated_pipeline)
@freeze_time(FROZEN_TIME)
async def test_update_pipeline_success_with_freq_change(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline: Pipeline, # Existing pipeline (Daily frequency)
sample_ingestor_input: IngestorInput,
):
"""Test successful update changing run frequency."""
mock_store.get.return_value = sample_pipeline
original_next_run = sample_pipeline.config.next_run
new_config = PipelineConfig(
ingestor_config=sample_ingestor_input,
run_frequency=RunFrequency.MONTHLY,
last_run=sample_pipeline.config.last_run,
)
update_payload = PipelineCreate(
name=sample_pipeline.name,
description=sample_pipeline.description,
config=new_config,
)
# Calculate expected next run for MONTHLY based on FROZEN_TIME and no last_run
expected_new_next_run = calculate_next_run(
RunFrequency.MONTHLY, sample_pipeline.config.last_run, FROZEN_TIME
)
updated_pipeline = await pipeline_service.update_pipeline(
sample_pipeline.id, update_payload
)
assert updated_pipeline is not None
assert updated_pipeline.id == sample_pipeline.id
assert updated_pipeline.config.run_frequency == RunFrequency.MONTHLY
assert updated_pipeline.config.next_run != original_next_run
assert updated_pipeline.config.next_run == expected_new_next_run
mock_store.get.assert_awaited_once_with(sample_pipeline.id)
mock_store.save.assert_awaited_once()
saved_pipeline_arg = mock_store.save.call_args[0][0]
assert saved_pipeline_arg.config.run_frequency == RunFrequency.MONTHLY
assert saved_pipeline_arg.config.next_run == expected_new_next_run
await asyncio.sleep(0)
mock_scheduler.reschedule_pipeline.assert_awaited_once_with(updated_pipeline)
async def test_update_pipeline_not_found(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline_create: PipelineCreate,
):
"""Test updating a pipeline that doesn't exist."""
non_existent_id = uuid4()
mock_store.get.return_value = None
updated_pipeline = await pipeline_service.update_pipeline(
non_existent_id, sample_pipeline_create
)
assert updated_pipeline is None
mock_store.get.assert_awaited_once_with(non_existent_id)
mock_store.save.assert_not_awaited()
mock_scheduler.reschedule_pipeline.assert_not_awaited()
async def test_delete_pipeline_success(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline: Pipeline,
):
"""Test successful pipeline deletion."""
mock_store.delete.return_value = True
mock_store.get.return_value = sample_pipeline
pipeline_id = sample_pipeline.id
deleted = await pipeline_service.delete_pipeline(pipeline_id)
assert deleted is True
call_order = MagicMock()
call_order.attach_mock(mock_scheduler.unschedule_pipeline, "scheduler_unschedule")
call_order.attach_mock(mock_store.delete, "store_delete")
mock_scheduler.unschedule_pipeline.assert_awaited_once_with(pipeline_id)
mock_store.delete.assert_awaited_once_with(pipeline_id)
# NOTE: can check call order hear
# assert call_order.mock_calls == [
# mocker.call.scheduler_unschedule(pipeline_id),
# mocker.call.store_delete(pipeline_id),
# ]
async def test_delete_pipeline_not_found(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
):
"""Test deleting a pipeline that doesn't exist."""
non_existent_id = uuid4()
mock_store.get.return_value = None
mock_store.delete.return_value = False
deleted = await pipeline_service.delete_pipeline(non_existent_id)
assert deleted is False
# Verify interactions:
mock_store.get.assert_awaited_once_with(non_existent_id) # Verify get was checked
# Scheduler should NOT be called if pipeline wasn't found by get
mock_scheduler.unschedule_pipeline.assert_not_awaited()
# Store delete should NOT be called if pipeline wasn't found by get
mock_store.delete.assert_not_awaited()
async def test_get_pipeline_success(
pipeline_service: PipelineService,
mock_store: AsyncMock,
sample_pipeline: Pipeline,
):
"""Test getting an existing pipeline."""
mock_store.get.return_value = sample_pipeline
pipeline_id = sample_pipeline.id
result = await pipeline_service.get_pipeline(pipeline_id)
assert result == sample_pipeline
mock_store.get.assert_awaited_once_with(pipeline_id)
async def test_get_pipeline_not_found(
pipeline_service: PipelineService,
mock_store: AsyncMock,
):
"""Test getting a non-existent pipeline."""
mock_store.get.return_value = None
pipeline_id = uuid4()
result = await pipeline_service.get_pipeline(pipeline_id)
assert result is None
mock_store.get.assert_awaited_once_with(pipeline_id)
async def test_list_pipelines(
pipeline_service: PipelineService,
mock_store: AsyncMock,
sample_pipeline: Pipeline,
):
"""Test listing pipelines."""
pipelines_list = [
sample_pipeline,
sample_pipeline.model_copy(update={"id": uuid4(), "name": "Another Pipeline"}),
]
mock_store.get_all.return_value = pipelines_list
result = await pipeline_service.list_pipelines()
assert result == pipelines_list
mock_store.get_all.assert_awaited_once()
# --- Tests for run_pipeline ---
async def test_run_pipeline_not_found(
pipeline_service: PipelineService,
mock_store: AsyncMock,
):
"""Test running a pipeline that doesn't exist."""
mock_store.get.return_value = None
pipeline_id = uuid4()
# Patch the internal execution method just in case, although it shouldn't be reached
with patch.object(
pipeline_service, "_execute_ingestion", new_callable=AsyncMock
) as mock_exec:
await pipeline_service.run_pipeline(pipeline_id)
mock_store.get.assert_awaited_once_with(pipeline_id)
mock_store.save.assert_not_awaited()
mock_exec.assert_not_awaited()
async def test_run_pipeline_already_active(
pipeline_service: PipelineService,
mock_store: AsyncMock,
sample_pipeline: Pipeline,
):
"""Test running a pipeline that is already in ACTIVE status."""
active_pipeline = sample_pipeline.model_copy(
update={"status": PipelineStatus.ACTIVE}
)
mock_store.get.return_value = active_pipeline
pipeline_id = active_pipeline.id
with patch.object(
pipeline_service, "_execute_ingestion", new_callable=AsyncMock
) as mock_exec:
await pipeline_service.run_pipeline(pipeline_id)
mock_store.get.assert_awaited_once_with(pipeline_id)
mock_store.save.assert_not_awaited()
mock_exec.assert_not_awaited()
@freeze_time(FROZEN_TIME)
async def test_run_pipeline_success(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline: Pipeline,
mocker,
):
"""Test a successful pipeline run."""
pipeline_id = sample_pipeline.id
# --- Setup Mock Responses ---
# 1. Initial get returns the inactive pipeline
# 2. Second get (after execution) returns the pipeline again (simulate no external changes)
mock_store.get.side_effect = [
sample_pipeline,
sample_pipeline.model_copy(update={"status": PipelineStatus.ACTIVE}),
]
# --- Patch Internal Execution Logic ---
mock_execute_ingestion = mocker.patch.object(
pipeline_service, "_execute_ingestion", new_callable=AsyncMock
)
mock_execute_ingestion.return_value = None
# --- Execute ---
await pipeline_service.run_pipeline(pipeline_id)
# --- Assertions ---
# Verify get calls
assert mock_store.get.await_count == 2
mock_store.get.assert_any_await(pipeline_id)
# Verify execution logic was called
mock_execute_ingestion.assert_awaited_once() # Check if the core logic ran
# Verify save calls (should be 2: one for ACTIVE, one for INACTIVE+updates)
assert mock_store.save.await_count == 2
# Check the first save call (setting status to ACTIVE)
call1_args = mock_store.save.await_args_list[0][0]
saved_pipeline_active: Pipeline = call1_args[0]
assert saved_pipeline_active.id == pipeline_id
assert saved_pipeline_active.status == PipelineStatus.ACTIVE
# Check the second save call (setting status to INACTIVE and updating times)
call2_args = mock_store.save.await_args_list[1][0]
saved_pipeline_final: Pipeline = call2_args[0]
assert saved_pipeline_final.id == pipeline_id
assert saved_pipeline_final.status == PipelineStatus.INACTIVE
assert (
saved_pipeline_final.config.last_run == FROZEN_TIME
) # Should be updated to now
# Verify next_run was recalculated based on the new last_run time
expected_next_run_after_success = calculate_next_run(
saved_pipeline_final.config.run_frequency,
FROZEN_TIME, # The new last_run
FROZEN_TIME,
)
assert saved_pipeline_final.config.next_run == expected_next_run_after_success
# assert (
# saved_pipeline_final.config.next_run != original_next_run
# ) # Ensure it changed
# Verify scheduler notification for reschedule after completion
await asyncio.sleep(0) # Allow create_task to run
mock_scheduler.reschedule_pipeline.assert_awaited_once_with(saved_pipeline_final)
@freeze_time(FROZEN_TIME)
async def test_run_pipeline_execution_fails(
pipeline_service: PipelineService,
mock_store: AsyncMock,
mock_scheduler: AsyncMock,
sample_pipeline: Pipeline,
mocker,
):
"""Test a pipeline run where the internal execution logic fails."""
pipeline_id = sample_pipeline.id
# --- Setup Mock Responses ---
mock_store.get.side_effect = [
sample_pipeline,
sample_pipeline.model_copy(update={"status": PipelineStatus.ACTIVE}),
]
# --- Patch Internal Execution Logic to Raise Error ---
mock_execute_ingestion = mocker.patch.object(
pipeline_service, "_execute_ingestion", new_callable=AsyncMock
)
execution_error = ValueError("Something went wrong during ingestion")
mock_execute_ingestion.side_effect = execution_error
# --- Execute ---
# The service currently catches the exception and logs it, but doesn't re-raise.
# If it re-raised, we'd wrap this in pytest.raises.
await pipeline_service.run_pipeline(pipeline_id)
# --- Assertions ---
# Verify get calls
assert mock_store.get.await_count == 2
mock_store.get.assert_any_await(pipeline_id)
# Verify execution logic was called and raised error
mock_execute_ingestion.assert_awaited_once()
# Verify save calls (should be 2: one for ACTIVE, one for INACTIVE after failure)
assert mock_store.save.await_count == 2
# Check the first save call (setting status to ACTIVE)
call1_args = mock_store.save.await_args_list[0][0]
saved_pipeline_active: Pipeline = call1_args[0]
assert saved_pipeline_active.status == PipelineStatus.ACTIVE
# Check the second save call (setting status back to INACTIVE, NO last_run update)
call2_args = mock_store.save.await_args_list[1][0]
saved_pipeline_final: Pipeline = call2_args[0]
assert saved_pipeline_final.id == pipeline_id
assert saved_pipeline_final.status == PipelineStatus.FAILED
# ! IMPORTANT: last_run should NOT be updated on failure
assert saved_pipeline_final.config.last_run == sample_pipeline.config.last_run
# Next run should be recalculated based on the *original* last_run
expected_next_run_after_fail = calculate_next_run(
saved_pipeline_final.config.run_frequency,
sample_pipeline.config.last_run, # Use original last_run
FROZEN_TIME,
)
assert saved_pipeline_final.config.next_run == expected_next_run_after_fail
# Verify scheduler notification for reschedule even after failure
await asyncio.sleep(0)
mock_scheduler.reschedule_pipeline.assert_awaited_once_with(saved_pipeline_final)

View File

@ -317,12 +317,14 @@ dependencies = [
{ name = "apscheduler" },
{ name = "crawl4ai" },
{ name = "fastapi", extra = ["standard"] },
{ name = "freezegun" },
{ name = "inquirer" },
{ name = "loguru" },
{ name = "pandas" },
{ name = "pydantic-settings" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "pytest-mock" },
{ name = "python-dotenv" },
{ name = "responses" },
{ name = "rich" },
@ -333,12 +335,14 @@ requires-dist = [
{ name = "apscheduler", specifier = ">=3.11.0" },
{ name = "crawl4ai", specifier = ">=0.5.0.post8" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "freezegun", specifier = ">=1.5.1" },
{ name = "inquirer", specifier = ">=3.4.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "pandas", specifier = ">=2.2.3" },
{ name = "pydantic-settings", specifier = ">=2.9.1" },
{ name = "pytest", specifier = ">=8.3.5" },
{ name = "pytest-asyncio", specifier = ">=0.26.0" },
{ name = "pytest-mock", specifier = ">=3.14.0" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "responses", specifier = ">=0.25.7" },
{ name = "rich", specifier = ">=14.0.0" },
@ -518,6 +522,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215 },
]
[[package]]
name = "freezegun"
version = "1.5.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "python-dateutil" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2c/ef/722b8d71ddf4d48f25f6d78aa2533d505bf3eec000a7cacb8ccc8de61f2f/freezegun-1.5.1.tar.gz", hash = "sha256:b29dedfcda6d5e8e083ce71b2b542753ad48cfec44037b3fc79702e2980a89e9", size = 33697 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/51/0b/0d7fee5919bccc1fdc1c2a7528b98f65c6f69b223a3fd8f809918c142c36/freezegun-1.5.1-py3-none-any.whl", hash = "sha256:bf111d7138a8abe55ab48a71755673dbaa4ab87f4cff5634a4442dfec34c15f1", size = 17569 },
]
[[package]]
name = "frozenlist"
version = "1.5.0"
@ -1411,6 +1427,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/20/7f/338843f449ace853647ace35870874f69a764d251872ed1b4de9f234822c/pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0", size = 19694 },
]
[[package]]
name = "pytest-mock"
version = "3.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytest" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c6/90/a955c3ab35ccd41ad4de556596fa86685bf4fc5ffcc62d22d856cfd4e29a/pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0", size = 32814 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f2/3b/b26f90f74e2986a82df6e7ac7e319b8ea7ccece1caec9f8ab6104dc70603/pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f", size = 9863 },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"