backend-api/pipeline/services/pipeline_service.py

"""
Pipeline service to help do pipeline CRUD
"""
import asyncio
from datetime import datetime
from uuid import UUID, uuid4
from typing import Optional, List, TYPE_CHECKING
from loguru import logger
from ingestion import Ingestor
from models.pipeline import (
    Pipeline,
    PipelineCreate,
    PipelineConfig,
    RunFrequency,
    PipelineStatus,
)
from models.ingestion import IngestorInput, OutputData
from stores.base import PipelineStore
from scheduler.utils import calculate_next_run, UTC
# Use TYPE_CHECKING to avoid circular imports at runtime:
# SchedulerManager needs PipelineService, and PipelineService now needs SchedulerManager.
if TYPE_CHECKING:
    from scheduler.manager import SchedulerManager


class PipelineService:
    """Service for pipeline CRUD and run orchestration."""

    def __init__(
        self,
        store: PipelineStore,
        scheduler_manager: Optional["SchedulerManager"] = None,
    ):
        self.store = store
        # Store the scheduler instance (may be linked later via set_scheduler_manager).
        self.scheduler_manager: Optional["SchedulerManager"] = scheduler_manager
        logger.info(f"PipelineService initialized with store: {type(store).__name__}")
        if scheduler_manager:
            logger.info("PipelineService configured with SchedulerManager.")
        else:
            logger.warning(
                "PipelineService initialized without SchedulerManager. "
                "Scheduling notifications disabled."
            )

    def set_scheduler_manager(self, scheduler_manager: "SchedulerManager"):
        """Link the scheduler after construction (e.g., once both objects exist)."""
        self.scheduler_manager = scheduler_manager
        logger.info("SchedulerManager linked to PipelineService.")

    async def create_pipeline(
        self,
        name: str,
        description: str,
        ingestor_config: IngestorInput,
        run_frequency: RunFrequency,
    ) -> Pipeline:
        """Create a new pipeline, save it, and notify the scheduler."""
        logger.info(
            f"Creating pipeline: name={name}, description={description}, "
            f"run_frequency={run_frequency}"
        )
        try:
            pipeline_id = uuid4()
            now = datetime.now(UTC)
            # Calculate the initial next_run time.
            initial_next_run = calculate_next_run(
                frequency=run_frequency,
                last_run=None,
                start_reference_time=now,
            )
            pipeline = Pipeline(
                id=pipeline_id,
                name=name,
                description=description,
                config=PipelineConfig(
                    ingestor_config=ingestor_config,
                    run_frequency=run_frequency,
                    last_run=None,
                    next_run=initial_next_run,
                ),
                status=PipelineStatus.INACTIVE,
                created_at=now,
                updated_at=now,
            )
            await self.store.save(pipeline)
            logger.info(
                f"Pipeline created and saved: id={pipeline.id}, next_run={initial_next_run}"
            )
            # Notify the scheduler to add the job immediately.
            if self.scheduler_manager:
                logger.debug(f"Notifying scheduler to schedule pipeline {pipeline.id}")
                # Fire-and-forget notification; creation does not block on scheduling.
                asyncio.create_task(self.scheduler_manager.schedule_pipeline(pipeline))
            else:
                logger.warning(
                    f"Scheduler not available, cannot schedule pipeline {pipeline.id} immediately."
                )
            return pipeline
        except Exception as e:
            logger.error(f"Failed to create pipeline: {e}", exc_info=True)
            raise  # Re-raise the exception after logging.
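
    # Usage sketch (hypothetical: the exact IngestorInput payload and the
    # RunFrequency member names are assumptions, adapt to the real models).
    # The new pipeline starts INACTIVE with next_run already computed:
    #
    #     pipeline = await service.create_pipeline(
    #         name="news-feed",
    #         description="Hourly news ingestion",
    #         ingestor_config=IngestorInput(sources=[...]),
    #         run_frequency=RunFrequency.HOURLY,  # assumed member name
    #     )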

    async def update_pipeline(
        self, pipeline_id: UUID, pipeline_in: PipelineCreate
    ) -> Optional[Pipeline]:
        """Update an existing pipeline, save it, and notify the scheduler."""
        logger.info(f"Updating pipeline: id={pipeline_id}")
        existing_pipeline = await self.store.get(pipeline_id)
        if not existing_pipeline:
            logger.warning(f"Pipeline not found for update: id={pipeline_id}")
            return None
        try:
            # 1. Create a deep copy to modify.
            updated_pipeline = existing_pipeline.model_copy(deep=True)
            # 2. Update top-level fields directly from the input model.
            updated_pipeline.name = pipeline_in.name
            updated_pipeline.description = pipeline_in.description
            # 3. Handle the config update carefully.
            config_changed = False
            frequency_changed = False
            # Store the frequency before any potential change.
            original_frequency = updated_pipeline.config.run_frequency
            # Check whether the input payload actually provided config data.
            if pipeline_in.config:
                config_changed = True
                # Update the fields *within* the existing config object.
                # Deep-copy the nested ingestor_config for safety.
                updated_pipeline.config.ingestor_config = (
                    pipeline_in.config.ingestor_config.model_copy(deep=True)
                )
                updated_pipeline.config.run_frequency = pipeline_in.config.run_frequency
                # Check whether the frequency actually changed after the update.
                if updated_pipeline.config.run_frequency != original_frequency:
                    frequency_changed = True
            # 4. Recalculate next_run ONLY if the frequency changed.
            if frequency_changed:
                logger.info(
                    f"Run frequency changed for pipeline {pipeline_id} from "
                    f"{original_frequency} to {updated_pipeline.config.run_frequency}. "
                    "Recalculating next run."
                )
                now = datetime.now(UTC)
                # Use the existing last_run from the copied object.
                updated_pipeline.config.next_run = calculate_next_run(
                    frequency=updated_pipeline.config.run_frequency,
                    last_run=updated_pipeline.config.last_run,
                    start_reference_time=now,
                )
                logger.info(
                    f"Recalculated next_run for {pipeline_id}: {updated_pipeline.config.next_run}"
                )
            # 5. Update the timestamp before saving.
            updated_pipeline.updated_at = datetime.now(UTC)
            # 6. Save the updated pipeline.
            await self.store.save(updated_pipeline)
            logger.info(f"Pipeline updated successfully: id={updated_pipeline.id}")
            # 7. Notify the scheduler if the config changed (including frequency).
            # The scheduler needs the *final* state of the pipeline for rescheduling.
            if self.scheduler_manager and config_changed:
                logger.debug(
                    f"Notifying scheduler to reschedule pipeline {updated_pipeline.id} "
                    "due to config change."
                )
                # Pass the fully updated pipeline object.
                asyncio.create_task(
                    self.scheduler_manager.reschedule_pipeline(updated_pipeline)
                )
            elif self.scheduler_manager:
                # NOTE: we might still want to reschedule if non-config updates could
                # affect execution, but currently only config changes trigger it.
                logger.debug(
                    f"Pipeline {updated_pipeline.id} updated (non-config fields), "
                    "no reschedule needed."
                )
            return updated_pipeline
        except Exception as e:
            logger.error(f"Failed to update pipeline id={pipeline_id}: {e}", exc_info=True)
            raise
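
    # Update sketch (hypothetical; assumes PipelineCreate accepts config=None to
    # mean "leave the config untouched", which the `if pipeline_in.config:` guard
    # above supports). A name-only change does not trigger a reschedule:
    #
    #     renamed = await service.update_pipeline(
    #         pipeline.id,
    #         PipelineCreate(name="news-feed-v2", description="Renamed", config=None),
    #     )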

    async def delete_pipeline(self, pipeline_id: UUID) -> bool:
        """Delete an existing pipeline and notify the scheduler."""
        logger.info(f"Attempting to delete pipeline: id={pipeline_id}")
        pipeline_exists = await self.store.get(pipeline_id) is not None
        if not pipeline_exists:
            logger.warning(f"Pipeline {pipeline_id} not found for deletion.")
            return False
        # Notify the scheduler *before* deleting from the store, in case deletion fails.
        if self.scheduler_manager:
            logger.debug(f"Notifying scheduler to unschedule pipeline {pipeline_id}")
            # Await this so the job is removed before the DB record is gone.
            await self.scheduler_manager.unschedule_pipeline(pipeline_id)
        else:
            logger.warning(
                f"Scheduler not available, cannot unschedule pipeline {pipeline_id}."
            )
        # Proceed with deletion from the store.
        deleted = await self.store.delete(pipeline_id)
        if deleted:
            logger.info(f"Pipeline deleted successfully from store: id={pipeline_id}")
        else:
            # This can happen if the pipeline was already gone, or the store errored.
            # The scheduler job has been removed anyway if it existed.
            logger.warning(
                f"Pipeline {pipeline_id} not found in store for deletion, or delete failed."
            )
        return deleted

    async def get_pipeline(self, pipeline_id: UUID) -> Optional[Pipeline]:
        """Get a single pipeline by ID."""
        logger.debug(f"Getting pipeline: id={pipeline_id}")
        return await self.store.get(pipeline_id)

    async def list_pipelines(self) -> List[Pipeline]:
        """Get all pipelines."""
        logger.debug("Listing all pipelines")
        return await self.store.get_all()

    async def run_pipeline(self, pipeline_id: UUID) -> None:
        """
        Execute the pipeline logic, updating status and run times.
        Logs associated with this run will include the pipeline_id.
        """
        # Use contextualize to tag logs originating from this specific run
        # (stringify the UUID for the log context).
        with logger.contextualize(pipeline_id=str(pipeline_id)):
            logger.info("Attempting run execution for pipeline")
            pipeline = await self.store.get(pipeline_id)
            if not pipeline:
                logger.error("Cannot run pipeline: Pipeline not found")
                return
            if pipeline.status == PipelineStatus.ACTIVE:
                logger.warning("Pipeline is already ACTIVE. Skipping run.")
                return

            # --- Mark as ACTIVE ---
            # original_status = pipeline.status  # Kept for a potential rollback.
            try:
                pipeline.status = PipelineStatus.ACTIVE
                pipeline.updated_at = datetime.now(UTC)
                await self.store.save(pipeline)
                logger.info("Pipeline marked as ACTIVE.")
            except Exception as e:
                logger.error(
                    f"Failed to mark pipeline as ACTIVE: {e}. Aborting run.",
                    exc_info=True,
                )
                # Restoring the status here would need careful thought on atomicity:
                # pipeline.status = original_status
                # await self.store.save(pipeline)  # Potential race condition/overwrite.
                return

            # --- Execute Pipeline Logic ---
            run_successful = False
            ingestion_output: OutputData | None = None
            try:
                logger.info("Executing core logic...")
                # This call and anything within it inherits the pipeline_id context.
                ingestion_output = await self._execute_ingestion(
                    pipeline.config.ingestor_config
                )
                logger.info("Core logic finished successfully.")
                run_successful = True
            except Exception as e:
                logger.error(f"Core logic failed during pipeline run: {e}", exc_info=True)
                # run_successful remains False.

            # --- Update Final State ---
            try:
                # Fetch the latest state again (important if external changes are possible).
                final_pipeline_state = await self.store.get(pipeline_id)
                if not final_pipeline_state:
                    logger.warning(
                        "Pipeline disappeared during run. Cannot update final state."
                    )
                    # Handle potential deletion during the run (unschedule if needed).
                    if self.scheduler_manager:
                        logger.warning("Attempting to unschedule potentially orphaned job")
                        asyncio.create_task(
                            self.scheduler_manager.unschedule_pipeline(pipeline_id)
                        )
                    return
                final_pipeline_state = final_pipeline_state.model_copy(deep=True)
                now = datetime.now(UTC)
                final_pipeline_state.status = (
                    PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED
                )
                if run_successful:
                    final_pipeline_state.config.last_run = now
                    if ingestion_output:
                        final_pipeline_state.latest_run_output = ingestion_output
                    else:
                        logger.warning(
                            "Run was successful but no ingestion output captured."
                        )
                        final_pipeline_state.latest_run_output = None
                else:
                    logger.warning("Run failed.")
                current_last_run = final_pipeline_state.config.last_run
                final_pipeline_state.config.next_run = calculate_next_run(
                    frequency=final_pipeline_state.config.run_frequency,
                    last_run=current_last_run,
                    start_reference_time=now,
                )
                final_pipeline_state.updated_at = now
                await self.store.save(final_pipeline_state)
                logger.info(
                    f"Pipeline run finished. Status: {final_pipeline_state.status}, "
                    f"Last Run: {final_pipeline_state.config.last_run}, "
                    f"Next Run: {final_pipeline_state.config.next_run}"
                )
                if self.scheduler_manager:
                    logger.debug(
                        "Notifying scheduler to reschedule pipeline after run completion"
                    )
                    asyncio.create_task(
                        self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
                    )
            except Exception as e:
                logger.error(
                    f"Failed to update pipeline state after run execution: {e}",
                    exc_info=True,
                )
                # The pipeline might be left ACTIVE, or the FAILED state might not
                # be saved. Needs monitoring.
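
    # Run lifecycle sketch (illustrative summary of the logic above, not a
    # separate state machine in the codebase):
    #
    #     INACTIVE/FAILED --run_pipeline--> ACTIVE --success--> INACTIVE
    #                                              --failure--> FAILED
    #
    # In both terminal cases next_run is recalculated and, if a scheduler is
    # linked, the job is rescheduled with the final pipeline state.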

    async def _execute_ingestion(self, config: IngestorInput) -> OutputData | None:
        """
        Execute the ingestion process for a pipeline using the provided
        IngestorInput config. Returns the ingestion results or raises an
        exception on failure.
        """
        try:
            logger.info(f"Executing ingestion with config: {config}")
            results: OutputData = await Ingestor.run(config.sources)
            logger.info(
                f"Ingestion completed successfully. Records count: {len(results.records)}"
            )
            return results
        except ImportError as e:
            logger.error("Failed to import Ingestor. Cannot execute ingestion.")
            raise RuntimeError("Ingestion module not found") from e
        except Exception as e:
            logger.error(f"Ingestion execution failed: {e}", exc_info=True)
            raise
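
    # Contract sketch (inferred from the call above, not from the ingestion
    # module's documentation): Ingestor.run takes the config's sources and
    # returns an OutputData whose .records supports len().
    #
    #     results: OutputData = await Ingestor.run(config.sources)
    #     logger.info(f"{len(results.records)} records ingested")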

    async def get_pipeline_latest_results(
        self, pipeline_id: UUID
    ) -> Optional[OutputData]:
        """Retrieve the output from the latest successful run of a pipeline."""
        logger.debug(f"Getting latest results for pipeline: id={pipeline_id}")
        pipeline = await self.store.get(pipeline_id)
        if pipeline:
            if pipeline.latest_run_output and pipeline.config.last_run:
                # NOTE: could use PipelineRunResult here.
                return pipeline.latest_run_output
            elif pipeline.config.last_run:
                logger.info(
                    f"Pipeline {pipeline_id} ran at {pipeline.config.last_run} "
                    "but has no stored output (or the run failed)."
                )
                return None
            else:
                logger.info(f"Pipeline {pipeline_id} has no recorded run or output.")
                return None
        logger.warning(f"Pipeline {pipeline_id} not found when retrieving results.")
        return None
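

# End-to-end sketch (hypothetical; SomePipelineStore, the IngestorInput payload,
# and the RunFrequency member are assumptions -- this module ships no demo):
#
#     async def demo() -> None:
#         service = PipelineService(store=SomePipelineStore())
#         pipeline = await service.create_pipeline(
#             name="demo",
#             description="Manual run without a scheduler",
#             ingestor_config=IngestorInput(sources=[...]),
#             run_frequency=RunFrequency.DAILY,  # assumed member name
#         )
#         await service.run_pipeline(pipeline.id)
#         results = await service.get_pipeline_latest_results(pipeline.id)
#
#     asyncio.run(demo())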