""" Pipeline service to help do pipeline CRUD """ import asyncio from datetime import datetime from uuid import UUID, uuid4 from typing import Optional, List, TYPE_CHECKING from loguru import logger from models.pipeline import ( Pipeline, PipelineCreate, PipelineConfig, RunFrequency, PipelineStatus, ) from models.ingestion import IngestorInput from stores.base import PipelineStore from scheduler.utils import calculate_next_run, UTC # Import the utility and UTC # !use TYPE_CHECKING to avoid circular imports at runtime # the SchedulerManager needs PipelineService, and PipelineService now needs SchedulerManager if TYPE_CHECKING: from scheduler.manager import SchedulerManager class PipelineService: """ Pipeline service to help do pipeline CRUD """ def __init__( self, store: PipelineStore, scheduler_manager: Optional["SchedulerManager"] = None, ): self.store = store self.scheduler_manager: Optional["SchedulerManager"] = ( scheduler_manager # Store the scheduler instance ) logger.info(f"PipelineService initialized with store: {type(store).__name__}") if scheduler_manager: logger.info("PipelineService configured with SchedulerManager.") else: logger.warning( "PipelineService initialized without SchedulerManager. Scheduling notifications disabled." ) def set_scheduler_manager(self, scheduler_manager: "SchedulerManager"): """ Method to link the scheduler later if needed (e.g., after both are created) """ self.scheduler_manager = scheduler_manager logger.info("SchedulerManager linked to PipelineService.") async def create_pipeline( self, name: str, description: str, ingestor_config: IngestorInput, run_frequency: RunFrequency, ) -> Pipeline: """Create a new pipeline, save it, and notify the scheduler.""" logger.info( f"Creating pipeline: name={name}, description={description}, run_frequency={run_frequency}" ) try: pipeline_id = uuid4() now = datetime.now(UTC) # Use UTC consistently # Calculate the initial next_run time initial_next_run = calculate_next_run( frequency=run_frequency, last_run=None, # No last run yet start_reference_time=now, ) pipeline = Pipeline( id=pipeline_id, name=name, description=description, config=PipelineConfig( ingestor_config=ingestor_config, run_frequency=run_frequency, last_run=None, next_run=initial_next_run, # Store the calculated next run ), status=PipelineStatus.INACTIVE, # Start as inactive created_at=now, updated_at=now, ) await self.store.save(pipeline) logger.info( f"Pipeline created and saved: id={pipeline.id}, next_run={initial_next_run}" ) # Notify the scheduler to add the job immediately if self.scheduler_manager: logger.debug(f"Notifying scheduler to schedule pipeline {pipeline.id}") # Use asyncio.create_task for fire-and-forget notification asyncio.create_task(self.scheduler_manager.schedule_pipeline(pipeline)) else: logger.warning( f"Scheduler not available, cannot schedule pipeline {pipeline.id} immediately." 

    async def update_pipeline(
        self, pipeline_id: UUID, pipeline_in: PipelineCreate
    ) -> Optional[Pipeline]:
        """Update an existing pipeline, save it, and notify the scheduler."""
        logger.info(f"Updating pipeline: id={pipeline_id}")
        existing_pipeline = await self.store.get(pipeline_id)
        if not existing_pipeline:
            logger.warning(f"Pipeline not found for update: id={pipeline_id}")
            return None

        try:
            update_data = pipeline_in.model_dump(exclude_unset=True)

            # model_copy(update=...) does not validate nested dicts, so a raw
            # config dict would silently replace the PipelineConfig instance.
            # Merge and re-validate the config first so it stays typed and
            # fields the update did not set (e.g., last_run) are preserved.
            config_changed = "config" in update_data
            if config_changed:
                update_data["config"] = PipelineConfig.model_validate(
                    {
                        **existing_pipeline.config.model_dump(),
                        **update_data["config"],
                    }
                )

            # Use model_copy for a cleaner update merge.
            updated_pipeline = existing_pipeline.model_copy(
                deep=True, update=update_data
            )

            # If the frequency changed, recalculate next_run.
            frequency_changed = False
            if (
                config_changed
                and updated_pipeline.config.run_frequency
                != existing_pipeline.config.run_frequency
            ):
                frequency_changed = True
                logger.info(
                    f"Run frequency changed for pipeline {pipeline_id}. Recalculating next run."
                )
                now = datetime.now(UTC)
                updated_pipeline.config.next_run = calculate_next_run(
                    frequency=updated_pipeline.config.run_frequency,
                    last_run=existing_pipeline.config.last_run,  # Base on last run
                    start_reference_time=now,
                )
                logger.info(
                    f"Recalculated next_run for {pipeline_id}: {updated_pipeline.config.next_run}"
                )

            # Save the updated pipeline (the store's save method handles updated_at).
            await self.store.save(updated_pipeline)
            logger.info(f"Pipeline updated: id={updated_pipeline.id}")

            # Notify the scheduler on any config change (which covers frequency changes).
            if self.scheduler_manager and (config_changed or frequency_changed):
                logger.debug(
                    f"Notifying scheduler to reschedule pipeline {updated_pipeline.id}"
                )
                asyncio.create_task(
                    self.scheduler_manager.reschedule_pipeline(updated_pipeline)
                )
            elif self.scheduler_manager:
                logger.debug(
                    f"Pipeline {updated_pipeline.id} updated, but no schedule change needed."
                )

            return updated_pipeline
        except Exception as e:
            logger.error(
                f"Failed to update pipeline id={pipeline_id}: {e}", exc_info=True
            )
            raise

    async def delete_pipeline(self, pipeline_id: UUID) -> bool:
        """Delete an existing pipeline and notify the scheduler."""
        logger.info(f"Attempting to delete pipeline: id={pipeline_id}")
        pipeline_exists = await self.store.get(pipeline_id) is not None
        if not pipeline_exists:
            logger.warning(f"Pipeline {pipeline_id} not found for deletion.")
            return False

        # Notify the scheduler *before* deleting from the store, in case deletion fails.
        if self.scheduler_manager:
            logger.debug(f"Notifying scheduler to unschedule pipeline {pipeline_id}")
            # Wait for this, to ensure the job is removed before the DB record is gone.
            await self.scheduler_manager.unschedule_pipeline(pipeline_id)
        else:
            logger.warning(
                f"Scheduler not available, cannot unschedule pipeline {pipeline_id}."
            )

        # Proceed with deletion from the store.
        deleted = await self.store.delete(pipeline_id)
        if deleted:
            logger.info(f"Pipeline deleted successfully from store: id={pipeline_id}")
        else:
            # This might happen if the pipeline was already gone, or on a store error.
            logger.warning(
                f"Pipeline {pipeline_id} not found in store for deletion, or delete failed."
            )
            # The scheduler job should have been removed anyway, if it existed.
        return deleted
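
    # delete_pipeline unschedules before deleting, so a failed store delete can
    # leave a pipeline present in the store but unscheduled. One possible
    # compensation (a sketch of an assumed policy, not part of the flow above)
    # is to restore the job when the delete does not go through:
    async def _restore_schedule_after_failed_delete(self, pipeline_id: UUID) -> None:
        """Sketch: re-add the scheduler job if the store delete failed."""
        pipeline = await self.store.get(pipeline_id)
        if pipeline and self.scheduler_manager:
            logger.warning(f"Delete failed for {pipeline_id}; restoring its schedule.")
            await self.scheduler_manager.schedule_pipeline(pipeline)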

    async def get_pipeline(self, pipeline_id: UUID) -> Optional[Pipeline]:
        """Get a single pipeline by ID."""
        logger.debug(f"Getting pipeline: id={pipeline_id}")
        return await self.store.get(pipeline_id)

    async def list_pipelines(self) -> List[Pipeline]:
        """Get all pipelines."""
        logger.debug("Listing all pipelines")
        return await self.store.get_all()

    async def run_pipeline(self, pipeline_id: UUID) -> None:
        """
        Execute the pipeline logic, updating status and run times.

        Called by the scheduler job or a manual trigger.
        """
        logger.info(f"Attempting run execution for pipeline: id={pipeline_id}")
        pipeline = await self.store.get(pipeline_id)
        if not pipeline:
            logger.error(f"Cannot run pipeline: Pipeline not found (id={pipeline_id})")
            return

        # Simple lock mechanism using status.
        if pipeline.status == PipelineStatus.ACTIVE:
            logger.warning(
                f"Pipeline id={pipeline_id} is already ACTIVE. Skipping run."
            )
            return

        # --- Mark as ACTIVE ---
        try:
            pipeline.status = PipelineStatus.ACTIVE
            # Optionally mark the start time here if needed, but last_run
            # usually marks completion:
            # pipeline.config.last_run = datetime.now(UTC)
            await self.store.save(pipeline)
            logger.info(f"Pipeline {pipeline_id} marked as ACTIVE.")
        except Exception as e:
            logger.error(
                f"Failed to mark pipeline {pipeline_id} as ACTIVE: {e}. Aborting run.",
                exc_info=True,
            )
            # Restore the original status if possible? Depends on the store implementation.
            return  # Abort the run if we can't even update the status

        # --- Execute Pipeline Logic ---
        run_successful = False
        try:
            logger.info(f"Executing core logic for pipeline id={pipeline_id}...")
            # ---------------------------------------------------
            # TODO: replace with actual pipeline execution call
            # Example: await self._execute_ingestion(pipeline.config.ingestor_config)
            # Example: await self._process_data(...)
            await asyncio.sleep(5)  # Simulate work
            logger.info(f"Core logic finished successfully for id={pipeline_id}.")
            # ---------------------------------------------------
            run_successful = True
        except Exception as e:
            logger.error(
                f"Core logic failed during pipeline run id={pipeline_id}: {e}",
                exc_info=True,
            )
            # run_successful remains False

        # --- Update Final State ---
        try:
            # Fetch the latest state again in case of external changes
            # (unlikely while the ACTIVE status acts as a lock).
            final_pipeline_state = await self.store.get(pipeline_id)
            if not final_pipeline_state:
                logger.warning(
                    f"Pipeline {pipeline_id} disappeared during run. Cannot update final state."
                )
                return

            now = datetime.now(UTC)
            final_pipeline_state.status = PipelineStatus.INACTIVE  # Reset status
            # TODO: Add a FAILED status?
            # final_pipeline_state.status = PipelineStatus.INACTIVE if run_successful else PipelineStatus.FAILED

            if run_successful:
                # Mark the completion time on success.
                final_pipeline_state.config.last_run = now

            # Calculate and store the *next* run time after this one.
            final_pipeline_state.config.next_run = calculate_next_run(
                frequency=final_pipeline_state.config.run_frequency,
                last_run=final_pipeline_state.config.last_run,  # Use the updated last_run
                start_reference_time=now,
            )

            await self.store.save(final_pipeline_state)
            logger.info(
                f"Pipeline {pipeline_id} run finished. Status: {final_pipeline_state.status}, "
                f"Last Run: {final_pipeline_state.config.last_run}, "
                f"Next Run: {final_pipeline_state.config.next_run}"
            )

            # Notify the scheduler about the *new* next run time.
            if self.scheduler_manager:
                logger.debug(
                    f"Notifying scheduler to reschedule pipeline {pipeline_id} after run completion."
                )
                asyncio.create_task(
                    self.scheduler_manager.reschedule_pipeline(final_pipeline_state)
                )
        except Exception as e:
            logger.error(
                f"Failed to update pipeline {pipeline_id} state after run execution: {e}",
                exc_info=True,
            )
            # The pipeline might be left in ACTIVE state if this fails.
            # Requires manual intervention or recovery logic.
            # TODO: Complete this method
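
    # The status-based lock in run_pipeline means a crash mid-run can leave a
    # pipeline stuck in ACTIVE and skipped forever. A minimal recovery sketch
    # (an assumed startup policy, not part of the original design): reset any
    # ACTIVE pipeline to INACTIVE so the scheduler can pick it up again.
    async def _recover_stuck_pipelines(self) -> int:
        """Sketch: reset pipelines left ACTIVE by an interrupted run."""
        recovered = 0
        for pipeline in await self.store.get_all():
            if pipeline.status == PipelineStatus.ACTIVE:
                pipeline.status = PipelineStatus.INACTIVE
                await self.store.save(pipeline)
                recovered += 1
        if recovered:
            logger.warning(f"Recovered {recovered} pipeline(s) stuck in ACTIVE.")
        return recovered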

    # --- Placeholder for actual execution ---
    async def _execute_ingestion(self, config: IngestorInput):
        # Replace with your actual ingestion logic.
        logger.info(f"Simulating ingestion with config: {config}")
        await asyncio.sleep(2)  # Simulate I/O
        logger.info("Ingestion simulation complete.")
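
    # run_pipeline's TODO also references a _process_data step. This mirrors
    # the _execute_ingestion placeholder above; the signature is an assumption,
    # since the real processing step is not defined in this module.
    async def _process_data(self, pipeline: Pipeline):
        # Replace with your actual processing logic.
        logger.info(f"Simulating data processing for pipeline: {pipeline.id}")
        await asyncio.sleep(2)  # Simulate work
        logger.info("Data processing simulation complete.")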