# backend-api/backend/app/workers/tasks.py

import asyncio
from datetime import datetime
import logging
import random
import time

from celery import chord, group, shared_task

from app import crud, models, schemas  # Keep imports for structure
from app.core.db import AsyncSessionFactory  # Use session factory directly in tasks
from app.services.processing_service import ProcessingService  # Import dummy service

logger = logging.getLogger(__name__)


# --- Helper to run async code from sync Celery tasks ---
def async_to_sync(awaitable):
    """Runs an awaitable in a new event loop."""
    return asyncio.run(awaitable)
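
# NOTE: async_to_sync relies on asyncio.run(), which starts a fresh event loop per
# call and raises RuntimeError if an event loop is already running in the current
# thread; it is therefore only safe to call from plain synchronous code, such as
# the Celery task bodies below, e.g. async_to_sync(process_and_save()).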


# --- Dummy Sub-Tasks ---
@shared_task(bind=True, max_retries=1, default_retry_delay=5)
def dummy_source_task(self, source_id: int, source_type: str):
    """DUMMY: Simulates processing any data source type."""
    task_id = self.request.id
    logger.info(f"DUMMY TASK dummy_source_task[ID:{task_id}]: Start DS:{source_id} Type:{source_type}")

    work_time = random.uniform(0.05, 0.2)
    time.sleep(work_time)  # Simulate work

    # Simulate occasional failure
    if random.random() < 0.08:
        error_msg = f"Simulated failure processing source {source_id}"
        logger.warning(f"DUMMY TASK dummy_source_task[ID:{task_id}]: {error_msg}")
        raise ValueError(error_msg)  # Raise exception for Celery retry/failure

    # Simulate successful result (list of dicts)
    num_records = random.randint(1, 3)
    result = [{f"data_{source_id}_{i}": random.random(), "source_type": source_type} for i in range(num_records)]
    logger.info(f"DUMMY TASK dummy_source_task[ID:{task_id}]: Finish DS:{source_id}, generated {num_records} records.")
    return result
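
# Illustrative only (not real data): a successful dummy_source_task call returns a
# list like [{"data_7_0": 0.42, "source_type": "csv"}, {"data_7_1": 0.91, "source_type": "csv"}].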


# --- Dummy Aggregation Task (Callback) ---
@shared_task(bind=True)
def dummy_aggregate_task(self, results: list, pipeline_id: int, run_id: int):
    """DUMMY: Simulates aggregating results and saving."""
    task_id = self.request.id
    logger.info(
        f"DUMMY TASK dummy_aggregate_task[ID:{task_id}]: Start Aggregation for RunID:{run_id}, PipelineID:{pipeline_id}. Received {len(results)} results."
    )
    log_messages = [f"Aggregation simulation started at {datetime.utcnow()}"]
    final_status = schemas.PipelineStatus.COMPLETED
    output_location = None
    errors_encountered = sum(1 for r in results if isinstance(r, Exception))

    # Instantiate dummy service
    service = ProcessingService()

    async def process_and_save():
        nonlocal output_location, final_status  # Allow modification
        processed_df = None
        try:
            # Call dummy processing service
            processed_df = await service.process_pipeline_results(results, {"dummy_pipeline_cfg": True})
            if not processed_df.empty:
                # Simulate saving (no actual file handler needed here for dummy)
                await asyncio.sleep(0.1)  # Simulate save time
                output_location = f"dummy_outputs/run_{run_id}_output_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}.csv"
                log_messages.append(f"Simulated saving results to {output_location}, shape: {processed_df.shape}")
                logger.info(f"DUMMY AGGREGATION: Simulated save complete to {output_location}")
            else:
                log_messages.append("No data processed after aggregation/filtering.")
                # Keep COMPLETED status if no errors; the except branch below sets FAILED
        except Exception as e:
            logger.error(f"DUMMY AGGREGATION: Error during dummy processing: {e}", exc_info=True)
            log_messages.append(f"ERROR during processing: {e}")
            final_status = schemas.PipelineStatus.FAILED
            output_location = None

        if errors_encountered > 0 and final_status != schemas.PipelineStatus.FAILED:
            log_messages.append("Pipeline simulation completed with source task errors.")
            # Optional: set a specific status like COMPLETED_WITH_ERRORS here if needed
            if processed_df is None or processed_df.empty:
                final_status = schemas.PipelineStatus.FAILED  # Fail if there were source errors and no data

        # Simulate DB Update
        final_log = "\n".join(log_messages)
        logger.info(f"DUMMY AGGREGATION: Simulating final DB update for RunID:{run_id} to status {final_status}")
        if AsyncSessionFactory:  # Check if DB is configured
            async with AsyncSessionFactory() as session:
                try:
                    # Call dummy CRUD functions
                    await crud.pipeline_run.update_run_status(
                        db=session,
                        run_id=run_id,
                        status=final_status,
                        output_location=output_location,
                        run_log=final_log,
                    )
                    await crud.pipeline.update(  # Use generic update for status
                        db=session,
                        db_obj=models.Pipeline(id=pipeline_id),  # Need a dummy obj for update
                        obj_in={"status": schemas.PipelineStatus.IDLE},
                    )
                    logger.info(f"DUMMY AGGREGATION: DB update simulation successful for RunID:{run_id}.")
                except Exception as db_exc:
                    logger.error(
                        f"DUMMY AGGREGATION: Failed DB update simulation for RunID:{run_id}: {db_exc}", exc_info=True
                    )
        else:
            logger.warning("DUMMY AGGREGATION: Skipping DB update simulation as DB is not configured.")

    async_to_sync(process_and_save())
    logger.info(f"DUMMY TASK dummy_aggregate_task[ID:{task_id}]: Finish Aggregation Simulation for RunID:{run_id}")


# --- Dummy Pipeline Orchestrator Task ---
@shared_task(bind=True)
def run_pipeline_task(self, pipeline_id: int):
    """DUMMY: Simulates fetching pipeline details and scheduling sub-tasks."""
    task_id = self.request.id
    logger.info(
        f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Start Orchestration Simulation for PipelineID:{pipeline_id}"
    )
    run_id = None

    async def setup_and_dispatch():
        """Set up the dummy run and build sub-task signatures.

        Returns (sub_task_signatures, error): (None, msg) if setup fails,
        ([], None) when there is nothing to run, (signatures, None) otherwise.
        """
        nonlocal run_id
        if not AsyncSessionFactory:
            logger.error("Cannot simulate pipeline run: Database not configured.")
            return None, "Database not configured"
        async with AsyncSessionFactory() as session:
            # 1. Get Pipeline (dummy)
            pipeline = await crud.pipeline.get_with_details(session, id=pipeline_id)
            if not pipeline:
                logger.error(f"Pipeline {pipeline_id} not found (simulated).")
                return None, "Pipeline not found"
            if pipeline.status != schemas.PipelineStatus.IDLE:
                logger.warning(f"Pipeline {pipeline_id} not idle (status: {pipeline.status}), skipping run simulation.")
                return None, f"Pipeline status is {pipeline.status}"

            # 2. Create Run Record (dummy)
            run = await crud.pipeline_run.create(
                session, pipeline_id=pipeline_id, celery_task_id=task_id, status=schemas.PipelineStatus.RUNNING
            )
            run_id = run.id
            logger.info(f"Created dummy PipelineRun record with RunID:{run_id}")

            # 3. Update Pipeline Status (dummy)
            await crud.pipeline.update(session, db_obj=pipeline, obj_in={"status": schemas.PipelineStatus.RUNNING})
            logger.info(f"Set dummy Pipeline {pipeline_id} status to RUNNING")

            # 4. Prepare sub-tasks (using dummy sources from get_with_details)
            if not pipeline.data_sources:
                logger.warning(f"No data sources found for pipeline {pipeline_id}. Finishing run.")
                await crud.pipeline_run.update_run_status(
                    session, run_id=run_id, status=schemas.PipelineStatus.COMPLETED, run_log="No data sources found."
                )
                await crud.pipeline.update(session, db_obj=pipeline, obj_in={"status": schemas.PipelineStatus.IDLE})
                return [], None  # No tasks to run

            sub_tasks = [dummy_source_task.s(ds.id, ds.type.value) for ds in pipeline.data_sources]
            logger.info(f"Prepared {len(sub_tasks)} dummy sub-tasks for RunID:{run_id}")
            return sub_tasks, None

    async def fail_run(error_message: str):
        """Helper to mark run as failed if setup simulation fails."""
        if run_id and AsyncSessionFactory:
            logger.error(f"Simulating run failure for RunID:{run_id} - {error_message}")
            async with AsyncSessionFactory() as session:
                await crud.pipeline_run.update_run_status(
                    db=session,
                    run_id=run_id,
                    status=schemas.PipelineStatus.FAILED,
                    run_log=f"Orchestration failed: {error_message}",
                )
                await crud.pipeline.update(
                    db=session, db_obj=models.Pipeline(id=pipeline_id), obj_in={"status": schemas.PipelineStatus.IDLE}
                )

    try:
        sub_task_signatures, error = async_to_sync(setup_and_dispatch())
        if error:
            logger.error(f"Orchestration setup simulation failed: {error}")
            # Error returns from setup happen before a run record is created, so there is
            # nothing to mark as failed here; exceptions are handled below via fail_run.
            return
        if not sub_task_signatures:
            logger.info("No sub-tasks to execute.")
            return  # Setup marked run as completed/failed

        # Define the workflow chord
        workflow = chord(
            header=group(sub_task_signatures),
            body=dummy_aggregate_task.s(pipeline_id=pipeline_id, run_id=run_id),  # Ensure run_id is passed
        )
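
        # Illustrative only; this dummy task never dispatches the chord. A real
        # dispatch would look something like:
        #     async_result = workflow.apply_async()
        # Celery would then run the header group in parallel and call
        # dummy_aggregate_task with the list of header results prepended as its
        # first positional argument (the `results` parameter).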

        # Simulate applying the workflow
        logger.info(
            f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Simulating Celery chord apply_async() for RunID:{run_id}"
        )
        # In a real test you might call workflow() directly to execute synchronously.
        # For this dummy structure, just log the intent.
        logger.info(f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Workflow simulation scheduled for RunID:{run_id}")
    except Exception as exc:
        logger.error(
            f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Orchestration Simulation FAILED: {exc}", exc_info=True
        )
        async_to_sync(fail_run(f"Orchestration simulation exception: {type(exc).__name__}"))
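

# Illustrative, hypothetical caller (e.g. an API endpoint) queuing the orchestrator
# via Celery's standard API:
#     run_pipeline_task.delay(pipeline_id)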