refactor: initial backend code

Sosokker committed 2025-04-07 22:53:09 +07:00
parent fbd4113cca
commit e632ee0511
25 changed files with 1327 additions and 0 deletions

backend/.gitignore (vendored, new file)
@@ -0,0 +1,10 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv

backend/.python-version (new file)
@@ -0,0 +1 @@
3.12

backend/alembic.ini (new file)
@@ -0,0 +1,119 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires python>=3.9 or the backports.zoneinfo library, plus the tzdata library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
# version_path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
version_path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = driver://user:pass@localhost/dbname
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

backend/alembic/README (new file)
@@ -0,0 +1 @@
Generic single-database configuration.

backend/alembic/env.py (new file)
@@ -0,0 +1,78 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging; this sets up the loggers.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online() -> None:
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section, {}),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
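
With target_metadata left as None, autogenerate produces empty revisions. A minimal sketch of how env.py could be wired to this project's declarative Base for autogenerate support (assuming the app package is importable when Alembic runs):

# Hypothetical adjustment to backend/alembic/env.py for --autogenerate support.
# Importing app.models registers every model class on the shared Base.
from app import models  # noqa: F401
from app.core.db import Base

target_metadata = Base.metadata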

@@ -0,0 +1,28 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}
def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}

backend/app/__init__.py (new file, empty)

@@ -0,0 +1,123 @@
import logging
import random
from typing import List
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app import crud, models, schemas # Import local schemas and crud
from app.core.db import get_db # Import DB dependency
from app.services.processing_service import ProcessingService # Import services
from app.workers.tasks import run_pipeline_task # Import Celery task
logger = logging.getLogger(__name__)
router = APIRouter()
# --- API Endpoint Definitions ---
@router.post("/", response_model=schemas.PipelineRead, status_code=201)
async def create_pipeline(
pipeline_in: schemas.PipelineCreate,
db: AsyncSession = Depends(get_db),
):
"""
DUMMY: Creates a new pipeline configuration.
"""
logger.info("Endpoint: create_pipeline called")
# In real implementation: Add checks, call services if needed.
# Here, directly call (dummy) CRUD.
created_pipeline = await crud.pipeline.create(db=db, obj_in=pipeline_in)
# No need to check for existence as dummy create always returns something
return created_pipeline
@router.get("/", response_model=List[schemas.PipelineRead])
async def read_pipelines(db: AsyncSession = Depends(get_db), skip: int = 0, limit: int = Query(100, le=200)):
"""
DUMMY: Retrieves a list of pipelines.
"""
logger.info("Endpoint: read_pipelines called")
# Call (dummy) CRUD
pipelines = await crud.pipeline.get_multi(db, skip=skip, limit=limit)
return pipelines
@router.get("/{pipeline_id}", response_model=schemas.PipelineReadWithDetails)
async def read_pipeline(pipeline_id: int, db: AsyncSession = Depends(get_db)):
"""
DUMMY: Retrieves details for a specific pipeline, including sources and recent runs.
"""
logger.info(f"Endpoint: read_pipeline called for id={pipeline_id}")
# Call (dummy) CRUD that includes related data loading simulation
db_pipeline = await crud.pipeline.get_with_details(db, id=pipeline_id)
if db_pipeline is None:
# Raise standard FastAPI exception for not found
raise HTTPException(status_code=404, detail="Pipeline not found (simulated)")
return db_pipeline
@router.put("/{pipeline_id}", response_model=schemas.PipelineRead)
async def update_pipeline(
pipeline_id: int,
pipeline_in: schemas.PipelineUpdate,
db: AsyncSession = Depends(get_db),
):
"""
DUMMY: Updates an existing pipeline configuration.
"""
logger.info(f"Endpoint: update_pipeline called for id={pipeline_id}")
# First, get the existing object (dummy)
db_pipeline = await crud.pipeline.get(db, id=pipeline_id)
if not db_pipeline:
raise HTTPException(status_code=404, detail="Pipeline not found (simulated)")
# Call (dummy) CRUD update
updated_pipeline = await crud.pipeline.update(db=db, db_obj=db_pipeline, obj_in=pipeline_in)
return updated_pipeline
@router.delete("/{pipeline_id}", response_model=schemas.PipelineRead)
async def delete_pipeline(pipeline_id: int, db: AsyncSession = Depends(get_db)):
"""
DUMMY: Deletes a pipeline configuration.
Returns the deleted object representation.
"""
logger.info(f"Endpoint: delete_pipeline called for id={pipeline_id}")
# Call (dummy) CRUD remove
deleted_pipeline = await crud.pipeline.remove(db=db, id=pipeline_id)
if deleted_pipeline is None:
raise HTTPException(status_code=404, detail="Pipeline not found (simulated)")
return deleted_pipeline # Return the object that was 'deleted'
@router.post("/{pipeline_id}/run", status_code=202, response_model=dict)
async def trigger_pipeline_run(pipeline_id: int, db: AsyncSession = Depends(get_db)):
"""
DUMMY: Simulates triggering an asynchronous pipeline run via Celery.
"""
logger.info(f"Endpoint: trigger_pipeline_run called for id={pipeline_id}")
# Check pipeline status using dummy CRUD
db_pipeline = await crud.pipeline.get(db, id=pipeline_id)
if not db_pipeline:
raise HTTPException(status_code=404, detail="Pipeline not found (simulated)")
if db_pipeline.status == schemas.PipelineStatus.PAUSED:
raise HTTPException(status_code=400, detail="Pipeline is paused (simulated)")
if db_pipeline.status == schemas.PipelineStatus.RUNNING:
raise HTTPException(status_code=409, detail="Pipeline is already running (simulated)")
# Simulate scheduling the Celery task
logger.info(f"Endpoint: Simulating run_pipeline_task.delay({pipeline_id})")
# In real code: task = run_pipeline_task.delay(pipeline_id)
# task_id = task.id
# For dummy:
dummy_task_id = f"dummy-celery-task-{random.randint(10000, 99999)}"
logger.info(f"Endpoint: Simulated task scheduling, got dummy task ID: {dummy_task_id}")
# Optionally update pipeline status immediately (using dummy crud)
# await crud.pipeline.update_pipeline_status(db, pipeline_id=pipeline_id, status=schemas.PipelineStatus.RUNNING)
return {"message": "Pipeline run simulated successfully", "job_id": dummy_task_id}
# --- Add dummy endpoints for pause/resume if needed, similar to trigger_pipeline_run ---
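
For reference, the commented-out "In real code" path in trigger_pipeline_run would dispatch the imported Celery task instead of fabricating an ID. A sketch of that replacement, assuming a running broker and worker:

# Hypothetical non-dummy dispatch inside trigger_pipeline_run, replacing the
# dummy_task_id block above.
task = run_pipeline_task.delay(pipeline_id)  # enqueue the orchestrator task on Celery
logger.info(f"Scheduled run_pipeline_task for pipeline {pipeline_id}, task ID: {task.id}")
return {"message": "Pipeline run scheduled", "job_id": task.id}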

backend/app/config.py (new file)
@@ -0,0 +1,34 @@
from pydantic_settings import BaseSettings
from pathlib import Path
# Define a base directory for uploads, ensure it exists
UPLOAD_DIR = Path("./uploads")
UPLOAD_DIR.mkdir(exist_ok=True)
class Settings(BaseSettings):
PROJECT_NAME: str = "Data Integration Pipeline API"
API_V1_STR: str = "/api/v1"
DATABASE_URL: str = "postgresql+asyncpg://user:password@db/data_pipeline_db"
CELERY_BROKER_URL: str = "redis://redis:6379/0"
CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
OPENAI_API_KEY: str = "your_openai_key_here" # Replace in .env or secrets manager
NEWS_API_KEY: str | None = None # Replace if using Bing etc.
# Example Thai RSS feeds - load from config file or DB ideally
NEWS_SOURCES_RSS: list[str] = [
"https://www.bangkokpost.com/rss/data/most-recent.xml",
"https://www.nationthailand.com/rss/feed.xml",
]
UPLOAD_DIR: Path = UPLOAD_DIR # Make upload dir accessible via settings
class Config:
env_file = ".env"
case_sensitive = True
extra = "ignore"
settings = Settings()

@@ -0,0 +1,42 @@
import logging
from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
logger = logging.getLogger(__name__)
# Define a base directory for uploads relative to this config file's location
# Recommended: Define UPLOAD_DIR based on an environment variable or absolute path in production
_BASE_DIR = Path(__file__).resolve().parent.parent.parent # Moves up from core -> app -> backend
UPLOAD_DIR_DEFAULT = _BASE_DIR / "uploads"
class Settings(BaseSettings):
"""Application configuration settings."""
PROJECT_NAME: str = "Borbann Backend API"
API_V1_STR: str = "/api/v1"
LOG_LEVEL: str = "INFO"
# Database configuration (sensitive, use secrets management in production)
DATABASE_URL: str = "postgresql+asyncpg://user:password@db:5432/borbann_db"
# Celery configuration (sensitive, use secrets management in production)
CELERY_BROKER_URL: str = "redis://redis:6379/0"
CELERY_RESULT_BACKEND: str = "redis://redis:6379/1" # Use different DB for results
# Example external API key (sensitive)
# SOME_EXTERNAL_API_KEY: str | None = None
# File Uploads
UPLOAD_DIR: Path = UPLOAD_DIR_DEFAULT
model_config = SettingsConfigDict(
env_file=".env", env_file_encoding="utf-8", extra="ignore", case_sensitive=True
)
settings = Settings()
# Ensure upload directory exists (can be done on startup as well)
try:
settings.UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
except OSError as e:
logger.error(f"Could not create upload directory: {settings.UPLOAD_DIR} - {e}")

backend/app/core/db.py (new file)
@@ -0,0 +1,66 @@
import logging
from typing import AsyncGenerator
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
from sqlalchemy.orm import declarative_base
from app.core.config import settings
logger = logging.getLogger(__name__)
# Define a base for declarative models
Base = declarative_base()
# Create the async engine
try:
engine = create_async_engine(
settings.DATABASE_URL,
pool_pre_ping=True,
# echo=True, # Uncomment for debugging SQL statements
)
logger.info("Database engine created successfully.")
except Exception as e:
logger.error(f"Failed to create database engine: {e}", exc_info=True)
# Depending on the application, you might want to exit here
# sys.exit(1)
engine = None # Ensure engine is None if creation failed
# Create a sessionmaker
if engine:
AsyncSessionFactory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
else:
AsyncSessionFactory = None # No factory if engine failed
async def get_db() -> AsyncGenerator[AsyncSession, None]:
"""FastAPI dependency to get an async database session."""
if not AsyncSessionFactory:
logger.error("Database session factory not configured.")
raise RuntimeError("Database not configured.")
async with AsyncSessionFactory() as session:
# Optional: Start transaction (though commit/rollback might happen elsewhere)
# async with session.begin():
try:
yield session
# If not using 'async with session.begin()', you might commit here
# await session.commit()
except Exception:
logger.exception("Session rollback due to exception")
await session.rollback()
raise
finally:
# Close is handled by the context manager 'async with AsyncSessionFactory()'
pass
async def check_db_connection() -> bool:
"""Optional function to check DB connection on startup."""
if not engine:
return False
try:
async with engine.connect() as connection:
# You can execute a simple query like "SELECT 1" if needed
logger.info("Database connection verified.")
return True
except Exception as e:
logger.error(f"Database connection failed: {e}", exc_info=True)
return False
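
The check above only opens a connection; if a real round-trip is wanted, the body could run the "SELECT 1" the comment mentions. A sketch using SQLAlchemy's text() construct:

# Hypothetical variant of the check_db_connection body that round-trips a query.
from sqlalchemy import text

async with engine.connect() as connection:
    await connection.execute(text("SELECT 1"))
    logger.info("Database connection verified with SELECT 1.")
    return True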

@@ -0,0 +1,3 @@
# Make CRUD functions easily accessible
from .crud_pipeline import pipeline, data_source, pipeline_run
from .crud_news_article import news_article

@@ -0,0 +1,122 @@
import logging
import random
from datetime import datetime, timedelta
from typing import List, Optional, Sequence
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app import models, schemas
logger = logging.getLogger(__name__)
class CRUDPipeline:
"""CRUD operations for Pipeline models (Dummy Implementation)."""
async def get(self, db: AsyncSession, id: int) -> Optional[models.Pipeline]:
"""DUMMY: Get a Pipeline by ID."""
logger.info(f"DUMMY CRUD: Simulating get Pipeline with id={id}")
# In real code: result = await db.execute(select(models.Pipeline).filter(models.Pipeline.id == id))
# return result.scalars().first()
if id == 999: # Simulate not found
return None
# Return a dummy model instance
return models.Pipeline(id=id, name=f"Dummy Pipeline {id}", status=schemas.PipelineStatus.IDLE)
async def get_multi(self, db: AsyncSession, skip: int = 0, limit: int = 100) -> Sequence[models.Pipeline]:
"""DUMMY: Get multiple Pipelines."""
logger.info(f"DUMMY CRUD: Simulating get_multi Pipeline skip={skip}, limit={limit}")
# In real code: result = await db.execute(select(models.Pipeline).offset(skip).limit(limit))
# return result.scalars().all()
return [
models.Pipeline(id=i, name=f"Dummy Pipeline {i}", status=schemas.PipelineStatus.IDLE)
for i in range(skip + 1, skip + limit + 1)
]
async def get_with_details(self, db: AsyncSession, id: int) -> Optional[models.Pipeline]:
"""DUMMY: Get a Pipeline with related sources and runs."""
logger.info(f"DUMMY CRUD: Simulating get_with_details Pipeline id={id}")
# In real code: Use eager loading
# stmt = select(models.Pipeline).options(
# selectinload(models.Pipeline.data_sources),
# selectinload(models.Pipeline.runs).order_by(models.PipelineRun.started_at.desc()).limit(5) # Example limit
# ).filter(models.Pipeline.id == id)
# result = await db.execute(stmt)
# return result.scalars().first()
pipeline = await self.get(db, id)
if pipeline:
pipeline.data_sources = [
models.DataSource(
id=id * 10 + 1,
pipeline_id=id,
type=schemas.DataSourceType.URL,
config={"url": "http://dummy.example.com"},
name="Dummy URL",
),
models.DataSource(
id=id * 10 + 2,
pipeline_id=id,
type=schemas.DataSourceType.API,
config={"url": "http://dummy.api/data", "method": "GET"},
name="Dummy API",
),
]
pipeline.runs = [
models.PipelineRun(
id=id * 100 + 1,
pipeline_id=id,
status=schemas.PipelineStatus.COMPLETED,
started_at=datetime.utcnow(),
),
models.PipelineRun(
id=id * 100 + 2,
pipeline_id=id,
status=schemas.PipelineStatus.FAILED,
started_at=datetime.utcnow() - timedelta(hours=1),
),
]
return pipeline
async def create(self, db: AsyncSession, *, obj_in: schemas.PipelineCreate) -> models.Pipeline:
"""DUMMY: Create a Pipeline."""
logger.info(f"DUMMY CRUD: Simulating create Pipeline with data: {obj_in.model_dump()}")
# In real code: db_obj = models.Pipeline(**obj_in.model_dump())
# db.add(db_obj); await db.flush(); await db.refresh(db_obj)
new_id = random.randint(100, 1000)
db_obj = models.Pipeline(id=new_id, status=schemas.PipelineStatus.IDLE, **obj_in.model_dump())
logger.info(f"DUMMY CRUD: Simulated creation with id={new_id}")
return db_obj
async def update(
self, db: AsyncSession, *, db_obj: models.Pipeline, obj_in: schemas.PipelineUpdate | dict
) -> models.Pipeline:
"""DUMMY: Update a Pipeline."""
if isinstance(obj_in, dict):
update_data = obj_in
else:
update_data = obj_in.model_dump(exclude_unset=True)
logger.info(f"DUMMY CRUD: Simulating update Pipeline id={db_obj.id} with data: {update_data}")
# In real code: update object fields, add, flush, refresh
for field, value in update_data.items():
if value is not None: # Apply updates
setattr(db_obj, field, value)
db_obj.updated_at = datetime.utcnow() # Simulate timestamp update
return db_obj
async def remove(self, db: AsyncSession, *, id: int) -> Optional[models.Pipeline]:
"""DUMMY: Remove a Pipeline."""
logger.info(f"DUMMY CRUD: Simulating remove Pipeline id={id}")
obj = await self.get(db=db, id=id)
if obj:
logger.info(f"DUMMY CRUD: Found pipeline {id} to remove.")
# In real code: await db.delete(obj); await db.flush()
return obj # Return the simulated object to be deleted
logger.warning(f"DUMMY CRUD: Pipeline {id} not found for removal.")
return None
# Instantiate CRUD objects for pipelines, datasources, runs etc.
pipeline = CRUDPipeline()
# ... add dummy crud_datasource and crud_pipelinerun similar to above
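
crud/__init__.py and the Celery tasks below also expect data_source and pipeline_run objects, which the placeholder comment above leaves out. A minimal sketch of those dummy classes in the same style (method names follow the call sites in the worker tasks; fields and defaults are assumptions):

# Hypothetical continuation of crud_pipeline.py, mirroring CRUDPipeline above.
class CRUDDataSource:
    """CRUD operations for DataSource models (Dummy Implementation)."""

    async def get(self, db: AsyncSession, id: int) -> Optional[models.DataSource]:
        logger.info(f"DUMMY CRUD: Simulating get DataSource with id={id}")
        return models.DataSource(
            id=id, pipeline_id=1, type=schemas.DataSourceType.URL,
            config={"url": "http://dummy.example.com"}, name=f"Dummy Source {id}",
        )

class CRUDPipelineRun:
    """CRUD operations for PipelineRun models (Dummy Implementation)."""

    async def create(
        self, db: AsyncSession, *, pipeline_id: int, celery_task_id: str | None = None,
        status: schemas.PipelineStatus = schemas.PipelineStatus.RUNNING,
    ) -> models.PipelineRun:
        logger.info(f"DUMMY CRUD: Simulating create PipelineRun for pipeline_id={pipeline_id}")
        return models.PipelineRun(
            id=random.randint(1000, 9999), pipeline_id=pipeline_id,
            celery_task_id=celery_task_id, status=status, started_at=datetime.utcnow(),
        )

    async def update_run_status(
        self, db: AsyncSession, *, run_id: int, status: schemas.PipelineStatus,
        output_location: str | None = None, run_log: str | None = None,
    ) -> None:
        logger.info(f"DUMMY CRUD: Simulating update PipelineRun {run_id} to status={status}")

data_source = CRUDDataSource()
pipeline_run = CRUDPipelineRun()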

backend/app/main.py (new file)
@@ -0,0 +1,100 @@
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from app.core.config import settings
from app.api.v1.endpoints import api_router
from app.core.db import check_db_connection # Optional DB check
# --- Logging Configuration ---
# Basic config, consider more advanced setup (JSON, handlers) for production
logging.basicConfig(level=settings.LOG_LEVEL.upper(), format='%(levelname)s: %(name)s - %(message)s')
logger = logging.getLogger(__name__)
# --- Lifespan Management ---
@asynccontextmanager
async def lifespan(app: FastAPI):
# Startup logic
logger.info("Application startup...")
# Example: Check DB connection (using dummy core.db function)
# if not await check_db_connection():
# logger.critical("Database connection failed on startup. Check config/connections.")
# Decide if this is fatal. In dummy mode, we probably continue.
# sys.exit(1) # Or raise RuntimeError
# Example: Placeholder for loading ML models, external resources, etc.
# app.state.ml_model = load_my_model()
logger.info("Dummy startup tasks complete.")
yield # Application runs here
# Shutdown logic
logger.info("Application shutdown...")
# Example: Clean up resources
# if hasattr(app.state, 'ml_model'):
# app.state.ml_model.cleanup()
# Optional: Dispose DB engine explicitly if needed (often handled by context managers)
# from app.core.db import engine
# if engine: await engine.dispose()
logger.info("Dummy shutdown tasks complete.")
# --- FastAPI Application Instance ---
app = FastAPI(
title=settings.PROJECT_NAME,
openapi_url=f"{settings.API_V1_STR}/openapi.json",
version="0.1.0", # Example version
description="Dummy API for Borbann Data Pipeline",
lifespan=lifespan,
)
# --- Global Exception Handler Example ---
@app.exception_handler(Exception)
async def generic_exception_handler(request: Request, exc: Exception):
# Log the full error internally
logger.error(f"Unhandled exception for request {request.url}: {exc}", exc_info=True)
# Return a generic error response to the client
return JSONResponse(
status_code=500,
content={"detail": "An internal server error occurred."},
)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
# Default handler for FastAPI's own HTTPExceptions
# You might want to log these as well, depending on the status code
logger.warning(f"HTTP Exception: Status={exc.status_code}, Detail={exc.detail}")
return JSONResponse(
status_code=exc.status_code,
content={"detail": exc.detail},
headers=exc.headers,
)
# --- Mount API Router ---
app.include_router(api_router, prefix=settings.API_V1_STR)
# --- Root Endpoint ---
@app.get("/", tags=["Root"], summary="Root endpoint")
async def read_root():
"""Simple root endpoint providing basic info."""
return {"message": f"Welcome to {settings.PROJECT_NAME} (Dummy Version)"}
# --- Middleware (Example: CORS) ---
# from fastapi.middleware.cors import CORSMiddleware
# origins = [
# "http://localhost:3000", # Allow frontend dev server
# # Add production frontend URL here
# ]
# app.add_middleware(
# CORSMiddleware,
# allow_origins=origins,
# allow_credentials=True,
# allow_methods=["*"],
# allow_headers=["*"],
# )

@@ -0,0 +1,8 @@
from app.core.db import Base
# Import all models here to ensure they are registered with Base
from .pipeline import Pipeline, DataSource, PipelineRun, PipelineRunResult
from .news_article import NewsArticle
# You can optionally define __all__ if needed
# __all__ = ["Base", "Pipeline", "DataSource", "NewsArticle", "PipelineRun", "PipelineRunResult"]

@@ -0,0 +1,82 @@
import enum
from datetime import datetime
from sqlalchemy import Column, Integer, String, DateTime, Enum, ForeignKey, Text
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql import func
from app.core.db import Base # Import Base from your core db setup
# Define Enums directly here or import from schemas if preferred
# If defined here, ensure schemas.py uses these or compatible definitions
class PipelineStatusEnum(str, enum.Enum):
IDLE = "idle"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
PAUSED = "paused"
class DataSourceTypeEnum(str, enum.Enum):
URL = "url"
API = "api"
FILE = "file"
class Pipeline(Base):
__tablename__ = "pipelines"
id = Column(Integer, primary_key=True, index=True)
name = Column(String, index=True, nullable=False)
description = Column(String, nullable=True)
status = Column(Enum(PipelineStatusEnum), default=PipelineStatusEnum.IDLE, nullable=False)
schedule = Column(String, nullable=True, comment="Cron-like schedule format")
configuration = Column(
JSONB, nullable=True, default=dict, comment="Pipeline-specific config, e.g., processing rules"
)
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
updated_at = Column(DateTime(timezone=True), onupdate=func.now())
# Relationships
data_sources = relationship("DataSource", back_populates="pipeline", cascade="all, delete-orphan", lazy="selectin")
runs = relationship("PipelineRun", back_populates="pipeline", cascade="all, delete-orphan", lazy="selectin")
def __repr__(self) -> str:
return f"<Pipeline(id={self.id}, name='{self.name}')>"
class DataSource(Base):
__tablename__ = "data_sources"
id = Column(Integer, primary_key=True, index=True)
pipeline_id = Column(Integer, ForeignKey("pipelines.id"), nullable=False)
type = Column(Enum(DataSourceTypeEnum), nullable=False)
name = Column(String, nullable=True, comment="User-friendly name for the source")
config = Column(JSONB, nullable=False, comment="Source-specific config (url, api details, file path/info)")
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
updated_at = Column(DateTime(timezone=True), onupdate=func.now())
# Relationship
pipeline = relationship("Pipeline", back_populates="data_sources")
def __repr__(self) -> str:
return f"<DataSource(id={self.id}, type='{self.type}', pipeline_id={self.pipeline_id})>"
class PipelineRun(Base):
__tablename__ = "pipeline_runs"
id = Column(Integer, primary_key=True, index=True)
pipeline_id = Column(Integer, ForeignKey("pipelines.id"), nullable=False)
celery_task_id = Column(String, nullable=True, index=True, comment="Celery task ID for the main pipeline run")
status = Column(Enum(PipelineStatusEnum), default=PipelineStatusEnum.RUNNING, nullable=False)
started_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
finished_at = Column(DateTime(timezone=True), nullable=True)
output_location = Column(String, nullable=True, comment="Path to results file or data store reference")
run_log = Column(Text, nullable=True, comment="Execution logs or error details")
# Relationship
pipeline = relationship("Pipeline", back_populates="runs")
def __repr__(self) -> str:
return f"<PipelineRun(id={self.id}, status='{self.status}', pipeline_id={self.pipeline_id})>"

@@ -0,0 +1,18 @@
# Import schemas for easier access
from .pipeline import (
PipelineCreate,
PipelineUpdate,
PipelineRead,
PipelineReadWithDetails,
PipelineRunRead,
DataSourceCreate,
DataSourceUpdate,
DataSourceRead,
DataSourceType,
PipelineStatus,
)
from .news_article import NewsArticleCreate, NewsArticleRead
from .job import JobStatus
# Define __all__ for clarity if desired
# __all__ = [...]

@@ -0,0 +1,120 @@
import enum
from datetime import datetime
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field, HttpUrl, model_validator, ConfigDict
# Re-export enums from models or define compatible ones here
# Importing avoids definition duplication but creates dependency. Define here for clarity:
class PipelineStatus(str, enum.Enum):
IDLE = "idle"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
PAUSED = "paused"
class DataSourceType(str, enum.Enum):
URL = "url"
API = "api"
FILE = "file"
# --- Data Source Schemas ---
class DataSourceBase(BaseModel):
type: DataSourceType
name: Optional[str] = Field(None, max_length=255)
config: Dict[str, Any] = Field(..., description="Source-specific config")
# Example validator based on type
@model_validator(mode='after')
def check_config_based_on_type(self) -> 'DataSourceBase':
config = self.config
type = self.type
if type == DataSourceType.URL:
if not config or 'url' not in config:
raise ValueError("URL config must contain 'url' key")
# Could add URL validation here
elif type == DataSourceType.API:
if not config or 'url' not in config or 'method' not in config:
raise ValueError("API config must contain 'url' and 'method' keys")
elif type == DataSourceType.FILE:
if not config or 'file_path' not in config:
raise ValueError("File config must contain 'file_path' key")
return self
class DataSourceCreate(DataSourceBase):
pipeline_id: int # Must be provided when creating standalone
class DataSourceUpdate(DataSourceBase):
# Make all fields optional for update
type: Optional[DataSourceType] = None
config: Optional[Dict[str, Any]] = None
@model_validator(mode='before')
def prevent_type_change(cls, values: Dict[str, Any]) -> Dict[str, Any]:
# Example: Prevent changing the type during update
if 'type' in values and values['type'] is not None:
# In a real scenario, you'd compare against the existing db_obj type
# Here, we just disallow setting it in the update payload if not None
# raise ValueError("Changing data source type is not allowed.")
pass # Allow for dummy code, but keep validator structure
return values
class DataSourceRead(DataSourceBase):
id: int
pipeline_id: int
created_at: datetime
updated_at: Optional[datetime] = None
model_config = ConfigDict(from_attributes=True)
# --- Pipeline Run Schemas ---
class PipelineRunRead(BaseModel):
id: int
pipeline_id: int
celery_task_id: Optional[str] = None
status: PipelineStatus
started_at: datetime
finished_at: Optional[datetime] = None
output_location: Optional[str] = None
run_log: Optional[str] = Field(None, description="Execution logs, truncated if long")
model_config = ConfigDict(from_attributes=True)
# --- Pipeline Schemas ---
class PipelineBase(BaseModel):
name: str = Field(..., min_length=3, max_length=100)
description: Optional[str] = None
schedule: Optional[str] = Field(None, description="Cron-like schedule format, e.g., '0 * * * *'")
configuration: Optional[Dict[str, Any]] = Field(None, description="Pipeline-wide config")
class PipelineCreate(PipelineBase):
pass # Inherits all fields
class PipelineUpdate(BaseModel):
# Make all fields optional for update
name: Optional[str] = Field(None, min_length=3, max_length=100)
description: Optional[str] = None
schedule: Optional[str] = None
status: Optional[PipelineStatus] = None # Allow pausing/resuming via update
configuration: Optional[Dict[str, Any]] = None
class PipelineRead(PipelineBase):
id: int
status: PipelineStatus
# last_run_at: Optional[datetime] = None # Needs calculation/join, omit for simplicity
created_at: datetime
updated_at: Optional[datetime] = None
model_config = ConfigDict(from_attributes=True)
class PipelineReadWithDetails(PipelineRead):
data_sources: List[DataSourceRead] = []
runs: List[PipelineRunRead] = Field([], description="Most recent runs")
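
As a quick illustration of the config validator above: a URL source whose config lacks a "url" key is rejected at construction time. A sketch using the package-level schemas import seen in the endpoints (pydantic's ValidationError is a ValueError subclass):

# Hypothetical usage of the DataSource schemas defined above.
from app import schemas

ok = schemas.DataSourceCreate(
    pipeline_id=1, type=schemas.DataSourceType.URL, config={"url": "https://example.com"}
)
try:
    schemas.DataSourceCreate(pipeline_id=1, type=schemas.DataSourceType.URL, config={})
except ValueError as exc:
    print(f"Rejected as expected: {exc}")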

@@ -0,0 +1,124 @@
from datetime import datetime
import logging
from typing import Any, Dict, List, Optional
import pandas as pd
import asyncio
import random
from sqlalchemy.ext.asyncio import AsyncSession
from app import crud, models, schemas # Import necessary types
logger = logging.getLogger(__name__)
class ProcessingError(Exception):
"""Custom exception for processing errors."""
pass
class ProcessingService:
"""Service layer for handling data processing logic within a pipeline run."""
def __init__(self, db_session: Optional[AsyncSession] = None):
# Allow injecting a session for testing or specific use cases,
# though services typically do not touch the DB session directly;
# they call CRUD functions instead.
pass
async def process_pipeline_results(
self, raw_results: List[List[Dict] | Dict | Exception], pipeline_config: Dict[str, Any]
) -> pd.DataFrame:
"""
DUMMY: Orchestrates the processing of raw results from source tasks.
Args:
raw_results: A list containing results (dicts or lists of dicts)
or Exceptions from individual source tasks.
pipeline_config: Configuration for the pipeline affecting processing.
Returns:
A processed Pandas DataFrame.
Raises:
ProcessingError: If a critical processing step fails.
"""
logger.info("DUMMY Service: Starting process_pipeline_results simulation")
# 1. Aggregate valid data (handle errors from tasks)
valid_data = []
errors_encountered = 0
for i, result in enumerate(raw_results):
if isinstance(result, Exception):
errors_encountered += 1
logger.warning(f"Task {i} resulted in error: {result}")
# Decide if errors should halt processing or just be logged
elif isinstance(result, list):
valid_data.extend(result)
elif isinstance(result, dict) and 'error' not in result:
valid_data.append(result)
# Ignore 'error' dicts or None results silently for this dummy example
if not valid_data and errors_encountered > 0:
raise ProcessingError("No valid data received, only errors from source tasks.")
if not valid_data:
logger.warning("No valid data found to process.")
return pd.DataFrame() # Return empty DataFrame
logger.info(f"Aggregated {len(valid_data)} raw records. Encountered {errors_encountered} errors.")
# 2. Convert to DataFrame (simulate potential error)
try:
df = pd.DataFrame(valid_data).fillna("<dummy_missing>") # Handle potential missing keys
logger.info(f"Created initial DataFrame with shape: {df.shape}")
except Exception as e:
logger.error(f"Error creating DataFrame: {e}", exc_info=True)
raise ProcessingError("Failed to create DataFrame from aggregated results.") from e
# 3. Simulate Processing Steps (using dummy functions)
await asyncio.sleep(random.uniform(0.01, 0.05)) # Simulate time
df = self._dummy_normalize(df, pipeline_config)
await asyncio.sleep(random.uniform(0.01, 0.05))
df = self._dummy_deduplicate(df, pipeline_config)
await asyncio.sleep(random.uniform(0.01, 0.05))
df = self._dummy_transform(df, pipeline_config)
logger.info(f"DUMMY Service: Finished processing. Final DataFrame shape: {df.shape}")
return df
def _dummy_normalize(self, df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
"""DUMMY: Simulate schema normalization."""
logger.debug(f"Simulating schema normalization on shape {df.shape}")
# Rename 'api_field' if it exists
if 'api_field' in df.columns:
df = df.rename(columns={'api_field': 'normalized_field'})
return df.copy()
def _dummy_deduplicate(self, df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
"""DUMMY: Simulate deduplication."""
logger.debug(f"Simulating deduplication on shape {df.shape}")
if df.empty:
return df
# Deduplicate based on a dummy 'source_url' or 'url' if present
key = 'source_url' if 'source_url' in df.columns else ('url' if 'url' in df.columns else None)
if key:
initial_count = len(df)
df_dedup = df.drop_duplicates(subset=[key], keep='first')
logger.debug(f"Dropped {initial_count - len(df_dedup)} duplicates based on '{key}'")
return df_dedup
return df.drop_duplicates(keep='first') # Fallback
def _dummy_transform(self, df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
"""DUMMY: Simulate data transformation."""
logger.debug(f"Simulating transformations on shape {df.shape}")
# Add a dummy derived column if possible
if 'value' in df.columns:
# Ensure 'value' is numeric, coercing errors to NaN, then fillna
df['value'] = pd.to_numeric(df['value'], errors='coerce').fillna(0)
df['transformed_value'] = df['value'] * random.uniform(1.1, 1.5)
df['processing_timestamp'] = datetime.utcnow().isoformat()
return df
# Instantiate the service for potential use elsewhere (dependency injection)
processing_service = ProcessingService()
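
A small, hypothetical driver for the dummy service above, showing how mixed source results (lists, dicts, and an exception) flow through aggregation, deduplication, and transformation:

# Hypothetical quick check of process_pipeline_results with mixed task results.
import asyncio
from app.services.processing_service import processing_service

raw_results = [
    [{"url": "http://a.example", "value": "1"}],    # list result from one source task
    {"url": "http://a.example", "value": "2"},      # duplicate URL, dropped by _dummy_deduplicate
    RuntimeError("simulated source task failure"),  # counted as an error, not fatal
]
df = asyncio.run(processing_service.process_pipeline_results(raw_results, {}))
print(df.shape)  # e.g. (1, 4): url, value, transformed_value, processing_timestamp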

@@ -0,0 +1,214 @@
import asyncio
from datetime import datetime
import logging
import random
import time
from celery import chord, group, shared_task
from app import crud, models, schemas # Keep imports for structure
from app.core.db import AsyncSessionFactory # Use session factory directly in tasks
from app.services.processing_service import ProcessingService # Import dummy service
logger = logging.getLogger(__name__)
# --- Helper to run async code from sync Celery tasks ---
def async_to_sync(awaitable):
"""Runs an awaitable in a new event loop."""
return asyncio.run(awaitable)
# --- Dummy Sub-Tasks ---
@shared_task(bind=True, max_retries=1, default_retry_delay=5)
def dummy_source_task(self, source_id: int, source_type: str):
"""DUMMY: Simulates processing any data source type."""
task_id = self.request.id
logger.info(f"DUMMY TASK dummy_source_task[ID:{task_id}]: Start DS:{source_id} Type:{source_type}")
await_time = random.uniform(0.05, 0.2)
time.sleep(await_time) # Simulate work
# Simulate occasional failure
if random.random() < 0.08:
error_msg = f"Simulated failure processing source {source_id}"
logger.warning(f"DUMMY TASK dummy_source_task[ID:{task_id}]: {error_msg}")
raise ValueError(error_msg) # Raise exception for Celery retry/failure
# Simulate successful result (list of dicts)
num_records = random.randint(1, 3)
result = [{f"data_{source_id}_{i}": random.random(), "source_type": source_type} for i in range(num_records)]
logger.info(f"DUMMY TASK dummy_source_task[ID:{task_id}]: Finish DS:{source_id}, generated {num_records} records.")
return result
# --- Dummy Aggregation Task (Callback) ---
@shared_task(bind=True)
def dummy_aggregate_task(self, results: list, pipeline_id: int, run_id: int):
"""DUMMY: Simulates aggregating results and saving."""
task_id = self.request.id
logger.info(
f"DUMMY TASK dummy_aggregate_task[ID:{task_id}]: Start Aggregation for RunID:{run_id}, PipelineID:{pipeline_id}. Received {len(results)} results."
)
log_messages = [f"Aggregation simulation started at {datetime.utcnow()}"]
final_status = schemas.PipelineStatus.COMPLETED
output_location = None
errors_encountered = sum(1 for r in results if isinstance(r, Exception))
# Instantiate dummy service
service = ProcessingService()
async def process_and_save():
nonlocal output_location, final_status # Allow modification
try:
# Call dummy processing service
processed_df = await service.process_pipeline_results(results, {"dummy_pipeline_cfg": True})
if not processed_df.empty:
# Simulate saving (no actual file handler needed here for dummy)
await asyncio.sleep(0.1) # Simulate save time
output_location = f"dummy_outputs/run_{run_id}_output_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}.csv"
log_messages.append(f"Simulated saving results to {output_location}, shape: {processed_df.shape}")
logger.info(f"DUMMY AGGREGATION: Simulated save complete to {output_location}")
else:
log_messages.append("No data processed after aggregation/filtering.")
# Keep COMPLETED status if no errors, otherwise FAILED was set below
except Exception as e:
logger.error(f"DUMMY AGGREGATION: Error during dummy processing: {e}", exc_info=True)
log_messages.append(f"ERROR during processing: {e}")
final_status = schemas.PipelineStatus.FAILED
output_location = None
if errors_encountered > 0 and final_status != schemas.PipelineStatus.FAILED:
log_messages.append("Pipeline simulation completed with source task errors.")
# Optional: Set a specific status like COMPLETED_WITH_ERRORS if needed
elif errors_encountered > 0 and output_location is None:
final_status = schemas.PipelineStatus.FAILED # Fail if there were errors and no output was produced
# Simulate DB Update
final_log = "\n".join(log_messages)
logger.info(f"DUMMY AGGREGATION: Simulating final DB update for RunID:{run_id} to status {final_status}")
if AsyncSessionFactory: # Check if DB is configured
async with AsyncSessionFactory() as session:
try:
# Call dummy CRUD functions
await crud.pipeline_run.update_run_status(
db=session,
run_id=run_id,
status=final_status,
output_location=output_location,
run_log=final_log,
)
await crud.pipeline.update( # Use generic update for status
db=session,
db_obj=models.Pipeline(id=pipeline_id), # Need a dummy obj for update
obj_in={"status": schemas.PipelineStatus.IDLE},
)
logger.info(f"DUMMY AGGREGATION: DB update simulation successful for RunID:{run_id}.")
except Exception as db_exc:
logger.error(
f"DUMMY AGGREGATION: Failed DB update simulation for RunID:{run_id}: {db_exc}", exc_info=True
)
else:
logger.warning("DUMMY AGGREGATION: Skipping DB update simulation as DB is not configured.")
async_to_sync(process_and_save())
logger.info(f"DUMMY TASK dummy_aggregate_task[ID:{task_id}]: Finish Aggregation Simulation for RunID:{run_id}")
# --- Dummy Pipeline Orchestrator Task ---
@shared_task(bind=True)
def run_pipeline_task(self, pipeline_id: int):
"""DUMMY: Simulates fetching pipeline details and scheduling sub-tasks."""
task_id = self.request.id
logger.info(
f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Start Orchestration Simulation for PipelineID:{pipeline_id}"
)
run_id = None
async def setup_and_dispatch():
nonlocal run_id
if not AsyncSessionFactory:
logger.error("Cannot simulate pipeline run: Database not configured.")
return None, "Database not configured"
async with AsyncSessionFactory() as session:
# 1. Get Pipeline (dummy)
pipeline = await crud.pipeline.get_with_details(session, id=pipeline_id)
if not pipeline:
logger.error(f"Pipeline {pipeline_id} not found (simulated).")
return None, "Pipeline not found"
if pipeline.status != schemas.PipelineStatus.IDLE:
logger.warning(f"Pipeline {pipeline_id} not idle (status: {pipeline.status}), skipping run simulation.")
return None, f"Pipeline status is {pipeline.status}"
# 2. Create Run Record (dummy)
run = await crud.pipeline_run.create(
session, pipeline_id=pipeline_id, celery_task_id=task_id, status=schemas.PipelineStatus.RUNNING
)
run_id = run.id
logger.info(f"Created dummy PipelineRun record with RunID:{run_id}")
# 3. Update Pipeline Status (dummy)
await crud.pipeline.update(session, db_obj=pipeline, obj_in={"status": schemas.PipelineStatus.RUNNING})
logger.info(f"Set dummy Pipeline {pipeline_id} status to RUNNING")
# 4. Prepare sub-tasks (using dummy sources from get_with_details)
if not pipeline.data_sources:
logger.warning(f"No data sources found for pipeline {pipeline_id}. Finishing run.")
await crud.pipeline_run.update_run_status(
session, run_id=run_id, status=schemas.PipelineStatus.COMPLETED, run_log="No data sources found."
)
await crud.pipeline.update(session, db_obj=pipeline, obj_in={"status": schemas.PipelineStatus.IDLE})
return [], None # No tasks to run
sub_tasks = [dummy_source_task.s(ds.id, ds.type.value) for ds in pipeline.data_sources]
logger.info(f"Prepared {len(sub_tasks)} dummy sub-tasks for RunID:{run_id}")
return sub_tasks, None
async def fail_run(error_message: str):
"""Helper to mark run as failed if setup simulation fails."""
if run_id and AsyncSessionFactory:
logger.error(f"Simulating run failure for RunID:{run_id} - {error_message}")
async with AsyncSessionFactory() as session:
await crud.pipeline_run.update_run_status(
db=session,
run_id=run_id,
status=schemas.PipelineStatus.FAILED,
run_log=f"Orchestration failed: {error_message}",
)
await crud.pipeline.update(
db=session, db_obj=models.Pipeline(id=pipeline_id), obj_in={"status": schemas.PipelineStatus.IDLE}
)
try:
sub_task_signatures, error = async_to_sync(setup_and_dispatch())
if error:
logger.error(f"Orchestration setup simulation failed: {error}")
async_to_sync(fail_run(error)) # Mark the run as failed if a run record was created
return
if not sub_task_signatures:
logger.info("No sub-tasks to execute.")
return # Setup marked run as completed/failed
# Define the workflow chord
workflow = chord(
header=group(sub_task_signatures),
body=dummy_aggregate_task.s(pipeline_id=pipeline_id, run_id=run_id), # Ensure run_id is passed
)
# Simulate applying the workflow
logger.info(
f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Simulating Celery chord apply_async() for RunID:{run_id}"
)
# In a real test you might call workflow() directly to execute synchronously
# For this dummy structure, just log the intent.
logger.info(f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Workflow simulation scheduled for RunID:{run_id}")
except Exception as exc:
logger.error(
f"DUMMY TASK run_pipeline_task[ID:{task_id}]: Orchestration Simulation FAILED: {exc}", exc_info=True
)
async_to_sync(fail_run(f"Orchestration simulation exception: {type(exc).__name__}"))
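
Where run_pipeline_task only logs the chord, the real dispatch is a single call on the workflow object; a sketch, assuming a configured Celery app and broker:

# Hypothetical real dispatch, replacing the logged simulation in run_pipeline_task:
# run all dummy_source_task signatures in parallel, then hand their results to
# dummy_aggregate_task as the chord callback.
async_result = workflow.apply_async()
logger.info(f"Dispatched chord for RunID:{run_id}, callback task ID: {async_result.id}")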

backend/pyproject.toml (new file)
@@ -0,0 +1,34 @@
# pyproject.toml
[project]
name = "backend"
version = "0.1.0"
description = "Customizable Automated Data Integration Pipeline Backend"
requires-python = ">=3.11" # Playwright and modern libraries benefit from newer Python
dependencies = [
"fastapi",
"uvicorn[standard]", # Includes performance extras
"pydantic",
"pydantic-settings",
"sqlalchemy", # ORM
"psycopg2-binary", # Postgres driver (or asyncpg for async)
"asyncpg", # Async Postgres driver
"alembic", # Database migrations
"celery", # Background tasks
"redis", # Celery broker/backend
"playwright", # For self-hosted browser automation
"beautifulsoup4", # HTML parsing
"python-readability", # Clean HTML content extraction
"openai", # Or anthropic, google-generativeai for LLM
"pandas", # Data manipulation, file reading, export
"httpx", # Async HTTP requests (for APIs, LLM calls)
"python-multipart", # For FastAPI file uploads
"PyYAML", # For YAML export (if needed later)
"feedparser", # For parsing RSS/Atom feeds (News)
# Add other specific news API client libraries if needed
]
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[tool.ruff]