# backend-api/pipeline/normalization/normalizer.py

"""
Normalizer module to transform raw records into a canonical schema.
"""

from typing import List, Dict, Any, Optional

from .base import TextExtractor
from .utils import flatten_dict, generate_id, extract_all_text


class _DefaultTextExtractor:
    """
    Fallback extraction strategy that delegates to ``extract_all_text``.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """
        Produce the text for a single record.

        Args:
            record: The record dict to pull text from.

        Returns:
            The result of calling ``extract_all_text`` on the record.
        """
        text = extract_all_text(record)
        return text


class Normalizer:
    """
    Class to normalize raw records into a canonical format.

    Text extraction is delegated to a pluggable extractor object exposing an
    ``extract(record)`` method; flattening and id generation are delegated to
    the ``flatten_dict`` and ``generate_id`` helpers.
    """

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """
        Initialize the Normalizer.

        Args:
            extractor: Optional custom TextExtractor strategy. When None,
                a default extractor is created.
        """
        # Compare against None explicitly rather than relying on truthiness:
        # a custom extractor that happens to be falsy (for example one that
        # defines __len__ or __bool__) must not be silently replaced by the
        # default strategy.
        self.extractor: TextExtractor = (
            extractor if extractor is not None else _DefaultTextExtractor()
        )

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """
        Normalize a list of raw records.

        Each raw record is flattened, then its text, id and metadata are
        derived from the flattened form. The original record is kept under
        the "raw" key (by reference, not copied).

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records, one per input record and in the
            same order, each with the keys "id", "source_type", "source",
            "raw", "metadata" and "text". An empty input yields an empty
            list.
        """
        normalized: List[Dict[str, Any]] = []

        for raw in records:
            flat = flatten_dict(raw)
            text = self.extractor.extract(flat)
            rec_id = generate_id(source, flat)
            # Metadata keeps only the flattened entries whose value is not a
            # string; string values are left to the text extractor.
            metadata = {k: v for k, v in flat.items() if not isinstance(v, str)}

            canonical = {
                "id": rec_id,
                "source_type": source_type,
                "source": source,
                "raw": raw,
                "metadata": metadata,
                "text": text,
            }
            normalized.append(canonical)

        return normalized


if __name__ == "__main__":
    # Small demonstration: normalize one nested record and print the result.
    demo_input = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    normalizer = Normalizer()
    print(
        normalizer.normalize(
            demo_input,
            source_type="api",
            source="https://example.com",
        )
    )