mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-19 20:54:05 +01:00
remove normalization
This commit is contained in:
parent
5cd5fcfeb3
commit
31e35ff1b0
@@ -1,6 +0,0 @@
|
|||||||
"""
|
|
||||||
Normalization package for data integration service.
|
|
||||||
|
|
||||||
Provides utilities and classes to normalize raw records
|
|
||||||
into a canonical schema.
|
|
||||||
"""
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
"""
|
|
||||||
Base module defining protocols for the normalization layer.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Protocol, Dict, Any
|
|
||||||
|
|
||||||
|
|
||||||
class TextExtractor(Protocol):
    """Structural interface for pluggable text-extraction strategies.

    Any object exposing a matching ``extract`` method satisfies this
    protocol; no explicit inheritance is required.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """Pull the textual content out of a flattened record.

        Args:
            record: A flattened record dict.

        Returns:
            The extracted text as a single string.
        """
        ...
|
|
||||||
@@ -1,86 +0,0 @@
|
|||||||
"""
|
|
||||||
Normalizer module to transform raw records into a canonical schema.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import List, Dict, Any, Optional
|
|
||||||
|
|
||||||
from .base import TextExtractor
|
|
||||||
from .utils import flatten_dict, generate_id, extract_all_text
|
|
||||||
|
|
||||||
|
|
||||||
class _DefaultTextExtractor:
    """Fallback extraction strategy delegating to ``extract_all_text``."""

    def extract(self, record: Dict[str, Any]) -> str:
        """Concatenate every string value found in the record.

        Args:
            record: A flattened record dict.

        Returns:
            All string values joined into a single text blob.
        """
        combined = extract_all_text(record)
        return combined
|
|
||||||
|
|
||||||
|
|
||||||
class Normalizer:
    """Transforms raw source records into the canonical record schema."""

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """Set up the normalizer.

        Args:
            extractor: Custom text-extraction strategy; when omitted the
                package default extractor is used.
        """
        chosen = extractor or _DefaultTextExtractor()
        self.extractor: TextExtractor = chosen

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """Convert raw records into canonical dicts.

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records matching the schema.
        """
        canonical_records: List[Dict[str, Any]] = []
        for raw in records:
            flat = flatten_dict(raw)
            # Same call order as before: text first, then the stable id.
            doc_text = self.extractor.extract(flat)
            doc_id = generate_id(source, flat)
            # Non-string leaves are kept aside as metadata; strings end
            # up in the searchable text field instead.
            non_text = {key: val for key, val in flat.items() if not isinstance(val, str)}
            canonical_records.append({
                "id": doc_id,
                "source_type": source_type,
                "source": source,
                "raw": raw,
                "metadata": non_text,
                "text": doc_text,
            })
        return canonical_records
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Demo: normalize a single nested record from a fake API source.
    sample = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    demo_normalizer = Normalizer()
    output = demo_normalizer.normalize(
        sample, source_type="api", source="https://example.com"
    )
    print(output)
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
"""
|
|
||||||
Utility functions for the normalization layer.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import uuid
|
|
||||||
from typing import Dict, Any
|
|
||||||
|
|
||||||
|
|
||||||
def flatten_dict(
    d: Dict[str, Any],
    parent_key: str = "",
    sep: str = "."
) -> Dict[str, Any]:
    """Recursively flatten a nested dictionary into compound keys.

    Nested mappings are merged into the result with their keys joined to
    the parent key by *sep*, e.g. ``{"a": {"b": 1}}`` -> ``{"a.b": 1}``.

    Args:
        d: The dictionary to flatten.
        parent_key: Key prefix carried through recursive calls.
        sep: Separator placed between compound key parts.

    Returns:
        A flat dictionary whose keys encode the original nesting.
    """
    items: Dict[str, Any] = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict) and v:
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            # Bug fix: an empty dict used to be recursed into, which
            # produced no entries and silently dropped the key. Keep it
            # (and every other leaf) as-is instead.
            items[new_key] = v
    return items
|
|
||||||
|
|
||||||
|
|
||||||
def generate_id(source: str, record: Dict[str, Any]) -> str:
    """Generate a deterministic UUIDv5 for a record within a source.

    The record is serialized with sorted keys so key order never affects
    the ID: identical (source, record) pairs always map to the same UUID.

    Args:
        source: Identifier for the data source (URL or file path).
        record: The flattened record dict.

    Returns:
        A string representation of a UUID.
    """
    # default=str is deliberate: non-JSON leaves (datetimes, bytes,
    # Decimals, ...) contribute a stable textual fingerprint instead of
    # raising TypeError for otherwise-valid records. Output is unchanged
    # for records that were already JSON-serializable.
    record_json = json.dumps(record, sort_keys=True, default=str)
    uid = uuid.uuid5(uuid.NAMESPACE_URL, f"{source}-{record_json}")
    return str(uid)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_all_text(record: Dict[str, Any]) -> str:
    """Collect every string value of the record into one text blob.

    Args:
        record: A flattened record dict.

    Returns:
        All textual values joined by single spaces, in the record's
        iteration order.
    """
    pieces = []
    for value in record.values():
        if isinstance(value, str):
            pieces.append(value)
    return " ".join(pieces)
|
|
||||||
Loading…
Reference in New Issue
Block a user