backend-api/normalization/normalizer.py

"""
Normalizer module to transform raw records into a canonical schema.
"""
from typing import List, Dict, Any, Optional
from .base import TextExtractor
from .utils import flatten_dict, generate_id, extract_all_text


class _DefaultTextExtractor:
    """
    Default text extractor using the extract_all_text utility.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """
        Extract text from the record.

        Args:
            record: A flattened record dict.

        Returns:
            A string containing concatenated text values.
        """
        return extract_all_text(record)


class Normalizer:
    """
    Class to normalize raw records into a canonical format.
    """

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """
        Initialize the Normalizer.

        Args:
            extractor: Optional custom TextExtractor strategy; it must expose an
                extract(record) -> str method. Defaults to _DefaultTextExtractor.
        """
        self.extractor: TextExtractor = extractor or _DefaultTextExtractor()

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """
        Normalize a list of raw records.

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records matching the schema.
        """
        normalized: List[Dict[str, Any]] = []
        for raw in records:
            # Flatten nested structures so text extraction and ID generation
            # operate on a single-level mapping.
            flat = flatten_dict(raw)
            text = self.extractor.extract(flat)
            rec_id = generate_id(source, flat)
            # Keep the non-string flattened values (counts, flags, etc.) as metadata.
            metadata = {k: v for k, v in flat.items() if not isinstance(v, str)}
            canonical = {
                "id": rec_id,
                "source_type": source_type,
                "source": source,
                "raw": raw,
                "metadata": metadata,
                "text": text,
            }
            normalized.append(canonical)
        return normalized


if __name__ == "__main__":
    # Example usage
    sample = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    norm = Normalizer()
    records = norm.normalize(sample, source_type="api", source="https://example.com")
    print(records)
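
    # Sketch: swapping in a custom extractor strategy. TitleOnlyExtractor is a
    # hypothetical example (not part of the module); the only assumption is that
    # the object exposes extract(record) -> str, as _DefaultTextExtractor does.
    class TitleOnlyExtractor:
        """Extract only the flattened 'title' value, ignoring everything else."""

        def extract(self, record: Dict[str, Any]) -> str:
            return str(record.get("title", ""))

    title_norm = Normalizer(extractor=TitleOnlyExtractor())
    print(title_norm.normalize(sample, source_type="api", source="https://example.com"))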