mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-19 12:44:04 +01:00
86 lines
2.3 KiB
Python
86 lines
2.3 KiB
Python
"""
|
|
Normalizer module to transform raw records into a canonical schema.
|
|
"""
|
|
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
from .base import TextExtractor
|
|
from .utils import flatten_dict, generate_id, extract_all_text
|
|
|
|
|
|
class _DefaultTextExtractor:
    """
    Fallback extraction strategy backed by the extract_all_text helper.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """
        Produce the text for a single record.

        Args:
            record: The record dict to pull text from (the Normalizer
                passes the flattened form of each raw record).

        Returns:
            Whatever extract_all_text yields for the given record.
        """
        extracted = extract_all_text(record)
        return extracted
|
|
|
|
|
|
class Normalizer:
    """
    Class to normalize raw records into a canonical format.

    Every canonical record is a dict with the keys "id", "source_type",
    "source", "raw", "metadata" and "text".
    """

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """
        Initialize the Normalizer.

        Args:
            extractor: Optional custom TextExtractor strategy. When None,
                a _DefaultTextExtractor is created and used instead.
        """
        # Test against None explicitly rather than relying on truthiness:
        # a caller-supplied extractor that happens to evaluate as falsy
        # (e.g. it defines __bool__ or __len__) must not be silently
        # swapped for the default.
        if extractor is None:
            extractor = _DefaultTextExtractor()
        self.extractor: TextExtractor = extractor

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """
        Normalize a list of raw records.

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records, one per input record and in the
            same order. An empty input yields an empty list.
        """
        return [
            self._normalize_record(raw, source_type, source)
            for raw in records
        ]

    def _normalize_record(
        self,
        raw: Dict[str, Any],
        source_type: str,
        source: str
    ) -> Dict[str, Any]:
        """
        Build the canonical dict for a single raw record.

        Args:
            raw: One raw record; stored as-is (not copied) under "raw".
            source_type: Value stored under "source_type".
            source: Value stored under "source" and passed to generate_id.

        Returns:
            The canonical record dict.
        """
        flat = flatten_dict(raw)
        text = self.extractor.extract(flat)
        rec_id = generate_id(source, flat)
        # Metadata keeps only the flattened entries whose value is not a
        # string; string values are left to the text extractor.
        metadata = {k: v for k, v in flat.items() if not isinstance(v, str)}

        return {
            "id": rec_id,
            "source_type": source_type,
            "source": source,
            "raw": raw,
            "metadata": metadata,
            "text": text,
        }
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: push one nested record through a Normalizer built with the
    # default extractor and print the resulting canonical records.
    demo_input = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    print(
        Normalizer().normalize(
            demo_input,
            source_type="api",
            source="https://example.com",
        )
    )