mirror of
https://github.com/borbann-platform/backend-api.git
synced 2025-12-18 12:14:05 +01:00
remove normalization
This commit is contained in:
parent
5cd5fcfeb3
commit
31e35ff1b0
@ -1,6 +0,0 @@
|
||||
"""
|
||||
Normalization package for data integration service.
|
||||
|
||||
Provides utilities and classes to normalize raw records
|
||||
into a canonical schema.
|
||||
"""
|
||||
@ -1,23 +0,0 @@
|
||||
"""
|
||||
Base module defining protocols for the normalization layer.
|
||||
"""
|
||||
|
||||
from typing import Protocol, Dict, Any
|
||||
|
||||
|
||||
class TextExtractor(Protocol):
    """Structural interface for pluggable text-extraction strategies.

    Any object providing a matching ``extract`` method satisfies this
    protocol; no inheritance is required.
    """

    def extract(self, record: Dict[str, Any]) -> str:
        """Pull the text content out of a flattened record.

        Args:
            record: A flattened record dict.

        Returns:
            The extracted text as a single string.
        """
        ...
|
||||
@ -1,86 +0,0 @@
|
||||
"""
|
||||
Normalizer module to transform raw records into a canonical schema.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
from .base import TextExtractor
|
||||
from .utils import flatten_dict, generate_id, extract_all_text
|
||||
|
||||
|
||||
class _DefaultTextExtractor:
    """Fallback TextExtractor backed by the extract_all_text utility."""

    def extract(self, record: Dict[str, Any]) -> str:
        """Gather every string value from the record.

        Args:
            record: A flattened record dict.

        Returns:
            All text values concatenated into one string.
        """
        combined = extract_all_text(record)
        return combined
|
||||
|
||||
|
||||
class Normalizer:
    """Transforms raw records from any source into the canonical schema."""

    def __init__(self, extractor: Optional[TextExtractor] = None):
        """Set up the normalizer.

        Args:
            extractor: Optional custom TextExtractor strategy; when omitted
                the default string-concatenating extractor is used.
        """
        self.extractor: TextExtractor = extractor or _DefaultTextExtractor()

    def normalize(
        self,
        records: List[Dict[str, Any]],
        source_type: str,
        source: str
    ) -> List[Dict[str, Any]]:
        """Convert raw records into canonical records.

        Args:
            records: Raw records to normalize.
            source_type: Type of the source ('api', 'scrape', 'file').
            source: Original source identifier (URL or path).

        Returns:
            A list of canonical records matching the schema.
        """
        output: List[Dict[str, Any]] = []
        for raw_record in records:
            flattened = flatten_dict(raw_record)
            # Text goes into its own field; everything non-string is metadata.
            text_value = self.extractor.extract(flattened)
            record_id = generate_id(source, flattened)
            non_text = {
                key: value
                for key, value in flattened.items()
                if not isinstance(value, str)
            }
            output.append({
                "id": record_id,
                "source_type": source_type,
                "source": source,
                "raw": raw_record,
                "metadata": non_text,
                "text": text_value,
            })
        return output
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demonstration run against one small nested sample record.
    demo_input = [{"title": "Hello", "details": {"body": "World", "count": 5}}]
    normalizer = Normalizer()
    canonical = normalizer.normalize(
        demo_input, source_type="api", source="https://example.com"
    )
    print(canonical)
|
||||
@ -1,64 +0,0 @@
|
||||
"""
|
||||
Utility functions for the normalization layer.
|
||||
"""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
def flatten_dict(
    d: Dict[str, Any],
    parent_key: str = "",
    sep: str = "."
) -> Dict[str, Any]:
    """Flatten a nested dictionary into a single level with compound keys.

    Args:
        d: The dictionary to flatten.
        parent_key: Prefix carried through recursive calls.
        sep: Separator placed between joined key segments.

    Returns:
        A new dict whose keys are sep-joined paths into the original.
    """
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        compound = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dicts, extending the key path.
            flat.update(flatten_dict(value, compound, sep=sep))
        else:
            flat[compound] = value
    return flat
|
||||
|
||||
|
||||
def generate_id(source: str, record: Dict[str, Any]) -> str:
    """Generate a stable UUID based on source and record content.

    The record is serialized with sorted keys so the same content always
    yields the same ID regardless of key insertion order.

    Args:
        source: Identifier for the data source (URL or file path).
        record: The flattened record dict.

    Returns:
        A string representation of a UUIDv5.
    """
    # default=str keeps serialization from raising TypeError on values json
    # cannot encode natively (datetime, set, Decimal, ...); IDs for records
    # containing only plain-JSON values are unchanged.
    record_json = json.dumps(record, sort_keys=True, default=str)
    namespace = uuid.NAMESPACE_URL
    uid = uuid.uuid5(namespace, f"{source}-{record_json}")
    return str(uid)
|
||||
|
||||
|
||||
def extract_all_text(record: Dict[str, Any]) -> str:
    """Join every string value in the record into one space-separated string.

    Args:
        record: A flattened record dict.

    Returns:
        All string values concatenated, separated by single spaces.
    """
    return " ".join(
        value for value in record.values() if isinstance(value, str)
    )
|
||||
Loading…
Reference in New Issue
Block a user