from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR

# Default location of the on-disk registry: one URL per line, UTF-8 encoded.
REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs.

    URLs are passed through ``normalize_url`` before being stored or looked
    up. The set of known URLs is held in memory and mirrored to a plain-text
    file (one URL per line) that is re-read when a new registry is created.
    """

    def __init__(
        self,
        path: Path | str | None = None,
        preload: Iterable[str] | None = None,
    ):
        """Load previously recorded URLs and optional extra seeds.

        Args:
            path: Registry file to use; a falsy value selects
                ``REGISTRY_PATH``. Strings are converted to ``Path``.
            preload: Additional URLs to treat as already processed. They are
                added to the in-memory set only and are not written to disk
                by the constructor.
        """
        self._path = Path(path or REGISTRY_PATH)
        # Make sure later appends do not fail on a missing directory.
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()

        # EAFP: read directly rather than exists()+read, which can race with
        # another process removing the file in between.
        try:
            stored = self._path.read_text(encoding='utf-8')
        except FileNotFoundError:
            stored = ''

        stripped_lines = (raw.strip() for raw in stored.splitlines())
        self._processed.update(
            normalize_url(entry) for entry in stripped_lines if entry
        )

        if preload:
            self._processed.update(normalize_url(item) for item in preload)

    def contains(self, url: str) -> bool:
        """Return True if the normalized form of *url* is already recorded."""
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        """Record *url* as processed, in memory and in the registry file.

        Does nothing when the normalized URL is already known. The blocking
        file append runs in a worker thread via ``asyncio.to_thread``.

        Raises:
            Exception: Whatever the file append raises (e.g. ``OSError``).
                In that case the URL is removed from the in-memory set again
                so that a later ``mark`` call can retry the write.
        """
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            try:
                await asyncio.to_thread(self._append_line, url)
            except Exception:
                # The append failed, so the URL is not on disk. Without this
                # rollback the early return above would make every retry a
                # silent no-op and the URL would be lost on the next run.
                # CancelledError is a BaseException and is deliberately not
                # caught: the worker thread may still complete the write.
                self._processed.discard(norm)
                raise

    def _append_line(self, url: str) -> None:
        """Append the normalized form of *url* as one line to the registry file."""
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(f'{normalize_url(url)}\n')

    def size(self) -> int:
        """Return the number of distinct normalized URLs currently recorded."""
        return len(self._processed)