# 50 lines · 1.5 KiB · Python
from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR
# On-disk registry file: one processed (normalized) article URL per line,
# kept under the shared storage tree so it survives across crawler runs.
REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs.

    URLs are passed through ``normalize_url`` before being stored, so
    equivalent spellings of the same article map to a single entry both in
    the in-memory set and in the on-disk registry file.
    """

    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
        """Load previously processed URLs into memory.

        Args:
            path: Registry file location; defaults to ``REGISTRY_PATH``.
                Parent directories are created if missing.
            preload: Optional URLs to mark as processed immediately.
                NOTE: preloaded URLs are held in memory only — they are not
                written to the registry file here.
        """
        self._path = path or REGISTRY_PATH
        self._path.parent.mkdir(parents=True, exist_ok=True)
        # Serializes concurrent mark() calls so the set and the file
        # cannot diverge when two coroutines mark the same URL at once.
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()

        if self._path.exists():
            for line in self._path.read_text(encoding='utf-8').splitlines():
                line = line.strip()
                if line:
                    self._processed.add(normalize_url(line))

        if preload:
            for item in preload:
                self._processed.add(normalize_url(item))

    def contains(self, url: str) -> bool:
        """Return True if *url* (after normalization) was already processed."""
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        """Record *url* as processed and append it to the registry file.

        Idempotent: a URL whose normalized form is already registered is
        not re-appended. The file append runs in a worker thread so the
        event loop is not blocked by disk I/O.
        """
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            # BUGFIX: persist the already-normalized form. The original
            # passed the raw URL and normalized it a second time inside
            # _append_line; if normalize_url is not idempotent, the on-disk
            # entry could differ from the in-memory one, breaking
            # deduplication on the next run.
            await asyncio.to_thread(self._append_line, norm)

    def _append_line(self, line: str) -> None:
        """Append one pre-normalized URL to the registry file (blocking I/O)."""
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(line)
            fh.write('\n')

    def size(self) -> int:
        """Return the number of distinct processed URLs held in memory."""
        return len(self._processed)