# 50 lines · 1.5 KiB · Python
from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR
# On-disk registry file: one processed (normalized) article URL per line,
# kept under the shared storage tree so it survives across crawler runs.
REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs.

    URLs are passed through ``normalize_url`` before being stored, so
    equivalent spellings of the same article map to a single entry both in
    the in-memory set and in the on-disk registry file.
    """

    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
        """Load previously processed URLs into memory.

        Args:
            path: Registry file location; defaults to ``REGISTRY_PATH``.
                Parent directories are created if missing.
            preload: Optional URLs to mark as processed immediately.
                NOTE: preloaded URLs are held in memory only — they are not
                written to the registry file here.
        """
        self._path = path or REGISTRY_PATH
        self._path.parent.mkdir(parents=True, exist_ok=True)
        # Serializes concurrent mark() calls so the set and the file
        # cannot diverge when two coroutines mark the same URL at once.
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()

        if self._path.exists():
            for line in self._path.read_text(encoding='utf-8').splitlines():
                line = line.strip()
                if line:
                    self._processed.add(normalize_url(line))

        if preload:
            for item in preload:
                self._processed.add(normalize_url(item))

    def contains(self, url: str) -> bool:
        """Return True if *url* (after normalization) was already processed."""
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        """Record *url* as processed and append it to the registry file.

        Idempotent: a URL whose normalized form is already registered is
        not re-appended. The file append runs in a worker thread so the
        event loop is not blocked by disk I/O.
        """
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            # BUGFIX: persist the already-normalized form. The original
            # passed the raw URL and normalized it a second time inside
            # _append_line; if normalize_url is not idempotent, the on-disk
            # entry could differ from the in-memory one, breaking
            # deduplication on the next run.
            await asyncio.to_thread(self._append_line, norm)

    def _append_line(self, line: str) -> None:
        """Append one pre-normalized URL to the registry file (blocking I/O)."""
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(line)
            fh.write('\n')

    def size(self) -> int:
        """Return the number of distinct processed URLs held in memory."""
        return len(self._processed)