line-today-scrape/linetoday/registry.py
Sosokker 04b39ed883
Some checks failed
CI / test (push) Has been cancelled
scrape articles concurrently and add articles registry
2025-10-29 16:42:07 +07:00

50 lines
1.5 KiB
Python

from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Iterable
from .frontier import normalize_url
from .storage import STORAGE_DIR
# Default registry file used by ArticleRegistry when no explicit path is given;
# it lives in the 'meta' folder under STORAGE_DIR and is read line by line.
REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'
class ArticleRegistry:
"""Keeps track of processed article URLs across crawler runs."""
def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
self._path = path or REGISTRY_PATH
self._path.parent.mkdir(parents=True, exist_ok=True)
self._lock = asyncio.Lock()
self._processed: set[str] = set()
if self._path.exists():
for line in self._path.read_text(encoding='utf-8').splitlines():
line = line.strip()
if line:
self._processed.add(normalize_url(line))
if preload:
for item in preload:
self._processed.add(normalize_url(item))
def contains(self, url: str) -> bool:
return normalize_url(url) in self._processed
async def mark(self, url: str) -> None:
norm = normalize_url(url)
async with self._lock:
if norm in self._processed:
return
self._processed.add(norm)
await asyncio.to_thread(self._append_line, url)
def _append_line(self, url: str) -> None:
with self._path.open('a', encoding='utf-8') as fh:
fh.write(normalize_url(url))
fh.write('\n')
def size(self) -> int:
return len(self._processed)