scrape articles concurrently and add articles registry
CI: test (push) has been cancelled; some checks failed.

Author: Sosokker
Date: 2025-10-29 16:42:07 +07:00
Parent: 0b5b9d98c5
Commit: 04b39ed883
4 changed files with 140 additions and 11 deletions


@ -9,6 +9,7 @@ from .extractor import extract_article
from .fetcher import Fetcher
from .frontier import Frontier
from .metrics import MetricsRecorder
from .registry import ArticleRegistry
from .robots import RobotsManager
from .storage import store_parsed, store_snapshot
@ -27,6 +28,18 @@ def build_parser() -> argparse.ArgumentParser:
    parser.add_argument('--per-origin-capacity', type=int, default=2, help='Burst size per origin token bucket')
    parser.add_argument('--per-origin-refill', type=float, default=2.0, help='Seconds per token refill for origin bucket')
    parser.add_argument('--timeout', type=float, default=20.0, help='HTTP request timeout')
    parser.add_argument(
        '--sections',
        action='append',
        default=[],
        help='Seed from specific LINE TODAY sections (comma-separated or repeatable)',
    )
    parser.add_argument(
        '--seed-article',
        action='append',
        default=[],
        help='Seed crawler with specific article URLs or IDs (repeatable)',
    )
    return parser
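
A reduced stand-in (not from this commit) illustrating how the two new repeatable flags parse; the section names and article ID below are made up:

import argparse

# Stand-in parser with only the two flags added above; all other crawler flags omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--sections', action='append', default=[])
parser.add_argument('--seed-article', action='append', default=[])

args = parser.parse_args([
    '--sections', 'sport,entertainment',   # comma-separated in a single flag
    '--sections', 'news',                  # or repeated
    '--seed-article', '12345',             # bare article ID (made up)
])
# args.sections == ['sport,entertainment', 'news']; comma-splitting happens later in crawl()
# args.seed_article == ['12345']
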
@ -47,9 +60,33 @@ async def crawl(args) -> None:
    metrics = MetricsRecorder()
    queue: asyncio.Queue[str | None] = asyncio.Queue()
    if frontier.add(START_URL):
        queue.put_nowait(START_URL)
        metrics.inc('frontier_seeded')
    registry = ArticleRegistry()
    logging.info('loaded %s processed article(s) from registry', registry.size())
    seeds: set[str] = set()
    sections: list[str] = []
    for entry in args.sections:
        sections.extend([part.strip() for part in entry.split(',') if part.strip()])
    if sections:
        for section in sections:
            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
    else:
        seeds.add(START_URL)
    for entry in args.seed_article:
        val = entry.strip()
        if not val:
            continue
        if val.startswith('http'):
            seeds.add(val)
        else:
            seeds.add(f'https://today.line.me/th/v3/article/{val}')
    for seed in seeds:
        if frontier.add(seed):
            queue.put_nowait(seed)
            metrics.inc('frontier_seeded')
    fetched = 0
    fetched_lock = asyncio.Lock()
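
For reference, a standalone sketch (not from this commit) that mirrors the seed-expansion block above; the START_URL value and the example inputs are assumptions:

# Sketch of the seeding behaviour; START_URL value is assumed, not taken from the crawler module.
START_URL = 'https://today.line.me/th/v3'

def build_seeds(sections_args: list[str], seed_articles: list[str]) -> set[str]:
    """Comma-split --sections, fall back to START_URL, accept article URLs or bare IDs."""
    seeds: set[str] = set()
    sections = [part.strip() for entry in sections_args for part in entry.split(',') if part.strip()]
    if sections:
        for section in sections:
            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
    else:
        seeds.add(START_URL)
    for entry in seed_articles:
        val = entry.strip()
        if val:
            seeds.add(val if val.startswith('http') else f'https://today.line.me/th/v3/article/{val}')
    return seeds

# Example (section names and ID are made up):
# build_seeds(['sport,entertainment'], ['12345']) ->
# {'https://today.line.me/th/v3/page/sport',
#  'https://today.line.me/th/v3/page/entertainment',
#  'https://today.line.me/th/v3/article/12345'}
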
@ -85,6 +122,12 @@ async def crawl(args) -> None:
                queue.task_done()
                continue
            is_article_url = '/article/' in url
            if is_article_url and registry.contains(url):
                metrics.inc('article_skipped_processed')
                queue.task_done()
                continue
            path = url[len(ORIGIN):]
            if not args.ignore_robots:
                allowed = await asyncio.to_thread(robots.allowed, ORIGIN, path)
@ -123,24 +166,38 @@ async def crawl(args) -> None:
                logger.warning('skipping %s status %s', url, status)
            else:
                html = res.get('text', '')
                await asyncio.to_thread(store_snapshot, url, html)
                metrics.inc('snapshots_written')
                article, links = extract_article(html, url)
                if not args.dry_run:
                    await asyncio.to_thread(store_parsed, article)
                    metrics.inc('parsed_written')
                is_article = bool(article.get('is_article'))
                if is_article:
                    await asyncio.to_thread(store_snapshot, url, html)
                    metrics.inc('snapshots_written')
                    if not args.dry_run:
                        await asyncio.to_thread(store_parsed, article)
                        metrics.inc('parsed_written')
                    canonical = article.get('url') or url
                    await registry.mark(canonical)
                    metrics.inc('article_recorded')
                    metrics.inc('articles_fetched')
                    counted = True
                else:
                    metrics.inc('non_article_fetched')
                if not stop_event.is_set():
                    new_links = 0
                    for link in links:
                        if link.startswith('/'):
                            link = ORIGIN + link
                        link_is_article = '/article/' in link
                        if link_is_article and registry.contains(link):
                            metrics.inc('article_link_skipped_processed')
                            continue
                        if frontier.add(link):
                            queue.put_nowait(link)
                            new_links += 1
                    metrics.inc('links_enqueued', new_links)
                    counted = True
                    if is_article:
                        metrics.inc('article_links_enqueued', new_links)
            if counted:
                async with fetched_lock:
@ -195,4 +252,3 @@ def main():
if __name__ == '__main__':
    main()

linetoday/registry.py Normal file

@ -0,0 +1,49 @@
from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR

REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs."""

    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
        self._path = path or REGISTRY_PATH
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()
        if self._path.exists():
            for line in self._path.read_text(encoding='utf-8').splitlines():
                line = line.strip()
                if line:
                    self._processed.add(normalize_url(line))
        if preload:
            for item in preload:
                self._processed.add(normalize_url(item))

    def contains(self, url: str) -> bool:
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            await asyncio.to_thread(self._append_line, url)

    def _append_line(self, url: str) -> None:
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(normalize_url(url))
            fh.write('\n')

    def size(self) -> int:
        return len(self._processed)
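
A short usage sketch (not from this commit) of the registry as the crawler workers use it; the demo path is hypothetical. Concurrent mark() calls are serialized by the internal asyncio.Lock and persisted through the append-only file:

import asyncio
import tempfile
from pathlib import Path

from linetoday.registry import ArticleRegistry

async def demo() -> None:
    # Hypothetical location for the demo; the crawler itself defaults to REGISTRY_PATH.
    path = Path(tempfile.gettempdir()) / 'processed_articles_demo.txt'
    registry = ArticleRegistry(path=path)
    urls = [f'https://today.line.me/th/v3/article/demo{i}' for i in range(3)]
    # Safe to call concurrently: the lock serializes in-memory updates and file appends.
    await asyncio.gather(*(registry.mark(u) for u in urls))
    assert all(registry.contains(u) for u in urls)
    print('processed articles:', registry.size())

# asyncio.run(demo())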


@ -1,5 +1,7 @@
from pathlib import Path
import pytest
from linetoday.extractor import extract_article
@ -23,6 +25,8 @@ def test_article_fixture_uses_structured_payload():
def test_snapshots_are_classified_and_sanitised():
    # iterate all html snapshots in data/snapshots
    if not SNAPSHOT_DIR.exists():
        pytest.skip('No snapshot fixtures available')
    files = sorted([p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html'])
    assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

tests/test_registry.py Normal file

@ -0,0 +1,20 @@
import asyncio
from pathlib import Path

from linetoday.registry import ArticleRegistry


def test_registry_persist_roundtrip(tmp_path: Path):
    registry_path = tmp_path / 'processed.txt'
    url = 'https://today.line.me/th/v3/article/test123'

    registry = ArticleRegistry(path=registry_path)
    assert not registry.contains(url)

    asyncio.run(registry.mark(url))
    assert registry.contains(url)

    # Reload and ensure persistence
    registry2 = ArticleRegistry(path=registry_path)
    assert registry2.contains(url)
    assert registry2.size() == 1