scrape articles concurrently and add articles registry

Sosokker 2025-10-29 16:42:07 +07:00
parent 0b5b9d98c5
commit 04b39ed883
4 changed files with 140 additions and 11 deletions


@@ -9,6 +9,7 @@ from .extractor import extract_article
 from .fetcher import Fetcher
 from .frontier import Frontier
 from .metrics import MetricsRecorder
+from .registry import ArticleRegistry
 from .robots import RobotsManager
 from .storage import store_parsed, store_snapshot
@@ -27,6 +28,18 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument('--per-origin-capacity', type=int, default=2, help='Burst size per origin token bucket')
     parser.add_argument('--per-origin-refill', type=float, default=2.0, help='Seconds per token refill for origin bucket')
     parser.add_argument('--timeout', type=float, default=20.0, help='HTTP request timeout')
+    parser.add_argument(
+        '--sections',
+        action='append',
+        default=[],
+        help='Seed from specific LINE TODAY sections (comma-separated or repeatable)',
+    )
+    parser.add_argument(
+        '--seed-article',
+        action='append',
+        default=[],
+        help='Seed crawler with specific article URLs or IDs (repeatable)',
+    )
     return parser
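
As an aside, here is a sketch of what the two new repeatable flags look like after parsing. The values are illustrative, and build_parser() is assumed to be importable from the crawler module shown in this diff:

    # Illustration only: exercising the new repeatable flags (values are hypothetical).
    parser = build_parser()
    args = parser.parse_args(['--sections', 'world,sports', '--seed-article', '1234567'])
    assert args.sections == ['world,sports']   # one entry per --sections occurrence; commas are split later in crawl()
    assert args.seed_article == ['1234567']    # bare IDs and full article URLs are both accepted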
@@ -47,9 +60,33 @@ async def crawl(args) -> None:
     metrics = MetricsRecorder()
     queue: asyncio.Queue[str | None] = asyncio.Queue()
-    if frontier.add(START_URL):
-        queue.put_nowait(START_URL)
-        metrics.inc('frontier_seeded')
+    registry = ArticleRegistry()
+    logging.info('loaded %s processed article(s) from registry', registry.size())
+
+    seeds: set[str] = set()
+    sections: list[str] = []
+    for entry in args.sections:
+        sections.extend([part.strip() for part in entry.split(',') if part.strip()])
+    if sections:
+        for section in sections:
+            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
+    else:
+        seeds.add(START_URL)
+
+    for entry in args.seed_article:
+        val = entry.strip()
+        if not val:
+            continue
+        if val.startswith('http'):
+            seeds.add(val)
+        else:
+            seeds.add(f'https://today.line.me/th/v3/article/{val}')
+
+    for seed in seeds:
+        if frontier.add(seed):
+            queue.put_nowait(seed)
+            metrics.inc('frontier_seeded')

     fetched = 0
     fetched_lock = asyncio.Lock()
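
For concreteness, the seeding above boils down to two URL templates. The helpers below are a standalone sketch that mirrors the f-strings in the hunk; they are not part of the codebase:

    # Standalone sketch of the seed URL templates used above (helper names are hypothetical).
    def section_seed(section: str) -> str:
        return f'https://today.line.me/th/v3/page/{section.lstrip("/")}'

    def article_seed(value: str) -> str:
        value = value.strip()
        # Full URLs pass through untouched; bare IDs are expanded to article URLs.
        return value if value.startswith('http') else f'https://today.line.me/th/v3/article/{value}'

    assert section_seed('/sports') == 'https://today.line.me/th/v3/page/sports'
    assert article_seed('1234567') == 'https://today.line.me/th/v3/article/1234567'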
@@ -85,6 +122,12 @@ async def crawl(args) -> None:
                queue.task_done()
                continue

+            is_article_url = '/article/' in url
+            if is_article_url and registry.contains(url):
+                metrics.inc('article_skipped_processed')
+                queue.task_done()
+                continue
+
             path = url[len(ORIGIN):]
             if not args.ignore_robots:
                 allowed = await asyncio.to_thread(robots.allowed, ORIGIN, path)
@@ -123,24 +166,38 @@ async def crawl(args) -> None:
                    logger.warning('skipping %s status %s', url, status)
                else:
                    html = res.get('text', '')
-                    await asyncio.to_thread(store_snapshot, url, html)
-                    metrics.inc('snapshots_written')
                    article, links = extract_article(html, url)
-                    if not args.dry_run:
-                        await asyncio.to_thread(store_parsed, article)
-                        metrics.inc('parsed_written')
+                    is_article = bool(article.get('is_article'))
+                    if is_article:
+                        await asyncio.to_thread(store_snapshot, url, html)
+                        metrics.inc('snapshots_written')
+                        if not args.dry_run:
+                            await asyncio.to_thread(store_parsed, article)
+                            metrics.inc('parsed_written')
+                        canonical = article.get('url') or url
+                        await registry.mark(canonical)
+                        metrics.inc('article_recorded')
+                        metrics.inc('articles_fetched')
+                        counted = True
+                    else:
+                        metrics.inc('non_article_fetched')

                    if not stop_event.is_set():
                        new_links = 0
                        for link in links:
                            if link.startswith('/'):
                                link = ORIGIN + link
+                            link_is_article = '/article/' in link
+                            if link_is_article and registry.contains(link):
+                                metrics.inc('article_link_skipped_processed')
+                                continue
                            if frontier.add(link):
                                queue.put_nowait(link)
                                new_links += 1
                        metrics.inc('links_enqueued', new_links)
-                        counted = True
+                        if is_article:
+                            metrics.inc('article_links_enqueued', new_links)

                if counted:
                    async with fetched_lock:
@@ -195,4 +252,3 @@ def main():
 if __name__ == '__main__':
     main()

linetoday/registry.py (new file, 49 lines added)

@@ -0,0 +1,49 @@
from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR

REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs."""

    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
        self._path = path or REGISTRY_PATH
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()
        if self._path.exists():
            for line in self._path.read_text(encoding='utf-8').splitlines():
                line = line.strip()
                if line:
                    self._processed.add(normalize_url(line))
        if preload:
            for item in preload:
                self._processed.add(normalize_url(item))

    def contains(self, url: str) -> bool:
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            await asyncio.to_thread(self._append_line, url)

    def _append_line(self, url: str) -> None:
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(normalize_url(url))
            fh.write('\n')

    def size(self) -> int:
        return len(self._processed)
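
A minimal usage sketch for the new registry; the path and URL below are arbitrary examples, and omitting the path falls back to REGISTRY_PATH under STORAGE_DIR / 'meta':

    import asyncio
    from pathlib import Path

    from linetoday.registry import ArticleRegistry

    registry = ArticleRegistry(path=Path('/tmp/processed_articles.txt'))  # example path, not the default
    url = 'https://today.line.me/th/v3/article/example-id'                # hypothetical article URL
    if not registry.contains(url):
        asyncio.run(registry.mark(url))  # appends the normalized URL to the file and caches it in memory
    print(registry.size())               # number of processed articles known to this instance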


@@ -1,5 +1,7 @@
 from pathlib import Path

+import pytest
+
 from linetoday.extractor import extract_article
@@ -23,6 +25,8 @@ def test_article_fixture_uses_structured_payload():
 def test_snapshots_are_classified_and_sanitised():
     # iterate all html snapshots in data/snapshots
+    if not SNAPSHOT_DIR.exists():
+        pytest.skip('No snapshot fixtures available')
     files = sorted([p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html'])
     assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

tests/test_registry.py (new file, 20 lines added)

@@ -0,0 +1,20 @@
import asyncio
from pathlib import Path

from linetoday.registry import ArticleRegistry


def test_registry_persist_roundtrip(tmp_path: Path):
    registry_path = tmp_path / 'processed.txt'
    url = 'https://today.line.me/th/v3/article/test123'

    registry = ArticleRegistry(path=registry_path)
    assert not registry.contains(url)

    asyncio.run(registry.mark(url))
    assert registry.contains(url)

    # Reload and ensure persistence
    registry2 = ArticleRegistry(path=registry_path)
    assert registry2.contains(url)
    assert registry2.size() == 1