scrape articles concurrently and add articles registry
CI: test (push) has been cancelled; some checks failed.

Author: Sosokker
Date: 2025-10-29 16:42:07 +07:00
Parent: 0b5b9d98c5
Commit: 04b39ed883
4 changed files with 140 additions and 11 deletions


@ -9,6 +9,7 @@ from .extractor import extract_article
from .fetcher import Fetcher
from .frontier import Frontier
from .metrics import MetricsRecorder
from .registry import ArticleRegistry
from .robots import RobotsManager
from .storage import store_parsed, store_snapshot
@ -27,6 +28,18 @@ def build_parser() -> argparse.ArgumentParser:
    parser.add_argument('--per-origin-capacity', type=int, default=2, help='Burst size per origin token bucket')
    parser.add_argument('--per-origin-refill', type=float, default=2.0, help='Seconds per token refill for origin bucket')
    parser.add_argument('--timeout', type=float, default=20.0, help='HTTP request timeout')
    parser.add_argument(
        '--sections',
        action='append',
        default=[],
        help='Seed from specific LINE TODAY sections (comma-separated or repeatable)',
    )
    parser.add_argument(
        '--seed-article',
        action='append',
        default=[],
        help='Seed crawler with specific article URLs or IDs (repeatable)',
    )
    return parser
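
A reduced stand-in (not from this commit) illustrating how the two new repeatable flags parse; the section names and article ID below are made up:

import argparse

# Stand-in parser with only the two flags added above; all other crawler flags omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--sections', action='append', default=[])
parser.add_argument('--seed-article', action='append', default=[])

args = parser.parse_args([
    '--sections', 'sport,entertainment',   # comma-separated in a single flag
    '--sections', 'news',                  # or repeated
    '--seed-article', '12345',             # bare article ID (made up)
])
# args.sections == ['sport,entertainment', 'news']; comma-splitting happens later in crawl()
# args.seed_article == ['12345']
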
@ -47,9 +60,33 @@ async def crawl(args) -> None:
    metrics = MetricsRecorder()
    queue: asyncio.Queue[str | None] = asyncio.Queue()
    if frontier.add(START_URL):
        queue.put_nowait(START_URL)
        metrics.inc('frontier_seeded')
    registry = ArticleRegistry()
    logging.info('loaded %s processed article(s) from registry', registry.size())
    seeds: set[str] = set()
    sections: list[str] = []
    for entry in args.sections:
        sections.extend([part.strip() for part in entry.split(',') if part.strip()])
    if sections:
        for section in sections:
            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
    else:
        seeds.add(START_URL)
    for entry in args.seed_article:
        val = entry.strip()
        if not val:
            continue
        if val.startswith('http'):
            seeds.add(val)
        else:
            seeds.add(f'https://today.line.me/th/v3/article/{val}')
    for seed in seeds:
        if frontier.add(seed):
            queue.put_nowait(seed)
            metrics.inc('frontier_seeded')
    fetched = 0
    fetched_lock = asyncio.Lock()
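
For reference, a standalone sketch (not from this commit) that mirrors the seed-expansion block above; the START_URL value and the example inputs are assumptions:

# Sketch of the seeding behaviour; START_URL value is assumed, not taken from the crawler module.
START_URL = 'https://today.line.me/th/v3'

def build_seeds(sections_args: list[str], seed_articles: list[str]) -> set[str]:
    """Comma-split --sections, fall back to START_URL, accept article URLs or bare IDs."""
    seeds: set[str] = set()
    sections = [part.strip() for entry in sections_args for part in entry.split(',') if part.strip()]
    if sections:
        for section in sections:
            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
    else:
        seeds.add(START_URL)
    for entry in seed_articles:
        val = entry.strip()
        if val:
            seeds.add(val if val.startswith('http') else f'https://today.line.me/th/v3/article/{val}')
    return seeds

# Example (section names and ID are made up):
# build_seeds(['sport,entertainment'], ['12345']) ->
# {'https://today.line.me/th/v3/page/sport',
#  'https://today.line.me/th/v3/page/entertainment',
#  'https://today.line.me/th/v3/article/12345'}
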
@ -85,6 +122,12 @@ async def crawl(args) -> None:
                queue.task_done()
                continue
            is_article_url = '/article/' in url
            if is_article_url and registry.contains(url):
                metrics.inc('article_skipped_processed')
                queue.task_done()
                continue
            path = url[len(ORIGIN):]
            if not args.ignore_robots:
                allowed = await asyncio.to_thread(robots.allowed, ORIGIN, path)
@ -123,24 +166,38 @@ async def crawl(args) -> None:
                logger.warning('skipping %s status %s', url, status)
            else:
                html = res.get('text', '')
                await asyncio.to_thread(store_snapshot, url, html)
                metrics.inc('snapshots_written')
                article, links = extract_article(html, url)
                if not args.dry_run:
                    await asyncio.to_thread(store_parsed, article)
                    metrics.inc('parsed_written')
                is_article = bool(article.get('is_article'))
                if is_article:
                    await asyncio.to_thread(store_snapshot, url, html)
                    metrics.inc('snapshots_written')
                    if not args.dry_run:
                        await asyncio.to_thread(store_parsed, article)
                        metrics.inc('parsed_written')
                    canonical = article.get('url') or url
                    await registry.mark(canonical)
                    metrics.inc('article_recorded')
                    metrics.inc('articles_fetched')
                    counted = True
                else:
                    metrics.inc('non_article_fetched')
                if not stop_event.is_set():
                    new_links = 0
                    for link in links:
                        if link.startswith('/'):
                            link = ORIGIN + link
                        link_is_article = '/article/' in link
                        if link_is_article and registry.contains(link):
                            metrics.inc('article_link_skipped_processed')
                            continue
                        if frontier.add(link):
                            queue.put_nowait(link)
                            new_links += 1
                    metrics.inc('links_enqueued', new_links)
                    counted = True
                    if is_article:
                        metrics.inc('article_links_enqueued', new_links)
            if counted:
                async with fetched_lock:
@ -195,4 +252,3 @@ def main():
if __name__ == '__main__':
    main()

linetoday/registry.py Normal file

@ -0,0 +1,49 @@
from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Iterable

from .frontier import normalize_url
from .storage import STORAGE_DIR

REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'


class ArticleRegistry:
    """Keeps track of processed article URLs across crawler runs."""

    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
        self._path = path or REGISTRY_PATH
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()
        self._processed: set[str] = set()
        if self._path.exists():
            for line in self._path.read_text(encoding='utf-8').splitlines():
                line = line.strip()
                if line:
                    self._processed.add(normalize_url(line))
        if preload:
            for item in preload:
                self._processed.add(normalize_url(item))

    def contains(self, url: str) -> bool:
        return normalize_url(url) in self._processed

    async def mark(self, url: str) -> None:
        norm = normalize_url(url)
        async with self._lock:
            if norm in self._processed:
                return
            self._processed.add(norm)
            await asyncio.to_thread(self._append_line, url)

    def _append_line(self, url: str) -> None:
        with self._path.open('a', encoding='utf-8') as fh:
            fh.write(normalize_url(url))
            fh.write('\n')

    def size(self) -> int:
        return len(self._processed)
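
A short usage sketch (not from this commit) of the registry as the crawler workers use it; the demo path is hypothetical. Concurrent mark() calls are serialized by the internal asyncio.Lock and persisted through the append-only file:

import asyncio
import tempfile
from pathlib import Path

from linetoday.registry import ArticleRegistry

async def demo() -> None:
    # Hypothetical location for the demo; the crawler itself defaults to REGISTRY_PATH.
    path = Path(tempfile.gettempdir()) / 'processed_articles_demo.txt'
    registry = ArticleRegistry(path=path)
    urls = [f'https://today.line.me/th/v3/article/demo{i}' for i in range(3)]
    # Safe to call concurrently: the lock serializes in-memory updates and file appends.
    await asyncio.gather(*(registry.mark(u) for u in urls))
    assert all(registry.contains(u) for u in urls)
    print('processed articles:', registry.size())

# asyncio.run(demo())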


@ -1,5 +1,7 @@
from pathlib import Path
import pytest
from linetoday.extractor import extract_article
@ -23,6 +25,8 @@ def test_article_fixture_uses_structured_payload():
def test_snapshots_are_classified_and_sanitised():
    # iterate all html snapshots in data/snapshots
    if not SNAPSHOT_DIR.exists():
        pytest.skip('No snapshot fixtures available')
    files = sorted([p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html'])
    assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

tests/test_registry.py Normal file

@ -0,0 +1,20 @@
import asyncio
from pathlib import Path

from linetoday.registry import ArticleRegistry


def test_registry_persist_roundtrip(tmp_path: Path):
    registry_path = tmp_path / 'processed.txt'
    url = 'https://today.line.me/th/v3/article/test123'

    registry = ArticleRegistry(path=registry_path)
    assert not registry.contains(url)

    asyncio.run(registry.mark(url))
    assert registry.contains(url)

    # Reload and ensure persistence
    registry2 = ArticleRegistry(path=registry_path)
    assert registry2.contains(url)
    assert registry2.size() == 1