scrape articles concurrently and add articles registry
Some checks failed: CI / test (push) has been cancelled.

parent 0b5b9d98c5
commit 04b39ed883
@@ -9,6 +9,7 @@ from .extractor import extract_article
 from .fetcher import Fetcher
 from .frontier import Frontier
 from .metrics import MetricsRecorder
+from .registry import ArticleRegistry
 from .robots import RobotsManager
 from .storage import store_parsed, store_snapshot

@@ -27,6 +28,18 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument('--per-origin-capacity', type=int, default=2, help='Burst size per origin token bucket')
     parser.add_argument('--per-origin-refill', type=float, default=2.0, help='Seconds per token refill for origin bucket')
     parser.add_argument('--timeout', type=float, default=20.0, help='HTTP request timeout')
+    parser.add_argument(
+        '--sections',
+        action='append',
+        default=[],
+        help='Seed from specific LINE TODAY sections (comma-separated or repeatable)',
+    )
+    parser.add_argument(
+        '--seed-article',
+        action='append',
+        default=[],
+        help='Seed crawler with specific article URLs or IDs (repeatable)',
+    )
     return parser


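Both new flags use action='append', so each can be given several times, and --sections values may additionally be comma-separated (the crawl step splits them). A minimal sketch of the resulting namespace, assuming the parser is importable from the crawler's CLI module (the import path below is illustrative; the diff does not show the module name):

    from linetoday.cli import build_parser  # hypothetical module path

    args = build_parser().parse_args([
        '--sections', 'sport,entertainment',
        '--sections', 'finance',
        '--seed-article', 'https://today.line.me/th/v3/article/abc123',
        '--seed-article', 'xyz789',
    ])
    # argparse maps '--seed-article' to args.seed_article
    # args.sections == ['sport,entertainment', 'finance']   (comma-splitting happens in crawl)
    # args.seed_article == ['https://today.line.me/th/v3/article/abc123', 'xyz789']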
@@ -47,9 +60,33 @@ async def crawl(args) -> None:
     metrics = MetricsRecorder()

     queue: asyncio.Queue[str | None] = asyncio.Queue()
-    if frontier.add(START_URL):
-        queue.put_nowait(START_URL)
-        metrics.inc('frontier_seeded')
+    registry = ArticleRegistry()
+    logging.info('loaded %s processed article(s) from registry', registry.size())
+
+    seeds: set[str] = set()
+    sections: list[str] = []
+    for entry in args.sections:
+        sections.extend([part.strip() for part in entry.split(',') if part.strip()])
+
+    if sections:
+        for section in sections:
+            seeds.add(f'https://today.line.me/th/v3/page/{section.lstrip("/")}')
+    else:
+        seeds.add(START_URL)
+
+    for entry in args.seed_article:
+        val = entry.strip()
+        if not val:
+            continue
+        if val.startswith('http'):
+            seeds.add(val)
+        else:
+            seeds.add(f'https://today.line.me/th/v3/article/{val}')
+
+    for seed in seeds:
+        if frontier.add(seed):
+            queue.put_nowait(seed)
+            metrics.inc('frontier_seeded')

     fetched = 0
     fetched_lock = asyncio.Lock()
@@ -85,6 +122,12 @@ async def crawl(args) -> None:
                 queue.task_done()
                 continue

+            is_article_url = '/article/' in url
+            if is_article_url and registry.contains(url):
+                metrics.inc('article_skipped_processed')
+                queue.task_done()
+                continue
+
             path = url[len(ORIGIN):]
             if not args.ignore_robots:
                 allowed = await asyncio.to_thread(robots.allowed, ORIGIN, path)
@@ -123,24 +166,38 @@ async def crawl(args) -> None:
                 logger.warning('skipping %s status %s', url, status)
             else:
                 html = res.get('text', '')
-                await asyncio.to_thread(store_snapshot, url, html)
-                metrics.inc('snapshots_written')

                 article, links = extract_article(html, url)
-                if not args.dry_run:
-                    await asyncio.to_thread(store_parsed, article)
-                    metrics.inc('parsed_written')
+                is_article = bool(article.get('is_article'))
+                if is_article:
+                    await asyncio.to_thread(store_snapshot, url, html)
+                    metrics.inc('snapshots_written')
+                    if not args.dry_run:
+                        await asyncio.to_thread(store_parsed, article)
+                        metrics.inc('parsed_written')
+                    canonical = article.get('url') or url
+                    await registry.mark(canonical)
+                    metrics.inc('article_recorded')
+                    metrics.inc('articles_fetched')
+                    counted = True
+                else:
+                    metrics.inc('non_article_fetched')

                 if not stop_event.is_set():
                     new_links = 0
                     for link in links:
                         if link.startswith('/'):
                             link = ORIGIN + link
+                        link_is_article = '/article/' in link
+                        if link_is_article and registry.contains(link):
+                            metrics.inc('article_link_skipped_processed')
+                            continue
                         if frontier.add(link):
                             queue.put_nowait(link)
                             new_links += 1
                     metrics.inc('links_enqueued', new_links)
-                counted = True
+                    if is_article:
+                        metrics.inc('article_links_enqueued', new_links)

             if counted:
                 async with fetched_lock:
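With this change, snapshots and parsed output are written only for pages the extractor classifies as articles, and each of those writes (store_snapshot, store_parsed, the registry append) is a blocking file operation pushed off the event loop with asyncio.to_thread. A minimal, self-contained illustration of that offloading pattern; blocking_write is a stand-in, not a function from this repository:

    import asyncio
    from pathlib import Path

    def blocking_write(path: Path, text: str) -> None:
        # stand-in for store_snapshot / store_parsed: a plain synchronous disk write
        path.write_text(text, encoding='utf-8')

    async def demo() -> None:
        # the event loop stays free to run other workers while the write happens in a thread
        await asyncio.to_thread(blocking_write, Path('snapshot.html'), '<html></html>')

    if __name__ == '__main__':
        asyncio.run(demo())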
@@ -195,4 +252,3 @@ def main():

 if __name__ == '__main__':
     main()
-
linetoday/registry.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from typing import Iterable
+
+from .frontier import normalize_url
+from .storage import STORAGE_DIR
+
+REGISTRY_PATH = STORAGE_DIR / 'meta' / 'processed_articles.txt'
+
+
+class ArticleRegistry:
+    """Keeps track of processed article URLs across crawler runs."""
+
+    def __init__(self, path: Path | None = None, preload: Iterable[str] | None = None):
+        self._path = path or REGISTRY_PATH
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        self._lock = asyncio.Lock()
+        self._processed: set[str] = set()
+
+        if self._path.exists():
+            for line in self._path.read_text(encoding='utf-8').splitlines():
+                line = line.strip()
+                if line:
+                    self._processed.add(normalize_url(line))
+
+        if preload:
+            for item in preload:
+                self._processed.add(normalize_url(item))
+
+    def contains(self, url: str) -> bool:
+        return normalize_url(url) in self._processed
+
+    async def mark(self, url: str) -> None:
+        norm = normalize_url(url)
+        async with self._lock:
+            if norm in self._processed:
+                return
+            self._processed.add(norm)
+            await asyncio.to_thread(self._append_line, url)
+
+    def _append_line(self, url: str) -> None:
+        with self._path.open('a', encoding='utf-8') as fh:
+            fh.write(normalize_url(url))
+            fh.write('\n')
+
+    def size(self) -> int:
+        return len(self._processed)
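A short usage sketch of the new registry, assuming the linetoday package is importable. Because mark() holds its asyncio.Lock across the membership check and the threaded append, concurrent marks of the same article persist a single line in the append-only file, and a fresh instance (the next crawler run) reloads that state from disk:

    import asyncio
    import tempfile
    from pathlib import Path

    from linetoday.registry import ArticleRegistry

    async def demo(path: Path) -> None:
        registry = ArticleRegistry(path=path)
        url = 'https://today.line.me/th/v3/article/abc123'
        # five workers recording the same article; only one entry is persisted
        await asyncio.gather(*(registry.mark(url) for _ in range(5)))
        assert registry.contains(url)
        assert registry.size() == 1
        # a second instance sees the persisted entry (cross-run dedup)
        assert ArticleRegistry(path=path).contains(url)

    if __name__ == '__main__':
        with tempfile.TemporaryDirectory() as tmp:
            asyncio.run(demo(Path(tmp) / 'processed_articles.txt'))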
@@ -1,5 +1,7 @@
 from pathlib import Path
+
+import pytest

 from linetoday.extractor import extract_article


@@ -23,6 +25,8 @@ def test_article_fixture_uses_structured_payload():

 def test_snapshots_are_classified_and_sanitised():
     # iterate all html snapshots in data/snapshots
+    if not SNAPSHOT_DIR.exists():
+        pytest.skip('No snapshot fixtures available')
     files = sorted([p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html'])
     assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

tests/test_registry.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+import asyncio
+from pathlib import Path
+
+from linetoday.registry import ArticleRegistry
+
+
+def test_registry_persist_roundtrip(tmp_path: Path):
+    registry_path = tmp_path / 'processed.txt'
+    url = 'https://today.line.me/th/v3/article/test123'
+
+    registry = ArticleRegistry(path=registry_path)
+    assert not registry.contains(url)
+
+    asyncio.run(registry.mark(url))
+    assert registry.contains(url)
+
+    # Reload and ensure persistence
+    registry2 = ArticleRegistry(path=registry_path)
+    assert registry2.contains(url)
+    assert registry2.size() == 1