#!/usr/bin/env python3
"""Simple orchestrator: crawl -> fetch -> extract -> store (json + md).

Usage:
    tools/run_crawl.py [--limit N] [--dry-run] [--seed URL]
                       [--ignore-robots] [--verbose]
"""
import argparse
import logging
import time

from linetoday.frontier import Frontier
from linetoday.fetcher import Fetcher
from linetoday.extractor import extract_article
from linetoday.storage import store_snapshot, store_parsed
from linetoday.robots import RobotsManager


def main():
    p = argparse.ArgumentParser()
    p.add_argument('--limit', type=int, default=10)
    p.add_argument('--dry-run', action='store_true')
    p.add_argument('--seed', type=str, default='https://today.line.me/th/v3/tab')
    p.add_argument('--ignore-robots', action='store_true')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    ua = 'LineTodayCrawler/0.1 (+mailto:ops@example.com)'
    frontier = Frontier()
    frontier.add(args.seed)
    fetcher = Fetcher(user_agent=ua)
    robots = RobotsManager(user_agent=ua)

    fetched = 0
    origin = 'https://today.line.me'
    while fetched < args.limit and len(frontier) > 0:
        url = frontier.pop()
        if not url:
            break
        # Stay on the target origin; robots rules are checked per path.
        if not url.startswith(origin):
            continue
        path = url[len(origin):]
        if not args.ignore_robots and not robots.allowed(origin, path):
            logging.info('robots disallow %s', url)
            continue

        logging.info('fetching %s', url)
        res = fetcher.fetch(url)
        status = res.get('status')
        if status is None:
            # Transport-level failure: no HTTP status at all. Honour any
            # backoff hint the fetcher attached, then move on.
            logging.error('error fetching %s: %s', url, res.get('error'))
            if res.get('pause_seconds'):
                logging.info('pausing for %s seconds', res['pause_seconds'])
                time.sleep(res['pause_seconds'])
            continue
        if status != 200:
            logging.info('skipping non-200 %s status=%s', url, status)
            continue

        html = res.get('text', '')
        # The raw HTML snapshot is always written; --dry-run only skips the
        # parsed (json + md) output.
        store_snapshot(url, html)
        article, links = extract_article(html, url)
        if not args.dry_run:
            parsed_path = store_parsed(article)
            logging.info('stored parsed %s', parsed_path)

        # Queue discovered links; resolve site-relative paths against origin.
        for link in links:
            if link.startswith('/'):
                link = origin + link
            frontier.add(link)
        fetched += 1

    logging.info('done')


if __name__ == '__main__':
    main()
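# ---------------------------------------------------------------------------
# Reference sketch (not used above): a minimal frontier satisfying the
# contract main() relies on -- add(url), pop() returning a URL or None, and
# len(). The de-duplicating FIFO behaviour is an assumption about the real
# linetoday.frontier.Frontier, which may differ.
# ---------------------------------------------------------------------------
from collections import deque


class SketchFrontier:
    """Illustrative FIFO URL queue that skips URLs it has already seen."""

    def __init__(self):
        self._queue = deque()
        self._seen = set()

    def add(self, url):
        # Drop duplicates so crawl loops don't requeue the same page.
        if url not in self._seen:
            self._seen.add(url)
            self._queue.append(url)

    def pop(self):
        # Mirror the pop() contract used in main(): None when empty.
        return self._queue.popleft() if self._queue else None

    def __len__(self):
        return len(self._queue)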
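# ---------------------------------------------------------------------------
# Reference sketch (not used above): what store_parsed is assumed to do,
# given the docstring's "store (json + md)". The output directory, filename
# scheme, and article fields ('id', 'title', 'text') are assumptions, not
# the real linetoday.storage behaviour.
# ---------------------------------------------------------------------------
import json
from pathlib import Path


def sketch_store_parsed(article, out_dir='data/parsed'):
    """Write an article dict as .json plus a minimal .md; return the JSON path."""
    base = Path(out_dir)
    base.mkdir(parents=True, exist_ok=True)
    slug = article.get('id', 'article')  # assumed field name
    json_path = base / f'{slug}.json'
    json_path.write_text(
        json.dumps(article, ensure_ascii=False, indent=2), encoding='utf-8')
    md = f"# {article.get('title', '')}\n\n{article.get('text', '')}\n"
    (base / f'{slug}.md').write_text(md, encoding='utf-8')
    return json_path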