#!/usr/bin/env python3
|
|
"""Simple orchestrator: crawl -> fetch -> extract -> store (json + md).
|
|
|
|
Usage: tools/run_crawl.py [--limit N] [--dry-run] [--seed URL]
|
|
"""
|
|
import argparse
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from linetoday.frontier import Frontier
|
|
from linetoday.fetcher import Fetcher
|
|
from linetoday.extractor import extract_article
|
|
from linetoday.storage import store_snapshot, store_parsed
|
|
from linetoday.robots import RobotsManager
|
|
|
|
|
|
def main():
    """Run the crawl pipeline: frontier pop -> robots check -> fetch -> extract -> store.

    Command-line flags:
        --limit N        maximum number of successful page fetches (default 10)
        --dry-run        fetch and extract, but write nothing to disk
        --seed URL       starting URL added to the frontier
        --ignore-robots  skip robots.txt permission checks
        --verbose        enable DEBUG-level logging
    """
    p = argparse.ArgumentParser()
    p.add_argument('--limit', type=int, default=10)
    p.add_argument('--dry-run', action='store_true')
    p.add_argument('--seed', type=str, default='https://today.line.me/th/v3/tab')
    p.add_argument('--ignore-robots', action='store_true')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    ua = 'LineTodayCrawler/0.1 (+mailto:ops@example.com)'
    frontier = Frontier()
    frontier.add(args.seed)
    fetcher = Fetcher(user_agent=ua)
    robots = RobotsManager(user_agent=ua)

    fetched = 0
    origin = 'https://today.line.me'
    while fetched < args.limit and len(frontier) > 0:
        url = frontier.pop()
        if not url:
            break
        # Stay on the target site only; discard off-origin links.
        if not url.startswith(origin):
            continue
        path = url[len(origin):]
        if not args.ignore_robots:
            if not robots.allowed(origin, path):
                logging.info('robots disallow %s', url)
                continue
        logging.info('fetching %s', url)
        res = fetcher.fetch(url)
        status = res.get('status')
        if status is None:
            # Transport-level failure (no HTTP status). The fetcher may ask us
            # to back off (e.g. after repeated errors) via pause_seconds.
            logging.error('error fetching %s: %s', url, res.get('error'))
            if res.get('pause_seconds'):
                logging.info('pausing for %s seconds', res['pause_seconds'])
                time.sleep(res['pause_seconds'])
            continue
        if status != 200:
            logging.info('skipping non-200 %s status=%s', url, status)
            continue
        html = res.get('text', '')
        article, links = extract_article(html, url)
        if not args.dry_run:
            # BUG FIX: the raw snapshot was previously written even under
            # --dry-run; both storage steps now honor the flag.
            snap_path = store_snapshot(url, html)
            logging.debug('stored snapshot %s', snap_path)
            parsed_path = store_parsed(article)
            logging.info('stored parsed %s', parsed_path)
        # Enqueue discovered links, resolving site-relative paths.
        for link in links:
            if link.startswith('/'):
                link = origin + link
            frontier.add(link)
        fetched += 1
    logging.info('done')
|
|
|
|
|
|
# Standard script guard: run the crawl only when executed directly,
# so the module can be imported (e.g. for tests) without side effects.
if __name__ == '__main__':
    main()
|