line-today-scrape/tools/run_crawl.py

#!/usr/bin/env python3
"""Simple orchestrator: crawl -> fetch -> extract -> store (json + md).
Usage: tools/run_crawl.py [--limit N] [--dry-run] [--seed URL]
"""
import argparse
import logging
import time

from linetoday.extractor import extract_article
from linetoday.fetcher import Fetcher
from linetoday.frontier import Frontier
from linetoday.robots import RobotsManager
from linetoday.storage import store_parsed, store_snapshot
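

# Single-process, sequential crawler: there is no concurrency, so politeness
# comes from fetching one URL at a time plus any pause hints the fetcher
# returns after an error.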
def main():
    p = argparse.ArgumentParser()
    p.add_argument('--limit', type=int, default=10)
    p.add_argument('--dry-run', action='store_true')
    p.add_argument('--seed', type=str, default='https://today.line.me/th/v3/tab')
    p.add_argument('--ignore-robots', action='store_true')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    ua = 'LineTodayCrawler/0.1 (+mailto:ops@example.com)'
    frontier = Frontier()
    frontier.add(args.seed)
    fetcher = Fetcher(user_agent=ua)
    robots = RobotsManager(user_agent=ua)
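
    # Crawl loop: pop a URL, keep it only if it is on the LINE Today origin
    # and allowed by robots.txt, then fetch, snapshot, extract, and enqueue
    # outbound links. Frontier is assumed to deduplicate URLs; if it does
    # not, the same page may be fetched more than once.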
    fetched = 0
    origin = 'https://today.line.me'
    while fetched < args.limit and len(frontier) > 0:
        url = frontier.pop()
        if not url:
            break
        if not url.startswith(origin):
            continue
        path = url[len(origin):]
        if not args.ignore_robots:
            allowed = robots.allowed(origin, path)
            if not allowed:
                logging.info('robots disallow %s', url)
                continue
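
        # fetcher.fetch() appears to return a dict: 'status' and 'text' for
        # an HTTP response, or 'error' (and optionally a 'pause_seconds'
        # back-off hint) when the request itself failed.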
        logging.info('fetching %s', url)
        res = fetcher.fetch(url)
        status = res.get('status')
        if status is None:
            logging.error('error fetching %s: %s', url, res.get('error'))
            if res.get('pause_seconds'):
                logging.info('pausing for %s seconds', res['pause_seconds'])
                time.sleep(res['pause_seconds'])
            continue
        if status != 200:
            logging.info('skipping non-200 %s status=%s', url, status)
            continue
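
        # Raw HTML is snapshotted unconditionally; --dry-run only skips
        # writing the parsed article.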
        html = res.get('text', '')
        snap = store_snapshot(url, html)
        logging.debug('stored snapshot %s', snap)
        article, links = extract_article(html, url)
        if not args.dry_run:
            parsed_path = store_parsed(article)
            logging.info('stored parsed %s', parsed_path)
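
        # Queue discovered links; site-relative paths are resolved against
        # the origin before being added to the frontier.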
        for link in links:
            if link.startswith('/'):
                link = origin + link
            frontier.add(link)
        fetched += 1
    logging.info('done')


if __name__ == '__main__':
    main()