#!/usr/bin/env python3
|
|
"""Simple orchestrator: crawl -> fetch -> extract -> store (json + md).
|
|
|
|
Usage: tools/run_crawl.py [--limit N] [--dry-run] [--seed URL]
|
|
"""
|
|
import argparse
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from linetoday.frontier import Frontier
|
|
from linetoday.fetcher import Fetcher
|
|
from linetoday.extractor import extract_article
|
|
from linetoday.storage import store_snapshot, store_parsed
|
|
from linetoday.robots import RobotsManager
|
|
|
|
|
|
def main():
    """Run the crawl pipeline: frontier pop -> robots check -> fetch -> extract -> store.

    Command-line flags:
        --limit N        maximum number of successful page fetches (default 10)
        --dry-run        fetch and extract, but write nothing to disk
        --seed URL       starting URL added to the frontier
        --ignore-robots  skip robots.txt permission checks
        --verbose        enable DEBUG-level logging
    """
    p = argparse.ArgumentParser()
    p.add_argument('--limit', type=int, default=10)
    p.add_argument('--dry-run', action='store_true')
    p.add_argument('--seed', type=str, default='https://today.line.me/th/v3/tab')
    p.add_argument('--ignore-robots', action='store_true')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    ua = 'LineTodayCrawler/0.1 (+mailto:ops@example.com)'
    frontier = Frontier()
    frontier.add(args.seed)
    fetcher = Fetcher(user_agent=ua)
    robots = RobotsManager(user_agent=ua)

    fetched = 0
    origin = 'https://today.line.me'
    while fetched < args.limit and len(frontier) > 0:
        url = frontier.pop()
        if not url:
            break
        # Stay on the target site only; discard off-origin links.
        if not url.startswith(origin):
            continue
        path = url[len(origin):]
        if not args.ignore_robots:
            if not robots.allowed(origin, path):
                logging.info('robots disallow %s', url)
                continue
        logging.info('fetching %s', url)
        res = fetcher.fetch(url)
        status = res.get('status')
        if status is None:
            # Transport-level failure (no HTTP status). The fetcher may ask us
            # to back off (e.g. after repeated errors) via pause_seconds.
            logging.error('error fetching %s: %s', url, res.get('error'))
            if res.get('pause_seconds'):
                logging.info('pausing for %s seconds', res['pause_seconds'])
                time.sleep(res['pause_seconds'])
            continue
        if status != 200:
            logging.info('skipping non-200 %s status=%s', url, status)
            continue
        html = res.get('text', '')
        article, links = extract_article(html, url)
        if not args.dry_run:
            # BUG FIX: the raw snapshot was previously written even under
            # --dry-run; both storage steps now honor the flag.
            snap_path = store_snapshot(url, html)
            logging.debug('stored snapshot %s', snap_path)
            parsed_path = store_parsed(article)
            logging.info('stored parsed %s', parsed_path)
        # Enqueue discovered links, resolving site-relative paths.
        for link in links:
            if link.startswith('/'):
                link = origin + link
            frontier.add(link)
        fetched += 1
    logging.info('done')
|
|
|
|
|
|
# Standard script guard: run the crawl only when executed directly,
# so the module can be imported (e.g. for tests) without side effects.
if __name__ == '__main__':
    main()
|