import time
import urllib.robotparser as robotparser
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urljoin

import httpx


@dataclass
class RobotsPolicy:
    url: str
    fetched_at: float
    raw: str
    hash: Optional[str]
    parser: robotparser.RobotFileParser


class RobotsManager:
    """Fetch and cache robots.txt for origins using the standard library parser.

    This implementation fetches robots.txt with httpx, parses Allow/Disallow
    rules via urllib.robotparser, and reads Crawl-delay directly from the raw
    text. Results are cached per origin for one hour by default.
    """

    def __init__(self, user_agent: str = "LineTodayCrawler/0.1 (+mailto:ops@example.com)"):
        self.user_agent = user_agent
        self._cache: dict[str, RobotsPolicy] = {}
        self._http = httpx.Client(timeout=10.0)

    def _fetch_robots(self, origin: str) -> RobotsPolicy:
        robots_url = urljoin(origin, "/robots.txt")
        try:
            resp = self._http.get(robots_url)
            raw = resp.text if resp.status_code == 200 else ""
        except Exception:
            raw = ""

        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        try:
            parser.parse(raw.splitlines())
        except Exception:
            # Fall back to an empty parser (allows everything).
            parser = robotparser.RobotFileParser()
            parser.parse([])

        return RobotsPolicy(
            url=robots_url,
            fetched_at=time.time(),
            raw=raw,
            hash=None,
            parser=parser,
        )

    def get_policy(self, origin: str) -> RobotsPolicy:
        now = time.time()
        policy = self._cache.get(origin)
        if policy and now - policy.fetched_at < 3600:
            return policy
        policy = self._fetch_robots(origin)
        self._cache[origin] = policy
        return policy

    def allowed(self, origin: str, path: str) -> bool:
        policy = self.get_policy(origin)
        try:
            return policy.parser.can_fetch(self.user_agent, path)
        except Exception:
            return False

    def crawl_delay(self, origin: str) -> Optional[float]:
        policy = self.get_policy(origin)
        # RobotFileParser.crawl_delay() exists on Python 3.6+, but parse the
        # raw text directly so the value is available even when the stdlib
        # parser failed above.
        raw = policy.raw
        if not raw:
            return None
        ua = None
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split(":", 1)
            if len(parts) != 2:
                continue
            k, v = parts[0].strip().lower(), parts[1].strip()
            if k == "user-agent":
                ua = v
            elif k == "crawl-delay" and ua is not None:
                # robots.txt groups name a UA token (e.g. "LineTodayCrawler"),
                # not the full UA string, so match "*" or a substring of ours.
                if ua == "*" or ua.lower() in self.user_agent.lower():
                    try:
                        return float(v)
                    except ValueError:
                        continue
        return None
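

# A minimal usage sketch, not part of the module's public API: the origin and
# path below are illustrative placeholders, and the 1.0s fallback delay is an
# assumption, not something robots.txt mandates.
if __name__ == "__main__":
    manager = RobotsManager()
    origin = "https://news.example.com"   # hypothetical origin
    path = "/v2/article/12345"            # hypothetical article path

    if manager.allowed(origin, path):
        # Honour Crawl-delay when present; otherwise pause briefly between requests.
        delay = manager.crawl_delay(origin) or 1.0
        print(f"allowed; sleeping {delay:.1f}s before the next request")
        time.sleep(delay)
    else:
        print("disallowed by robots.txt")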