import time
import urllib.robotparser as robotparser
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urljoin

import httpx


@dataclass
class RobotsPolicy:
    """Cached robots.txt snapshot for one origin."""

    url: str
    fetched_at: float
    raw: str
    hash: Optional[str]
    parser: robotparser.RobotFileParser


class RobotsManager:
    """Fetch and cache robots.txt for each origin using the standard-library parser.

    robots.txt is fetched with httpx; Allow/Disallow rules are parsed with
    urllib.robotparser.RobotFileParser, and Crawl-delay is read from the raw
    text. Results are cached per origin for one hour. A usage sketch sits
    under the __main__ guard at the bottom of this module.
    """

    def __init__(self, user_agent: str = "LineTodayCrawler/0.1 (+mailto:ops@example.com)"):
        self.user_agent = user_agent
        self._cache: dict[str, RobotsPolicy] = {}
        self._http = httpx.Client(timeout=10.0)

    def _fetch_robots(self, origin: str) -> RobotsPolicy:
        robots_url = urljoin(origin, "/robots.txt")
        try:
            resp = self._http.get(robots_url)
            raw = resp.text if resp.status_code == 200 else ""
        except Exception:
            # Network or HTTP client errors: treat as a missing robots.txt.
            raw = ""
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        try:
            parser.parse(raw.splitlines())
        except Exception:
            # Fall back to an empty parser (allows everything).
            parser = robotparser.RobotFileParser()
            parser.parse([])
        policy = RobotsPolicy(url=robots_url, fetched_at=time.time(), raw=raw, hash=None, parser=parser)
        return policy

    def get_policy(self, origin: str) -> RobotsPolicy:
        now = time.time()
        policy = self._cache.get(origin)
        if policy and now - policy.fetched_at < 3600:
            return policy
        policy = self._fetch_robots(origin)
        self._cache[origin] = policy
        return policy

    def allowed(self, origin: str, path: str) -> bool:
        policy = self.get_policy(origin)
        try:
            return policy.parser.can_fetch(self.user_agent, path)
        except Exception:
            # Fail closed: treat parser errors as "not allowed".
            return False

    def crawl_delay(self, origin: str) -> Optional[float]:
        policy = self.get_policy(origin)
        # Note: RobotFileParser.crawl_delay() exists (Python 3.6+), but it
        # matches user agents loosely; the raw text is parsed here instead, and
        # a Crawl-delay only applies when the preceding User-agent line exactly
        # equals self.user_agent or '*'.
        raw = policy.raw
        if not raw:
            return None
        ua = None
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split(':', 1)
            if len(parts) != 2:
                continue
            k, v = parts[0].strip().lower(), parts[1].strip()
            if k == 'user-agent':
                ua = v
            if k == 'crawl-delay' and ua in (self.user_agent, '*'):
                try:
                    return float(v)
                except ValueError:
                    continue
        return None
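

# Usage sketch (illustrative only), assuming "https://example.com" as a
# stand-in origin and a made-up article path; neither is part of the crawler's
# real configuration. It shows one plausible call pattern: check allowed()
# before fetching and consult crawl_delay() for politeness.
if __name__ == "__main__":
    manager = RobotsManager()
    origin = "https://example.com"
    path = "/news/story/12345"  # hypothetical path, for demonstration only
    print(f"can fetch {path}: {manager.allowed(origin, path)}")
    print(f"crawl-delay: {manager.crawl_delay(origin)}")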