line-today-scrape/linetoday/robots.py

import time
import urllib.robotparser as robotparser
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urljoin

import httpx


@dataclass
class RobotsPolicy:
    url: str
    fetched_at: float
    raw: str
    hash: Optional[str]
    parser: robotparser.RobotFileParser


class RobotsManager:
    """Fetch and cache robots.txt policies per origin using the standard
    library parser.

    robots.txt is fetched with httpx and parsed for Allow/Disallow rules;
    Crawl-delay is read from the raw text. Fetched policies are cached for
    one hour.
    """

    def __init__(self, user_agent: str = "LineTodayCrawler/0.1 (+mailto:ops@example.com)"):
        self.user_agent = user_agent
        self._cache: dict[str, RobotsPolicy] = {}
        self._http = httpx.Client(timeout=10.0)

    def _fetch_robots(self, origin: str) -> RobotsPolicy:
        robots_url = urljoin(origin, "/robots.txt")
        try:
            resp = self._http.get(robots_url)
            raw = resp.text if resp.status_code == 200 else ""
        except Exception:
            raw = ""
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        try:
            parser.parse(raw.splitlines())
        except Exception:
            # Fall back to an empty parser, which allows everything.
            parser = robotparser.RobotFileParser()
            parser.parse([])
        return RobotsPolicy(url=robots_url, fetched_at=time.time(), raw=raw, hash=None, parser=parser)

    def get_policy(self, origin: str) -> RobotsPolicy:
        now = time.time()
        policy = self._cache.get(origin)
        if policy and now - policy.fetched_at < 3600:
            return policy
        policy = self._fetch_robots(origin)
        self._cache[origin] = policy
        return policy

    def allowed(self, origin: str, path: str) -> bool:
        policy = self.get_policy(origin)
        try:
            return policy.parser.can_fetch(self.user_agent, path)
        except Exception:
            # Fail closed: treat parser errors as "disallowed".
            return False

    def crawl_delay(self, origin: str) -> Optional[float]:
        policy = self.get_policy(origin)
        # urllib.robotparser has exposed crawl_delay() since Python 3.6, but its
        # user-agent matching is a loose substring test; parse the raw text here
        # instead and match our UA product token (or '*') exactly.
        raw = policy.raw
        if not raw:
            return None
        token = self.user_agent.split("/", 1)[0].strip().lower()
        ua = None
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split(":", 1)
            if len(parts) != 2:
                continue
            k, v = parts[0].strip().lower(), parts[1].strip()
            if k == "user-agent":
                ua = v.lower()
            elif k == "crawl-delay" and ua in (token, "*"):
                try:
                    return float(v)
                except ValueError:
                    continue
        return None
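
# A minimal usage sketch (not part of the crawler proper). The origin and path
# below are illustrative placeholders; actual results depend on whatever
# robots.txt the origin serves at request time.
if __name__ == "__main__":
    manager = RobotsManager()
    origin = "https://example.com"  # hypothetical origin for demonstration
    path = "/today/article/123"  # hypothetical article path
    if manager.allowed(origin, path):
        # Assume a 1-second politeness delay when robots.txt declares none.
        delay = manager.crawl_delay(origin) or 1.0
        print(f"allowed; sleeping {delay:.1f}s between requests")
    else:
        print(f"{path} is disallowed for {manager.user_agent}")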