line-today-scrape/linetoday/fetcher.py

import asyncio
import hashlib
import json
import logging
import random
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional
from urllib.parse import urlparse

try:
    import httpx
except Exception:  # pragma: no cover
    httpx = None  # type: ignore

LOG = logging.getLogger(__name__)

METADATA_DIR = Path.cwd() / 'data' / 'meta'
METADATA_DIR.mkdir(parents=True, exist_ok=True)

class AsyncTokenBucket:
    """Token bucket holding up to ``capacity`` tokens, refilling one token every ``refill_interval`` seconds."""

    def __init__(self, capacity: int, refill_interval: float):
        self.capacity = capacity
        self.tokens = capacity
        self.refill_interval = refill_interval
        self.lock = asyncio.Lock()
        self.last_refill = time.monotonic()

    def _refill(self, now: float) -> None:
        elapsed = now - self.last_refill
        if elapsed <= 0:
            return
        add = int(elapsed / self.refill_interval)
        if add > 0:
            self.tokens = min(self.capacity, self.tokens + add)
            self.last_refill += add * self.refill_interval

    async def consume(self, timeout: Optional[float] = None) -> bool:
        """Take one token, polling until one is available; returns False if ``timeout`` expires first."""
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            async with self.lock:
                now = time.monotonic()
                self._refill(now)
                if self.tokens > 0:
                    self.tokens -= 1
                    return True
            # Poll outside the lock so waiting consumers do not block each other.
            if deadline is not None and time.monotonic() > deadline:
                return False
            await asyncio.sleep(0.05)
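
# Usage sketch (illustrative; the capacity, refill, and timeout values below are
# assumptions, not defaults used elsewhere in this module):
#
#     bucket = AsyncTokenBucket(capacity=2, refill_interval=2.0)
#     if await bucket.consume(timeout=10.0):
#         ...  # token acquired; proceed with the rate-limited call
#     else:
#         ...  # still throttled after 10s; skip or reschedule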

class Fetcher:
    """Async HTTP fetcher with per-origin rate limiting, retries with backoff, a circuit breaker, and conditional GETs."""

    def __init__(
        self,
        user_agent: str = "LineTodayCrawler/0.1 (+mailto:ops@example.com)",
        max_retries: int = 3,
        circuit_threshold: int = 5,
        circuit_cooldown: float = 60.0,
        playwright_enabled: bool = False,
        timeout: float = 20.0,
        per_origin_capacity: int = 2,
        per_origin_refill: float = 2.0,
    ):
        if httpx is None:  # pragma: no cover
            raise RuntimeError('httpx must be installed to use Fetcher')
        self.user_agent = user_agent
        self._client = httpx.AsyncClient(timeout=timeout, follow_redirects=True)
        self._bucket_capacity = max(1, per_origin_capacity)
        self._bucket_refill = max(0.1, per_origin_refill)
        self._buckets: Dict[str, AsyncTokenBucket] = defaultdict(
            lambda: AsyncTokenBucket(self._bucket_capacity, self._bucket_refill)
        )
        self.max_retries = max_retries
        self._failures: Dict[str, int] = defaultdict(int)
        self._circuit_tripped_until: Dict[str, float] = {}
        self.circuit_threshold = circuit_threshold
        self.circuit_cooldown = circuit_cooldown
        self.playwright_enabled = playwright_enabled
        self._pw = None
        self._browser = None

    async def __aenter__(self) -> "Fetcher":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.close()

    async def close(self) -> None:
        try:
            if self._browser:
                try:
                    await asyncio.to_thread(self._browser.close)
                except Exception:
                    pass
                self._browser = None
            if self._pw:
                try:
                    await asyncio.to_thread(self._pw.stop)
                except Exception:
                    pass
                self._pw = None
        except Exception:
            LOG.exception('error while closing Playwright resources')
        await self._client.aclose()

    def _origin(self, url: str) -> str:
        p = urlparse(url)
        return f"{p.scheme}://{p.netloc}"

    async def _acquire(self, url: str) -> bool:
        origin = self._origin(url)
        until = self._circuit_tripped_until.get(origin)
        now = time.monotonic()
        if until and now < until:
            LOG.warning('circuit open for %s until %s', origin, until)
            return False
        bucket = self._buckets[origin]
        return await bucket.consume()

    def _meta_path(self, url: str) -> Path:
        # Use a stable digest for the cache filename: the built-in hash() is
        # salted per process, so paths derived from it would never match on a
        # later run and the conditional-GET cache would never hit.
        name = hashlib.sha256(url.encode('utf-8')).hexdigest() + '.json'
        return METADATA_DIR / name

    def _load_meta_sync(self, url: str) -> dict:
        p = self._meta_path(url)
        if p.exists():
            try:
                return json.loads(p.read_text(encoding='utf-8'))
            except Exception:
                return {}
        return {}

    async def _load_meta(self, url: str) -> dict:
        return await asyncio.to_thread(self._load_meta_sync, url)

    def _save_meta_sync(self, url: str, meta: dict) -> None:
        p = self._meta_path(url)
        p.write_text(json.dumps(meta, ensure_ascii=False), encoding='utf-8')

    async def _save_meta(self, url: str, meta: dict) -> None:
        await asyncio.to_thread(self._save_meta_sync, url, meta)
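
    # Conditional-GET flow (behavioral note): the first fetch of a URL stores any
    # ETag / Last-Modified validators under data/meta/<sha256(url)>.json; the next
    # fetch replays them as If-None-Match / If-Modified-Since, so an unchanged
    # page can come back as a cheap 304 with an empty body.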

    def _record_failure(self, origin: str) -> None:
        self._failures[origin] += 1
        if self._failures[origin] >= self.circuit_threshold:
            until = time.monotonic() + self.circuit_cooldown
            self._circuit_tripped_until[origin] = until
            LOG.error('tripping circuit for %s until %.2f after %d failures', origin, until, self._failures[origin])

    def _record_success(self, origin: str) -> None:
        self._failures[origin] = 0
        self._circuit_tripped_until.pop(origin, None)
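
    # Circuit-breaker sketch (illustrative numbers): with the default
    # circuit_threshold=5 and circuit_cooldown=60.0, five consecutive failures
    # against one origin make _acquire() refuse that origin's URLs for the next
    # 60 seconds; the first success afterwards resets the failure counter.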

    async def fetch(self, url: str, render_js: bool = False) -> dict:
        origin = self._origin(url)
        ok = await self._acquire(url)
        if not ok:
            return {"url": url, "status": None, "error": "rate_limited_or_circuit_open"}
        headers = {
            "User-Agent": self.user_agent,
            "Accept-Language": "th,en;q=0.8",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        }
        meta = await self._load_meta(url)
        if meta.get('etag'):
            headers['If-None-Match'] = meta['etag']
        if meta.get('last_modified'):
            headers['If-Modified-Since'] = meta['last_modified']
        if render_js and self.playwright_enabled:
            LOG.warning('render_js mode is not supported in async fetcher yet; falling back to HTTP client for %s', url)
        attempt = 0
        start = time.monotonic()
        while True:
            attempt += 1
            try:
                LOG.debug('fetch attempt %d %s', attempt, url)
                resp = await self._client.get(url, headers=headers)
                duration = int((time.monotonic() - start) * 1000)
                status = resp.status_code
                out = {
                    "url": url,
                    "status": status,
                    "headers": dict(resp.headers),
                    "text": resp.text,
                    "elapsed_ms": duration,
                }
                # Persist fresh validators for the next conditional request.
                meta2 = {}
                if resp.headers.get('etag'):
                    meta2['etag'] = resp.headers.get('etag')
                if resp.headers.get('last-modified'):
                    meta2['last_modified'] = resp.headers.get('last-modified')
                if meta2:
                    await self._save_meta(url, meta2)
                if 200 <= status < 400:
                    self._record_success(origin)
                if status in (429, 503):
                    ra = resp.headers.get('Retry-After')
                    pause = None
                    if ra:
                        try:
                            pause = int(ra)
                        except ValueError:
                            # Retry-After may also be an HTTP date; that form is ignored here.
                            pause = None
                    if pause is not None and attempt <= self.max_retries:
                        LOG.warning('honoring Retry-After=%s for %s', pause, url)
                        out['pause_seconds'] = pause
                        self._record_failure(origin)
                        await asyncio.sleep(pause)
                        continue
                if status == 429 or 500 <= status < 600:
                    self._record_failure(origin)
                    if attempt <= self.max_retries:
                        backoff = (2 ** (attempt - 1)) + random.random()
                        LOG.warning('transient status %s on %s; retrying in %.2fs (attempt %d)', status, url, backoff, attempt)
                        await asyncio.sleep(backoff)
                        continue
                    LOG.error('giving up after %d attempts for %s status=%s', attempt, url, status)
                return out
            except httpx.RequestError as e:
                duration = int((time.monotonic() - start) * 1000)
                LOG.warning('request error on %s: %s', url, e)
                self._record_failure(origin)
                if attempt <= self.max_retries:
                    backoff = (2 ** (attempt - 1)) + random.random()
                    LOG.debug('retrying after error in %.2fs', backoff)
                    await asyncio.sleep(backoff)
                    continue
                return {"url": url, "status": None, "error": str(e), "elapsed_ms": duration}
            except Exception as e:  # pragma: no cover - defensive
                duration = int((time.monotonic() - start) * 1000)
                LOG.exception('unexpected error fetching %s', url)
                self._record_failure(origin)
                return {"url": url, "status": None, "error": str(e), "elapsed_ms": duration}