from collections import deque from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode TRACKING_PARAMS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"} def normalize_url(url: str) -> str: """Normalize URL: force https, remove fragments, drop tracking params, sort query.""" parsed = urlparse(url) scheme = parsed.scheme or 'https' netloc = parsed.netloc path = parsed.path or '/' # remove fragment query_pairs = [(k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=True) if k not in TRACKING_PARAMS] query_pairs.sort() query = urlencode(query_pairs) normalized = urlunparse((scheme, netloc, path, '', query, '')) return normalized class Frontier: def __init__(self): self._seen: set[str] = set() self._queue: deque[str] = deque() def add(self, url: str) -> bool: url = normalize_url(url) if url in self._seen: return False self._seen.add(url) self._queue.append(url) return True def pop(self) -> str | None: if not self._queue: return None return self._queue.popleft() def __len__(self) -> int: return len(self._queue)