41 lines
1.2 KiB
Python
41 lines
1.2 KiB
Python
from collections import deque
|
|
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
|
|
|
|
# Query-parameter names that normalize_url strips from the query string.
TRACKING_PARAMS = {
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "fbclid",
    "gclid",
}
|
|
|
|
|
|
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* suitable for de-duplication.

    The result:

    * uses ``https`` whenever the input scheme is ``http`` or missing
      (any other scheme is kept as parsed);
    * has an empty path replaced by ``/``;
    * has every query key listed in ``TRACKING_PARAMS`` removed and the
      remaining ``(key, value)`` pairs sorted;
    * keeps path parameters (the ``;...`` part) unchanged;
    * has no fragment.
    """
    parsed = urlparse(url)

    # "Force https": upgrade plain http as well as scheme-less input so that
    # the http and https spellings of a page normalize to the same string.
    scheme = "https" if parsed.scheme in ("", "http") else parsed.scheme
    path = parsed.path or "/"

    # keep_blank_values=True so a pair such as "flag=" is not silently lost.
    kept_pairs = sorted(
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=True)
        if key not in TRACKING_PARAMS
    )

    # Path parameters are part of the resource identity, so pass them through
    # instead of blanking them. The empty last component drops the fragment.
    return urlunparse(
        (scheme, parsed.netloc, path, parsed.params, urlencode(kept_pairs), "")
    )
|
|
|
|
|
|
class Frontier:
|
|
def __init__(self):
|
|
self._seen: set[str] = set()
|
|
self._queue: deque[str] = deque()
|
|
|
|
def add(self, url: str) -> bool:
|
|
url = normalize_url(url)
|
|
if url in self._seen:
|
|
return False
|
|
self._seen.add(url)
|
|
self._queue.append(url)
|
|
return True
|
|
|
|
def pop(self) -> str | None:
|
|
if not self._queue:
|
|
return None
|
|
return self._queue.popleft()
|
|
|
|
def __len__(self) -> int:
|
|
return len(self._queue)
|