line-today-scrape/linetoday/frontier.py
Sosokker 0b5b9d98c5
Some checks are pending
CI / test (push) Waiting to run
add main files
2025-10-29 16:12:55 +07:00

41 lines
1.2 KiB
Python

from collections import deque
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
# Query keys treated as tracking parameters; normalize_url() strips them.
TRACKING_PARAMS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}


def normalize_url(url: str) -> str:
    """Return a canonical form of *url* so equivalent URLs compare equal.

    Transformations applied:

    * a missing scheme defaults to ``https``; an explicit scheme (including
      ``http``) is kept unchanged;
    * the host (and port) is lowercased, since host names are
      case-insensitive; a ``user:password@`` prefix is left untouched;
    * an empty path becomes ``/``;
    * path parameters (``;params``) and the fragment are dropped;
    * query pairs whose key is in ``TRACKING_PARAMS`` (exact, case-sensitive
      match) are dropped; the remaining pairs are sorted and re-encoded,
      keeping blank values.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL string.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme or 'https'

    # Only the host[:port] part is case-insensitive; credentials before the
    # last '@' must keep their case.
    userinfo, at_sign, host_port = parsed.netloc.rpartition('@')
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    path = parsed.path or '/'

    # Drop tracking parameters and sort what is left so that parameter order
    # does not produce distinct URLs.
    query_pairs = sorted(
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=True)
        if key not in TRACKING_PARAMS
    )
    query = urlencode(query_pairs)

    # Remove params and fragment by passing empty strings for both.
    return urlunparse((scheme, netloc, path, '', query, ''))
class Frontier:
def __init__(self):
self._seen: set[str] = set()
self._queue: deque[str] = deque()
def add(self, url: str) -> bool:
url = normalize_url(url)
if url in self._seen:
return False
self._seen.add(url)
self._queue.append(url)
return True
def pop(self) -> str | None:
if not self._queue:
return None
return self._queue.popleft()
def __len__(self) -> int:
return len(self._queue)