import re
import html
import json
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Optional, Any, Set
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


def _get_meta(soup: BeautifulSoup, prop: str) -> Optional[str]:
    """Return the content of a <meta> tag matched by property= or name=."""
    tag = soup.find('meta', attrs={'property': prop}) or soup.find('meta', attrs={'name': prop})
    if tag and tag.get('content'):
        return tag.get('content').strip()
    return None


def _parse_json_ld(soup: BeautifulSoup) -> Optional[dict]:
    """Return the first JSON-LD NewsArticle/Article object, if any."""
    for tag in soup.find_all('script', type='application/ld+json'):
        try:
            raw = tag.string
            if not raw:
                continue
            data = json.loads(raw.strip())
            # If the payload is a list, prefer the first NewsArticle/Article.
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and item.get('@type') in ('NewsArticle', 'Article'):
                        return item
            if isinstance(data, dict) and data.get('@type') in ('NewsArticle', 'Article'):
                return data
        except Exception:
            continue
    return None


def _parse_next_data(soup: BeautifulSoup) -> Optional[dict]:
    """Parse the Next.js __NEXT_DATA__ script block into a dict."""
    tag = soup.find('script', attrs={'id': '__NEXT_DATA__', 'type': 'application/json'})
    if not tag:
        # Sometimes the script tag carries no type attribute.
        tag = soup.find('script', attrs={'id': '__NEXT_DATA__'})
    if not tag:
        return None
    try:
        raw = tag.string or tag.get_text()
        if not raw:
            return None
        raw = raw.strip()
        # Unescape HTML entities that sometimes wrap the JSON.
        raw = html.unescape(raw)
        try:
            return json.loads(raw)
        except Exception:
            # Sometimes the JSON is embedded or has a prefix/suffix; try the
            # substring between the first and last brace.
            first = raw.find('{')
            last = raw.rfind('}')
            if first != -1 and last != -1 and last > first:
                snippet = raw[first:last + 1]
                try:
                    return json.loads(snippet)
                except Exception:
                    return None
            return None
    except Exception:
        return None


def _find_field(obj: Any, keys: List[str]) -> Optional[Any]:
    """Recursively search dict/list for the first occurrence of any key in keys."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k in keys:
                return v
            found = _find_field(v, keys)
            if found is not None:
                return found
    elif isinstance(obj, list):
        for item in obj:
            found = _find_field(item, keys)
            if found is not None:
                return found
    return None
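
# A minimal sketch of the __NEXT_DATA__ layout this module expects, inferred
# from the accessors in _extract_article_payload below. Real LINE Today
# payloads carry many more fields; only the keys the extractor actually reads
# are shown, and every value here is a placeholder.
_EXAMPLE_NEXT_DATA: Dict[str, Any] = {
    'props': {
        'pageProps': {
            'fallback': {
                'getArticle:<hash>': {  # cache key; the exact name varies
                    'data': {
                        'title': '...',
                        'shortDescription': '...',
                        'content': '<p>...</p>',  # HTML fragment
                        'contentType': '...',
                        'author': '...',
                        'publisher': '...',
                        'sourceUrl': 'https://example.com/original',
                        'categoryName': '...',
                        'publishTimeUnix': 0,  # epoch time; some feeds use milliseconds
                        'url': {'url': 'https://today.line.me/th/v3/article/<hash>'},
                        'thumbnail': {'url': 'https://example.com/thumb.jpg'},
                        'exploreLinks': [
                            {'name': '...', 'pageLink': {'pageType': 'TAG', 'hash': '...'}},
                        ],
                    },
                },
            },
        },
    },
}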

def _extract_article_payload(nextdata: dict) -> Tuple[Optional[dict], Optional[str]]:
    """Locate the structured article payload inside Next.js fallback data."""
    props = nextdata.get('props') if isinstance(nextdata, dict) else None
    page_props = props.get('pageProps') if isinstance(props, dict) else None
    fallback = page_props.get('fallback') if isinstance(page_props, dict) else None
    if not isinstance(fallback, dict):
        return None, None
    for key, value in fallback.items():
        if isinstance(value, dict):
            data = value.get('data')
            if isinstance(data, dict) and data.get('content') and data.get('title'):
                return data, key
            if value.get('content') and value.get('title'):
                return value, key
    return None, None


def _text_from_html_fragment(fragment: str) -> str:
    """Use BeautifulSoup to cleanly extract text from an HTML fragment."""
    try:
        frag_soup = BeautifulSoup(fragment, 'lxml')
        return frag_soup.get_text(separator=' ', strip=True)
    except Exception:
        # Fallback: strip tags crudely.
        return re.sub('<[^<]+?>', '', fragment).strip()


def _normalize_paragraphs(text: str) -> str:
    """Remove obvious noise (Loading..., duplicate nav labels) while preserving order."""
    if not text:
        return ''
    cleaned: List[str] = []
    seen_counts: Dict[str, int] = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lower = line.lower()
        if 'loading' in lower:
            # Catch "Loading...", "Loading... Loading..." etc.
            continue
        # Allow each distinct line up to twice to keep short quotes.
        if seen_counts.get(lower, 0) >= 2:
            continue
        seen_counts[lower] = seen_counts.get(lower, 0) + 1
        cleaned.append(line)
    return '\n\n'.join(cleaned)


def _is_today_article_url(url: str) -> bool:
    """True for absolute or relative URLs that point at a LINE Today article page."""
    parsed = urlparse(url)
    if parsed.netloc and 'today.line.me' not in parsed.netloc:
        return False
    return '/article/' in parsed.path


def _collect_structured_links(article_payload: Optional[dict]) -> Set[str]:
    """Build crawlable URLs from the payload's exploreLinks entries."""
    links: Set[str] = set()
    if not isinstance(article_payload, dict):
        return links
    explore = article_payload.get('exploreLinks')
    if isinstance(explore, list):
        for entry in explore:
            page_link = entry.get('pageLink') if isinstance(entry, dict) else None
            if not isinstance(page_link, dict):
                continue
            page_type = page_link.get('pageType')
            if page_type == 'ARTICLE':
                hash_val = page_link.get('hash')
                if hash_val:
                    links.add(f'https://today.line.me/th/v3/article/{hash_val}')
            elif page_type == 'GENERAL':
                page = page_link.get('page')
                if isinstance(page, dict):
                    url_path = page.get('urlPath')
                    if url_path:
                        links.add(f'https://today.line.me/th/v3/page/{url_path}')
            elif page_type == 'TAG':
                tag_hash = page_link.get('hash')
                if tag_hash:
                    links.add(f'https://today.line.me/th/v3/tag/{tag_hash}')
    return links
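
# A quick, runnable sanity check for the URL templates above; a sketch for
# manual use, not a test suite. The payload below is hypothetical.
def _demo_structured_links() -> None:
    payload = {
        'exploreLinks': [
            {'name': 'ตัวอย่าง', 'pageLink': {'pageType': 'TAG', 'hash': 'abc123'}},
            {'pageLink': {'pageType': 'ARTICLE', 'hash': 'def456'}},
        ]
    }
    for link in sorted(_collect_structured_links(payload)):
        print(link)
    # Expected output, given the templates in _collect_structured_links:
    #   https://today.line.me/th/v3/article/def456
    #   https://today.line.me/th/v3/tag/abc123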

def extract_article(page_html: str, url: str) -> Tuple[Dict, List[str]]:
    """Extract article fields and crawlable links from a LINE Today page.

    The first parameter was renamed from ``html`` to avoid shadowing the
    stdlib ``html`` module imported above.
    """
    soup = BeautifulSoup(page_html, 'lxml')

    # Open Graph / meta tags.
    title = _get_meta(soup, 'og:title') or _get_meta(soup, 'title')
    description = _get_meta(soup, 'og:description') or _get_meta(soup, 'description')
    image = _get_meta(soup, 'og:image')
    published = _get_meta(soup, 'article:published_time')

    # JSON-LD.
    author = None
    publisher = None
    jsonld = _parse_json_ld(soup)
    if jsonld:
        title = title or jsonld.get('headline')
        if not published:
            published = jsonld.get('datePublished') or jsonld.get('dateCreated')
        author_field = jsonld.get('author')
        if isinstance(author_field, dict):
            author = author_field.get('name')
        elif isinstance(author_field, list) and author_field:
            author = author_field[0].get('name') if isinstance(author_field[0], dict) else None
        else:
            author = author_field
        pub = jsonld.get('publisher')
        if isinstance(pub, dict):
            publisher = pub.get('name')
        elif isinstance(pub, str):
            # Some pages put the publisher name directly in the field.
            publisher = pub

    body_html = None
    content_type = None
    source_url = None
    category = None
    tags: List[str] = []
    is_article = False
    # Initialized up front so the fallbacks below never hit an unbound name.
    article_body = ''

    # Try the Next.js page data.
    nextdata = _parse_next_data(soup)
    payload = None
    if nextdata:
        payload, _payload_key = _extract_article_payload(nextdata)
        if payload:
            content_type = payload.get('contentType')
            url_info = payload.get('url') if isinstance(payload.get('url'), dict) else None
            canonical_url = url_info.get('url') if isinstance(url_info, dict) else None
            if canonical_url:
                if _is_today_article_url(canonical_url):
                    is_article = True
                url = canonical_url
            title = payload.get('title') or title
            description = payload.get('shortDescription') or description
            author = payload.get('author') or author
            publisher = payload.get('publisher') or publisher
            source_url = payload.get('sourceUrl')
            category = payload.get('categoryName')
            publish_unix = payload.get('publishTimeUnix')
            if publish_unix and not published:
                try:
                    ts = float(publish_unix)
                    # Heuristic: 13-digit epoch values are almost certainly
                    # milliseconds; scale them down before converting.
                    if ts > 1e12:
                        ts /= 1000.0
                    published = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
                except Exception:
                    published = payload.get('publishTime') or published
            elif payload.get('publishTime') and not published:
                published = payload.get('publishTime')
            body_html = payload.get('content')
            if body_html:
                article_body = _text_from_html_fragment(body_html)
            explore_links = payload.get('exploreLinks')
            if isinstance(explore_links, list):
                for entry in explore_links:
                    tag_name = entry.get('name') if isinstance(entry, dict) else None
                    page_link = entry.get('pageLink') if isinstance(entry, dict) else None
                    if tag_name and page_link and page_link.get('pageType') == 'TAG':
                        tags.append(tag_name)
        else:
            # Search common fields used by news sites / Next.js props.
            nd_title = _find_field(nextdata, ['title', 'headline', 'name', 'seoTitle'])
            if nd_title and not title:
                title = nd_title
            nd_desc = _find_field(nextdata, ['description', 'summary', 'seoDescription'])
            if nd_desc and not description:
                description = nd_desc
            nd_body = _find_field(nextdata, ['articleBody', 'body', 'content', 'html'])
            if isinstance(nd_body, str):
                article_body = _text_from_html_fragment(nd_body)
            elif isinstance(nd_body, list):
                parts = []
                for item in nd_body:
                    if isinstance(item, str):
                        parts.append(_text_from_html_fragment(item))
                    elif isinstance(item, dict):
                        for k in ('text', 'content', 'body', 'html'):
                            if k in item and isinstance(item[k], str):
                                parts.append(_text_from_html_fragment(item[k]))
                article_body = '\n\n'.join([p for p in parts if p])
            elif isinstance(nd_body, dict):
                if 'html' in nd_body and isinstance(nd_body['html'], str):
                    article_body = _text_from_html_fragment(nd_body['html'])
                else:
                    article_body = _text_from_html_fragment(str(nd_body))
            nd_img = _find_field(nextdata, ['image', 'thumbnail', 'ogImage'])
            if nd_img and not image:
                if isinstance(nd_img, str):
                    image = urljoin(url, nd_img)
                elif isinstance(nd_img, dict):
                    candidate = nd_img.get('url') or nd_img.get('src') or nd_img.get('path')
                    if isinstance(candidate, str):
                        image = urljoin(url, candidate)
            nd_author = _find_field(nextdata, ['author', 'writer', 'creator'])
            if nd_author and not author:
                if isinstance(nd_author, str):
                    author = nd_author
                elif isinstance(nd_author, dict):
                    author = nd_author.get('name')
                elif isinstance(nd_author, list) and nd_author:
                    first = nd_author[0]
                    if isinstance(first, dict):
                        author = first.get('name')
                    elif isinstance(first, str):
                        author = first
            if not published:
                nd_pub = _find_field(nextdata, ['datePublished', 'publishedAt', 'createdAt'])
                if isinstance(nd_pub, str):
                    published = nd_pub
        # If the payload marks article content but the meta image is missing,
        # attempt to build one from the payload thumbnail.
        if not image and payload and isinstance(payload.get('thumbnail'), dict):
            thumb = payload['thumbnail']
            thumb_url = thumb.get('url') or thumb.get('src')
            if isinstance(thumb_url, str):
                image = urljoin(url, thumb_url)

    # Fallback title.
    if not title:
        h1 = soup.find('h1')
        if h1:
            title = h1.get_text(strip=True)

    # If article_body is still empty, apply HTML heuristics:
    # try common article containers first.
    if not article_body:
        candidates = []
        for sel in ['article', 'div[class*="article"]', 'div[itemprop="articleBody"]', 'div[class*="content"]']:
            candidates.extend(soup.select(sel))
        if not candidates:
            # Fall back to <main>.
            main = soup.find('main')
            if main:
                candidates = [main]
        if candidates:
            # Choose the largest candidate by text length.
            best = max(candidates, key=lambda el: len(el.get_text(strip=True)))
            # Remove scripts, styles, and ad-like nodes.
            for bad in best.select('script, style, .ad, .ads, .related, .promo'):
                bad.decompose()
            paragraphs = [p.get_text(separator=' ', strip=True)
                          for p in best.find_all(['p', 'div']) if p.get_text(strip=True)]
            article_body = '\n\n'.join(paragraphs)
        else:
            # As a last resort, combine all <p> text on the page.
            paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
            article_body = '\n\n'.join(paragraphs)
    article_body = _normalize_paragraphs(article_body)

    # Collect internal links with a bias toward article detail pages.
    link_candidates: Set[str] = set()
    if nextdata:
        # `payload` was already resolved above; no need to re-extract it.
        link_candidates.update(_collect_structured_links(payload))
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])
        if _is_today_article_url(absolute):
            link_candidates.add(absolute)
        elif absolute.startswith('https://today.line.me/th/'):
            link_candidates.add(absolute)
    if is_article:
        link_candidates.add(url)
    links = sorted(link_candidates)

    article = {
        'url': url,
        'title': title,
        'description': description,
        'author': author,
        'publisher': publisher,
        'published_at': published,
        'image': image,
        'body_text': article_body,
        'body_html': body_html,
        'content_type': content_type,
        'category': category,
        'source_url': source_url,
        'tags': tags,
        'is_article': is_article,
    }
    return article, links
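
# Minimal usage sketch. Assumes the third-party `requests` package is
# installed (it is not used elsewhere in this module) and that the target
# page is reachable; LINE Today may require realistic request headers. The
# fallback URL below is a placeholder, not a real article.
if __name__ == '__main__':
    import sys
    import requests

    target = sys.argv[1] if len(sys.argv) > 1 else 'https://today.line.me/th/v3/article/<hash>'
    resp = requests.get(target, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    resp.raise_for_status()
    article, links = extract_article(resp.text, target)
    # body_html can be large; print everything else as JSON for inspection.
    print(json.dumps({k: v for k, v in article.items() if k != 'body_html'},
                     ensure_ascii=False, indent=2))
    print(f'{len(links)} candidate links found')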