"""Tests for ``extract_article`` against a saved article fixture and HTML snapshots."""

from pathlib import Path

import pytest

from linetoday.extractor import extract_article

# Directory of saved HTML snapshots, one level above this file's directory.
SNAPSHOT_DIR = Path(__file__).resolve().parents[1] / 'data' / 'snapshots'
# Single saved article page that lives next to this test module.
ARTICLE_FIXTURE = Path(__file__).resolve().parent / 'fixtures' / 'article_RBVxEOM.html'


def test_article_fixture_uses_structured_payload():
    """Check the fields ``extract_article`` returns for the saved article fixture.

    The fixture is expected to be classified as an article with a title, a
    non-empty HTML body, a text body longer than 500 characters, a publish
    timestamp, the author ``'Sanook'``, and at least one outgoing article link.
    """
    html = ARTICLE_FIXTURE.read_text(encoding='utf-8')
    url = 'https://today.line.me/th/v3/article/RBVxEOM'

    article, links = extract_article(html, url)

    assert article['is_article'] is True
    assert article['title'] and 'เปิดดวง' in article['title']
    assert article['body_html'], 'expected body_html extracted from structured payload'
    assert len(article['body_text']) > 500
    assert article['published_at']
    assert article['author'] == 'Sanook'
    assert any('/th/v3/article/' in link for link in links)


def test_snapshots_are_classified_and_sanitised():
    """Run ``extract_article`` over every ``.html`` snapshot and sanity-check the result.

    The test is skipped when the snapshot directory does not exist, and fails
    when the directory exists but holds no ``.html`` files. For each snapshot:
    the result must be a ``(dict, list)`` pair with a ``today.line.me`` URL and
    a non-blank title; pages flagged as articles need more than 200 characters
    of stripped body text, and all other pages must not contain the word
    "loading" in their body text.
    """
    if not SNAPSHOT_DIR.exists():
        pytest.skip('No snapshot fixtures available')

    # Sorted so that failures are reported in a stable, reproducible order.
    files = sorted(p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html')
    assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

    for snapshot in files:
        html = snapshot.read_text(encoding='utf-8')
        url = f'https://today.line.me/snapshot/{snapshot.name}'

        article, links = extract_article(html, url)

        assert isinstance(article, dict), f'article is not a dict for {snapshot.name}'
        assert isinstance(links, list), f'links is not a list for {snapshot.name}'

        article_url = article.get('url')
        assert article_url and article_url.startswith('https://today.line.me'), (
            f'unexpected url for {snapshot.name}'
        )

        title = article.get('title')
        assert title and isinstance(title, str) and title.strip(), (
            f'missing title for {snapshot.name}'
        )

        if article.get('is_article'):
            body_text = article.get('body_text')
            assert body_text and len(body_text.strip()) > 200, (
                f'article body too short for {snapshot.name}'
            )
        else:
            # ``or ''`` also covers a key that is present but set to None, which
            # would otherwise raise AttributeError on ``.lower()``.
            body = article.get('body_text') or ''
            assert 'loading' not in body.lower(), (
                f'placeholder text left in body for {snapshot.name}'
            )