45 lines
1.8 KiB
Python
45 lines
1.8 KiB
Python
from pathlib import Path
|
|
|
|
from linetoday.extractor import extract_article
|
|
|
|
|
|
# Directory holding the saved HTML snapshots: 'data/snapshots' located one
# level above the directory that contains this file.
SNAPSHOT_DIR = Path(__file__).resolve().parents[1].joinpath('data', 'snapshots')
|
|
# Saved article page used by the structured-payload test; it lives in the
# 'fixtures' directory next to this file.
ARTICLE_FIXTURE = Path(__file__).resolve().parent.joinpath('fixtures', 'article_RBVxEOM.html')
|
|
|
|
|
|
def test_article_fixture_uses_structured_payload():
    """Extract the saved article fixture and verify the key fields are populated.

    Checks the article flag, title, HTML and text bodies, publication
    timestamp, author, and that at least one article link was collected.
    """
    page_source = ARTICLE_FIXTURE.read_text(encoding='utf-8')
    article, links = extract_article(
        page_source,
        'https://today.line.me/th/v3/article/RBVxEOM',
    )

    # Classification and headline.
    assert article['is_article'] is True
    assert article['title'] and 'เปิดดวง' in article['title']

    # Body content, both as markup and as plain text.
    assert article['body_html'], 'expected body_html extracted from structured payload'
    assert len(article['body_text']) > 500

    # Metadata.
    assert article['published_at']
    assert article['author'] == 'Sanook'

    # At least one discovered link must point at another article page.
    assert any('/th/v3/article/' in href for href in links)
|
|
|
|
|
|
def test_snapshots_are_classified_and_sanitised():
    """Run the extractor over every HTML snapshot and sanity-check each result.

    For each ``*.html`` file in ``SNAPSHOT_DIR`` the extractor must return a
    dict and a list, a URL on ``https://today.line.me``, and a non-blank
    string title. Pages flagged as articles must carry more than 200
    characters of body text; other pages must not contain the word
    'loading' in their body text.
    """
    # Fail with a readable message instead of a raw FileNotFoundError from
    # iterdir() when the snapshot directory is absent.
    assert SNAPSHOT_DIR.is_dir(), f'Snapshot directory not found: {SNAPSHOT_DIR}'

    files = sorted(p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html')
    assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

    for f in files:
        html = f.read_text(encoding='utf-8')
        url = f'https://today.line.me/snapshot/{f.name}'
        article, links = extract_article(html, url)

        assert isinstance(article, dict), f'article is not a dict for {f.name}'
        assert isinstance(links, list), f'links is not a list for {f.name}'
        assert article.get('url') and article['url'].startswith('https://today.line.me'), (
            f'unexpected url for {f.name}'
        )
        title = article.get('title')
        assert title and isinstance(title, str) and title.strip(), f'missing title for {f.name}'

        # 'body_text' may be present but None; normalise to '' so the checks
        # below fail (or pass) as assertions rather than raising
        # AttributeError on None.
        body = article.get('body_text') or ''
        if article.get('is_article'):
            assert len(body.strip()) > 200, f'article body too short for {f.name}'
        else:
            assert 'loading' not in body.lower(), f'loading placeholder left in body for {f.name}'
|