line-today-scrape/tests/test_extractor.py
Sosokker 0b5b9d98c5
Some checks are pending
CI / test (push) Waiting to run
add main files
2025-10-29 16:12:55 +07:00

45 lines
1.8 KiB
Python

from pathlib import Path
from linetoday.extractor import extract_article
# Absolute, symlink-resolved path of this test module; both locations below
# are derived from it so the tests do not depend on the working directory.
_THIS_FILE = Path(__file__).resolve()

# HTML snapshots live in ``data/snapshots`` one level above this file's directory.
SNAPSHOT_DIR = _THIS_FILE.parents[1].joinpath('data', 'snapshots')

# Single saved article page stored in ``fixtures`` next to this file.
ARTICLE_FIXTURE = _THIS_FILE.parents[0].joinpath('fixtures', 'article_RBVxEOM.html')
def test_article_fixture_uses_structured_payload():
    """Check the fields extracted from the saved ``RBVxEOM`` article fixture.

    Reads ``ARTICLE_FIXTURE`` as UTF-8, passes it to ``extract_article``
    together with the article URL, and asserts on the returned article
    mapping (classification flag, title, body, publication value, author)
    and on the returned links.
    """
    page_url = 'https://today.line.me/th/v3/article/RBVxEOM'
    page_html = ARTICLE_FIXTURE.read_text(encoding='utf-8')

    article, links = extract_article(page_html, page_url)

    # The page must be flagged as an article with exactly ``True``.
    assert article['is_article'] is True

    # Title must be non-empty and contain the expected Thai phrase.
    title = article['title']
    assert title and 'เปิดดวง' in title

    # Body must be present as HTML and have a text form longer than 500.
    body_html = article['body_html']
    assert body_html, 'expected body_html extracted from structured payload'
    body_text = article['body_text']
    assert len(body_text) > 500

    # Publication value must be truthy and the author must match exactly.
    assert article['published_at']
    assert article['author'] == 'Sanook'

    # At least one returned link must contain the article path segment.
    assert any('/th/v3/article/' in href for href in links)
def test_snapshots_are_classified_and_sanitised():
    """Run ``extract_article`` over every ``.html`` file in ``SNAPSHOT_DIR``.

    For each snapshot the test checks that:

    * the result is a ``(dict, list)`` pair;
    * the article ``url`` is set and starts with ``https://today.line.me``;
    * the ``title`` is a non-blank string;
    * when ``is_article`` is truthy, the stripped ``body_text`` is longer
      than 200; otherwise ``body_text`` is a string that does not contain
      ``'loading'`` (case-insensitive).

    Every assertion inside the loop names the snapshot file, so a failure
    identifies which snapshot caused it.
    """
    # Sorted so snapshots are processed (and failures reported) in a stable order.
    snapshot_files = sorted(p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html')
    assert snapshot_files, f'No snapshot files found in {SNAPSHOT_DIR}'

    for snapshot in snapshot_files:
        name = snapshot.name
        html = snapshot.read_text(encoding='utf-8')
        url = f'https://today.line.me/snapshot/{name}'
        article, links = extract_article(html, url)

        assert isinstance(article, dict), f'article is not a dict for {name}'
        assert isinstance(links, list), f'links is not a list for {name}'

        article_url = article.get('url')
        assert article_url and article_url.startswith('https://today.line.me'), (
            f'unexpected url {article_url!r} for {name}'
        )

        title = article.get('title')
        assert title and isinstance(title, str) and title.strip(), f'missing title for {name}'

        if article.get('is_article'):
            body_text = article.get('body_text')
            assert body_text and len(body_text.strip()) > 200, (
                f'article body_text missing or too short for {name}'
            )
        else:
            body = article.get('body_text', '')
            # ``.get`` only falls back to '' when the key is absent; a present
            # non-string value (e.g. None) would otherwise crash on ``.lower()``
            # with an error that does not name the snapshot.
            assert isinstance(body, str), (
                f'body_text is {type(body).__name__}, expected str, for {name}'
            )
            assert 'loading' not in body.lower(), f"'loading' found in body_text for {name}"