line-today-scrape/tests/test_extractor.py
Sosokker 04b39ed883
Some checks failed
CI / test (push) Has been cancelled
scrape articles concurrently and add articles registry
2025-10-29 16:42:07 +07:00

49 lines
1.9 KiB
Python

from pathlib import Path
import pytest
from linetoday.extractor import extract_article
# Absolute location of this test module; both fixture paths hang off it.
_THIS_FILE = Path(__file__).resolve()

# HTML snapshots live two levels above this file, under data/snapshots.
SNAPSHOT_DIR = _THIS_FILE.parents[1].joinpath('data', 'snapshots')

# Single saved article page stored in the fixtures directory next to this file.
ARTICLE_FIXTURE = _THIS_FILE.parent.joinpath('fixtures', 'article_RBVxEOM.html')
def test_article_fixture_uses_structured_payload():
    """Check the fields extracted from the saved RBVxEOM article fixture.

    The fixture is fed through ``extract_article`` and the returned article
    mapping and link list are checked field by field.
    """
    page_url = 'https://today.line.me/th/v3/article/RBVxEOM'
    page_html = ARTICLE_FIXTURE.read_text(encoding='utf-8')

    article, links = extract_article(page_html, page_url)

    assert article['is_article'] is True

    # Title must be present and contain the expected Thai phrase.
    title = article['title']
    assert title
    assert 'เปิดดวง' in title

    assert article['body_html'], 'expected body_html extracted from structured payload'
    assert len(article['body_text']) > 500
    assert article['published_at']
    assert article['author'] == 'Sanook'

    # At least one discovered link should point at another article page.
    article_links = [href for href in links if '/th/v3/article/' in href]
    assert article_links
def test_snapshots_are_classified_and_sanitised():
    """Run ``extract_article`` over every HTML snapshot and sanity-check it.

    The test is skipped when ``SNAPSHOT_DIR`` does not exist and fails when
    the directory exists but holds no ``.html`` files. For each snapshot the
    extractor must return a dict and a list, the article must carry a
    ``https://today.line.me`` URL and a non-blank string title, and then:

    * pages flagged ``is_article`` need more than 200 characters of
      stripped body text;
    * all other pages must not contain the word "loading" in their body
      text (case-insensitive).

    Every assertion inside the loop names the snapshot file so a failure
    points directly at the offending fixture.
    """
    if not SNAPSHOT_DIR.exists():
        pytest.skip('No snapshot fixtures available')

    # Sorted so the iteration (and therefore the first failure) is stable.
    files = sorted(p for p in SNAPSHOT_DIR.iterdir() if p.suffix == '.html')
    assert files, f'No snapshot files found in {SNAPSHOT_DIR}'

    for f in files:
        html = f.read_text(encoding='utf-8')
        url = f'https://today.line.me/snapshot/{f.name}'
        article, links = extract_article(html, url)

        assert isinstance(article, dict), f'article is not a dict for {f.name}'
        assert isinstance(links, list), f'links is not a list for {f.name}'
        assert article.get('url') and article['url'].startswith('https://today.line.me'), (
            f'unexpected url for {f.name}'
        )

        title = article.get('title')
        assert title and isinstance(title, str) and title.strip(), f'missing title for {f.name}'

        if article.get('is_article'):
            assert article.get('body_text') and len(article['body_text'].strip()) > 200, (
                f'article body too short for {f.name}'
            )
        else:
            # ``or ''`` also covers a key that is present but set to None,
            # which ``.get('body_text', '')`` would pass through to
            # ``.lower()`` and crash with AttributeError.
            body = article.get('body_text') or ''
            assert 'loading' not in body.lower(), f'loading placeholder left in body for {f.name}'