# line-today-scrape/linetoday/storage.py

import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path

# Output locations. They are resolved against the working directory at the
# moment this module is imported, not when the store_* functions run.
STORAGE_DIR = Path.cwd().joinpath('data')
SNAPSHOT_DIR = STORAGE_DIR.joinpath('snapshots')  # raw HTML snapshots
PARSED_DIR = STORAGE_DIR.joinpath('parsed')  # parsed articles (JSON + Markdown)


def ensure_dirs():
    """Create the snapshot and parsed output directories if they are missing."""
    for directory in (SNAPSHOT_DIR, PARSED_DIR):
        # parents=True creates any missing ancestors; exist_ok=True makes
        # repeated calls harmless.
        directory.mkdir(parents=True, exist_ok=True)


def store_snapshot(url: str, html: str) -> str:
    """Write the raw HTML fetched from ``url`` to a snapshot file.

    The file is named ``<UTC timestamp>_<url digest>.html`` and placed in
    ``SNAPSHOT_DIR``. The digest is a truncated SHA-256 of the URL, so a
    given URL maps to the same suffix in every run; the built-in ``hash()``
    is salted per process for strings and cannot provide that.

    The timestamp has one-second resolution, so storing the same URL twice
    within a second overwrites the earlier snapshot.

    Args:
        url: Address the page came from; only used to derive the file name.
        html: Page markup, written as UTF-8.

    Returns:
        The path of the written snapshot, as a string.
    """
    ensure_dirs()
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered string is the same.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]
    path = SNAPSHOT_DIR / f"{stamp}_{digest}.html"
    path.write_text(html, encoding='utf-8')
    return str(path)


def _article_to_markdown(article: dict) -> str:
    parts = []
    title = article.get('title')
    if title:
        parts.append(f"# {title}")
    meta = []
    if article.get('author'):
        meta.append(str(article.get('author')))
    if article.get('published_at'):
        meta.append(str(article.get('published_at')))
    if meta:
        parts.append('_' + ' • '.join(meta) + '_')
    if article.get('description'):
        parts.append(article.get('description'))
    if article.get('image'):
        parts.append(f"![image]({article.get('image')})")
    body = article.get('body_text') or ''
    if body:
        parts.append(body)
    return '\n\n'.join(parts)


def store_parsed(article: dict) -> str:
    """Persist a parsed article as JSON plus a Markdown rendering.

    Both files share the stem ``<UTC timestamp>_<url digest>`` inside
    ``PARSED_DIR`` and differ only in suffix (``.json`` / ``.md``). The
    digest is a truncated SHA-256 of the article's ``url`` value, so it is
    the same in every run; the built-in ``hash()`` is salted per process for
    strings and cannot provide that.

    Args:
        article: Parsed article mapping. It must be JSON-serializable; its
            ``url`` entry (if any) is used to derive the file name.

    Returns:
        The path of the JSON file, as a string.
    """
    ensure_dirs()
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # rendered string is the same.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    # str() keeps a missing or non-string 'url' from breaking the digest.
    digest = hashlib.sha256(str(article.get('url')).encode('utf-8')).hexdigest()[:16]

    json_path = PARSED_DIR / f"{stamp}_{digest}.json"
    json_path.write_text(json.dumps(article, ensure_ascii=False, indent=2), encoding='utf-8')

    # The Markdown rendering is written next to the JSON under the same stem.
    md_path = json_path.with_suffix('.md')
    md_path.write_text(_article_to_markdown(article), encoding='utf-8')
    return str(json_path)