import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path

STORAGE_DIR = Path.cwd() / 'data'
SNAPSHOT_DIR = STORAGE_DIR / 'snapshots'
PARSED_DIR = STORAGE_DIR / 'parsed'


def ensure_dirs():
    """Create the snapshot and parsed-output directories if they are missing."""
    SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
    PARSED_DIR.mkdir(parents=True, exist_ok=True)


def _timestamp() -> str:
    """Return the current UTC time formatted as ``YYYYMMDDTHHMMSSZ``."""
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted string is identical.
    return datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')


def _url_digest(url) -> str:
    """Return a short hex digest of *url* that is stable across processes.

    The built-in ``hash()`` is salted per interpreter process for strings,
    so it cannot be used to derive reproducible file names. A missing or
    empty URL is digested as the empty string.
    """
    text = '' if url is None else str(url)
    return hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]


def store_snapshot(url: str, html: str) -> str:
    """Write *html* to a new file under ``SNAPSHOT_DIR`` and return its path.

    The file name is ``<utc timestamp>_<url digest>.html``. The timestamp has
    one-second resolution, so storing the same URL twice within one second
    overwrites the earlier snapshot.

    Args:
        url: Source URL of the page; only used to derive the file name.
        html: Page content, written as UTF-8.

    Returns:
        The path of the written file, as a string.
    """
    ensure_dirs()
    name = f"{_timestamp()}_{_url_digest(url)}.html"
    path = SNAPSHOT_DIR / name
    path.write_text(html, encoding='utf-8')
    return str(path)


def _article_to_markdown(article: dict) -> str:
    """Render *article* as markdown.

    Emits, in order and only when the corresponding key holds a truthy
    value: a level-1 heading from ``title``, an italic line joining
    ``author`` and ``published_at``, ``description``, an image link from
    ``image``, and ``body_text``. Sections are separated by blank lines.
    """
    parts = []

    title = article.get('title')
    if title:
        parts.append(f"# {title}")

    meta = [
        str(article.get(key))
        for key in ('author', 'published_at')
        if article.get(key)
    ]
    if meta:
        parts.append('_' + ' • '.join(meta) + '_')

    description = article.get('description')
    if description:
        # str() keeps the final join from failing on non-string values,
        # matching how author/published_at are handled.
        parts.append(str(description))

    image = article.get('image')
    if image:
        parts.append(f"![image]({image})")

    body = article.get('body_text') or ''
    if body:
        parts.append(str(body))

    return '\n\n'.join(parts)


def store_parsed(article: dict) -> str:
    """Write *article* as JSON, plus a markdown rendering, under ``PARSED_DIR``.

    Both files share the stem ``<utc timestamp>_<url digest>``, where the
    digest is derived from ``article['url']`` (empty string when absent).
    The timestamp has one-second resolution, so storing the same URL twice
    within one second overwrites the earlier pair of files.

    Args:
        article: Parsed article data; must be JSON-serializable.

    Returns:
        The path of the JSON file, as a string.
    """
    ensure_dirs()
    fname = f"{_timestamp()}_{_url_digest(article.get('url'))}.json"
    path = PARSED_DIR / fname
    path.write_text(
        json.dumps(article, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    # also write markdown alongside
    md_path = path.with_suffix('.md')
    md_path.write_text(_article_to_markdown(article), encoding='utf-8')
    return str(path)