58 lines
1.7 KiB
Python
58 lines
1.7 KiB
Python
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# Root of everything this module writes. It is resolved against the process
# working directory once, at import time.
STORAGE_DIR = Path.cwd().joinpath('data')
# Destination for raw HTML snapshots.
SNAPSHOT_DIR = STORAGE_DIR.joinpath('snapshots')
# Destination for parsed article JSON and its Markdown rendering.
PARSED_DIR = STORAGE_DIR.joinpath('parsed')
|
|
|
|
|
|
def ensure_dirs():
    """Create the snapshot and parsed-output directories if they are missing.

    Missing parent directories are created too, and a directory that already
    exists is left as it is.
    """
    for directory in (SNAPSHOT_DIR, PARSED_DIR):
        directory.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def store_snapshot(url: str, html: str) -> str:
    """Write the raw HTML of a page into the snapshot directory.

    The file is named ``<UTC timestamp>_<url digest>.html``. The digest is a
    truncated SHA-256 of the URL, so a given URL maps to the same suffix in
    every process. The built-in ``hash()`` is salted per interpreter run for
    strings and therefore cannot be used to correlate files across runs.

    Args:
        url: Address the HTML came from; only used to derive the file name.
        html: Page markup, written out as UTF-8.

    Returns:
        The path of the written snapshot file, as a string.
    """
    ensure_dirs()
    # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated since 3.12.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]
    path = SNAPSHOT_DIR / f"{stamp}_{digest}.html"
    path.write_text(html, encoding='utf-8')
    return str(path)
|
|
|
|
|
|
def _article_to_markdown(article: dict) -> str:
|
|
parts = []
|
|
title = article.get('title')
|
|
if title:
|
|
parts.append(f"# {title}")
|
|
meta = []
|
|
if article.get('author'):
|
|
meta.append(str(article.get('author')))
|
|
if article.get('published_at'):
|
|
meta.append(str(article.get('published_at')))
|
|
if meta:
|
|
parts.append('_' + ' • '.join(meta) + '_')
|
|
if article.get('description'):
|
|
parts.append(article.get('description'))
|
|
if article.get('image'):
|
|
parts.append(f"})")
|
|
body = article.get('body_text') or ''
|
|
if body:
|
|
parts.append(body)
|
|
return '\n\n'.join(parts)
|
|
|
|
|
|
def store_parsed(article: dict) -> str:
    """Persist a parsed article as JSON, with a Markdown rendering beside it.

    Both files share the stem ``<UTC timestamp>_<url digest>`` and are
    written to the parsed-output directory. The digest is a truncated
    SHA-256 of the article URL, so it is the same in every process (the
    built-in ``hash()`` is salted per interpreter run for strings).

    Args:
        article: Parsed article mapping; its ``url`` entry, if any, feeds the
            file name. It must be JSON-serializable.

    Returns:
        The path of the JSON file, as a string.
    """
    ensure_dirs()
    # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated since 3.12.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    # A missing or empty URL is hashed as the empty string.
    url = str(article.get('url') or '')
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]

    json_path = PARSED_DIR / f"{stamp}_{digest}.json"
    json_path.write_text(
        json.dumps(article, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    # The Markdown rendering goes alongside: same stem, different suffix.
    md_path = json_path.with_suffix('.md')
    md_path.write_text(_article_to_markdown(article), encoding='utf-8')
    return str(json_path)
|