line-today-scrape/linetoday/storage.py
Sosokker 0b5b9d98c5
Some checks are pending
CI / test (push) Waiting to run
add main files
2025-10-29 16:12:55 +07:00

58 lines
1.7 KiB
Python

import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
# Root of all on-disk output; resolved against the working directory at import time.
STORAGE_DIR = Path.cwd().joinpath('data')
# Raw HTML captures written by store_snapshot().
SNAPSHOT_DIR = STORAGE_DIR.joinpath('snapshots')
# Parsed article JSON and its Markdown rendering, written by store_parsed().
PARSED_DIR = STORAGE_DIR.joinpath('parsed')
def ensure_dirs():
    """Create the snapshot and parsed output directories if they are missing.

    Missing parent directories are created as well, and directories that
    already exist are left as they are.
    """
    for directory in (SNAPSHOT_DIR, PARSED_DIR):
        directory.mkdir(parents=True, exist_ok=True)
def store_snapshot(url: str, html: str) -> str:
    """Write the raw HTML of a fetched page into the snapshot directory.

    The file is named ``<UTC timestamp>_<url digest>.html``. The digest is a
    truncated SHA-256 of the URL, so a given URL yields the same suffix in
    every run; the builtin ``hash()`` is salted per process for strings and
    cannot provide that.

    Args:
        url: Address the page was fetched from; only used to derive the name.
        html: Page markup to persist, written as UTF-8.

    Returns:
        The path of the written snapshot file, as a string.
    """
    ensure_dirs()
    # utcnow() is deprecated since Python 3.12; an aware UTC "now" formats
    # to exactly the same string with this pattern.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]
    path = SNAPSHOT_DIR / f"{stamp}_{digest}.html"
    path.write_text(html, encoding='utf-8')
    return str(path)
def _article_to_markdown(article: dict) -> str:
parts = []
title = article.get('title')
if title:
parts.append(f"# {title}")
meta = []
if article.get('author'):
meta.append(str(article.get('author')))
if article.get('published_at'):
meta.append(str(article.get('published_at')))
if meta:
parts.append('_' + ''.join(meta) + '_')
if article.get('description'):
parts.append(article.get('description'))
if article.get('image'):
parts.append(f"![image]({article.get('image')})")
body = article.get('body_text') or ''
if body:
parts.append(body)
return '\n\n'.join(parts)
def store_parsed(article: dict) -> str:
    """Persist a parsed article as JSON with a Markdown rendering beside it.

    Both files share the stem ``<UTC timestamp>_<url digest>`` and differ
    only in extension (``.json`` / ``.md``). The digest is a truncated
    SHA-256 of the article's ``url`` value, so it is identical across runs;
    the builtin ``hash()`` is salted per process for strings.

    Args:
        article: Parsed article data. It must be JSON-serialisable; its
            ``url`` entry, if any, is used to derive the file name.

    Returns:
        The path of the written JSON file, as a string.
    """
    ensure_dirs()
    # utcnow() is deprecated since Python 3.12; an aware UTC "now" formats
    # to exactly the same string with this pattern.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    # A missing URL falls back to the empty string so a digest can still be
    # computed.
    url = str(article.get('url') or '')
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]

    json_path = PARSED_DIR / f"{stamp}_{digest}.json"
    json_path.write_text(
        json.dumps(article, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    # Markdown rendering goes next to the JSON under the same stem.
    md_path = json_path.with_suffix('.md')
    md_path.write_text(_article_to_markdown(article), encoding='utf-8')
    return str(json_path)