feat: storage JSONL writer and dedup index; add parser tests

This commit is contained in:
Sirin Puenggun 2025-10-13 23:53:37 +07:00
parent 2dfe929b94
commit 7256e17552
3 changed files with 216 additions and 0 deletions

View File

@ -0,0 +1,59 @@
package parser
import (
"testing"
"github.com/yourname/reddit-scraper/internal/fetcher"
)
func TestParseJSONPost_Valid(t *testing.T) {
j := fetcher.JSONPost{
ID: "abc123",
Subreddit: "golang",
Title: "An interesting post",
Author: "alice",
CreatedUTC: 1620000000,
Score: 10,
NumComments: 2,
Selftext: "Hello world",
URL: "https://reddit.com/r/golang/comments/abc123",
Permalink: "/r/golang/comments/abc123",
}
p, err := ParseJSONPost(j)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if p.ID != j.ID {
t.Fatalf("id mismatch: %s != %s", p.ID, j.ID)
}
if p.Author != "alice" {
t.Fatalf("author mismatch: %s", p.Author)
}
}
func TestParseJSONPost_DeletedAuthor(t *testing.T) {
j := fetcher.JSONPost{
ID: "d1",
Subreddit: "test",
Title: "post",
Author: "",
CreatedUTC: 1600000000,
}
p, err := ParseJSONPost(j)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if p.Author != "[deleted]" {
t.Fatalf("expected [deleted], got %s", p.Author)
}
}
func TestParseJSONPost_MissingFields(t *testing.T) {
j := fetcher.JSONPost{
ID: "",
}
_, err := ParseJSONPost(j)
if err == nil {
t.Fatalf("expected error for missing id")
}
}

76
internal/storage/dedup.go Normal file
View File

@ -0,0 +1,76 @@
package storage
import (
"bufio"
"os"
"sync"
)
// DedupIndex keeps an in-memory set of seen IDs and persists them to an append-only file.
type DedupIndex struct {
path string
mu sync.Mutex
set map[string]struct{}
f *os.File
}
// LoadDedup loads existing IDs from path and prepares to append new ones.
func LoadDedup(path string) (*DedupIndex, error) {
idx := &DedupIndex{path: path, set: make(map[string]struct{})}
// Ensure file exists
f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
if err != nil {
return nil, err
}
idx.f = f
// Read existing lines
sc := bufio.NewScanner(f)
for sc.Scan() {
idx.set[sc.Text()] = struct{}{}
}
// ignore scanner error for now
return idx, nil
}
// Seen reports whether id was seen before.
func (d *DedupIndex) Seen(id string) bool {
d.mu.Lock()
defer d.mu.Unlock()
_, ok := d.set[id]
return ok
}
// Add marks id as seen and appends to underlying file.
func (d *DedupIndex) Add(id string) error {
d.mu.Lock()
defer d.mu.Unlock()
if _, ok := d.set[id]; ok {
return nil
}
if d.f == nil {
// should not happen if loaded properly
f, err := os.OpenFile(d.path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
if err != nil {
return err
}
d.f = f
}
if _, err := d.f.WriteString(id + "\n"); err != nil {
return err
}
if err := d.f.Sync(); err != nil {
// best-effort; ignore
}
d.set[id] = struct{}{}
return nil
}
// Close closes the underlying file handle.
func (d *DedupIndex) Close() error {
d.mu.Lock()
defer d.mu.Unlock()
if d.f != nil {
return d.f.Close()
}
return nil
}

View File

@ -0,0 +1,81 @@
package storage
import (
"bufio"
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
// JSONLWriter writes records as line-delimited JSON and rotates files by size.
type JSONLWriter struct {
keyword string
outDir string
file *os.File
writer *bufio.Writer
maxBytes int64
written int64
part int
}
// NewJSONLWriter creates a writer for a keyword in outDir. maxBytes triggers rotation.
func NewJSONLWriter(keyword, outDir string, maxBytes int64) (*JSONLWriter, error) {
w := &JSONLWriter{keyword: keyword, outDir: outDir, maxBytes: maxBytes}
if err := w.rotate(); err != nil {
return nil, err
}
return w, nil
}
func (w *JSONLWriter) rotate() error {
if w.writer != nil {
w.writer.Flush()
}
if w.file != nil {
w.file.Close()
}
w.part++
ts := time.Now().UTC().Format("20060102T150405Z")
name := fmt.Sprintf("%s_%s_part%d.jsonl", w.keyword, ts, w.part)
path := filepath.Join(w.outDir, name)
f, err := os.Create(path)
if err != nil {
return err
}
w.file = f
w.writer = bufio.NewWriterSize(f, 16*1024)
w.written = 0
return nil
}
// Write marshals v to JSON and writes it as a newline-delimited record.
func (w *JSONLWriter) Write(v interface{}) error {
b, err := json.Marshal(v)
if err != nil {
return err
}
n, err := w.writer.Write(append(b, '\n'))
if err != nil {
return err
}
w.written += int64(n)
if w.maxBytes > 0 && w.written >= w.maxBytes {
return w.rotate()
}
return nil
}
// Close flushes and closes the underlying file.
func (w *JSONLWriter) Close() error {
if w.writer != nil {
if err := w.writer.Flush(); err != nil {
return err
}
}
if w.file != nil {
return w.file.Close()
}
return nil
}