reddit-scrapper/internal/storage/dedup.go

77 lines
1.6 KiB
Go

package storage
import (
"bufio"
"os"
"sync"
)
// DedupIndex is a set of already-seen IDs held in memory and mirrored to an
// append-only file so the set can be rebuilt on the next start.
type DedupIndex struct {
	// path is the location of the backing file; it is kept so the file can
	// be reopened if no handle is currently held.
	path string

	// mu serializes access to the fields below.
	mu sync.Mutex
	// f is the open handle new IDs are appended to.
	f *os.File
	// set holds every ID known so far; the empty struct value costs no
	// storage per entry.
	set map[string]struct{}
}
// LoadDedup opens the ID file at path (creating it if it does not exist),
// loads every line it contains into the in-memory set, and returns an index
// that keeps the file open for appending new IDs.
//
// Each line of the file is treated as one ID. An error is returned if the
// file cannot be opened, or if reading it fails part-way (for example an I/O
// error, or a line longer than bufio.Scanner's default token limit). In the
// latter case the file handle is closed before returning.
func LoadDedup(path string) (*DedupIndex, error) {
	// O_APPEND only affects where writes land; the scan below still reads
	// from the start of the freshly opened file.
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
	if err != nil {
		return nil, err
	}

	idx := &DedupIndex{path: path, set: make(map[string]struct{}), f: f}

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		idx.set[sc.Text()] = struct{}{}
	}
	if err := sc.Err(); err != nil {
		// A partially loaded index would report known IDs as unseen, so
		// fail loudly instead of continuing with incomplete data.
		_ = f.Close() // the read error is the one worth reporting
		return nil, err
	}
	return idx, nil
}
// Seen reports whether id is already present in the in-memory set.
// The lookup is done while holding the index mutex.
func (d *DedupIndex) Seen(id string) bool {
	d.mu.Lock()
	_, found := d.set[id]
	d.mu.Unlock()
	return found
}
// Add records id as seen. If id is already in the set it does nothing and
// returns nil; otherwise it appends id plus a trailing newline to the backing
// file and then inserts id into the in-memory set.
//
// An error is returned when the file cannot be opened or written; in that
// case id is not added to the set.
//
// NOTE(review): id is written verbatim, so an id containing a newline would
// be read back as several IDs by a line-based loader — confirm callers never
// pass one.
func (d *DedupIndex) Add(id string) error {
	d.mu.Lock()
	defer d.mu.Unlock()

	if _, dup := d.set[id]; dup {
		return nil
	}

	// No handle yet: open the backing file on demand. This is not expected
	// when the index was produced by the loader.
	if d.f == nil {
		fh, err := os.OpenFile(d.path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
		if err != nil {
			return err
		}
		d.f = fh
	}

	_, err := d.f.WriteString(id + "\n")
	if err != nil {
		return err
	}

	// Flushing to stable storage is best-effort: the write above already
	// succeeded, so a failed sync is deliberately not reported.
	_ = d.f.Sync()

	d.set[id] = struct{}{}
	return nil
}
// Close closes the underlying file handle, if one is held, and returns the
// error reported by the close.
//
// The handle is dropped after closing, so Close is safe to call more than
// once: later calls find no handle and return nil rather than an
// "already closed" error. It also means a later Add sees no handle and
// reopens the file at the stored path instead of writing to a closed one.
func (d *DedupIndex) Close() error {
	d.mu.Lock()
	defer d.mu.Unlock()

	if d.f == nil {
		return nil
	}
	err := d.f.Close()
	// Clear the field even when Close fails: the handle is unusable either way.
	d.f = nil
	return err
}