first commit

Sirin Puenggun 2025-10-14 09:19:40 +07:00
parent 7256e17552
commit 0d6556f7c9
7 changed files with 376 additions and 35 deletions

TODO.md

@@ -9,44 +9,44 @@ Milestone 0 Bootstrap
 - [x] M0-04: Create Makefile (pending)

 Milestone 1 Config and Logging
-- [ ] M1-01: Implement logging setup using log/slog
+- [x] M1-01: Implement logging setup using log/slog
-- [ ] M1-02: Implement configuration loader: env + flags + .env
+- [x] M1-02: Implement configuration loader: env + flags + .env
-- [ ] M1-03: Define config schema and defaults
+- [x] M1-03: Define config schema and defaults
-- [ ] M1-04: Add config validation
+- [x] M1-04: Add config validation
 - [ ] M1-05: Unit tests for config parsing precedence

 Milestone 2 Types and Parser
-- [ ] M2-01: Define normalized data types
+- [x] M2-01: Define normalized data types
-- [ ] M2-02: Define minimal Reddit API response structs
+- [x] M2-02: Define minimal Reddit API response structs
-- [ ] M2-03: Implement parser
+- [x] M2-03: Implement parser
-- [ ] M2-04: Implement deleted/removed filters
+- [x] M2-04: Implement deleted/removed filters
-- [ ] M2-05: Parser unit tests
+- [x] M2-05: Parser unit tests

 Milestone 3 Fetcher and Networking
-- [ ] M3-01: Build HTTP client
+- [x] M3-01: Build HTTP client
-- [ ] M3-02: Implement rate limiter
+- [x] M3-02: Implement rate limiter
-- [ ] M3-03: Implement backoff with jitter
+- [x] M3-03: Implement backoff with jitter
-- [ ] M3-04: URL builder for search.json
+- [x] M3-04: URL builder for search.json
-- [ ] M3-05: Implement fetchPage
+- [x] M3-05: Implement fetchPage
 - [ ] M3-06: Fetcher tests
 - [ ] M3-07: Implement metrics capture

 Milestone 4 Storage and Dedup
-- [ ] M4-01: Implement JSONL writer
+- [x] M4-01: Implement JSONL writer
-- [ ] M4-02: File naming and rotation
+- [x] M4-02: File naming and rotation
-- [ ] M4-03: Ensure output dir creation
+- [x] M4-03: Ensure output dir creation
-- [ ] M4-04: Implement dedup index
+- [x] M4-04: Implement dedup index
-- [ ] M4-05: Dedup persistence
+- [x] M4-05: Dedup persistence
 - [ ] M4-06: Storage unit tests

 Milestone 5 Controller and Orchestration
-- [ ] M5-01: Implement controller orchestrator
+- [x] M5-01: Implement controller orchestrator
-- [ ] M5-02: Pagination loop
+- [x] M5-02: Pagination loop
-- [ ] M5-03: Integrate fetcher→parser→storage
+- [x] M5-03: Integrate fetcher→parser→storage
-- [ ] M5-04: Progress reporting
+- [x] M5-04: Progress reporting
-- [ ] M5-05: Graceful shutdown
+- [x] M5-05: Graceful shutdown
-- [ ] M5-06: Summary report
+- [x] M5-06: Summary report
-- [ ] M5-07: Wire CLI entrypoint
+- [x] M5-07: Wire CLI entrypoint
 - [ ] M5-08: Error code taxonomy
 - [ ] M5-09: Controller integration test
@@ -70,5 +70,9 @@ Milestone 9 Verification
 Progress notes:
-- Created project skeleton and minimal main.go.
-- Next: implement logging + config and update main.go to use them.
+- Implemented project bootstrap, config loader, and logging.
+- Implemented types and parser with unit tests for parser.
+- Implemented fetcher client and page fetcher (rate limiting & Retry-After handling).
+- Implemented storage (JSONL writer with rotation) and dedup index (persistent append-only file).
+- Implemented controller orchestrator that coordinates fetching, parsing, deduping, and writing; supports graceful shutdown and basic retry/backoff.
+- Remaining: add unit tests for fetcher & storage, wire CLI to controller, and add integration tests and metrics.
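The storage layer referenced in these notes (JSONL writer, dedup index) is not part of this commit's diff. For orientation, a minimal sketch of a persistent append-only dedup index exposing the LoadDedup / Seen / Add / Close calls that the controller below relies on might look like the following; the file format and field names here are assumptions, not the repository's actual implementation.

```go
package storage

import (
	"bufio"
	"os"
	"sync"
)

// Dedup is an in-memory set of post IDs backed by an append-only file,
// so previously seen IDs survive restarts.
type Dedup struct {
	mu   sync.Mutex
	seen map[string]struct{}
	f    *os.File
}

// LoadDedup reads any IDs already stored at path and keeps the file open for appending.
func LoadDedup(path string) (*Dedup, error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
	if err != nil {
		return nil, err
	}
	d := &Dedup{seen: make(map[string]struct{}), f: f}
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		if id := sc.Text(); id != "" {
			d.seen[id] = struct{}{}
		}
	}
	if err := sc.Err(); err != nil {
		f.Close()
		return nil, err
	}
	return d, nil
}

// Seen reports whether id has been recorded before.
func (d *Dedup) Seen(id string) bool {
	d.mu.Lock()
	defer d.mu.Unlock()
	_, ok := d.seen[id]
	return ok
}

// Add records id in memory and appends it to the backing file.
func (d *Dedup) Add(id string) error {
	d.mu.Lock()
	defer d.mu.Unlock()
	if _, ok := d.seen[id]; ok {
		return nil
	}
	if _, err := d.f.WriteString(id + "\n"); err != nil {
		return err
	}
	d.seen[id] = struct{}{}
	return nil
}

// Close closes the backing file.
func (d *Dedup) Close() error { return d.f.Close() }
```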


@@ -6,9 +6,9 @@ import (
 "os"
 "os/signal"
 "syscall"
-"time"
 "github.com/yourname/reddit-scraper/internal/config"
+"github.com/yourname/reddit-scraper/internal/controller"
 "github.com/yourname/reddit-scraper/internal/logging"
 )
@@ -25,11 +25,11 @@ func main() {
 ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 defer cancel()
-// placeholder: in future wire up controller
-select {
-case <-time.After(1 * time.Second):
-logger.Info("done (placeholder)")
-case <-ctx.Done():
-logger.Info("cancelled")
-}
+ctrl := controller.New(cfg, logger)
+summary, err := ctrl.Run(ctx)
+if err != nil {
+logger.Error("run failed", "err", err)
+os.Exit(1)
+}
+logger.Info("run complete", "summary", summary)
 }

go.mod

@@ -1,3 +1,5 @@
 module github.com/yourname/reddit-scraper

 go 1.25.2
+
+require golang.org/x/time v0.14.0

go.sum Normal file

@@ -0,0 +1,2 @@
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=


@@ -0,0 +1,205 @@
package controller
import (
"context"
"errors"
"fmt"
"log/slog"
"math/rand"
"sync"
"time"
"github.com/yourname/reddit-scraper/internal/config"
"github.com/yourname/reddit-scraper/internal/fetcher"
"github.com/yourname/reddit-scraper/internal/parser"
"github.com/yourname/reddit-scraper/internal/storage"
"github.com/yourname/reddit-scraper/internal/types"
)
// Controller orchestrates fetching, parsing, deduplication, and storage.
type Controller struct {
cfg *config.Config
logger *slog.Logger
}
// New creates a controller instance.
func New(cfg *config.Config, logger *slog.Logger) *Controller {
return &Controller{cfg: cfg, logger: logger}
}
// Run performs the scraping job until it collects cfg.Limit posts or context is done.
func (c *Controller) Run(ctx context.Context) (types.Summary, error) {
lg := c.logger
lg.Info("controller starting",
"keyword", c.cfg.Keyword,
"limit", c.cfg.Limit,
"concurrency", c.cfg.Concurrency,
)
// Setup components
client := fetcher.NewClient(c.cfg.UserAgent, 30*time.Second, c.cfg.RateLimitDelay, c.cfg.Concurrency, c.cfg.RetryLimit)
dedup, err := storage.LoadDedup(c.cfg.DedupCachePath)
if err != nil {
return types.Summary{}, fmt.Errorf("loading dedup: %w", err)
}
defer dedup.Close()
writer, err := storage.NewJSONLWriter(c.cfg.Keyword, c.cfg.OutputDir, 10*1024*1024)
if err != nil {
return types.Summary{}, fmt.Errorf("creating writer: %w", err)
}
defer writer.Close()
var mu sync.Mutex
summary := types.Summary{}
startTime := time.Now()
// Channels
tasks := make(chan fetchTask)
results := make(chan fetchResult)
// Worker pool
var wg sync.WaitGroup
for i := 0; i < c.cfg.Concurrency; i++ {
wg.Add(1)
go func(workerID int) {
defer wg.Done()
for t := range tasks {
res, err := c.fetchWithRetries(ctx, client, t.query, t.after, t.limit)
if err != nil {
lg.Warn("worker fetch error", "worker", workerID, "err", err)
results <- fetchResult{err: err}
continue
}
results <- fetchResult{res: res}
}
}(i)
}
// Controller goroutine to manage pages and collection
var collectWg sync.WaitGroup
collectWg.Add(1)
go func() {
defer collectWg.Done()
defer close(results)
collected := 0
after := ""
for collected < c.cfg.Limit {
select {
case <-ctx.Done():
lg.Info("context canceled, stopping page dispatch")
return
default:
}
// dispatch a page task
tasks <- fetchTask{query: c.cfg.Keyword, after: after, limit: min(100, c.cfg.Limit-collected)}
// wait for a result and process it
fr := <-results
if fr.err != nil {
// on error, back off a bit and retry dispatch
lg.Warn("fetch result error", "err", fr.err)
time.Sleep(2 * time.Second)
continue
}
// process posts
for _, jp := range fr.res.Posts {
p, perr := parser.ParseJSONPost(jp)
if perr != nil {
lg.Debug("skipping post", "reason", perr)
continue
}
if dedup.Seen(p.ID) {
mu.Lock()
summary.SkippedDuplicates++
mu.Unlock()
continue
}
if err := writer.Write(p); err != nil {
lg.Warn("write error", "err", err)
continue
}
if err := dedup.Add(p.ID); err != nil {
lg.Warn("dedup add error", "err", err)
}
mu.Lock()
summary.SuccessfulPosts++
mu.Unlock()
collected++
if collected >= c.cfg.Limit {
break
}
}
summary.TotalRequests++
if fr.res.After == "" {
// no more pages
lg.Info("no after token, finishing")
break
}
after = fr.res.After
}
}()
// Wait until pages are dispatched/processed
// Close tasks to stop workers when we're done dispatching
collectWg.Wait()
close(tasks)
wg.Wait()
// finalize
duration := time.Since(startTime)
summary.DurationSec = duration.Seconds()
if summary.SuccessfulPosts > 0 {
summary.AvgLatencyMs = float64(summary.DurationSec*1000) / float64(summary.SuccessfulPosts)
}
lg.Info("controller finished", "summary", summary)
return summary, nil
}
// helper types
type fetchTask struct {
query string
after string
limit int
}
type fetchResult struct {
res fetcher.PageResult
err error
}
// internal fetchWithRetries handles retry attempts with exponential backoff + jitter
func (c *Controller) fetchWithRetries(ctx context.Context, client *fetcher.Client, query, after string, limit int) (fetcher.PageResult, error) {
var lastErr error
var res fetcher.PageResult
for attempt := 0; attempt <= c.cfg.RetryLimit; attempt++ {
if attempt > 0 {
// backoff: base * 2^attempt + jitter
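// e.g. attempts 1, 2, 3 wait roughly 1s, 2s, 4s, each plus up to 300ms of jitter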
d := time.Duration(500*(1<<attempt)) * time.Millisecond
j := time.Duration(rand.Intn(300)) * time.Millisecond
time.Sleep(d + j)
}
var err error
res, err = client.FetchPage(ctx, query, after, limit)
if err == nil {
return res, nil
}
lastErr = err
// if context canceled, abort
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
break
}
}
return res, fmt.Errorf("fetch failed after retries: %w", lastErr)
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
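types.Summary is defined in an earlier commit and not shown in this diff. Judging only from the fields Run populates above, its shape is presumably close to the sketch below (an inference, not the actual definition):

```go
package types

// Summary aggregates run statistics reported by the controller.
// Field set inferred from controller.Run; the real struct may carry more.
type Summary struct {
	TotalRequests     int     // pages requested
	SuccessfulPosts   int     // posts written to storage
	SkippedDuplicates int     // posts dropped by the dedup index
	DurationSec       float64 // wall-clock duration of the run
	AvgLatencyMs      float64 // DurationSec*1000 / SuccessfulPosts
}
```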


@@ -0,0 +1,28 @@
package fetcher
import (
"net/http"
"time"
"golang.org/x/time/rate"
)
// Client encapsulates HTTP client behavior for fetching Reddit JSON endpoints.
type Client struct {
httpClient *http.Client
userAgent string
limiter *rate.Limiter
retryLimit int
}
// NewClient constructs a fetcher Client.
// - userAgent: User-Agent header
// - timeout: HTTP client timeout
// - rateDelay: minimum duration between requests (rate limiter)
// - burst: burst size (usually equal to concurrency)
// - retryLimit: max retries per request
func NewClient(userAgent string, timeout time.Duration, rateDelay time.Duration, burst int, retryLimit int) *Client {
hc := &http.Client{Timeout: timeout}
lim := rate.NewLimiter(rate.Every(rateDelay), burst)
return &Client{httpClient: hc, userAgent: userAgent, limiter: lim, retryLimit: retryLimit}
}
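A usage sketch for NewClient; the concrete values here are illustrative, not defaults taken from the config package:

```go
package main

import (
	"time"

	"github.com/yourname/reddit-scraper/internal/fetcher"
)

func main() {
	// One request every 2s, bursts up to 4 (matching a concurrency of 4), 3 retries.
	client := fetcher.NewClient(
		"reddit-scraper/0.1 (by u/yourname)", // User-Agent
		30*time.Second,                       // HTTP timeout
		2*time.Second,                        // minimum delay between requests
		4,                                    // burst size
		3,                                    // retry limit
	)
	_ = client // hand off to the controller or call FetchPage directly
}
```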

internal/fetcher/fetcher.go Normal file

@@ -0,0 +1,100 @@
package fetcher
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"time"
)
// PageResult contains raw page data and the after token.
type PageResult struct {
Posts []JSONPost
After string
Latency time.Duration
}
// buildSearchURL constructs the Reddit search.json URL.
func buildSearchURL(query string, after string, limit int) (string, error) {
u, err := url.Parse("https://www.reddit.com/search.json")
if err != nil {
return "", err
}
q := u.Query()
q.Set("q", query)
q.Set("sort", "new")
q.Set("type", "link")
q.Set("limit", fmt.Sprintf("%d", limit))
q.Set("restrict_sr", "false")
if after != "" {
q.Set("after", after)
}
u.RawQuery = q.Encode()
return u.String(), nil
}
// FetchPage fetches a single search.json page and returns PageResult.
func (c *Client) FetchPage(ctx context.Context, query string, after string, limit int) (PageResult, error) {
var res PageResult
urlStr, err := buildSearchURL(query, after, limit)
if err != nil {
return res, err
}
// wait for rate limiter
if c.limiter != nil {
if err := c.limiter.Wait(ctx); err != nil {
return res, err
}
}
start := time.Now()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
if err != nil {
return res, err
}
req.Header.Set("User-Agent", c.userAgent)
req.Header.Set("Accept", "application/json")
resp, err := c.httpClient.Do(req)
if err != nil {
return res, err
}
defer resp.Body.Close()
res.Latency = time.Since(start)
if resp.StatusCode == http.StatusTooManyRequests {
// honor Retry-After if given (seconds or HTTP-date)
if ra := resp.Header.Get("Retry-After"); ra != "" {
// Try seconds first
if secs, perr := strconv.Atoi(ra); perr == nil {
time.Sleep(time.Duration(secs) * time.Second)
} else if d, derr := http.ParseTime(ra); derr == nil {
// If HTTP-date, compute duration until then
dur := time.Until(d)
if dur > 0 {
time.Sleep(dur)
}
}
}
return res, fmt.Errorf("rate limited: 429")
}
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(resp.Body)
return res, fmt.Errorf("http %d: %s", resp.StatusCode, string(body))
}
var api APIResponse
dec := json.NewDecoder(resp.Body)
if err := dec.Decode(&api); err != nil {
return res, err
}
for _, child := range api.Data.Children {
res.Posts = append(res.Posts, child.Data)
}
res.After = api.Data.After
return res, nil
}
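M3-06 (fetcher tests) is still open; one small starting point, sketched here as a suggestion rather than part of this commit, is a unit test of buildSearchURL in the same package. Testing FetchPage end to end would likely require making the base URL injectable on Client so an httptest server can stand in for reddit.com.

```go
package fetcher

import (
	"net/url"
	"testing"
)

func TestBuildSearchURL(t *testing.T) {
	got, err := buildSearchURL("golang", "t3_abc", 50)
	if err != nil {
		t.Fatalf("buildSearchURL returned error: %v", err)
	}
	u, err := url.Parse(got)
	if err != nil {
		t.Fatalf("result is not a valid URL: %v", err)
	}
	q := u.Query()
	if q.Get("q") != "golang" || q.Get("after") != "t3_abc" || q.Get("limit") != "50" {
		t.Errorf("unexpected query parameters: %s", u.RawQuery)
	}
	if q.Get("sort") != "new" || q.Get("restrict_sr") != "false" {
		t.Errorf("missing default parameters: %s", u.RawQuery)
	}
}
```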