first commit
This commit is contained in:
parent 7256e17552
commit 0d6556f7c9

TODO.md (60 lines changed)
@@ -9,44 +9,44 @@ Milestone 0 – Bootstrap
 - [x] M0-04: Create Makefile (pending)

 Milestone 1 – Config and Logging
-- [ ] M1-01: Implement logging setup using log/slog
+- [x] M1-01: Implement logging setup using log/slog
-- [ ] M1-02: Implement configuration loader: env + flags + .env
+- [x] M1-02: Implement configuration loader: env + flags + .env
-- [ ] M1-03: Define config schema and defaults
+- [x] M1-03: Define config schema and defaults
-- [ ] M1-04: Add config validation
+- [x] M1-04: Add config validation
 - [ ] M1-05: Unit tests for config parsing precedence

 Milestone 2 – Types and Parser
-- [ ] M2-01: Define normalized data types
+- [x] M2-01: Define normalized data types
-- [ ] M2-02: Define minimal Reddit API response structs
+- [x] M2-02: Define minimal Reddit API response structs
-- [ ] M2-03: Implement parser
+- [x] M2-03: Implement parser
-- [ ] M2-04: Implement deleted/removed filters
+- [x] M2-04: Implement deleted/removed filters
-- [ ] M2-05: Parser unit tests
+- [x] M2-05: Parser unit tests

 Milestone 3 – Fetcher and Networking
-- [ ] M3-01: Build HTTP client
+- [x] M3-01: Build HTTP client
-- [ ] M3-02: Implement rate limiter
+- [x] M3-02: Implement rate limiter
-- [ ] M3-03: Implement backoff with jitter
+- [x] M3-03: Implement backoff with jitter
-- [ ] M3-04: URL builder for search.json
+- [x] M3-04: URL builder for search.json
-- [ ] M3-05: Implement fetchPage
+- [x] M3-05: Implement fetchPage
 - [ ] M3-06: Fetcher tests
 - [ ] M3-07: Implement metrics capture

 Milestone 4 – Storage and Dedup
-- [ ] M4-01: Implement JSONL writer
+- [x] M4-01: Implement JSONL writer
-- [ ] M4-02: File naming and rotation
+- [x] M4-02: File naming and rotation
-- [ ] M4-03: Ensure output dir creation
+- [x] M4-03: Ensure output dir creation
-- [ ] M4-04: Implement dedup index
+- [x] M4-04: Implement dedup index
-- [ ] M4-05: Dedup persistence
+- [x] M4-05: Dedup persistence
 - [ ] M4-06: Storage unit tests

 Milestone 5 – Controller and Orchestration
-- [ ] M5-01: Implement controller orchestrator
+- [x] M5-01: Implement controller orchestrator
-- [ ] M5-02: Pagination loop
+- [x] M5-02: Pagination loop
-- [ ] M5-03: Integrate fetcher→parser→storage
+- [x] M5-03: Integrate fetcher→parser→storage
-- [ ] M5-04: Progress reporting
+- [x] M5-04: Progress reporting
-- [ ] M5-05: Graceful shutdown
+- [x] M5-05: Graceful shutdown
-- [ ] M5-06: Summary report
+- [x] M5-06: Summary report
-- [ ] M5-07: Wire CLI entrypoint
+- [x] M5-07: Wire CLI entrypoint
 - [ ] M5-08: Error code taxonomy
 - [ ] M5-09: Controller integration test

@@ -70,5 +70,9 @@ Milestone 9 – Verification


 Progress notes:
-- Created project skeleton and minimal main.go.
-- Next: implement logging + config and update main.go to use them.
+- Implemented project bootstrap, config loader, and logging.
+- Implemented types and parser with unit tests for parser.
+- Implemented fetcher client and page fetcher (rate limiting & Retry-After handling).
+- Implemented storage (JSONL writer with rotation) and dedup index (persistent append-only file).
+- Implemented controller orchestrator that coordinates fetching, parsing, deduping, and writing; supports graceful shutdown and basic retry/backoff.
+- Remaining: add unit tests for fetcher & storage, wire CLI to controller, and add integration tests and metrics.
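Note: the storage and dedup code referenced in these progress notes is not part of the diff shown on this page. The following is an illustrative sketch only; the names `storage.LoadDedup`, `Seen`, `Add`, and `Close` match how controller.go below calls them, but the repo's actual implementation may differ.

```go
package storage

import (
	"bufio"
	"os"
)

// Dedup is an in-memory set of post IDs backed by an append-only file,
// so IDs collected in earlier runs survive restarts. (Sketch, not the
// repo's actual implementation.)
type Dedup struct {
	seen map[string]struct{}
	f    *os.File
}

// LoadDedup reads any existing IDs from path and opens it for appending.
func LoadDedup(path string) (*Dedup, error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o644)
	if err != nil {
		return nil, err
	}
	d := &Dedup{seen: make(map[string]struct{}), f: f}
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		d.seen[sc.Text()] = struct{}{}
	}
	return d, sc.Err()
}

// Seen reports whether id has already been recorded.
func (d *Dedup) Seen(id string) bool {
	_, ok := d.seen[id]
	return ok
}

// Add records id in memory and appends it to the backing file.
func (d *Dedup) Add(id string) error {
	d.seen[id] = struct{}{}
	_, err := d.f.WriteString(id + "\n")
	return err
}

// Close closes the backing file.
func (d *Dedup) Close() error { return d.f.Close() }
```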
main.go
@@ -6,9 +6,9 @@ import (
 	"os"
 	"os/signal"
 	"syscall"
-	"time"

 	"github.com/yourname/reddit-scraper/internal/config"
+	"github.com/yourname/reddit-scraper/internal/controller"
 	"github.com/yourname/reddit-scraper/internal/logging"
 )

@@ -25,11 +25,11 @@ func main() {
 	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 	defer cancel()

-	// placeholder: in future wire up controller
-	select {
-	case <-time.After(1 * time.Second):
-		logger.Info("done (placeholder)")
-	case <-ctx.Done():
-		logger.Info("cancelled")
-	}
+	ctrl := controller.New(cfg, logger)
+	summary, err := ctrl.Run(ctx)
+	if err != nil {
+		logger.Error("run failed", "err", err)
+		os.Exit(1)
+	}
+	logger.Info("run complete", "summary", summary)
 }
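(Aside: removing the placeholder `select` is also what makes the `time` import unnecessary, which is why it disappears from the import hunk above. After this change, shutdown is driven entirely by `signal.NotifyContext`, whose cancellation `controller.Run` observes through `ctx.Done()`; see controller.go below.)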
go.mod (2 lines changed)
@@ -1,3 +1,5 @@
 module github.com/yourname/reddit-scraper

 go 1.25.2
+
+require golang.org/x/time v0.14.0
go.sum (new file, 2 lines)
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
internal/controller/controller.go (new file, 205 lines)

package controller

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"math/rand"
	"sync"
	"time"

	"github.com/yourname/reddit-scraper/internal/config"
	"github.com/yourname/reddit-scraper/internal/fetcher"
	"github.com/yourname/reddit-scraper/internal/parser"
	"github.com/yourname/reddit-scraper/internal/storage"
	"github.com/yourname/reddit-scraper/internal/types"
)

// Controller orchestrates fetching, parsing, deduplication, and storage.
type Controller struct {
	cfg    *config.Config
	logger *slog.Logger
}

// New creates a controller instance.
func New(cfg *config.Config, logger *slog.Logger) *Controller {
	return &Controller{cfg: cfg, logger: logger}
}

// Run performs the scraping job until it collects cfg.Limit posts or context is done.
func (c *Controller) Run(ctx context.Context) (types.Summary, error) {
	lg := c.logger
	lg.Info("controller starting",
		"keyword", c.cfg.Keyword,
		"limit", c.cfg.Limit,
		"concurrency", c.cfg.Concurrency,
	)

	// Setup components
	client := fetcher.NewClient(c.cfg.UserAgent, 30*time.Second, c.cfg.RateLimitDelay, c.cfg.Concurrency, c.cfg.RetryLimit)
	dedup, err := storage.LoadDedup(c.cfg.DedupCachePath)
	if err != nil {
		return types.Summary{}, fmt.Errorf("loading dedup: %w", err)
	}
	defer dedup.Close()

	writer, err := storage.NewJSONLWriter(c.cfg.Keyword, c.cfg.OutputDir, 10*1024*1024)
	if err != nil {
		return types.Summary{}, fmt.Errorf("creating writer: %w", err)
	}
	defer writer.Close()

	var mu sync.Mutex
	summary := types.Summary{}
	startTime := time.Now()

	// Channels
	tasks := make(chan fetchTask)
	results := make(chan fetchResult)

	// Worker pool
	var wg sync.WaitGroup
	for i := 0; i < c.cfg.Concurrency; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for t := range tasks {
				res, err := c.fetchWithRetries(ctx, client, t.query, t.after, t.limit)
				if err != nil {
					lg.Warn("worker fetch error", "worker", workerID, "err", err)
					results <- fetchResult{err: err}
					continue
				}
				results <- fetchResult{res: res}
			}
		}(i)
	}

	// Controller goroutine to manage pages and collection
	var collectWg sync.WaitGroup
	collectWg.Add(1)
	go func() {
		defer collectWg.Done()
		defer close(results)

		collected := 0
		after := ""
		for collected < c.cfg.Limit {
			select {
			case <-ctx.Done():
				lg.Info("context canceled, stopping page dispatch")
				return
			default:
			}

			// dispatch a page task
			tasks <- fetchTask{query: c.cfg.Keyword, after: after, limit: min(100, c.cfg.Limit-collected)}

			// wait for a result and process it
			fr := <-results
			if fr.err != nil {
				// on error, back off a bit and retry dispatch
				lg.Warn("fetch result error", "err", fr.err)
				time.Sleep(2 * time.Second)
				continue
			}

			// process posts
			for _, jp := range fr.res.Posts {
				p, perr := parser.ParseJSONPost(jp)
				if perr != nil {
					lg.Debug("skipping post", "reason", perr)
					continue
				}
				if dedup.Seen(p.ID) {
					mu.Lock()
					summary.SkippedDuplicates++
					mu.Unlock()
					continue
				}
				if err := writer.Write(p); err != nil {
					lg.Warn("write error", "err", err)
					continue
				}
				if err := dedup.Add(p.ID); err != nil {
					lg.Warn("dedup add error", "err", err)
				}
				mu.Lock()
				summary.SuccessfulPosts++
				mu.Unlock()
				collected++
				if collected >= c.cfg.Limit {
					break
				}
			}

			summary.TotalRequests++
			if fr.res.After == "" {
				// no more pages
				lg.Info("no after token, finishing")
				break
			}
			after = fr.res.After
		}
	}()

	// Wait until pages are dispatched/processed
	// Close tasks to stop workers when we're done dispatching
	collectWg.Wait()
	close(tasks)
	wg.Wait()

	// finalize
	duration := time.Since(startTime)
	summary.DurationSec = duration.Seconds()
	if summary.SuccessfulPosts > 0 {
		summary.AvgLatencyMs = float64(summary.DurationSec*1000) / float64(summary.SuccessfulPosts)
	}
	lg.Info("controller finished", "summary", summary)
	return summary, nil
}

// helper types
type fetchTask struct {
	query string
	after string
	limit int
}

type fetchResult struct {
	res fetcher.PageResult
	err error
}

// internal fetchWithRetries handles retry attempts with exponential backoff + jitter
func (c *Controller) fetchWithRetries(ctx context.Context, client *fetcher.Client, query, after string, limit int) (fetcher.PageResult, error) {
	var lastErr error
	var res fetcher.PageResult
	for attempt := 0; attempt <= c.cfg.RetryLimit; attempt++ {
		if attempt > 0 {
			// backoff: base * 2^attempt + jitter
			d := time.Duration(500*(1<<attempt)) * time.Millisecond
			j := time.Duration(rand.Intn(300)) * time.Millisecond
			time.Sleep(d + j)
		}
		var err error
		res, err = client.FetchPage(ctx, query, after, limit)
		if err == nil {
			return res, nil
		}
		lastErr = err
		// if context canceled, abort
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			break
		}
	}
	return res, fmt.Errorf("fetch failed after retries: %w", lastErr)
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}
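For reference, the backoff in `fetchWithRetries` sleeps 500 ms * 2^attempt plus up to 300 ms of random jitter, i.e. roughly 1.0 s, 2.0 s, and 4.0 s before retry attempts 1, 2, and 3. A standalone sketch of the same schedule (not part of the commit):

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	// Mirrors the controller's backoff: base 500ms doubled each attempt,
	// plus 0-299ms of jitter.
	for attempt := 1; attempt <= 3; attempt++ {
		d := time.Duration(500*(1<<attempt)) * time.Millisecond
		j := time.Duration(rand.Intn(300)) * time.Millisecond
		fmt.Printf("attempt %d: sleep %v\n", attempt, d+j)
	}
}
```

The jitter keeps concurrent workers from retrying in lockstep after a shared failure such as a 429.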
internal/fetcher/client.go (new file, 28 lines)

package fetcher

import (
	"net/http"
	"time"

	"golang.org/x/time/rate"
)

// Client encapsulates HTTP client behavior for fetching Reddit JSON endpoints.
type Client struct {
	httpClient *http.Client
	userAgent  string
	limiter    *rate.Limiter
	retryLimit int
}

// NewClient constructs a fetcher Client.
// - userAgent: User-Agent header
// - timeout: HTTP client timeout
// - rateDelay: minimum duration between requests (rate limiter)
// - burst: burst size (usually equal to concurrency)
// - retryLimit: max retries per request
func NewClient(userAgent string, timeout time.Duration, rateDelay time.Duration, burst int, retryLimit int) *Client {
	hc := &http.Client{Timeout: timeout}
	lim := rate.NewLimiter(rate.Every(rateDelay), burst)
	return &Client{httpClient: hc, userAgent: userAgent, limiter: lim, retryLimit: retryLimit}
}
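In `NewClient`, `rate.Every(rateDelay)` converts a minimum inter-request delay into an events-per-second limit, and `burst` lets that many requests through back-to-back before the spacing kicks in. A small demonstration with illustrative values (one request per second, burst of 3; not the project's actual configuration):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// One request per second on average, bursts of up to 3.
	lim := rate.NewLimiter(rate.Every(1*time.Second), 3)
	start := time.Now()
	for i := 0; i < 5; i++ {
		_ = lim.Wait(context.Background()) // blocks until a token is available
		fmt.Printf("request %d at %v\n", i, time.Since(start).Round(100*time.Millisecond))
	}
}
```

A fresh limiter starts with a full token bucket, so the first three `Wait` calls return immediately and later calls are spaced about one second apart.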
internal/fetcher/fetcher.go (new file, 100 lines)

package fetcher

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strconv"
	"time"
)

// PageResult contains raw page data and the after token.
type PageResult struct {
	Posts   []JSONPost
	After   string
	Latency time.Duration
}

// buildSearchURL constructs the Reddit search.json URL.
func buildSearchURL(query string, after string, limit int) (string, error) {
	u, err := url.Parse("https://www.reddit.com/search.json")
	if err != nil {
		return "", err
	}
	q := u.Query()
	q.Set("q", query)
	q.Set("sort", "new")
	q.Set("type", "link")
	q.Set("limit", fmt.Sprintf("%d", limit))
	q.Set("restrict_sr", "false")
	if after != "" {
		q.Set("after", after)
	}
	u.RawQuery = q.Encode()
	return u.String(), nil
}

// FetchPage fetches a single search.json page and returns PageResult.
func (c *Client) FetchPage(ctx context.Context, query string, after string, limit int) (PageResult, error) {
	var res PageResult
	urlStr, err := buildSearchURL(query, after, limit)
	if err != nil {
		return res, err
	}
	// wait for rate limiter
	if c.limiter != nil {
		if err := c.limiter.Wait(ctx); err != nil {
			return res, err
		}
	}
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return res, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return res, err
	}
	defer resp.Body.Close()
	res.Latency = time.Since(start)

	if resp.StatusCode == http.StatusTooManyRequests {
		// honor Retry-After if given (seconds or HTTP-date)
		if ra := resp.Header.Get("Retry-After"); ra != "" {
			// Try seconds first
			if secs, perr := strconv.Atoi(ra); perr == nil {
				time.Sleep(time.Duration(secs) * time.Second)
			} else if d, derr := http.ParseTime(ra); derr == nil {
				// If HTTP-date, compute duration until then
				dur := time.Until(d)
				if dur > 0 {
					time.Sleep(dur)
				}
			}
		}
		return res, fmt.Errorf("rate limited: 429")
	}

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		body, _ := io.ReadAll(resp.Body)
		return res, fmt.Errorf("http %d: %s", resp.StatusCode, string(body))
	}

	var api APIResponse
	dec := json.NewDecoder(resp.Body)
	if err := dec.Decode(&api); err != nil {
		return res, err
	}
	for _, child := range api.Data.Children {
		res.Posts = append(res.Posts, child.Data)
	}
	res.After = api.Data.After
	return res, nil
}
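`FetchPage` decodes into `APIResponse` and `JSONPost`, which this commit view does not show (they are the "minimal Reddit API response structs" of TODO item M2-02). Judging from the accesses above (`api.Data.After`, `api.Data.Children`, `child.Data`) and the shape of Reddit's listing JSON, a minimal sketch could be as follows; the `JSONPost` field set is an assumption, not the repo's actual definition:

```go
package fetcher

// Sketch of the listing envelope returned by search.json:
//   {"data": {"after": "...", "children": [{"data": { ...post... }}]}}
// Only APIResponse's shape is implied by FetchPage; JSONPost's fields are assumed.

type APIResponse struct {
	Data struct {
		After    string `json:"after"`
		Children []struct {
			Data JSONPost `json:"data"`
		} `json:"children"`
	} `json:"data"`
}

type JSONPost struct {
	ID         string  `json:"id"`
	Title      string  `json:"title"`
	Author     string  `json:"author"`
	Subreddit  string  `json:"subreddit"`
	Permalink  string  `json:"permalink"`
	CreatedUTC float64 `json:"created_utc"`
}
```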