reddit-scrapper/internal/fetcher/fetcher.go
2025-10-14 09:19:40 +07:00

101 lines
2.4 KiB
Go

package fetcher
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"time"
)
// PageResult contains raw page data and the after token.
type PageResult struct {
Posts []JSONPost // decoded posts from the page's children (JSONPost is declared elsewhere in this package)
After string // Reddit pagination token for the next page; empty when there are no further pages
Latency time.Duration // wall-clock time from issuing the HTTP request until the response headers arrived
}
// buildSearchURL constructs the Reddit search.json URL for the given query.
// It always requests link results sorted by newest first, limited to limit
// items, and appends the pagination token when after is non-empty.
// Query parameters are percent-encoded by url.Values, so query may contain
// spaces or other special characters.
func buildSearchURL(query string, after string, limit int) (string, error) {
	u, err := url.Parse("https://www.reddit.com/search.json")
	if err != nil {
		return "", err
	}
	q := u.Query()
	q.Set("q", query)
	q.Set("sort", "new")
	q.Set("type", "link")
	// strconv.Itoa avoids the fmt boxing/reflection overhead of Sprintf
	// for a plain int-to-string conversion.
	q.Set("limit", strconv.Itoa(limit))
	q.Set("restrict_sr", "false")
	if after != "" {
		q.Set("after", after)
	}
	u.RawQuery = q.Encode()
	return u.String(), nil
}
// FetchPage fetches a single search.json page and returns PageResult.
//
// It waits on the client's rate limiter (when configured) before issuing the
// request, records the request latency, and decodes the JSON payload into
// posts plus the next "after" pagination token.
//
// On HTTP 429 it honors the Retry-After header (seconds or HTTP-date form),
// waiting in a context-aware manner before returning a rate-limit error so
// the caller can decide whether to retry. Any other non-2xx status returns
// an error containing a bounded snippet of the response body.
func (c *Client) FetchPage(ctx context.Context, query string, after string, limit int) (PageResult, error) {
	var res PageResult
	urlStr, err := buildSearchURL(query, after, limit)
	if err != nil {
		return res, err
	}
	// Wait for the client-side rate limiter; aborts early on ctx cancel.
	if c.limiter != nil {
		if err := c.limiter.Wait(ctx); err != nil {
			return res, err
		}
	}
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return res, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "application/json")
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return res, err
	}
	defer resp.Body.Close()
	res.Latency = time.Since(start)
	if resp.StatusCode == http.StatusTooManyRequests {
		// Honor Retry-After if given, but wait via a timer + select so a
		// canceled context interrupts the wait (the previous time.Sleep
		// blocked unconditionally and ignored ctx).
		if wait := parseRetryAfter(resp.Header.Get("Retry-After")); wait > 0 {
			timer := time.NewTimer(wait)
			defer timer.Stop()
			select {
			case <-ctx.Done():
				return res, ctx.Err()
			case <-timer.C:
			}
		}
		return res, fmt.Errorf("rate limited: 429")
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Cap the snippet so an arbitrarily large error page cannot
		// balloon memory; the read error is deliberately ignored because
		// the body text is only diagnostic context.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return res, fmt.Errorf("http %d: %s", resp.StatusCode, string(body))
	}
	var api APIResponse
	if err := json.NewDecoder(resp.Body).Decode(&api); err != nil {
		return res, err
	}
	for _, child := range api.Data.Children {
		res.Posts = append(res.Posts, child.Data)
	}
	res.After = api.Data.After
	return res, nil
}

// parseRetryAfter interprets a Retry-After header value as either an integer
// number of seconds or an HTTP-date, returning the duration to wait.
// It returns 0 when the value is empty, unparseable, or already in the past.
func parseRetryAfter(v string) time.Duration {
	if v == "" {
		return 0
	}
	if secs, err := strconv.Atoi(v); err == nil {
		if secs > 0 {
			return time.Duration(secs) * time.Second
		}
		return 0
	}
	if t, err := http.ParseTime(v); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}
	return 0
}