From 2dfe929b94a0f726fd080e1a6737c3d979194710 Mon Sep 17 00:00:00 2001
From: Sirin Puenggun
Date: Mon, 13 Oct 2025 23:42:36 +0700
Subject: [PATCH] feat: add types, fetcher response types, and parser conversion

---
Review note: the abbreviated blob ids on the index lines for response.go and
parser.go were produced for an earlier revision of those files.

 internal/fetcher/response.go | 35 ++++++++++++++++++++++++++++
 internal/parser/parser.go    | 45 ++++++++++++++++++++++++++++++++++++
 internal/types/post.go       | 24 +++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 internal/fetcher/response.go
 create mode 100644 internal/parser/parser.go
 create mode 100644 internal/types/post.go

diff --git a/internal/fetcher/response.go b/internal/fetcher/response.go
new file mode 100644
index 0000000..d7aab1b
--- /dev/null
+++ b/internal/fetcher/response.go
@@ -0,0 +1,35 @@
+package fetcher
+
+// Minimal structs to decode Reddit search.json responses we care about.
+// We only define the fields we need.
+
+// APIResponse is the top-level envelope of a search.json response.
+// The posts are carried in Data.Children, one JSONPost per child.
+type APIResponse struct {
+	Kind string `json:"kind"`
+	Data struct {
+		Modhash  string `json:"modhash"`
+		Dist     int    `json:"dist"`
+		Children []struct {
+			Kind string   `json:"kind"`
+			Data JSONPost `json:"data"`
+		} `json:"children"`
+		After  string `json:"after"`
+		Before string `json:"before"`
+	} `json:"data"`
+}
+
+// JSONPost contains the fields extracted from each child.data in Reddit response.
+// It mirrors a subset of Reddit's API.
+type JSONPost struct {
+	ID          string  `json:"id"`
+	Subreddit   string  `json:"subreddit"`
+	Title       string  `json:"title"`
+	Author      string  `json:"author"`
+	CreatedUTC  float64 `json:"created_utc"`
+	Score       int     `json:"score"`
+	NumComments int     `json:"num_comments"`
+	Selftext    string  `json:"selftext"`
+	URL         string  `json:"url"`
+	Permalink   string  `json:"permalink"`
+}
diff --git a/internal/parser/parser.go b/internal/parser/parser.go
new file mode 100644
index 0000000..dea90ab
--- /dev/null
+++ b/internal/parser/parser.go
@@ -0,0 +1,45 @@
+package parser
+
+import (
+	"fmt"
+
+	"github.com/yourname/reddit-scraper/internal/fetcher"
+	"github.com/yourname/reddit-scraper/internal/types"
+)
+
+// ParseJSONPost converts a fetcher.JSONPost into a normalized types.Post.
+// It validates mandatory fields and returns an error if the post should be skipped.
+// An empty author is stored as "[deleted]".
+func ParseJSONPost(j fetcher.JSONPost) (types.Post, error) {
+	var p types.Post
+	if j.ID == "" {
+		return p, fmt.Errorf("missing id")
+	}
+	if j.Subreddit == "" {
+		return p, fmt.Errorf("missing subreddit")
+	}
+	if j.Title == "" {
+		return p, fmt.Errorf("missing title")
+	}
+	// Written as a negated range test so NaN is rejected too; values at or beyond
+	// 2^63 are refused because their int64 conversion is implementation-dependent.
+	if !(j.CreatedUTC > 0 && j.CreatedUTC < 1<<63) {
+		return p, fmt.Errorf("invalid created_utc")
+	}
+
+	p.ID = j.ID
+	p.Subreddit = j.Subreddit
+	p.Title = j.Title
+	p.Author = j.Author
+	if p.Author == "" {
+		p.Author = "[deleted]"
+	}
+	p.CreatedUTC = int64(j.CreatedUTC)
+	p.Score = j.Score
+	p.NumComments = j.NumComments
+	p.Content = j.Selftext
+	p.URL = j.URL
+	p.Permalink = j.Permalink
+
+	return p, nil
+}
diff --git a/internal/types/post.go b/internal/types/post.go
new file mode 100644
index 0000000..86c9cca
--- /dev/null
+++ b/internal/types/post.go
@@ -0,0 +1,24 @@
+package types
+
+// Post represents the normalized Reddit post schema for JSONL output.
+type Post struct {
+	ID          string `json:"id"`
+	Subreddit   string `json:"subreddit"`
+	Title       string `json:"title"`
+	Author      string `json:"author"`
+	CreatedUTC  int64  `json:"created_utc"`
+	Score       int    `json:"score"`
+	NumComments int    `json:"num_comments"`
+	Content     string `json:"content"`
+	URL         string `json:"url"`
+	Permalink   string `json:"permalink"`
+}
+
+// Summary holds run summary metrics.
+type Summary struct {
+	TotalRequests     int     `json:"total_requests"`
+	SuccessfulPosts   int     `json:"successful_posts"`
+	SkippedDuplicates int     `json:"skipped_duplicates"`
+	AvgLatencyMs      float64 `json:"avg_latency_ms"`
+	DurationSec       float64 `json:"duration_sec"`
+}