feat: add types, fetcher response types, and parser conversion

This commit is contained in:
Sirin Puenggun 2025-10-13 23:42:36 +07:00
parent b5d4526d91
commit 2dfe929b94
3 changed files with 117 additions and 0 deletions

View File

@ -0,0 +1,50 @@
package fetcher
// APIResponse is the top-level envelope used to decode a Reddit search.json
// response. It is deliberately minimal: only the fields this project needs
// are declared.
type APIResponse struct {
Kind string `json:"kind"`
// Data is the payload of the envelope; it carries the result entries
// together with the surrounding listing metadata.
Data struct {
Modhash string `json:"modhash"`
// NOTE(review): Dist presumably reports how many entries Children holds — confirm.
Dist int `json:"dist"`
// Children holds one wrapper per result. Each wrapper has its own Kind
// and a Data object that is decoded as a JSONPost.
Children []struct {
Kind string `json:"kind"`
Data JSONPost `json:"data"`
} `json:"children"`
// NOTE(review): After and Before look like pagination cursors — confirm
// against the API documentation before relying on them.
After string `json:"after"`
Before string `json:"before"`
} `json:"data"`
}
// JSONPost holds the fields extracted from each child.data object in a Reddit
// response. It mirrors a subset of Reddit's API; the JSON tags are the keys
// read from the response.
type JSONPost struct {
ID string `json:"id"`
Subreddit string `json:"subreddit"`
Title string `json:"title"`
Author string `json:"author"`
// CreatedUTC is decoded as a float64 rather than an integer.
// NOTE(review): presumably because the API emits a fractional number here — confirm.
CreatedUTC float64 `json:"created_utc"`
Score int `json:"score"`
NumComments int `json:"num_comments"`
Selftext string `json:"selftext"`
URL string `json:"url"`
Permalink string `json:"permalink"`
}
// jsonPost holds the fields extracted from each child.data object in a Reddit
// response. It mirrors a subset of Reddit's API.
//
// NOTE(review): this unexported type is a field-for-field copy of the exported
// JSONPost declared in this file (same names, types, and JSON tags). Nothing
// in the visible code refers to it; if the rest of the package does not use it
// either, remove it so the two definitions cannot drift apart.
type jsonPost struct {
ID string `json:"id"`
Subreddit string `json:"subreddit"`
Title string `json:"title"`
Author string `json:"author"`
CreatedUTC float64 `json:"created_utc"`
Score int `json:"score"`
NumComments int `json:"num_comments"`
Selftext string `json:"selftext"`
URL string `json:"url"`
Permalink string `json:"permalink"`
}

43
internal/parser/parser.go Normal file
View File

@ -0,0 +1,43 @@
package parser
import (
"fmt"
"github.com/yourname/reddit-scraper/internal/fetcher"
"github.com/yourname/reddit-scraper/internal/types"
)
// ParseJSONPost converts a fetcher.JSONPost into a normalized types.Post.
//
// The post is rejected, returning the zero types.Post and a non-nil error,
// when its id, subreddit, or title is empty, or when created_utc is not a
// positive, finite number that fits in an int64. Callers are expected to skip
// rejected posts.
//
// On success every field is copied across with three adjustments: an empty
// author is replaced by "[deleted]", created_utc is truncated toward zero to
// an integer, and Selftext is stored as Content.
func ParseJSONPost(j fetcher.JSONPost) (types.Post, error) {
	// 2^63 is exactly representable as a float64 and is the smallest value
	// that no longer fits in an int64.
	const createdUTCLimit = 1 << 63

	switch {
	case j.ID == "":
		return types.Post{}, fmt.Errorf("missing id")
	case j.Subreddit == "":
		return types.Post{}, fmt.Errorf("missing subreddit")
	case j.Title == "":
		return types.Post{}, fmt.Errorf("missing title")
	// Phrased as a negated "in range" test on purpose: every comparison with
	// NaN is false, so NaN fails the inner test and is rejected here, as are
	// +Inf and anything >= 2^63. Converting such values to int64 would give
	// an implementation-dependent result instead of a usable timestamp.
	case !(j.CreatedUTC > 0 && j.CreatedUTC < createdUTCLimit):
		return types.Post{}, fmt.Errorf("invalid created_utc")
	}

	// Posts without an author are kept and labelled with a placeholder
	// instead of being dropped.
	author := j.Author
	if author == "" {
		author = "[deleted]"
	}

	return types.Post{
		ID:          j.ID,
		Subreddit:   j.Subreddit,
		Title:       j.Title,
		Author:      author,
		CreatedUTC:  int64(j.CreatedUTC),
		Score:       j.Score,
		NumComments: j.NumComments,
		Content:     j.Selftext,
		URL:         j.URL,
		Permalink:   j.Permalink,
	}, nil
}

24
internal/types/post.go Normal file
View File

@ -0,0 +1,24 @@
package types
// Post represents the normalized Reddit post schema for JSONL output.
// The JSON tags define the keys written for each post.
type Post struct {
ID string `json:"id"`
Subreddit string `json:"subreddit"`
Title string `json:"title"`
// Author is set to "[deleted]" by parser.ParseJSONPost when the source
// post has an empty author.
Author string `json:"author"`
// CreatedUTC is an integer here; parser.ParseJSONPost derives it by
// converting the float64 created_utc of the fetched post to int64.
CreatedUTC int64 `json:"created_utc"`
Score int `json:"score"`
NumComments int `json:"num_comments"`
// Content carries the fetched post's Selftext under a different key.
Content string `json:"content"`
URL string `json:"url"`
Permalink string `json:"permalink"`
}
// Summary holds run summary metrics, serialized with the JSON keys given in
// the field tags.
//
// NOTE(review): nothing in the visible code populates these fields. The units
// (milliseconds for AvgLatencyMs, seconds for DurationSec) are implied by the
// field names only — confirm where the values are computed.
type Summary struct {
TotalRequests int `json:"total_requests"`
SuccessfulPosts int `json:"successful_posts"`
SkippedDuplicates int `json:"skipped_duplicates"`
AvgLatencyMs float64 `json:"avg_latency_ms"`
DurationSec float64 `json:"duration_sec"`
}