feat: add types, fetcher response types, and parser conversion
This commit is contained in:
parent
b5d4526d91
commit
2dfe929b94
50
internal/fetcher/response.go
Normal file
50
internal/fetcher/response.go
Normal file
@ -0,0 +1,50 @@
|
||||
package fetcher
|
||||
|
||||
// Minimal structs to decode Reddit search.json responses we care about.
|
||||
// We only define the fields we need.
|
||||
|
||||
type APIResponse struct {
|
||||
Kind string `json:"kind"`
|
||||
Data struct {
|
||||
Modhash string `json:"modhash"`
|
||||
Dist int `json:"dist"`
|
||||
Children []struct {
|
||||
Kind string `json:"kind"`
|
||||
Data JSONPost `json:"data"`
|
||||
} `json:"children"`
|
||||
After string `json:"after"`
|
||||
Before string `json:"before"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
// JSONPost contains the fields extracted from each child.data object in a
// Reddit response. It mirrors a subset of Reddit's API.
//
// CreatedUTC is kept as a float64 here; the parser package converts it to an
// integer timestamp when building the normalized post.
type JSONPost struct {
	ID          string  `json:"id"`
	Subreddit   string  `json:"subreddit"`
	Title       string  `json:"title"`
	Author      string  `json:"author"`
	CreatedUTC  float64 `json:"created_utc"`
	Score       int     `json:"score"`
	NumComments int     `json:"num_comments"`
	Selftext    string  `json:"selftext"`
	URL         string  `json:"url"`
	Permalink   string  `json:"permalink"`
}

// jsonPost is an alias for JSONPost.
//
// It was previously a second struct with an identical field list. Making it
// an alias keeps the name available to any package-internal code that still
// uses it, while ensuring the two can never drift apart and that values are
// interchangeable without conversion.
type jsonPost = JSONPost
|
||||
43
internal/parser/parser.go
Normal file
43
internal/parser/parser.go
Normal file
@ -0,0 +1,43 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/yourname/reddit-scraper/internal/fetcher"
|
||||
"github.com/yourname/reddit-scraper/internal/types"
|
||||
)
|
||||
|
||||
// ParseJSONPost converts a fetcher.JSONPost into a normalized types.Post.
|
||||
// It validates mandatory fields and returns an error if the post should be skipped.
|
||||
func ParseJSONPost(j fetcher.JSONPost) (types.Post, error) {
|
||||
var p types.Post
|
||||
if j.ID == "" {
|
||||
return p, fmt.Errorf("missing id")
|
||||
}
|
||||
if j.Subreddit == "" {
|
||||
return p, fmt.Errorf("missing subreddit")
|
||||
}
|
||||
if j.Title == "" {
|
||||
return p, fmt.Errorf("missing title")
|
||||
}
|
||||
if j.CreatedUTC <= 0 {
|
||||
return p, fmt.Errorf("invalid created_utc")
|
||||
}
|
||||
|
||||
p.ID = j.ID
|
||||
p.Subreddit = j.Subreddit
|
||||
p.Title = j.Title
|
||||
if j.Author == "" {
|
||||
p.Author = "[deleted]"
|
||||
} else {
|
||||
p.Author = j.Author
|
||||
}
|
||||
p.CreatedUTC = int64(j.CreatedUTC)
|
||||
p.Score = j.Score
|
||||
p.NumComments = j.NumComments
|
||||
p.Content = j.Selftext
|
||||
p.URL = j.URL
|
||||
p.Permalink = j.Permalink
|
||||
|
||||
return p, nil
|
||||
}
|
||||
24
internal/types/post.go
Normal file
24
internal/types/post.go
Normal file
@ -0,0 +1,24 @@
|
||||
package types
|
||||
|
||||
// Post is the normalized Reddit post record written to the JSONL output.
//
// Fields are serialized in declaration order under the snake_case keys given
// in the struct tags. CreatedUTC is an integer, presumably Unix seconds
// (TODO: confirm the unit with the producer of this value).
type Post struct {
	ID          string `json:"id"`
	Subreddit   string `json:"subreddit"`
	Title       string `json:"title"`
	Author      string `json:"author"`
	CreatedUTC  int64  `json:"created_utc"`
	Score       int    `json:"score"`
	NumComments int    `json:"num_comments"`
	Content     string `json:"content"`
	URL         string `json:"url"`
	Permalink   string `json:"permalink"`
}
|
||||
|
||||
// Summary collects the metrics reported for a single run.
//
// The unit suffixes in the field names (Ms, Sec) suggest milliseconds and
// seconds respectively; nothing in this type enforces them.
type Summary struct {
	TotalRequests     int     `json:"total_requests"`
	SuccessfulPosts   int     `json:"successful_posts"`
	SkippedDuplicates int     `json:"skipped_duplicates"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	DurationSec       float64 `json:"duration_sec"`
}
|
||||
Loading…
Reference in New Issue
Block a user