feat: add types, fetcher response types, and parser conversion
This commit is contained in:
parent
b5d4526d91
commit
2dfe929b94
50
internal/fetcher/response.go
Normal file
50
internal/fetcher/response.go
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
package fetcher
|
||||||
|
|
||||||
|
// Minimal structs to decode Reddit search.json responses we care about.
|
||||||
|
// We only define the fields we need.
|
||||||
|
|
||||||
|
type APIResponse struct {
|
||||||
|
Kind string `json:"kind"`
|
||||||
|
Data struct {
|
||||||
|
Modhash string `json:"modhash"`
|
||||||
|
Dist int `json:"dist"`
|
||||||
|
Children []struct {
|
||||||
|
Kind string `json:"kind"`
|
||||||
|
Data JSONPost `json:"data"`
|
||||||
|
} `json:"children"`
|
||||||
|
After string `json:"after"`
|
||||||
|
Before string `json:"before"`
|
||||||
|
} `json:"data"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSONPost contains the fields extracted from each child's "data" object in a
// Reddit response. It mirrors a subset of Reddit's API.
//
// CreatedUTC is decoded as a float64; parser.ParseJSONPost converts it to an
// int64 when building the normalized post.
type JSONPost struct {
	ID          string  `json:"id"`
	Subreddit   string  `json:"subreddit"`
	Title       string  `json:"title"`
	Author      string  `json:"author"`
	CreatedUTC  float64 `json:"created_utc"`
	Score       int     `json:"score"`
	NumComments int     `json:"num_comments"`
	Selftext    string  `json:"selftext"`
	URL         string  `json:"url"`
	Permalink   string  `json:"permalink"`
}

// jsonPost is an alias for JSONPost. It used to be a separate struct with an
// identical field list; aliasing keeps the unexported name usable inside the
// package while leaving a single definition to maintain.
type jsonPost = JSONPost
|
||||||
43
internal/parser/parser.go
Normal file
43
internal/parser/parser.go
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/yourname/reddit-scraper/internal/fetcher"
|
||||||
|
"github.com/yourname/reddit-scraper/internal/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ParseJSONPost converts a fetcher.JSONPost into a normalized types.Post.
|
||||||
|
// It validates mandatory fields and returns an error if the post should be skipped.
|
||||||
|
func ParseJSONPost(j fetcher.JSONPost) (types.Post, error) {
|
||||||
|
var p types.Post
|
||||||
|
if j.ID == "" {
|
||||||
|
return p, fmt.Errorf("missing id")
|
||||||
|
}
|
||||||
|
if j.Subreddit == "" {
|
||||||
|
return p, fmt.Errorf("missing subreddit")
|
||||||
|
}
|
||||||
|
if j.Title == "" {
|
||||||
|
return p, fmt.Errorf("missing title")
|
||||||
|
}
|
||||||
|
if j.CreatedUTC <= 0 {
|
||||||
|
return p, fmt.Errorf("invalid created_utc")
|
||||||
|
}
|
||||||
|
|
||||||
|
p.ID = j.ID
|
||||||
|
p.Subreddit = j.Subreddit
|
||||||
|
p.Title = j.Title
|
||||||
|
if j.Author == "" {
|
||||||
|
p.Author = "[deleted]"
|
||||||
|
} else {
|
||||||
|
p.Author = j.Author
|
||||||
|
}
|
||||||
|
p.CreatedUTC = int64(j.CreatedUTC)
|
||||||
|
p.Score = j.Score
|
||||||
|
p.NumComments = j.NumComments
|
||||||
|
p.Content = j.Selftext
|
||||||
|
p.URL = j.URL
|
||||||
|
p.Permalink = j.Permalink
|
||||||
|
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
24
internal/types/post.go
Normal file
24
internal/types/post.go
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
package types
|
||||||
|
|
||||||
|
// Post represents the normalized Reddit post schema for JSONL output. The
// json tags give the key used for each field when a Post is marshalled.
type Post struct {
	ID          string `json:"id"`
	Subreddit   string `json:"subreddit"`
	Title       string `json:"title"`
	Author      string `json:"author"`
	CreatedUTC  int64  `json:"created_utc"`
	Score       int    `json:"score"`
	NumComments int    `json:"num_comments"`
	Content     string `json:"content"`
	URL         string `json:"url"`
	Permalink   string `json:"permalink"`
}
|
||||||
|
|
||||||
|
// Summary holds run summary metrics.
//
// NOTE(review): the units of AvgLatencyMs and DurationSec are implied only by
// the field names (milliseconds and seconds); confirm against the code that
// fills them in.
type Summary struct {
	TotalRequests     int     `json:"total_requests"`
	SuccessfulPosts   int     `json:"successful_posts"`
	SkippedDuplicates int     `json:"skipped_duplicates"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	DurationSec       float64 `json:"duration_sec"`
}
|
||||||
Loading…
Reference in New Issue
Block a user