reddit-scrapper/internal/config/config.go

109 lines
2.9 KiB
Go

package config
import (
"flag"
"fmt"
"os"
"path/filepath"
"strconv"
"time"
)
// Config holds runtime configuration for the scraper.
//
// Load fills every field from a command-line flag whose default value is
// read from an environment variable (named in each field comment below),
// falling back to a built-in default when the variable is unset or empty.
type Config struct {
// Keyword is the search keyword (-keyword / KEYWORD). Load rejects an
// empty value.
Keyword string
// Limit is the maximum number of posts to fetch (-limit / LIMIT,
// default 1000). Load requires it to be > 0.
Limit int
// Concurrency is the number of parallel requests (-concurrency /
// CONCURRENCY, default 4). Load requires it to be > 0.
Concurrency int
// RetryLimit is the number of retry attempts per request (-retry-limit /
// RETRY_LIMIT, default 5). Load requires it to be >= 0.
RetryLimit int
// RateLimitDelay is the rate limit delay, parsed with time.ParseDuration
// from -rate-delay / RATE_LIMIT_DELAY (default "2s"). Load requires it to
// be > 0.
RateLimitDelay time.Duration
// OutputDir is the output directory for JSONL files (-output /
// OUTPUT_DIR, default "./data"). Load creates it if it does not exist.
OutputDir string
// DedupCachePath is the dedup cache path (-dedup / DEDUP_CACHE_PATH,
// default "./data/dedup_ids.txt"). Load creates its parent directory.
DedupCachePath string
// LogLevel is the log level (-log-level / LOG_LEVEL, default "INFO").
// Load stores the string as given without validating it.
LogLevel string
// UserAgent is the User-Agent header value (-user-agent / USER_AGENT).
UserAgent string
}
// Load reads configuration from flags and environment. Flags take precedence
// over env.
//
// Every flag's default is taken from its environment variable (KEYWORD, LIMIT,
// CONCURRENCY, RETRY_LIMIT, RATE_LIMIT_DELAY, OUTPUT_DIR, DEDUP_CACHE_PATH,
// LOG_LEVEL, USER_AGENT), which in turn falls back to a built-in default, so a
// flag given on the command line overrides the environment.
//
// Load registers its flags on the default command-line flag set and calls
// flag.Parse. Package flag panics when a flag name is registered twice, so
// Load should be called at most once per process.
//
// All values are validated before anything is written to disk, so a rejected
// configuration leaves no directories behind. On success Load has created the
// output directory and the parent directory of the dedup cache file (mode
// 0o755) if they were missing.
//
// It returns an error when the keyword is empty, the rate delay is not a valid
// duration, a numeric setting is out of range, or a directory cannot be
// created. The returned *Config is nil whenever the error is non-nil.
func Load() (*Config, error) {
	keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
	limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
	concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
	retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
	rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
	outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
	dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
	logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
	userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
	flag.Parse()

	if *keyword == "" {
		return nil, fmt.Errorf("keyword is required")
	}

	rd, err := time.ParseDuration(*rateDelay)
	if err != nil {
		return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err)
	}

	cfg := &Config{
		Keyword:        *keyword,
		Limit:          *limit,
		Concurrency:    *concurrency,
		RetryLimit:     *retryLimit,
		RateLimitDelay: rd,
		OutputDir:      *outputDir,
		DedupCachePath: *dedup,
		LogLevel:       *logLevel,
		UserAgent:      *userAgent,
	}

	// Validate first: nothing below this switch may run for a config that is
	// going to be rejected, otherwise directories get created for a failed run.
	switch {
	case cfg.Limit <= 0:
		return nil, fmt.Errorf("limit must be > 0")
	case cfg.Concurrency <= 0:
		return nil, fmt.Errorf("concurrency must be > 0")
	case cfg.RetryLimit < 0:
		return nil, fmt.Errorf("retry-limit must be >= 0")
	case cfg.RateLimitDelay <= 0:
		return nil, fmt.Errorf("rate-limit delay must be > 0")
	}

	// Ensure output dir exists.
	if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil {
		return nil, fmt.Errorf("creating output dir: %w", err)
	}

	// Ensure dedup cache parent dir exists. filepath.Dir already returns a
	// cleaned path and yields "." (never "") when there is no directory part,
	// in which case the cache lives in the working directory.
	if dedupDir := filepath.Dir(cfg.DedupCachePath); dedupDir != "." {
		if err := os.MkdirAll(dedupDir, 0o755); err != nil {
			return nil, fmt.Errorf("creating dedup cache dir: %w", err)
		}
	}

	return cfg, nil
}
// getEnv returns the value of the environment variable named key, or def when
// the variable is unset or set to the empty string.
func getEnv(key, def string) string {
	if val := os.Getenv(key); val != "" {
		return val
	}
	return def
}
// getEnvInt returns the environment variable named key parsed as a base-10
// int via strconv.Atoi. It returns def when the variable is unset, empty, or
// cannot be parsed; the parse error itself is discarded.
func getEnvInt(key string, def int) int {
	if raw := os.Getenv(key); raw != "" {
		if parsed, err := strconv.Atoi(raw); err == nil {
			return parsed
		}
	}
	return def
}