// Package config loads the scraper's runtime configuration from command-line
// flags and environment variables.
package config

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"time"
)

// Config holds runtime configuration for the scraper.
type Config struct {
	Keyword        string        // search keyword; Load rejects an empty value
	Limit          int           // max posts to fetch; must be > 0
	Concurrency    int           // parallel requests; must be > 0
	RetryLimit     int           // retry attempts per request; must be >= 0
	RateLimitDelay time.Duration // rate limit delay; must be > 0
	OutputDir      string        // output directory for JSONL files
	DedupCachePath string        // dedup cache path
	LogLevel       string        // log level name, passed through unvalidated
	UserAgent      string        // User-Agent header value
}

// Load reads configuration from flags and environment. Flags take precedence over env.
//
// Every flag uses the matching environment variable as its default, so a value
// given on the command line overrides the environment, and the built-in
// default applies only when neither is set.
//
// All values are validated before anything is written to disk: the output
// directory and the dedup cache's parent directory are created only once the
// whole configuration has been accepted. On any error the returned *Config is
// nil.
//
// Load registers its flags on the process-wide default flag set and calls
// flag.Parse, so it is meant to be called once per process.
func Load() (*Config, error) {
	keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
	limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
	concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
	retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
	rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
	outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
	dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
	logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
	userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
	flag.Parse()

	if *keyword == "" {
		return nil, fmt.Errorf("keyword is required")
	}

	rd, err := time.ParseDuration(*rateDelay)
	if err != nil {
		return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err)
	}

	cfg := &Config{
		Keyword:        *keyword,
		Limit:          *limit,
		Concurrency:    *concurrency,
		RetryLimit:     *retryLimit,
		RateLimitDelay: rd,
		OutputDir:      *outputDir,
		DedupCachePath: *dedup,
		LogLevel:       *logLevel,
		UserAgent:      *userAgent,
	}

	// Validate first so that a rejected configuration leaves no directories
	// behind on disk.
	if err := cfg.validate(); err != nil {
		return nil, err
	}
	if err := cfg.ensureDirs(); err != nil {
		return nil, err
	}
	return cfg, nil
}

// validate checks the numeric and duration fields against their allowed ranges.
func (c *Config) validate() error {
	switch {
	case c.Limit <= 0:
		return fmt.Errorf("limit must be > 0")
	case c.Concurrency <= 0:
		return fmt.Errorf("concurrency must be > 0")
	case c.RetryLimit < 0:
		return fmt.Errorf("retry-limit must be >= 0")
	case c.RateLimitDelay <= 0:
		return fmt.Errorf("rate-limit delay must be > 0")
	}
	return nil
}

// ensureDirs creates the output directory and the dedup cache's parent directory.
func (c *Config) ensureDirs() error {
	if err := os.MkdirAll(filepath.Clean(c.OutputDir), 0o755); err != nil {
		return fmt.Errorf("creating output dir: %w", err)
	}

	// A cache path with no directory component has "." as its parent, which
	// needs no creation.
	dedupDir := filepath.Dir(c.DedupCachePath)
	if dedupDir != "" && dedupDir != "." {
		if err := os.MkdirAll(filepath.Clean(dedupDir), 0o755); err != nil {
			return fmt.Errorf("creating dedup cache dir: %w", err)
		}
	}
	return nil
}

// getEnv returns the value of the environment variable key, or def when the
// variable is unset or empty.
func getEnv(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// getEnvInt returns the environment variable key parsed as a base-10 int.
// It returns def when the variable is unset, empty, or not a valid integer;
// a malformed value is ignored rather than reported.
func getEnvInt(key string, def int) int {
	s := os.Getenv(key)
	if s == "" {
		return def
	}
	i, err := strconv.Atoi(s)
	if err != nil {
		return def
	}
	return i
}