87 lines
2.3 KiB
Go
87 lines
2.3 KiB
Go
package config
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"time"
|
|
)
|
|
|
|
// Config is the set of runtime settings the scraper runs with.
// Load populates every field from command-line flags and the environment.
type Config struct {
	// Keyword is the search keyword; Load rejects an empty value.
	Keyword string
	// Limit is the maximum number of posts to fetch.
	Limit int
	// Concurrency is the number of parallel requests.
	Concurrency int
	// RetryLimit is the number of retry attempts per request.
	RetryLimit int
	// RateLimitDelay is the rate limit delay, parsed from a duration
	// string such as "1s" or "500ms".
	RateLimitDelay time.Duration
	// OutputDir is the directory that receives the JSONL output files.
	OutputDir string
	// DedupCachePath is the path of the dedup cache file.
	DedupCachePath string
	// LogLevel is the log level name, e.g. "INFO".
	LogLevel string
	// UserAgent is the value sent in the User-Agent header.
	UserAgent string
}
|
|
|
|
// Load reads configuration from flags and environment. Flags take precedence over env.
|
|
func Load() (*Config, error) {
|
|
keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
|
|
limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
|
|
concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
|
|
retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
|
|
rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
|
|
outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
|
|
dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
|
|
logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
|
|
userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
|
|
|
|
flag.Parse()
|
|
|
|
if *keyword == "" {
|
|
return nil, fmt.Errorf("keyword is required")
|
|
}
|
|
|
|
rd, err := time.ParseDuration(*rateDelay)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err)
|
|
}
|
|
|
|
cfg := &Config{
|
|
Keyword: *keyword,
|
|
Limit: *limit,
|
|
Concurrency: *concurrency,
|
|
RetryLimit: *retryLimit,
|
|
RateLimitDelay: rd,
|
|
OutputDir: *outputDir,
|
|
DedupCachePath: *dedup,
|
|
LogLevel: *logLevel,
|
|
UserAgent: *userAgent,
|
|
}
|
|
|
|
// Ensure output dir exists
|
|
if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil {
|
|
return nil, fmt.Errorf("creating output dir: %w", err)
|
|
}
|
|
|
|
return cfg, nil
|
|
}
|
|
|
|
// getEnv returns the value of the environment variable key, or def when the
// variable is unset or set to the empty string.
func getEnv(key, def string) string {
	if val, ok := os.LookupEnv(key); ok && val != "" {
		return val
	}
	return def
}
|
|
|
|
// getEnvInt returns the environment variable key parsed as a base-10 int.
// It returns def when the variable is unset, empty, or not accepted by
// strconv.Atoi; a malformed value is not reported, it just yields def.
func getEnvInt(key string, def int) int {
	raw, ok := os.LookupEnv(key)
	if !ok || raw == "" {
		return def
	}
	if n, err := strconv.Atoi(raw); err == nil {
		return n
	}
	return def
}
|