From ec17691cb4ecba2e60384482721a1e4df02c0f65 Mon Sep 17 00:00:00 2001 From: Sirin Puenggun Date: Mon, 13 Oct 2025 23:27:56 +0700 Subject: [PATCH] feat: add config loader and simple logging init; create TODO.md --- TODO.md | 74 ++++++++++++++++++++++++++++++++ internal/config/config.go | 86 ++++++++++++++++++++++++++++++++++++++ internal/logging/logger.go | 15 +++++++ 3 files changed, 175 insertions(+) create mode 100644 TODO.md create mode 100644 internal/config/config.go create mode 100644 internal/logging/logger.go diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4c1137c --- /dev/null +++ b/TODO.md @@ -0,0 +1,74 @@ +# TODO - Developer Task Tracking + +This file is used temporarily by the assistant to track progress on implementation tasks. It will be removed when all work is complete. + +Milestone 0 – Bootstrap +- [x] M0-01: Create project skeleton directories and files +- [x] M0-02: Initialize Go module and pin go version +- [x] M0-03: Add .gitignore, .env.example, README +- [x] M0-04: Create Makefile (pending) + +Milestone 1 – Config and Logging +- [ ] M1-01: Implement logging setup using log/slog +- [ ] M1-02: Implement configuration loader: env + flags + .env +- [ ] M1-03: Define config schema and defaults +- [ ] M1-04: Add config validation +- [ ] M1-05: Unit tests for config parsing precedence + +Milestone 2 – Types and Parser +- [ ] M2-01: Define normalized data types +- [ ] M2-02: Define minimal Reddit API response structs +- [ ] M2-03: Implement parser +- [ ] M2-04: Implement deleted/removed filters +- [ ] M2-05: Parser unit tests + +Milestone 3 – Fetcher and Networking +- [ ] M3-01: Build HTTP client +- [ ] M3-02: Implement rate limiter +- [ ] M3-03: Implement backoff with jitter +- [ ] M3-04: URL builder for search.json +- [ ] M3-05: Implement fetchPage +- [ ] M3-06: Fetcher tests +- [ ] M3-07: Implement metrics capture + +Milestone 4 – Storage and Dedup +- [ ] M4-01: Implement JSONL writer +- [ ] M4-02: File naming and rotation +- [ 
] M4-03: Ensure output dir creation +- [ ] M4-04: Implement dedup index +- [ ] M4-05: Dedup persistence +- [ ] M4-06: Storage unit tests + +Milestone 5 – Controller and Orchestration +- [ ] M5-01: Implement controller orchestrator +- [ ] M5-02: Pagination loop +- [ ] M5-03: Integrate fetcher→parser→storage +- [ ] M5-04: Progress reporting +- [ ] M5-05: Graceful shutdown +- [ ] M5-06: Summary report +- [ ] M5-07: Wire CLI entrypoint +- [ ] M5-08: Error code taxonomy +- [ ] M5-09: Controller integration test + +Milestone 6 – Nice-to-haves +- [ ] M6-01: Date-based subdir option +- [ ] M6-02: Optional compression on rollover + +Milestone 7 – Performance +- [ ] M7-01: Performance runbook +- [ ] M7-02: Benchmark tuning + +Milestone 8 – Docs and Release +- [ ] M8-01: README expansion +- [ ] M8-02: Cron examples +- [ ] M8-03: Sample data +- [ ] M8-04: CI steps +- [ ] M8-05: Tag and build release + +Milestone 9 – Verification +- [ ] M9-01: Post-implementation checklist + + +Progress notes: +- Created project skeleton and minimal main.go. +- Next: implement logging + config and update main.go to use them. diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..6637577 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,86 @@ +package config + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "strconv" + "time" +) + +// Config holds runtime configuration for the scraper, as populated by Load from command-line flags and environment variables. +type Config struct { + Keyword string // Search keyword; Load rejects an empty value. + Limit int // Max posts to fetch (default 1000). + Concurrency int // Parallel requests (default 4). + RetryLimit int // Retry attempts per request (default 5). + RateLimitDelay time.Duration // Rate limit delay, parsed from strings such as "1s" or "500ms" (default "2s"). + OutputDir string // Output directory for JSONL files (default "./data"); Load creates it if missing. + DedupCachePath string // Dedup cache path (default "./data/dedup_ids.txt"). + LogLevel string // Log level name (default "INFO"). + UserAgent string // User-Agent header value. +} + +// Load reads configuration from flags and environment. Flags take precedence over env. 
+func Load() (*Config, error) { + keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)") + limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch") + concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests") + retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request") + rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)") + outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files") + dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path") + logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level") + userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header") + + flag.Parse() + + if *keyword == "" { + return nil, fmt.Errorf("keyword is required") + } + + rd, err := time.ParseDuration(*rateDelay) + if err != nil { + return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err) + } + + cfg := &Config{ + Keyword: *keyword, + Limit: *limit, + Concurrency: *concurrency, + RetryLimit: *retryLimit, + RateLimitDelay: rd, + OutputDir: *outputDir, + DedupCachePath: *dedup, + LogLevel: *logLevel, + UserAgent: *userAgent, + } + + // Ensure output dir exists + if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil { + return nil, fmt.Errorf("creating output dir: %w", err) + } + + return cfg, nil +} + +func getEnv(key, def string) string { + v := os.Getenv(key) + if v == "" { + return def + } + return v +} + +func getEnvInt(key string, def int) int { + s := os.Getenv(key) + if s == "" { + return def + } + i, err := strconv.Atoi(s) + if err != nil { + return def + } + return i +} diff --git a/internal/logging/logger.go b/internal/logging/logger.go new file mode 100644 index 0000000..35a50b0 --- /dev/null +++ 
// internal/logging/logger.go (package logging; imports "log/slog" and "os").

// Init builds a text logger that writes to stderr and drops records below the
// requested level.
//
// level is parsed with slog.Level's text unmarshalling, which ignores case and
// accepts "DEBUG", "INFO", "WARN" and "ERROR", optionally followed by a signed
// offset such as "INFO+2". An empty or unrecognized value falls back to INFO,
// so a mistyped level never prevents the program from starting.
//
// The logger is returned to the caller; Init does not install it as slog's
// process-wide default.
func Init(level string) *slog.Logger {
	lvl := slog.LevelInfo

	// Parse into a scratch variable so a failed parse cannot disturb the
	// INFO fallback.
	var parsed slog.Level
	if err := parsed.UnmarshalText([]byte(level)); err == nil {
		lvl = parsed
	}

	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		AddSource: false,
		Level:     lvl,
	})
	return slog.New(handler)
}