feat: add config loader and simple logging init; create TODO.md

This commit is contained in:
Sirin Puenggun 2025-10-13 23:27:56 +07:00
parent a2acaaadc8
commit ec17691cb4
3 changed files with 175 additions and 0 deletions

74
TODO.md Normal file
View File

@ -0,0 +1,74 @@
# TODO - Developer Task Tracking
This file is used temporarily by the assistant to track progress on implementation tasks. It will be removed when all work is complete.
Milestone 0 Bootstrap
- [x] M0-01: Create project skeleton directories and files
- [x] M0-02: Initialize Go module and pin go version
- [x] M0-03: Add .gitignore, .env.example, README
- [ ] M0-04: Create Makefile (pending)
Milestone 1 Config and Logging
- [ ] M1-01: Implement logging setup using log/slog
- [ ] M1-02: Implement configuration loader: env + flags + .env
- [ ] M1-03: Define config schema and defaults
- [ ] M1-04: Add config validation
- [ ] M1-05: Unit tests for config parsing precedence
Milestone 2 Types and Parser
- [ ] M2-01: Define normalized data types
- [ ] M2-02: Define minimal Reddit API response structs
- [ ] M2-03: Implement parser
- [ ] M2-04: Implement deleted/removed filters
- [ ] M2-05: Parser unit tests
Milestone 3 Fetcher and Networking
- [ ] M3-01: Build HTTP client
- [ ] M3-02: Implement rate limiter
- [ ] M3-03: Implement backoff with jitter
- [ ] M3-04: URL builder for search.json
- [ ] M3-05: Implement fetchPage
- [ ] M3-06: Fetcher tests
- [ ] M3-07: Implement metrics capture
Milestone 4 Storage and Dedup
- [ ] M4-01: Implement JSONL writer
- [ ] M4-02: File naming and rotation
- [ ] M4-03: Ensure output dir creation
- [ ] M4-04: Implement dedup index
- [ ] M4-05: Dedup persistence
- [ ] M4-06: Storage unit tests
Milestone 5 Controller and Orchestration
- [ ] M5-01: Implement controller orchestrator
- [ ] M5-02: Pagination loop
- [ ] M5-03: Integrate fetcher→parser→storage
- [ ] M5-04: Progress reporting
- [ ] M5-05: Graceful shutdown
- [ ] M5-06: Summary report
- [ ] M5-07: Wire CLI entrypoint
- [ ] M5-08: Error code taxonomy
- [ ] M5-09: Controller integration test
Milestone 6 Nice-to-haves
- [ ] M6-01: Date-based subdir option
- [ ] M6-02: Optional compression on rollover
Milestone 7 Performance
- [ ] M7-01: Performance runbook
- [ ] M7-02: Benchmark tuning
Milestone 8 Docs and Release
- [ ] M8-01: README expansion
- [ ] M8-02: Cron examples
- [ ] M8-03: Sample data
- [ ] M8-04: CI steps
- [ ] M8-05: Tag and build release
Milestone 9 Verification
- [ ] M9-01: Post-implementation checklist
Progress notes:
- Created project skeleton and minimal main.go.
- Next: implement logging + config and update main.go to use them.

86
internal/config/config.go Normal file
View File

@ -0,0 +1,86 @@
package config
import (
"flag"
"fmt"
"os"
"path/filepath"
"strconv"
"time"
)
// Config holds runtime configuration for the scraper.
// Load fills every field from command-line flags, whose defaults come from
// environment variables and then from built-in fallbacks.
type Config struct {
	// Keyword is the search keyword. Load rejects an empty value.
	Keyword string
	// Limit is the maximum number of posts to fetch.
	Limit int
	// Concurrency is the number of parallel requests.
	Concurrency int
	// RetryLimit is the number of retry attempts per request.
	RetryLimit int
	// RateLimitDelay is the rate limit delay, parsed from a duration string
	// such as "1s" or "500ms".
	RateLimitDelay time.Duration
	// OutputDir is the output directory for JSONL files.
	OutputDir string
	// DedupCachePath is the path of the dedup cache file.
	DedupCachePath string
	// LogLevel is the log level name, e.g. "INFO".
	LogLevel string
	// UserAgent is the value sent in the User-Agent header.
	UserAgent string
}
// Load builds the runtime configuration from command-line flags and
// environment variables.
//
// Each flag's default is taken from the matching environment variable, or from
// a built-in fallback when that variable is unset or empty, so a flag passed
// explicitly on the command line takes precedence over the environment.
//
// Load registers its flags on the process-wide flag.CommandLine set and calls
// flag.Parse, so it should be called once per process; registering the same
// flag names a second time panics inside the flag package.
//
// It returns an error when the keyword is empty, the rate delay is not a valid
// duration, a numeric setting is out of range, or the output directory cannot
// be created. On success the output directory (and any missing parents) has
// been created with mode 0o755.
func Load() (*Config, error) {
	keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
	limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
	concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
	retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
	rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
	outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
	dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
	logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
	userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
	flag.Parse()

	if *keyword == "" {
		return nil, fmt.Errorf("keyword is required")
	}

	// The value may come from either the flag or the environment, so the
	// message names both sources instead of blaming only the env variable.
	rd, err := time.ParseDuration(*rateDelay)
	if err != nil {
		return nil, fmt.Errorf("invalid rate delay (-rate-delay / RATE_LIMIT_DELAY): %w", err)
	}

	// Reject values that cannot describe a usable run before touching the
	// filesystem, so a bad configuration has no side effects.
	switch {
	case *limit < 0:
		return nil, fmt.Errorf("limit must not be negative, got %d", *limit)
	case *concurrency < 1:
		return nil, fmt.Errorf("concurrency must be at least 1, got %d", *concurrency)
	case *retryLimit < 0:
		return nil, fmt.Errorf("retry limit must not be negative, got %d", *retryLimit)
	case rd < 0:
		return nil, fmt.Errorf("rate delay must not be negative, got %s", rd)
	}

	cfg := &Config{
		Keyword:        *keyword,
		Limit:          *limit,
		Concurrency:    *concurrency,
		RetryLimit:     *retryLimit,
		RateLimitDelay: rd,
		OutputDir:      *outputDir,
		DedupCachePath: *dedup,
		LogLevel:       *logLevel,
		UserAgent:      *userAgent,
	}

	// Ensure the output directory exists so later writers can assume it.
	if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil {
		return nil, fmt.Errorf("creating output dir: %w", err)
	}
	return cfg, nil
}
// getEnv returns the value of the environment variable key, or def when the
// variable is unset or set to the empty string.
func getEnv(key, def string) string {
	if val := os.Getenv(key); val != "" {
		return val
	}
	return def
}
// getEnvInt returns the environment variable key parsed as a base-10 int.
//
// It returns def when the variable is unset or empty. When the variable is set
// but is not a valid integer it also returns def, after writing a warning to
// standard error so that the misconfiguration is not silently ignored.
func getEnvInt(key string, def int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return def
	}
	n, err := strconv.Atoi(raw)
	if err != nil {
		// The signature cannot report an error, so surface the problem on
		// stderr and fall back rather than quietly masking a typo.
		fmt.Fprintf(os.Stderr, "warning: ignoring invalid integer %q for %s; using default %d\n", raw, key, def)
		return def
	}
	return n
}

View File

@ -0,0 +1,15 @@
package logging
import (
"log/slog"
"os"
)
// Init builds a text-format slog.Logger that writes to standard error and
// emits only records at or above the given level.
//
// level is parsed case-insensitively by slog.Level.UnmarshalText, so it
// accepts "DEBUG", "INFO", "WARN" and "ERROR", optionally followed by a numeric
// offset such as "INFO+2". An empty or unrecognized level falls back to INFO.
//
// Init does not install the logger as the process-wide default; callers that
// want that should pass the result to slog.SetDefault.
func Init(level string) *slog.Logger {
	lvl := slog.LevelInfo
	if err := lvl.UnmarshalText([]byte(level)); err != nil {
		// Unknown names must not disable logging or abort startup.
		lvl = slog.LevelInfo
	}
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		AddSource: false,
		Level:     lvl,
	})
	return slog.New(handler)
}