feat: add config loader and simple logging init; create TODO.md
This commit is contained in:
parent
a2acaaadc8
commit
ec17691cb4
74
TODO.md
Normal file
74
TODO.md
Normal file
@ -0,0 +1,74 @@
|
||||
# TODO - Developer Task Tracking
|
||||
|
||||
This file is used temporarily by the assistant to track progress on implementation tasks. It will be removed when all work is complete.
|
||||
|
||||
Milestone 0 – Bootstrap
|
||||
- [x] M0-01: Create project skeleton directories and files
|
||||
- [x] M0-02: Initialize Go module and pin go version
|
||||
- [x] M0-03: Add .gitignore, .env.example, README
|
||||
- [ ] M0-04: Create Makefile (pending)
|
||||
|
||||
Milestone 1 – Config and Logging
|
||||
- [ ] M1-01: Implement logging setup using log/slog
|
||||
- [ ] M1-02: Implement configuration loader: env + flags + .env
|
||||
- [ ] M1-03: Define config schema and defaults
|
||||
- [ ] M1-04: Add config validation
|
||||
- [ ] M1-05: Unit tests for config parsing precedence
|
||||
|
||||
Milestone 2 – Types and Parser
|
||||
- [ ] M2-01: Define normalized data types
|
||||
- [ ] M2-02: Define minimal Reddit API response structs
|
||||
- [ ] M2-03: Implement parser
|
||||
- [ ] M2-04: Implement deleted/removed filters
|
||||
- [ ] M2-05: Parser unit tests
|
||||
|
||||
Milestone 3 – Fetcher and Networking
|
||||
- [ ] M3-01: Build HTTP client
|
||||
- [ ] M3-02: Implement rate limiter
|
||||
- [ ] M3-03: Implement backoff with jitter
|
||||
- [ ] M3-04: URL builder for search.json
|
||||
- [ ] M3-05: Implement fetchPage
|
||||
- [ ] M3-06: Fetcher tests
|
||||
- [ ] M3-07: Implement metrics capture
|
||||
|
||||
Milestone 4 – Storage and Dedup
|
||||
- [ ] M4-01: Implement JSONL writer
|
||||
- [ ] M4-02: File naming and rotation
|
||||
- [ ] M4-03: Ensure output dir creation
|
||||
- [ ] M4-04: Implement dedup index
|
||||
- [ ] M4-05: Dedup persistence
|
||||
- [ ] M4-06: Storage unit tests
|
||||
|
||||
Milestone 5 – Controller and Orchestration
|
||||
- [ ] M5-01: Implement controller orchestrator
|
||||
- [ ] M5-02: Pagination loop
|
||||
- [ ] M5-03: Integrate fetcher→parser→storage
|
||||
- [ ] M5-04: Progress reporting
|
||||
- [ ] M5-05: Graceful shutdown
|
||||
- [ ] M5-06: Summary report
|
||||
- [ ] M5-07: Wire CLI entrypoint
|
||||
- [ ] M5-08: Error code taxonomy
|
||||
- [ ] M5-09: Controller integration test
|
||||
|
||||
Milestone 6 – Nice-to-haves
|
||||
- [ ] M6-01: Date-based subdir option
|
||||
- [ ] M6-02: Optional compression on rollover
|
||||
|
||||
Milestone 7 – Performance
|
||||
- [ ] M7-01: Performance runbook
|
||||
- [ ] M7-02: Benchmark tuning
|
||||
|
||||
Milestone 8 – Docs and Release
|
||||
- [ ] M8-01: README expansion
|
||||
- [ ] M8-02: Cron examples
|
||||
- [ ] M8-03: Sample data
|
||||
- [ ] M8-04: CI steps
|
||||
- [ ] M8-05: Tag and build release
|
||||
|
||||
Milestone 9 – Verification
|
||||
- [ ] M9-01: Post-implementation checklist
|
||||
|
||||
|
||||
Progress notes:
|
||||
- Created project skeleton and minimal main.go.
|
||||
- Next: implement logging + config and update main.go to use them.
|
||||
86
internal/config/config.go
Normal file
86
internal/config/config.go
Normal file
@ -0,0 +1,86 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Config is the resolved runtime configuration of the scraper, as produced
// by Load from command-line flags and environment variables.
type Config struct {
	// Keyword is the search keyword; Load rejects an empty value.
	Keyword string
	// Limit is the maximum number of posts to fetch.
	Limit int
	// Concurrency is the number of parallel requests.
	Concurrency int
	// RetryLimit is the number of retry attempts per request.
	RetryLimit int
	// RateLimitDelay is the rate limit delay, parsed from a duration
	// string such as "1s" or "500ms".
	RateLimitDelay time.Duration
	// OutputDir is the output directory for JSONL files.
	OutputDir string
	// DedupCachePath is the path of the dedup cache.
	DedupCachePath string
	// LogLevel is the log level name, e.g. "INFO".
	LogLevel string
	// UserAgent is the value for the User-Agent header.
	UserAgent string
}
|
||||
|
||||
// Load reads configuration from flags and environment. Flags take precedence over env.
|
||||
func Load() (*Config, error) {
|
||||
keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
|
||||
limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
|
||||
concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
|
||||
retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
|
||||
rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
|
||||
outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
|
||||
dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
|
||||
logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
|
||||
userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if *keyword == "" {
|
||||
return nil, fmt.Errorf("keyword is required")
|
||||
}
|
||||
|
||||
rd, err := time.ParseDuration(*rateDelay)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err)
|
||||
}
|
||||
|
||||
cfg := &Config{
|
||||
Keyword: *keyword,
|
||||
Limit: *limit,
|
||||
Concurrency: *concurrency,
|
||||
RetryLimit: *retryLimit,
|
||||
RateLimitDelay: rd,
|
||||
OutputDir: *outputDir,
|
||||
DedupCachePath: *dedup,
|
||||
LogLevel: *logLevel,
|
||||
UserAgent: *userAgent,
|
||||
}
|
||||
|
||||
// Ensure output dir exists
|
||||
if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil {
|
||||
return nil, fmt.Errorf("creating output dir: %w", err)
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// getEnv looks up the environment variable named key and returns its value.
// When the variable is unset or set to the empty string, def is returned
// instead.
func getEnv(key, def string) string {
	if val := os.Getenv(key); val != "" {
		return val
	}
	return def
}
|
||||
|
||||
// getEnvInt reads the environment variable named key as a base-10 integer.
// It returns def when the variable is unset, empty, or cannot be parsed by
// strconv.Atoi; a parse failure is not reported to the caller.
func getEnvInt(key string, def int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return def
	}
	if n, err := strconv.Atoi(raw); err == nil {
		return n
	}
	return def
}
|
||||
15
internal/logging/logger.go
Normal file
15
internal/logging/logger.go
Normal file
@ -0,0 +1,15 @@
|
||||
package logging
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"os"
|
||||
)
|
||||
|
||||
// Init builds a text logger that writes to os.Stderr and drops records below
// the requested level.
//
// level uses the text form understood by slog.Level: "DEBUG", "INFO", "WARN"
// or "ERROR", matched case-insensitively and optionally followed by a numeric
// offset such as "INFO+2". An empty or unrecognized value falls back to INFO.
//
// The returned logger is not installed as the process-wide default; callers
// that want the package-level slog functions to use it should pass it to
// slog.SetDefault themselves.
func Init(level string) *slog.Logger {
	// Parse into a scratch value so a failed parse cannot disturb the fallback.
	lvl := slog.LevelInfo
	var parsed slog.Level
	if err := parsed.UnmarshalText([]byte(level)); err == nil {
		lvl = parsed
	}

	// Filtering happens in the handler: without an explicit Level the handler
	// would always use INFO and debug output could never be enabled.
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		AddSource: false,
		Level:     lvl,
	})
	return slog.New(handler)
}
|
||||
Loading…
Reference in New Issue
Block a user