feat: add config loader and simple logging init; create TODO.md
This commit is contained in:
parent
a2acaaadc8
commit
ec17691cb4
74
TODO.md
Normal file
74
TODO.md
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
# TODO - Developer Task Tracking
|
||||||
|
|
||||||
|
This file is used temporarily by the assistant to track progress on implementation tasks. It will be removed when all work is complete.
|
||||||
|
|
||||||
|
Milestone 0 – Bootstrap
|
||||||
|
- [x] M0-01: Create project skeleton directories and files
|
||||||
|
- [x] M0-02: Initialize Go module and pin go version
|
||||||
|
- [x] M0-03: Add .gitignore, .env.example, README
|
||||||
|
- [ ] M0-04: Create Makefile (pending)
|
||||||
|
|
||||||
|
Milestone 1 – Config and Logging
|
||||||
|
- [ ] M1-01: Implement logging setup using log/slog
|
||||||
|
- [ ] M1-02: Implement configuration loader: env + flags + .env
|
||||||
|
- [ ] M1-03: Define config schema and defaults
|
||||||
|
- [ ] M1-04: Add config validation
|
||||||
|
- [ ] M1-05: Unit tests for config parsing precedence
|
||||||
|
|
||||||
|
Milestone 2 – Types and Parser
|
||||||
|
- [ ] M2-01: Define normalized data types
|
||||||
|
- [ ] M2-02: Define minimal Reddit API response structs
|
||||||
|
- [ ] M2-03: Implement parser
|
||||||
|
- [ ] M2-04: Implement deleted/removed filters
|
||||||
|
- [ ] M2-05: Parser unit tests
|
||||||
|
|
||||||
|
Milestone 3 – Fetcher and Networking
|
||||||
|
- [ ] M3-01: Build HTTP client
|
||||||
|
- [ ] M3-02: Implement rate limiter
|
||||||
|
- [ ] M3-03: Implement backoff with jitter
|
||||||
|
- [ ] M3-04: URL builder for search.json
|
||||||
|
- [ ] M3-05: Implement fetchPage
|
||||||
|
- [ ] M3-06: Fetcher tests
|
||||||
|
- [ ] M3-07: Implement metrics capture
|
||||||
|
|
||||||
|
Milestone 4 – Storage and Dedup
|
||||||
|
- [ ] M4-01: Implement JSONL writer
|
||||||
|
- [ ] M4-02: File naming and rotation
|
||||||
|
- [ ] M4-03: Ensure output dir creation
|
||||||
|
- [ ] M4-04: Implement dedup index
|
||||||
|
- [ ] M4-05: Dedup persistence
|
||||||
|
- [ ] M4-06: Storage unit tests
|
||||||
|
|
||||||
|
Milestone 5 – Controller and Orchestration
|
||||||
|
- [ ] M5-01: Implement controller orchestrator
|
||||||
|
- [ ] M5-02: Pagination loop
|
||||||
|
- [ ] M5-03: Integrate fetcher→parser→storage
|
||||||
|
- [ ] M5-04: Progress reporting
|
||||||
|
- [ ] M5-05: Graceful shutdown
|
||||||
|
- [ ] M5-06: Summary report
|
||||||
|
- [ ] M5-07: Wire CLI entrypoint
|
||||||
|
- [ ] M5-08: Error code taxonomy
|
||||||
|
- [ ] M5-09: Controller integration test
|
||||||
|
|
||||||
|
Milestone 6 – Nice-to-haves
|
||||||
|
- [ ] M6-01: Date-based subdir option
|
||||||
|
- [ ] M6-02: Optional compression on rollover
|
||||||
|
|
||||||
|
Milestone 7 – Performance
|
||||||
|
- [ ] M7-01: Performance runbook
|
||||||
|
- [ ] M7-02: Benchmark tuning
|
||||||
|
|
||||||
|
Milestone 8 – Docs and Release
|
||||||
|
- [ ] M8-01: README expansion
|
||||||
|
- [ ] M8-02: Cron examples
|
||||||
|
- [ ] M8-03: Sample data
|
||||||
|
- [ ] M8-04: CI steps
|
||||||
|
- [ ] M8-05: Tag and build release
|
||||||
|
|
||||||
|
Milestone 9 – Verification
|
||||||
|
- [ ] M9-01: Post-implementation checklist
|
||||||
|
|
||||||
|
|
||||||
|
Progress notes:
|
||||||
|
- Created project skeleton and minimal main.go.
|
||||||
|
- Next: implement logging + config and update main.go to use them.
|
||||||
86
internal/config/config.go
Normal file
86
internal/config/config.go
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Config holds the runtime configuration for the scraper, as assembled by
// Load from command-line flags and environment variables.
type Config struct {
	// Keyword is the search keyword; Load rejects an empty value.
	Keyword string
	// Limit is the maximum number of posts to fetch.
	Limit int
	// Concurrency is the number of parallel requests.
	Concurrency int
	// RetryLimit is the number of retry attempts per request.
	RetryLimit int
	// RateLimitDelay is the rate-limit delay, parsed from a duration string
	// such as "1s" or "500ms".
	RateLimitDelay time.Duration
	// OutputDir is the directory that receives the JSONL output files.
	OutputDir string
	// DedupCachePath is the path of the dedup cache file.
	DedupCachePath string
	// LogLevel is the log level name, e.g. "INFO" or "DEBUG".
	LogLevel string
	// UserAgent is the value sent in the User-Agent header.
	UserAgent string
}
|
||||||
|
|
||||||
|
// Load reads configuration from flags and environment. Flags take precedence over env.
|
||||||
|
func Load() (*Config, error) {
|
||||||
|
keyword := flag.String("keyword", os.Getenv("KEYWORD"), "Search keyword (required)")
|
||||||
|
limit := flag.Int("limit", getEnvInt("LIMIT", 1000), "Max posts to fetch")
|
||||||
|
concurrency := flag.Int("concurrency", getEnvInt("CONCURRENCY", 4), "Parallel requests")
|
||||||
|
retryLimit := flag.Int("retry-limit", getEnvInt("RETRY_LIMIT", 5), "Retry attempts per request")
|
||||||
|
rateDelay := flag.String("rate-delay", getEnv("RATE_LIMIT_DELAY", "2s"), "Rate limit delay (e.g., 1s, 500ms)")
|
||||||
|
outputDir := flag.String("output", getEnv("OUTPUT_DIR", "./data"), "Output directory for JSONL files")
|
||||||
|
dedup := flag.String("dedup", getEnv("DEDUP_CACHE_PATH", "./data/dedup_ids.txt"), "Dedup cache path")
|
||||||
|
logLevel := flag.String("log-level", getEnv("LOG_LEVEL", "INFO"), "Log level")
|
||||||
|
userAgent := flag.String("user-agent", getEnv("USER_AGENT", "reddit-scraper-mvp/0.1 (+contact: you@example.com)"), "User-Agent header")
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if *keyword == "" {
|
||||||
|
return nil, fmt.Errorf("keyword is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
rd, err := time.ParseDuration(*rateDelay)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid RATE_LIMIT_DELAY: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &Config{
|
||||||
|
Keyword: *keyword,
|
||||||
|
Limit: *limit,
|
||||||
|
Concurrency: *concurrency,
|
||||||
|
RetryLimit: *retryLimit,
|
||||||
|
RateLimitDelay: rd,
|
||||||
|
OutputDir: *outputDir,
|
||||||
|
DedupCachePath: *dedup,
|
||||||
|
LogLevel: *logLevel,
|
||||||
|
UserAgent: *userAgent,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure output dir exists
|
||||||
|
if err := os.MkdirAll(filepath.Clean(cfg.OutputDir), 0o755); err != nil {
|
||||||
|
return nil, fmt.Errorf("creating output dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getEnv returns the value of the environment variable key, or def when the
// variable is unset or set to the empty string.
func getEnv(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}
|
||||||
|
|
||||||
|
// getEnvInt returns the environment variable key parsed as a base-10 integer.
// It returns def when the variable is unset, empty, or not a valid integer.
//
// The fallback on a malformed value is deliberately best-effort: the
// signature has no way to report the parse error to the caller.
func getEnvInt(key string, def int) int {
	// strconv.Atoi rejects the empty string, so the unset/empty case and the
	// malformed case both take the same fallback path.
	if n, err := strconv.Atoi(os.Getenv(key)); err == nil {
		return n
	}
	return def
}
|
||||||
15
internal/logging/logger.go
Normal file
15
internal/logging/logger.go
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
package logging
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Init builds a text logger that writes to standard error, drops records
// below the requested level, and returns it.
//
// level is a slog level name such as "DEBUG", "INFO", "WARN" or "ERROR", in
// the form accepted by slog.Level.UnmarshalText (case-insensitive). An empty
// or unrecognized value falls back to INFO.
//
// Init does not install the logger as the process-wide default; callers that
// want that should pass the result to slog.SetDefault.
func Init(level string) *slog.Logger {
	threshold := slog.LevelInfo
	if err := threshold.UnmarshalText([]byte(level)); err != nil {
		// Unknown level names are not fatal for logging setup; use INFO.
		threshold = slog.LevelInfo
	}

	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		AddSource: false,
		Level:     threshold,
	})
	return slog.New(handler)
}
|
||||||
Loading…
Reference in New Issue
Block a user