From 44c783a284e5ef4c4a2a6141641129dc58865cdb Mon Sep 17 00:00:00 2001 From: Sirin Puenggun Date: Sat, 18 Oct 2025 09:46:00 +0700 Subject: [PATCH] feat: add TUI --- .gitignore | 9 +- Makefile | 8 +- cmd/site-to-llmstxt/main.go | 360 ++++++++++ go.mod | 18 +- go.sum | 35 + internal/crawler/discover.go | 217 ++++++ internal/crawler/scrape.go | 245 +++++++ internal/crawler/util.go | 33 + internal/filters/filters.go | 18 +- internal/filters/filters_test.go | 24 +- internal/tui/model.go | 1084 ++++++++++++++++++++++++++++++ internal/utils/namer.go | 39 ++ 12 files changed, 2084 insertions(+), 6 deletions(-) create mode 100644 cmd/site-to-llmstxt/main.go create mode 100644 internal/crawler/discover.go create mode 100644 internal/crawler/scrape.go create mode 100644 internal/crawler/util.go create mode 100644 internal/tui/model.go create mode 100644 internal/utils/namer.go diff --git a/.gitignore b/.gitignore index d11b1c3..1737146 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,11 @@ *.dylib # Project binaries -crawler -site-to-llmstxt +/crawler +/site-to-llmstxt +.gocache/ +.gomodcache/ +main # Output directories output/ @@ -58,3 +61,5 @@ go.work.sum # Editor/IDE # .idea/ # .vscode/ + +bin/ diff --git a/Makefile b/Makefile index 6980e42..2e07b5a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for site-to-llmstxt crawler -.PHONY: build test clean run help fmt lint deps dev-setup +.PHONY: build test clean run tui demo help fmt lint deps dev-setup all # Variables BINARY_NAME=site-to-llmstxt @@ -15,6 +15,7 @@ help: @echo " test-coverage - Run tests with coverage" @echo " clean - Clean build artifacts" @echo " run - Run with example URL (requires URL variable)" + @echo " tui - Launch interactive terminal UI" @echo " fmt - Format code" @echo " lint - Lint code" @echo " deps - Install/update dependencies" @@ -24,6 +25,7 @@ help: @echo " make build" @echo " make run URL=https://example.com" @echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output" + @echo " make tui OUTPUT=./docs WORKERS=4" # Build the crawler build: @@ -98,3 +100,7 @@ all: clean deps fmt build test demo: build @echo "Running demo crawl of httpbin.org..." $(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose +# Launch interactive TUI +tui: + @echo "Launching TUI..." 
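+	@# OUTPUT and WORKERS are optional; $(if ...) adds each flag only when the variable is set.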
+ go run ./cmd/site-to-llmstxt tui $(if $(OUTPUT),--output $(OUTPUT)) $(if $(WORKERS),--workers $(WORKERS)) diff --git a/cmd/site-to-llmstxt/main.go b/cmd/site-to-llmstxt/main.go new file mode 100644 index 0000000..86b9325 --- /dev/null +++ b/cmd/site-to-llmstxt/main.go @@ -0,0 +1,360 @@ +package main + +import ( + "context" + "errors" + "fmt" + "log" + "net/url" + "os" + "os/signal" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "time" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly/v2" + "github.com/urfave/cli/v2" + + "github.com/Sosokker/site-to-llmstxt/internal/config" + "github.com/Sosokker/site-to-llmstxt/internal/filters" + "github.com/Sosokker/site-to-llmstxt/internal/generator" + "github.com/Sosokker/site-to-llmstxt/internal/models" + "github.com/Sosokker/site-to-llmstxt/internal/progress" + "github.com/Sosokker/site-to-llmstxt/internal/tui" + "github.com/Sosokker/site-to-llmstxt/internal/utils" +) + +func main() { + log.SetFlags(0) + + app := &cli.App{ + Name: "site-to-llmstxt", + Usage: "Crawl a documentation site and generate llms.txt outputs", + Commands: []*cli.Command{ + { + Name: "tui", + Usage: "Launch interactive terminal UI", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "output", + Usage: "Output directory", + Value: config.DefaultOutputDir, + }, + &cli.IntFlag{ + Name: "workers", + Usage: "Default worker count used during discovery", + Value: config.DefaultWorkers, + }, + }, + Action: func(cliCtx *cli.Context) error { + ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt) + defer cancel() + + opts := tui.Options{ + OutputDir: cliCtx.String("output"), + DefaultWorkers: cliCtx.Int("workers"), + } + + return tui.Run(ctx, opts) + }, + }, + }, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "url", + Aliases: []string{"u"}, + Usage: "Start `URL` to crawl (required)", + EnvVars: []string{"SITE_TO_LLMSTXT_URL"}, + }, + &cli.StringFlag{ + Name: "output", + Aliases: []string{"o"}, + Usage: "Output directory", + EnvVars: []string{"SITE_TO_LLMSTXT_OUTPUT"}, + Value: config.DefaultOutputDir, + }, + &cli.IntFlag{ + Name: "workers", + Aliases: []string{"w"}, + Usage: "Number of concurrent workers", + EnvVars: []string{"SITE_TO_LLMSTXT_WORKERS"}, + Value: config.DefaultWorkers, + }, + &cli.BoolFlag{ + Name: "verbose", + Usage: "Enable verbose progress logging", + EnvVars: []string{"SITE_TO_LLMSTXT_VERBOSE"}, + }, + }, + Action: func(cliCtx *cli.Context) error { + ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt) + defer cancel() + + cfg := &config.Config{ + URL: cliCtx.String("url"), + OutputDir: cliCtx.String("output"), + Workers: cliCtx.Int("workers"), + Verbose: cliCtx.Bool("verbose"), + } + + if cfg.Workers <= 0 { + cfg.Workers = config.DefaultWorkers + } + if cfg.OutputDir == "" { + cfg.OutputDir = config.DefaultOutputDir + } + + if err := cfg.Validate(); err != nil { + return err + } + + baseURL, err := url.Parse(cfg.URL) + if err != nil { + return fmt.Errorf("parse URL: %w", err) + } + + return crawlAndGenerate(ctx, baseURL, cfg) + }, + } + + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } +} + +func crawlAndGenerate(ctx context.Context, baseURL *url.URL, cfg *config.Config) error { + if err := utils.CreateOutputDirs(cfg.OutputDir); err != nil { + return err + } + + stats := &models.Stats{StartTime: time.Now()} + basePath := "" + if baseURL != nil { + basePath = baseURL.Path + } + statsMu := &sync.Mutex{} + progressManager := 
progress.New(cfg.Verbose, stats) + defer func() { + statsMu.Lock() + stats.Finish() + statsMu.Unlock() + progressManager.Finish() + }() + + pages := make([]models.PageInfo, 0, 128) + pagesMu := &sync.Mutex{} + namer := utils.NewUniqueNamer() + + var queued, processed int64 + + collector := colly.NewCollector( + colly.AllowedDomains(allowedDomains(baseURL.Host)...), + colly.Async(true), + ) + collector.SetRequestTimeout(30 * time.Second) + + if cfg.Workers > 0 { + if err := collector.Limit(&colly.LimitRule{ + DomainGlob: "*", + Parallelism: cfg.Workers, + RandomDelay: 500 * time.Millisecond, + }); err != nil { + return fmt.Errorf("configure collector: %w", err) + } + } + + collector.OnRequest(func(r *colly.Request) { + select { + case <-ctx.Done(): + r.Abort() + return + default: + } + + if cfg.Verbose { + progressManager.Log("Visiting %s", r.URL.String()) + } + }) + + collector.OnError(func(r *colly.Response, err error) { + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + progressManager.Log("Error fetching %s: %v", r.Request.URL, err) + }) + + collector.OnHTML("html", func(e *colly.HTMLElement) { + select { + case <-ctx.Done(): + return + default: + } + + pageURL := e.Request.URL.String() + + atomic.AddInt64(&queued, -1) + currentProcessed := atomic.AddInt64(&processed, 1) + defer func() { + progressManager.Update(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0))) + }() + + title := strings.TrimSpace(e.DOM.Find("title").First().Text()) + if title == "" { + title = "Untitled" + } + + description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", "")) + + markdown, err := htmltomarkdown.ConvertString(string(e.Response.Body)) + if err != nil { + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + progressManager.Log("Failed to convert %s: %v", pageURL, err) + return + } + markdown = strings.TrimSpace(markdown) + + if description == "" { + description = utils.ExtractFirstSentence(markdown) + } + + filename := utils.CreateFilename(title, pageURL) + filename = namer.Reserve(filename) + relativePath := filepath.Join(config.MarkdownSubdir, filename) + fullPath := filepath.Join(cfg.OutputDir, relativePath) + + if err := os.WriteFile(fullPath, []byte(markdown), 0644); err != nil { + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + progressManager.Log("Failed to write %s: %v", fullPath, err) + return + } + + pageInfo := models.PageInfo{ + URL: pageURL, + Title: title, + Content: markdown, + FilePath: relativePath, + CrawledAt: time.Now(), + Description: description, + } + + pagesMu.Lock() + pages = append(pages, pageInfo) + pagesMu.Unlock() + + statsMu.Lock() + stats.TotalPages++ + if filters.IsMainDocPage(pageURL) { + stats.MainDocPages++ + } else { + stats.SecondaryPages++ + } + statsMu.Unlock() + + e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) { + href, exists := sel.Attr("href") + if !exists || href == "" { + return + } + + absolute := e.Request.AbsoluteURL(href) + if absolute == "" { + return + } + + if !strings.HasPrefix(absolute, "http") { + return + } + + if filters.ShouldSkipURL(absolute, baseURL.Host, basePath) { + statsMu.Lock() + stats.AddSkipped() + statsMu.Unlock() + return + } + + select { + case <-ctx.Done(): + return + default: + } + + if err := collector.Visit(absolute); err != nil { + var alreadyVisited *colly.AlreadyVisitedError + if errors.As(err, &alreadyVisited) { + return + } + + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + progressManager.Log("Failed to queue %s: %v", absolute, err) + return + 
} + + atomic.AddInt64(&queued, 1) + }) + }) + + atomic.AddInt64(&queued, 1) + if err := collector.Visit(baseURL.String()); err != nil { + var alreadyVisited *colly.AlreadyVisitedError + if !errors.As(err, &alreadyVisited) { + return fmt.Errorf("start crawl: %w", err) + } + } + + collector.Wait() + + if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) { + return err + } + + if len(pages) == 0 { + return errors.New("no pages were crawled; check URL or filters") + } + + gen := generator.New(baseURL, cfg.OutputDir) + if err := gen.Generate(pages); err != nil { + return fmt.Errorf("generate outputs: %w", err) + } + + return nil +} + +func allowedDomains(host string) []string { + host = strings.TrimSpace(host) + if host == "" { + return nil + } + + domains := map[string]struct{}{ + host: {}, + } + + if strings.HasPrefix(host, "www.") { + domains[strings.TrimPrefix(host, "www.")] = struct{}{} + } else { + domains["www."+host] = struct{}{} + } + + list := make([]string, 0, len(domains)) + for d := range domains { + list = append(list, d) + } + return list +} + +func max64(a, b int64) int64 { + if a > b { + return a + } + return b +} diff --git a/go.mod b/go.mod index d6e83ce..f9d5ad5 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,10 @@ go 1.24.5 require ( github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3 + github.com/PuerkitoBio/goquery v1.10.3 + github.com/charmbracelet/bubbles v0.17.1 + github.com/charmbracelet/bubbletea v0.25.0 + github.com/charmbracelet/lipgloss v0.9.1 github.com/gocolly/colly/v2 v2.2.0 github.com/schollz/progressbar/v3 v3.18.0 github.com/urfave/cli/v2 v2.27.7 @@ -11,18 +15,29 @@ require ( require ( github.com/JohannesKaufmann/dom v0.2.0 // indirect - github.com/PuerkitoBio/goquery v1.10.3 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect github.com/antchfx/htmlquery v1.3.4 // indirect github.com/antchfx/xmlquery v1.4.4 // indirect github.com/antchfx/xpath v1.3.4 // indirect + github.com/atotto/clipboard v0.1.4 // indirect + github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect + github.com/charmbracelet/harmonica v0.2.0 // indirect + github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-localereader v0.0.1 // indirect + github.com/mattn/go-runewidth v0.0.16 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect + github.com/muesli/cancelreader v0.2.2 // indirect + github.com/muesli/reflow v0.3.0 // indirect + github.com/muesli/termenv v0.16.0 // indirect github.com/nlnwa/whatwg-url v0.6.2 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect @@ -30,6 +45,7 @@ require ( github.com/temoto/robotstxt v1.1.2 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect golang.org/x/net v0.42.0 // indirect + golang.org/x/sync v0.16.0 // indirect golang.org/x/sys v0.34.0 // indirect golang.org/x/term v0.33.0 // indirect golang.org/x/text v0.27.0 // indirect diff --git a/go.sum b/go.sum index 
3ae9704..e014d3f 100644 --- a/go.sum +++ b/go.sum @@ -13,11 +13,25 @@ github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fus github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4= github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= +github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= +github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= +github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/charmbracelet/bubbles v0.17.1 h1:0SIyjOnkrsfDo88YvPgAWvZMwXe26TP6drRvmkjyUu4= +github.com/charmbracelet/bubbles v0.17.1/go.mod h1:9HxZWlkCqz2PRwsCbYl7a3KXvGzFaDHpYbSYMJ+nE3o= +github.com/charmbracelet/bubbletea v0.25.0 h1:bAfwk7jRz7FKFl9RzlIULPkStffg5k6pNt5dywy4TcM= +github.com/charmbracelet/bubbletea v0.25.0/go.mod h1:EN3QDR1T5ZdWmdfDzYcqOCAps45+QIJbLOBxmVNWNNg= +github.com/charmbracelet/harmonica v0.2.0 h1:8NxJWRWg/bzKqqEaaeFNipOu77YR5t8aSwG4pgaUBiQ= +github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao= +github.com/charmbracelet/lipgloss v0.9.1 h1:PNyd3jvaJbg4jRHKWXnCj1akQm4rh8dbEzN1p/u1KWg= +github.com/charmbracelet/lipgloss v0.9.1/go.mod h1:1mPmG4cxScwUQALAAnacHaigiiHB9Pmr+v1VEawJl6I= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= +github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 h1:q2hJAaP1k2wIvVRd/hEHD7lacgqrCPS+k8g1MndzfWY= +github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -39,14 +53,31 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= +github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= +github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/mattn/go-runewidth v0.0.16 
h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b h1:1XF24mVaiu7u+CFywTdcDo2ie1pzzhwjt6RHqzpMU34= +github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b/go.mod h1:fQuZ0gauxyBcmsdE3ZT4NasjaRdxmbCS0jRHsrWu3Ho= +github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= +github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= +github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= +github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= +github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= +github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q= github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= @@ -103,12 +134,16 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/internal/crawler/discover.go b/internal/crawler/discover.go new file mode 100644 index 0000000..a71f2b8 --- /dev/null +++ 
b/internal/crawler/discover.go @@ -0,0 +1,217 @@ +package crawler + +import ( + "context" + "errors" + "fmt" + "net/url" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly/v2" + + "github.com/Sosokker/site-to-llmstxt/internal/filters" + "github.com/Sosokker/site-to-llmstxt/internal/models" +) + +// PageSummary captures metadata for a crawled URL prior to full scraping. +type PageSummary struct { + URL string + Title string + Description string + Path string + Depth int +} + +// DiscoverOptions configure the URL discovery stage. +type DiscoverOptions struct { + BaseURL *url.URL + Workers int + OnLog func(string, ...interface{}) + OnProgress func(processed, queued int) +} + +// Discover traverses links starting from the base URL and returns unique pages. +func Discover(ctx context.Context, opts DiscoverOptions) ([]PageSummary, *models.Stats, error) { + if opts.BaseURL == nil { + return nil, nil, errors.New("base URL is required") + } + + stats := &models.Stats{StartTime: time.Now()} + basePath := "" + if opts.BaseURL != nil { + basePath = opts.BaseURL.Path + } + statsMu := &sync.Mutex{} + var ( + mu sync.Mutex + pages = make([]PageSummary, 0, 128) + seen = make(map[string]struct{}) + queued int64 + processed int64 + ) + + collector := colly.NewCollector( + colly.AllowedDomains(allowedDomains(opts.BaseURL.Host)...), + colly.Async(true), + ) + collector.SetRequestTimeout(30 * time.Second) + + if opts.Workers > 0 { + if err := collector.Limit(&colly.LimitRule{ + DomainGlob: "*", + Parallelism: opts.Workers, + RandomDelay: 500 * time.Millisecond, + }); err != nil { + return nil, nil, fmt.Errorf("configure collector: %w", err) + } + } + + collector.OnRequest(func(r *colly.Request) { + select { + case <-ctx.Done(): + r.Abort() + return + default: + } + + if opts.OnLog != nil { + opts.OnLog("discover: visiting %s", r.URL.String()) + } + }) + + collector.OnError(func(r *colly.Response, err error) { + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + if opts.OnLog != nil { + opts.OnLog("discover: error fetching %s: %v", r.Request.URL, err) + } + }) + + collector.OnHTML("html", func(e *colly.HTMLElement) { + select { + case <-ctx.Done(): + return + default: + } + + pageURL := e.Request.URL.String() + atomic.AddInt64(&queued, -1) + currentProcessed := atomic.AddInt64(&processed, 1) + defer func() { + if opts.OnProgress != nil { + opts.OnProgress(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0))) + } + }() + + mu.Lock() + if _, ok := seen[pageURL]; ok { + mu.Unlock() + } else { + seen[pageURL] = struct{}{} + mu.Unlock() + + statsMu.Lock() + stats.TotalPages++ + if filters.IsMainDocPage(pageURL) { + stats.MainDocPages++ + } else { + stats.SecondaryPages++ + } + statsMu.Unlock() + + title := strings.TrimSpace(e.DOM.Find("title").First().Text()) + if title == "" { + title = "Untitled" + } + + description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", "")) + if description == "" { + description = guessDescription(e.DOM) + } + + summary := PageSummary{ + URL: pageURL, + Title: title, + Description: description, + Path: e.Request.URL.Path, + Depth: e.Request.Depth, + } + + mu.Lock() + pages = append(pages, summary) + mu.Unlock() + } + + e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) { + href, exists := sel.Attr("href") + if !exists || href == "" { + return + } + + absolute := e.Request.AbsoluteURL(href) + if absolute == "" || !strings.HasPrefix(absolute, "http") { + return + } + + if 
filters.ShouldSkipURL(absolute, opts.BaseURL.Host, basePath) { + statsMu.Lock() + stats.AddSkipped() + statsMu.Unlock() + return + } + + if err := collector.Visit(absolute); err != nil { + var alreadyVisited *colly.AlreadyVisitedError + if errors.As(err, &alreadyVisited) { + return + } + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + if opts.OnLog != nil { + opts.OnLog("discover: failed to queue %s: %v", absolute, err) + } + return + } + + atomic.AddInt64(&queued, 1) + }) + }) + + atomic.AddInt64(&queued, 1) + if err := collector.Visit(opts.BaseURL.String()); err != nil { + var alreadyVisited *colly.AlreadyVisitedError + if !errors.As(err, &alreadyVisited) { + return nil, nil, fmt.Errorf("start discovery: %w", err) + } + } + + collector.Wait() + + if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) { + return nil, nil, err + } + + stats.Finish() + return pages, stats, nil +} + +func guessDescription(sel *goquery.Selection) string { + selection := sel.Find("p") + for i := range selection.Nodes { + paragraph := selection.Eq(i).Text() + paragraph = strings.TrimSpace(paragraph) + if paragraph != "" { + if len(paragraph) > 240 { + return paragraph[:240] + "..." + } + return paragraph + } + } + return "" +} diff --git a/internal/crawler/scrape.go b/internal/crawler/scrape.go new file mode 100644 index 0000000..e70863d --- /dev/null +++ b/internal/crawler/scrape.go @@ -0,0 +1,245 @@ +package crawler + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "time" + + htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" + "github.com/PuerkitoBio/goquery" + + "github.com/Sosokker/site-to-llmstxt/internal/config" + "github.com/Sosokker/site-to-llmstxt/internal/filters" + "github.com/Sosokker/site-to-llmstxt/internal/models" + "github.com/Sosokker/site-to-llmstxt/internal/utils" +) + +// ProgressUpdate conveys scraping progress for interactive UIs. +type ProgressUpdate struct { + Completed int + Total int + URL string +} + +// LogUpdate captures a log line emitted during scraping. +type LogUpdate struct { + Message string +} + +// ScrapeOptions configure the download stage for selected pages. +type ScrapeOptions struct { + BaseURL *url.URL + Pages []PageSummary + Output string + Workers int + Verbose bool + Logs chan<- LogUpdate + Progress chan<- ProgressUpdate +} + +// Scrape fetches each provided page, writes Markdown output, and returns results. 
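+// Progress and log updates are delivered on the optional channels via
+// non-blocking sends, so a slow or absent consumer never stalls the workers.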
+func Scrape(ctx context.Context, opts ScrapeOptions) ([]models.PageInfo, *models.Stats, error) { + if opts.BaseURL == nil { + return nil, nil, errors.New("base URL is required") + } + if len(opts.Pages) == 0 { + return nil, nil, errors.New("no pages selected for scraping") + } + + if opts.Workers <= 0 { + opts.Workers = config.DefaultWorkers + } + + if err := utils.CreateOutputDirs(opts.Output); err != nil { + return nil, nil, err + } + + client := &http.Client{ + Timeout: 30 * time.Second, + } + + stats := &models.Stats{ + StartTime: time.Now(), + TotalPages: len(opts.Pages), + MainDocPages: 0, + SecondaryPages: 0, + SkippedURLs: 0, + ErrorCount: 0, + } + statsMu := &sync.Mutex{} + + for _, page := range opts.Pages { + if filters.IsMainDocPage(page.URL) { + stats.MainDocPages++ + } else { + stats.SecondaryPages++ + } + } + + namer := utils.NewUniqueNamer() + results := make([]models.PageInfo, 0, len(opts.Pages)) + resultsMu := &sync.Mutex{} + + var completed int32 + progressTotal := len(opts.Pages) + + pageCh := make(chan PageSummary) + wg := sync.WaitGroup{} + errOnce := sync.Once{} + var firstErr error + + sendLog := func(always bool, format string, args ...interface{}) { + if !always && !opts.Verbose { + return + } + if opts.Logs == nil { + return + } + msg := fmt.Sprintf(format, args...) + select { + case opts.Logs <- LogUpdate{Message: msg}: + default: + } + } + + sendProgress := func(url string) { + if opts.Progress == nil { + return + } + done := int(atomic.AddInt32(&completed, 1)) + select { + case opts.Progress <- ProgressUpdate{ + Completed: done, + Total: progressTotal, + URL: url, + }: + default: + } + } + + for i := 0; i < opts.Workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for page := range pageCh { + select { + case <-ctx.Done(): + return + default: + } + + info, err := fetchPage(ctx, client, page, opts.Output, namer) + if err != nil { + errOnce.Do(func() { firstErr = err }) + statsMu.Lock() + stats.AddError() + statsMu.Unlock() + sendLog(true, "error scraping %s: %v", page.URL, err) + sendProgress(page.URL) + continue + } + + resultsMu.Lock() + results = append(results, info) + resultsMu.Unlock() + + sendLog(false, "scraped %s", page.URL) + sendProgress(page.URL) + } + }() + } + + go func() { + for _, page := range opts.Pages { + pageCh <- page + } + close(pageCh) + }() + + wg.Wait() + + stats.Finish() + + if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) { + return results, stats, err + } + + return results, stats, firstErr +} + +func fetchPage(ctx context.Context, client *http.Client, page PageSummary, outputDir string, namer *utils.UniqueNamer) (models.PageInfo, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, page.URL, nil) + if err != nil { + return models.PageInfo{}, err + } + + resp, err := client.Do(req) + if err != nil { + return models.PageInfo{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + io.Copy(io.Discard, resp.Body) + return models.PageInfo{}, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } + + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + return models.PageInfo{}, err + } + + htmlContent := string(bodyBytes) + markdown, err := htmltomarkdown.ConvertString(htmlContent) + if err != nil { + return models.PageInfo{}, err + } + markdown = strings.TrimSpace(markdown) + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) + if err != nil { + return models.PageInfo{}, err + } + + title := 
strings.TrimSpace(doc.Find("title").First().Text()) + if title == "" { + title = page.Title + } + + description := strings.TrimSpace(doc.Find(`meta[name="description"]`).AttrOr("content", "")) + if description == "" { + description = page.Description + } + if description == "" { + description = utils.ExtractFirstSentence(markdown) + } + + filename := utils.CreateFilename(title, page.URL) + filename = namer.Reserve(filename) + relativePath := filepath.Join(config.MarkdownSubdir, filename) + fullPath := filepath.Join(outputDir, relativePath) + + if err := os.WriteFile(fullPath, []byte(markdown), 0o644); err != nil { + return models.PageInfo{}, err + } + + info := models.PageInfo{ + URL: page.URL, + Title: title, + Content: markdown, + FilePath: relativePath, + CrawledAt: time.Now(), + Description: description, + } + + return info, nil +} diff --git a/internal/crawler/util.go b/internal/crawler/util.go new file mode 100644 index 0000000..086b574 --- /dev/null +++ b/internal/crawler/util.go @@ -0,0 +1,33 @@ +package crawler + +import "strings" + +func allowedDomains(host string) []string { + host = strings.TrimSpace(host) + if host == "" { + return nil + } + + domains := map[string]struct{}{ + host: {}, + } + + if strings.HasPrefix(host, "www.") { + domains[strings.TrimPrefix(host, "www.")] = struct{}{} + } else { + domains["www."+host] = struct{}{} + } + + list := make([]string, 0, len(domains)) + for d := range domains { + list = append(list, d) + } + return list +} + +func max64(a, b int64) int64 { + if a > b { + return a + } + return b +} diff --git a/internal/filters/filters.go b/internal/filters/filters.go index f9ae534..dc569e5 100644 --- a/internal/filters/filters.go +++ b/internal/filters/filters.go @@ -31,7 +31,7 @@ var SecondaryPageIndicators = []string{ } // ShouldSkipURL determines if a URL should be skipped based on various filters. 
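+// A non-empty basePath restricts crawling to URLs at or below the starting
+// path; language segments that appear in basePath itself are not treated as
+// skippable variants.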
-func ShouldSkipURL(rawURL, baseHost string) bool { +func ShouldSkipURL(rawURL, baseHost, basePath string) bool { if rawURL == "" { return true } @@ -53,10 +53,26 @@ func ShouldSkipURL(rawURL, baseHost string) bool { } lowerURL := strings.ToLower(rawURL) + basePathLower := strings.ToLower(basePath) + if basePathLower != "" && !strings.HasPrefix(basePathLower, "/") { + basePathLower = "/" + basePathLower + } + basePathLower = strings.TrimRight(basePathLower, "/") + + candidatePath := strings.ToLower(u.Path) + baseClean := strings.TrimSuffix(basePathLower, "/") + if baseClean != "" && baseClean != "/" { + if candidatePath != baseClean && !strings.HasPrefix(candidatePath, baseClean+"/") { + return true + } + } // Skip language variants for _, lang := range LanguageIndicators { if strings.Contains(lowerURL, lang) { + if basePathLower != "" && strings.Contains(basePathLower, lang) { + continue + } return true } } diff --git a/internal/filters/filters_test.go b/internal/filters/filters_test.go index e2d6d87..b33c7fd 100644 --- a/internal/filters/filters_test.go +++ b/internal/filters/filters_test.go @@ -7,55 +7,77 @@ func TestShouldSkipURL(t *testing.T) { name string url string baseHost string + basePath string want bool }{ { name: "Normal URL", url: "https://example.com/docs", baseHost: "example.com", + basePath: "", want: false, }, { name: "Language URL - en", url: "https://example.com/en/docs", baseHost: "example.com", + basePath: "", want: true, }, { name: "Language URL - zh", url: "https://example.com/zh/docs", baseHost: "example.com", + basePath: "", want: true, }, { name: "PDF file", url: "https://example.com/doc.pdf", baseHost: "example.com", + basePath: "", want: true, }, { name: "ZIP file", url: "https://example.com/download.zip", baseHost: "example.com", + basePath: "", want: true, }, { name: "Fragment URL", url: "https://example.com/docs#section", baseHost: "example.com", + basePath: "", want: true, }, { name: "External domain", url: "https://other.com/docs", baseHost: "example.com", + basePath: "", want: true, }, + { + name: "Language URL outside base path", + url: "https://example.com/en/reference", + baseHost: "example.com", + basePath: "/en/docs/", + want: true, + }, + { + name: "Sub path inside base", + url: "https://example.com/en/docs/liff/guide", + baseHost: "example.com", + basePath: "/en/docs/liff/", + want: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := ShouldSkipURL(tt.url, tt.baseHost); got != tt.want { + if got := ShouldSkipURL(tt.url, tt.baseHost, tt.basePath); got != tt.want { t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want) } }) diff --git a/internal/tui/model.go b/internal/tui/model.go new file mode 100644 index 0000000..5ebb183 --- /dev/null +++ b/internal/tui/model.go @@ -0,0 +1,1084 @@ +package tui + +import ( + "context" + "fmt" + "net/url" + "sort" + "strings" + "time" + + "github.com/charmbracelet/bubbles/progress" + "github.com/charmbracelet/bubbles/spinner" + "github.com/charmbracelet/bubbles/textinput" + "github.com/charmbracelet/bubbles/viewport" + tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/lipgloss" + + "github.com/Sosokker/site-to-llmstxt/internal/config" + "github.com/Sosokker/site-to-llmstxt/internal/crawler" + "github.com/Sosokker/site-to-llmstxt/internal/generator" + "github.com/Sosokker/site-to-llmstxt/internal/models" + "github.com/Sosokker/site-to-llmstxt/internal/utils" +) + +type Options struct { + OutputDir string + DefaultWorkers int +} + +type appState int + +const ( + 
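+	// Screens in the order the user normally encounters them.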
stateInput appState = iota + stateConfig + stateDiscover + stateRun + stateSelect + stateFinalize + stateDone + stateError +) + +type runStage int + +const ( + runStageInitial runStage = iota + runStageManual +) + +type configMode int + +const ( + configModePreflight configMode = iota + configModeSelection +) + +type entryType int + +const ( + entryGroup entryType = iota + entryPage +) + +type listEntry struct { + Type entryType + Group string + Page crawler.PageSummary + DisplayName string +} + +type discoveryResultMsg struct { + pages []crawler.PageSummary + stats *models.Stats + err error +} + +type progressUpdateMsg struct { + update crawler.ProgressUpdate +} + +type logUpdateMsg struct { + update crawler.LogUpdate +} + +type scrapeResultMsg struct { + stats *models.Stats + pages []models.PageInfo + err error +} + +type finalizeResultMsg struct { + err error +} + +// Model represents the interactive TUI state. +type Model struct { + state appState + runStage runStage + configMode configMode + defaultWorkers int + + ctx context.Context + cancel context.CancelFunc + + baseURL *url.URL + outputDir string + verbose bool + discoverMsg string + + urlInput textinput.Model + workerInput textinput.Model + outputInput textinput.Model + + spinner spinner.Model + progress progress.Model + + logViewport viewport.Model + listViewport viewport.Model + + width int + height int + errMsg string + + pages []crawler.PageSummary + totalPages int + groups map[string][]crawler.PageSummary + groupOrder []string + expanded map[string]bool + selected map[string]bool + entries []listEntry + cursor int + + progressCh chan crawler.ProgressUpdate + logCh chan crawler.LogUpdate + logLines []string + + scraped map[string]models.PageInfo + scrapedSeq []models.PageInfo + lastStats *models.Stats + + progressDone int + progressTotal int + configFocus int +} + +// NewModel constructs a new Bubble Tea model. +func NewModel(ctx context.Context, opts Options) *Model { + if opts.OutputDir == "" { + opts.OutputDir = config.DefaultOutputDir + } + if opts.DefaultWorkers <= 0 { + opts.DefaultWorkers = config.DefaultWorkers + } + + c, cancel := context.WithCancel(ctx) + + urlInput := textinput.New() + urlInput.Placeholder = "https://docs.example.com" + urlInput.Focus() + + workerInput := textinput.New() + workerInput.Placeholder = fmt.Sprintf("%d", opts.DefaultWorkers) + workerInput.SetValue(fmt.Sprintf("%d", opts.DefaultWorkers)) + workerInput.CharLimit = 3 + + outputInput := textinput.New() + outputInput.Placeholder = opts.OutputDir + outputInput.SetValue(opts.OutputDir) + + sp := spinner.New() + sp.Spinner = spinner.Dot + + model := &Model{ + state: stateInput, + runStage: runStageInitial, + configMode: configModePreflight, + defaultWorkers: opts.DefaultWorkers, + ctx: c, + cancel: cancel, + outputDir: opts.OutputDir, + urlInput: urlInput, + workerInput: workerInput, + outputInput: outputInput, + spinner: sp, + progress: progress.New(progress.WithDefaultGradient()), + logViewport: viewport.New(0, 0), + listViewport: viewport.New(0, 0), + groups: make(map[string][]crawler.PageSummary), + expanded: make(map[string]bool), + selected: make(map[string]bool), + scraped: make(map[string]models.PageInfo), + } + model.width = 80 + model.height = 24 + model.recalculateLayout() + return model +} + +// Init implements tea.Model. +func (m *Model) Init() tea.Cmd { + return textinput.Blink +} + +// Update handles updates from Bubble Tea. 
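+// Window-resize and Ctrl+C messages are handled globally; everything else is
+// routed to the handler for the current state.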
+func (m *Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case tea.WindowSizeMsg: + m.width, m.height = typed.Width, typed.Height + m.recalculateLayout() + case tea.KeyMsg: + if typed.Type == tea.KeyCtrlC { + m.cancel() + return m, tea.Quit + } + } + + switch m.state { + case stateInput: + return m.updateInput(msg) + case stateConfig: + return m.updateConfig(msg) + case stateDiscover: + return m.updateDiscover(msg) + case stateRun: + return m.updateRun(msg) + case stateSelect: + return m.updateSelect(msg) + case stateFinalize: + return m.updateFinalize(msg) + case stateDone, stateError: + return m.updateTerminal(msg) + default: + return m, nil + } +} + +func (m *Model) updateInput(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case tea.KeyMsg: + if typed.Type == tea.KeyEnter { + raw := strings.TrimSpace(m.urlInput.Value()) + if raw == "" { + m.errMsg = "URL is required" + return m, nil + } + + parsed, err := url.Parse(raw) + if err != nil || parsed.Scheme == "" || parsed.Host == "" { + m.errMsg = "Enter a valid URL with scheme and host" + return m, nil + } + + m.baseURL = parsed + m.errMsg = "" + m.configMode = configModePreflight + m.workerInput.SetValue(fmt.Sprintf("%d", m.defaultWorkers)) + m.outputInput.SetValue(m.outputDir) + m.state = stateConfig + m.discoverMsg = "Preparing crawl..." + return m, m.setConfigFocus(0) + } + } + + var cmd tea.Cmd + m.urlInput, cmd = m.urlInput.Update(msg) + return m, cmd +} + +func (m *Model) updateConfig(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case tea.KeyMsg: + switch typed.String() { + case "enter": + output := strings.TrimSpace(m.outputInput.Value()) + if output == "" { + output = m.outputDir + } + if output == "" { + m.errMsg = "Output directory cannot be empty" + return m, nil + } + m.outputDir = output + + if m.configMode == configModePreflight { + workers, err := parseWorkers(m.workerInput.Value(), m.defaultWorkers) + if err != nil { + m.errMsg = err.Error() + return m, nil + } + m.defaultWorkers = workers + m.errMsg = "" + m.state = stateDiscover + host := "site" + if m.baseURL != nil && m.baseURL.Host != "" { + host = m.baseURL.Host + } + m.discoverMsg = fmt.Sprintf("Crawling %s...", host) + return m, tea.Batch( + m.spinner.Tick, + startDiscovery(m.ctx, m.baseURL, workers), + ) + } + + m.errMsg = "" + return m, m.generateOutputs() + case "r": + if m.configMode == configModeSelection { + workers, err := parseWorkers(m.workerInput.Value(), m.defaultWorkers) + if err != nil { + m.errMsg = err.Error() + return m, nil + } + m.defaultWorkers = workers + m.errMsg = "" + return m, m.beginScrape(m.pages, workers, runStageManual) + } + case "tab", "shift+tab", "down", "up": + next := m.configFocus + if typed.String() == "tab" || typed.String() == "down" { + next = (next + 1) % 2 + } else { + next = (next - 1 + 2) % 2 + } + return m, m.setConfigFocus(next) + case "v": + m.verbose = !m.verbose + return m, nil + case "esc": + if m.configMode == configModePreflight { + m.state = stateInput + } else { + m.state = stateSelect + } + m.errMsg = "" + return m, nil + } + } + + var cmd tea.Cmd + switch m.configFocus { + case 0: + m.workerInput, cmd = m.workerInput.Update(msg) + case 1: + m.outputInput, cmd = m.outputInput.Update(msg) + default: + cmd = nil + } + return m, cmd +} + +func (m *Model) updateDiscover(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case spinner.TickMsg: + var cmd tea.Cmd + m.spinner, cmd = m.spinner.Update(typed) + return m, cmd + 
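+	// Discovery finished: record the pages, build the group tree, and
+	// immediately scrape everything once so the selection list has content.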
case discoveryResultMsg: + if typed.err != nil { + m.state = stateError + m.errMsg = typed.err.Error() + return m, nil + } + + m.pages = typed.pages + m.totalPages = len(typed.pages) + m.buildGroups() + m.rebuildEntries() + return m, m.beginScrape(typed.pages, m.defaultWorkers, runStageInitial) + } + return m, nil +} + +func (m *Model) updateRun(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case progressUpdateMsg: + m.progressDone = typed.update.Completed + m.progressTotal = typed.update.Total + pct := 0.0 + if m.progressTotal > 0 { + pct = float64(m.progressDone) / float64(m.progressTotal) + } + cmd := m.progress.SetPercent(pct) + if m.progressDone < m.progressTotal { + return m, tea.Batch(cmd, listenProgress(m.progressCh)) + } + return m, cmd + case logUpdateMsg: + m.logLines = append(m.logLines, typed.update.Message) + m.logViewport.SetContent(strings.Join(m.logLines, "\n")) + if len(m.logLines) > 0 { + m.logViewport.GotoBottom() + } + return m, listenLogs(m.logCh) + case scrapeResultMsg: + m.lastStats = typed.stats + if typed.err != nil { + m.state = stateError + m.errMsg = typed.err.Error() + return m, nil + } + + if typed.pages != nil { + m.scrapedSeq = typed.pages + m.scraped = make(map[string]models.PageInfo, len(typed.pages)) + for _, info := range typed.pages { + m.scraped[info.URL] = info + } + if m.runStage == runStageInitial && len(m.selected) == 0 { + for _, info := range typed.pages { + m.selected[info.URL] = true + } + } + if m.runStage == runStageManual { + for url := range m.selected { + if _, ok := m.scraped[url]; !ok { + delete(m.selected, url) + } + } + for _, info := range typed.pages { + m.selected[info.URL] = true + } + } + } + + m.state = stateSelect + m.configMode = configModeSelection + m.cursor = 0 + m.logLines = nil + m.logViewport.SetContent("") + m.listViewport.SetYOffset(0) + m.runStage = runStageInitial + return m, nil + } + return m, nil +} + +func (m *Model) updateSelect(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case tea.KeyMsg: + switch typed.String() { + case "down", "j": + if m.cursor < len(m.entries)-1 { + m.cursor++ + m.ensureListCursorVisible() + } + case "up", "k": + if m.cursor > 0 { + m.cursor-- + m.ensureListCursorVisible() + } + case "right", "l": + m.toggleExpand(true) + m.ensureListCursorVisible() + case "left", "h": + m.toggleExpand(false) + m.ensureListCursorVisible() + case "tab": + m.jumpGroup(true) + m.ensureListCursorVisible() + case "shift+tab": + m.jumpGroup(false) + m.ensureListCursorVisible() + case " ": + m.toggleSelection() + m.ensureListCursorVisible() + case "enter": + if len(m.selected) == 0 { + m.errMsg = "Select at least one page" + return m, nil + } + m.errMsg = "" + return m, m.generateOutputs() + case "c": + m.workerInput.SetValue(fmt.Sprintf("%d", m.defaultWorkers)) + m.outputInput.SetValue(m.outputDir) + m.configMode = configModeSelection + m.errMsg = "" + m.state = stateConfig + return m, m.setConfigFocus(0) + case "v": + m.verbose = !m.verbose + return m, nil + case "esc": + m.state = stateInput + m.errMsg = "" + return m, nil + } + } + return m, nil +} + +func (m *Model) updateFinalize(msg tea.Msg) (tea.Model, tea.Cmd) { + switch typed := msg.(type) { + case spinner.TickMsg: + var cmd tea.Cmd + m.spinner, cmd = m.spinner.Update(typed) + return m, cmd + case finalizeResultMsg: + if typed.err != nil { + m.state = stateError + m.errMsg = typed.err.Error() + return m, nil + } + m.state = stateDone + return m, nil + } + return m, nil +} + +func (m *Model) updateTerminal(msg 
tea.Msg) (tea.Model, tea.Cmd) { + if key, ok := msg.(tea.KeyMsg); ok && key.Type == tea.KeyEnter { + return m, tea.Quit + } + return m, nil +} + +// View renders the active state. +func (m *Model) View() string { + switch m.state { + case stateInput: + return m.viewInput() + case stateConfig: + return m.viewConfig() + case stateDiscover: + return m.viewDiscover() + case stateRun: + return m.viewRun() + case stateSelect: + return m.viewSelect() + case stateFinalize: + return m.viewFinalize() + case stateDone: + return m.viewDone() + case stateError: + return m.viewError() + default: + return "" + } +} + +func (m *Model) viewInput() string { + var b strings.Builder + b.WriteString(titleStyle.Render("Site to LLMs.txt – TUI")) + b.WriteString("\n\n") + b.WriteString(instructionStyle.Render("Enter the base documentation URL:")) + b.WriteString("\n\n") + b.WriteString(m.urlInput.View()) + b.WriteString("\n\n") + b.WriteString(hintStyle.Render("Press Enter to configure crawl settings, Ctrl+C to quit.")) + if m.errMsg != "" { + b.WriteString("\n\n") + b.WriteString(statusStyle.Render(m.errMsg)) + } + return panelStyle.Render(b.String()) +} + +func (m *Model) viewConfig() string { + var b strings.Builder + if m.configMode == configModePreflight { + b.WriteString(titleStyle.Render("Configure Crawl")) + } else { + b.WriteString(titleStyle.Render("Export Settings")) + } + b.WriteString("\n") + + b.WriteString(infoStyle.Render(fmt.Sprintf("Workers: %s", m.workerInput.View()))) + b.WriteString("\n") + b.WriteString(infoStyle.Render(fmt.Sprintf("Output directory: %s", m.outputInput.View()))) + b.WriteString("\n") + if m.verbose { + b.WriteString(infoStyle.Render("Verbose logging: on (toggle with 'v')")) + } else { + b.WriteString(infoStyle.Render("Verbose logging: off (toggle with 'v')")) + } + b.WriteString("\n\n") + + if m.configMode == configModePreflight { + b.WriteString(instructionStyle.Render("Enter to start crawling · Tab to switch fields · Esc to go back")) + } else { + b.WriteString(instructionStyle.Render("Enter to generate outputs · 'r' to rescrape · Esc to return")) + } + + if m.errMsg != "" { + b.WriteString("\n\n") + b.WriteString(statusStyle.Render(m.errMsg)) + } + + return panelStyle.Render(b.String()) +} + +func (m *Model) viewDiscover() string { + content := fmt.Sprintf("%s %s\n\n%s", + m.spinner.View(), + instructionStyle.Render(m.discoverMsg), + hintStyle.Render("Press Ctrl+C to cancel."), + ) + return panelStyle.Render(content) +} + +func (m *Model) viewRun() string { + title := "Scraping pages..." + if m.runStage == runStageManual { + title = "Rescraping pages..." 
+ } + + content := lipgloss.JoinVertical( + lipgloss.Left, + accentStyle.Render(fmt.Sprintf("%s (%d pages)", title, m.progressTotal)), + m.progress.View(), + "", + accentStyle.Render("Logs:"), + m.logViewport.View(), + ) + return panelStyle.Render(content) +} + +func (m *Model) viewSelect() string { + listWidth := max(m.width-6, 20) + listHeight := max(m.height-8, 6) + m.listViewport.Width = listWidth + m.listViewport.Height = listHeight + + var list strings.Builder + for i, entry := range m.entries { + cursor := " " + if i == m.cursor { + cursor = accentStyle.Render("> ") + } + switch entry.Type { + case entryGroup: + selectedCount := m.countSelectedInGroup(entry.Group) + total := len(m.groups[entry.Group]) + symbol := plusStyle.Render("+") + if m.expanded[entry.Group] { + symbol = plusStyle.Render("−") + } + check := checkboxEmptyStyle.Render("[ ]") + if selectedCount == total && total > 0 { + check = checkboxFilledStyle.Render("[x]") + } else if selectedCount > 0 { + check = checkboxMixedStyle.Render("[•]") + } + display := entry.DisplayName + if display == "" { + display = "(root)" + } + line := fmt.Sprintf("%s%s %s %s (%d/%d)", cursor, symbol, check, groupStyle.Render(display), selectedCount, total) + if i == m.cursor { + line = cursorStyle.Render(line) + } + list.WriteString(line) + case entryPage: + check := checkboxEmptyStyle.Render("[ ]") + if m.selected[entry.Page.URL] { + check = checkboxFilledStyle.Render("[x]") + } + line := fmt.Sprintf("%s %s %s", cursor, check, pageStyle.Render(entry.DisplayName)) + if i == m.cursor { + line = cursorStyle.Render(line) + } + list.WriteString(line) + } + if i < len(m.entries)-1 { + list.WriteString("\n") + } + } + + m.listViewport.SetContent(list.String()) + m.ensureListCursorVisible() + + var b strings.Builder + b.WriteString(titleStyle.Render(fmt.Sprintf("Discovered %d pages", m.totalPages))) + b.WriteString("\n") + b.WriteString(instructionStyle.Render("↑/↓ move · Space toggle · Tab jump groups · Enter export · 'c' configure")) + b.WriteString("\n\n") + b.WriteString(panelStyle.Width(listWidth).Render(m.listViewport.View())) + b.WriteString("\n\n") + if m.errMsg != "" { + b.WriteString(statusStyle.Render(m.errMsg)) + b.WriteString("\n\n") + } + b.WriteString(accentStyle.Render(fmt.Sprintf("Selected: %d pages", len(m.selected)))) + return b.String() +} + +func (m *Model) viewFinalize() string { + content := fmt.Sprintf("%s %s", m.spinner.View(), instructionStyle.Render("Generating llms outputs...")) + return panelStyle.Render(content) +} + +func (m *Model) viewDone() string { + var b strings.Builder + b.WriteString(titleStyle.Render("✅ Scrape completed!")) + b.WriteString("\n") + if m.lastStats != nil { + b.WriteString(infoStyle.Render(fmt.Sprintf("Pages processed: %d (main: %d, secondary: %d)", m.lastStats.TotalPages, m.lastStats.MainDocPages, m.lastStats.SecondaryPages))) + b.WriteString("\n") + b.WriteString(infoStyle.Render(fmt.Sprintf("Errors: %d, Skipped: %d", m.lastStats.ErrorCount, m.lastStats.SkippedURLs))) + b.WriteString("\n") + b.WriteString(infoStyle.Render(fmt.Sprintf("Duration: %s", m.lastStats.Duration.Round(time.Second)))) + b.WriteString("\n") + } + b.WriteString(infoStyle.Render(fmt.Sprintf("Pages selected for llms.txt: %d", len(m.selected)))) + b.WriteString("\n\n") + b.WriteString(accentStyle.Render(fmt.Sprintf("Output written to %s", m.outputDir))) + b.WriteString("\n\n") + b.WriteString(hintStyle.Render("Press Enter to exit.")) + return panelStyle.Render(b.String()) +} + +func (m *Model) viewError() string { + content 
:= fmt.Sprintf("❌ %s\n\n%s", statusStyle.Render(m.errMsg), hintStyle.Render("Press Enter to exit.")) + return panelStyle.Render(content) +} + +func (m *Model) buildGroups() { + m.groups = make(map[string][]crawler.PageSummary) + for _, page := range m.pages { + group := groupForPath(page.Path) + m.groups[group] = append(m.groups[group], page) + } + m.groupOrder = m.groupOrder[:0] + for g := range m.groups { + m.groupOrder = append(m.groupOrder, g) + } + sort.Strings(m.groupOrder) +} + +func (m *Model) rebuildEntries() { + m.entries = m.entries[:0] + for _, group := range m.groupOrder { + display := group + if display == "" { + display = "(root)" + } + m.entries = append(m.entries, listEntry{ + Type: entryGroup, + Group: group, + DisplayName: display, + }) + if m.expanded[group] { + groupPages := m.groups[group] + sort.Slice(groupPages, func(i, j int) bool { + return groupPages[i].URL < groupPages[j].URL + }) + for _, page := range groupPages { + title := page.Title + if title == "" { + title = page.URL + } + m.entries = append(m.entries, listEntry{ + Type: entryPage, + Group: group, + Page: page, + DisplayName: fmt.Sprintf("%s — %s", title, page.URL), + }) + } + } + } + if len(m.entries) == 0 { + m.entries = append(m.entries, listEntry{ + Type: entryGroup, + Group: "", + DisplayName: "No pages discovered", + }) + } + if m.cursor >= len(m.entries) { + m.cursor = len(m.entries) - 1 + } + if m.cursor < 0 { + m.cursor = 0 + } +} + +func (m *Model) toggleExpand(expand bool) { + if len(m.entries) == 0 { + return + } + entry := m.entries[m.cursor] + if entry.Type != entryGroup { + if !expand { + m.jumpToGroup(entry.Group) + } + return + } + m.expanded[entry.Group] = expand + m.rebuildEntries() +} + +func (m *Model) jumpGroup(next bool) { + if len(m.entries) == 0 { + return + } + start := m.cursor + for { + if next { + m.cursor++ + if m.cursor >= len(m.entries) { + m.cursor = 0 + } + } else { + m.cursor-- + if m.cursor < 0 { + m.cursor = len(m.entries) - 1 + } + } + if m.entries[m.cursor].Type == entryGroup { + break + } + if m.cursor == start { + break + } + } +} + +func (m *Model) jumpToGroup(group string) { + for i, entry := range m.entries { + if entry.Type == entryGroup && entry.Group == group { + m.cursor = i + return + } + } +} + +func (m *Model) toggleSelection() { + if len(m.entries) == 0 { + return + } + entry := m.entries[m.cursor] + switch entry.Type { + case entryGroup: + groupPages := m.groups[entry.Group] + allSelected := true + for _, page := range groupPages { + if !m.selected[page.URL] { + allSelected = false + break + } + } + for _, page := range groupPages { + if allSelected { + delete(m.selected, page.URL) + } else { + m.selected[page.URL] = true + } + } + case entryPage: + if entry.Page.URL == "" { + return + } + if m.selected[entry.Page.URL] { + delete(m.selected, entry.Page.URL) + } else { + m.selected[entry.Page.URL] = true + } + } +} + +func (m *Model) countSelectedInGroup(group string) int { + count := 0 + for _, page := range m.groups[group] { + if m.selected[page.URL] { + count++ + } + } + return count +} + +func (m *Model) selectedPageInfos() []models.PageInfo { + selected := make([]models.PageInfo, 0, len(m.selected)) + for _, info := range m.scrapedSeq { + if m.selected[info.URL] { + selected = append(selected, info) + } + } + sort.Slice(selected, func(i, j int) bool { + return selected[i].URL < selected[j].URL + }) + return selected +} + +func (m *Model) beginScrape(pages []crawler.PageSummary, workers int, stage runStage) tea.Cmd { + if len(pages) == 0 { + 
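+		// Nothing was selected or discovered; keep the current screen and
+		// surface the error message instead of entering the run state.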
m.errMsg = "Nothing to scrape" + return nil + } + m.runStage = stage + m.state = stateRun + m.errMsg = "" + m.progressDone = 0 + m.progressTotal = len(pages) + m.logLines = nil + m.logViewport.SetContent("") + m.listViewport.SetYOffset(0) + m.progress = progress.New(progress.WithDefaultGradient()) + m.progressCh = make(chan crawler.ProgressUpdate) + m.logCh = make(chan crawler.LogUpdate, 32) + return tea.Batch( + startScrape(m.ctx, m.baseURL, m.outputDir, workers, m.verbose, pages, m.progressCh, m.logCh), + listenProgress(m.progressCh), + listenLogs(m.logCh), + ) +} + +func (m *Model) generateOutputs() tea.Cmd { + selected := m.selectedPageInfos() + if len(selected) == 0 { + m.errMsg = "Select at least one page" + return nil + } + if m.baseURL == nil { + m.errMsg = "Base URL missing" + return nil + } + m.errMsg = "" + m.state = stateFinalize + return tea.Batch( + m.spinner.Tick, + func() tea.Msg { + if err := utils.CreateOutputDirs(m.outputDir); err != nil { + return finalizeResultMsg{err: err} + } + gen := generator.New(m.baseURL, m.outputDir) + err := gen.Generate(selected) + return finalizeResultMsg{err: err} + }, + ) +} + +func (m *Model) ensureListCursorVisible() { + if len(m.entries) == 0 { + m.listViewport.SetYOffset(0) + return + } + line := m.cursor + start := m.listViewport.YOffset + end := start + m.listViewport.Height + if line < start { + m.listViewport.SetYOffset(line) + } else if line >= end { + m.listViewport.SetYOffset(line - m.listViewport.Height + 1) + } +} + +func (m *Model) recalculateLayout() { + listWidth := max(m.width-6, 20) + listHeight := max(m.height-8, 6) + m.listViewport.Width = listWidth + m.listViewport.Height = listHeight + + logWidth := max(m.width-6, 20) + logHeight := max(m.height/3, 6) + m.logViewport.Width = logWidth + m.logViewport.Height = logHeight +} + +func parseWorkers(raw string, fallback int) (int, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return fallback, nil + } + var workers int + if _, err := fmt.Sscanf(raw, "%d", &workers); err != nil || workers <= 0 { + return 0, fmt.Errorf("workers must be a positive number") + } + return workers, nil +} + +func groupForPath(path string) string { + path = strings.Trim(path, "/") + if path == "" { + return "" + } + parts := strings.Split(path, "/") + return parts[0] +} + +func startDiscovery(ctx context.Context, baseURL *url.URL, workers int) tea.Cmd { + return func() tea.Msg { + pages, stats, err := crawler.Discover(ctx, crawler.DiscoverOptions{ + BaseURL: baseURL, + Workers: workers, + }) + return discoveryResultMsg{ + pages: pages, + stats: stats, + err: err, + } + } +} + +func listenProgress(ch <-chan crawler.ProgressUpdate) tea.Cmd { + return func() tea.Msg { + update, ok := <-ch + if !ok { + return nil + } + return progressUpdateMsg{update: update} + } +} + +func listenLogs(ch <-chan crawler.LogUpdate) tea.Cmd { + return func() tea.Msg { + update, ok := <-ch + if !ok { + return nil + } + return logUpdateMsg{update: update} + } +} + +func (m *Model) setConfigFocus(index int) tea.Cmd { + if index < 0 { + index = 0 + } + if index > 1 { + index = 1 + } + inputs := []*textinput.Model{&m.workerInput, &m.outputInput} + cmds := make([]tea.Cmd, 0, len(inputs)) + for i, input := range inputs { + if i == index { + cmds = append(cmds, (*input).Focus()) + } else { + (*input).Blur() + } + } + m.configFocus = index + return tea.Batch(cmds...) 
+} + +func startScrape(ctx context.Context, baseURL *url.URL, outputDir string, workers int, verbose bool, pages []crawler.PageSummary, progressCh chan crawler.ProgressUpdate, logCh chan crawler.LogUpdate) tea.Cmd { + return func() tea.Msg { + resultPages, stats, err := crawler.Scrape(ctx, crawler.ScrapeOptions{ + BaseURL: baseURL, + Pages: pages, + Output: outputDir, + Workers: workers, + Verbose: verbose, + Logs: logCh, + Progress: progressCh, + }) + close(progressCh) + close(logCh) + + return scrapeResultMsg{ + stats: stats, + pages: resultPages, + err: err, + } + } +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} + +var ( + cursorStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("205")) + statusStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("9")).Bold(true) + titleStyle = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("213")) + instructionStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("244")) + hintStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("242")) + accentStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("212")).Bold(true) + infoStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("249")) + groupStyle = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("213")) + pageStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("249")) + checkboxEmptyStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("240")) + checkboxFilledStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("120")).Bold(true) + checkboxMixedStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("178")).Bold(true) + plusStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("244")).Bold(true) + panelStyle = lipgloss.NewStyle().Border(lipgloss.RoundedBorder()).BorderForeground(lipgloss.Color("240")).Padding(1, 2).MaxWidth(96) +) + +// Run launches the TUI program. +func Run(ctx context.Context, opts Options) error { + model := NewModel(ctx, opts) + prog := tea.NewProgram(model, tea.WithAltScreen()) + _, err := prog.Run() + return err +} diff --git a/internal/utils/namer.go b/internal/utils/namer.go new file mode 100644 index 0000000..5e5afd1 --- /dev/null +++ b/internal/utils/namer.go @@ -0,0 +1,39 @@ +package utils + +import ( + "path/filepath" + "strconv" + "strings" + "sync" +) + +// UniqueNamer ensures generated filenames remain unique by appending counters. +type UniqueNamer struct { + mu sync.Mutex + counts map[string]int +} + +// NewUniqueNamer returns an initialized UniqueNamer. +func NewUniqueNamer() *UniqueNamer { + return &UniqueNamer{ + counts: make(map[string]int), + } +} + +// Reserve records a filename and returns a unique variant if needed. +func (n *UniqueNamer) Reserve(filename string) string { + n.mu.Lock() + defer n.mu.Unlock() + + base := strings.TrimSuffix(filename, filepath.Ext(filename)) + ext := filepath.Ext(filename) + count := n.counts[base] + + if count == 0 { + n.counts[base] = 1 + return filename + } + + n.counts[base] = count + 1 + return base + "-" + strconv.Itoa(count) + ext +}
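+
+// Example (illustrative): repeated reservations of the same base name gain a
+// numeric suffix, while the first reservation keeps its original form.
+//
+//	n := NewUniqueNamer()
+//	n.Reserve("index.md") // "index.md"
+//	n.Reserve("index.md") // "index-1.md"
+//	n.Reserve("index.md") // "index-2.md"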