feat: add TUI

This commit is contained in:
Sirin Puenggun 2025-10-18 09:46:00 +07:00
parent b09a897b19
commit 44c783a284
12 changed files with 2084 additions and 6 deletions

9
.gitignore vendored
View File

@ -9,8 +9,11 @@
*.dylib *.dylib
# Project binaries # Project binaries
crawler /crawler
site-to-llmstxt /site-to-llmstxt
.gocache/
.gomodcache/
main
# Output directories # Output directories
output/ output/
@ -58,3 +61,5 @@ go.work.sum
# Editor/IDE # Editor/IDE
# .idea/ # .idea/
# .vscode/ # .vscode/
bin/

View File

@ -1,6 +1,6 @@
# Makefile for site-to-llmstxt crawler # Makefile for site-to-llmstxt crawler
.PHONY: build test clean run help fmt lint deps dev-setup .PHONY: build test clean run tui demo help fmt lint deps dev-setup all
# Variables # Variables
BINARY_NAME=site-to-llmstxt BINARY_NAME=site-to-llmstxt
@ -15,6 +15,7 @@ help:
@echo " test-coverage - Run tests with coverage" @echo " test-coverage - Run tests with coverage"
@echo " clean - Clean build artifacts" @echo " clean - Clean build artifacts"
@echo " run - Run with example URL (requires URL variable)" @echo " run - Run with example URL (requires URL variable)"
@echo " tui - Launch interactive terminal UI"
@echo " fmt - Format code" @echo " fmt - Format code"
@echo " lint - Lint code" @echo " lint - Lint code"
@echo " deps - Install/update dependencies" @echo " deps - Install/update dependencies"
@ -24,6 +25,7 @@ help:
@echo " make build" @echo " make build"
@echo " make run URL=https://example.com" @echo " make run URL=https://example.com"
@echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output" @echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output"
@echo " make tui OUTPUT=./docs WORKERS=4"
# Build the crawler # Build the crawler
build: build:
@ -98,3 +100,7 @@ all: clean deps fmt build test
demo: build demo: build
@echo "Running demo crawl of httpbin.org..." @echo "Running demo crawl of httpbin.org..."
$(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose $(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose
# Launch interactive TUI
tui:
@echo "Launching TUI..."
go run ./cmd/site-to-llmstxt tui $(if $(OUTPUT),--output $(OUTPUT)) $(if $(WORKERS),--workers $(WORKERS))

360
cmd/site-to-llmstxt/main.go Normal file
View File

@ -0,0 +1,360 @@
package main
import (
"context"
"errors"
"fmt"
"log"
"net/url"
"os"
"os/signal"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/urfave/cli/v2"
"github.com/Sosokker/site-to-llmstxt/internal/config"
"github.com/Sosokker/site-to-llmstxt/internal/filters"
"github.com/Sosokker/site-to-llmstxt/internal/generator"
"github.com/Sosokker/site-to-llmstxt/internal/models"
"github.com/Sosokker/site-to-llmstxt/internal/progress"
"github.com/Sosokker/site-to-llmstxt/internal/tui"
"github.com/Sosokker/site-to-llmstxt/internal/utils"
)
// main wires up the urfave/cli application: the default action runs a one-shot
// crawl of a single site, and the "tui" subcommand launches the interactive
// terminal UI. Any error surfaces through log.Fatal (non-zero exit).
func main() {
	// Plain messages only — no timestamps/prefixes in CLI output.
	log.SetFlags(0)
	app := &cli.App{
		Name:  "site-to-llmstxt",
		Usage: "Crawl a documentation site and generate llms.txt outputs",
		Commands: []*cli.Command{
			{
				Name:  "tui",
				Usage: "Launch interactive terminal UI",
				Flags: []cli.Flag{
					&cli.StringFlag{
						Name:  "output",
						Usage: "Output directory",
						Value: config.DefaultOutputDir,
					},
					&cli.IntFlag{
						Name:  "workers",
						Usage: "Default worker count used during discovery",
						Value: config.DefaultWorkers,
					},
				},
				Action: func(cliCtx *cli.Context) error {
					// Cancel the TUI when the user hits Ctrl-C (SIGINT).
					ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt)
					defer cancel()
					opts := tui.Options{
						OutputDir:      cliCtx.String("output"),
						DefaultWorkers: cliCtx.Int("workers"),
					}
					return tui.Run(ctx, opts)
				},
			},
		},
		// Flags for the default (non-TUI) crawl action; each can also be set
		// via its SITE_TO_LLMSTXT_* environment variable.
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:    "url",
				Aliases: []string{"u"},
				Usage:   "Start `URL` to crawl (required)",
				EnvVars: []string{"SITE_TO_LLMSTXT_URL"},
			},
			&cli.StringFlag{
				Name:    "output",
				Aliases: []string{"o"},
				Usage:   "Output directory",
				EnvVars: []string{"SITE_TO_LLMSTXT_OUTPUT"},
				Value:   config.DefaultOutputDir,
			},
			&cli.IntFlag{
				Name:    "workers",
				Aliases: []string{"w"},
				Usage:   "Number of concurrent workers",
				EnvVars: []string{"SITE_TO_LLMSTXT_WORKERS"},
				Value:   config.DefaultWorkers,
			},
			&cli.BoolFlag{
				Name:    "verbose",
				Usage:   "Enable verbose progress logging",
				EnvVars: []string{"SITE_TO_LLMSTXT_VERBOSE"},
			},
		},
		Action: func(cliCtx *cli.Context) error {
			// Same SIGINT-driven cancellation as the TUI action.
			ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt)
			defer cancel()
			cfg := &config.Config{
				URL:       cliCtx.String("url"),
				OutputDir: cliCtx.String("output"),
				Workers:   cliCtx.Int("workers"),
				Verbose:   cliCtx.Bool("verbose"),
			}
			// Re-apply defaults defensively in case flags were set to
			// empty/non-positive values (e.g. via env vars).
			if cfg.Workers <= 0 {
				cfg.Workers = config.DefaultWorkers
			}
			if cfg.OutputDir == "" {
				cfg.OutputDir = config.DefaultOutputDir
			}
			if err := cfg.Validate(); err != nil {
				return err
			}
			baseURL, err := url.Parse(cfg.URL)
			if err != nil {
				return fmt.Errorf("parse URL: %w", err)
			}
			return crawlAndGenerate(ctx, baseURL, cfg)
		},
	}
	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
// crawlAndGenerate performs the full single-shot pipeline: crawl the site
// rooted at baseURL with an async colly collector, convert each HTML page to
// Markdown on disk, then feed the collected pages to the generator.
//
// Concurrency notes: colly invokes the callbacks from multiple goroutines, so
// stats is guarded by statsMu, the pages slice by pagesMu, and the
// queued/processed counters use atomics. The queued counter is a best-effort
// estimate of pending requests used only for progress display.
func crawlAndGenerate(ctx context.Context, baseURL *url.URL, cfg *config.Config) error {
	if err := utils.CreateOutputDirs(cfg.OutputDir); err != nil {
		return err
	}
	stats := &models.Stats{StartTime: time.Now()}
	basePath := ""
	if baseURL != nil {
		basePath = baseURL.Path
	}
	statsMu := &sync.Mutex{}
	progressManager := progress.New(cfg.Verbose, stats)
	// Finalize stats and tear down the progress display on every exit path.
	defer func() {
		statsMu.Lock()
		stats.Finish()
		statsMu.Unlock()
		progressManager.Finish()
	}()
	pages := make([]models.PageInfo, 0, 128)
	pagesMu := &sync.Mutex{}
	namer := utils.NewUniqueNamer()
	var queued, processed int64
	collector := colly.NewCollector(
		colly.AllowedDomains(allowedDomains(baseURL.Host)...),
		colly.Async(true),
	)
	collector.SetRequestTimeout(30 * time.Second)
	if cfg.Workers > 0 {
		// Bound parallelism and jitter requests to be polite to the target.
		if err := collector.Limit(&colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: cfg.Workers,
			RandomDelay: 500 * time.Millisecond,
		}); err != nil {
			return fmt.Errorf("configure collector: %w", err)
		}
	}
	collector.OnRequest(func(r *colly.Request) {
		// Abort in-flight requests once the context is cancelled (Ctrl-C).
		select {
		case <-ctx.Done():
			r.Abort()
			return
		default:
		}
		if cfg.Verbose {
			progressManager.Log("Visiting %s", r.URL.String())
		}
	})
	collector.OnError(func(r *colly.Response, err error) {
		statsMu.Lock()
		stats.AddError()
		statsMu.Unlock()
		progressManager.Log("Error fetching %s: %v", r.Request.URL, err)
	})
	collector.OnHTML("html", func(e *colly.HTMLElement) {
		select {
		case <-ctx.Done():
			return
		default:
		}
		pageURL := e.Request.URL.String()
		// This page is no longer pending; it is now being processed.
		atomic.AddInt64(&queued, -1)
		currentProcessed := atomic.AddInt64(&processed, 1)
		defer func() {
			// Clamp queued at zero: decrements can race ahead of increments.
			progressManager.Update(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0)))
		}()
		title := strings.TrimSpace(e.DOM.Find("title").First().Text())
		if title == "" {
			title = "Untitled"
		}
		description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", ""))
		markdown, err := htmltomarkdown.ConvertString(string(e.Response.Body))
		if err != nil {
			statsMu.Lock()
			stats.AddError()
			statsMu.Unlock()
			progressManager.Log("Failed to convert %s: %v", pageURL, err)
			return
		}
		markdown = strings.TrimSpace(markdown)
		// Fall back to the first sentence of the body when the page has no
		// meta description.
		if description == "" {
			description = utils.ExtractFirstSentence(markdown)
		}
		// Reserve a unique filename so concurrent pages never collide.
		filename := utils.CreateFilename(title, pageURL)
		filename = namer.Reserve(filename)
		relativePath := filepath.Join(config.MarkdownSubdir, filename)
		fullPath := filepath.Join(cfg.OutputDir, relativePath)
		if err := os.WriteFile(fullPath, []byte(markdown), 0644); err != nil {
			statsMu.Lock()
			stats.AddError()
			statsMu.Unlock()
			progressManager.Log("Failed to write %s: %v", fullPath, err)
			return
		}
		pageInfo := models.PageInfo{
			URL:         pageURL,
			Title:       title,
			Content:     markdown,
			FilePath:    relativePath,
			CrawledAt:   time.Now(),
			Description: description,
		}
		pagesMu.Lock()
		pages = append(pages, pageInfo)
		pagesMu.Unlock()
		statsMu.Lock()
		stats.TotalPages++
		if filters.IsMainDocPage(pageURL) {
			stats.MainDocPages++
		} else {
			stats.SecondaryPages++
		}
		statsMu.Unlock()
		// Enqueue same-site links discovered on this page.
		e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
			href, exists := sel.Attr("href")
			if !exists || href == "" {
				return
			}
			absolute := e.Request.AbsoluteURL(href)
			if absolute == "" {
				return
			}
			// Drop non-http(s) schemes (mailto:, javascript:, ...).
			if !strings.HasPrefix(absolute, "http") {
				return
			}
			if filters.ShouldSkipURL(absolute, baseURL.Host, basePath) {
				statsMu.Lock()
				stats.AddSkipped()
				statsMu.Unlock()
				return
			}
			select {
			case <-ctx.Done():
				return
			default:
			}
			if err := collector.Visit(absolute); err != nil {
				// Revisits are expected and not an error.
				var alreadyVisited *colly.AlreadyVisitedError
				if errors.As(err, &alreadyVisited) {
					return
				}
				statsMu.Lock()
				stats.AddError()
				statsMu.Unlock()
				progressManager.Log("Failed to queue %s: %v", absolute, err)
				return
			}
			// Only count the URL as queued after Visit accepted it.
			atomic.AddInt64(&queued, 1)
		})
	})
	atomic.AddInt64(&queued, 1)
	if err := collector.Visit(baseURL.String()); err != nil {
		var alreadyVisited *colly.AlreadyVisitedError
		if !errors.As(err, &alreadyVisited) {
			return fmt.Errorf("start crawl: %w", err)
		}
	}
	collector.Wait()
	// Cancellation (Ctrl-C) is not treated as a failure: whatever was crawled
	// so far still gets generated.
	if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
		return err
	}
	if len(pages) == 0 {
		return errors.New("no pages were crawled; check URL or filters")
	}
	gen := generator.New(baseURL, cfg.OutputDir)
	if err := gen.Generate(pages); err != nil {
		return fmt.Errorf("generate outputs: %w", err)
	}
	return nil
}
// allowedDomains returns the host plus its "www."-prefixed (or bare) variant,
// so the collector accepts both forms of the same site. Returns nil for an
// empty host.
//
// The result is deterministic: bare domain first, then the www. variant.
// (The previous implementation collected the pair in a map and emitted them
// in random iteration order.)
func allowedDomains(host string) []string {
	host = strings.TrimSpace(host)
	if host == "" {
		return nil
	}
	// Normalize to the bare domain; TrimPrefix is a no-op when there is no
	// "www." prefix, so both input forms converge on the same pair.
	bare := strings.TrimPrefix(host, "www.")
	return []string{bare, "www." + bare}
}
// max64 returns the larger of a and b.
//
// Kept as a named helper for existing call sites; it now delegates to the
// built-in max (available since Go 1.21 — go.mod declares go 1.24.5) instead
// of a hand-rolled comparison.
func max64(a, b int64) int64 {
	return max(a, b)
}

18
go.mod
View File

@ -4,6 +4,10 @@ go 1.24.5
require ( require (
github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3 github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3
github.com/PuerkitoBio/goquery v1.10.3
github.com/charmbracelet/bubbles v0.17.1
github.com/charmbracelet/bubbletea v0.25.0
github.com/charmbracelet/lipgloss v0.9.1
github.com/gocolly/colly/v2 v2.2.0 github.com/gocolly/colly/v2 v2.2.0
github.com/schollz/progressbar/v3 v3.18.0 github.com/schollz/progressbar/v3 v3.18.0
github.com/urfave/cli/v2 v2.27.7 github.com/urfave/cli/v2 v2.27.7
@ -11,18 +15,29 @@ require (
require ( require (
github.com/JohannesKaufmann/dom v0.2.0 // indirect github.com/JohannesKaufmann/dom v0.2.0 // indirect
github.com/PuerkitoBio/goquery v1.10.3 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/antchfx/htmlquery v1.3.4 // indirect github.com/antchfx/htmlquery v1.3.4 // indirect
github.com/antchfx/xmlquery v1.4.4 // indirect github.com/antchfx/xmlquery v1.4.4 // indirect
github.com/antchfx/xpath v1.3.4 // indirect github.com/antchfx/xpath v1.3.4 // indirect
github.com/atotto/clipboard v0.1.4 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/bits-and-blooms/bitset v1.22.0 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect
github.com/charmbracelet/harmonica v0.2.0 // indirect
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect github.com/golang/protobuf v1.5.4 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-localereader v0.0.1 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect
github.com/muesli/cancelreader v0.2.2 // indirect
github.com/muesli/reflow v0.3.0 // indirect
github.com/muesli/termenv v0.16.0 // indirect
github.com/nlnwa/whatwg-url v0.6.2 // indirect github.com/nlnwa/whatwg-url v0.6.2 // indirect
github.com/rivo/uniseg v0.4.7 // indirect github.com/rivo/uniseg v0.4.7 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect
@ -30,6 +45,7 @@ require (
github.com/temoto/robotstxt v1.1.2 // indirect github.com/temoto/robotstxt v1.1.2 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/net v0.42.0 // indirect golang.org/x/net v0.42.0 // indirect
golang.org/x/sync v0.16.0 // indirect
golang.org/x/sys v0.34.0 // indirect golang.org/x/sys v0.34.0 // indirect
golang.org/x/term v0.33.0 // indirect golang.org/x/term v0.33.0 // indirect
golang.org/x/text v0.27.0 // indirect golang.org/x/text v0.27.0 // indirect

35
go.sum
View File

@ -13,11 +13,25 @@ github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fus
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4= github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4=
github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/charmbracelet/bubbles v0.17.1 h1:0SIyjOnkrsfDo88YvPgAWvZMwXe26TP6drRvmkjyUu4=
github.com/charmbracelet/bubbles v0.17.1/go.mod h1:9HxZWlkCqz2PRwsCbYl7a3KXvGzFaDHpYbSYMJ+nE3o=
github.com/charmbracelet/bubbletea v0.25.0 h1:bAfwk7jRz7FKFl9RzlIULPkStffg5k6pNt5dywy4TcM=
github.com/charmbracelet/bubbletea v0.25.0/go.mod h1:EN3QDR1T5ZdWmdfDzYcqOCAps45+QIJbLOBxmVNWNNg=
github.com/charmbracelet/harmonica v0.2.0 h1:8NxJWRWg/bzKqqEaaeFNipOu77YR5t8aSwG4pgaUBiQ=
github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao=
github.com/charmbracelet/lipgloss v0.9.1 h1:PNyd3jvaJbg4jRHKWXnCj1akQm4rh8dbEzN1p/u1KWg=
github.com/charmbracelet/lipgloss v0.9.1/go.mod h1:1mPmG4cxScwUQALAAnacHaigiiHB9Pmr+v1VEawJl6I=
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 h1:q2hJAaP1k2wIvVRd/hEHD7lacgqrCPS+k8g1MndzfWY=
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -39,14 +53,31 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b h1:1XF24mVaiu7u+CFywTdcDo2ie1pzzhwjt6RHqzpMU34=
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b/go.mod h1:fQuZ0gauxyBcmsdE3ZT4NasjaRdxmbCS0jRHsrWu3Ho=
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q= github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
@ -103,12 +134,16 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw=
golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=

View File

@ -0,0 +1,217 @@
package crawler
import (
"context"
"errors"
"fmt"
"net/url"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/Sosokker/site-to-llmstxt/internal/filters"
"github.com/Sosokker/site-to-llmstxt/internal/models"
)
// PageSummary captures metadata for a crawled URL prior to full scraping.
type PageSummary struct {
	URL         string // absolute URL of the discovered page
	Title       string // contents of the page's <title> element ("Untitled" when absent)
	Description string // meta description, or a first-paragraph guess when missing
	Path        string // URL path component of the page
	Depth       int    // link depth from the start URL, as reported by the collector
}
// DiscoverOptions configure the URL discovery stage.
type DiscoverOptions struct {
	BaseURL    *url.URL                     // start URL; required — Discover errors when nil
	Workers    int                          // collector parallelism; values <= 0 leave the collector unlimited
	OnLog      func(string, ...interface{}) // optional printf-style log sink; may be nil
	OnProgress func(processed, queued int)  // optional progress callback (pages processed, pending queue estimate); may be nil
}
// Discover traverses links starting from the base URL and returns unique pages.
func Discover(ctx context.Context, opts DiscoverOptions) ([]PageSummary, *models.Stats, error) {
if opts.BaseURL == nil {
return nil, nil, errors.New("base URL is required")
}
stats := &models.Stats{StartTime: time.Now()}
basePath := ""
if opts.BaseURL != nil {
basePath = opts.BaseURL.Path
}
statsMu := &sync.Mutex{}
var (
mu sync.Mutex
pages = make([]PageSummary, 0, 128)
seen = make(map[string]struct{})
queued int64
processed int64
)
collector := colly.NewCollector(
colly.AllowedDomains(allowedDomains(opts.BaseURL.Host)...),
colly.Async(true),
)
collector.SetRequestTimeout(30 * time.Second)
if opts.Workers > 0 {
if err := collector.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: opts.Workers,
RandomDelay: 500 * time.Millisecond,
}); err != nil {
return nil, nil, fmt.Errorf("configure collector: %w", err)
}
}
collector.OnRequest(func(r *colly.Request) {
select {
case <-ctx.Done():
r.Abort()
return
default:
}
if opts.OnLog != nil {
opts.OnLog("discover: visiting %s", r.URL.String())
}
})
collector.OnError(func(r *colly.Response, err error) {
statsMu.Lock()
stats.AddError()
statsMu.Unlock()
if opts.OnLog != nil {
opts.OnLog("discover: error fetching %s: %v", r.Request.URL, err)
}
})
collector.OnHTML("html", func(e *colly.HTMLElement) {
select {
case <-ctx.Done():
return
default:
}
pageURL := e.Request.URL.String()
atomic.AddInt64(&queued, -1)
currentProcessed := atomic.AddInt64(&processed, 1)
defer func() {
if opts.OnProgress != nil {
opts.OnProgress(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0)))
}
}()
mu.Lock()
if _, ok := seen[pageURL]; ok {
mu.Unlock()
} else {
seen[pageURL] = struct{}{}
mu.Unlock()
statsMu.Lock()
stats.TotalPages++
if filters.IsMainDocPage(pageURL) {
stats.MainDocPages++
} else {
stats.SecondaryPages++
}
statsMu.Unlock()
title := strings.TrimSpace(e.DOM.Find("title").First().Text())
if title == "" {
title = "Untitled"
}
description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", ""))
if description == "" {
description = guessDescription(e.DOM)
}
summary := PageSummary{
URL: pageURL,
Title: title,
Description: description,
Path: e.Request.URL.Path,
Depth: e.Request.Depth,
}
mu.Lock()
pages = append(pages, summary)
mu.Unlock()
}
e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
href, exists := sel.Attr("href")
if !exists || href == "" {
return
}
absolute := e.Request.AbsoluteURL(href)
if absolute == "" || !strings.HasPrefix(absolute, "http") {
return
}
if filters.ShouldSkipURL(absolute, opts.BaseURL.Host, basePath) {
statsMu.Lock()
stats.AddSkipped()
statsMu.Unlock()
return
}
if err := collector.Visit(absolute); err != nil {
var alreadyVisited *colly.AlreadyVisitedError
if errors.As(err, &alreadyVisited) {
return
}
statsMu.Lock()
stats.AddError()
statsMu.Unlock()
if opts.OnLog != nil {
opts.OnLog("discover: failed to queue %s: %v", absolute, err)
}
return
}
atomic.AddInt64(&queued, 1)
})
})
atomic.AddInt64(&queued, 1)
if err := collector.Visit(opts.BaseURL.String()); err != nil {
var alreadyVisited *colly.AlreadyVisitedError
if !errors.As(err, &alreadyVisited) {
return nil, nil, fmt.Errorf("start discovery: %w", err)
}
}
collector.Wait()
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
return nil, nil, err
}
stats.Finish()
return pages, stats, nil
}
// guessDescription returns the first non-empty <p> text under sel, trimmed
// and capped at 240 characters (with a "..." suffix when truncated). Returns
// "" when no paragraph has text.
//
// Truncation operates on runes, not bytes: the previous byte-slice
// (paragraph[:240]) could cut a multi-byte UTF-8 character in half and emit
// an invalid string. For pure-ASCII text the output is unchanged.
func guessDescription(sel *goquery.Selection) string {
	paragraphs := sel.Find("p")
	for i := range paragraphs.Nodes {
		text := strings.TrimSpace(paragraphs.Eq(i).Text())
		if text == "" {
			continue
		}
		runes := []rune(text)
		if len(runes) > 240 {
			return string(runes[:240]) + "..."
		}
		return text
	}
	return ""
}

245
internal/crawler/scrape.go Normal file
View File

@ -0,0 +1,245 @@
package crawler
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/PuerkitoBio/goquery"
"github.com/Sosokker/site-to-llmstxt/internal/config"
"github.com/Sosokker/site-to-llmstxt/internal/filters"
"github.com/Sosokker/site-to-llmstxt/internal/models"
"github.com/Sosokker/site-to-llmstxt/internal/utils"
)
// ProgressUpdate conveys scraping progress for interactive UIs.
type ProgressUpdate struct {
	Completed int    // pages finished so far, counting both successes and failures
	Total     int    // total number of pages selected for scraping
	URL       string // URL that just completed
}

// LogUpdate captures a log line emitted during scraping.
type LogUpdate struct {
	Message string // pre-formatted, human-readable log message
}

// ScrapeOptions configure the download stage for selected pages.
type ScrapeOptions struct {
	BaseURL  *url.URL              // site root; required — Scrape errors when nil
	Pages    []PageSummary         // pages chosen during discovery; must be non-empty
	Output   string                // output directory root for Markdown files
	Workers  int                   // worker goroutines; <= 0 falls back to config.DefaultWorkers
	Verbose  bool                  // when false, only error messages are forwarded to Logs
	Logs     chan<- LogUpdate      // optional; sends are non-blocking, so messages may be dropped
	Progress chan<- ProgressUpdate // optional; sends are non-blocking, so updates may be dropped
}
// Scrape fetches each provided page, writes Markdown output, and returns results.
func Scrape(ctx context.Context, opts ScrapeOptions) ([]models.PageInfo, *models.Stats, error) {
if opts.BaseURL == nil {
return nil, nil, errors.New("base URL is required")
}
if len(opts.Pages) == 0 {
return nil, nil, errors.New("no pages selected for scraping")
}
if opts.Workers <= 0 {
opts.Workers = config.DefaultWorkers
}
if err := utils.CreateOutputDirs(opts.Output); err != nil {
return nil, nil, err
}
client := &http.Client{
Timeout: 30 * time.Second,
}
stats := &models.Stats{
StartTime: time.Now(),
TotalPages: len(opts.Pages),
MainDocPages: 0,
SecondaryPages: 0,
SkippedURLs: 0,
ErrorCount: 0,
}
statsMu := &sync.Mutex{}
for _, page := range opts.Pages {
if filters.IsMainDocPage(page.URL) {
stats.MainDocPages++
} else {
stats.SecondaryPages++
}
}
namer := utils.NewUniqueNamer()
results := make([]models.PageInfo, 0, len(opts.Pages))
resultsMu := &sync.Mutex{}
var completed int32
progressTotal := len(opts.Pages)
pageCh := make(chan PageSummary)
wg := sync.WaitGroup{}
errOnce := sync.Once{}
var firstErr error
sendLog := func(always bool, format string, args ...interface{}) {
if !always && !opts.Verbose {
return
}
if opts.Logs == nil {
return
}
msg := fmt.Sprintf(format, args...)
select {
case opts.Logs <- LogUpdate{Message: msg}:
default:
}
}
sendProgress := func(url string) {
if opts.Progress == nil {
return
}
done := int(atomic.AddInt32(&completed, 1))
select {
case opts.Progress <- ProgressUpdate{
Completed: done,
Total: progressTotal,
URL: url,
}:
default:
}
}
for i := 0; i < opts.Workers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for page := range pageCh {
select {
case <-ctx.Done():
return
default:
}
info, err := fetchPage(ctx, client, page, opts.Output, namer)
if err != nil {
errOnce.Do(func() { firstErr = err })
statsMu.Lock()
stats.AddError()
statsMu.Unlock()
sendLog(true, "error scraping %s: %v", page.URL, err)
sendProgress(page.URL)
continue
}
resultsMu.Lock()
results = append(results, info)
resultsMu.Unlock()
sendLog(false, "scraped %s", page.URL)
sendProgress(page.URL)
}
}()
}
go func() {
for _, page := range opts.Pages {
pageCh <- page
}
close(pageCh)
}()
wg.Wait()
stats.Finish()
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
return results, stats, err
}
return results, stats, firstErr
}
// fetchPage downloads a single page, converts its HTML to Markdown, writes
// the Markdown under outputDir/config.MarkdownSubdir, and returns the
// collected page metadata.
//
// Title falls back to page.Title when the document has no <title>;
// description falls back to page.Description, then to the first sentence of
// the Markdown body. Filenames are deduplicated through namer.
// All errors are wrapped with the failing stage and URL for diagnosis.
func fetchPage(ctx context.Context, client *http.Client, page PageSummary, outputDir string, namer *utils.UniqueNamer) (models.PageInfo, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, page.URL, nil)
	if err != nil {
		return models.PageInfo{}, fmt.Errorf("building request for %s: %w", page.URL, err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return models.PageInfo{}, fmt.Errorf("fetching %s: %w", page.URL, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Drain the body so the transport can reuse the connection; the
		// copy error is irrelevant because we are failing anyway.
		_, _ = io.Copy(io.Discard, resp.Body)
		return models.PageInfo{}, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return models.PageInfo{}, fmt.Errorf("reading body of %s: %w", page.URL, err)
	}
	htmlContent := string(bodyBytes)
	markdown, err := htmltomarkdown.ConvertString(htmlContent)
	if err != nil {
		return models.PageInfo{}, fmt.Errorf("converting %s to markdown: %w", page.URL, err)
	}
	markdown = strings.TrimSpace(markdown)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return models.PageInfo{}, fmt.Errorf("parsing %s: %w", page.URL, err)
	}
	title := strings.TrimSpace(doc.Find("title").First().Text())
	if title == "" {
		title = page.Title
	}
	description := strings.TrimSpace(doc.Find(`meta[name="description"]`).AttrOr("content", ""))
	if description == "" {
		description = page.Description
	}
	if description == "" {
		description = utils.ExtractFirstSentence(markdown)
	}
	// Reserve a collision-free filename before touching the filesystem.
	filename := namer.Reserve(utils.CreateFilename(title, page.URL))
	relativePath := filepath.Join(config.MarkdownSubdir, filename)
	fullPath := filepath.Join(outputDir, relativePath)
	if err := os.WriteFile(fullPath, []byte(markdown), 0o644); err != nil {
		return models.PageInfo{}, fmt.Errorf("writing %s: %w", fullPath, err)
	}
	return models.PageInfo{
		URL:         page.URL,
		Title:       title,
		Content:     markdown,
		FilePath:    relativePath,
		CrawledAt:   time.Now(),
		Description: description,
	}, nil
}

33
internal/crawler/util.go Normal file
View File

@ -0,0 +1,33 @@
package crawler
import "strings"
// allowedDomains returns the hostnames a crawl of host may visit: the host
// itself plus its "www."-prefixed (or "www."-stripped) twin, so that
// "example.com" and "www.example.com" are treated as one site.
//
// A blank or whitespace-only host yields nil. The trimmed input host is
// always first in the result; the previous map-based implementation made
// the order nondeterministic across runs.
func allowedDomains(host string) []string {
	host = strings.TrimSpace(host)
	if host == "" {
		return nil
	}
	var variant string
	if strings.HasPrefix(host, "www.") {
		variant = strings.TrimPrefix(host, "www.")
	} else {
		variant = "www." + host
	}
	return []string{host, variant}
}
// max64 reports the larger of a and b.
func max64(a, b int64) int64 {
	if b > a {
		return b
	}
	return a
}

View File

@ -31,7 +31,7 @@ var SecondaryPageIndicators = []string{
} }
// ShouldSkipURL determines if a URL should be skipped based on various filters. // ShouldSkipURL determines if a URL should be skipped based on various filters.
func ShouldSkipURL(rawURL, baseHost string) bool { func ShouldSkipURL(rawURL, baseHost, basePath string) bool {
if rawURL == "" { if rawURL == "" {
return true return true
} }
@ -53,10 +53,26 @@ func ShouldSkipURL(rawURL, baseHost string) bool {
} }
lowerURL := strings.ToLower(rawURL) lowerURL := strings.ToLower(rawURL)
basePathLower := strings.ToLower(basePath)
if basePathLower != "" && !strings.HasPrefix(basePathLower, "/") {
basePathLower = "/" + basePathLower
}
basePathLower = strings.TrimRight(basePathLower, "/")
candidatePath := strings.ToLower(u.Path)
baseClean := strings.TrimSuffix(basePathLower, "/")
if baseClean != "" && baseClean != "/" {
if candidatePath != baseClean && !strings.HasPrefix(candidatePath, baseClean+"/") {
return true
}
}
// Skip language variants // Skip language variants
for _, lang := range LanguageIndicators { for _, lang := range LanguageIndicators {
if strings.Contains(lowerURL, lang) { if strings.Contains(lowerURL, lang) {
if basePathLower != "" && strings.Contains(basePathLower, lang) {
continue
}
return true return true
} }
} }

View File

@ -7,55 +7,77 @@ func TestShouldSkipURL(t *testing.T) {
name string name string
url string url string
baseHost string baseHost string
basePath string
want bool want bool
}{ }{
{ {
name: "Normal URL", name: "Normal URL",
url: "https://example.com/docs", url: "https://example.com/docs",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: false, want: false,
}, },
{ {
name: "Language URL - en", name: "Language URL - en",
url: "https://example.com/en/docs", url: "https://example.com/en/docs",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{ {
name: "Language URL - zh", name: "Language URL - zh",
url: "https://example.com/zh/docs", url: "https://example.com/zh/docs",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{ {
name: "PDF file", name: "PDF file",
url: "https://example.com/doc.pdf", url: "https://example.com/doc.pdf",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{ {
name: "ZIP file", name: "ZIP file",
url: "https://example.com/download.zip", url: "https://example.com/download.zip",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{ {
name: "Fragment URL", name: "Fragment URL",
url: "https://example.com/docs#section", url: "https://example.com/docs#section",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{ {
name: "External domain", name: "External domain",
url: "https://other.com/docs", url: "https://other.com/docs",
baseHost: "example.com", baseHost: "example.com",
basePath: "",
want: true, want: true,
}, },
{
name: "Language URL outside base path",
url: "https://example.com/en/reference",
baseHost: "example.com",
basePath: "/en/docs/",
want: true,
},
{
name: "Sub path inside base",
url: "https://example.com/en/docs/liff/guide",
baseHost: "example.com",
basePath: "/en/docs/liff/",
want: false,
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
if got := ShouldSkipURL(tt.url, tt.baseHost); got != tt.want { if got := ShouldSkipURL(tt.url, tt.baseHost, tt.basePath); got != tt.want {
t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want) t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want)
} }
}) })

1084
internal/tui/model.go Normal file

File diff suppressed because it is too large Load Diff

39
internal/utils/namer.go Normal file
View File

@ -0,0 +1,39 @@
package utils
import (
"path/filepath"
"strconv"
"strings"
"sync"
)
// UniqueNamer ensures generated filenames remain unique by appending
// numeric suffixes. It is safe for concurrent use.
type UniqueNamer struct {
	mu sync.Mutex
	// counts records every filename handed out (value > 0 means taken);
	// for a name reserved more than once, the value is also the next
	// suffix to try for that name.
	counts map[string]int
}

// NewUniqueNamer returns an initialized UniqueNamer.
func NewUniqueNamer() *UniqueNamer {
	return &UniqueNamer{
		counts: make(map[string]int),
	}
}

// Reserve records a filename and returns a unique variant if needed.
//
// The first caller of a given name gets it verbatim; subsequent callers
// get "base-1.ext", "base-2.ext", and so on. Every returned name is
// itself recorded, so a later explicit reservation of e.g. "foo-1.md"
// can no longer collide with an earlier generated "foo-1.md" (the old
// per-base counter never recorded generated names, letting two pages
// share one file).
func (n *UniqueNamer) Reserve(filename string) string {
	n.mu.Lock()
	defer n.mu.Unlock()
	ext := filepath.Ext(filename)
	base := strings.TrimSuffix(filename, ext)
	if n.counts[filename] == 0 {
		n.counts[filename] = 1
		return filename
	}
	// Probe base-1.ext, base-2.ext, ... starting from the last suffix
	// used for this name, skipping any candidate already reserved.
	for i := n.counts[filename]; ; i++ {
		candidate := base + "-" + strconv.Itoa(i) + ext
		if n.counts[candidate] == 0 {
			n.counts[filename] = i + 1
			n.counts[candidate] = 1
			return candidate
		}
	}
}