mirror of
https://github.com/Sosokker/site-to-llmstxt.git
synced 2025-12-18 13:34:06 +01:00
feat: add TUI
This commit is contained in:
parent
b09a897b19
commit
44c783a284
9
.gitignore
vendored
9
.gitignore
vendored
@ -9,8 +9,11 @@
|
||||
*.dylib
|
||||
|
||||
# Project binaries
|
||||
crawler
|
||||
site-to-llmstxt
|
||||
/crawler
|
||||
/site-to-llmstxt
|
||||
.gocache/
|
||||
.gomodcache/
|
||||
main
|
||||
|
||||
# Output directories
|
||||
output/
|
||||
@ -58,3 +61,5 @@ go.work.sum
|
||||
# Editor/IDE
|
||||
# .idea/
|
||||
# .vscode/
|
||||
|
||||
bin/
|
||||
|
||||
8
Makefile
8
Makefile
@ -1,6 +1,6 @@
|
||||
# Makefile for site-to-llmstxt crawler
|
||||
|
||||
.PHONY: build test clean run help fmt lint deps dev-setup
|
||||
.PHONY: build test clean run tui demo help fmt lint deps dev-setup all
|
||||
|
||||
# Variables
|
||||
BINARY_NAME=site-to-llmstxt
|
||||
@ -15,6 +15,7 @@ help:
|
||||
@echo " test-coverage - Run tests with coverage"
|
||||
@echo " clean - Clean build artifacts"
|
||||
@echo " run - Run with example URL (requires URL variable)"
|
||||
@echo " tui - Launch interactive terminal UI"
|
||||
@echo " fmt - Format code"
|
||||
@echo " lint - Lint code"
|
||||
@echo " deps - Install/update dependencies"
|
||||
@ -24,6 +25,7 @@ help:
|
||||
@echo " make build"
|
||||
@echo " make run URL=https://example.com"
|
||||
@echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output"
|
||||
@echo " make tui OUTPUT=./docs WORKERS=4"
|
||||
|
||||
# Build the crawler
|
||||
build:
|
||||
@ -98,3 +100,7 @@ all: clean deps fmt build test
|
||||
demo: build
|
||||
@echo "Running demo crawl of httpbin.org..."
|
||||
$(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose
|
||||
# Launch interactive TUI
|
||||
tui:
|
||||
@echo "Launching TUI..."
|
||||
go run ./cmd/site-to-llmstxt tui $(if $(OUTPUT),--output $(OUTPUT)) $(if $(WORKERS),--workers $(WORKERS))
|
||||
|
||||
360
cmd/site-to-llmstxt/main.go
Normal file
360
cmd/site-to-llmstxt/main.go
Normal file
@ -0,0 +1,360 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gocolly/colly/v2"
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/config"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/filters"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/generator"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/models"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/progress"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/tui"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/utils"
|
||||
)
|
||||
|
||||
func main() {
|
||||
log.SetFlags(0)
|
||||
|
||||
app := &cli.App{
|
||||
Name: "site-to-llmstxt",
|
||||
Usage: "Crawl a documentation site and generate llms.txt outputs",
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
Name: "tui",
|
||||
Usage: "Launch interactive terminal UI",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "output",
|
||||
Usage: "Output directory",
|
||||
Value: config.DefaultOutputDir,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "workers",
|
||||
Usage: "Default worker count used during discovery",
|
||||
Value: config.DefaultWorkers,
|
||||
},
|
||||
},
|
||||
Action: func(cliCtx *cli.Context) error {
|
||||
ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt)
|
||||
defer cancel()
|
||||
|
||||
opts := tui.Options{
|
||||
OutputDir: cliCtx.String("output"),
|
||||
DefaultWorkers: cliCtx.Int("workers"),
|
||||
}
|
||||
|
||||
return tui.Run(ctx, opts)
|
||||
},
|
||||
},
|
||||
},
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "url",
|
||||
Aliases: []string{"u"},
|
||||
Usage: "Start `URL` to crawl (required)",
|
||||
EnvVars: []string{"SITE_TO_LLMSTXT_URL"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "output",
|
||||
Aliases: []string{"o"},
|
||||
Usage: "Output directory",
|
||||
EnvVars: []string{"SITE_TO_LLMSTXT_OUTPUT"},
|
||||
Value: config.DefaultOutputDir,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "workers",
|
||||
Aliases: []string{"w"},
|
||||
Usage: "Number of concurrent workers",
|
||||
EnvVars: []string{"SITE_TO_LLMSTXT_WORKERS"},
|
||||
Value: config.DefaultWorkers,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "verbose",
|
||||
Usage: "Enable verbose progress logging",
|
||||
EnvVars: []string{"SITE_TO_LLMSTXT_VERBOSE"},
|
||||
},
|
||||
},
|
||||
Action: func(cliCtx *cli.Context) error {
|
||||
ctx, cancel := signal.NotifyContext(cliCtx.Context, os.Interrupt)
|
||||
defer cancel()
|
||||
|
||||
cfg := &config.Config{
|
||||
URL: cliCtx.String("url"),
|
||||
OutputDir: cliCtx.String("output"),
|
||||
Workers: cliCtx.Int("workers"),
|
||||
Verbose: cliCtx.Bool("verbose"),
|
||||
}
|
||||
|
||||
if cfg.Workers <= 0 {
|
||||
cfg.Workers = config.DefaultWorkers
|
||||
}
|
||||
if cfg.OutputDir == "" {
|
||||
cfg.OutputDir = config.DefaultOutputDir
|
||||
}
|
||||
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseURL, err := url.Parse(cfg.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse URL: %w", err)
|
||||
}
|
||||
|
||||
return crawlAndGenerate(ctx, baseURL, cfg)
|
||||
},
|
||||
}
|
||||
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func crawlAndGenerate(ctx context.Context, baseURL *url.URL, cfg *config.Config) error {
|
||||
if err := utils.CreateOutputDirs(cfg.OutputDir); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stats := &models.Stats{StartTime: time.Now()}
|
||||
basePath := ""
|
||||
if baseURL != nil {
|
||||
basePath = baseURL.Path
|
||||
}
|
||||
statsMu := &sync.Mutex{}
|
||||
progressManager := progress.New(cfg.Verbose, stats)
|
||||
defer func() {
|
||||
statsMu.Lock()
|
||||
stats.Finish()
|
||||
statsMu.Unlock()
|
||||
progressManager.Finish()
|
||||
}()
|
||||
|
||||
pages := make([]models.PageInfo, 0, 128)
|
||||
pagesMu := &sync.Mutex{}
|
||||
namer := utils.NewUniqueNamer()
|
||||
|
||||
var queued, processed int64
|
||||
|
||||
collector := colly.NewCollector(
|
||||
colly.AllowedDomains(allowedDomains(baseURL.Host)...),
|
||||
colly.Async(true),
|
||||
)
|
||||
collector.SetRequestTimeout(30 * time.Second)
|
||||
|
||||
if cfg.Workers > 0 {
|
||||
if err := collector.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: cfg.Workers,
|
||||
RandomDelay: 500 * time.Millisecond,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("configure collector: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
collector.OnRequest(func(r *colly.Request) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
r.Abort()
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
if cfg.Verbose {
|
||||
progressManager.Log("Visiting %s", r.URL.String())
|
||||
}
|
||||
})
|
||||
|
||||
collector.OnError(func(r *colly.Response, err error) {
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
progressManager.Log("Error fetching %s: %v", r.Request.URL, err)
|
||||
})
|
||||
|
||||
collector.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
pageURL := e.Request.URL.String()
|
||||
|
||||
atomic.AddInt64(&queued, -1)
|
||||
currentProcessed := atomic.AddInt64(&processed, 1)
|
||||
defer func() {
|
||||
progressManager.Update(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0)))
|
||||
}()
|
||||
|
||||
title := strings.TrimSpace(e.DOM.Find("title").First().Text())
|
||||
if title == "" {
|
||||
title = "Untitled"
|
||||
}
|
||||
|
||||
description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", ""))
|
||||
|
||||
markdown, err := htmltomarkdown.ConvertString(string(e.Response.Body))
|
||||
if err != nil {
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
progressManager.Log("Failed to convert %s: %v", pageURL, err)
|
||||
return
|
||||
}
|
||||
markdown = strings.TrimSpace(markdown)
|
||||
|
||||
if description == "" {
|
||||
description = utils.ExtractFirstSentence(markdown)
|
||||
}
|
||||
|
||||
filename := utils.CreateFilename(title, pageURL)
|
||||
filename = namer.Reserve(filename)
|
||||
relativePath := filepath.Join(config.MarkdownSubdir, filename)
|
||||
fullPath := filepath.Join(cfg.OutputDir, relativePath)
|
||||
|
||||
if err := os.WriteFile(fullPath, []byte(markdown), 0644); err != nil {
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
progressManager.Log("Failed to write %s: %v", fullPath, err)
|
||||
return
|
||||
}
|
||||
|
||||
pageInfo := models.PageInfo{
|
||||
URL: pageURL,
|
||||
Title: title,
|
||||
Content: markdown,
|
||||
FilePath: relativePath,
|
||||
CrawledAt: time.Now(),
|
||||
Description: description,
|
||||
}
|
||||
|
||||
pagesMu.Lock()
|
||||
pages = append(pages, pageInfo)
|
||||
pagesMu.Unlock()
|
||||
|
||||
statsMu.Lock()
|
||||
stats.TotalPages++
|
||||
if filters.IsMainDocPage(pageURL) {
|
||||
stats.MainDocPages++
|
||||
} else {
|
||||
stats.SecondaryPages++
|
||||
}
|
||||
statsMu.Unlock()
|
||||
|
||||
e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
|
||||
href, exists := sel.Attr("href")
|
||||
if !exists || href == "" {
|
||||
return
|
||||
}
|
||||
|
||||
absolute := e.Request.AbsoluteURL(href)
|
||||
if absolute == "" {
|
||||
return
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(absolute, "http") {
|
||||
return
|
||||
}
|
||||
|
||||
if filters.ShouldSkipURL(absolute, baseURL.Host, basePath) {
|
||||
statsMu.Lock()
|
||||
stats.AddSkipped()
|
||||
statsMu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
if err := collector.Visit(absolute); err != nil {
|
||||
var alreadyVisited *colly.AlreadyVisitedError
|
||||
if errors.As(err, &alreadyVisited) {
|
||||
return
|
||||
}
|
||||
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
progressManager.Log("Failed to queue %s: %v", absolute, err)
|
||||
return
|
||||
}
|
||||
|
||||
atomic.AddInt64(&queued, 1)
|
||||
})
|
||||
})
|
||||
|
||||
atomic.AddInt64(&queued, 1)
|
||||
if err := collector.Visit(baseURL.String()); err != nil {
|
||||
var alreadyVisited *colly.AlreadyVisitedError
|
||||
if !errors.As(err, &alreadyVisited) {
|
||||
return fmt.Errorf("start crawl: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
collector.Wait()
|
||||
|
||||
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(pages) == 0 {
|
||||
return errors.New("no pages were crawled; check URL or filters")
|
||||
}
|
||||
|
||||
gen := generator.New(baseURL, cfg.OutputDir)
|
||||
if err := gen.Generate(pages); err != nil {
|
||||
return fmt.Errorf("generate outputs: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func allowedDomains(host string) []string {
|
||||
host = strings.TrimSpace(host)
|
||||
if host == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
domains := map[string]struct{}{
|
||||
host: {},
|
||||
}
|
||||
|
||||
if strings.HasPrefix(host, "www.") {
|
||||
domains[strings.TrimPrefix(host, "www.")] = struct{}{}
|
||||
} else {
|
||||
domains["www."+host] = struct{}{}
|
||||
}
|
||||
|
||||
list := make([]string, 0, len(domains))
|
||||
for d := range domains {
|
||||
list = append(list, d)
|
||||
}
|
||||
return list
|
||||
}
|
||||
|
||||
func max64(a, b int64) int64 {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
18
go.mod
18
go.mod
@ -4,6 +4,10 @@ go 1.24.5
|
||||
|
||||
require (
|
||||
github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3
|
||||
github.com/PuerkitoBio/goquery v1.10.3
|
||||
github.com/charmbracelet/bubbles v0.17.1
|
||||
github.com/charmbracelet/bubbletea v0.25.0
|
||||
github.com/charmbracelet/lipgloss v0.9.1
|
||||
github.com/gocolly/colly/v2 v2.2.0
|
||||
github.com/schollz/progressbar/v3 v3.18.0
|
||||
github.com/urfave/cli/v2 v2.27.7
|
||||
@ -11,18 +15,29 @@ require (
|
||||
|
||||
require (
|
||||
github.com/JohannesKaufmann/dom v0.2.0 // indirect
|
||||
github.com/PuerkitoBio/goquery v1.10.3 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/antchfx/htmlquery v1.3.4 // indirect
|
||||
github.com/antchfx/xmlquery v1.4.4 // indirect
|
||||
github.com/antchfx/xpath v1.3.4 // indirect
|
||||
github.com/atotto/clipboard v0.1.4 // indirect
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.22.0 // indirect
|
||||
github.com/charmbracelet/harmonica v0.2.0 // indirect
|
||||
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.16 // indirect
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/reflow v0.3.0 // indirect
|
||||
github.com/muesli/termenv v0.16.0 // indirect
|
||||
github.com/nlnwa/whatwg-url v0.6.2 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
@ -30,6 +45,7 @@ require (
|
||||
github.com/temoto/robotstxt v1.1.2 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
|
||||
golang.org/x/net v0.42.0 // indirect
|
||||
golang.org/x/sync v0.16.0 // indirect
|
||||
golang.org/x/sys v0.34.0 // indirect
|
||||
golang.org/x/term v0.33.0 // indirect
|
||||
golang.org/x/text v0.27.0 // indirect
|
||||
|
||||
35
go.sum
35
go.sum
@ -13,11 +13,25 @@ github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fus
|
||||
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4=
|
||||
github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
|
||||
github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
||||
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
|
||||
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/charmbracelet/bubbles v0.17.1 h1:0SIyjOnkrsfDo88YvPgAWvZMwXe26TP6drRvmkjyUu4=
|
||||
github.com/charmbracelet/bubbles v0.17.1/go.mod h1:9HxZWlkCqz2PRwsCbYl7a3KXvGzFaDHpYbSYMJ+nE3o=
|
||||
github.com/charmbracelet/bubbletea v0.25.0 h1:bAfwk7jRz7FKFl9RzlIULPkStffg5k6pNt5dywy4TcM=
|
||||
github.com/charmbracelet/bubbletea v0.25.0/go.mod h1:EN3QDR1T5ZdWmdfDzYcqOCAps45+QIJbLOBxmVNWNNg=
|
||||
github.com/charmbracelet/harmonica v0.2.0 h1:8NxJWRWg/bzKqqEaaeFNipOu77YR5t8aSwG4pgaUBiQ=
|
||||
github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao=
|
||||
github.com/charmbracelet/lipgloss v0.9.1 h1:PNyd3jvaJbg4jRHKWXnCj1akQm4rh8dbEzN1p/u1KWg=
|
||||
github.com/charmbracelet/lipgloss v0.9.1/go.mod h1:1mPmG4cxScwUQALAAnacHaigiiHB9Pmr+v1VEawJl6I=
|
||||
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
|
||||
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
|
||||
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 h1:q2hJAaP1k2wIvVRd/hEHD7lacgqrCPS+k8g1MndzfWY=
|
||||
github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
@ -39,14 +53,31 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
||||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b h1:1XF24mVaiu7u+CFywTdcDo2ie1pzzhwjt6RHqzpMU34=
|
||||
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b/go.mod h1:fQuZ0gauxyBcmsdE3ZT4NasjaRdxmbCS0jRHsrWu3Ho=
|
||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
|
||||
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
|
||||
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
|
||||
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
|
||||
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
|
||||
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
@ -103,12 +134,16 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw=
|
||||
golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
|
||||
217
internal/crawler/discover.go
Normal file
217
internal/crawler/discover.go
Normal file
@ -0,0 +1,217 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gocolly/colly/v2"
|
||||
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/filters"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/models"
|
||||
)
|
||||
|
||||
// PageSummary captures metadata for a crawled URL prior to full scraping.
|
||||
type PageSummary struct {
|
||||
URL string
|
||||
Title string
|
||||
Description string
|
||||
Path string
|
||||
Depth int
|
||||
}
|
||||
|
||||
// DiscoverOptions configure the URL discovery stage.
|
||||
type DiscoverOptions struct {
|
||||
BaseURL *url.URL
|
||||
Workers int
|
||||
OnLog func(string, ...interface{})
|
||||
OnProgress func(processed, queued int)
|
||||
}
|
||||
|
||||
// Discover traverses links starting from the base URL and returns unique pages.
|
||||
func Discover(ctx context.Context, opts DiscoverOptions) ([]PageSummary, *models.Stats, error) {
|
||||
if opts.BaseURL == nil {
|
||||
return nil, nil, errors.New("base URL is required")
|
||||
}
|
||||
|
||||
stats := &models.Stats{StartTime: time.Now()}
|
||||
basePath := ""
|
||||
if opts.BaseURL != nil {
|
||||
basePath = opts.BaseURL.Path
|
||||
}
|
||||
statsMu := &sync.Mutex{}
|
||||
var (
|
||||
mu sync.Mutex
|
||||
pages = make([]PageSummary, 0, 128)
|
||||
seen = make(map[string]struct{})
|
||||
queued int64
|
||||
processed int64
|
||||
)
|
||||
|
||||
collector := colly.NewCollector(
|
||||
colly.AllowedDomains(allowedDomains(opts.BaseURL.Host)...),
|
||||
colly.Async(true),
|
||||
)
|
||||
collector.SetRequestTimeout(30 * time.Second)
|
||||
|
||||
if opts.Workers > 0 {
|
||||
if err := collector.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: opts.Workers,
|
||||
RandomDelay: 500 * time.Millisecond,
|
||||
}); err != nil {
|
||||
return nil, nil, fmt.Errorf("configure collector: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
collector.OnRequest(func(r *colly.Request) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
r.Abort()
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
if opts.OnLog != nil {
|
||||
opts.OnLog("discover: visiting %s", r.URL.String())
|
||||
}
|
||||
})
|
||||
|
||||
collector.OnError(func(r *colly.Response, err error) {
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
if opts.OnLog != nil {
|
||||
opts.OnLog("discover: error fetching %s: %v", r.Request.URL, err)
|
||||
}
|
||||
})
|
||||
|
||||
collector.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
pageURL := e.Request.URL.String()
|
||||
atomic.AddInt64(&queued, -1)
|
||||
currentProcessed := atomic.AddInt64(&processed, 1)
|
||||
defer func() {
|
||||
if opts.OnProgress != nil {
|
||||
opts.OnProgress(int(currentProcessed), int(max64(atomic.LoadInt64(&queued), 0)))
|
||||
}
|
||||
}()
|
||||
|
||||
mu.Lock()
|
||||
if _, ok := seen[pageURL]; ok {
|
||||
mu.Unlock()
|
||||
} else {
|
||||
seen[pageURL] = struct{}{}
|
||||
mu.Unlock()
|
||||
|
||||
statsMu.Lock()
|
||||
stats.TotalPages++
|
||||
if filters.IsMainDocPage(pageURL) {
|
||||
stats.MainDocPages++
|
||||
} else {
|
||||
stats.SecondaryPages++
|
||||
}
|
||||
statsMu.Unlock()
|
||||
|
||||
title := strings.TrimSpace(e.DOM.Find("title").First().Text())
|
||||
if title == "" {
|
||||
title = "Untitled"
|
||||
}
|
||||
|
||||
description := strings.TrimSpace(e.DOM.Find(`meta[name="description"]`).AttrOr("content", ""))
|
||||
if description == "" {
|
||||
description = guessDescription(e.DOM)
|
||||
}
|
||||
|
||||
summary := PageSummary{
|
||||
URL: pageURL,
|
||||
Title: title,
|
||||
Description: description,
|
||||
Path: e.Request.URL.Path,
|
||||
Depth: e.Request.Depth,
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
pages = append(pages, summary)
|
||||
mu.Unlock()
|
||||
}
|
||||
|
||||
e.DOM.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
|
||||
href, exists := sel.Attr("href")
|
||||
if !exists || href == "" {
|
||||
return
|
||||
}
|
||||
|
||||
absolute := e.Request.AbsoluteURL(href)
|
||||
if absolute == "" || !strings.HasPrefix(absolute, "http") {
|
||||
return
|
||||
}
|
||||
|
||||
if filters.ShouldSkipURL(absolute, opts.BaseURL.Host, basePath) {
|
||||
statsMu.Lock()
|
||||
stats.AddSkipped()
|
||||
statsMu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
if err := collector.Visit(absolute); err != nil {
|
||||
var alreadyVisited *colly.AlreadyVisitedError
|
||||
if errors.As(err, &alreadyVisited) {
|
||||
return
|
||||
}
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
if opts.OnLog != nil {
|
||||
opts.OnLog("discover: failed to queue %s: %v", absolute, err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
atomic.AddInt64(&queued, 1)
|
||||
})
|
||||
})
|
||||
|
||||
atomic.AddInt64(&queued, 1)
|
||||
if err := collector.Visit(opts.BaseURL.String()); err != nil {
|
||||
var alreadyVisited *colly.AlreadyVisitedError
|
||||
if !errors.As(err, &alreadyVisited) {
|
||||
return nil, nil, fmt.Errorf("start discovery: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
collector.Wait()
|
||||
|
||||
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
stats.Finish()
|
||||
return pages, stats, nil
|
||||
}
|
||||
|
||||
func guessDescription(sel *goquery.Selection) string {
|
||||
selection := sel.Find("p")
|
||||
for i := range selection.Nodes {
|
||||
paragraph := selection.Eq(i).Text()
|
||||
paragraph = strings.TrimSpace(paragraph)
|
||||
if paragraph != "" {
|
||||
if len(paragraph) > 240 {
|
||||
return paragraph[:240] + "..."
|
||||
}
|
||||
return paragraph
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
245
internal/crawler/scrape.go
Normal file
245
internal/crawler/scrape.go
Normal file
@ -0,0 +1,245 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/config"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/filters"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/models"
|
||||
"github.com/Sosokker/site-to-llmstxt/internal/utils"
|
||||
)
|
||||
|
||||
// ProgressUpdate conveys scraping progress for interactive UIs.
|
||||
type ProgressUpdate struct {
|
||||
Completed int
|
||||
Total int
|
||||
URL string
|
||||
}
|
||||
|
||||
// LogUpdate captures a log line emitted during scraping.
|
||||
type LogUpdate struct {
|
||||
Message string
|
||||
}
|
||||
|
||||
// ScrapeOptions configure the download stage for selected pages.
|
||||
type ScrapeOptions struct {
|
||||
BaseURL *url.URL
|
||||
Pages []PageSummary
|
||||
Output string
|
||||
Workers int
|
||||
Verbose bool
|
||||
Logs chan<- LogUpdate
|
||||
Progress chan<- ProgressUpdate
|
||||
}
|
||||
|
||||
// Scrape fetches each provided page, writes Markdown output, and returns results.
|
||||
func Scrape(ctx context.Context, opts ScrapeOptions) ([]models.PageInfo, *models.Stats, error) {
|
||||
if opts.BaseURL == nil {
|
||||
return nil, nil, errors.New("base URL is required")
|
||||
}
|
||||
if len(opts.Pages) == 0 {
|
||||
return nil, nil, errors.New("no pages selected for scraping")
|
||||
}
|
||||
|
||||
if opts.Workers <= 0 {
|
||||
opts.Workers = config.DefaultWorkers
|
||||
}
|
||||
|
||||
if err := utils.CreateOutputDirs(opts.Output); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
stats := &models.Stats{
|
||||
StartTime: time.Now(),
|
||||
TotalPages: len(opts.Pages),
|
||||
MainDocPages: 0,
|
||||
SecondaryPages: 0,
|
||||
SkippedURLs: 0,
|
||||
ErrorCount: 0,
|
||||
}
|
||||
statsMu := &sync.Mutex{}
|
||||
|
||||
for _, page := range opts.Pages {
|
||||
if filters.IsMainDocPage(page.URL) {
|
||||
stats.MainDocPages++
|
||||
} else {
|
||||
stats.SecondaryPages++
|
||||
}
|
||||
}
|
||||
|
||||
namer := utils.NewUniqueNamer()
|
||||
results := make([]models.PageInfo, 0, len(opts.Pages))
|
||||
resultsMu := &sync.Mutex{}
|
||||
|
||||
var completed int32
|
||||
progressTotal := len(opts.Pages)
|
||||
|
||||
pageCh := make(chan PageSummary)
|
||||
wg := sync.WaitGroup{}
|
||||
errOnce := sync.Once{}
|
||||
var firstErr error
|
||||
|
||||
sendLog := func(always bool, format string, args ...interface{}) {
|
||||
if !always && !opts.Verbose {
|
||||
return
|
||||
}
|
||||
if opts.Logs == nil {
|
||||
return
|
||||
}
|
||||
msg := fmt.Sprintf(format, args...)
|
||||
select {
|
||||
case opts.Logs <- LogUpdate{Message: msg}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
sendProgress := func(url string) {
|
||||
if opts.Progress == nil {
|
||||
return
|
||||
}
|
||||
done := int(atomic.AddInt32(&completed, 1))
|
||||
select {
|
||||
case opts.Progress <- ProgressUpdate{
|
||||
Completed: done,
|
||||
Total: progressTotal,
|
||||
URL: url,
|
||||
}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
for i := 0; i < opts.Workers; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for page := range pageCh {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
info, err := fetchPage(ctx, client, page, opts.Output, namer)
|
||||
if err != nil {
|
||||
errOnce.Do(func() { firstErr = err })
|
||||
statsMu.Lock()
|
||||
stats.AddError()
|
||||
statsMu.Unlock()
|
||||
sendLog(true, "error scraping %s: %v", page.URL, err)
|
||||
sendProgress(page.URL)
|
||||
continue
|
||||
}
|
||||
|
||||
resultsMu.Lock()
|
||||
results = append(results, info)
|
||||
resultsMu.Unlock()
|
||||
|
||||
sendLog(false, "scraped %s", page.URL)
|
||||
sendProgress(page.URL)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
go func() {
|
||||
for _, page := range opts.Pages {
|
||||
pageCh <- page
|
||||
}
|
||||
close(pageCh)
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
stats.Finish()
|
||||
|
||||
if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
return results, stats, err
|
||||
}
|
||||
|
||||
return results, stats, firstErr
|
||||
}
|
||||
|
||||
func fetchPage(ctx context.Context, client *http.Client, page PageSummary, outputDir string, namer *utils.UniqueNamer) (models.PageInfo, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, page.URL, nil)
|
||||
if err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
io.Copy(io.Discard, resp.Body)
|
||||
return models.PageInfo{}, fmt.Errorf("unexpected status code %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
|
||||
htmlContent := string(bodyBytes)
|
||||
markdown, err := htmltomarkdown.ConvertString(htmlContent)
|
||||
if err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
markdown = strings.TrimSpace(markdown)
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
||||
if err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
|
||||
title := strings.TrimSpace(doc.Find("title").First().Text())
|
||||
if title == "" {
|
||||
title = page.Title
|
||||
}
|
||||
|
||||
description := strings.TrimSpace(doc.Find(`meta[name="description"]`).AttrOr("content", ""))
|
||||
if description == "" {
|
||||
description = page.Description
|
||||
}
|
||||
if description == "" {
|
||||
description = utils.ExtractFirstSentence(markdown)
|
||||
}
|
||||
|
||||
filename := utils.CreateFilename(title, page.URL)
|
||||
filename = namer.Reserve(filename)
|
||||
relativePath := filepath.Join(config.MarkdownSubdir, filename)
|
||||
fullPath := filepath.Join(outputDir, relativePath)
|
||||
|
||||
if err := os.WriteFile(fullPath, []byte(markdown), 0o644); err != nil {
|
||||
return models.PageInfo{}, err
|
||||
}
|
||||
|
||||
info := models.PageInfo{
|
||||
URL: page.URL,
|
||||
Title: title,
|
||||
Content: markdown,
|
||||
FilePath: relativePath,
|
||||
CrawledAt: time.Now(),
|
||||
Description: description,
|
||||
}
|
||||
|
||||
return info, nil
|
||||
}
|
||||
33
internal/crawler/util.go
Normal file
33
internal/crawler/util.go
Normal file
@ -0,0 +1,33 @@
|
||||
package crawler
|
||||
|
||||
import "strings"
|
||||
|
||||
// allowedDomains returns the hostnames the crawler may visit for the given
// host: the host itself plus its "www."-prefixed (or -stripped) counterpart.
// A blank host yields nil.
//
// Unlike a map-based construction, the result order is deterministic: the
// original host always comes first, which keeps logs and crawler
// configuration stable across runs.
func allowedDomains(host string) []string {
	host = strings.TrimSpace(host)
	if host == "" {
		return nil
	}

	if alt := strings.TrimPrefix(host, "www."); alt != host {
		return []string{host, alt}
	}
	return []string{host, "www." + host}
}
|
||||
|
||||
// max64 reports the larger of two int64 values.
func max64(a, b int64) int64 {
	if a < b {
		return b
	}
	return a
}
|
||||
@ -31,7 +31,7 @@ var SecondaryPageIndicators = []string{
|
||||
}
|
||||
|
||||
// ShouldSkipURL determines if a URL should be skipped based on various filters.
|
||||
func ShouldSkipURL(rawURL, baseHost string) bool {
|
||||
func ShouldSkipURL(rawURL, baseHost, basePath string) bool {
|
||||
if rawURL == "" {
|
||||
return true
|
||||
}
|
||||
@ -53,10 +53,26 @@ func ShouldSkipURL(rawURL, baseHost string) bool {
|
||||
}
|
||||
|
||||
lowerURL := strings.ToLower(rawURL)
|
||||
basePathLower := strings.ToLower(basePath)
|
||||
if basePathLower != "" && !strings.HasPrefix(basePathLower, "/") {
|
||||
basePathLower = "/" + basePathLower
|
||||
}
|
||||
basePathLower = strings.TrimRight(basePathLower, "/")
|
||||
|
||||
candidatePath := strings.ToLower(u.Path)
|
||||
baseClean := strings.TrimSuffix(basePathLower, "/")
|
||||
if baseClean != "" && baseClean != "/" {
|
||||
if candidatePath != baseClean && !strings.HasPrefix(candidatePath, baseClean+"/") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Skip language variants
|
||||
for _, lang := range LanguageIndicators {
|
||||
if strings.Contains(lowerURL, lang) {
|
||||
if basePathLower != "" && strings.Contains(basePathLower, lang) {
|
||||
continue
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
@ -7,55 +7,77 @@ func TestShouldSkipURL(t *testing.T) {
|
||||
name string
|
||||
url string
|
||||
baseHost string
|
||||
basePath string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "Normal URL",
|
||||
url: "https://example.com/docs",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "Language URL - en",
|
||||
url: "https://example.com/en/docs",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Language URL - zh",
|
||||
url: "https://example.com/zh/docs",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "PDF file",
|
||||
url: "https://example.com/doc.pdf",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "ZIP file",
|
||||
url: "https://example.com/download.zip",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Fragment URL",
|
||||
url: "https://example.com/docs#section",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "External domain",
|
||||
url: "https://other.com/docs",
|
||||
baseHost: "example.com",
|
||||
basePath: "",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Language URL outside base path",
|
||||
url: "https://example.com/en/reference",
|
||||
baseHost: "example.com",
|
||||
basePath: "/en/docs/",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Sub path inside base",
|
||||
url: "https://example.com/en/docs/liff/guide",
|
||||
baseHost: "example.com",
|
||||
basePath: "/en/docs/liff/",
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := ShouldSkipURL(tt.url, tt.baseHost); got != tt.want {
|
||||
if got := ShouldSkipURL(tt.url, tt.baseHost, tt.basePath); got != tt.want {
|
||||
t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
|
||||
1084
internal/tui/model.go
Normal file
1084
internal/tui/model.go
Normal file
File diff suppressed because it is too large
Load Diff
39
internal/utils/namer.go
Normal file
39
internal/utils/namer.go
Normal file
@ -0,0 +1,39 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// UniqueNamer ensures generated filenames remain unique by appending counters.
// The zero value is not usable; construct with NewUniqueNamer. Safe for
// concurrent use by multiple goroutines.
type UniqueNamer struct {
	mu     sync.Mutex     // guards counts
	counts map[string]int // base name (extension stripped) -> names handed out
}

// NewUniqueNamer returns an initialized UniqueNamer.
func NewUniqueNamer() *UniqueNamer {
	return &UniqueNamer{
		counts: make(map[string]int),
	}
}

// Reserve records a filename and returns a unique variant if needed.
//
// The first caller of a given base name gets the filename unchanged; later
// callers receive "base-N.ext". Synthesized variants are themselves marked
// as reserved, so a later literal "base-1.ext" cannot collide with a
// "base-1.ext" that Reserve generated earlier (and vice versa).
func (n *UniqueNamer) Reserve(filename string) string {
	n.mu.Lock()
	defer n.mu.Unlock()

	ext := filepath.Ext(filename)
	base := strings.TrimSuffix(filename, ext)

	count := n.counts[base]
	if count == 0 {
		n.counts[base] = 1
		return filename
	}

	// The plain name is taken: probe "-1", "-2", ..., skipping any suffix
	// whose own base name has already been handed out.
	for {
		candidate := base + "-" + strconv.Itoa(count)
		count++
		n.counts[base] = count
		if n.counts[candidate] == 0 {
			n.counts[candidate] = 1
			return candidate + ext
		}
	}
}
|
||||
Loading…
Reference in New Issue
Block a user