mirror of https://github.com/Sosokker/site-to-llmstxt.git
synced 2025-12-18 13:34:06 +01:00
feat: add llms.txt gen, with improved CLI and output generation
This commit is contained in:
parent f702db6ede
commit 4aa2c4be52
Makefile (18 changed lines)
@@ -14,7 +14,7 @@ help:
	@echo "Examples:"
	@echo " make build"
	@echo " make run URL=https://example.com"
	@echo " make run URL=https://httpbin.org WORKERS=3 OUTPUT=./test-output"
	@echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output"

# Build the crawler
build:
@@ -40,18 +40,24 @@ install:
	@echo "Installing dependencies..."
	go mod tidy

# Run with parameters
# Run with parameters (updated for new CLI)
run: build
	@if [ -z "$(URL)" ]; then \
		echo "Error: URL is required. Usage: make run URL=https://example.com"; \
		exit 1; \
	fi
	@echo "Running crawler with URL: $(URL)"
	./crawler -url $(URL) \
		$(if $(WORKERS),-workers $(WORKERS)) \
		$(if $(OUTPUT),-output $(OUTPUT)) \
		$(if $(VERBOSE),-verbose)
	./crawler \
		--url $(URL) \
		$(if $(WORKERS),--workers $(WORKERS)) \
		$(if $(OUTPUT),--output $(OUTPUT)) \
		$(if $(VERBOSE),--verbose)

# Build and test everything
all: clean install build test
	@echo "All tasks completed successfully!"

# Quick test with a small site
demo: build
	@echo "Running demo crawl of httpbin.org..."
	./crawler --url https://httpbin.org --output ./demo-output --workers 1 --verbose

go.mod (6 changed lines)
@@ -1,4 +1,4 @@
module site-to-llmstxt
module github.com/Sosokker/site-to-llmstxt

go 1.24.5

@@ -6,6 +6,7 @@ require (
	github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3
	github.com/gocolly/colly/v2 v2.2.0
	github.com/schollz/progressbar/v3 v3.18.0
	github.com/urfave/cli/v2 v2.27.7
)

require (
@@ -16,6 +17,7 @@ require (
	github.com/antchfx/xmlquery v1.4.4 // indirect
	github.com/antchfx/xpath v1.3.4 // indirect
	github.com/bits-and-blooms/bitset v1.22.0 // indirect
	github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
	github.com/gobwas/glob v0.2.3 // indirect
	github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
	github.com/golang/protobuf v1.5.4 // indirect
@@ -23,8 +25,10 @@ require (
	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
	github.com/nlnwa/whatwg-url v0.6.2 // indirect
	github.com/rivo/uniseg v0.4.7 // indirect
	github.com/russross/blackfriday/v2 v2.1.0 // indirect
	github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
	github.com/temoto/robotstxt v1.1.2 // indirect
	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
	golang.org/x/net v0.42.0 // indirect
	golang.org/x/sys v0.34.0 // indirect
	golang.org/x/term v0.33.0 // indirect

go.sum (8 changed lines)
@@ -18,6 +18,8 @@ github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCk
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -47,6 +49,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
@@ -61,6 +65,10 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.11 h1:ZCxLyDMtz0nT2HFfsYG8WZ47Trip2+JyLysKcMYE5bo=
github.com/yuin/goldmark v1.7.11/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=

main.go (487 changed lines)
@@ -1,14 +1,15 @@
package main

import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
@@ -19,6 +20,16 @@ import (
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
	"github.com/schollz/progressbar/v3"
	"github.com/urfave/cli/v2"
)

const (
	// DefaultWorkers is the default number of concurrent workers
	DefaultWorkers = 1
	// DefaultOutputDir is the default output directory
	DefaultOutputDir = "./output"
	// MarkdownSubdir is the subdirectory for markdown files
	MarkdownSubdir = "pages"
)

// Config holds crawler configuration
@@ -29,18 +40,30 @@ type Config struct {
	Verbose bool
}

// PageInfo represents information about a crawled page
type PageInfo struct {
	URL         string
	Title       string
	Content     string
	FilePath    string
	CrawledAt   time.Time
	Description string
}

// Crawler manages the web crawling process
type Crawler struct {
	config    *Config
	collector *colly.Collector
	converter *converter.Converter
	visited   map[string]bool
	queue     chan string
	wg        sync.WaitGroup
	mu        sync.RWMutex
	baseURL   *url.URL
	bar       *progressbar.ProgressBar
	processed int
	config     *Config
	collector  *colly.Collector
	converter  *converter.Converter
	visited    map[string]bool
	queue      chan string
	wg         sync.WaitGroup
	mu         sync.RWMutex
	baseURL    *url.URL
	bar        *progressbar.ProgressBar
	processed  int
	pages      []PageInfo
	pagesMutex sync.Mutex
}

// LanguageFilter contains patterns to exclude language-specific URLs
@@ -67,35 +90,92 @@ var FileExtensionFilter = []string{
}

func main() {
	config := parseFlags()
	app := &cli.App{
		Name:  "site-to-llmstxt",
		Usage: "Web crawler that converts websites to LLMs.txt format",
		Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format.

The crawler generates:
- llms.txt: A curated overview following the LLMs.txt specification
- llms-full.txt: Complete content of all crawled pages
- pages/: Directory containing individual markdown files

The crawler respects robots.txt, filters out language variants and file downloads,
and only crawls within the same domain.`,
		Version: "1.0.0",
		Authors: []*cli.Author{
			{
				Name: "Site-to-LLMsTxt",
			},
		},
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "url",
				Aliases:  []string{"u"},
				Usage:    "Root URL to crawl (required)",
				Required: true,
			},
			&cli.StringFlag{
				Name:    "output",
				Aliases: []string{"o"},
				Usage:   "Output directory",
				Value:   DefaultOutputDir,
			},
			&cli.IntFlag{
				Name:    "workers",
				Aliases: []string{"w"},
				Usage:   "Number of concurrent workers",
				Value:   DefaultWorkers,
			},
			&cli.BoolFlag{
				Name:  "verbose",
				Usage: "Enable verbose logging",
			},
		},
		Action: func(c *cli.Context) error {
			config := &Config{
				URL:       c.String("url"),
				OutputDir: c.String("output"),
				Workers:   c.Int("workers"),
				Verbose:   c.Bool("verbose"),
			}

			return runCrawler(config)
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}

func runCrawler(config *Config) error {
	if err := validateConfig(config); err != nil {
		log.Fatalf("Invalid configuration: %v", err)
		return fmt.Errorf("invalid configuration: %w", err)
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		log.Fatalf("Failed to create crawler: %v", err)
		return fmt.Errorf("failed to create crawler: %w", err)
	}

	ctx := context.Background()
	if err := crawler.Start(ctx); err != nil {
		log.Fatalf("Crawling failed: %v", err)
		return fmt.Errorf("crawling failed: %w", err)
	}

	fmt.Printf("\nCrawling completed successfully! Files saved to: %s\n", config.OutputDir)
}
	if err := crawler.GenerateLLMSFiles(); err != nil {
		return fmt.Errorf("failed to generate LLMS files: %w", err)
	}

func parseFlags() *Config {
	config := &Config{}
	fmt.Printf("\nCrawling completed successfully!\n")
	fmt.Printf("Generated files:\n")
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt"))
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt"))
	fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir))
	fmt.Printf("Total pages crawled: %d\n", len(crawler.pages))

	flag.StringVar(&config.URL, "url", "", "Root URL to crawl (required)")
	flag.StringVar(&config.OutputDir, "output", "./output", "Output directory for markdown files")
	flag.IntVar(&config.Workers, "workers", 5, "Number of concurrent workers")
	flag.BoolVar(&config.Verbose, "verbose", false, "Enable verbose logging")
	flag.Parse()

	return config
	return nil
}

func validateConfig(config *Config) error {
@@ -103,14 +183,13 @@ func validateConfig(config *Config) error {
		return fmt.Errorf("URL is required")
	}

	parsedURL, err := url.Parse(config.URL)
	u, err := url.Parse(config.URL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	// Check if URL has a valid scheme and host
	if parsedURL.Scheme == "" || parsedURL.Host == "" {
		return fmt.Errorf("URL must include scheme (http/https) and host")
	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("URL must have http or https scheme")
	}

	if config.Workers <= 0 {
@@ -127,9 +206,9 @@ func NewCrawler(config *Config) (*Crawler, error) {
		return nil, fmt.Errorf("failed to parse base URL: %w", err)
	}

	// Create output directory
	if err := os.MkdirAll(config.OutputDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create output directory: %w", err)
	// Create output directory structure
	if err := createOutputDirs(config.OutputDir); err != nil {
		return nil, fmt.Errorf("failed to create output directories: %w", err)
	}

	// Setup colly collector
@@ -145,7 +224,7 @@ func NewCrawler(config *Config) (*Crawler, error) {
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: config.Workers,
		Delay:       100 * time.Millisecond,
		Delay:       200 * time.Millisecond, // Slightly more conservative
	})

	// Setup HTML to Markdown converter
@@ -164,6 +243,7 @@ func NewCrawler(config *Config) (*Crawler, error) {
		queue:   make(chan string, 1000),
		baseURL: baseURL,
		bar:     progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")),
		pages:   make([]PageInfo, 0),
	}

	crawler.setupCallbacks()
@@ -171,6 +251,21 @@ func NewCrawler(config *Config) (*Crawler, error) {
	return crawler, nil
}

func createOutputDirs(outputDir string) error {
	dirs := []string{
		outputDir,
		filepath.Join(outputDir, MarkdownSubdir),
	}

	for _, dir := range dirs {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir, err)
		}
	}

	return nil
}
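
Note: taken together with the llms.txt and llms-full.txt writers further down, createOutputDirs implies an output tree roughly like the sketch below. The page filename is a hypothetical example of what createFilename would produce (URL path slashes replaced with dashes, .md appended); actual names depend on the crawled site.

output/
├── llms.txt
├── llms-full.txt
└── pages/
    └── docs-getting-started.md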

func (c *Crawler) setupCallbacks() {
	// Handle HTML content
	c.collector.OnHTML("html", func(e *colly.HTMLElement) {
@@ -199,9 +294,16 @@ func (c *Crawler) setupCallbacks() {

func (c *Crawler) processPage(e *colly.HTMLElement) {
	// Get page title
	title := e.ChildText("title")
	title := strings.TrimSpace(e.ChildText("title"))
	if title == "" {
		title = "untitled"
		title = "Untitled"
	}

	// Get meta description
	description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content"))
	if description == "" {
		// Try og:description
		description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content"))
	}

	// Convert HTML to Markdown
@@ -217,31 +319,62 @@ func (c *Crawler) processPage(e *colly.HTMLElement) {
		return
	}

	// Save to file
	if err := c.saveMarkdown(e.Request.URL, title, markdown); err != nil {
	// Create page info
	pageInfo := PageInfo{
		URL:         e.Request.URL.String(),
		Title:       title,
		Content:     markdown,
		CrawledAt:   time.Now(),
		Description: description,
	}

	// Save individual markdown file
	filename := c.createFilename(e.Request.URL, title)
	pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename)
	fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath)

	if err := c.saveMarkdown(fullPath, pageInfo); err != nil {
		log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Add to pages collection
	c.pagesMutex.Lock()
	c.pages = append(c.pages, pageInfo)
	c.pagesMutex.Unlock()

	c.mu.Lock()
	c.processed++
	c.mu.Unlock()
}

func (c *Crawler) saveMarkdown(pageURL *url.URL, title, markdown string) error {
	// Create filename from URL path
	filename := c.createFilename(pageURL, title)
	filePath := filepath.Join(c.config.OutputDir, filename)

func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error {
	// Ensure directory exists
	dir := filepath.Dir(filePath)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	// Add metadata header
	content := fmt.Sprintf("# %s\n\nURL: %s\nCrawled: %s\n\n---\n\n%s",
		title, pageURL.String(), time.Now().Format(time.RFC3339), markdown)
	// Create content with metadata
	content := fmt.Sprintf(`# %s

URL: %s
Crawled: %s
%s

---

%s`,
		pageInfo.Title,
		pageInfo.URL,
		pageInfo.CrawledAt.Format(time.RFC3339),
		func() string {
			if pageInfo.Description != "" {
				return fmt.Sprintf("Description: %s", pageInfo.Description)
			}
			return ""
		}(),
		pageInfo.Content)

	// Write file
	if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
@@ -267,6 +400,11 @@ func (c *Crawler) createFilename(pageURL *url.URL, title string) string {
		filename = strings.ReplaceAll(urlPath, "/", "-")
	}

	// Limit filename length
	if len(filename) > 100 {
		filename = filename[:100]
	}

	// Ensure .md extension
	if !strings.HasSuffix(filename, ".md") {
		filename += ".md"
@@ -329,7 +467,7 @@ func (c *Crawler) shouldSkipURL(urlStr string) bool {
		}
	}

	// Skip fragments and query parameters that might be irrelevant
	// Skip fragments
	if strings.Contains(urlStr, "#") {
		return true
	}
@@ -342,6 +480,10 @@ func (c *Crawler) Start(ctx context.Context) error {
	fmt.Printf("Output directory: %s\n", c.config.OutputDir)
	fmt.Printf("Workers: %d\n", c.config.Workers)

	// Create a cancellable context for workers
	workerCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Add seed URL to queue
	c.queue <- c.config.URL
	c.visited[c.config.URL] = true
@@ -349,13 +491,25 @@ func (c *Crawler) Start(ctx context.Context) error {
	// Start workers
	for i := 0; i < c.config.Workers; i++ {
		c.wg.Add(1)
		go c.worker(ctx)
		go c.worker(workerCtx)
	}

	// Monitor progress
	go c.monitor(ctx)
	// Monitor progress and handle completion
	done := make(chan struct{})
	go func() {
		c.monitor(workerCtx)
		close(done)
	}()

	// Wait for completion
	// Wait for either completion or cancellation
	select {
	case <-done:
		cancel() // Stop workers
	case <-ctx.Done():
		// External cancellation
	}

	// Wait for workers to finish
	c.wg.Wait()
	close(c.queue)
	c.bar.Finish()
@@ -386,7 +540,7 @@ func (c *Crawler) worker(ctx context.Context) {
}

func (c *Crawler) monitor(ctx context.Context) {
	ticker := time.NewTicker(5 * time.Second)
	ticker := time.NewTicker(2 * time.Second) // Check more frequently
	defer ticker.Stop()

	lastProcessed := 0
@@ -404,8 +558,12 @@ func (c *Crawler) monitor(ctx context.Context) {

		if current == lastProcessed {
			noProgressCount++
			if noProgressCount >= 6 && queueLen == 0 { // 30 seconds with no progress and empty queue
				fmt.Println("\nNo progress detected, stopping crawler...")
			// More aggressive completion detection
			if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds with no progress and empty queue
				(noProgressCount >= 15) { // Or 30 seconds regardless
				if c.config.Verbose {
					fmt.Println("\nNo progress detected, stopping crawler...")
				}
				return
			}
		} else {
@@ -419,3 +577,224 @@ func (c *Crawler) monitor(ctx context.Context) {
		}
	}
}

// GenerateLLMSFiles creates both llms.txt and llms-full.txt files
func (c *Crawler) GenerateLLMSFiles() error {
	if err := c.generateLLMSTxt(); err != nil {
		return fmt.Errorf("failed to generate llms.txt: %w", err)
	}

	if err := c.generateLLMSFullTxt(); err != nil {
		return fmt.Errorf("failed to generate llms-full.txt: %w", err)
	}

	return nil
}

func (c *Crawler) generateLLMSTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title (required)
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle))

	// Blockquote summary (optional but recommended)
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	// Additional details
	content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	// Main documentation section
	content.WriteString("## Documentation\n\n")
	for _, page := range sortedPages {
		if c.isMainDocPage(page) {
			description := page.Description
			if description == "" {
				description = c.extractFirstSentence(page.Content)
			}
			if description != "" {
				content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description))
			} else {
				content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
			}
		}
	}

	// Optional section for secondary pages
	secondaryPages := c.getSecondaryPages(sortedPages)
	if len(secondaryPages) > 0 {
		content.WriteString("\n## Optional\n\n")
		for _, page := range secondaryPages {
			content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}
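
For orientation, generateLLMSTxt writes a file shaped like the hypothetical excerpt below; the site title, summary, URLs, descriptions, and date are invented for illustration and would come from the crawled pages in practice.

# Example Docs

> Documentation and content from example.com

This documentation was automatically crawled from https://example.com on July 1, 2025.

## Documentation

- [Getting Started](https://example.com/docs/getting-started): A quick introduction to the project.

## Optional

- [About](https://example.com/about)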

func (c *Crawler) generateLLMSFullTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle))

	// Summary
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	content.WriteString("---\n\n")

	// Include full content of each page
	for i, page := range sortedPages {
		content.WriteString(fmt.Sprintf("## %s\n\n", page.Title))
		content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL))

		if page.Description != "" {
			content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description))
		}

		content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339)))

		// Clean and include content
		cleanContent := c.cleanContentForLLMS(page.Content)
		content.WriteString(cleanContent)

		// Add separator between pages (except for the last one)
		if i < len(sortedPages)-1 {
			content.WriteString("\n\n---\n\n")
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms-full.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}
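
Each page entry that generateLLMSFullTxt appends to llms-full.txt therefore follows the layout sketched below (all values hypothetical); a --- rule separates entries except after the last page.

## Getting Started

**URL:** https://example.com/docs/getting-started

**Description:** A quick introduction to the project.

**Crawled:** 2025-07-01T12:00:00Z

...cleaned page content...

---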

func (c *Crawler) getSiteTitle() string {
	// Try to get site title from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Title != "" && page.Title != "Untitled" {
				return page.Title
			}
		}
	}

	// Fallback to domain name
	return c.baseURL.Host
}

func (c *Crawler) generateSiteSummary() string {
	// Try to get description from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Description != "" {
				return page.Description
			}
			// Extract first meaningful paragraph
			return c.extractFirstSentence(page.Content)
		}
	}

	return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host)
}

func (c *Crawler) isMainDocPage(page PageInfo) bool {
	// Consider a page "main documentation" if it's not in typical secondary sections
	lowerURL := strings.ToLower(page.URL)

	// Skip pages that are typically secondary
	secondaryIndicators := []string{
		"/blog", "/news", "/archive", "/changelog", "/release",
		"/about", "/contact", "/legal", "/privacy", "/terms",
		"/community", "/forum", "/discuss",
	}

	for _, indicator := range secondaryIndicators {
		// Check for the indicator followed by either / or end of URL
		if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) {
			return false
		}
	}

	return true
}

func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo {
	var secondary []PageInfo
	for _, page := range allPages {
		if !c.isMainDocPage(page) {
			secondary = append(secondary, page)
		}
	}
	return secondary
}

func (c *Crawler) extractFirstSentence(content string) string {
	// Clean the content and extract the first meaningful sentence
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip empty lines, headers, and markdown syntax
		if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") {
			// Find the first sentence
			sentences := strings.Split(line, ".")
			if len(sentences) > 0 && len(sentences[0]) > 20 {
				return strings.TrimSpace(sentences[0]) + "."
			}
		}
	}
	return ""
}

func (c *Crawler) cleanContentForLLMS(content string) string {
	// Clean the content for better readability in LLMs context
	var cleaned strings.Builder
	scanner := bufio.NewScanner(strings.NewReader(content))

	var inCodeBlock bool
	for scanner.Scan() {
		line := scanner.Text()

		// Handle code blocks
		if strings.HasPrefix(strings.TrimSpace(line), "```") {
			inCodeBlock = !inCodeBlock
		}

		// Skip empty lines unless in code block
		if strings.TrimSpace(line) == "" && !inCodeBlock {
			continue
		}

		cleaned.WriteString(line)
		cleaned.WriteString("\n")
	}

	return strings.TrimSpace(cleaned.String())
}
main_test.go (220 changed lines)
@@ -5,38 +5,55 @@ import (
	"testing"
)

func TestShouldSkipURL(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

func TestValidateConfig(t *testing.T) {
	tests := []struct {
		name     string
		url      string
		expected bool
		name    string
		config  *Config
		wantErr bool
	}{
		{"Normal URL", "https://example.com/page", false},
		{"Language URL - en", "https://example.com/en/page", true},
		{"Language URL - zh", "https://example.com/zh/page", true},
		{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
		{"PDF file", "https://example.com/document.pdf", true},
		{"ZIP file", "https://example.com/archive.zip", true},
		{"Fragment URL", "https://example.com/page#section", true},
		{"Image file", "https://example.com/image.jpg", true},
		{
			name: "Valid config",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: false,
		},
		{
			name: "Empty URL",
			config: &Config{
				URL:       "",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: true,
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL:       "not-a-url",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: true,
		},
		{
			name: "Zero workers",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   0,
			},
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.shouldSkipURL(tt.url)
			if result != tt.expected {
				t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
			err := validateConfig(tt.config)
			if (err != nil) != tt.wantErr {
				t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
			}
		})
	}
@@ -97,55 +114,136 @@ func TestCreateFilename(t *testing.T) {
	}
}

func TestValidateConfig(t *testing.T) {
func TestShouldSkipURL(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name    string
		config  *Config
		wantErr bool
		name     string
		url      string
		expected bool
	}{
		{"Normal URL", "https://example.com/page", false},
		{"Language URL - en", "https://example.com/en/page", true},
		{"Language URL - zh", "https://example.com/zh/page", true},
		{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
		{"PDF file", "https://example.com/document.pdf", true},
		{"ZIP file", "https://example.com/archive.zip", true},
		{"Fragment URL", "https://example.com/page#section", true},
		{"Image file", "https://example.com/image.jpg", true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.shouldSkipURL(tt.url)
			if result != tt.expected {
				t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}

func TestExtractFirstSentence(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name     string
		content  string
		expected string
	}{
		{
			name: "Valid config",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: false,
			name:     "Simple sentence",
			content:  "This is a simple sentence about something interesting. This is another sentence.",
			expected: "This is a simple sentence about something interesting.",
		},
		{
			name: "Empty URL",
			config: &Config{
				URL:       "",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: true,
			name:     "With headers",
			content:  "# Header\n\nThis is the main content that should be extracted as the first sentence.",
			expected: "This is the main content that should be extracted as the first sentence.",
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL:       "not-a-url",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: true,
			name:     "Short content",
			content:  "Short text",
			expected: "",
		},
		{
			name: "Zero workers",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   0,
			},
			wantErr: true,
			name:     "Empty content",
			content:  "",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validateConfig(tt.config)
			if (err != nil) != tt.wantErr {
				t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
			result := crawler.extractFirstSentence(tt.content)
			if result != tt.expected {
				t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected)
			}
		})
	}
}

func TestIsMainDocPage(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name     string
		page     PageInfo
		expected bool
	}{
		{
			name:     "Main documentation page",
			page:     PageInfo{URL: "https://example.com/docs/getting-started"},
			expected: true,
		},
		{
			name:     "Blog page",
			page:     PageInfo{URL: "https://example.com/blog/latest-news"},
			expected: false,
		},
		{
			name:     "About page",
			page:     PageInfo{URL: "https://example.com/about"},
			expected: false,
		},
		{
			name:     "API documentation",
			page:     PageInfo{URL: "https://example.com/api/reference"},
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.isMainDocPage(tt.page)
			if result != tt.expected {
				t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected)
			}
		})
	}
}