From 4aa2c4be52ddada642b291b58b4e57b874b4e4c4 Mon Sep 17 00:00:00 2001 From: Sirin Puenggun Date: Sat, 12 Jul 2025 16:06:10 +0000 Subject: [PATCH] feat: add llms.txt gen, with improved CLI and output generation --- Makefile | 18 +- go.mod | 6 +- go.sum | 8 + main.go | 487 +++++++++++++++++++++++++++++++++++++++++++++------ main_test.go | 220 ++++++++++++++++------- 5 files changed, 617 insertions(+), 122 deletions(-) diff --git a/Makefile b/Makefile index a3cd21a..6a13fa4 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ help: @echo "Examples:" @echo " make build" @echo " make run URL=https://example.com" - @echo " make run URL=https://httpbin.org WORKERS=3 OUTPUT=./test-output" + @echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output" # Build the crawler build: @@ -40,18 +40,24 @@ install: @echo "Installing dependencies..." go mod tidy -# Run with parameters +# Run with parameters (updated for new CLI) run: build @if [ -z "$(URL)" ]; then \ echo "Error: URL is required. Usage: make run URL=https://example.com"; \ exit 1; \ fi @echo "Running crawler with URL: $(URL)" - ./crawler -url $(URL) \ - $(if $(WORKERS),-workers $(WORKERS)) \ - $(if $(OUTPUT),-output $(OUTPUT)) \ - $(if $(VERBOSE),-verbose) + ./crawler \ + --url $(URL) \ + $(if $(WORKERS),--workers $(WORKERS)) \ + $(if $(OUTPUT),--output $(OUTPUT)) \ + $(if $(VERBOSE),--verbose) # Build and test everything all: clean install build test @echo "All tasks completed successfully!" + +# Quick test with a small site +demo: build + @echo "Running demo crawl of httpbin.org..." + ./crawler --url https://httpbin.org --output ./demo-output --workers 1 --verbose diff --git a/go.mod b/go.mod index cc6ebe3..d6e83ce 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module site-to-llmstxt +module github.com/Sosokker/site-to-llmstxt go 1.24.5 @@ -6,6 +6,7 @@ require ( github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3 github.com/gocolly/colly/v2 v2.2.0 github.com/schollz/progressbar/v3 v3.18.0 + github.com/urfave/cli/v2 v2.27.7 ) require ( @@ -16,6 +17,7 @@ require ( github.com/antchfx/xmlquery v1.4.4 // indirect github.com/antchfx/xpath v1.3.4 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/protobuf v1.5.4 // indirect @@ -23,8 +25,10 @@ require ( github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/nlnwa/whatwg-url v0.6.2 // indirect github.com/rivo/uniseg v0.4.7 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/temoto/robotstxt v1.1.2 // indirect + github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect golang.org/x/net v0.42.0 // indirect golang.org/x/sys v0.34.0 // indirect golang.org/x/term v0.33.0 // indirect diff --git a/go.sum b/go.sum index ff930d4..3ae9704 100644 --- a/go.sum +++ b/go.sum @@ -18,6 +18,8 @@ github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCk github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= +github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= 
+github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -47,6 +49,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA= @@ -61,6 +65,10 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= +github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= +github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= +github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yuin/goldmark v1.7.11 h1:ZCxLyDMtz0nT2HFfsYG8WZ47Trip2+JyLysKcMYE5bo= github.com/yuin/goldmark v1.7.11/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= diff --git a/main.go b/main.go index e986729..7dae338 100644 --- a/main.go +++ b/main.go @@ -1,14 +1,15 @@ package main import ( + "bufio" "context" - "flag" "fmt" "log" "net/url" "os" "path/filepath" "regexp" + "sort" "strings" "sync" "time" @@ -19,6 +20,16 @@ import ( "github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2/debug" "github.com/schollz/progressbar/v3" + "github.com/urfave/cli/v2" +) + +const ( + // DefaultWorkers is the default number of concurrent workers + DefaultWorkers = 1 + // DefaultOutputDir is the default output directory + DefaultOutputDir = "./output" + // MarkdownSubdir is the subdirectory for markdown files + MarkdownSubdir = "pages" ) // Config holds crawler configuration @@ -29,18 +40,30 @@ type Config struct { Verbose bool } +// PageInfo represents information about a crawled page +type PageInfo struct { + URL string + Title string + Content string + FilePath string + CrawledAt time.Time + Description string +} + // Crawler manages the web crawling process type Crawler struct { - config *Config - collector *colly.Collector - converter *converter.Converter - visited map[string]bool - queue chan string - wg sync.WaitGroup - mu sync.RWMutex - baseURL *url.URL - bar *progressbar.ProgressBar - processed int + config *Config + collector *colly.Collector + converter 
*converter.Converter + visited map[string]bool + queue chan string + wg sync.WaitGroup + mu sync.RWMutex + baseURL *url.URL + bar *progressbar.ProgressBar + processed int + pages []PageInfo + pagesMutex sync.Mutex } // LanguageFilter contains patterns to exclude language-specific URLs @@ -67,35 +90,92 @@ var FileExtensionFilter = []string{ } func main() { - config := parseFlags() + app := &cli.App{ + Name: "site-to-llmstxt", + Usage: "Web crawler that converts websites to LLMs.txt format", + Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format. + +The crawler generates: +- llms.txt: A curated overview following the LLMs.txt specification +- llms-full.txt: Complete content of all crawled pages +- pages/: Directory containing individual markdown files +The crawler respects robots.txt, filters out language variants and file downloads, +and only crawls within the same domain.`, + Version: "1.0.0", + Authors: []*cli.Author{ + { + Name: "Site-to-LLMsTxt", + }, + }, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "url", + Aliases: []string{"u"}, + Usage: "Root URL to crawl (required)", + Required: true, + }, + &cli.StringFlag{ + Name: "output", + Aliases: []string{"o"}, + Usage: "Output directory", + Value: DefaultOutputDir, + }, + &cli.IntFlag{ + Name: "workers", + Aliases: []string{"w"}, + Usage: "Number of concurrent workers", + Value: DefaultWorkers, + }, + &cli.BoolFlag{ + Name: "verbose", + Usage: "Enable verbose logging", + }, + }, + Action: func(c *cli.Context) error { + config := &Config{ + URL: c.String("url"), + OutputDir: c.String("output"), + Workers: c.Int("workers"), + Verbose: c.Bool("verbose"), + } + + return runCrawler(config) + }, + } + + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } +} + +func runCrawler(config *Config) error { if err := validateConfig(config); err != nil { - log.Fatalf("Invalid configuration: %v", err) + return fmt.Errorf("invalid configuration: %w", err) } crawler, err := NewCrawler(config) if err != nil { - log.Fatalf("Failed to create crawler: %v", err) + return fmt.Errorf("failed to create crawler: %w", err) } ctx := context.Background() if err := crawler.Start(ctx); err != nil { - log.Fatalf("Crawling failed: %v", err) + return fmt.Errorf("crawling failed: %w", err) } - fmt.Printf("\nCrawling completed successfully! 
Files saved to: %s\n", config.OutputDir) -} + if err := crawler.GenerateLLMSFiles(); err != nil { + return fmt.Errorf("failed to generate LLMS files: %w", err) + } -func parseFlags() *Config { - config := &Config{} + fmt.Printf("\nCrawling completed successfully!\n") + fmt.Printf("Generated files:\n") + fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt")) + fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt")) + fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir)) + fmt.Printf("Total pages crawled: %d\n", len(crawler.pages)) - flag.StringVar(&config.URL, "url", "", "Root URL to crawl (required)") - flag.StringVar(&config.OutputDir, "output", "./output", "Output directory for markdown files") - flag.IntVar(&config.Workers, "workers", 5, "Number of concurrent workers") - flag.BoolVar(&config.Verbose, "verbose", false, "Enable verbose logging") - flag.Parse() - - return config + return nil } func validateConfig(config *Config) error { @@ -103,14 +183,13 @@ func validateConfig(config *Config) error { return fmt.Errorf("URL is required") } - parsedURL, err := url.Parse(config.URL) + u, err := url.Parse(config.URL) if err != nil { return fmt.Errorf("invalid URL: %w", err) } - // Check if URL has a valid scheme and host - if parsedURL.Scheme == "" || parsedURL.Host == "" { - return fmt.Errorf("URL must include scheme (http/https) and host") + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("URL must have http or https scheme") } if config.Workers <= 0 { @@ -127,9 +206,9 @@ func NewCrawler(config *Config) (*Crawler, error) { return nil, fmt.Errorf("failed to parse base URL: %w", err) } - // Create output directory - if err := os.MkdirAll(config.OutputDir, 0755); err != nil { - return nil, fmt.Errorf("failed to create output directory: %w", err) + // Create output directory structure + if err := createOutputDirs(config.OutputDir); err != nil { + return nil, fmt.Errorf("failed to create output directories: %w", err) } // Setup colly collector @@ -145,7 +224,7 @@ func NewCrawler(config *Config) (*Crawler, error) { c.Limit(&colly.LimitRule{ DomainGlob: "*", Parallelism: config.Workers, - Delay: 100 * time.Millisecond, + Delay: 200 * time.Millisecond, // Slightly more conservative }) // Setup HTML to Markdown converter @@ -164,6 +243,7 @@ func NewCrawler(config *Config) (*Crawler, error) { queue: make(chan string, 1000), baseURL: baseURL, bar: progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")), + pages: make([]PageInfo, 0), } crawler.setupCallbacks() @@ -171,6 +251,21 @@ func NewCrawler(config *Config) (*Crawler, error) { return crawler, nil } +func createOutputDirs(outputDir string) error { + dirs := []string{ + outputDir, + filepath.Join(outputDir, MarkdownSubdir), + } + + for _, dir := range dirs { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %w", dir, err) + } + } + + return nil +} + func (c *Crawler) setupCallbacks() { // Handle HTML content c.collector.OnHTML("html", func(e *colly.HTMLElement) { @@ -199,9 +294,16 @@ func (c *Crawler) setupCallbacks() { func (c *Crawler) processPage(e *colly.HTMLElement) { // Get page title - title := e.ChildText("title") + title := strings.TrimSpace(e.ChildText("title")) if title == "" { - title = "untitled" + title = "Untitled" + } + + // Get meta description + description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content")) + if description == "" { + // Try og:description 
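+	// (og:description is the Open Graph counterpart and is often present when the standard meta description is missing)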
+ description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content")) } // Convert HTML to Markdown @@ -217,31 +319,62 @@ func (c *Crawler) processPage(e *colly.HTMLElement) { return } - // Save to file - if err := c.saveMarkdown(e.Request.URL, title, markdown); err != nil { + // Create page info + pageInfo := PageInfo{ + URL: e.Request.URL.String(), + Title: title, + Content: markdown, + CrawledAt: time.Now(), + Description: description, + } + + // Save individual markdown file + filename := c.createFilename(e.Request.URL, title) + pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename) + fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath) + + if err := c.saveMarkdown(fullPath, pageInfo); err != nil { log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err) return } + // Add to pages collection + c.pagesMutex.Lock() + c.pages = append(c.pages, pageInfo) + c.pagesMutex.Unlock() + c.mu.Lock() c.processed++ c.mu.Unlock() } -func (c *Crawler) saveMarkdown(pageURL *url.URL, title, markdown string) error { - // Create filename from URL path - filename := c.createFilename(pageURL, title) - filePath := filepath.Join(c.config.OutputDir, filename) - +func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error { // Ensure directory exists dir := filepath.Dir(filePath) if err := os.MkdirAll(dir, 0755); err != nil { return fmt.Errorf("failed to create directory %s: %w", dir, err) } - // Add metadata header - content := fmt.Sprintf("# %s\n\nURL: %s\nCrawled: %s\n\n---\n\n%s", - title, pageURL.String(), time.Now().Format(time.RFC3339), markdown) + // Create content with metadata + content := fmt.Sprintf(`# %s + +URL: %s +Crawled: %s +%s + +--- + +%s`, + pageInfo.Title, + pageInfo.URL, + pageInfo.CrawledAt.Format(time.RFC3339), + func() string { + if pageInfo.Description != "" { + return fmt.Sprintf("Description: %s", pageInfo.Description) + } + return "" + }(), + pageInfo.Content) // Write file if err := os.WriteFile(filePath, []byte(content), 0644); err != nil { @@ -267,6 +400,11 @@ func (c *Crawler) createFilename(pageURL *url.URL, title string) string { filename = strings.ReplaceAll(urlPath, "/", "-") } + // Limit filename length + if len(filename) > 100 { + filename = filename[:100] + } + // Ensure .md extension if !strings.HasSuffix(filename, ".md") { filename += ".md" @@ -329,7 +467,7 @@ func (c *Crawler) shouldSkipURL(urlStr string) bool { } } - // Skip fragments and query parameters that might be irrelevant + // Skip fragments if strings.Contains(urlStr, "#") { return true } @@ -342,6 +480,10 @@ func (c *Crawler) Start(ctx context.Context) error { fmt.Printf("Output directory: %s\n", c.config.OutputDir) fmt.Printf("Workers: %d\n", c.config.Workers) + // Create a cancellable context for workers + workerCtx, cancel := context.WithCancel(ctx) + defer cancel() + // Add seed URL to queue c.queue <- c.config.URL c.visited[c.config.URL] = true @@ -349,13 +491,25 @@ func (c *Crawler) Start(ctx context.Context) error { // Start workers for i := 0; i < c.config.Workers; i++ { c.wg.Add(1) - go c.worker(ctx) + go c.worker(workerCtx) } - // Monitor progress - go c.monitor(ctx) + // Monitor progress and handle completion + done := make(chan struct{}) + go func() { + c.monitor(workerCtx) + close(done) + }() - // Wait for completion + // Wait for either completion or cancellation + select { + case <-done: + cancel() // Stop workers + case <-ctx.Done(): + // External cancellation + } + + // Wait for workers to finish c.wg.Wait() close(c.queue) 
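+	// All worker goroutines have exited by now; finish the progress bar before llms.txt generation starts.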
c.bar.Finish() @@ -386,7 +540,7 @@ func (c *Crawler) worker(ctx context.Context) { } func (c *Crawler) monitor(ctx context.Context) { - ticker := time.NewTicker(5 * time.Second) + ticker := time.NewTicker(2 * time.Second) // Check more frequently defer ticker.Stop() lastProcessed := 0 @@ -404,8 +558,12 @@ func (c *Crawler) monitor(ctx context.Context) { if current == lastProcessed { noProgressCount++ - if noProgressCount >= 6 && queueLen == 0 { // 30 seconds with no progress and empty queue - fmt.Println("\nNo progress detected, stopping crawler...") + // More aggressive completion detection + if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds with no progress and empty queue + (noProgressCount >= 15) { // Or 30 seconds regardless + if c.config.Verbose { + fmt.Println("\nNo progress detected, stopping crawler...") + } return } } else { @@ -419,3 +577,224 @@ func (c *Crawler) monitor(ctx context.Context) { } } } + +// GenerateLLMSFiles creates both llms.txt and llms-full.txt files +func (c *Crawler) GenerateLLMSFiles() error { + if err := c.generateLLMSTxt(); err != nil { + return fmt.Errorf("failed to generate llms.txt: %w", err) + } + + if err := c.generateLLMSFullTxt(); err != nil { + return fmt.Errorf("failed to generate llms-full.txt: %w", err) + } + + return nil +} + +func (c *Crawler) generateLLMSTxt() error { + // Sort pages by URL for consistent output + sortedPages := make([]PageInfo, len(c.pages)) + copy(sortedPages, c.pages) + sort.Slice(sortedPages, func(i, j int) bool { + return sortedPages[i].URL < sortedPages[j].URL + }) + + var content strings.Builder + + // H1 title (required) + siteTitle := c.getSiteTitle() + content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle)) + + // Blockquote summary (optional but recommended) + summary := c.generateSiteSummary() + if summary != "" { + content.WriteString(fmt.Sprintf("> %s\n\n", summary)) + } + + // Additional details + content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n", + c.config.URL, time.Now().Format("January 2, 2006"))) + + // Main documentation section + content.WriteString("## Documentation\n\n") + for _, page := range sortedPages { + if c.isMainDocPage(page) { + description := page.Description + if description == "" { + description = c.extractFirstSentence(page.Content) + } + if description != "" { + content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description)) + } else { + content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL)) + } + } + } + + // Optional section for secondary pages + secondaryPages := c.getSecondaryPages(sortedPages) + if len(secondaryPages) > 0 { + content.WriteString("\n## Optional\n\n") + for _, page := range secondaryPages { + content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL)) + } + } + + // Write to file + filePath := filepath.Join(c.config.OutputDir, "llms.txt") + return os.WriteFile(filePath, []byte(content.String()), 0644) +} + +func (c *Crawler) generateLLMSFullTxt() error { + // Sort pages by URL for consistent output + sortedPages := make([]PageInfo, len(c.pages)) + copy(sortedPages, c.pages) + sort.Slice(sortedPages, func(i, j int) bool { + return sortedPages[i].URL < sortedPages[j].URL + }) + + var content strings.Builder + + // H1 title + siteTitle := c.getSiteTitle() + content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle)) + + // Summary + summary := c.generateSiteSummary() + if summary != "" { + content.WriteString(fmt.Sprintf("> %s\n\n", summary)) + 
} + + content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n", + c.config.URL, time.Now().Format("January 2, 2006"))) + + content.WriteString("---\n\n") + + // Include full content of each page + for i, page := range sortedPages { + content.WriteString(fmt.Sprintf("## %s\n\n", page.Title)) + content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL)) + + if page.Description != "" { + content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description)) + } + + content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339))) + + // Clean and include content + cleanContent := c.cleanContentForLLMS(page.Content) + content.WriteString(cleanContent) + + // Add separator between pages (except for the last one) + if i < len(sortedPages)-1 { + content.WriteString("\n\n---\n\n") + } + } + + // Write to file + filePath := filepath.Join(c.config.OutputDir, "llms-full.txt") + return os.WriteFile(filePath, []byte(content.String()), 0644) +} + +func (c *Crawler) getSiteTitle() string { + // Try to get site title from the main page + for _, page := range c.pages { + if page.URL == c.config.URL || page.URL == c.config.URL+"/" { + if page.Title != "" && page.Title != "Untitled" { + return page.Title + } + } + } + + // Fallback to domain name + return c.baseURL.Host +} + +func (c *Crawler) generateSiteSummary() string { + // Try to get description from the main page + for _, page := range c.pages { + if page.URL == c.config.URL || page.URL == c.config.URL+"/" { + if page.Description != "" { + return page.Description + } + // Extract first meaningful paragraph + return c.extractFirstSentence(page.Content) + } + } + + return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host) +} + +func (c *Crawler) isMainDocPage(page PageInfo) bool { + // Consider a page "main documentation" if it's not in typical secondary sections + lowerURL := strings.ToLower(page.URL) + + // Skip pages that are typically secondary + secondaryIndicators := []string{ + "/blog", "/news", "/archive", "/changelog", "/release", + "/about", "/contact", "/legal", "/privacy", "/terms", + "/community", "/forum", "/discuss", + } + + for _, indicator := range secondaryIndicators { + // Check for the indicator followed by either / or end of URL + if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) { + return false + } + } + + return true +} + +func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo { + var secondary []PageInfo + for _, page := range allPages { + if !c.isMainDocPage(page) { + secondary = append(secondary, page) + } + } + return secondary +} + +func (c *Crawler) extractFirstSentence(content string) string { + // Clean the content and extract the first meaningful sentence + lines := strings.Split(content, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + // Skip empty lines, headers, and markdown syntax + if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") { + // Find the first sentence + sentences := strings.Split(line, ".") + if len(sentences) > 0 && len(sentences[0]) > 20 { + return strings.TrimSpace(sentences[0]) + "." 
+ } + } + } + return "" +} + +func (c *Crawler) cleanContentForLLMS(content string) string { + // Clean the content for better readability in LLMs context + var cleaned strings.Builder + scanner := bufio.NewScanner(strings.NewReader(content)) + + var inCodeBlock bool + for scanner.Scan() { + line := scanner.Text() + + // Handle code blocks + if strings.HasPrefix(strings.TrimSpace(line), "```") { + inCodeBlock = !inCodeBlock + } + + // Skip empty lines unless in code block + if strings.TrimSpace(line) == "" && !inCodeBlock { + continue + } + + cleaned.WriteString(line) + cleaned.WriteString("\n") + } + + return strings.TrimSpace(cleaned.String()) +} diff --git a/main_test.go b/main_test.go index 6716493..8da2925 100644 --- a/main_test.go +++ b/main_test.go @@ -5,38 +5,55 @@ import ( "testing" ) -func TestShouldSkipURL(t *testing.T) { - config := &Config{ - URL: "https://example.com", - OutputDir: "./test-output", - Workers: 1, - } - - crawler, err := NewCrawler(config) - if err != nil { - t.Fatalf("Failed to create crawler: %v", err) - } - +func TestValidateConfig(t *testing.T) { tests := []struct { - name string - url string - expected bool + name string + config *Config + wantErr bool }{ - {"Normal URL", "https://example.com/page", false}, - {"Language URL - en", "https://example.com/en/page", true}, - {"Language URL - zh", "https://example.com/zh/page", true}, - {"Language URL - zh-hant", "https://example.com/zh-hant/page", true}, - {"PDF file", "https://example.com/document.pdf", true}, - {"ZIP file", "https://example.com/archive.zip", true}, - {"Fragment URL", "https://example.com/page#section", true}, - {"Image file", "https://example.com/image.jpg", true}, + { + name: "Valid config", + config: &Config{ + URL: "https://example.com", + OutputDir: "./output", + Workers: 1, + }, + wantErr: false, + }, + { + name: "Empty URL", + config: &Config{ + URL: "", + OutputDir: "./output", + Workers: 1, + }, + wantErr: true, + }, + { + name: "Invalid URL", + config: &Config{ + URL: "not-a-url", + OutputDir: "./output", + Workers: 1, + }, + wantErr: true, + }, + { + name: "Zero workers", + config: &Config{ + URL: "https://example.com", + OutputDir: "./output", + Workers: 0, + }, + wantErr: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - result := crawler.shouldSkipURL(tt.url) - if result != tt.expected { - t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected) + err := validateConfig(tt.config) + if (err != nil) != tt.wantErr { + t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr) } }) } @@ -97,55 +114,136 @@ func TestCreateFilename(t *testing.T) { } } -func TestValidateConfig(t *testing.T) { +func TestShouldSkipURL(t *testing.T) { + config := &Config{ + URL: "https://example.com", + OutputDir: "./test-output", + Workers: 1, + } + + crawler, err := NewCrawler(config) + if err != nil { + t.Fatalf("Failed to create crawler: %v", err) + } + tests := []struct { - name string - config *Config - wantErr bool + name string + url string + expected bool + }{ + {"Normal URL", "https://example.com/page", false}, + {"Language URL - en", "https://example.com/en/page", true}, + {"Language URL - zh", "https://example.com/zh/page", true}, + {"Language URL - zh-hant", "https://example.com/zh-hant/page", true}, + {"PDF file", "https://example.com/document.pdf", true}, + {"ZIP file", "https://example.com/archive.zip", true}, + {"Fragment URL", "https://example.com/page#section", true}, + {"Image file", "https://example.com/image.jpg", true}, + } + + for 
_, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := crawler.shouldSkipURL(tt.url) + if result != tt.expected { + t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected) + } + }) + } +} + +func TestExtractFirstSentence(t *testing.T) { + config := &Config{ + URL: "https://example.com", + OutputDir: "./test-output", + Workers: 1, + } + + crawler, err := NewCrawler(config) + if err != nil { + t.Fatalf("Failed to create crawler: %v", err) + } + + tests := []struct { + name string + content string + expected string }{ { - name: "Valid config", - config: &Config{ - URL: "https://example.com", - OutputDir: "./output", - Workers: 5, - }, - wantErr: false, + name: "Simple sentence", + content: "This is a simple sentence about something interesting. This is another sentence.", + expected: "This is a simple sentence about something interesting.", }, { - name: "Empty URL", - config: &Config{ - URL: "", - OutputDir: "./output", - Workers: 5, - }, - wantErr: true, + name: "With headers", + content: "# Header\n\nThis is the main content that should be extracted as the first sentence.", + expected: "This is the main content that should be extracted as the first sentence.", }, { - name: "Invalid URL", - config: &Config{ - URL: "not-a-url", - OutputDir: "./output", - Workers: 5, - }, - wantErr: true, + name: "Short content", + content: "Short text", + expected: "", }, { - name: "Zero workers", - config: &Config{ - URL: "https://example.com", - OutputDir: "./output", - Workers: 0, - }, - wantErr: true, + name: "Empty content", + content: "", + expected: "", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - err := validateConfig(tt.config) - if (err != nil) != tt.wantErr { - t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr) + result := crawler.extractFirstSentence(tt.content) + if result != tt.expected { + t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected) + } + }) + } +} + +func TestIsMainDocPage(t *testing.T) { + config := &Config{ + URL: "https://example.com", + OutputDir: "./test-output", + Workers: 1, + } + + crawler, err := NewCrawler(config) + if err != nil { + t.Fatalf("Failed to create crawler: %v", err) + } + + tests := []struct { + name string + page PageInfo + expected bool + }{ + { + name: "Main documentation page", + page: PageInfo{URL: "https://example.com/docs/getting-started"}, + expected: true, + }, + { + name: "Blog page", + page: PageInfo{URL: "https://example.com/blog/latest-news"}, + expected: false, + }, + { + name: "About page", + page: PageInfo{URL: "https://example.com/about"}, + expected: false, + }, + { + name: "API documentation", + page: PageInfo{URL: "https://example.com/api/reference"}, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := crawler.isMainDocPage(tt.page) + if result != tt.expected { + t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected) } }) }
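
For reference, a minimal llms.txt in the shape generateLLMSTxt assembles (the site title, URLs, and descriptions below are hypothetical, not output from a real crawl):

# Example Docs

> Documentation and content from example.com

This documentation was automatically crawled from https://example.com on July 12, 2025.

## Documentation

- [Getting Started](https://example.com/docs/getting-started): Install the crawler and run your first crawl.
- [API Reference](https://example.com/api/reference): Reference for the public HTTP API.

## Optional

- [Blog](https://example.com/blog)
- [About](https://example.com/about)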