mirror of https://github.com/Sosokker/site-to-llmstxt.git
synced 2025-12-18 13:34:06 +01:00
feat: add llms.txt gen, with improved CLI and output generation
This commit is contained in:
parent f702db6ede
commit 4aa2c4be52
Makefile (18 changed lines)
@@ -14,7 +14,7 @@ help:
	@echo "Examples:"
	@echo " make build"
	@echo " make run URL=https://example.com"
	@echo " make run URL=https://httpbin.org WORKERS=3 OUTPUT=./test-output"
	@echo " make run URL=https://httpbin.org WORKERS=2 OUTPUT=./test-output"

# Build the crawler
build:
@@ -40,18 +40,24 @@ install:
	@echo "Installing dependencies..."
	go mod tidy

# Run with parameters
# Run with parameters (updated for new CLI)
run: build
	@if [ -z "$(URL)" ]; then \
		echo "Error: URL is required. Usage: make run URL=https://example.com"; \
		exit 1; \
	fi
	@echo "Running crawler with URL: $(URL)"
	./crawler -url $(URL) \
		$(if $(WORKERS),-workers $(WORKERS)) \
		$(if $(OUTPUT),-output $(OUTPUT)) \
		$(if $(VERBOSE),-verbose)
	./crawler \
		--url $(URL) \
		$(if $(WORKERS),--workers $(WORKERS)) \
		$(if $(OUTPUT),--output $(OUTPUT)) \
		$(if $(VERBOSE),--verbose)

# Build and test everything
all: clean install build test
	@echo "All tasks completed successfully!"

# Quick test with a small site
demo: build
	@echo "Running demo crawl of httpbin.org..."
	./crawler --url https://httpbin.org --output ./demo-output --workers 1 --verbose

go.mod (6 changed lines)
@@ -1,4 +1,4 @@
module site-to-llmstxt
module github.com/Sosokker/site-to-llmstxt

go 1.24.5

@@ -6,6 +6,7 @@ require (
	github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3
	github.com/gocolly/colly/v2 v2.2.0
	github.com/schollz/progressbar/v3 v3.18.0
	github.com/urfave/cli/v2 v2.27.7
)

require (
@@ -16,6 +17,7 @@ require (
	github.com/antchfx/xmlquery v1.4.4 // indirect
	github.com/antchfx/xpath v1.3.4 // indirect
	github.com/bits-and-blooms/bitset v1.22.0 // indirect
	github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
	github.com/gobwas/glob v0.2.3 // indirect
	github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
	github.com/golang/protobuf v1.5.4 // indirect
@@ -23,8 +25,10 @@ require (
	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
	github.com/nlnwa/whatwg-url v0.6.2 // indirect
	github.com/rivo/uniseg v0.4.7 // indirect
	github.com/russross/blackfriday/v2 v2.1.0 // indirect
	github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
	github.com/temoto/robotstxt v1.1.2 // indirect
	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
	golang.org/x/net v0.42.0 // indirect
	golang.org/x/sys v0.34.0 // indirect
	golang.org/x/term v0.33.0 // indirect

go.sum (8 changed lines)
@@ -18,6 +18,8 @@ github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCk
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -47,6 +49,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
@@ -61,6 +65,10 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.11 h1:ZCxLyDMtz0nT2HFfsYG8WZ47Trip2+JyLysKcMYE5bo=
github.com/yuin/goldmark v1.7.11/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=

main.go (487 changed lines)
@@ -1,14 +1,15 @@
package main

import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
@@ -19,6 +20,16 @@ import (
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
	"github.com/schollz/progressbar/v3"
	"github.com/urfave/cli/v2"
)

const (
	// DefaultWorkers is the default number of concurrent workers
	DefaultWorkers = 1
	// DefaultOutputDir is the default output directory
	DefaultOutputDir = "./output"
	// MarkdownSubdir is the subdirectory for markdown files
	MarkdownSubdir = "pages"
)

// Config holds crawler configuration
@@ -29,18 +40,30 @@ type Config struct {
	Verbose bool
}

// PageInfo represents information about a crawled page
type PageInfo struct {
	URL         string
	Title       string
	Content     string
	FilePath    string
	CrawledAt   time.Time
	Description string
}

// Crawler manages the web crawling process
type Crawler struct {
	config    *Config
	collector *colly.Collector
	converter *converter.Converter
	visited   map[string]bool
	queue     chan string
	wg        sync.WaitGroup
	mu        sync.RWMutex
	baseURL   *url.URL
	bar       *progressbar.ProgressBar
	processed int
	config     *Config
	collector  *colly.Collector
	converter  *converter.Converter
	visited    map[string]bool
	queue      chan string
	wg         sync.WaitGroup
	mu         sync.RWMutex
	baseURL    *url.URL
	bar        *progressbar.ProgressBar
	processed  int
	pages      []PageInfo
	pagesMutex sync.Mutex
}

// LanguageFilter contains patterns to exclude language-specific URLs
@@ -67,35 +90,92 @@ var FileExtensionFilter = []string{
}

func main() {
	config := parseFlags()
	app := &cli.App{
		Name:  "site-to-llmstxt",
		Usage: "Web crawler that converts websites to LLMs.txt format",
		Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format.

The crawler generates:
- llms.txt: A curated overview following the LLMs.txt specification
- llms-full.txt: Complete content of all crawled pages
- pages/: Directory containing individual markdown files

The crawler respects robots.txt, filters out language variants and file downloads,
and only crawls within the same domain.`,
		Version: "1.0.0",
		Authors: []*cli.Author{
			{
				Name: "Site-to-LLMsTxt",
			},
		},
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "url",
				Aliases:  []string{"u"},
				Usage:    "Root URL to crawl (required)",
				Required: true,
			},
			&cli.StringFlag{
				Name:    "output",
				Aliases: []string{"o"},
				Usage:   "Output directory",
				Value:   DefaultOutputDir,
			},
			&cli.IntFlag{
				Name:    "workers",
				Aliases: []string{"w"},
				Usage:   "Number of concurrent workers",
				Value:   DefaultWorkers,
			},
			&cli.BoolFlag{
				Name:  "verbose",
				Usage: "Enable verbose logging",
			},
		},
		Action: func(c *cli.Context) error {
			config := &Config{
				URL:       c.String("url"),
				OutputDir: c.String("output"),
				Workers:   c.Int("workers"),
				Verbose:   c.Bool("verbose"),
			}

			return runCrawler(config)
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}

func runCrawler(config *Config) error {
	if err := validateConfig(config); err != nil {
		log.Fatalf("Invalid configuration: %v", err)
		return fmt.Errorf("invalid configuration: %w", err)
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		log.Fatalf("Failed to create crawler: %v", err)
		return fmt.Errorf("failed to create crawler: %w", err)
	}

	ctx := context.Background()
	if err := crawler.Start(ctx); err != nil {
		log.Fatalf("Crawling failed: %v", err)
		return fmt.Errorf("crawling failed: %w", err)
	}

	fmt.Printf("\nCrawling completed successfully! Files saved to: %s\n", config.OutputDir)
}
	if err := crawler.GenerateLLMSFiles(); err != nil {
		return fmt.Errorf("failed to generate LLMS files: %w", err)
	}

func parseFlags() *Config {
	config := &Config{}
	fmt.Printf("\nCrawling completed successfully!\n")
	fmt.Printf("Generated files:\n")
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt"))
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt"))
	fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir))
	fmt.Printf("Total pages crawled: %d\n", len(crawler.pages))

	flag.StringVar(&config.URL, "url", "", "Root URL to crawl (required)")
	flag.StringVar(&config.OutputDir, "output", "./output", "Output directory for markdown files")
	flag.IntVar(&config.Workers, "workers", 5, "Number of concurrent workers")
	flag.BoolVar(&config.Verbose, "verbose", false, "Enable verbose logging")
	flag.Parse()

	return config
	return nil
}

func validateConfig(config *Config) error {
@@ -103,14 +183,13 @@ func validateConfig(config *Config) error {
		return fmt.Errorf("URL is required")
	}

	parsedURL, err := url.Parse(config.URL)
	u, err := url.Parse(config.URL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	// Check if URL has a valid scheme and host
	if parsedURL.Scheme == "" || parsedURL.Host == "" {
		return fmt.Errorf("URL must include scheme (http/https) and host")
	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("URL must have http or https scheme")
	}

	if config.Workers <= 0 {
@@ -127,9 +206,9 @@ func NewCrawler(config *Config) (*Crawler, error) {
		return nil, fmt.Errorf("failed to parse base URL: %w", err)
	}

	// Create output directory
	if err := os.MkdirAll(config.OutputDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create output directory: %w", err)
	// Create output directory structure
	if err := createOutputDirs(config.OutputDir); err != nil {
		return nil, fmt.Errorf("failed to create output directories: %w", err)
	}

	// Setup colly collector
@@ -145,7 +224,7 @@ func NewCrawler(config *Config) (*Crawler, error) {
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: config.Workers,
		Delay:       100 * time.Millisecond,
		Delay:       200 * time.Millisecond, // Slightly more conservative
	})

	// Setup HTML to Markdown converter
@@ -164,6 +243,7 @@ func NewCrawler(config *Config) (*Crawler, error) {
		queue:   make(chan string, 1000),
		baseURL: baseURL,
		bar:     progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")),
		pages:   make([]PageInfo, 0),
	}

	crawler.setupCallbacks()
@@ -171,6 +251,21 @@ func NewCrawler(config *Config) (*Crawler, error) {
	return crawler, nil
}

func createOutputDirs(outputDir string) error {
	dirs := []string{
		outputDir,
		filepath.Join(outputDir, MarkdownSubdir),
	}

	for _, dir := range dirs {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir, err)
		}
	}

	return nil
}
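
Note: taken together with the llms.txt and llms-full.txt writers further down, createOutputDirs implies an output tree roughly like the sketch below. The page filename is a hypothetical example of what createFilename would produce (URL path slashes replaced with dashes, .md appended); actual names depend on the crawled site.

output/
├── llms.txt
├── llms-full.txt
└── pages/
    └── docs-getting-started.md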

func (c *Crawler) setupCallbacks() {
	// Handle HTML content
	c.collector.OnHTML("html", func(e *colly.HTMLElement) {
@@ -199,9 +294,16 @@ func (c *Crawler) setupCallbacks() {

func (c *Crawler) processPage(e *colly.HTMLElement) {
	// Get page title
	title := e.ChildText("title")
	title := strings.TrimSpace(e.ChildText("title"))
	if title == "" {
		title = "untitled"
		title = "Untitled"
	}

	// Get meta description
	description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content"))
	if description == "" {
		// Try og:description
		description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content"))
	}

	// Convert HTML to Markdown
@@ -217,31 +319,62 @@ func (c *Crawler) processPage(e *colly.HTMLElement) {
		return
	}

	// Save to file
	if err := c.saveMarkdown(e.Request.URL, title, markdown); err != nil {
	// Create page info
	pageInfo := PageInfo{
		URL:         e.Request.URL.String(),
		Title:       title,
		Content:     markdown,
		CrawledAt:   time.Now(),
		Description: description,
	}

	// Save individual markdown file
	filename := c.createFilename(e.Request.URL, title)
	pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename)
	fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath)

	if err := c.saveMarkdown(fullPath, pageInfo); err != nil {
		log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Add to pages collection
	c.pagesMutex.Lock()
	c.pages = append(c.pages, pageInfo)
	c.pagesMutex.Unlock()

	c.mu.Lock()
	c.processed++
	c.mu.Unlock()
}

func (c *Crawler) saveMarkdown(pageURL *url.URL, title, markdown string) error {
	// Create filename from URL path
	filename := c.createFilename(pageURL, title)
	filePath := filepath.Join(c.config.OutputDir, filename)

func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error {
	// Ensure directory exists
	dir := filepath.Dir(filePath)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	// Add metadata header
	content := fmt.Sprintf("# %s\n\nURL: %s\nCrawled: %s\n\n---\n\n%s",
		title, pageURL.String(), time.Now().Format(time.RFC3339), markdown)
	// Create content with metadata
	content := fmt.Sprintf(`# %s

URL: %s
Crawled: %s
%s

---

%s`,
		pageInfo.Title,
		pageInfo.URL,
		pageInfo.CrawledAt.Format(time.RFC3339),
		func() string {
			if pageInfo.Description != "" {
				return fmt.Sprintf("Description: %s", pageInfo.Description)
			}
			return ""
		}(),
		pageInfo.Content)

	// Write file
	if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
@@ -267,6 +400,11 @@ func (c *Crawler) createFilename(pageURL *url.URL, title string) string {
		filename = strings.ReplaceAll(urlPath, "/", "-")
	}

	// Limit filename length
	if len(filename) > 100 {
		filename = filename[:100]
	}

	// Ensure .md extension
	if !strings.HasSuffix(filename, ".md") {
		filename += ".md"
@@ -329,7 +467,7 @@ func (c *Crawler) shouldSkipURL(urlStr string) bool {
		}
	}

	// Skip fragments and query parameters that might be irrelevant
	// Skip fragments
	if strings.Contains(urlStr, "#") {
		return true
	}
@@ -342,6 +480,10 @@ func (c *Crawler) Start(ctx context.Context) error {
	fmt.Printf("Output directory: %s\n", c.config.OutputDir)
	fmt.Printf("Workers: %d\n", c.config.Workers)

	// Create a cancellable context for workers
	workerCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Add seed URL to queue
	c.queue <- c.config.URL
	c.visited[c.config.URL] = true
@@ -349,13 +491,25 @@ func (c *Crawler) Start(ctx context.Context) error {
	// Start workers
	for i := 0; i < c.config.Workers; i++ {
		c.wg.Add(1)
		go c.worker(ctx)
		go c.worker(workerCtx)
	}

	// Monitor progress
	go c.monitor(ctx)
	// Monitor progress and handle completion
	done := make(chan struct{})
	go func() {
		c.monitor(workerCtx)
		close(done)
	}()

	// Wait for completion
	// Wait for either completion or cancellation
	select {
	case <-done:
		cancel() // Stop workers
	case <-ctx.Done():
		// External cancellation
	}

	// Wait for workers to finish
	c.wg.Wait()
	close(c.queue)
	c.bar.Finish()
@@ -386,7 +540,7 @@ func (c *Crawler) worker(ctx context.Context) {
}

func (c *Crawler) monitor(ctx context.Context) {
	ticker := time.NewTicker(5 * time.Second)
	ticker := time.NewTicker(2 * time.Second) // Check more frequently
	defer ticker.Stop()

	lastProcessed := 0
@@ -404,8 +558,12 @@ func (c *Crawler) monitor(ctx context.Context) {

		if current == lastProcessed {
			noProgressCount++
			if noProgressCount >= 6 && queueLen == 0 { // 30 seconds with no progress and empty queue
				fmt.Println("\nNo progress detected, stopping crawler...")
			// More aggressive completion detection
			if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds with no progress and empty queue
				(noProgressCount >= 15) { // Or 30 seconds regardless
				if c.config.Verbose {
					fmt.Println("\nNo progress detected, stopping crawler...")
				}
				return
			}
		} else {
@@ -419,3 +577,224 @@ func (c *Crawler) monitor(ctx context.Context) {
		}
	}
}

// GenerateLLMSFiles creates both llms.txt and llms-full.txt files
func (c *Crawler) GenerateLLMSFiles() error {
	if err := c.generateLLMSTxt(); err != nil {
		return fmt.Errorf("failed to generate llms.txt: %w", err)
	}

	if err := c.generateLLMSFullTxt(); err != nil {
		return fmt.Errorf("failed to generate llms-full.txt: %w", err)
	}

	return nil
}

func (c *Crawler) generateLLMSTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title (required)
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle))

	// Blockquote summary (optional but recommended)
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	// Additional details
	content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	// Main documentation section
	content.WriteString("## Documentation\n\n")
	for _, page := range sortedPages {
		if c.isMainDocPage(page) {
			description := page.Description
			if description == "" {
				description = c.extractFirstSentence(page.Content)
			}
			if description != "" {
				content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description))
			} else {
				content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
			}
		}
	}

	// Optional section for secondary pages
	secondaryPages := c.getSecondaryPages(sortedPages)
	if len(secondaryPages) > 0 {
		content.WriteString("\n## Optional\n\n")
		for _, page := range secondaryPages {
			content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}
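
For orientation, generateLLMSTxt writes a file shaped like the hypothetical excerpt below; the site title, summary, URLs, descriptions, and date are invented for illustration and would come from the crawled pages in practice.

# Example Docs

> Documentation and content from example.com

This documentation was automatically crawled from https://example.com on July 1, 2025.

## Documentation

- [Getting Started](https://example.com/docs/getting-started): A quick introduction to the project.

## Optional

- [About](https://example.com/about)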

func (c *Crawler) generateLLMSFullTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle))

	// Summary
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	content.WriteString("---\n\n")

	// Include full content of each page
	for i, page := range sortedPages {
		content.WriteString(fmt.Sprintf("## %s\n\n", page.Title))
		content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL))

		if page.Description != "" {
			content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description))
		}

		content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339)))

		// Clean and include content
		cleanContent := c.cleanContentForLLMS(page.Content)
		content.WriteString(cleanContent)

		// Add separator between pages (except for the last one)
		if i < len(sortedPages)-1 {
			content.WriteString("\n\n---\n\n")
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms-full.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}
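
Each page entry that generateLLMSFullTxt appends to llms-full.txt therefore follows the layout sketched below (all values hypothetical); a --- rule separates entries except after the last page.

## Getting Started

**URL:** https://example.com/docs/getting-started

**Description:** A quick introduction to the project.

**Crawled:** 2025-07-01T12:00:00Z

...cleaned page content...

---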

func (c *Crawler) getSiteTitle() string {
	// Try to get site title from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Title != "" && page.Title != "Untitled" {
				return page.Title
			}
		}
	}

	// Fallback to domain name
	return c.baseURL.Host
}

func (c *Crawler) generateSiteSummary() string {
	// Try to get description from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Description != "" {
				return page.Description
			}
			// Extract first meaningful paragraph
			return c.extractFirstSentence(page.Content)
		}
	}

	return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host)
}

func (c *Crawler) isMainDocPage(page PageInfo) bool {
	// Consider a page "main documentation" if it's not in typical secondary sections
	lowerURL := strings.ToLower(page.URL)

	// Skip pages that are typically secondary
	secondaryIndicators := []string{
		"/blog", "/news", "/archive", "/changelog", "/release",
		"/about", "/contact", "/legal", "/privacy", "/terms",
		"/community", "/forum", "/discuss",
	}

	for _, indicator := range secondaryIndicators {
		// Check for the indicator followed by either / or end of URL
		if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) {
			return false
		}
	}

	return true
}

func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo {
	var secondary []PageInfo
	for _, page := range allPages {
		if !c.isMainDocPage(page) {
			secondary = append(secondary, page)
		}
	}
	return secondary
}

func (c *Crawler) extractFirstSentence(content string) string {
	// Clean the content and extract the first meaningful sentence
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip empty lines, headers, and markdown syntax
		if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") {
			// Find the first sentence
			sentences := strings.Split(line, ".")
			if len(sentences) > 0 && len(sentences[0]) > 20 {
				return strings.TrimSpace(sentences[0]) + "."
			}
		}
	}
	return ""
}

func (c *Crawler) cleanContentForLLMS(content string) string {
	// Clean the content for better readability in LLMs context
	var cleaned strings.Builder
	scanner := bufio.NewScanner(strings.NewReader(content))

	var inCodeBlock bool
	for scanner.Scan() {
		line := scanner.Text()

		// Handle code blocks
		if strings.HasPrefix(strings.TrimSpace(line), "```") {
			inCodeBlock = !inCodeBlock
		}

		// Skip empty lines unless in code block
		if strings.TrimSpace(line) == "" && !inCodeBlock {
			continue
		}

		cleaned.WriteString(line)
		cleaned.WriteString("\n")
	}

	return strings.TrimSpace(cleaned.String())
}
main_test.go (220 changed lines)
@@ -5,38 +5,55 @@ import (
	"testing"
)

func TestShouldSkipURL(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

func TestValidateConfig(t *testing.T) {
	tests := []struct {
		name     string
		url      string
		expected bool
		name    string
		config  *Config
		wantErr bool
	}{
		{"Normal URL", "https://example.com/page", false},
		{"Language URL - en", "https://example.com/en/page", true},
		{"Language URL - zh", "https://example.com/zh/page", true},
		{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
		{"PDF file", "https://example.com/document.pdf", true},
		{"ZIP file", "https://example.com/archive.zip", true},
		{"Fragment URL", "https://example.com/page#section", true},
		{"Image file", "https://example.com/image.jpg", true},
		{
			name: "Valid config",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: false,
		},
		{
			name: "Empty URL",
			config: &Config{
				URL:       "",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: true,
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL:       "not-a-url",
				OutputDir: "./output",
				Workers:   1,
			},
			wantErr: true,
		},
		{
			name: "Zero workers",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   0,
			},
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.shouldSkipURL(tt.url)
			if result != tt.expected {
				t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
			err := validateConfig(tt.config)
			if (err != nil) != tt.wantErr {
				t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
			}
		})
	}
@@ -97,55 +114,136 @@ func TestCreateFilename(t *testing.T) {
	}
}

func TestValidateConfig(t *testing.T) {
func TestShouldSkipURL(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name    string
		config  *Config
		wantErr bool
		name     string
		url      string
		expected bool
	}{
		{"Normal URL", "https://example.com/page", false},
		{"Language URL - en", "https://example.com/en/page", true},
		{"Language URL - zh", "https://example.com/zh/page", true},
		{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
		{"PDF file", "https://example.com/document.pdf", true},
		{"ZIP file", "https://example.com/archive.zip", true},
		{"Fragment URL", "https://example.com/page#section", true},
		{"Image file", "https://example.com/image.jpg", true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.shouldSkipURL(tt.url)
			if result != tt.expected {
				t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}

func TestExtractFirstSentence(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name     string
		content  string
		expected string
	}{
		{
			name: "Valid config",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: false,
			name:     "Simple sentence",
			content:  "This is a simple sentence about something interesting. This is another sentence.",
			expected: "This is a simple sentence about something interesting.",
		},
		{
			name: "Empty URL",
			config: &Config{
				URL:       "",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: true,
			name:     "With headers",
			content:  "# Header\n\nThis is the main content that should be extracted as the first sentence.",
			expected: "This is the main content that should be extracted as the first sentence.",
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL:       "not-a-url",
				OutputDir: "./output",
				Workers:   5,
			},
			wantErr: true,
			name:     "Short content",
			content:  "Short text",
			expected: "",
		},
		{
			name: "Zero workers",
			config: &Config{
				URL:       "https://example.com",
				OutputDir: "./output",
				Workers:   0,
			},
			wantErr: true,
			name:     "Empty content",
			content:  "",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validateConfig(tt.config)
			if (err != nil) != tt.wantErr {
				t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
			result := crawler.extractFirstSentence(tt.content)
			if result != tt.expected {
				t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected)
			}
		})
	}
}

func TestIsMainDocPage(t *testing.T) {
	config := &Config{
		URL:       "https://example.com",
		OutputDir: "./test-output",
		Workers:   1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name     string
		page     PageInfo
		expected bool
	}{
		{
			name:     "Main documentation page",
			page:     PageInfo{URL: "https://example.com/docs/getting-started"},
			expected: true,
		},
		{
			name:     "Blog page",
			page:     PageInfo{URL: "https://example.com/blog/latest-news"},
			expected: false,
		},
		{
			name:     "About page",
			page:     PageInfo{URL: "https://example.com/about"},
			expected: false,
		},
		{
			name:     "API documentation",
			page:     PageInfo{URL: "https://example.com/api/reference"},
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.isMainDocPage(tt.page)
			if result != tt.expected {
				t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected)
			}
		})
	}
}