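// Command site-to-llmstxt is a web crawler that scrapes a website and
// converts it to the LLMs.txt format: a curated llms.txt overview, an
// llms-full.txt file with the complete content of every crawled page, and a
// pages/ directory of individual markdown files.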
package main

import (
	"bufio"
	"context"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
	"github.com/schollz/progressbar/v3"
	"github.com/urfave/cli/v2"
)

const (
	// DefaultWorkers is the default number of concurrent workers
	DefaultWorkers = 1
	// DefaultOutputDir is the default output directory
	DefaultOutputDir = "./output"
	// MarkdownSubdir is the subdirectory for markdown files
	MarkdownSubdir = "pages"
)

// Config holds crawler configuration
type Config struct {
	URL       string
	OutputDir string
	Workers   int
	Verbose   bool
}

// PageInfo represents information about a crawled page
type PageInfo struct {
	URL         string
	Title       string
	Content     string
	FilePath    string
	CrawledAt   time.Time
	Description string
}

// Crawler manages the web crawling process
type Crawler struct {
	config     *Config
	collector  *colly.Collector
	converter  *converter.Converter
	visited    map[string]bool
	queue      chan string
	wg         sync.WaitGroup
	mu         sync.RWMutex
	baseURL    *url.URL
	bar        *progressbar.ProgressBar
	processed  int
	pages      []PageInfo
	pagesMutex sync.Mutex
}

// LanguageFilter contains patterns to exclude language-specific URLs
var LanguageFilter = []string{
	`/en/`, `/en$`,
	`/zh/`, `/zh$`, `/zh-cn/`, `/zh-cn$`, `/zh-tw/`, `/zh-tw$`, `/zh-hant/`, `/zh-hant$`,
	`/ja/`, `/ja$`,
	`/ko/`, `/ko$`,
	`/fr/`, `/fr$`,
	`/de/`, `/de$`,
	`/es/`, `/es$`,
	`/it/`, `/it$`,
	`/pt/`, `/pt$`,
	`/ru/`, `/ru$`,
}

// FileExtensionFilter contains patterns to exclude file downloads
var FileExtensionFilter = []string{
	`\.pdf$`, `\.doc$`, `\.docx$`, `\.xls$`, `\.xlsx$`, `\.ppt$`, `\.pptx$`,
	`\.zip$`, `\.rar$`, `\.tar$`, `\.gz$`, `\.7z$`,
	`\.mp3$`, `\.mp4$`, `\.avi$`, `\.mov$`, `\.wmv$`,
	`\.jpg$`, `\.jpeg$`, `\.png$`, `\.gif$`, `\.bmp$`, `\.svg$`,
	`\.exe$`, `\.msi$`, `\.dmg$`, `\.deb$`, `\.rpm$`,
}

func main() {
	app := &cli.App{
		Name:  "site-to-llmstxt",
		Usage: "Web crawler that converts websites to LLMs.txt format",
		Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format.

The crawler generates:
- llms.txt: A curated overview following the LLMs.txt specification
- llms-full.txt: Complete content of all crawled pages
- pages/: Directory containing individual markdown files

The crawler respects robots.txt, filters out language variants and file downloads,
and only crawls within the same domain.`,
		Version: "1.0.0",
		Authors: []*cli.Author{
			{
				Name: "Site-to-LLMsTxt",
			},
		},
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "url",
				Aliases:  []string{"u"},
				Usage:    "Root URL to crawl (required)",
				Required: true,
			},
			&cli.StringFlag{
				Name:    "output",
				Aliases: []string{"o"},
				Usage:   "Output directory",
				Value:   DefaultOutputDir,
			},
			&cli.IntFlag{
				Name:    "workers",
				Aliases: []string{"w"},
				Usage:   "Number of concurrent workers",
				Value:   DefaultWorkers,
			},
			&cli.BoolFlag{
				Name:  "verbose",
				Usage: "Enable verbose logging",
			},
		},
		Action: func(c *cli.Context) error {
			config := &Config{
				URL:       c.String("url"),
				OutputDir: c.String("output"),
				Workers:   c.Int("workers"),
				Verbose:   c.Bool("verbose"),
			}

			return runCrawler(config)
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}

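// Illustrative invocation (the URL below is a placeholder):
//
//	site-to-llmstxt --url https://example.com/docs --output ./output --workers 4
//
// On success, runCrawler reports the generated layout:
//
//	./output/llms.txt        curated overview (LLMs.txt specification)
//	./output/llms-full.txt   complete content of all crawled pages
//	./output/pages/          one markdown file per crawled page
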
func runCrawler(config *Config) error {
	if err := validateConfig(config); err != nil {
		return fmt.Errorf("invalid configuration: %w", err)
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		return fmt.Errorf("failed to create crawler: %w", err)
	}

	ctx := context.Background()
	if err := crawler.Start(ctx); err != nil {
		return fmt.Errorf("crawling failed: %w", err)
	}

	if err := crawler.GenerateLLMSFiles(); err != nil {
		return fmt.Errorf("failed to generate LLMS files: %w", err)
	}

	fmt.Printf("\nCrawling completed successfully!\n")
	fmt.Printf("Generated files:\n")
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt"))
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt"))
	fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir))
	fmt.Printf("Total pages crawled: %d\n", len(crawler.pages))

	return nil
}

func validateConfig(config *Config) error {
	if config.URL == "" {
		return fmt.Errorf("URL is required")
	}

	u, err := url.Parse(config.URL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("URL must have http or https scheme")
	}

	if config.Workers <= 0 {
		return fmt.Errorf("workers must be greater than 0")
	}

	return nil
}

// NewCrawler creates a new crawler instance
func NewCrawler(config *Config) (*Crawler, error) {
	baseURL, err := url.Parse(config.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse base URL: %w", err)
	}

	// Create output directory structure
	if err := createOutputDirs(config.OutputDir); err != nil {
		return nil, fmt.Errorf("failed to create output directories: %w", err)
	}

	// Setup colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(baseURL.Host),
	)

	if config.Verbose {
		c.SetDebugger(&debug.LogDebugger{})
	}

	// Rate limiting
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: config.Workers,
		Delay:       200 * time.Millisecond, // Slightly more conservative
	})

	// Setup HTML to Markdown converter
	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(),
		),
	)

	crawler := &Crawler{
		config:    config,
		collector: c,
		converter: conv,
		visited:   make(map[string]bool),
		queue:     make(chan string, 1000),
		baseURL:   baseURL,
		bar:       progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")),
		pages:     make([]PageInfo, 0),
	}

	crawler.setupCallbacks()

	return crawler, nil
}

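// A minimal programmatic sketch of the same pipeline the CLI drives
// (the URL is a placeholder; this is not part of the CLI code path):
//
//	cfg := &Config{URL: "https://example.com/docs", OutputDir: DefaultOutputDir, Workers: DefaultWorkers}
//	crawler, err := NewCrawler(cfg)
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := crawler.Start(context.Background()); err != nil {
//		log.Fatal(err)
//	}
//	if err := crawler.GenerateLLMSFiles(); err != nil {
//		log.Fatal(err)
//	}
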
func createOutputDirs(outputDir string) error {
	dirs := []string{
		outputDir,
		filepath.Join(outputDir, MarkdownSubdir),
	}

	for _, dir := range dirs {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir, err)
		}
	}

	return nil
}

func (c *Crawler) setupCallbacks() {
	// Handle HTML content
	c.collector.OnHTML("html", func(e *colly.HTMLElement) {
		c.processPage(e)
	})

	// Extract links
	c.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		c.addToQueue(link, e.Request.URL)
	})

	// Request callback
	c.collector.OnRequest(func(r *colly.Request) {
		if c.config.Verbose {
			fmt.Printf("Visiting: %s\n", r.URL)
		}
		c.bar.Add(1)
	})

	// Error handling
	c.collector.OnError(func(r *colly.Response, err error) {
		log.Printf("Error visiting %s: %v", r.Request.URL, err)
	})
}

func (c *Crawler) processPage(e *colly.HTMLElement) {
	// Get page title
	title := strings.TrimSpace(e.ChildText("title"))
	if title == "" {
		title = "Untitled"
	}

	// Get meta description
	description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content"))
	if description == "" {
		// Try og:description
		description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content"))
	}

	// Convert HTML to Markdown
	html, err := e.DOM.Html()
	if err != nil {
		log.Printf("Failed to get HTML for %s: %v", e.Request.URL, err)
		return
	}

	markdown, err := c.converter.ConvertString(html)
	if err != nil {
		log.Printf("Failed to convert HTML to Markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Create page info
	pageInfo := PageInfo{
		URL:         e.Request.URL.String(),
		Title:       title,
		Content:     markdown,
		CrawledAt:   time.Now(),
		Description: description,
	}

	// Save individual markdown file
	filename := c.createFilename(e.Request.URL, title)
	pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename)
	fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath)

	if err := c.saveMarkdown(fullPath, pageInfo); err != nil {
		log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Add to pages collection
	c.pagesMutex.Lock()
	c.pages = append(c.pages, pageInfo)
	c.pagesMutex.Unlock()

	c.mu.Lock()
	c.processed++
	c.mu.Unlock()
}

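// saveMarkdown writes a single page to disk. The file starts with a small
// metadata header (title, URL, crawl timestamp, and the description when one
// was found), followed by a "---" separator and the converted markdown body.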
func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error {
	// Ensure directory exists
	dir := filepath.Dir(filePath)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	// Create content with metadata
	content := fmt.Sprintf(`# %s

URL: %s
Crawled: %s
%s

---

%s`,
		pageInfo.Title,
		pageInfo.URL,
		pageInfo.CrawledAt.Format(time.RFC3339),
		func() string {
			if pageInfo.Description != "" {
				return fmt.Sprintf("Description: %s", pageInfo.Description)
			}
			return ""
		}(),
		pageInfo.Content)

	// Write file
	if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
		return fmt.Errorf("failed to write file %s: %w", filePath, err)
	}

	return nil
}

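// createFilename derives a slug-style filename from the page title, falling
// back to the URL path when the title is empty or "Untitled". For example
// (illustrative), a title of "Getting Started | My Docs" becomes
// "getting-started-my-docs.md".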
func (c *Crawler) createFilename(pageURL *url.URL, title string) string {
	// Clean title for filename
	filename := strings.TrimSpace(title)
	filename = regexp.MustCompile(`[^a-zA-Z0-9\-_\s]`).ReplaceAllString(filename, "")
	filename = regexp.MustCompile(`\s+`).ReplaceAllString(filename, "-")
	filename = strings.ToLower(filename)

	if filename == "" || filename == "untitled" {
		// Use URL path
		urlPath := strings.Trim(pageURL.Path, "/")
		if urlPath == "" {
			urlPath = "index"
		}
		filename = strings.ReplaceAll(urlPath, "/", "-")
	}

	// Limit filename length
	if len(filename) > 100 {
		filename = filename[:100]
	}

	// Ensure .md extension
	if !strings.HasSuffix(filename, ".md") {
		filename += ".md"
	}

	return filename
}

func (c *Crawler) addToQueue(link string, baseURL *url.URL) {
	// Parse and resolve URL
	linkURL, err := url.Parse(link)
	if err != nil {
		return
	}

	resolvedURL := baseURL.ResolveReference(linkURL)

	// Check if it's within the same domain
	if resolvedURL.Host != c.baseURL.Host {
		return
	}

	// Apply filters
	if c.shouldSkipURL(resolvedURL.String()) {
		return
	}

	urlStr := resolvedURL.String()

	c.mu.Lock()
	defer c.mu.Unlock()

	// Check if already visited
	if c.visited[urlStr] {
		return
	}

	c.visited[urlStr] = true

	// Add to queue
	select {
	case c.queue <- urlStr:
	default:
		// Queue is full, skip this URL
	}
}

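// shouldSkipURL reports whether a URL should be excluded from the crawl:
// language-variant paths (e.g. "/docs/zh-cn/intro"), direct file downloads
// (e.g. "/assets/manual.pdf"), and fragment links containing "#". The example
// paths are illustrative.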
func (c *Crawler) shouldSkipURL(urlStr string) bool {
	// Check language filters
	for _, pattern := range LanguageFilter {
		if matched, _ := regexp.MatchString(pattern, urlStr); matched {
			return true
		}
	}

	// Check file extension filters
	for _, pattern := range FileExtensionFilter {
		if matched, _ := regexp.MatchString(pattern, urlStr); matched {
			return true
		}
	}

	// Skip fragments
	if strings.Contains(urlStr, "#") {
		return true
	}

	return false
}

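// Start runs the crawl: it seeds the queue with the root URL, launches
// config.Workers goroutines that pop URLs off the queue and visit them with
// the colly collector (whose callbacks enqueue newly discovered links), and a
// monitor goroutine that stops the crawl once no further progress is made.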
func (c *Crawler) Start(ctx context.Context) error {
	fmt.Printf("Starting crawl of: %s\n", c.config.URL)
	fmt.Printf("Output directory: %s\n", c.config.OutputDir)
	fmt.Printf("Workers: %d\n", c.config.Workers)

	// Create a cancellable context for workers
	workerCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Add seed URL to queue
	c.queue <- c.config.URL
	c.visited[c.config.URL] = true

	// Start workers
	for i := 0; i < c.config.Workers; i++ {
		c.wg.Add(1)
		go c.worker(workerCtx)
	}

	// Monitor progress and handle completion
	done := make(chan struct{})
	go func() {
		c.monitor(workerCtx)
		close(done)
	}()

	// Wait for either completion or cancellation
	select {
	case <-done:
		cancel() // Stop workers
	case <-ctx.Done():
		// External cancellation
	}

	// Wait for workers to finish
	c.wg.Wait()
	close(c.queue)
	c.bar.Finish()

	fmt.Printf("\nProcessed %d pages\n", c.processed)
	return nil
}

func (c *Crawler) worker(ctx context.Context) {
	defer c.wg.Done()

	for {
		select {
		case <-ctx.Done():
			return
		case urlStr, ok := <-c.queue:
			if !ok {
				return
			}

			if err := c.collector.Visit(urlStr); err != nil {
				if c.config.Verbose {
					log.Printf("Failed to visit %s: %v", urlStr, err)
				}
			}
		}
	}
}

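// monitor polls every 2 seconds and stops the crawl when no new pages have
// been processed for 3 consecutive ticks with an empty queue (about 6 seconds
// of inactivity), or after 15 ticks (about 30 seconds) regardless of queue
// length.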
func (c *Crawler) monitor(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Second) // Check more frequently
	defer ticker.Stop()

	lastProcessed := 0
	noProgressCount := 0

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			c.mu.RLock()
			current := c.processed
			queueLen := len(c.queue)
			c.mu.RUnlock()

			if current == lastProcessed {
				noProgressCount++
				// More aggressive completion detection
				if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds with no progress and empty queue
					(noProgressCount >= 15) { // Or 30 seconds regardless
					if c.config.Verbose {
						fmt.Println("\nNo progress detected, stopping crawler...")
					}
					return
				}
			} else {
				noProgressCount = 0
				lastProcessed = current
			}

			if c.config.Verbose {
				fmt.Printf("Progress: %d pages processed, %d in queue\n", current, queueLen)
			}
		}
	}
}

// GenerateLLMSFiles creates both llms.txt and llms-full.txt files
func (c *Crawler) GenerateLLMSFiles() error {
	if err := c.generateLLMSTxt(); err != nil {
		return fmt.Errorf("failed to generate llms.txt: %w", err)
	}

	if err := c.generateLLMSFullTxt(); err != nil {
		return fmt.Errorf("failed to generate llms-full.txt: %w", err)
	}

	return nil
}

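// generateLLMSTxt writes the curated llms.txt overview in the layout produced
// below (titles, URLs, and descriptions here are illustrative):
//
//	# Example Site
//
//	> Documentation and content from example.com
//
//	This documentation was automatically crawled from https://example.com on January 2, 2006.
//
//	## Documentation
//
//	- [Getting Started](https://example.com/docs/start): Install and configure the tool.
//
//	## Optional
//
//	- [Blog](https://example.com/blog)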
func (c *Crawler) generateLLMSTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title (required)
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle))

	// Blockquote summary (optional but recommended)
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	// Additional details
	content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	// Main documentation section
	content.WriteString("## Documentation\n\n")
	for _, page := range sortedPages {
		if c.isMainDocPage(page) {
			description := page.Description
			if description == "" {
				description = c.extractFirstSentence(page.Content)
			}
			if description != "" {
				content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description))
			} else {
				content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
			}
		}
	}

	// Optional section for secondary pages
	secondaryPages := c.getSecondaryPages(sortedPages)
	if len(secondaryPages) > 0 {
		content.WriteString("\n## Optional\n\n")
		for _, page := range secondaryPages {
			content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}

func (c *Crawler) generateLLMSFullTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle))

	// Summary
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	content.WriteString("---\n\n")

	// Include full content of each page
	for i, page := range sortedPages {
		content.WriteString(fmt.Sprintf("## %s\n\n", page.Title))
		content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL))

		if page.Description != "" {
			content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description))
		}

		content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339)))

		// Clean and include content
		cleanContent := c.cleanContentForLLMS(page.Content)
		content.WriteString(cleanContent)

		// Add separator between pages (except for the last one)
		if i < len(sortedPages)-1 {
			content.WriteString("\n\n---\n\n")
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms-full.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}

func (c *Crawler) getSiteTitle() string {
	// Try to get site title from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Title != "" && page.Title != "Untitled" {
				return page.Title
			}
		}
	}

	// Fallback to domain name
	return c.baseURL.Host
}

func (c *Crawler) generateSiteSummary() string {
	// Try to get description from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Description != "" {
				return page.Description
			}
			// Extract first meaningful paragraph
			return c.extractFirstSentence(page.Content)
		}
	}

	return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host)
}

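// isMainDocPage treats a page as primary documentation unless its URL falls
// under a typical secondary section such as "/blog", "/about", or "/changelog"
// (see secondaryIndicators below); secondary pages are listed under the
// "Optional" heading in llms.txt instead.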
func (c *Crawler) isMainDocPage(page PageInfo) bool {
	// Consider a page "main documentation" if it's not in typical secondary sections
	lowerURL := strings.ToLower(page.URL)

	// Skip pages that are typically secondary
	secondaryIndicators := []string{
		"/blog", "/news", "/archive", "/changelog", "/release",
		"/about", "/contact", "/legal", "/privacy", "/terms",
		"/community", "/forum", "/discuss",
	}

	for _, indicator := range secondaryIndicators {
		// Check for the indicator followed by either / or end of URL
		if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) {
			return false
		}
	}

	return true
}

func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo {
	var secondary []PageInfo
	for _, page := range allPages {
		if !c.isMainDocPage(page) {
			secondary = append(secondary, page)
		}
	}
	return secondary
}

func (c *Crawler) extractFirstSentence(content string) string {
	// Clean the content and extract the first meaningful sentence
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip empty lines, headers, and markdown syntax
		if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") {
			// Find the first sentence
			sentences := strings.Split(line, ".")
			if len(sentences) > 0 && len(sentences[0]) > 20 {
				return strings.TrimSpace(sentences[0]) + "."
			}
		}
	}
	return ""
}

func (c *Crawler) cleanContentForLLMS(content string) string {
	// Clean the content for better readability in LLMs context
	var cleaned strings.Builder
	scanner := bufio.NewScanner(strings.NewReader(content))

	var inCodeBlock bool
	for scanner.Scan() {
		line := scanner.Text()

		// Handle code blocks
		if strings.HasPrefix(strings.TrimSpace(line), "```") {
			inCodeBlock = !inCodeBlock
		}

		// Skip empty lines unless in code block
		if strings.TrimSpace(line) == "" && !inCodeBlock {
			continue
		}

		cleaned.WriteString(line)
		cleaned.WriteString("\n")
	}

	return strings.TrimSpace(cleaned.String())
}