diff --git a/Makefile b/Makefile index 6a13fa4..6980e42 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,24 @@ # Makefile for site-to-llmstxt crawler -.PHONY: build test clean run help +.PHONY: build test clean run help fmt lint deps dev-setup + +# Variables +BINARY_NAME=site-to-llmstxt +CMD_PATH=./cmd/site-to-llmstxt +BUILD_DIR=./bin # Default target help: @echo "Available targets:" - @echo " build - Build the crawler binary" - @echo " test - Run tests" - @echo " clean - Clean build artifacts" - @echo " run - Run with example URL (requires URL variable)" - @echo " install - Install dependencies" + @echo " build - Build the crawler binary" + @echo " test - Run tests" + @echo " test-coverage - Run tests with coverage" + @echo " clean - Clean build artifacts" + @echo " run - Run with example URL (requires URL variable)" + @echo " fmt - Format code" + @echo " lint - Lint code" + @echo " deps - Install/update dependencies" + @echo " dev-setup - Set up development environment" @echo "" @echo "Examples:" @echo " make build" @@ -18,27 +27,55 @@ help: # Build the crawler build: - @echo "Building crawler..." - go build -o crawler main.go - @echo "Build complete: ./crawler" + @echo "Building $(BINARY_NAME)..." + @mkdir -p $(BUILD_DIR) + go build -o $(BUILD_DIR)/$(BINARY_NAME) $(CMD_PATH) + @echo "Build complete: $(BUILD_DIR)/$(BINARY_NAME)" # Run tests test: @echo "Running tests..." - go test -v + go test -v ./... + +# Run tests with coverage +test-coverage: + @echo "Running tests with coverage..." + go test -v -coverprofile=coverage.out ./... + go tool cover -html=coverage.out -o coverage.html + @echo "Coverage report generated: coverage.html" # Clean build artifacts clean: @echo "Cleaning..." - rm -f crawler + rm -rf $(BUILD_DIR) rm -rf output/ rm -rf test-output/ - rm -rf example-output/ + rm -rf demo-output/ + rm -f coverage.out coverage.html -# Install dependencies -install: +# Format code +fmt: + @echo "Formatting code..." + go fmt ./... + @which goimports > /dev/null && goimports -w . || echo "goimports not found, skipping import formatting" + +# Lint code (requires golangci-lint) +lint: + @echo "Linting code..." + @which golangci-lint > /dev/null && golangci-lint run || echo "golangci-lint not found, skipping linting" + +# Install/update dependencies +deps: @echo "Installing dependencies..." go mod tidy + go mod download + +# Development setup +dev-setup: deps + @echo "Setting up development environment..." + @echo "Installing development tools..." + go install golang.org/x/tools/cmd/goimports@latest + @echo "Development setup complete!" # Run with parameters (updated for new CLI) run: build @@ -47,17 +84,17 @@ run: build exit 1; \ fi @echo "Running crawler with URL: $(URL)" - ./crawler \ + $(BUILD_DIR)/$(BINARY_NAME) \ --url $(URL) \ $(if $(WORKERS),--workers $(WORKERS)) \ $(if $(OUTPUT),--output $(OUTPUT)) \ $(if $(VERBOSE),--verbose) # Build and test everything -all: clean install build test +all: clean deps fmt build test @echo "All tasks completed successfully!" # Quick test with a small site demo: build @echo "Running demo crawl of httpbin.org..." 
- ./crawler --url https://httpbin.org --output ./demo-output --workers 1 --verbose + $(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..c53d9a6 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,42 @@ +package config + +import ( + "fmt" + "net/url" +) + +const ( + DefaultWorkers = 1 + DefaultOutputDir = "./output" + MarkdownSubdir = "pages" +) + +// Config holds crawler configuration. +type Config struct { + URL string + OutputDir string + Workers int + Verbose bool +} + +// Validate validates the configuration and returns an error if invalid. +func (c *Config) Validate() error { + if c.URL == "" { + return fmt.Errorf("URL is required") + } + + u, err := url.Parse(c.URL) + if err != nil { + return fmt.Errorf("invalid URL: %w", err) + } + + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("URL must have http or https scheme") + } + + if c.Workers <= 0 { + return fmt.Errorf("workers must be greater than 0") + } + + return nil +} diff --git a/internal/config/config_test.go b/internal/config/config_test.go new file mode 100644 index 0000000..7f55620 --- /dev/null +++ b/internal/config/config_test.go @@ -0,0 +1,57 @@ +package config + +import "testing" + +func TestConfig_Validate(t *testing.T) { + tests := []struct { + name string + config *Config + wantErr bool + }{ + { + name: "Valid config", + config: &Config{ + URL: "https://example.com", + OutputDir: "./output", + Workers: 1, + }, + wantErr: false, + }, + { + name: "Empty URL", + config: &Config{ + URL: "", + OutputDir: "./output", + Workers: 1, + }, + wantErr: true, + }, + { + name: "Invalid URL", + config: &Config{ + URL: "not-a-url", + OutputDir: "./output", + Workers: 1, + }, + wantErr: true, + }, + { + name: "Zero workers", + config: &Config{ + URL: "https://example.com", + OutputDir: "./output", + Workers: 0, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if (err != nil) != tt.wantErr { + t.Errorf("Config.Validate() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} diff --git a/internal/filters/filters.go b/internal/filters/filters.go new file mode 100644 index 0000000..f9ae534 --- /dev/null +++ b/internal/filters/filters.go @@ -0,0 +1,86 @@ +package filters + +import ( + "net/url" + "strings" +) + +// LanguageIndicators are URL patterns that indicate language-specific pages. +var LanguageIndicators = []string{ + "/en/", "/zh/", "/fr/", "/de/", "/es/", "/it/", "/ja/", "/ko/", + "/pt/", "/ru/", "/ar/", "/hi/", "/th/", "/vi/", "/id/", "/ms/", + "/tl/", "/zh-cn/", "/zh-tw/", "/zh-hk/", "/zh-hant/", "/zh-hans/", + "/en-us/", "/en-gb/", "/fr-fr/", "/de-de/", "/es-es/", "/pt-br/", + "/pt-pt/", "/ja-jp/", "/ko-kr/", "/it-it/", "/ru-ru/", "/ar-sa/", +} + +// FileExtensions are file extensions that should be skipped. +var FileExtensions = []string{ + ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", + ".zip", ".rar", ".tar", ".gz", ".7z", ".bz2", + ".mp3", ".mp4", ".avi", ".mov", ".wmv", ".flv", ".webm", + ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", + ".exe", ".msi", ".dmg", ".deb", ".rpm", ".pkg", +} + +// SecondaryPageIndicators are URL patterns for secondary content. 
+var SecondaryPageIndicators = []string{ + "/blog", "/news", "/archive", "/changelog", "/release", + "/about", "/contact", "/legal", "/privacy", "/terms", + "/community", "/forum", "/discuss", +} + +// ShouldSkipURL determines if a URL should be skipped based on various filters. +func ShouldSkipURL(rawURL, baseHost string) bool { + if rawURL == "" { + return true + } + + // Parse URL + u, err := url.Parse(rawURL) + if err != nil { + return true + } + + // Skip external domains + if u.Host != "" && u.Host != baseHost { + return true + } + + // Skip fragments + if u.Fragment != "" { + return true + } + + lowerURL := strings.ToLower(rawURL) + + // Skip language variants + for _, lang := range LanguageIndicators { + if strings.Contains(lowerURL, lang) { + return true + } + } + + // Skip file downloads + for _, ext := range FileExtensions { + if strings.HasSuffix(lowerURL, ext) { + return true + } + } + + return false +} + +// IsMainDocPage determines if a page is main documentation or secondary content. +func IsMainDocPage(pageURL string) bool { + lowerURL := strings.ToLower(pageURL) + + for _, indicator := range SecondaryPageIndicators { + // Check for the indicator followed by either / or end of URL + if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) { + return false + } + } + + return true +} diff --git a/internal/filters/filters_test.go b/internal/filters/filters_test.go new file mode 100644 index 0000000..e2d6d87 --- /dev/null +++ b/internal/filters/filters_test.go @@ -0,0 +1,105 @@ +package filters + +import "testing" + +func TestShouldSkipURL(t *testing.T) { + tests := []struct { + name string + url string + baseHost string + want bool + }{ + { + name: "Normal URL", + url: "https://example.com/docs", + baseHost: "example.com", + want: false, + }, + { + name: "Language URL - en", + url: "https://example.com/en/docs", + baseHost: "example.com", + want: true, + }, + { + name: "Language URL - zh", + url: "https://example.com/zh/docs", + baseHost: "example.com", + want: true, + }, + { + name: "PDF file", + url: "https://example.com/doc.pdf", + baseHost: "example.com", + want: true, + }, + { + name: "ZIP file", + url: "https://example.com/download.zip", + baseHost: "example.com", + want: true, + }, + { + name: "Fragment URL", + url: "https://example.com/docs#section", + baseHost: "example.com", + want: true, + }, + { + name: "External domain", + url: "https://other.com/docs", + baseHost: "example.com", + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ShouldSkipURL(tt.url, tt.baseHost); got != tt.want { + t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIsMainDocPage(t *testing.T) { + tests := []struct { + name string + url string + want bool + }{ + { + name: "Main documentation page", + url: "https://example.com/docs/api", + want: true, + }, + { + name: "Blog page", + url: "https://example.com/blog/latest-news", + want: false, + }, + { + name: "About page", + url: "https://example.com/about", + want: false, + }, + { + name: "API documentation", + url: "https://example.com/api/reference", + want: true, + }, + { + name: "Contact page", + url: "https://example.com/contact", + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := IsMainDocPage(tt.url); got != tt.want { + t.Errorf("IsMainDocPage() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/internal/generator/llms.go b/internal/generator/llms.go new file mode 
100644 index 0000000..a69121e --- /dev/null +++ b/internal/generator/llms.go @@ -0,0 +1,198 @@ +package generator + +import ( + "fmt" + "net/url" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "github.com/Sosokker/site-to-llmstxt/internal/filters" + "github.com/Sosokker/site-to-llmstxt/internal/models" + "github.com/Sosokker/site-to-llmstxt/internal/utils" +) + +// LLMsGenerator generates LLMs.txt format files. +type LLMsGenerator struct { + baseURL *url.URL + outputDir string +} + +// New creates a new LLMs.txt generator. +func New(baseURL *url.URL, outputDir string) *LLMsGenerator { + return &LLMsGenerator{ + baseURL: baseURL, + outputDir: outputDir, + } +} + +// Generate creates both llms.txt and llms-full.txt files. +func (g *LLMsGenerator) Generate(pages []models.PageInfo) error { + if err := g.generateLLMsFile(pages); err != nil { + return fmt.Errorf("failed to generate llms.txt: %w", err) + } + + if err := g.generateFullFile(pages); err != nil { + return fmt.Errorf("failed to generate llms-full.txt: %w", err) + } + + return nil +} + +func (g *LLMsGenerator) generateLLMsFile(pages []models.PageInfo) error { + var content strings.Builder + + // Header + siteName := g.baseURL.Host + if siteName == "" { + siteName = "Documentation" + } + + content.WriteString(fmt.Sprintf("# %s\n\n", siteName)) + + // Summary from first page or generate one + summary := g.generateSummary(pages) + if summary != "" { + content.WriteString(fmt.Sprintf("> %s\n\n", summary)) + } + + content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n", + g.baseURL.String(), time.Now().Format("January 2, 2006"))) + + // Main documentation section + mainPages := g.filterMainPages(pages) + if len(mainPages) > 0 { + content.WriteString("## Documentation\n\n") + g.writePageLinks(&content, mainPages) + } + + // Optional section for secondary content + secondaryPages := g.filterSecondaryPages(pages) + if len(secondaryPages) > 0 { + content.WriteString("\n## Optional\n\n") + g.writePageLinks(&content, secondaryPages) + } + + return g.writeFile("llms.txt", content.String()) +} + +func (g *LLMsGenerator) generateFullFile(pages []models.PageInfo) error { + var content strings.Builder + + // Header + siteName := g.baseURL.Host + content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteName)) + + summary := g.generateSummary(pages) + if summary != "" { + content.WriteString(fmt.Sprintf("> %s\n\n", summary)) + } + + content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n", + g.baseURL.String(), time.Now().Format("January 2, 2006"))) + + content.WriteString(strings.Repeat("-", 80) + "\n\n") + + // Sort pages by URL for consistent output + sortedPages := make([]models.PageInfo, len(pages)) + copy(sortedPages, pages) + sort.Slice(sortedPages, func(i, j int) bool { + return sortedPages[i].URL < sortedPages[j].URL + }) + + // Add each page's content + for i, page := range sortedPages { + if i > 0 { + content.WriteString("\n" + strings.Repeat("-", 80) + "\n\n") + } + + content.WriteString(fmt.Sprintf("## %s\n\n", page.Title)) + content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL)) + content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339))) + + if page.Content != "" { + content.WriteString(page.Content + "\n") + } + } + + return g.writeFile("llms-full.txt", content.String()) +} + +func (g *LLMsGenerator) generateSummary(pages []models.PageInfo) string { + // Try to get summary 
from the first page (usually homepage) + if len(pages) > 0 { + for _, page := range pages { + if page.Description != "" { + return page.Description + } + } + + // Fallback to first sentence of first page content + for _, page := range pages { + if page.Content != "" { + return utils.ExtractFirstSentence(page.Content) + } + } + } + + return "" +} + +func (g *LLMsGenerator) filterMainPages(pages []models.PageInfo) []models.PageInfo { + var main []models.PageInfo + for _, page := range pages { + if filters.IsMainDocPage(page.URL) { + main = append(main, page) + } + } + + // Sort by URL + sort.Slice(main, func(i, j int) bool { + return main[i].URL < main[j].URL + }) + + return main +} + +func (g *LLMsGenerator) filterSecondaryPages(pages []models.PageInfo) []models.PageInfo { + var secondary []models.PageInfo + for _, page := range pages { + if !filters.IsMainDocPage(page.URL) { + secondary = append(secondary, page) + } + } + + // Sort by URL + sort.Slice(secondary, func(i, j int) bool { + return secondary[i].URL < secondary[j].URL + }) + + return secondary +} + +func (g *LLMsGenerator) writePageLinks(content *strings.Builder, pages []models.PageInfo) { + for _, page := range pages { + title := page.Title + if title == "" || title == "Untitled" { + title = "Untitled" + } + + description := page.Description + if description == "" && page.Content != "" { + description = utils.ExtractFirstSentence(page.Content) + } + + if description != "" { + content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", title, page.URL, description)) + } else { + content.WriteString(fmt.Sprintf("- [%s](%s)\n", title, page.URL)) + } + } +} + +func (g *LLMsGenerator) writeFile(filename, content string) error { + path := filepath.Join(g.outputDir, filename) + return os.WriteFile(path, []byte(content), 0644) +} diff --git a/internal/models/models.go b/internal/models/models.go new file mode 100644 index 0000000..20189ca --- /dev/null +++ b/internal/models/models.go @@ -0,0 +1,41 @@ +package models + +import "time" + +// PageInfo represents information about a crawled page. +type PageInfo struct { + URL string + Title string + Content string + FilePath string + CrawledAt time.Time + Description string +} + +// Stats holds crawling statistics. +type Stats struct { + TotalPages int + MainDocPages int + SecondaryPages int + StartTime time.Time + EndTime time.Time + Duration time.Duration + ErrorCount int + SkippedURLs int +} + +// AddError increments the error count. +func (s *Stats) AddError() { + s.ErrorCount++ +} + +// AddSkipped increments the skipped URL count. +func (s *Stats) AddSkipped() { + s.SkippedURLs++ +} + +// Finish sets the end time and calculates duration. +func (s *Stats) Finish() { + s.EndTime = time.Now() + s.Duration = s.EndTime.Sub(s.StartTime) +} diff --git a/internal/progress/progress.go b/internal/progress/progress.go new file mode 100644 index 0000000..ec4d191 --- /dev/null +++ b/internal/progress/progress.go @@ -0,0 +1,82 @@ +package progress + +import ( + "fmt" + "strings" + "time" + + "github.com/schollz/progressbar/v3" + + "github.com/Sosokker/site-to-llmstxt/internal/models" +) + +// Manager handles progress tracking and UI updates. +type Manager struct { + bar *progressbar.ProgressBar + verbose bool + stats *models.Stats +} + +// New creates a new progress manager. 
+func New(verbose bool, stats *models.Stats) *Manager { + bar := progressbar.NewOptions(-1, + progressbar.OptionSetDescription("Crawling"), + progressbar.OptionSpinnerType(14), + progressbar.OptionSetWidth(50), + progressbar.OptionThrottle(100*time.Millisecond), + ) + + return &Manager{ + bar: bar, + verbose: verbose, + stats: stats, + } +} + +// Update updates the progress bar with current status. +func (m *Manager) Update(processed, queued int) { + if m.verbose { + fmt.Printf("\rProgress: %d pages processed, %d in queue", processed, queued) + } + m.bar.Add(1) +} + +// Finish completes the progress bar and shows final statistics. +func (m *Manager) Finish() { + m.bar.Finish() + m.showSummary() +} + +// showSummary displays a comprehensive summary of the crawling session. +func (m *Manager) showSummary() { + fmt.Println("\n" + strings.Repeat("=", 60)) + fmt.Println("📊 CRAWLING SUMMARY") + fmt.Println(strings.Repeat("=", 60)) + + // Basic stats + fmt.Printf("🔍 Total pages crawled: %d\n", m.stats.TotalPages) + fmt.Printf("📚 Main documentation: %d pages\n", m.stats.MainDocPages) + fmt.Printf("📝 Secondary content: %d pages\n", m.stats.SecondaryPages) + + // Performance stats + if m.stats.Duration > 0 { + pagesPerSecond := float64(m.stats.TotalPages) / m.stats.Duration.Seconds() + fmt.Printf("⏱️ Duration: %v (%.1f pages/sec)\n", + m.stats.Duration.Round(time.Second), pagesPerSecond) + } + + // Error stats + if m.stats.ErrorCount > 0 || m.stats.SkippedURLs > 0 { + fmt.Printf("⚠️ Errors: %d, Skipped URLs: %d\n", m.stats.ErrorCount, m.stats.SkippedURLs) + } + + fmt.Println(strings.Repeat("-", 60)) + fmt.Println("✅ Crawling completed successfully!") +} + +// Log outputs a message if verbose mode is enabled. +func (m *Manager) Log(format string, args ...interface{}) { + if m.verbose { + fmt.Printf(format+"\n", args...) + } +} diff --git a/internal/utils/utils.go b/internal/utils/utils.go new file mode 100644 index 0000000..a6f6754 --- /dev/null +++ b/internal/utils/utils.go @@ -0,0 +1,136 @@ +package utils + +import ( + "fmt" + "net/url" + "os" + "path/filepath" + "regexp" + "strings" + "time" + "unicode" +) + +var ( + filenameRegex = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1f]`) + spaceRegex = regexp.MustCompile(`\s+`) +) + +// CreateFilename creates a safe filename from a title and URL. +func CreateFilename(title, rawURL string) string { + if title == "" || title == "Untitled" { + // Extract from URL path + if rawURL != "" { + u, err := url.Parse(rawURL) + if err == nil && u.Path != "" && u.Path != "/" { + parts := strings.Split(strings.Trim(u.Path, "/"), "/") + if len(parts) > 0 && parts[len(parts)-1] != "" { + title = parts[len(parts)-1] + } + } + } + if title == "" { + title = "index" + } + } + + // Clean the filename + cleaned := filenameRegex.ReplaceAllString(title, "") + cleaned = spaceRegex.ReplaceAllString(cleaned, "-") + cleaned = strings.Trim(cleaned, "-.") + + if cleaned == "" { + cleaned = "untitled" + } + + return cleaned + ".md" +} + +// ExtractFirstSentence extracts the first meaningful sentence from content. 
+func ExtractFirstSentence(content string) string { + if content == "" { + return "" + } + + // Remove markdown headers and clean up + lines := strings.Split(content, "\n") + var text strings.Builder + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + // Remove markdown formatting + line = strings.ReplaceAll(line, "**", "") + line = strings.ReplaceAll(line, "*", "") + line = strings.ReplaceAll(line, "`", "") + + if line != "" { + text.WriteString(line) + text.WriteString(" ") + } + } + + cleaned := strings.TrimSpace(text.String()) + if len(cleaned) == 0 { + return "" + } + + // Find first sentence ending + for i, r := range cleaned { + if r == '.' || r == '!' || r == '?' { + // Make sure it's not just a decimal or abbreviation + if i+1 < len(cleaned) && unicode.IsSpace(rune(cleaned[i+1])) { + sentence := strings.TrimSpace(cleaned[:i+1]) + if len(sentence) > 20 { // Only return substantial sentences + return sentence + } + } + } + } + + // If no sentence ending found, return first ~200 chars + if len(cleaned) > 200 { + words := strings.Fields(cleaned[:200]) + if len(words) > 1 { + // Remove last word to avoid cutting mid-word + return strings.Join(words[:len(words)-1], " ") + "..." + } + } + + return cleaned +} + +// FormatDuration formats a duration into a human-readable string. +func FormatDuration(d time.Duration) string { + if d < time.Minute { + return fmt.Sprintf("%.1fs", d.Seconds()) + } + if d < time.Hour { + return fmt.Sprintf("%.1fm", d.Minutes()) + } + return fmt.Sprintf("%.1fh", d.Hours()) +} + +// EnsureDir creates a directory if it doesn't exist. +func EnsureDir(dir string) error { + return os.MkdirAll(dir, 0755) +} + +// CreateOutputDirs creates all necessary output directories. +func CreateOutputDirs(outputDir string) error { + dirs := []string{ + outputDir, + filepath.Join(outputDir, "pages"), + } + + for _, dir := range dirs { + if err := EnsureDir(dir); err != nil { + return fmt.Errorf("failed to create directory %s: %w", dir, err) + } + } + + return nil +} diff --git a/internal/utils/utils_test.go b/internal/utils/utils_test.go new file mode 100644 index 0000000..7954fd4 --- /dev/null +++ b/internal/utils/utils_test.go @@ -0,0 +1,82 @@ +package utils + +import "testing" + +func TestCreateFilename(t *testing.T) { + tests := []struct { + name string + title string + rawURL string + want string + }{ + { + name: "Normal title", + title: "Getting Started", + rawURL: "https://example.com/getting-started", + want: "Getting-Started.md", + }, + { + name: "Title with special characters", + title: "API Reference: Advanced", + rawURL: "https://example.com/api", + want: "API-Reference-Advanced.md", + }, + { + name: "Empty title", + title: "", + rawURL: "https://example.com/api/reference", + want: "reference.md", + }, + { + name: "Root URL", + title: "", + rawURL: "https://example.com/", + want: "index.md", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := CreateFilename(tt.title, tt.rawURL); got != tt.want { + t.Errorf("CreateFilename() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestExtractFirstSentence(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + { + name: "Simple sentence", + content: "This is a simple sentence. This is another sentence.", + want: "This is a simple sentence.", + }, + { + name: "With headers", + content: "# Header\n\nThis is the first sentence. 
Another sentence follows.", + want: "This is the first sentence.", + }, + { + name: "Short content", + content: "Short content without period", + want: "Short content without period", + }, + { + name: "Empty content", + content: "", + want: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ExtractFirstSentence(tt.content); got != tt.want { + t.Errorf("ExtractFirstSentence() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/main.go b/main.go deleted file mode 100644 index 7dae338..0000000 --- a/main.go +++ /dev/null @@ -1,800 +0,0 @@ -package main - -import ( - "bufio" - "context" - "fmt" - "log" - "net/url" - "os" - "path/filepath" - "regexp" - "sort" - "strings" - "sync" - "time" - - "github.com/JohannesKaufmann/html-to-markdown/v2/converter" - "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base" - "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark" - "github.com/gocolly/colly/v2" - "github.com/gocolly/colly/v2/debug" - "github.com/schollz/progressbar/v3" - "github.com/urfave/cli/v2" -) - -const ( - // DefaultWorkers is the default number of concurrent workers - DefaultWorkers = 1 - // DefaultOutputDir is the default output directory - DefaultOutputDir = "./output" - // MarkdownSubdir is the subdirectory for markdown files - MarkdownSubdir = "pages" -) - -// Config holds crawler configuration -type Config struct { - URL string - OutputDir string - Workers int - Verbose bool -} - -// PageInfo represents information about a crawled page -type PageInfo struct { - URL string - Title string - Content string - FilePath string - CrawledAt time.Time - Description string -} - -// Crawler manages the web crawling process -type Crawler struct { - config *Config - collector *colly.Collector - converter *converter.Converter - visited map[string]bool - queue chan string - wg sync.WaitGroup - mu sync.RWMutex - baseURL *url.URL - bar *progressbar.ProgressBar - processed int - pages []PageInfo - pagesMutex sync.Mutex -} - -// LanguageFilter contains patterns to exclude language-specific URLs -var LanguageFilter = []string{ - `/en/`, `/en$`, - `/zh/`, `/zh$`, `/zh-cn/`, `/zh-cn$`, `/zh-tw/`, `/zh-tw$`, `/zh-hant/`, `/zh-hant$`, - `/ja/`, `/ja$`, - `/ko/`, `/ko$`, - `/fr/`, `/fr$`, - `/de/`, `/de$`, - `/es/`, `/es$`, - `/it/`, `/it$`, - `/pt/`, `/pt$`, - `/ru/`, `/ru$`, -} - -// FileExtensionFilter contains patterns to exclude file downloads -var FileExtensionFilter = []string{ - `\.pdf$`, `\.doc$`, `\.docx$`, `\.xls$`, `\.xlsx$`, `\.ppt$`, `\.pptx$`, - `\.zip$`, `\.rar$`, `\.tar$`, `\.gz$`, `\.7z$`, - `\.mp3$`, `\.mp4$`, `\.avi$`, `\.mov$`, `\.wmv$`, - `\.jpg$`, `\.jpeg$`, `\.png$`, `\.gif$`, `\.bmp$`, `\.svg$`, - `\.exe$`, `\.msi$`, `\.dmg$`, `\.deb$`, `\.rpm$`, -} - -func main() { - app := &cli.App{ - Name: "site-to-llmstxt", - Usage: "Web crawler that converts websites to LLMs.txt format", - Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format. 
- -The crawler generates: -- llms.txt: A curated overview following the LLMs.txt specification -- llms-full.txt: Complete content of all crawled pages -- pages/: Directory containing individual markdown files - -The crawler respects robots.txt, filters out language variants and file downloads, -and only crawls within the same domain.`, - Version: "1.0.0", - Authors: []*cli.Author{ - { - Name: "Site-to-LLMsTxt", - }, - }, - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "url", - Aliases: []string{"u"}, - Usage: "Root URL to crawl (required)", - Required: true, - }, - &cli.StringFlag{ - Name: "output", - Aliases: []string{"o"}, - Usage: "Output directory", - Value: DefaultOutputDir, - }, - &cli.IntFlag{ - Name: "workers", - Aliases: []string{"w"}, - Usage: "Number of concurrent workers", - Value: DefaultWorkers, - }, - &cli.BoolFlag{ - Name: "verbose", - Usage: "Enable verbose logging", - }, - }, - Action: func(c *cli.Context) error { - config := &Config{ - URL: c.String("url"), - OutputDir: c.String("output"), - Workers: c.Int("workers"), - Verbose: c.Bool("verbose"), - } - - return runCrawler(config) - }, - } - - if err := app.Run(os.Args); err != nil { - log.Fatal(err) - } -} - -func runCrawler(config *Config) error { - if err := validateConfig(config); err != nil { - return fmt.Errorf("invalid configuration: %w", err) - } - - crawler, err := NewCrawler(config) - if err != nil { - return fmt.Errorf("failed to create crawler: %w", err) - } - - ctx := context.Background() - if err := crawler.Start(ctx); err != nil { - return fmt.Errorf("crawling failed: %w", err) - } - - if err := crawler.GenerateLLMSFiles(); err != nil { - return fmt.Errorf("failed to generate LLMS files: %w", err) - } - - fmt.Printf("\nCrawling completed successfully!\n") - fmt.Printf("Generated files:\n") - fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt")) - fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt")) - fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir)) - fmt.Printf("Total pages crawled: %d\n", len(crawler.pages)) - - return nil -} - -func validateConfig(config *Config) error { - if config.URL == "" { - return fmt.Errorf("URL is required") - } - - u, err := url.Parse(config.URL) - if err != nil { - return fmt.Errorf("invalid URL: %w", err) - } - - if u.Scheme != "http" && u.Scheme != "https" { - return fmt.Errorf("URL must have http or https scheme") - } - - if config.Workers <= 0 { - return fmt.Errorf("workers must be greater than 0") - } - - return nil -} - -// NewCrawler creates a new crawler instance -func NewCrawler(config *Config) (*Crawler, error) { - baseURL, err := url.Parse(config.URL) - if err != nil { - return nil, fmt.Errorf("failed to parse base URL: %w", err) - } - - // Create output directory structure - if err := createOutputDirs(config.OutputDir); err != nil { - return nil, fmt.Errorf("failed to create output directories: %w", err) - } - - // Setup colly collector - c := colly.NewCollector( - colly.AllowedDomains(baseURL.Host), - ) - - if config.Verbose { - c.SetDebugger(&debug.LogDebugger{}) - } - - // Rate limiting - c.Limit(&colly.LimitRule{ - DomainGlob: "*", - Parallelism: config.Workers, - Delay: 200 * time.Millisecond, // Slightly more conservative - }) - - // Setup HTML to Markdown converter - conv := converter.NewConverter( - converter.WithPlugins( - base.NewBasePlugin(), - commonmark.NewCommonmarkPlugin(), - ), - ) - - crawler := &Crawler{ - config: config, - collector: c, - converter: conv, - visited: 
make(map[string]bool), - queue: make(chan string, 1000), - baseURL: baseURL, - bar: progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")), - pages: make([]PageInfo, 0), - } - - crawler.setupCallbacks() - - return crawler, nil -} - -func createOutputDirs(outputDir string) error { - dirs := []string{ - outputDir, - filepath.Join(outputDir, MarkdownSubdir), - } - - for _, dir := range dirs { - if err := os.MkdirAll(dir, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", dir, err) - } - } - - return nil -} - -func (c *Crawler) setupCallbacks() { - // Handle HTML content - c.collector.OnHTML("html", func(e *colly.HTMLElement) { - c.processPage(e) - }) - - // Extract links - c.collector.OnHTML("a[href]", func(e *colly.HTMLElement) { - link := e.Attr("href") - c.addToQueue(link, e.Request.URL) - }) - - // Request callback - c.collector.OnRequest(func(r *colly.Request) { - if c.config.Verbose { - fmt.Printf("Visiting: %s\n", r.URL) - } - c.bar.Add(1) - }) - - // Error handling - c.collector.OnError(func(r *colly.Response, err error) { - log.Printf("Error visiting %s: %v", r.Request.URL, err) - }) -} - -func (c *Crawler) processPage(e *colly.HTMLElement) { - // Get page title - title := strings.TrimSpace(e.ChildText("title")) - if title == "" { - title = "Untitled" - } - - // Get meta description - description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content")) - if description == "" { - // Try og:description - description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content")) - } - - // Convert HTML to Markdown - html, err := e.DOM.Html() - if err != nil { - log.Printf("Failed to get HTML for %s: %v", e.Request.URL, err) - return - } - - markdown, err := c.converter.ConvertString(html) - if err != nil { - log.Printf("Failed to convert HTML to Markdown for %s: %v", e.Request.URL, err) - return - } - - // Create page info - pageInfo := PageInfo{ - URL: e.Request.URL.String(), - Title: title, - Content: markdown, - CrawledAt: time.Now(), - Description: description, - } - - // Save individual markdown file - filename := c.createFilename(e.Request.URL, title) - pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename) - fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath) - - if err := c.saveMarkdown(fullPath, pageInfo); err != nil { - log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err) - return - } - - // Add to pages collection - c.pagesMutex.Lock() - c.pages = append(c.pages, pageInfo) - c.pagesMutex.Unlock() - - c.mu.Lock() - c.processed++ - c.mu.Unlock() -} - -func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error { - // Ensure directory exists - dir := filepath.Dir(filePath) - if err := os.MkdirAll(dir, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", dir, err) - } - - // Create content with metadata - content := fmt.Sprintf(`# %s - -URL: %s -Crawled: %s -%s - ---- - -%s`, - pageInfo.Title, - pageInfo.URL, - pageInfo.CrawledAt.Format(time.RFC3339), - func() string { - if pageInfo.Description != "" { - return fmt.Sprintf("Description: %s", pageInfo.Description) - } - return "" - }(), - pageInfo.Content) - - // Write file - if err := os.WriteFile(filePath, []byte(content), 0644); err != nil { - return fmt.Errorf("failed to write file %s: %w", filePath, err) - } - - return nil -} - -func (c *Crawler) createFilename(pageURL *url.URL, title string) string { - // Clean title for filename - filename := strings.TrimSpace(title) - 
filename = regexp.MustCompile(`[^a-zA-Z0-9\-_\s]`).ReplaceAllString(filename, "") - filename = regexp.MustCompile(`\s+`).ReplaceAllString(filename, "-") - filename = strings.ToLower(filename) - - if filename == "" || filename == "untitled" { - // Use URL path - urlPath := strings.Trim(pageURL.Path, "/") - if urlPath == "" { - urlPath = "index" - } - filename = strings.ReplaceAll(urlPath, "/", "-") - } - - // Limit filename length - if len(filename) > 100 { - filename = filename[:100] - } - - // Ensure .md extension - if !strings.HasSuffix(filename, ".md") { - filename += ".md" - } - - return filename -} - -func (c *Crawler) addToQueue(link string, baseURL *url.URL) { - // Parse and resolve URL - linkURL, err := url.Parse(link) - if err != nil { - return - } - - resolvedURL := baseURL.ResolveReference(linkURL) - - // Check if it's within the same domain - if resolvedURL.Host != c.baseURL.Host { - return - } - - // Apply filters - if c.shouldSkipURL(resolvedURL.String()) { - return - } - - urlStr := resolvedURL.String() - - c.mu.Lock() - defer c.mu.Unlock() - - // Check if already visited - if c.visited[urlStr] { - return - } - - c.visited[urlStr] = true - - // Add to queue - select { - case c.queue <- urlStr: - default: - // Queue is full, skip this URL - } -} - -func (c *Crawler) shouldSkipURL(urlStr string) bool { - // Check language filters - for _, pattern := range LanguageFilter { - if matched, _ := regexp.MatchString(pattern, urlStr); matched { - return true - } - } - - // Check file extension filters - for _, pattern := range FileExtensionFilter { - if matched, _ := regexp.MatchString(pattern, urlStr); matched { - return true - } - } - - // Skip fragments - if strings.Contains(urlStr, "#") { - return true - } - - return false -} - -func (c *Crawler) Start(ctx context.Context) error { - fmt.Printf("Starting crawl of: %s\n", c.config.URL) - fmt.Printf("Output directory: %s\n", c.config.OutputDir) - fmt.Printf("Workers: %d\n", c.config.Workers) - - // Create a cancellable context for workers - workerCtx, cancel := context.WithCancel(ctx) - defer cancel() - - // Add seed URL to queue - c.queue <- c.config.URL - c.visited[c.config.URL] = true - - // Start workers - for i := 0; i < c.config.Workers; i++ { - c.wg.Add(1) - go c.worker(workerCtx) - } - - // Monitor progress and handle completion - done := make(chan struct{}) - go func() { - c.monitor(workerCtx) - close(done) - }() - - // Wait for either completion or cancellation - select { - case <-done: - cancel() // Stop workers - case <-ctx.Done(): - // External cancellation - } - - // Wait for workers to finish - c.wg.Wait() - close(c.queue) - c.bar.Finish() - - fmt.Printf("\nProcessed %d pages\n", c.processed) - return nil -} - -func (c *Crawler) worker(ctx context.Context) { - defer c.wg.Done() - - for { - select { - case <-ctx.Done(): - return - case urlStr, ok := <-c.queue: - if !ok { - return - } - - if err := c.collector.Visit(urlStr); err != nil { - if c.config.Verbose { - log.Printf("Failed to visit %s: %v", urlStr, err) - } - } - } - } -} - -func (c *Crawler) monitor(ctx context.Context) { - ticker := time.NewTicker(2 * time.Second) // Check more frequently - defer ticker.Stop() - - lastProcessed := 0 - noProgressCount := 0 - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - c.mu.RLock() - current := c.processed - queueLen := len(c.queue) - c.mu.RUnlock() - - if current == lastProcessed { - noProgressCount++ - // More aggressive completion detection - if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds 
with no progress and empty queue - (noProgressCount >= 15) { // Or 30 seconds regardless - if c.config.Verbose { - fmt.Println("\nNo progress detected, stopping crawler...") - } - return - } - } else { - noProgressCount = 0 - lastProcessed = current - } - - if c.config.Verbose { - fmt.Printf("Progress: %d pages processed, %d in queue\n", current, queueLen) - } - } - } -} - -// GenerateLLMSFiles creates both llms.txt and llms-full.txt files -func (c *Crawler) GenerateLLMSFiles() error { - if err := c.generateLLMSTxt(); err != nil { - return fmt.Errorf("failed to generate llms.txt: %w", err) - } - - if err := c.generateLLMSFullTxt(); err != nil { - return fmt.Errorf("failed to generate llms-full.txt: %w", err) - } - - return nil -} - -func (c *Crawler) generateLLMSTxt() error { - // Sort pages by URL for consistent output - sortedPages := make([]PageInfo, len(c.pages)) - copy(sortedPages, c.pages) - sort.Slice(sortedPages, func(i, j int) bool { - return sortedPages[i].URL < sortedPages[j].URL - }) - - var content strings.Builder - - // H1 title (required) - siteTitle := c.getSiteTitle() - content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle)) - - // Blockquote summary (optional but recommended) - summary := c.generateSiteSummary() - if summary != "" { - content.WriteString(fmt.Sprintf("> %s\n\n", summary)) - } - - // Additional details - content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n", - c.config.URL, time.Now().Format("January 2, 2006"))) - - // Main documentation section - content.WriteString("## Documentation\n\n") - for _, page := range sortedPages { - if c.isMainDocPage(page) { - description := page.Description - if description == "" { - description = c.extractFirstSentence(page.Content) - } - if description != "" { - content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description)) - } else { - content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL)) - } - } - } - - // Optional section for secondary pages - secondaryPages := c.getSecondaryPages(sortedPages) - if len(secondaryPages) > 0 { - content.WriteString("\n## Optional\n\n") - for _, page := range secondaryPages { - content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL)) - } - } - - // Write to file - filePath := filepath.Join(c.config.OutputDir, "llms.txt") - return os.WriteFile(filePath, []byte(content.String()), 0644) -} - -func (c *Crawler) generateLLMSFullTxt() error { - // Sort pages by URL for consistent output - sortedPages := make([]PageInfo, len(c.pages)) - copy(sortedPages, c.pages) - sort.Slice(sortedPages, func(i, j int) bool { - return sortedPages[i].URL < sortedPages[j].URL - }) - - var content strings.Builder - - // H1 title - siteTitle := c.getSiteTitle() - content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle)) - - // Summary - summary := c.generateSiteSummary() - if summary != "" { - content.WriteString(fmt.Sprintf("> %s\n\n", summary)) - } - - content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n", - c.config.URL, time.Now().Format("January 2, 2006"))) - - content.WriteString("---\n\n") - - // Include full content of each page - for i, page := range sortedPages { - content.WriteString(fmt.Sprintf("## %s\n\n", page.Title)) - content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL)) - - if page.Description != "" { - content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description)) - } - - 
content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339))) - - // Clean and include content - cleanContent := c.cleanContentForLLMS(page.Content) - content.WriteString(cleanContent) - - // Add separator between pages (except for the last one) - if i < len(sortedPages)-1 { - content.WriteString("\n\n---\n\n") - } - } - - // Write to file - filePath := filepath.Join(c.config.OutputDir, "llms-full.txt") - return os.WriteFile(filePath, []byte(content.String()), 0644) -} - -func (c *Crawler) getSiteTitle() string { - // Try to get site title from the main page - for _, page := range c.pages { - if page.URL == c.config.URL || page.URL == c.config.URL+"/" { - if page.Title != "" && page.Title != "Untitled" { - return page.Title - } - } - } - - // Fallback to domain name - return c.baseURL.Host -} - -func (c *Crawler) generateSiteSummary() string { - // Try to get description from the main page - for _, page := range c.pages { - if page.URL == c.config.URL || page.URL == c.config.URL+"/" { - if page.Description != "" { - return page.Description - } - // Extract first meaningful paragraph - return c.extractFirstSentence(page.Content) - } - } - - return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host) -} - -func (c *Crawler) isMainDocPage(page PageInfo) bool { - // Consider a page "main documentation" if it's not in typical secondary sections - lowerURL := strings.ToLower(page.URL) - - // Skip pages that are typically secondary - secondaryIndicators := []string{ - "/blog", "/news", "/archive", "/changelog", "/release", - "/about", "/contact", "/legal", "/privacy", "/terms", - "/community", "/forum", "/discuss", - } - - for _, indicator := range secondaryIndicators { - // Check for the indicator followed by either / or end of URL - if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) { - return false - } - } - - return true -} - -func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo { - var secondary []PageInfo - for _, page := range allPages { - if !c.isMainDocPage(page) { - secondary = append(secondary, page) - } - } - return secondary -} - -func (c *Crawler) extractFirstSentence(content string) string { - // Clean the content and extract the first meaningful sentence - lines := strings.Split(content, "\n") - for _, line := range lines { - line = strings.TrimSpace(line) - // Skip empty lines, headers, and markdown syntax - if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") { - // Find the first sentence - sentences := strings.Split(line, ".") - if len(sentences) > 0 && len(sentences[0]) > 20 { - return strings.TrimSpace(sentences[0]) + "." 
- } - } - } - return "" -} - -func (c *Crawler) cleanContentForLLMS(content string) string { - // Clean the content for better readability in LLMs context - var cleaned strings.Builder - scanner := bufio.NewScanner(strings.NewReader(content)) - - var inCodeBlock bool - for scanner.Scan() { - line := scanner.Text() - - // Handle code blocks - if strings.HasPrefix(strings.TrimSpace(line), "```") { - inCodeBlock = !inCodeBlock - } - - // Skip empty lines unless in code block - if strings.TrimSpace(line) == "" && !inCodeBlock { - continue - } - - cleaned.WriteString(line) - cleaned.WriteString("\n") - } - - return strings.TrimSpace(cleaned.String()) -} diff --git a/main_test.go b/main_test.go deleted file mode 100644 index 8da2925..0000000 --- a/main_test.go +++ /dev/null @@ -1,250 +0,0 @@ -package main - -import ( - "net/url" - "testing" -) - -func TestValidateConfig(t *testing.T) { - tests := []struct { - name string - config *Config - wantErr bool - }{ - { - name: "Valid config", - config: &Config{ - URL: "https://example.com", - OutputDir: "./output", - Workers: 1, - }, - wantErr: false, - }, - { - name: "Empty URL", - config: &Config{ - URL: "", - OutputDir: "./output", - Workers: 1, - }, - wantErr: true, - }, - { - name: "Invalid URL", - config: &Config{ - URL: "not-a-url", - OutputDir: "./output", - Workers: 1, - }, - wantErr: true, - }, - { - name: "Zero workers", - config: &Config{ - URL: "https://example.com", - OutputDir: "./output", - Workers: 0, - }, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := validateConfig(tt.config) - if (err != nil) != tt.wantErr { - t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr) - } - }) - } -} - -func TestCreateFilename(t *testing.T) { - config := &Config{ - URL: "https://example.com", - OutputDir: "./test-output", - Workers: 1, - } - - crawler, err := NewCrawler(config) - if err != nil { - t.Fatalf("Failed to create crawler: %v", err) - } - - tests := []struct { - name string - url string - title string - expected string - }{ - { - name: "Normal title", - url: "https://example.com/about", - title: "About Us", - expected: "about-us.md", - }, - { - name: "Title with special characters", - url: "https://example.com/contact", - title: "Contact Us! 
(Get in Touch)", - expected: "contact-us-get-in-touch.md", - }, - { - name: "Empty title", - url: "https://example.com/services/web-design", - title: "", - expected: "services-web-design.md", - }, - { - name: "Root URL", - url: "https://example.com/", - title: "Homepage", - expected: "homepage.md", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - pageURL, _ := url.Parse(tt.url) - result := crawler.createFilename(pageURL, tt.title) - if result != tt.expected { - t.Errorf("createFilename(%q, %q) = %q, want %q", tt.url, tt.title, result, tt.expected) - } - }) - } -} - -func TestShouldSkipURL(t *testing.T) { - config := &Config{ - URL: "https://example.com", - OutputDir: "./test-output", - Workers: 1, - } - - crawler, err := NewCrawler(config) - if err != nil { - t.Fatalf("Failed to create crawler: %v", err) - } - - tests := []struct { - name string - url string - expected bool - }{ - {"Normal URL", "https://example.com/page", false}, - {"Language URL - en", "https://example.com/en/page", true}, - {"Language URL - zh", "https://example.com/zh/page", true}, - {"Language URL - zh-hant", "https://example.com/zh-hant/page", true}, - {"PDF file", "https://example.com/document.pdf", true}, - {"ZIP file", "https://example.com/archive.zip", true}, - {"Fragment URL", "https://example.com/page#section", true}, - {"Image file", "https://example.com/image.jpg", true}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := crawler.shouldSkipURL(tt.url) - if result != tt.expected { - t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected) - } - }) - } -} - -func TestExtractFirstSentence(t *testing.T) { - config := &Config{ - URL: "https://example.com", - OutputDir: "./test-output", - Workers: 1, - } - - crawler, err := NewCrawler(config) - if err != nil { - t.Fatalf("Failed to create crawler: %v", err) - } - - tests := []struct { - name string - content string - expected string - }{ - { - name: "Simple sentence", - content: "This is a simple sentence about something interesting. 
This is another sentence.", - expected: "This is a simple sentence about something interesting.", - }, - { - name: "With headers", - content: "# Header\n\nThis is the main content that should be extracted as the first sentence.", - expected: "This is the main content that should be extracted as the first sentence.", - }, - { - name: "Short content", - content: "Short text", - expected: "", - }, - { - name: "Empty content", - content: "", - expected: "", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := crawler.extractFirstSentence(tt.content) - if result != tt.expected { - t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected) - } - }) - } -} - -func TestIsMainDocPage(t *testing.T) { - config := &Config{ - URL: "https://example.com", - OutputDir: "./test-output", - Workers: 1, - } - - crawler, err := NewCrawler(config) - if err != nil { - t.Fatalf("Failed to create crawler: %v", err) - } - - tests := []struct { - name string - page PageInfo - expected bool - }{ - { - name: "Main documentation page", - page: PageInfo{URL: "https://example.com/docs/getting-started"}, - expected: true, - }, - { - name: "Blog page", - page: PageInfo{URL: "https://example.com/blog/latest-news"}, - expected: false, - }, - { - name: "About page", - page: PageInfo{URL: "https://example.com/about"}, - expected: false, - }, - { - name: "API documentation", - page: PageInfo{URL: "https://example.com/api/reference"}, - expected: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := crawler.isMainDocPage(tt.page) - if result != tt.expected { - t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected) - } - }) - } -}
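
Reviewer note: as a sanity check on the package split, here is a minimal sketch of how the new internal packages are expected to compose. The main wiring below is hypothetical (the real entry point lives in cmd/site-to-llmstxt, which is outside this diff); only the config, filters, utils, models, and generator APIs are taken from the code added above.

```go
package main

import (
	"fmt"
	"log"
	"net/url"
	"time"

	"github.com/Sosokker/site-to-llmstxt/internal/config"
	"github.com/Sosokker/site-to-llmstxt/internal/filters"
	"github.com/Sosokker/site-to-llmstxt/internal/generator"
	"github.com/Sosokker/site-to-llmstxt/internal/models"
	"github.com/Sosokker/site-to-llmstxt/internal/utils"
)

func main() {
	// Validate CLI input before any crawling starts (config.Config.Validate).
	cfg := &config.Config{URL: "https://example.com", OutputDir: "./output", Workers: 2}
	if err := cfg.Validate(); err != nil {
		log.Fatal(err)
	}

	// Filters decide which discovered links are worth visiting.
	base, _ := url.Parse(cfg.URL) // already validated above
	fmt.Println(filters.ShouldSkipURL("https://example.com/doc.pdf", base.Host)) // true: file download

	// Create the output layout, then emit llms.txt / llms-full.txt for crawled pages.
	if err := utils.CreateOutputDirs(cfg.OutputDir); err != nil {
		log.Fatal(err)
	}
	pages := []models.PageInfo{{
		URL:       "https://example.com/docs",
		Title:     "Docs",
		Content:   "Example content.",
		CrawledAt: time.Now(),
	}}
	if err := generator.New(base, cfg.OutputDir).Generate(pages); err != nil {
		log.Fatal(err)
	}
}
```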