Mirror of https://github.com/Sosokker/site-to-llmstxt.git (synced 2025-12-18 13:34:06 +01:00)

feat: restructure codebase by separating models, progress tracking, and utility functions

Parent: 4aa2c4be52
Commit: c54a27e458
Makefile (71 lines changed)

@@ -1,15 +1,24 @@
# Makefile for site-to-llmstxt crawler

.PHONY: build test clean run help
.PHONY: build test clean run help fmt lint deps dev-setup

# Variables
BINARY_NAME=site-to-llmstxt
CMD_PATH=./cmd/site-to-llmstxt
BUILD_DIR=./bin

# Default target
help:
	@echo "Available targets:"
	@echo " build - Build the crawler binary"
	@echo " test - Run tests"
	@echo " clean - Clean build artifacts"
	@echo " run - Run with example URL (requires URL variable)"
	@echo " install - Install dependencies"
	@echo " build - Build the crawler binary"
	@echo " test - Run tests"
	@echo " test-coverage - Run tests with coverage"
	@echo " clean - Clean build artifacts"
	@echo " run - Run with example URL (requires URL variable)"
	@echo " fmt - Format code"
	@echo " lint - Lint code"
	@echo " deps - Install/update dependencies"
	@echo " dev-setup - Set up development environment"
	@echo ""
	@echo "Examples:"
	@echo " make build"

@@ -18,27 +27,55 @@ help:

# Build the crawler
build:
	@echo "Building crawler..."
	go build -o crawler main.go
	@echo "Build complete: ./crawler"
	@echo "Building $(BINARY_NAME)..."
	@mkdir -p $(BUILD_DIR)
	go build -o $(BUILD_DIR)/$(BINARY_NAME) $(CMD_PATH)
	@echo "Build complete: $(BUILD_DIR)/$(BINARY_NAME)"

# Run tests
test:
	@echo "Running tests..."
	go test -v
	go test -v ./...

# Run tests with coverage
test-coverage:
	@echo "Running tests with coverage..."
	go test -v -coverprofile=coverage.out ./...
	go tool cover -html=coverage.out -o coverage.html
	@echo "Coverage report generated: coverage.html"

# Clean build artifacts
clean:
	@echo "Cleaning..."
	rm -f crawler
	rm -rf $(BUILD_DIR)
	rm -rf output/
	rm -rf test-output/
	rm -rf example-output/
	rm -rf demo-output/
	rm -f coverage.out coverage.html

# Install dependencies
install:
# Format code
fmt:
	@echo "Formatting code..."
	go fmt ./...
	@which goimports > /dev/null && goimports -w . || echo "goimports not found, skipping import formatting"

# Lint code (requires golangci-lint)
lint:
	@echo "Linting code..."
	@which golangci-lint > /dev/null && golangci-lint run || echo "golangci-lint not found, skipping linting"

# Install/update dependencies
deps:
	@echo "Installing dependencies..."
	go mod tidy
	go mod download

# Development setup
dev-setup: deps
	@echo "Setting up development environment..."
	@echo "Installing development tools..."
	go install golang.org/x/tools/cmd/goimports@latest
	@echo "Development setup complete!"

# Run with parameters (updated for new CLI)
run: build

@@ -47,17 +84,17 @@ run: build
		exit 1; \
	fi
	@echo "Running crawler with URL: $(URL)"
	./crawler \
	$(BUILD_DIR)/$(BINARY_NAME) \
		--url $(URL) \
		$(if $(WORKERS),--workers $(WORKERS)) \
		$(if $(OUTPUT),--output $(OUTPUT)) \
		$(if $(VERBOSE),--verbose)

# Build and test everything
all: clean install build test
all: clean deps fmt build test
	@echo "All tasks completed successfully!"

# Quick test with a small site
demo: build
	@echo "Running demo crawl of httpbin.org..."
	./crawler --url https://httpbin.org --output ./demo-output --workers 1 --verbose
	$(BUILD_DIR)/$(BINARY_NAME) --url https://httpbin.org --output ./demo-output --workers 1 --verbose
internal/config/config.go (new file, 42 lines)

@@ -0,0 +1,42 @@
package config

import (
	"fmt"
	"net/url"
)

const (
	DefaultWorkers = 1
	DefaultOutputDir = "./output"
	MarkdownSubdir = "pages"
)

// Config holds crawler configuration.
type Config struct {
	URL string
	OutputDir string
	Workers int
	Verbose bool
}

// Validate validates the configuration and returns an error if invalid.
func (c *Config) Validate() error {
	if c.URL == "" {
		return fmt.Errorf("URL is required")
	}

	u, err := url.Parse(c.URL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("URL must have http or https scheme")
	}

	if c.Workers <= 0 {
		return fmt.Errorf("workers must be greater than 0")
	}

	return nil
}
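As a usage illustration only (not part of this commit), the new config package might be exercised like this before starting a crawl; the field values are hypothetical:

package main

import (
	"log"

	"github.com/Sosokker/site-to-llmstxt/internal/config"
)

func main() {
	// Hypothetical values; Validate rejects empty URLs, non-http(s) schemes,
	// and non-positive worker counts.
	cfg := &config.Config{
		URL:       "https://example.com/docs",
		OutputDir: config.DefaultOutputDir,
		Workers:   config.DefaultWorkers,
	}
	if err := cfg.Validate(); err != nil {
		log.Fatalf("invalid configuration: %v", err)
	}
	log.Printf("crawling %s with %d worker(s)", cfg.URL, cfg.Workers)
}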
internal/config/config_test.go (new file, 57 lines)

@@ -0,0 +1,57 @@
package config

import "testing"

func TestConfig_Validate(t *testing.T) {
	tests := []struct {
		name string
		config *Config
		wantErr bool
	}{
		{
			name: "Valid config",
			config: &Config{
				URL: "https://example.com",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: false,
		},
		{
			name: "Empty URL",
			config: &Config{
				URL: "",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: true,
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL: "not-a-url",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: true,
		},
		{
			name: "Zero workers",
			config: &Config{
				URL: "https://example.com",
				OutputDir: "./output",
				Workers: 0,
			},
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := tt.config.Validate()
			if (err != nil) != tt.wantErr {
				t.Errorf("Config.Validate() error = %v, wantErr %v", err, tt.wantErr)
			}
		})
	}
}
internal/filters/filters.go (new file, 86 lines)

@@ -0,0 +1,86 @@
package filters

import (
	"net/url"
	"strings"
)

// LanguageIndicators are URL patterns that indicate language-specific pages.
var LanguageIndicators = []string{
	"/en/", "/zh/", "/fr/", "/de/", "/es/", "/it/", "/ja/", "/ko/",
	"/pt/", "/ru/", "/ar/", "/hi/", "/th/", "/vi/", "/id/", "/ms/",
	"/tl/", "/zh-cn/", "/zh-tw/", "/zh-hk/", "/zh-hant/", "/zh-hans/",
	"/en-us/", "/en-gb/", "/fr-fr/", "/de-de/", "/es-es/", "/pt-br/",
	"/pt-pt/", "/ja-jp/", "/ko-kr/", "/it-it/", "/ru-ru/", "/ar-sa/",
}

// FileExtensions are file extensions that should be skipped.
var FileExtensions = []string{
	".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
	".zip", ".rar", ".tar", ".gz", ".7z", ".bz2",
	".mp3", ".mp4", ".avi", ".mov", ".wmv", ".flv", ".webm",
	".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp",
	".exe", ".msi", ".dmg", ".deb", ".rpm", ".pkg",
}

// SecondaryPageIndicators are URL patterns for secondary content.
var SecondaryPageIndicators = []string{
	"/blog", "/news", "/archive", "/changelog", "/release",
	"/about", "/contact", "/legal", "/privacy", "/terms",
	"/community", "/forum", "/discuss",
}

// ShouldSkipURL determines if a URL should be skipped based on various filters.
func ShouldSkipURL(rawURL, baseHost string) bool {
	if rawURL == "" {
		return true
	}

	// Parse URL
	u, err := url.Parse(rawURL)
	if err != nil {
		return true
	}

	// Skip external domains
	if u.Host != "" && u.Host != baseHost {
		return true
	}

	// Skip fragments
	if u.Fragment != "" {
		return true
	}

	lowerURL := strings.ToLower(rawURL)

	// Skip language variants
	for _, lang := range LanguageIndicators {
		if strings.Contains(lowerURL, lang) {
			return true
		}
	}

	// Skip file downloads
	for _, ext := range FileExtensions {
		if strings.HasSuffix(lowerURL, ext) {
			return true
		}
	}

	return false
}

// IsMainDocPage determines if a page is main documentation or secondary content.
func IsMainDocPage(pageURL string) bool {
	lowerURL := strings.ToLower(pageURL)

	for _, indicator := range SecondaryPageIndicators {
		// Check for the indicator followed by either / or end of URL
		if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) {
			return false
		}
	}

	return true
}
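A small sketch (not part of the commit) of how the extracted filter helpers might be called; the URLs are examples only:

package main

import (
	"fmt"

	"github.com/Sosokker/site-to-llmstxt/internal/filters"
)

func main() {
	// Example URLs only.
	fmt.Println(filters.ShouldSkipURL("https://example.com/en/docs", "example.com")) // true: language variant
	fmt.Println(filters.ShouldSkipURL("https://example.com/docs", "example.com"))    // false: crawlable
	fmt.Println(filters.IsMainDocPage("https://example.com/blog/post"))              // false: secondary content
}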
internal/filters/filters_test.go (new file, 105 lines)

@@ -0,0 +1,105 @@
package filters

import "testing"

func TestShouldSkipURL(t *testing.T) {
	tests := []struct {
		name string
		url string
		baseHost string
		want bool
	}{
		{
			name: "Normal URL",
			url: "https://example.com/docs",
			baseHost: "example.com",
			want: false,
		},
		{
			name: "Language URL - en",
			url: "https://example.com/en/docs",
			baseHost: "example.com",
			want: true,
		},
		{
			name: "Language URL - zh",
			url: "https://example.com/zh/docs",
			baseHost: "example.com",
			want: true,
		},
		{
			name: "PDF file",
			url: "https://example.com/doc.pdf",
			baseHost: "example.com",
			want: true,
		},
		{
			name: "ZIP file",
			url: "https://example.com/download.zip",
			baseHost: "example.com",
			want: true,
		},
		{
			name: "Fragment URL",
			url: "https://example.com/docs#section",
			baseHost: "example.com",
			want: true,
		},
		{
			name: "External domain",
			url: "https://other.com/docs",
			baseHost: "example.com",
			want: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := ShouldSkipURL(tt.url, tt.baseHost); got != tt.want {
				t.Errorf("ShouldSkipURL() = %v, want %v", got, tt.want)
			}
		})
	}
}

func TestIsMainDocPage(t *testing.T) {
	tests := []struct {
		name string
		url string
		want bool
	}{
		{
			name: "Main documentation page",
			url: "https://example.com/docs/api",
			want: true,
		},
		{
			name: "Blog page",
			url: "https://example.com/blog/latest-news",
			want: false,
		},
		{
			name: "About page",
			url: "https://example.com/about",
			want: false,
		},
		{
			name: "API documentation",
			url: "https://example.com/api/reference",
			want: true,
		},
		{
			name: "Contact page",
			url: "https://example.com/contact",
			want: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := IsMainDocPage(tt.url); got != tt.want {
				t.Errorf("IsMainDocPage() = %v, want %v", got, tt.want)
			}
		})
	}
}
internal/generator/llms.go (new file, 198 lines)

@@ -0,0 +1,198 @@
package generator

import (
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/Sosokker/site-to-llmstxt/internal/filters"
	"github.com/Sosokker/site-to-llmstxt/internal/models"
	"github.com/Sosokker/site-to-llmstxt/internal/utils"
)

// LLMsGenerator generates LLMs.txt format files.
type LLMsGenerator struct {
	baseURL *url.URL
	outputDir string
}

// New creates a new LLMs.txt generator.
func New(baseURL *url.URL, outputDir string) *LLMsGenerator {
	return &LLMsGenerator{
		baseURL: baseURL,
		outputDir: outputDir,
	}
}

// Generate creates both llms.txt and llms-full.txt files.
func (g *LLMsGenerator) Generate(pages []models.PageInfo) error {
	if err := g.generateLLMsFile(pages); err != nil {
		return fmt.Errorf("failed to generate llms.txt: %w", err)
	}

	if err := g.generateFullFile(pages); err != nil {
		return fmt.Errorf("failed to generate llms-full.txt: %w", err)
	}

	return nil
}

func (g *LLMsGenerator) generateLLMsFile(pages []models.PageInfo) error {
	var content strings.Builder

	// Header
	siteName := g.baseURL.Host
	if siteName == "" {
		siteName = "Documentation"
	}

	content.WriteString(fmt.Sprintf("# %s\n\n", siteName))

	// Summary from first page or generate one
	summary := g.generateSummary(pages)
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n",
		g.baseURL.String(), time.Now().Format("January 2, 2006")))

	// Main documentation section
	mainPages := g.filterMainPages(pages)
	if len(mainPages) > 0 {
		content.WriteString("## Documentation\n\n")
		g.writePageLinks(&content, mainPages)
	}

	// Optional section for secondary content
	secondaryPages := g.filterSecondaryPages(pages)
	if len(secondaryPages) > 0 {
		content.WriteString("\n## Optional\n\n")
		g.writePageLinks(&content, secondaryPages)
	}

	return g.writeFile("llms.txt", content.String())
}

func (g *LLMsGenerator) generateFullFile(pages []models.PageInfo) error {
	var content strings.Builder

	// Header
	siteName := g.baseURL.Host
	content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteName))

	summary := g.generateSummary(pages)
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n",
		g.baseURL.String(), time.Now().Format("January 2, 2006")))

	content.WriteString(strings.Repeat("-", 80) + "\n\n")

	// Sort pages by URL for consistent output
	sortedPages := make([]models.PageInfo, len(pages))
	copy(sortedPages, pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	// Add each page's content
	for i, page := range sortedPages {
		if i > 0 {
			content.WriteString("\n" + strings.Repeat("-", 80) + "\n\n")
		}

		content.WriteString(fmt.Sprintf("## %s\n\n", page.Title))
		content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL))
		content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339)))

		if page.Content != "" {
			content.WriteString(page.Content + "\n")
		}
	}

	return g.writeFile("llms-full.txt", content.String())
}

func (g *LLMsGenerator) generateSummary(pages []models.PageInfo) string {
	// Try to get summary from the first page (usually homepage)
	if len(pages) > 0 {
		for _, page := range pages {
			if page.Description != "" {
				return page.Description
			}
		}

		// Fallback to first sentence of first page content
		for _, page := range pages {
			if page.Content != "" {
				return utils.ExtractFirstSentence(page.Content)
			}
		}
	}

	return ""
}

func (g *LLMsGenerator) filterMainPages(pages []models.PageInfo) []models.PageInfo {
	var main []models.PageInfo
	for _, page := range pages {
		if filters.IsMainDocPage(page.URL) {
			main = append(main, page)
		}
	}

	// Sort by URL
	sort.Slice(main, func(i, j int) bool {
		return main[i].URL < main[j].URL
	})

	return main
}

func (g *LLMsGenerator) filterSecondaryPages(pages []models.PageInfo) []models.PageInfo {
	var secondary []models.PageInfo
	for _, page := range pages {
		if !filters.IsMainDocPage(page.URL) {
			secondary = append(secondary, page)
		}
	}

	// Sort by URL
	sort.Slice(secondary, func(i, j int) bool {
		return secondary[i].URL < secondary[j].URL
	})

	return secondary
}

func (g *LLMsGenerator) writePageLinks(content *strings.Builder, pages []models.PageInfo) {
	for _, page := range pages {
		title := page.Title
		if title == "" || title == "Untitled" {
			title = "Untitled"
		}

		description := page.Description
		if description == "" && page.Content != "" {
			description = utils.ExtractFirstSentence(page.Content)
		}

		if description != "" {
			content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", title, page.URL, description))
		} else {
			content.WriteString(fmt.Sprintf("- [%s](%s)\n", title, page.URL))
		}
	}
}

func (g *LLMsGenerator) writeFile(filename, content string) error {
	path := filepath.Join(g.outputDir, filename)
	return os.WriteFile(path, []byte(content), 0644)
}
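A usage sketch (not part of the commit) showing how a caller might feed crawled pages into the new generator package; the page data below is fabricated for illustration:

package main

import (
	"log"
	"net/url"
	"time"

	"github.com/Sosokker/site-to-llmstxt/internal/generator"
	"github.com/Sosokker/site-to-llmstxt/internal/models"
	"github.com/Sosokker/site-to-llmstxt/internal/utils"
)

func main() {
	base, _ := url.Parse("https://example.com")
	outputDir := "./output"
	if err := utils.CreateOutputDirs(outputDir); err != nil {
		log.Fatal(err)
	}

	// One fabricated page; a real crawl would collect these.
	pages := []models.PageInfo{{
		URL:         "https://example.com/docs/getting-started",
		Title:       "Getting Started",
		Content:     "Install the binary and run it against your site.",
		Description: "How to install and run the crawler.",
		CrawledAt:   time.Now(),
	}}

	g := generator.New(base, outputDir)
	if err := g.Generate(pages); err != nil { // writes llms.txt and llms-full.txt
		log.Fatal(err)
	}
}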
internal/models/models.go (new file, 41 lines)

@@ -0,0 +1,41 @@
package models

import "time"

// PageInfo represents information about a crawled page.
type PageInfo struct {
	URL string
	Title string
	Content string
	FilePath string
	CrawledAt time.Time
	Description string
}

// Stats holds crawling statistics.
type Stats struct {
	TotalPages int
	MainDocPages int
	SecondaryPages int
	StartTime time.Time
	EndTime time.Time
	Duration time.Duration
	ErrorCount int
	SkippedURLs int
}

// AddError increments the error count.
func (s *Stats) AddError() {
	s.ErrorCount++
}

// AddSkipped increments the skipped URL count.
func (s *Stats) AddSkipped() {
	s.SkippedURLs++
}

// Finish sets the end time and calculates duration.
func (s *Stats) Finish() {
	s.EndTime = time.Now()
	s.Duration = s.EndTime.Sub(s.StartTime)
}
internal/progress/progress.go (new file, 82 lines)

@@ -0,0 +1,82 @@
package progress

import (
	"fmt"
	"strings"
	"time"

	"github.com/schollz/progressbar/v3"

	"github.com/Sosokker/site-to-llmstxt/internal/models"
)

// Manager handles progress tracking and UI updates.
type Manager struct {
	bar *progressbar.ProgressBar
	verbose bool
	stats *models.Stats
}

// New creates a new progress manager.
func New(verbose bool, stats *models.Stats) *Manager {
	bar := progressbar.NewOptions(-1,
		progressbar.OptionSetDescription("Crawling"),
		progressbar.OptionSpinnerType(14),
		progressbar.OptionSetWidth(50),
		progressbar.OptionThrottle(100*time.Millisecond),
	)

	return &Manager{
		bar: bar,
		verbose: verbose,
		stats: stats,
	}
}

// Update updates the progress bar with current status.
func (m *Manager) Update(processed, queued int) {
	if m.verbose {
		fmt.Printf("\rProgress: %d pages processed, %d in queue", processed, queued)
	}
	m.bar.Add(1)
}

// Finish completes the progress bar and shows final statistics.
func (m *Manager) Finish() {
	m.bar.Finish()
	m.showSummary()
}

// showSummary displays a comprehensive summary of the crawling session.
func (m *Manager) showSummary() {
	fmt.Println("\n" + strings.Repeat("=", 60))
	fmt.Println("📊 CRAWLING SUMMARY")
	fmt.Println(strings.Repeat("=", 60))

	// Basic stats
	fmt.Printf("🔍 Total pages crawled: %d\n", m.stats.TotalPages)
	fmt.Printf("📚 Main documentation: %d pages\n", m.stats.MainDocPages)
	fmt.Printf("📝 Secondary content: %d pages\n", m.stats.SecondaryPages)

	// Performance stats
	if m.stats.Duration > 0 {
		pagesPerSecond := float64(m.stats.TotalPages) / m.stats.Duration.Seconds()
		fmt.Printf("⏱️ Duration: %v (%.1f pages/sec)\n",
			m.stats.Duration.Round(time.Second), pagesPerSecond)
	}

	// Error stats
	if m.stats.ErrorCount > 0 || m.stats.SkippedURLs > 0 {
		fmt.Printf("⚠️ Errors: %d, Skipped URLs: %d\n", m.stats.ErrorCount, m.stats.SkippedURLs)
	}

	fmt.Println(strings.Repeat("-", 60))
	fmt.Println("✅ Crawling completed successfully!")
}

// Log outputs a message if verbose mode is enabled.
func (m *Manager) Log(format string, args ...interface{}) {
	if m.verbose {
		fmt.Printf(format+"\n", args...)
	}
}
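A sketch (not part of the commit) of how the progress Manager and models.Stats might be wired together by the crawler loop; the counts are made up:

package main

import (
	"time"

	"github.com/Sosokker/site-to-llmstxt/internal/models"
	"github.com/Sosokker/site-to-llmstxt/internal/progress"
)

func main() {
	stats := &models.Stats{StartTime: time.Now()}
	pm := progress.New(false, stats) // verbose=false: spinner only, no per-page prints

	// A real crawler would call Update once per processed page.
	for processed := 1; processed <= 3; processed++ {
		stats.TotalPages++
		pm.Update(processed, 10-processed)
	}

	stats.Finish() // record EndTime/Duration before printing the summary
	pm.Finish()    // prints the "CRAWLING SUMMARY" block from stats
}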
internal/utils/utils.go (new file, 136 lines)

@@ -0,0 +1,136 @@
package utils

import (
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"
	"unicode"
)

var (
	filenameRegex = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1f]`)
	spaceRegex = regexp.MustCompile(`\s+`)
)

// CreateFilename creates a safe filename from a title and URL.
func CreateFilename(title, rawURL string) string {
	if title == "" || title == "Untitled" {
		// Extract from URL path
		if rawURL != "" {
			u, err := url.Parse(rawURL)
			if err == nil && u.Path != "" && u.Path != "/" {
				parts := strings.Split(strings.Trim(u.Path, "/"), "/")
				if len(parts) > 0 && parts[len(parts)-1] != "" {
					title = parts[len(parts)-1]
				}
			}
		}
		if title == "" {
			title = "index"
		}
	}

	// Clean the filename
	cleaned := filenameRegex.ReplaceAllString(title, "")
	cleaned = spaceRegex.ReplaceAllString(cleaned, "-")
	cleaned = strings.Trim(cleaned, "-.")

	if cleaned == "" {
		cleaned = "untitled"
	}

	return cleaned + ".md"
}

// ExtractFirstSentence extracts the first meaningful sentence from content.
func ExtractFirstSentence(content string) string {
	if content == "" {
		return ""
	}

	// Remove markdown headers and clean up
	lines := strings.Split(content, "\n")
	var text strings.Builder

	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Remove markdown formatting
		line = strings.ReplaceAll(line, "**", "")
		line = strings.ReplaceAll(line, "*", "")
		line = strings.ReplaceAll(line, "`", "")

		if line != "" {
			text.WriteString(line)
			text.WriteString(" ")
		}
	}

	cleaned := strings.TrimSpace(text.String())
	if len(cleaned) == 0 {
		return ""
	}

	// Find first sentence ending
	for i, r := range cleaned {
		if r == '.' || r == '!' || r == '?' {
			// Make sure it's not just a decimal or abbreviation
			if i+1 < len(cleaned) && unicode.IsSpace(rune(cleaned[i+1])) {
				sentence := strings.TrimSpace(cleaned[:i+1])
				if len(sentence) > 20 { // Only return substantial sentences
					return sentence
				}
			}
		}
	}

	// If no sentence ending found, return first ~200 chars
	if len(cleaned) > 200 {
		words := strings.Fields(cleaned[:200])
		if len(words) > 1 {
			// Remove last word to avoid cutting mid-word
			return strings.Join(words[:len(words)-1], " ") + "..."
		}
	}

	return cleaned
}

// FormatDuration formats a duration into a human-readable string.
func FormatDuration(d time.Duration) string {
	if d < time.Minute {
		return fmt.Sprintf("%.1fs", d.Seconds())
	}
	if d < time.Hour {
		return fmt.Sprintf("%.1fm", d.Minutes())
	}
	return fmt.Sprintf("%.1fh", d.Hours())
}

// EnsureDir creates a directory if it doesn't exist.
func EnsureDir(dir string) error {
	return os.MkdirAll(dir, 0755)
}

// CreateOutputDirs creates all necessary output directories.
func CreateOutputDirs(outputDir string) error {
	dirs := []string{
		outputDir,
		filepath.Join(outputDir, "pages"),
	}

	for _, dir := range dirs {
		if err := EnsureDir(dir); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir, err)
		}
	}

	return nil
}
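For illustration only (not in the commit), the utility helpers behave roughly like this; the inputs are examples:

package main

import (
	"fmt"
	"time"

	"github.com/Sosokker/site-to-llmstxt/internal/utils"
)

func main() {
	// Safe markdown filenames from page titles or URL paths.
	fmt.Println(utils.CreateFilename("Getting Started", "https://example.com/start")) // "Getting-Started.md"
	fmt.Println(utils.CreateFilename("", "https://example.com/api/reference"))        // "reference.md"

	// First-sentence extraction used for llms.txt descriptions.
	fmt.Println(utils.ExtractFirstSentence("# Title\n\nThis is the first sentence. More text."))

	// Human-readable durations for summary output.
	fmt.Println(utils.FormatDuration(90 * time.Second)) // "1.5m"
}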
internal/utils/utils_test.go (new file, 82 lines)

@@ -0,0 +1,82 @@
package utils

import "testing"

func TestCreateFilename(t *testing.T) {
	tests := []struct {
		name string
		title string
		rawURL string
		want string
	}{
		{
			name: "Normal title",
			title: "Getting Started",
			rawURL: "https://example.com/getting-started",
			want: "Getting-Started.md",
		},
		{
			name: "Title with special characters",
			title: "API Reference: <Advanced>",
			rawURL: "https://example.com/api",
			want: "API-Reference-Advanced.md",
		},
		{
			name: "Empty title",
			title: "",
			rawURL: "https://example.com/api/reference",
			want: "reference.md",
		},
		{
			name: "Root URL",
			title: "",
			rawURL: "https://example.com/",
			want: "index.md",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := CreateFilename(tt.title, tt.rawURL); got != tt.want {
				t.Errorf("CreateFilename() = %v, want %v", got, tt.want)
			}
		})
	}
}

func TestExtractFirstSentence(t *testing.T) {
	tests := []struct {
		name string
		content string
		want string
	}{
		{
			name: "Simple sentence",
			content: "This is a simple sentence. This is another sentence.",
			want: "This is a simple sentence.",
		},
		{
			name: "With headers",
			content: "# Header\n\nThis is the first sentence. Another sentence follows.",
			want: "This is the first sentence.",
		},
		{
			name: "Short content",
			content: "Short content without period",
			want: "Short content without period",
		},
		{
			name: "Empty content",
			content: "",
			want: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := ExtractFirstSentence(tt.content); got != tt.want {
				t.Errorf("ExtractFirstSentence() = %v, want %v", got, tt.want)
			}
		})
	}
}
main.go (deleted, 800 lines)

@@ -1,800 +0,0 @@
package main

import (
	"bufio"
	"context"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
	"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
	"github.com/schollz/progressbar/v3"
	"github.com/urfave/cli/v2"
)

const (
	// DefaultWorkers is the default number of concurrent workers
	DefaultWorkers = 1
	// DefaultOutputDir is the default output directory
	DefaultOutputDir = "./output"
	// MarkdownSubdir is the subdirectory for markdown files
	MarkdownSubdir = "pages"
)

// Config holds crawler configuration
type Config struct {
	URL string
	OutputDir string
	Workers int
	Verbose bool
}

// PageInfo represents information about a crawled page
type PageInfo struct {
	URL string
	Title string
	Content string
	FilePath string
	CrawledAt time.Time
	Description string
}

// Crawler manages the web crawling process
type Crawler struct {
	config *Config
	collector *colly.Collector
	converter *converter.Converter
	visited map[string]bool
	queue chan string
	wg sync.WaitGroup
	mu sync.RWMutex
	baseURL *url.URL
	bar *progressbar.ProgressBar
	processed int
	pages []PageInfo
	pagesMutex sync.Mutex
}

// LanguageFilter contains patterns to exclude language-specific URLs
var LanguageFilter = []string{
	`/en/`, `/en$`,
	`/zh/`, `/zh$`, `/zh-cn/`, `/zh-cn$`, `/zh-tw/`, `/zh-tw$`, `/zh-hant/`, `/zh-hant$`,
	`/ja/`, `/ja$`,
	`/ko/`, `/ko$`,
	`/fr/`, `/fr$`,
	`/de/`, `/de$`,
	`/es/`, `/es$`,
	`/it/`, `/it$`,
	`/pt/`, `/pt$`,
	`/ru/`, `/ru$`,
}

// FileExtensionFilter contains patterns to exclude file downloads
var FileExtensionFilter = []string{
	`\.pdf$`, `\.doc$`, `\.docx$`, `\.xls$`, `\.xlsx$`, `\.ppt$`, `\.pptx$`,
	`\.zip$`, `\.rar$`, `\.tar$`, `\.gz$`, `\.7z$`,
	`\.mp3$`, `\.mp4$`, `\.avi$`, `\.mov$`, `\.wmv$`,
	`\.jpg$`, `\.jpeg$`, `\.png$`, `\.gif$`, `\.bmp$`, `\.svg$`,
	`\.exe$`, `\.msi$`, `\.dmg$`, `\.deb$`, `\.rpm$`,
}

func main() {
	app := &cli.App{
		Name: "site-to-llmstxt",
		Usage: "Web crawler that converts websites to LLMs.txt format",
		Description: `A high-performance web crawler that scrapes websites and converts them to LLMs.txt format.

The crawler generates:
- llms.txt: A curated overview following the LLMs.txt specification
- llms-full.txt: Complete content of all crawled pages
- pages/: Directory containing individual markdown files

The crawler respects robots.txt, filters out language variants and file downloads,
and only crawls within the same domain.`,
		Version: "1.0.0",
		Authors: []*cli.Author{
			{
				Name: "Site-to-LLMsTxt",
			},
		},
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name: "url",
				Aliases: []string{"u"},
				Usage: "Root URL to crawl (required)",
				Required: true,
			},
			&cli.StringFlag{
				Name: "output",
				Aliases: []string{"o"},
				Usage: "Output directory",
				Value: DefaultOutputDir,
			},
			&cli.IntFlag{
				Name: "workers",
				Aliases: []string{"w"},
				Usage: "Number of concurrent workers",
				Value: DefaultWorkers,
			},
			&cli.BoolFlag{
				Name: "verbose",
				Usage: "Enable verbose logging",
			},
		},
		Action: func(c *cli.Context) error {
			config := &Config{
				URL: c.String("url"),
				OutputDir: c.String("output"),
				Workers: c.Int("workers"),
				Verbose: c.Bool("verbose"),
			}

			return runCrawler(config)
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}

func runCrawler(config *Config) error {
	if err := validateConfig(config); err != nil {
		return fmt.Errorf("invalid configuration: %w", err)
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		return fmt.Errorf("failed to create crawler: %w", err)
	}

	ctx := context.Background()
	if err := crawler.Start(ctx); err != nil {
		return fmt.Errorf("crawling failed: %w", err)
	}

	if err := crawler.GenerateLLMSFiles(); err != nil {
		return fmt.Errorf("failed to generate LLMS files: %w", err)
	}

	fmt.Printf("\nCrawling completed successfully!\n")
	fmt.Printf("Generated files:\n")
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms.txt"))
	fmt.Printf(" - %s\n", filepath.Join(config.OutputDir, "llms-full.txt"))
	fmt.Printf(" - %s/ (individual pages)\n", filepath.Join(config.OutputDir, MarkdownSubdir))
	fmt.Printf("Total pages crawled: %d\n", len(crawler.pages))

	return nil
}

func validateConfig(config *Config) error {
	if config.URL == "" {
		return fmt.Errorf("URL is required")
	}

	u, err := url.Parse(config.URL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("URL must have http or https scheme")
	}

	if config.Workers <= 0 {
		return fmt.Errorf("workers must be greater than 0")
	}

	return nil
}

// NewCrawler creates a new crawler instance
func NewCrawler(config *Config) (*Crawler, error) {
	baseURL, err := url.Parse(config.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse base URL: %w", err)
	}

	// Create output directory structure
	if err := createOutputDirs(config.OutputDir); err != nil {
		return nil, fmt.Errorf("failed to create output directories: %w", err)
	}

	// Setup colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(baseURL.Host),
	)

	if config.Verbose {
		c.SetDebugger(&debug.LogDebugger{})
	}

	// Rate limiting
	c.Limit(&colly.LimitRule{
		DomainGlob: "*",
		Parallelism: config.Workers,
		Delay: 200 * time.Millisecond, // Slightly more conservative
	})

	// Setup HTML to Markdown converter
	conv := converter.NewConverter(
		converter.WithPlugins(
			base.NewBasePlugin(),
			commonmark.NewCommonmarkPlugin(),
		),
	)

	crawler := &Crawler{
		config: config,
		collector: c,
		converter: conv,
		visited: make(map[string]bool),
		queue: make(chan string, 1000),
		baseURL: baseURL,
		bar: progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")),
		pages: make([]PageInfo, 0),
	}

	crawler.setupCallbacks()

	return crawler, nil
}

func createOutputDirs(outputDir string) error {
	dirs := []string{
		outputDir,
		filepath.Join(outputDir, MarkdownSubdir),
	}

	for _, dir := range dirs {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir, err)
		}
	}

	return nil
}

func (c *Crawler) setupCallbacks() {
	// Handle HTML content
	c.collector.OnHTML("html", func(e *colly.HTMLElement) {
		c.processPage(e)
	})

	// Extract links
	c.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		c.addToQueue(link, e.Request.URL)
	})

	// Request callback
	c.collector.OnRequest(func(r *colly.Request) {
		if c.config.Verbose {
			fmt.Printf("Visiting: %s\n", r.URL)
		}
		c.bar.Add(1)
	})

	// Error handling
	c.collector.OnError(func(r *colly.Response, err error) {
		log.Printf("Error visiting %s: %v", r.Request.URL, err)
	})
}

func (c *Crawler) processPage(e *colly.HTMLElement) {
	// Get page title
	title := strings.TrimSpace(e.ChildText("title"))
	if title == "" {
		title = "Untitled"
	}

	// Get meta description
	description := strings.TrimSpace(e.ChildAttr("meta[name='description']", "content"))
	if description == "" {
		// Try og:description
		description = strings.TrimSpace(e.ChildAttr("meta[property='og:description']", "content"))
	}

	// Convert HTML to Markdown
	html, err := e.DOM.Html()
	if err != nil {
		log.Printf("Failed to get HTML for %s: %v", e.Request.URL, err)
		return
	}

	markdown, err := c.converter.ConvertString(html)
	if err != nil {
		log.Printf("Failed to convert HTML to Markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Create page info
	pageInfo := PageInfo{
		URL: e.Request.URL.String(),
		Title: title,
		Content: markdown,
		CrawledAt: time.Now(),
		Description: description,
	}

	// Save individual markdown file
	filename := c.createFilename(e.Request.URL, title)
	pageInfo.FilePath = filepath.Join(MarkdownSubdir, filename)
	fullPath := filepath.Join(c.config.OutputDir, pageInfo.FilePath)

	if err := c.saveMarkdown(fullPath, pageInfo); err != nil {
		log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err)
		return
	}

	// Add to pages collection
	c.pagesMutex.Lock()
	c.pages = append(c.pages, pageInfo)
	c.pagesMutex.Unlock()

	c.mu.Lock()
	c.processed++
	c.mu.Unlock()
}

func (c *Crawler) saveMarkdown(filePath string, pageInfo PageInfo) error {
	// Ensure directory exists
	dir := filepath.Dir(filePath)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	// Create content with metadata
	content := fmt.Sprintf(`# %s

URL: %s
Crawled: %s
%s

---

%s`,
		pageInfo.Title,
		pageInfo.URL,
		pageInfo.CrawledAt.Format(time.RFC3339),
		func() string {
			if pageInfo.Description != "" {
				return fmt.Sprintf("Description: %s", pageInfo.Description)
			}
			return ""
		}(),
		pageInfo.Content)

	// Write file
	if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
		return fmt.Errorf("failed to write file %s: %w", filePath, err)
	}

	return nil
}

func (c *Crawler) createFilename(pageURL *url.URL, title string) string {
	// Clean title for filename
	filename := strings.TrimSpace(title)
	filename = regexp.MustCompile(`[^a-zA-Z0-9\-_\s]`).ReplaceAllString(filename, "")
	filename = regexp.MustCompile(`\s+`).ReplaceAllString(filename, "-")
	filename = strings.ToLower(filename)

	if filename == "" || filename == "untitled" {
		// Use URL path
		urlPath := strings.Trim(pageURL.Path, "/")
		if urlPath == "" {
			urlPath = "index"
		}
		filename = strings.ReplaceAll(urlPath, "/", "-")
	}

	// Limit filename length
	if len(filename) > 100 {
		filename = filename[:100]
	}

	// Ensure .md extension
	if !strings.HasSuffix(filename, ".md") {
		filename += ".md"
	}

	return filename
}

func (c *Crawler) addToQueue(link string, baseURL *url.URL) {
	// Parse and resolve URL
	linkURL, err := url.Parse(link)
	if err != nil {
		return
	}

	resolvedURL := baseURL.ResolveReference(linkURL)

	// Check if it's within the same domain
	if resolvedURL.Host != c.baseURL.Host {
		return
	}

	// Apply filters
	if c.shouldSkipURL(resolvedURL.String()) {
		return
	}

	urlStr := resolvedURL.String()

	c.mu.Lock()
	defer c.mu.Unlock()

	// Check if already visited
	if c.visited[urlStr] {
		return
	}

	c.visited[urlStr] = true

	// Add to queue
	select {
	case c.queue <- urlStr:
	default:
		// Queue is full, skip this URL
	}
}

func (c *Crawler) shouldSkipURL(urlStr string) bool {
	// Check language filters
	for _, pattern := range LanguageFilter {
		if matched, _ := regexp.MatchString(pattern, urlStr); matched {
			return true
		}
	}

	// Check file extension filters
	for _, pattern := range FileExtensionFilter {
		if matched, _ := regexp.MatchString(pattern, urlStr); matched {
			return true
		}
	}

	// Skip fragments
	if strings.Contains(urlStr, "#") {
		return true
	}

	return false
}

func (c *Crawler) Start(ctx context.Context) error {
	fmt.Printf("Starting crawl of: %s\n", c.config.URL)
	fmt.Printf("Output directory: %s\n", c.config.OutputDir)
	fmt.Printf("Workers: %d\n", c.config.Workers)

	// Create a cancellable context for workers
	workerCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Add seed URL to queue
	c.queue <- c.config.URL
	c.visited[c.config.URL] = true

	// Start workers
	for i := 0; i < c.config.Workers; i++ {
		c.wg.Add(1)
		go c.worker(workerCtx)
	}

	// Monitor progress and handle completion
	done := make(chan struct{})
	go func() {
		c.monitor(workerCtx)
		close(done)
	}()

	// Wait for either completion or cancellation
	select {
	case <-done:
		cancel() // Stop workers
	case <-ctx.Done():
		// External cancellation
	}

	// Wait for workers to finish
	c.wg.Wait()
	close(c.queue)
	c.bar.Finish()

	fmt.Printf("\nProcessed %d pages\n", c.processed)
	return nil
}

func (c *Crawler) worker(ctx context.Context) {
	defer c.wg.Done()

	for {
		select {
		case <-ctx.Done():
			return
		case urlStr, ok := <-c.queue:
			if !ok {
				return
			}

			if err := c.collector.Visit(urlStr); err != nil {
				if c.config.Verbose {
					log.Printf("Failed to visit %s: %v", urlStr, err)
				}
			}
		}
	}
}

func (c *Crawler) monitor(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Second) // Check more frequently
	defer ticker.Stop()

	lastProcessed := 0
	noProgressCount := 0

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			c.mu.RLock()
			current := c.processed
			queueLen := len(c.queue)
			c.mu.RUnlock()

			if current == lastProcessed {
				noProgressCount++
				// More aggressive completion detection
				if (noProgressCount >= 3 && queueLen == 0) || // 6 seconds with no progress and empty queue
					(noProgressCount >= 15) { // Or 30 seconds regardless
					if c.config.Verbose {
						fmt.Println("\nNo progress detected, stopping crawler...")
					}
					return
				}
			} else {
				noProgressCount = 0
				lastProcessed = current
			}

			if c.config.Verbose {
				fmt.Printf("Progress: %d pages processed, %d in queue\n", current, queueLen)
			}
		}
	}
}

// GenerateLLMSFiles creates both llms.txt and llms-full.txt files
func (c *Crawler) GenerateLLMSFiles() error {
	if err := c.generateLLMSTxt(); err != nil {
		return fmt.Errorf("failed to generate llms.txt: %w", err)
	}

	if err := c.generateLLMSFullTxt(); err != nil {
		return fmt.Errorf("failed to generate llms-full.txt: %w", err)
	}

	return nil
}

func (c *Crawler) generateLLMSTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title (required)
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s\n\n", siteTitle))

	// Blockquote summary (optional but recommended)
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	// Additional details
	content.WriteString(fmt.Sprintf("This documentation was automatically crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	// Main documentation section
	content.WriteString("## Documentation\n\n")
	for _, page := range sortedPages {
		if c.isMainDocPage(page) {
			description := page.Description
			if description == "" {
				description = c.extractFirstSentence(page.Content)
			}
			if description != "" {
				content.WriteString(fmt.Sprintf("- [%s](%s): %s\n", page.Title, page.URL, description))
			} else {
				content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
			}
		}
	}

	// Optional section for secondary pages
	secondaryPages := c.getSecondaryPages(sortedPages)
	if len(secondaryPages) > 0 {
		content.WriteString("\n## Optional\n\n")
		for _, page := range secondaryPages {
			content.WriteString(fmt.Sprintf("- [%s](%s)\n", page.Title, page.URL))
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}

func (c *Crawler) generateLLMSFullTxt() error {
	// Sort pages by URL for consistent output
	sortedPages := make([]PageInfo, len(c.pages))
	copy(sortedPages, c.pages)
	sort.Slice(sortedPages, func(i, j int) bool {
		return sortedPages[i].URL < sortedPages[j].URL
	})

	var content strings.Builder

	// H1 title
	siteTitle := c.getSiteTitle()
	content.WriteString(fmt.Sprintf("# %s - Complete Documentation\n\n", siteTitle))

	// Summary
	summary := c.generateSiteSummary()
	if summary != "" {
		content.WriteString(fmt.Sprintf("> %s\n\n", summary))
	}

	content.WriteString(fmt.Sprintf("This file contains the complete content of all pages crawled from %s on %s.\n\n",
		c.config.URL, time.Now().Format("January 2, 2006")))

	content.WriteString("---\n\n")

	// Include full content of each page
	for i, page := range sortedPages {
		content.WriteString(fmt.Sprintf("## %s\n\n", page.Title))
		content.WriteString(fmt.Sprintf("**URL:** %s\n\n", page.URL))

		if page.Description != "" {
			content.WriteString(fmt.Sprintf("**Description:** %s\n\n", page.Description))
		}

		content.WriteString(fmt.Sprintf("**Crawled:** %s\n\n", page.CrawledAt.Format(time.RFC3339)))

		// Clean and include content
		cleanContent := c.cleanContentForLLMS(page.Content)
		content.WriteString(cleanContent)

		// Add separator between pages (except for the last one)
		if i < len(sortedPages)-1 {
			content.WriteString("\n\n---\n\n")
		}
	}

	// Write to file
	filePath := filepath.Join(c.config.OutputDir, "llms-full.txt")
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}

func (c *Crawler) getSiteTitle() string {
	// Try to get site title from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Title != "" && page.Title != "Untitled" {
				return page.Title
			}
		}
	}

	// Fallback to domain name
	return c.baseURL.Host
}

func (c *Crawler) generateSiteSummary() string {
	// Try to get description from the main page
	for _, page := range c.pages {
		if page.URL == c.config.URL || page.URL == c.config.URL+"/" {
			if page.Description != "" {
				return page.Description
			}
			// Extract first meaningful paragraph
			return c.extractFirstSentence(page.Content)
		}
	}

	return fmt.Sprintf("Documentation and content from %s", c.baseURL.Host)
}

func (c *Crawler) isMainDocPage(page PageInfo) bool {
	// Consider a page "main documentation" if it's not in typical secondary sections
	lowerURL := strings.ToLower(page.URL)

	// Skip pages that are typically secondary
	secondaryIndicators := []string{
		"/blog", "/news", "/archive", "/changelog", "/release",
		"/about", "/contact", "/legal", "/privacy", "/terms",
		"/community", "/forum", "/discuss",
	}

	for _, indicator := range secondaryIndicators {
		// Check for the indicator followed by either / or end of URL
		if strings.Contains(lowerURL, indicator+"/") || strings.HasSuffix(lowerURL, indicator) {
			return false
		}
	}

	return true
}

func (c *Crawler) getSecondaryPages(allPages []PageInfo) []PageInfo {
	var secondary []PageInfo
	for _, page := range allPages {
		if !c.isMainDocPage(page) {
			secondary = append(secondary, page)
		}
	}
	return secondary
}

func (c *Crawler) extractFirstSentence(content string) string {
	// Clean the content and extract the first meaningful sentence
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip empty lines, headers, and markdown syntax
		if len(line) > 50 && !strings.HasPrefix(line, "#") && !strings.HasPrefix(line, "**") {
			// Find the first sentence
			sentences := strings.Split(line, ".")
			if len(sentences) > 0 && len(sentences[0]) > 20 {
				return strings.TrimSpace(sentences[0]) + "."
			}
		}
	}
	return ""
}

func (c *Crawler) cleanContentForLLMS(content string) string {
	// Clean the content for better readability in LLMs context
	var cleaned strings.Builder
	scanner := bufio.NewScanner(strings.NewReader(content))

	var inCodeBlock bool
	for scanner.Scan() {
		line := scanner.Text()

		// Handle code blocks
		if strings.HasPrefix(strings.TrimSpace(line), "```") {
			inCodeBlock = !inCodeBlock
		}

		// Skip empty lines unless in code block
		if strings.TrimSpace(line) == "" && !inCodeBlock {
			continue
		}

		cleaned.WriteString(line)
		cleaned.WriteString("\n")
	}

	return strings.TrimSpace(cleaned.String())
}
main_test.go (deleted, 250 lines)

@@ -1,250 +0,0 @@
package main

import (
	"net/url"
	"testing"
)

func TestValidateConfig(t *testing.T) {
	tests := []struct {
		name string
		config *Config
		wantErr bool
	}{
		{
			name: "Valid config",
			config: &Config{
				URL: "https://example.com",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: false,
		},
		{
			name: "Empty URL",
			config: &Config{
				URL: "",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: true,
		},
		{
			name: "Invalid URL",
			config: &Config{
				URL: "not-a-url",
				OutputDir: "./output",
				Workers: 1,
			},
			wantErr: true,
		},
		{
			name: "Zero workers",
			config: &Config{
				URL: "https://example.com",
				OutputDir: "./output",
				Workers: 0,
			},
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validateConfig(tt.config)
			if (err != nil) != tt.wantErr {
				t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
			}
		})
	}
}

func TestCreateFilename(t *testing.T) {
	config := &Config{
		URL: "https://example.com",
		OutputDir: "./test-output",
		Workers: 1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name string
		url string
		title string
		expected string
	}{
		{
			name: "Normal title",
			url: "https://example.com/about",
			title: "About Us",
			expected: "about-us.md",
		},
		{
			name: "Title with special characters",
			url: "https://example.com/contact",
			title: "Contact Us! (Get in Touch)",
			expected: "contact-us-get-in-touch.md",
		},
		{
			name: "Empty title",
			url: "https://example.com/services/web-design",
			title: "",
			expected: "services-web-design.md",
		},
		{
			name: "Root URL",
			url: "https://example.com/",
			title: "Homepage",
			expected: "homepage.md",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pageURL, _ := url.Parse(tt.url)
			result := crawler.createFilename(pageURL, tt.title)
			if result != tt.expected {
				t.Errorf("createFilename(%q, %q) = %q, want %q", tt.url, tt.title, result, tt.expected)
			}
		})
	}
}

func TestShouldSkipURL(t *testing.T) {
	config := &Config{
		URL: "https://example.com",
		OutputDir: "./test-output",
		Workers: 1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name string
		url string
		expected bool
	}{
		{"Normal URL", "https://example.com/page", false},
		{"Language URL - en", "https://example.com/en/page", true},
		{"Language URL - zh", "https://example.com/zh/page", true},
		{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
		{"PDF file", "https://example.com/document.pdf", true},
		{"ZIP file", "https://example.com/archive.zip", true},
		{"Fragment URL", "https://example.com/page#section", true},
		{"Image file", "https://example.com/image.jpg", true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.shouldSkipURL(tt.url)
			if result != tt.expected {
				t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}

func TestExtractFirstSentence(t *testing.T) {
	config := &Config{
		URL: "https://example.com",
		OutputDir: "./test-output",
		Workers: 1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name string
		content string
		expected string
	}{
		{
			name: "Simple sentence",
			content: "This is a simple sentence about something interesting. This is another sentence.",
			expected: "This is a simple sentence about something interesting.",
		},
		{
			name: "With headers",
			content: "# Header\n\nThis is the main content that should be extracted as the first sentence.",
			expected: "This is the main content that should be extracted as the first sentence.",
		},
		{
			name: "Short content",
			content: "Short text",
			expected: "",
		},
		{
			name: "Empty content",
			content: "",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.extractFirstSentence(tt.content)
			if result != tt.expected {
				t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected)
			}
		})
	}
}

func TestIsMainDocPage(t *testing.T) {
	config := &Config{
		URL: "https://example.com",
		OutputDir: "./test-output",
		Workers: 1,
	}

	crawler, err := NewCrawler(config)
	if err != nil {
		t.Fatalf("Failed to create crawler: %v", err)
	}

	tests := []struct {
		name string
		page PageInfo
		expected bool
	}{
		{
			name: "Main documentation page",
			page: PageInfo{URL: "https://example.com/docs/getting-started"},
			expected: true,
		},
		{
			name: "Blog page",
			page: PageInfo{URL: "https://example.com/blog/latest-news"},
			expected: false,
		},
		{
			name: "About page",
			page: PageInfo{URL: "https://example.com/about"},
			expected: false,
		},
		{
			name: "API documentation",
			page: PageInfo{URL: "https://example.com/api/reference"},
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.isMainDocPage(tt.page)
			if result != tt.expected {
				t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected)
			}
		})
	}
}