mirror of
https://github.com/Sosokker/site-to-llmstxt.git
synced 2025-12-18 21:44:06 +01:00
251 lines
5.4 KiB
Go
251 lines
5.4 KiB
Go
package main
|
|
|
|
import (
|
|
"net/url"
|
|
"testing"
|
|
)
|
|
|
|
func TestValidateConfig(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
config *Config
|
|
wantErr bool
|
|
}{
|
|
{
|
|
name: "Valid config",
|
|
config: &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./output",
|
|
Workers: 1,
|
|
},
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "Empty URL",
|
|
config: &Config{
|
|
URL: "",
|
|
OutputDir: "./output",
|
|
Workers: 1,
|
|
},
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "Invalid URL",
|
|
config: &Config{
|
|
URL: "not-a-url",
|
|
OutputDir: "./output",
|
|
Workers: 1,
|
|
},
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "Zero workers",
|
|
config: &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./output",
|
|
Workers: 0,
|
|
},
|
|
wantErr: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
err := validateConfig(tt.config)
|
|
if (err != nil) != tt.wantErr {
|
|
t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCreateFilename(t *testing.T) {
|
|
config := &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./test-output",
|
|
Workers: 1,
|
|
}
|
|
|
|
crawler, err := NewCrawler(config)
|
|
if err != nil {
|
|
t.Fatalf("Failed to create crawler: %v", err)
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
url string
|
|
title string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "Normal title",
|
|
url: "https://example.com/about",
|
|
title: "About Us",
|
|
expected: "about-us.md",
|
|
},
|
|
{
|
|
name: "Title with special characters",
|
|
url: "https://example.com/contact",
|
|
title: "Contact Us! (Get in Touch)",
|
|
expected: "contact-us-get-in-touch.md",
|
|
},
|
|
{
|
|
name: "Empty title",
|
|
url: "https://example.com/services/web-design",
|
|
title: "",
|
|
expected: "services-web-design.md",
|
|
},
|
|
{
|
|
name: "Root URL",
|
|
url: "https://example.com/",
|
|
title: "Homepage",
|
|
expected: "homepage.md",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
pageURL, _ := url.Parse(tt.url)
|
|
result := crawler.createFilename(pageURL, tt.title)
|
|
if result != tt.expected {
|
|
t.Errorf("createFilename(%q, %q) = %q, want %q", tt.url, tt.title, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestShouldSkipURL(t *testing.T) {
|
|
config := &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./test-output",
|
|
Workers: 1,
|
|
}
|
|
|
|
crawler, err := NewCrawler(config)
|
|
if err != nil {
|
|
t.Fatalf("Failed to create crawler: %v", err)
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
url string
|
|
expected bool
|
|
}{
|
|
{"Normal URL", "https://example.com/page", false},
|
|
{"Language URL - en", "https://example.com/en/page", true},
|
|
{"Language URL - zh", "https://example.com/zh/page", true},
|
|
{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
|
|
{"PDF file", "https://example.com/document.pdf", true},
|
|
{"ZIP file", "https://example.com/archive.zip", true},
|
|
{"Fragment URL", "https://example.com/page#section", true},
|
|
{"Image file", "https://example.com/image.jpg", true},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := crawler.shouldSkipURL(tt.url)
|
|
if result != tt.expected {
|
|
t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractFirstSentence(t *testing.T) {
|
|
config := &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./test-output",
|
|
Workers: 1,
|
|
}
|
|
|
|
crawler, err := NewCrawler(config)
|
|
if err != nil {
|
|
t.Fatalf("Failed to create crawler: %v", err)
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
content string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "Simple sentence",
|
|
content: "This is a simple sentence about something interesting. This is another sentence.",
|
|
expected: "This is a simple sentence about something interesting.",
|
|
},
|
|
{
|
|
name: "With headers",
|
|
content: "# Header\n\nThis is the main content that should be extracted as the first sentence.",
|
|
expected: "This is the main content that should be extracted as the first sentence.",
|
|
},
|
|
{
|
|
name: "Short content",
|
|
content: "Short text",
|
|
expected: "",
|
|
},
|
|
{
|
|
name: "Empty content",
|
|
content: "",
|
|
expected: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := crawler.extractFirstSentence(tt.content)
|
|
if result != tt.expected {
|
|
t.Errorf("extractFirstSentence() = %q, want %q", result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestIsMainDocPage(t *testing.T) {
|
|
config := &Config{
|
|
URL: "https://example.com",
|
|
OutputDir: "./test-output",
|
|
Workers: 1,
|
|
}
|
|
|
|
crawler, err := NewCrawler(config)
|
|
if err != nil {
|
|
t.Fatalf("Failed to create crawler: %v", err)
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
page PageInfo
|
|
expected bool
|
|
}{
|
|
{
|
|
name: "Main documentation page",
|
|
page: PageInfo{URL: "https://example.com/docs/getting-started"},
|
|
expected: true,
|
|
},
|
|
{
|
|
name: "Blog page",
|
|
page: PageInfo{URL: "https://example.com/blog/latest-news"},
|
|
expected: false,
|
|
},
|
|
{
|
|
name: "About page",
|
|
page: PageInfo{URL: "https://example.com/about"},
|
|
expected: false,
|
|
},
|
|
{
|
|
name: "API documentation",
|
|
page: PageInfo{URL: "https://example.com/api/reference"},
|
|
expected: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := crawler.isMainDocPage(tt.page)
|
|
if result != tt.expected {
|
|
t.Errorf("isMainDocPage() = %v, want %v", result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|