site-to-llmstxt/internal/crawler/scrape.go
package crawler

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
	"github.com/PuerkitoBio/goquery"

	"github.com/Sosokker/site-to-llmstxt/internal/config"
	"github.com/Sosokker/site-to-llmstxt/internal/filters"
	"github.com/Sosokker/site-to-llmstxt/internal/models"
	"github.com/Sosokker/site-to-llmstxt/internal/utils"
)

// ProgressUpdate conveys scraping progress for interactive UIs.
type ProgressUpdate struct {
	Completed int
	Total     int
	URL       string
}

// LogUpdate captures a log line emitted during scraping.
type LogUpdate struct {
	Message string
}

// ScrapeOptions configures the download stage for the selected pages.
type ScrapeOptions struct {
	BaseURL  *url.URL
	Pages    []PageSummary
	Output   string
	Workers  int
	Verbose  bool
	Logs     chan<- LogUpdate
	Progress chan<- ProgressUpdate
}

// Scrape fetches each provided page, writes Markdown output, and returns results.
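//
// A minimal call might look like the following sketch (page discovery happens
// in an earlier stage; base, pages, and the output path are assumed to come
// from there; Logs and Progress may be left nil when no UI updates are needed):
//
//	results, stats, err := Scrape(ctx, ScrapeOptions{
//		BaseURL: base,
//		Pages:   pages,
//		Output:  "out",
//		Workers: config.DefaultWorkers,
//	})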
func Scrape(ctx context.Context, opts ScrapeOptions) ([]models.PageInfo, *models.Stats, error) {
	if opts.BaseURL == nil {
		return nil, nil, errors.New("base URL is required")
	}
	if len(opts.Pages) == 0 {
		return nil, nil, errors.New("no pages selected for scraping")
	}
	if opts.Workers <= 0 {
		opts.Workers = config.DefaultWorkers
	}
	if err := utils.CreateOutputDirs(opts.Output); err != nil {
		return nil, nil, err
	}

	client := &http.Client{
		Timeout: 30 * time.Second,
	}

	stats := &models.Stats{
		StartTime:      time.Now(),
		TotalPages:     len(opts.Pages),
		MainDocPages:   0,
		SecondaryPages: 0,
		SkippedURLs:    0,
		ErrorCount:     0,
	}
	statsMu := &sync.Mutex{}
	for _, page := range opts.Pages {
		if filters.IsMainDocPage(page.URL) {
			stats.MainDocPages++
		} else {
			stats.SecondaryPages++
		}
	}

	namer := utils.NewUniqueNamer()
	results := make([]models.PageInfo, 0, len(opts.Pages))
	resultsMu := &sync.Mutex{}
	var completed int32
	progressTotal := len(opts.Pages)
	pageCh := make(chan PageSummary)
	wg := sync.WaitGroup{}
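	// Only the first error is kept for the return value; later failures are
	// counted in stats and logged, but do not override it.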
	errOnce := sync.Once{}
	var firstErr error

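	// sendLog forwards a formatted message to the Logs channel. The send is
	// non-blocking, so a slow or absent consumer drops messages rather than
	// stalling the workers; non-"always" messages appear only in verbose mode.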
	sendLog := func(always bool, format string, args ...interface{}) {
		if !always && !opts.Verbose {
			return
		}
		if opts.Logs == nil {
			return
		}
		msg := fmt.Sprintf(format, args...)
		select {
		case opts.Logs <- LogUpdate{Message: msg}:
		default:
		}
	}

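	// sendProgress bumps the completion counter and reports it, again without
	// blocking if the Progress channel is full or unread.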
	sendProgress := func(url string) {
		if opts.Progress == nil {
			return
		}
		done := int(atomic.AddInt32(&completed, 1))
		select {
		case opts.Progress <- ProgressUpdate{
			Completed: done,
			Total:     progressTotal,
			URL:       url,
		}:
		default:
		}
	}

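	// Start the worker pool. Each worker pulls pages from pageCh until the
	// channel is closed or the context is cancelled.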
	for i := 0; i < opts.Workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for page := range pageCh {
				select {
				case <-ctx.Done():
					return
				default:
				}
				info, err := fetchPage(ctx, client, page, opts.Output, namer)
				if err != nil {
					errOnce.Do(func() { firstErr = err })
					statsMu.Lock()
					stats.AddError()
					statsMu.Unlock()
					sendLog(true, "error scraping %s: %v", page.URL, err)
					sendProgress(page.URL)
					continue
				}
				resultsMu.Lock()
				results = append(results, info)
				resultsMu.Unlock()
				sendLog(false, "scraped %s", page.URL)
				sendProgress(page.URL)
			}
		}()
	}

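	// Feed the selected pages to the workers, then close the channel so they
	// exit once the queue is drained. Stop early if the context is cancelled,
	// otherwise this goroutine could block forever after the workers return.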
	go func() {
		defer close(pageCh)
		for _, page := range opts.Pages {
			select {
			case pageCh <- page:
			case <-ctx.Done():
				return
			}
		}
	}()

	wg.Wait()
	stats.Finish()
	if err := ctx.Err(); err != nil && !errors.Is(err, context.Canceled) {
		return results, stats, err
	}
	return results, stats, firstErr
}
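
// fetchPage downloads a single page, converts the HTML to Markdown, extracts
// a title and description, writes the Markdown file under the output
// directory's markdown subdirectory, and returns the resulting page metadata.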
func fetchPage(ctx context.Context, client *http.Client, page PageSummary, outputDir string, namer *utils.UniqueNamer) (models.PageInfo, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, page.URL, nil)
	if err != nil {
		return models.PageInfo{}, err
	}
	resp, err := client.Do(req)
	if err != nil {
		return models.PageInfo{}, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Drain the body so the underlying connection can be reused.
		io.Copy(io.Discard, resp.Body)
		return models.PageInfo{}, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}

	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return models.PageInfo{}, err
	}
	htmlContent := string(bodyBytes)

	markdown, err := htmltomarkdown.ConvertString(htmlContent)
	if err != nil {
		return models.PageInfo{}, err
	}
	markdown = strings.TrimSpace(markdown)

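	// Re-parse the original HTML to pull the <title> and meta description,
	// falling back to the values discovered earlier and, as a last resort,
	// the first sentence of the converted Markdown.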
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return models.PageInfo{}, err
	}
	title := strings.TrimSpace(doc.Find("title").First().Text())
	if title == "" {
		title = page.Title
	}
	description := strings.TrimSpace(doc.Find(`meta[name="description"]`).AttrOr("content", ""))
	if description == "" {
		description = page.Description
	}
	if description == "" {
		description = utils.ExtractFirstSentence(markdown)
	}

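	// Reserve a collision-free filename and write the Markdown beneath the
	// configured markdown subdirectory of the output folder.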
	filename := utils.CreateFilename(title, page.URL)
	filename = namer.Reserve(filename)
	relativePath := filepath.Join(config.MarkdownSubdir, filename)
	fullPath := filepath.Join(outputDir, relativePath)
	if err := os.WriteFile(fullPath, []byte(markdown), 0o644); err != nil {
		return models.PageInfo{}, err
	}

	info := models.PageInfo{
		URL:         page.URL,
		Title:       title,
		Content:     markdown,
		FilePath:    relativePath,
		CrawledAt:   time.Now(),
		Description: description,
	}
	return info, nil
}