mirror of
https://github.com/Sosokker/site-to-llmstxt.git
synced 2025-12-18 05:24:06 +01:00
feat: web crawler with Markdown conversion and testing suite
This commit is contained in:
parent
f20cf82084
commit
f702db6ede
28
.gitignore
vendored
28
.gitignore
vendored
@ -8,6 +8,16 @@
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Project binaries
|
||||
crawler
|
||||
site-to-llmstxt
|
||||
|
||||
# Output directories
|
||||
output/
|
||||
test-output/
|
||||
example-output/
|
||||
crawled-docs/
|
||||
|
||||
# Test binary, built with `go test -c`
|
||||
*.test
|
||||
|
||||
@ -17,6 +27,24 @@ coverage.*
|
||||
*.coverprofile
|
||||
profile.cov
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# Temporary files
|
||||
*.tmp
|
||||
*.temp
|
||||
|
||||
# Dependency directories (remove the comment below to include it)
|
||||
# vendor/
|
||||
|
||||
|
||||
57
Makefile
Normal file
57
Makefile
Normal file
@ -0,0 +1,57 @@
|
||||
# Makefile for site-to-llmstxt crawler

# Every target here is a command, not a file; list them all so a stray file
# named "install" or "all" can never shadow a target.
.PHONY: help build test clean install run all

# Default target
help:
	@echo "Available targets:"
	@echo " build - Build the crawler binary"
	@echo " test - Run tests"
	@echo " clean - Clean build artifacts"
	@echo " run - Run with example URL (requires URL variable)"
	@echo " install - Install dependencies"
	@echo " all - Clean, install, build and test"
	@echo ""
	@echo "Examples:"
	@echo " make build"
	@echo " make run URL=https://example.com"
	@echo " make run URL=https://httpbin.org WORKERS=3 OUTPUT=./test-output"

# Build the crawler
build:
	@echo "Building crawler..."
	go build -o crawler main.go
	@echo "Build complete: ./crawler"

# Run tests
test:
	@echo "Running tests..."
	go test -v

# Clean build artifacts
clean:
	@echo "Cleaning..."
	rm -f crawler
	rm -rf output/
	rm -rf test-output/
	rm -rf example-output/

# Install dependencies
install:
	@echo "Installing dependencies..."
	go mod tidy

# Run with parameters
# URL and OUTPUT are quoted so that characters such as '&' and '?' in a URL,
# or spaces in a path, reach the crawler instead of being interpreted by the shell.
run: build
	@if [ -z "$(URL)" ]; then \
		echo "Error: URL is required. Usage: make run URL=https://example.com"; \
		exit 1; \
	fi
	@echo "Running crawler with URL: $(URL)"
	./crawler -url "$(URL)" \
		$(if $(WORKERS),-workers $(WORKERS)) \
		$(if $(OUTPUT),-output "$(OUTPUT)") \
		$(if $(VERBOSE),-verbose)

# Build and test everything
all: clean install build test
	@echo "All tasks completed successfully!"
|
||||
34
go.mod
Normal file
34
go.mod
Normal file
@ -0,0 +1,34 @@
|
||||
module site-to-llmstxt
|
||||
|
||||
go 1.24.5
|
||||
|
||||
require (
|
||||
github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3
|
||||
github.com/gocolly/colly/v2 v2.2.0
|
||||
github.com/schollz/progressbar/v3 v3.18.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/JohannesKaufmann/dom v0.2.0 // indirect
|
||||
github.com/PuerkitoBio/goquery v1.10.3 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/antchfx/htmlquery v1.3.4 // indirect
|
||||
github.com/antchfx/xmlquery v1.4.4 // indirect
|
||||
github.com/antchfx/xpath v1.3.4 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.22.0 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||
github.com/nlnwa/whatwg-url v0.6.2 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
|
||||
github.com/temoto/robotstxt v1.1.2 // indirect
|
||||
golang.org/x/net v0.42.0 // indirect
|
||||
golang.org/x/sys v0.34.0 // indirect
|
||||
golang.org/x/term v0.33.0 // indirect
|
||||
golang.org/x/text v0.27.0 // indirect
|
||||
google.golang.org/appengine v1.6.8 // indirect
|
||||
google.golang.org/protobuf v1.36.6 // indirect
|
||||
)
|
||||
151
go.sum
Normal file
151
go.sum
Normal file
@ -0,0 +1,151 @@
|
||||
github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ=
|
||||
github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo=
|
||||
github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3 h1:r3fokGFRDk/8pHmwLwJ8zsX4qiqfS1/1TZm2BH8ueY8=
|
||||
github.com/JohannesKaufmann/html-to-markdown/v2 v2.3.3/go.mod h1:HtsP+1Fchp4dVvaiIsLHAl/yqL3H1YLwqLC9kNwqQEg=
|
||||
github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
|
||||
github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ=
|
||||
github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM=
|
||||
github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg=
|
||||
github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc=
|
||||
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/antchfx/xpath v1.3.4 h1:1ixrW1VnXd4HurCj7qnqnR0jo14g8JMe20Fshg1Vgz4=
|
||||
github.com/antchfx/xpath v1.3.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
|
||||
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
|
||||
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
||||
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
|
||||
github.com/gocolly/colly/v2 v2.2.0 h1:FQGxcqvTdFAvOpMRhk52o20Qsf6KtRU5HSf0bITS38I=
|
||||
github.com/gocolly/colly/v2 v2.2.0/go.mod h1:YOQwv1ofoQOzJiELnkThDd6ObOfl6odUk2i6Czbx3Ws=
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ=
|
||||
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw=
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
||||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
|
||||
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
|
||||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
|
||||
github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
|
||||
github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
|
||||
github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY=
|
||||
github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
|
||||
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
|
||||
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
|
||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
github.com/yuin/goldmark v1.7.11 h1:ZCxLyDMtz0nT2HFfsYG8WZ47Trip2+JyLysKcMYE5bo=
|
||||
github.com/yuin/goldmark v1.7.11/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
||||
golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs=
|
||||
golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA=
|
||||
golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
|
||||
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||
golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
|
||||
golang.org/x/term v0.33.0 h1:NuFncQrRcaRvVmgRkvM3j/F00gWIAlcmlB8ACEKmGIg=
|
||||
golang.org/x/term v0.33.0/go.mod h1:s18+ql9tYWp1IfpV9DmCtQDDSRBUjKaw9M1eAv5UeF0=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4=
|
||||
golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM=
|
||||
google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
||||
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
|
||||
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
421
main.go
Normal file
421
main.go
Normal file
@ -0,0 +1,421 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/converter"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
|
||||
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
|
||||
"github.com/gocolly/colly/v2"
|
||||
"github.com/gocolly/colly/v2/debug"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
// Config carries the command-line settings that drive a single crawl.
type Config struct {
	// URL is the root URL the crawl starts from.
	URL string
	// OutputDir is the directory the generated Markdown files are written to.
	OutputDir string
	// Workers is the number of concurrent crawl workers.
	Workers int
	// Verbose enables per-request and periodic progress logging.
	Verbose bool
}
|
||||
|
||||
// Crawler manages the web crawling process
|
||||
type Crawler struct {
|
||||
config *Config
|
||||
collector *colly.Collector
|
||||
converter *converter.Converter
|
||||
visited map[string]bool
|
||||
queue chan string
|
||||
wg sync.WaitGroup
|
||||
mu sync.RWMutex
|
||||
baseURL *url.URL
|
||||
bar *progressbar.ProgressBar
|
||||
processed int
|
||||
}
|
||||
|
||||
// LanguageFilter lists regular-expression patterns for locale path segments,
// either in the middle of a URL (`/xx/`) or at its very end (`/xx$`).
// shouldSkipURL drops every URL that matches one of them.
var LanguageFilter = []string{
	`/en/`,
	`/en$`,
	`/zh/`,
	`/zh$`,
	`/zh-cn/`,
	`/zh-cn$`,
	`/zh-tw/`,
	`/zh-tw$`,
	`/zh-hant/`,
	`/zh-hant$`,
	`/ja/`,
	`/ja$`,
	`/ko/`,
	`/ko$`,
	`/fr/`,
	`/fr$`,
	`/de/`,
	`/de$`,
	`/es/`,
	`/es$`,
	`/it/`,
	`/it$`,
	`/pt/`,
	`/pt$`,
	`/ru/`,
	`/ru$`,
}
|
||||
|
||||
// FileExtensionFilter lists regular-expression patterns for URLs that end in
// a downloadable-file extension. shouldSkipURL drops every URL that matches
// one of them.
var FileExtensionFilter = []string{
	// Office documents
	`\.pdf$`,
	`\.doc$`,
	`\.docx$`,
	`\.xls$`,
	`\.xlsx$`,
	`\.ppt$`,
	`\.pptx$`,
	// Archives
	`\.zip$`,
	`\.rar$`,
	`\.tar$`,
	`\.gz$`,
	`\.7z$`,
	// Audio and video
	`\.mp3$`,
	`\.mp4$`,
	`\.avi$`,
	`\.mov$`,
	`\.wmv$`,
	// Images
	`\.jpg$`,
	`\.jpeg$`,
	`\.png$`,
	`\.gif$`,
	`\.bmp$`,
	`\.svg$`,
	// Installers and packages
	`\.exe$`,
	`\.msi$`,
	`\.dmg$`,
	`\.deb$`,
	`\.rpm$`,
}
|
||||
|
||||
func main() {
|
||||
config := parseFlags()
|
||||
|
||||
if err := validateConfig(config); err != nil {
|
||||
log.Fatalf("Invalid configuration: %v", err)
|
||||
}
|
||||
|
||||
crawler, err := NewCrawler(config)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create crawler: %v", err)
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
if err := crawler.Start(ctx); err != nil {
|
||||
log.Fatalf("Crawling failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("\nCrawling completed successfully! Files saved to: %s\n", config.OutputDir)
|
||||
}
|
||||
|
||||
func parseFlags() *Config {
|
||||
config := &Config{}
|
||||
|
||||
flag.StringVar(&config.URL, "url", "", "Root URL to crawl (required)")
|
||||
flag.StringVar(&config.OutputDir, "output", "./output", "Output directory for markdown files")
|
||||
flag.IntVar(&config.Workers, "workers", 5, "Number of concurrent workers")
|
||||
flag.BoolVar(&config.Verbose, "verbose", false, "Enable verbose logging")
|
||||
flag.Parse()
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
func validateConfig(config *Config) error {
|
||||
if config.URL == "" {
|
||||
return fmt.Errorf("URL is required")
|
||||
}
|
||||
|
||||
parsedURL, err := url.Parse(config.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
// Check if URL has a valid scheme and host
|
||||
if parsedURL.Scheme == "" || parsedURL.Host == "" {
|
||||
return fmt.Errorf("URL must include scheme (http/https) and host")
|
||||
}
|
||||
|
||||
if config.Workers <= 0 {
|
||||
return fmt.Errorf("workers must be greater than 0")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// NewCrawler creates a new crawler instance
|
||||
func NewCrawler(config *Config) (*Crawler, error) {
|
||||
baseURL, err := url.Parse(config.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse base URL: %w", err)
|
||||
}
|
||||
|
||||
// Create output directory
|
||||
if err := os.MkdirAll(config.OutputDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("failed to create output directory: %w", err)
|
||||
}
|
||||
|
||||
// Setup colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
)
|
||||
|
||||
if config.Verbose {
|
||||
c.SetDebugger(&debug.LogDebugger{})
|
||||
}
|
||||
|
||||
// Rate limiting
|
||||
c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: config.Workers,
|
||||
Delay: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
// Setup HTML to Markdown converter
|
||||
conv := converter.NewConverter(
|
||||
converter.WithPlugins(
|
||||
base.NewBasePlugin(),
|
||||
commonmark.NewCommonmarkPlugin(),
|
||||
),
|
||||
)
|
||||
|
||||
crawler := &Crawler{
|
||||
config: config,
|
||||
collector: c,
|
||||
converter: conv,
|
||||
visited: make(map[string]bool),
|
||||
queue: make(chan string, 1000),
|
||||
baseURL: baseURL,
|
||||
bar: progressbar.NewOptions(-1, progressbar.OptionSetDescription("Crawling pages")),
|
||||
}
|
||||
|
||||
crawler.setupCallbacks()
|
||||
|
||||
return crawler, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) setupCallbacks() {
|
||||
// Handle HTML content
|
||||
c.collector.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
c.processPage(e)
|
||||
})
|
||||
|
||||
// Extract links
|
||||
c.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
c.addToQueue(link, e.Request.URL)
|
||||
})
|
||||
|
||||
// Request callback
|
||||
c.collector.OnRequest(func(r *colly.Request) {
|
||||
if c.config.Verbose {
|
||||
fmt.Printf("Visiting: %s\n", r.URL)
|
||||
}
|
||||
c.bar.Add(1)
|
||||
})
|
||||
|
||||
// Error handling
|
||||
c.collector.OnError(func(r *colly.Response, err error) {
|
||||
log.Printf("Error visiting %s: %v", r.Request.URL, err)
|
||||
})
|
||||
}
|
||||
|
||||
func (c *Crawler) processPage(e *colly.HTMLElement) {
|
||||
// Get page title
|
||||
title := e.ChildText("title")
|
||||
if title == "" {
|
||||
title = "untitled"
|
||||
}
|
||||
|
||||
// Convert HTML to Markdown
|
||||
html, err := e.DOM.Html()
|
||||
if err != nil {
|
||||
log.Printf("Failed to get HTML for %s: %v", e.Request.URL, err)
|
||||
return
|
||||
}
|
||||
|
||||
markdown, err := c.converter.ConvertString(html)
|
||||
if err != nil {
|
||||
log.Printf("Failed to convert HTML to Markdown for %s: %v", e.Request.URL, err)
|
||||
return
|
||||
}
|
||||
|
||||
// Save to file
|
||||
if err := c.saveMarkdown(e.Request.URL, title, markdown); err != nil {
|
||||
log.Printf("Failed to save markdown for %s: %v", e.Request.URL, err)
|
||||
return
|
||||
}
|
||||
|
||||
c.mu.Lock()
|
||||
c.processed++
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
func (c *Crawler) saveMarkdown(pageURL *url.URL, title, markdown string) error {
|
||||
// Create filename from URL path
|
||||
filename := c.createFilename(pageURL, title)
|
||||
filePath := filepath.Join(c.config.OutputDir, filename)
|
||||
|
||||
// Ensure directory exists
|
||||
dir := filepath.Dir(filePath)
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create directory %s: %w", dir, err)
|
||||
}
|
||||
|
||||
// Add metadata header
|
||||
content := fmt.Sprintf("# %s\n\nURL: %s\nCrawled: %s\n\n---\n\n%s",
|
||||
title, pageURL.String(), time.Now().Format(time.RFC3339), markdown)
|
||||
|
||||
// Write file
|
||||
if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
|
||||
return fmt.Errorf("failed to write file %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Crawler) createFilename(pageURL *url.URL, title string) string {
|
||||
// Clean title for filename
|
||||
filename := strings.TrimSpace(title)
|
||||
filename = regexp.MustCompile(`[^a-zA-Z0-9\-_\s]`).ReplaceAllString(filename, "")
|
||||
filename = regexp.MustCompile(`\s+`).ReplaceAllString(filename, "-")
|
||||
filename = strings.ToLower(filename)
|
||||
|
||||
if filename == "" || filename == "untitled" {
|
||||
// Use URL path
|
||||
urlPath := strings.Trim(pageURL.Path, "/")
|
||||
if urlPath == "" {
|
||||
urlPath = "index"
|
||||
}
|
||||
filename = strings.ReplaceAll(urlPath, "/", "-")
|
||||
}
|
||||
|
||||
// Ensure .md extension
|
||||
if !strings.HasSuffix(filename, ".md") {
|
||||
filename += ".md"
|
||||
}
|
||||
|
||||
return filename
|
||||
}
|
||||
|
||||
func (c *Crawler) addToQueue(link string, baseURL *url.URL) {
|
||||
// Parse and resolve URL
|
||||
linkURL, err := url.Parse(link)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
resolvedURL := baseURL.ResolveReference(linkURL)
|
||||
|
||||
// Check if it's within the same domain
|
||||
if resolvedURL.Host != c.baseURL.Host {
|
||||
return
|
||||
}
|
||||
|
||||
// Apply filters
|
||||
if c.shouldSkipURL(resolvedURL.String()) {
|
||||
return
|
||||
}
|
||||
|
||||
urlStr := resolvedURL.String()
|
||||
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Check if already visited
|
||||
if c.visited[urlStr] {
|
||||
return
|
||||
}
|
||||
|
||||
c.visited[urlStr] = true
|
||||
|
||||
// Add to queue
|
||||
select {
|
||||
case c.queue <- urlStr:
|
||||
default:
|
||||
// Queue is full, skip this URL
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Crawler) shouldSkipURL(urlStr string) bool {
|
||||
// Check language filters
|
||||
for _, pattern := range LanguageFilter {
|
||||
if matched, _ := regexp.MatchString(pattern, urlStr); matched {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Check file extension filters
|
||||
for _, pattern := range FileExtensionFilter {
|
||||
if matched, _ := regexp.MatchString(pattern, urlStr); matched {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Skip fragments and query parameters that might be irrelevant
|
||||
if strings.Contains(urlStr, "#") {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *Crawler) Start(ctx context.Context) error {
|
||||
fmt.Printf("Starting crawl of: %s\n", c.config.URL)
|
||||
fmt.Printf("Output directory: %s\n", c.config.OutputDir)
|
||||
fmt.Printf("Workers: %d\n", c.config.Workers)
|
||||
|
||||
// Add seed URL to queue
|
||||
c.queue <- c.config.URL
|
||||
c.visited[c.config.URL] = true
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < c.config.Workers; i++ {
|
||||
c.wg.Add(1)
|
||||
go c.worker(ctx)
|
||||
}
|
||||
|
||||
// Monitor progress
|
||||
go c.monitor(ctx)
|
||||
|
||||
// Wait for completion
|
||||
c.wg.Wait()
|
||||
close(c.queue)
|
||||
c.bar.Finish()
|
||||
|
||||
fmt.Printf("\nProcessed %d pages\n", c.processed)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Crawler) worker(ctx context.Context) {
|
||||
defer c.wg.Done()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case urlStr, ok := <-c.queue:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
if err := c.collector.Visit(urlStr); err != nil {
|
||||
if c.config.Verbose {
|
||||
log.Printf("Failed to visit %s: %v", urlStr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Crawler) monitor(ctx context.Context) {
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
lastProcessed := 0
|
||||
noProgressCount := 0
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.mu.RLock()
|
||||
current := c.processed
|
||||
queueLen := len(c.queue)
|
||||
c.mu.RUnlock()
|
||||
|
||||
if current == lastProcessed {
|
||||
noProgressCount++
|
||||
if noProgressCount >= 6 && queueLen == 0 { // 30 seconds with no progress and empty queue
|
||||
fmt.Println("\nNo progress detected, stopping crawler...")
|
||||
return
|
||||
}
|
||||
} else {
|
||||
noProgressCount = 0
|
||||
lastProcessed = current
|
||||
}
|
||||
|
||||
if c.config.Verbose {
|
||||
fmt.Printf("Progress: %d pages processed, %d in queue\n", current, queueLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
152
main_test.go
Normal file
152
main_test.go
Normal file
@ -0,0 +1,152 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestShouldSkipURL(t *testing.T) {
|
||||
config := &Config{
|
||||
URL: "https://example.com",
|
||||
OutputDir: "./test-output",
|
||||
Workers: 1,
|
||||
}
|
||||
|
||||
crawler, err := NewCrawler(config)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create crawler: %v", err)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
url string
|
||||
expected bool
|
||||
}{
|
||||
{"Normal URL", "https://example.com/page", false},
|
||||
{"Language URL - en", "https://example.com/en/page", true},
|
||||
{"Language URL - zh", "https://example.com/zh/page", true},
|
||||
{"Language URL - zh-hant", "https://example.com/zh-hant/page", true},
|
||||
{"PDF file", "https://example.com/document.pdf", true},
|
||||
{"ZIP file", "https://example.com/archive.zip", true},
|
||||
{"Fragment URL", "https://example.com/page#section", true},
|
||||
{"Image file", "https://example.com/image.jpg", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.shouldSkipURL(tt.url)
|
||||
if result != tt.expected {
|
||||
t.Errorf("shouldSkipURL(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateFilename(t *testing.T) {
|
||||
config := &Config{
|
||||
URL: "https://example.com",
|
||||
OutputDir: "./test-output",
|
||||
Workers: 1,
|
||||
}
|
||||
|
||||
crawler, err := NewCrawler(config)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create crawler: %v", err)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
url string
|
||||
title string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "Normal title",
|
||||
url: "https://example.com/about",
|
||||
title: "About Us",
|
||||
expected: "about-us.md",
|
||||
},
|
||||
{
|
||||
name: "Title with special characters",
|
||||
url: "https://example.com/contact",
|
||||
title: "Contact Us! (Get in Touch)",
|
||||
expected: "contact-us-get-in-touch.md",
|
||||
},
|
||||
{
|
||||
name: "Empty title",
|
||||
url: "https://example.com/services/web-design",
|
||||
title: "",
|
||||
expected: "services-web-design.md",
|
||||
},
|
||||
{
|
||||
name: "Root URL",
|
||||
url: "https://example.com/",
|
||||
title: "Homepage",
|
||||
expected: "homepage.md",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
pageURL, _ := url.Parse(tt.url)
|
||||
result := crawler.createFilename(pageURL, tt.title)
|
||||
if result != tt.expected {
|
||||
t.Errorf("createFilename(%q, %q) = %q, want %q", tt.url, tt.title, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfig(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
config *Config
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "Valid config",
|
||||
config: &Config{
|
||||
URL: "https://example.com",
|
||||
OutputDir: "./output",
|
||||
Workers: 5,
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "Empty URL",
|
||||
config: &Config{
|
||||
URL: "",
|
||||
OutputDir: "./output",
|
||||
Workers: 5,
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "Invalid URL",
|
||||
config: &Config{
|
||||
URL: "not-a-url",
|
||||
OutputDir: "./output",
|
||||
Workers: 5,
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "Zero workers",
|
||||
config: &Config{
|
||||
URL: "https://example.com",
|
||||
OutputDir: "./output",
|
||||
Workers: 0,
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := validateConfig(tt.config)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
25
test.sh
Normal file
25
test.sh
Normal file
@ -0,0 +1,25 @@
|
||||
#!/bin/bash

# Smoke-test script for the crawler: builds the binary, prints its help
# message, and shows example invocations.
# Usage: ./test.sh

echo "Building crawler..."

# Stop right away when the build fails; everything below needs the binary.
if ! go build -o crawler main.go; then
    echo "Build failed!"
    exit 1
fi

echo "Build successful!"

echo "Testing help message..."
./crawler -h

echo ""
echo "Example usage:"
echo "./crawler -url https://example.com -workers 3 -output ./example-output -verbose"

echo ""
echo "To test with a real website (be respectful!):"
echo "./crawler -url https://httpbin.org -workers 2 -output ./test-output"
|
||||
Loading…
Reference in New Issue
Block a user