+
Skip to content

Use github.com/nlnwa/whatwg-url for URL parsing #673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
"github.com/gocolly/colly/v2/debug"
"github.com/gocolly/colly/v2/storage"
"github.com/kennygrant/sanitize"
whatwgUrl "github.com/nlnwa/whatwg-url/url"
"github.com/temoto/robotstxt"
"google.golang.org/appengine/urlfetch"
)
Expand Down Expand Up @@ -549,7 +550,11 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
parsedURL, err := url.Parse(u)
parsedWhatwgURL, err := whatwgUrl.Parse(u)
if err != nil {
return err
}
parsedURL, err := url.Parse(parsedWhatwgURL.Href(false))
if err != nil {
return err
}
Expand Down Expand Up @@ -1077,10 +1082,14 @@ func (c *Collector) handleOnHTML(resp *Response) error {
return err
}
if href, found := doc.Find("base[href]").Attr("href"); found {
baseURL, err := resp.Request.URL.Parse(href)
u, err := whatwgUrl.ParseRef(resp.Request.URL.String(), href)
if err == nil {
resp.Request.baseURL = baseURL
baseURL, err := url.Parse(u.Href(false))
if err == nil {
resp.Request.baseURL = baseURL
}
}

}
for _, cc := range c.htmlCallbacks {
i := 0
Expand Down
62 changes: 62 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,36 @@ func newTestServer() *httptest.Server {
`))
})

mux.HandleFunc("/tabs_and_newlines", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="/foo bar/" />
</head>
<body>
<a href="x
y">link</a>
</body>
</html>
`))
})

mux.HandleFunc("/foobar/xy", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<p>hello</p>
</body>
</html>
`))
})

mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/octet-stream")
ww := bufio.NewWriter(w)
Expand Down Expand Up @@ -852,6 +882,38 @@ func TestBaseTagRelative(t *testing.T) {
c2.Visit(ts.URL + "/base_relative")
}

func TestTabsAndNewlines(t *testing.T) {
// this test might look odd, but see step 3 of
// https://url.spec.whatwg.org/#concept-basic-url-parser

ts := newTestServer()
defer ts.Close()

visited := map[string]struct{}{}
expected := map[string]struct{}{
"/tabs_and_newlines": {},
"/foobar/xy": {},
}

c := NewCollector()
c.OnResponse(func(res *Response) {
visited[res.Request.URL.EscapedPath()] = struct{}{}
})
c.OnHTML("a[href]", func(e *HTMLElement) {
if err := e.Request.Visit(e.Attr("href")); err != nil {
t.Errorf("visit failed: %v", err)
}
})

if err := c.Visit(ts.URL + "/tabs_and_newlines"); err != nil {
t.Errorf("visit failed: %v", err)
}

if !reflect.DeepEqual(visited, expected) {
t.Errorf("visited=%v expected=%v", visited, expected)
}
}

func TestCollectorCookies(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ require (
github.com/golang/protobuf v1.4.2 // indirect
github.com/jawher/mow.cli v1.1.0
github.com/kennygrant/sanitize v1.2.4
github.com/nlnwa/whatwg-url v0.1.0
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca
github.com/temoto/robotstxt v1.1.1
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc
golang.org/x/net v0.0.0-20201020065357-d65d470038a5
google.golang.org/appengine v1.6.6
google.golang.org/protobuf v1.24.0 // indirect
)
13 changes: 9 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18h
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
Expand All @@ -46,6 +45,8 @@ github.com/jawher/mow.cli v1.1.0 h1:NdtHXRc0CwZQ507wMvQ/IS+Q3W3x2fycn973/b8Zuk8=
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/nlnwa/whatwg-url v0.1.0 h1:nJcUTPO+K/jjP7ZsrALylQ8a7XtDDvh0aqGDMdKO4co=
github.com/nlnwa/whatwg-url v0.1.0/go.mod h1:L97nLsTBZQV+fZTyMl1z6RdDhqgGzZTMmrpTkZDEdts=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand All @@ -57,6 +58,8 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/willf/bitset v1.1.10 h1:NotGKqX0KwQ72NUzqrjZq5ipPNDQex9lo3WpaS8L2sc=
github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
Expand All @@ -72,8 +75,9 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc h1:zK/HqS5bZxDptfPJNq8v7vJfXtkU7r9TLIoSr1bXaP4=
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201020065357-d65d470038a5 h1:KrxvpY64uUzANd9wKWr6ZAsufiii93XnvXaeikyCJ2g=
golang.org/x/net v0.0.0-20201020065357-d65d470038a5/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -82,9 +86,11 @@ golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
Expand All @@ -108,7 +114,6 @@ google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQ
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
Expand Down
10 changes: 8 additions & 2 deletions queue/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"net/url"
"sync"

whatwgUrl "github.com/nlnwa/whatwg-url/url"

"github.com/gocolly/colly/v2"
)

Expand Down Expand Up @@ -75,12 +77,16 @@ func (q *Queue) IsEmpty() bool {

// AddURL adds a new URL to the queue
func (q *Queue) AddURL(URL string) error {
u, err := url.Parse(URL)
u, err := whatwgUrl.Parse(URL)
if err != nil {
return err
}
u2, err := url.Parse(u.Href(false))
if err != nil {
return err
}
r := &colly.Request{
URL: u,
URL: u2,
Method: "GET",
}
d, err := r.Marshal()
Expand Down
19 changes: 11 additions & 8 deletions request.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"net/url"
"strings"
"sync/atomic"

whatwgUrl "github.com/nlnwa/whatwg-url/url"
)

// Request is the representation of a HTTP request made by a Collector
Expand Down Expand Up @@ -64,13 +66,17 @@ type serializableRequest struct {

// New creates a new request with the context of the original request
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
u, err := url.Parse(URL)
u, err := whatwgUrl.Parse(URL)
if err != nil {
return nil, err
}
u2, err := url.Parse(u.Href(false))
if err != nil {
return nil, err
}
return &Request{
Method: method,
URL: u,
URL: u2,
Body: body,
Ctx: r.Ctx,
Headers: &http.Header{},
Expand All @@ -97,15 +103,12 @@ func (r *Request) AbsoluteURL(u string) string {
} else {
base = r.URL
}
absURL, err := base.Parse(u)

absURL, err := whatwgUrl.ParseRef(base.String(), u)
if err != nil {
return ""
}
absURL.Fragment = ""
if absURL.Scheme == "//" {
absURL.Scheme = r.URL.Scheme
}
return absURL.String()
return absURL.Href(false)
}

// Visit continues Collector's collecting job by creating a
Expand Down
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载