+
Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
parsedURL, err := url.Parse(u)
parsedURL, err := url.Parse(RemoveAsciiTabAndNewlines(u))
if err != nil {
return err
}
Expand Down Expand Up @@ -1074,7 +1074,7 @@ func (c *Collector) handleOnHTML(resp *Response) error {
return err
}
if href, found := doc.Find("base[href]").Attr("href"); found {
baseURL, err := resp.Request.URL.Parse(href)
baseURL, err := resp.Request.URL.Parse(RemoveAsciiTabAndNewlines(href))
if err == nil {
resp.Request.baseURL = baseURL
}
Expand Down Expand Up @@ -1116,7 +1116,7 @@ func (c *Collector) handleOnXML(resp *Response) error {
if e := htmlquery.FindOne(doc, "//base"); e != nil {
for _, a := range e.Attr {
if a.Key == "href" {
baseURL, err := resp.Request.URL.Parse(a.Val)
baseURL, err := resp.Request.URL.Parse(RemoveAsciiTabAndNewlines(a.Val))
if err == nil {
resp.Request.baseURL = baseURL
}
Expand Down Expand Up @@ -1452,3 +1452,21 @@ func streamToByte(r io.Reader) []byte {

return buf.Bytes()
}

// RemoveAsciiTabAndNewlines returns s with every ASCII tab, line feed and
// carriage return stripped out, mirroring step 3 of the WHATWG basic URL
// parser (https://url.spec.whatwg.org/#concept-basic-url-parser).
// Step 2 of that algorithm reports such characters as a "validation error",
// but that is not a hard failure: browsers simply drop them silently.
//
// The function is primarily an internal helper; it is exported purely as a
// convenience for callers.
func RemoveAsciiTabAndNewlines(s string) string {
	dropTabAndNewline := func(c rune) rune {
		if c == '\t' || c == '\n' || c == '\r' {
			// A negative result makes strings.Map omit the character.
			return -1
		}
		return c
	}
	return strings.Map(dropTabAndNewline, s)
}
58 changes: 58 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,36 @@ func newTestServer() *httptest.Server {
`))
})

mux.HandleFunc("/tabs_and_newlines", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="/foo bar/" />
</head>
<body>
<a href="x
y">link</a>
</body>
</html>
`))
})

mux.HandleFunc("/foobar/xy", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<p>hello</p>
</body>
</html>
`))
})

mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/octet-stream")
ww := bufio.NewWriter(w)
Expand Down Expand Up @@ -841,6 +871,34 @@ func TestBaseTagRelative(t *testing.T) {
c2.Visit(ts.URL + "/base_relative")
}

// TestTabsAndNewlines crawls the /tabs_and_newlines test page, follows its
// single link (whose href contains a newline), and verifies that the set of
// visited paths is exactly the start page plus /foobar/xy.
func TestTabsAndNewlines(t *testing.T) {
	// this test might look odd, but see step 3 of
	// https://url.spec.whatwg.org/#concept-basic-url-parser

	ts := newTestServer()
	defer ts.Close()

	visited := map[string]struct{}{}
	expected := map[string]struct{}{
		"/tabs_and_newlines": {},
		"/foobar/xy":         {},
	}

	c := NewCollector()
	// Record the escaped path of every response that comes back.
	c.OnResponse(func(res *Response) {
		visited[res.Request.URL.EscapedPath()] = struct{}{}
	})
	// Follow every link on the page; surface failures instead of dropping
	// them so a broken request is not reported only as a map mismatch.
	c.OnHTML("a[href]", func(e *HTMLElement) {
		href := e.Attr("href")
		if err := e.Request.Visit(href); err != nil {
			t.Errorf("visiting link %q: %v", href, err)
		}
	})

	if err := c.Visit(ts.URL + "/tabs_and_newlines"); err != nil {
		t.Fatalf("visiting start page: %v", err)
	}

	if !reflect.DeepEqual(visited, expected) {
		t.Errorf("visited=%v expected=%v", visited, expected)
	}
}

func TestCollectorCookies(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
2 changes: 1 addition & 1 deletion queue/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func (q *Queue) IsEmpty() bool {

// AddURL adds a new URL to the queue
func (q *Queue) AddURL(URL string) error {
u, err := url.Parse(URL)
u, err := url.Parse(colly.RemoveAsciiTabAndNewlines(URL))
if err != nil {
return err
}
Expand Down
3 changes: 2 additions & 1 deletion request.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ type serializableRequest struct {

// New creates a new request with the context of the original request
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
u, err := url.Parse(URL)
u, err := url.Parse(RemoveAsciiTabAndNewlines(URL))
if err != nil {
return nil, err
}
Expand All @@ -88,6 +88,7 @@ func (r *Request) Abort() {
// AbsoluteURL returns empty string if the URL chunk is a fragment or
// could not be parsed
func (r *Request) AbsoluteURL(u string) string {
u = RemoveAsciiTabAndNewlines(u)
if strings.HasPrefix(u, "#") {
return ""
}
Expand Down
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载