diff --git a/colly_test.go b/colly_test.go index 5d8de590..355c86ac 100644 --- a/colly_test.go +++ b/colly_test.go @@ -51,6 +51,8 @@ const testXML = ` This is a test paragraph ` +const custom404 = `404 not found` + func newUnstartedTestServer() *httptest.Server { mux := http.NewServeMux() @@ -86,6 +88,10 @@ func newUnstartedTestServer() *httptest.Server { ww.Write([]byte(testXML)) }) + mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) { + http.Error(w, custom404, http.StatusNotFound) + }) + mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { if r.Method == "POST" { w.Header().Set("Content-Type", "text/html") @@ -1477,6 +1483,35 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) { testCollectorOnXMLWithXML(t, "/test.xml.gz") } +func TestCollectorNonexistentXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector(ParseHTTPErrorResponse()) + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/nonexistent.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + func TestCollectorVisitWithTrace(t *testing.T) { ts := newTestServer() defer ts.Close() diff --git a/http_backend.go b/http_backend.go index e580f7a2..b910a382 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,6 +15,7 @@ package colly import ( + "bufio" "crypto/sha1" "encoding/gob" "encoding/hex" @@ -202,11 +203,23 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c } contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) { - bodyReader, err = gzip.NewReader(bodyReader) + // Even if URL contains .xml.gz, it doesn't mean that we get gzip + // compressed data back. We might get 404 error page instead, + // for example. So check gzip magic bytes. + bufReader := bufio.NewReader(bodyReader) + bodyReader = bufReader + magic, err := bufReader.Peek(2) if err != nil { return nil, err } - defer bodyReader.(*gzip.Reader).Close() + // gzip magic, as specified in RFC 1952 + if magic[0] == 0x1f && magic[1] == 0x8b { + bodyReader, err = gzip.NewReader(bufReader) + if err != nil { + return nil, err + } + defer bodyReader.(*gzip.Reader).Close() + } } body, err := io.ReadAll(bodyReader) if err != nil {