diff --git a/config/config.go b/config/config.go index ee9abab3..612d8f69 100644 --- a/config/config.go +++ b/config/config.go @@ -33,6 +33,7 @@ type Repo struct { ExcludeDotFiles bool `json:"exclude-dot-files"` EnablePollUpdates *bool `json:"enable-poll-updates"` EnablePushUpdates *bool `json:"enable-push-updates"` + FallbackEncoding string `json:"fallback-encoding"` } // Used for interpreting the config value for fields that use *bool. If a value diff --git a/go.mod b/go.mod index 275c0bcc..a6a93da6 100644 --- a/go.mod +++ b/go.mod @@ -5,4 +5,5 @@ go 1.13 require ( github.com/blang/semver v3.5.1+incompatible github.com/go-bindata/go-bindata v3.1.2+incompatible // indirect + golang.org/x/text v0.3.5 ) diff --git a/go.sum b/go.sum index 44ac122f..49f0eeee 100644 --- a/go.sum +++ b/go.sum @@ -4,3 +4,6 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb github.com/go-bindata/go-bindata v1.0.0 h1:DZ34txDXWn1DyWa+vQf7V9ANc2ILTtrEjtlsdJRF26M= github.com/go-bindata/go-bindata v3.1.2+incompatible h1:5vjJMVhowQdPzjE1LdxyFF7YFTXg5IgGVW4gBr5IbvE= github.com/go-bindata/go-bindata v3.1.2+incompatible/go.mod h1:xK8Dsgwmeed+BBsSy2XTopBn/8uK2HWuGSnA11C3Joo= +golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ= +golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/index/index.go b/index/index.go index d956f3eb..2225b717 100644 --- a/index/index.go +++ b/index/index.go @@ -14,13 +14,14 @@ import ( "github.com/hound-search/hound/codesearch/index" "github.com/hound-search/hound/codesearch/regexp" + "golang.org/x/text/encoding" ) const ( matchLimit = 5000 manifestFilename = "metadata.gob" excludedFileJsonFilename = "excluded_files.json" - filePeekSize = 2048 + filePeekSize = 1 << 20 ) const ( @@ -38,6 +39,7 @@ type Index struct { type IndexOptions struct { ExcludeDotFiles bool SpecialFiles []string + FallbackEnc encoding.Encoding } type SearchOptions struct { @@ -236,34 +238,34 @@ func (n *Index) Search(pat string, opt *SearchOptions) (*SearchResponse, error) Matches: results, FilesWithMatch: filesFound, FilesOpened: filesOpened, - Duration: time.Now().Sub(startedAt), //nolint + Duration: time.Now().Sub(startedAt), //nolint Revision: n.Ref.Rev, }, nil } -func isTextFile(filename string) (bool, error) { +func isTextFile(filename string) (isText bool, isUTF8 bool, err error) { buf := make([]byte, filePeekSize) r, err := os.Open(filename) if err != nil { - return false, err + return false, false, err } defer r.Close() n, err := io.ReadFull(r, buf) if err != nil && err != io.ErrUnexpectedEOF && err != io.EOF { - return false, err + return false, false, err } buf = buf[:n] - if n < filePeekSize { - // read the whole file, must be valid. - return utf8.Valid(buf), nil + if n < filePeekSize && utf8.Valid(buf) || // read the whole file, must be valid. + n >= filePeekSize && validUTF8IgnoringPartialTrailingRune(buf) { // read a prefix, allow trailing partial runes. + return true, true, nil } - - // read a prefix, allow trailing partial runes. - return validUTF8IgnoringPartialTrailingRune(buf), nil - + if isBinary(buf) { + return false, false, nil + } + return true, false, nil } // Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed @@ -292,17 +294,30 @@ func validUTF8IgnoringPartialTrailingRune(p []byte) bool { return true } -func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) { +func isBinary(p []byte) bool { + for _, c := range p { + if c < 10 { + return true + } + } + return false +} + +func addFileToIndex(ix *index.IndexWriter, dst, src, path string, enc encoding.Encoding) (string, error) { rel, err := filepath.Rel(src, path) if err != nil { return "", err } - r, err := os.Open(path) + fh, err := os.Open(path) if err != nil { return "", err } - defer r.Close() + defer fh.Close() + r := io.Reader(fh) + if enc != nil { + r = enc.NewDecoder().Reader(r) + } dup := filepath.Join(dst, "raw", rel) w, err := os.Create(dup) @@ -364,7 +379,7 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error { } defer fileHandle.Close() - if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint + if err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { //nolint name := info.Name() rel, err := filepath.Rel(src, path) if err != nil { @@ -404,20 +419,24 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error { return nil } - txt, err := isTextFile(path) + isText, isUTF8, err := isTextFile(path) if err != nil { return err } - if !txt { + if !isText { excluded = append(excluded, &ExcludedFile{ rel, reasonNotText, }) return nil } + var enc encoding.Encoding + if !isUTF8 { + enc = opt.FallbackEnc + } - reasonForExclusion, err := addFileToIndex(ix, dst, src, path) + reasonForExclusion, err := addFileToIndex(ix, dst, src, path, enc) if err != nil { return err } diff --git a/searcher/searcher.go b/searcher/searcher.go index e2f9b350..7da52703 100644 --- a/searcher/searcher.go +++ b/searcher/searcher.go @@ -16,6 +16,7 @@ import ( "github.com/hound-search/hound/config" "github.com/hound-search/hound/index" "github.com/hound-search/hound/vcs" + "golang.org/x/text/encoding/htmlindex" ) type Searcher struct { @@ -264,7 +265,7 @@ func reportOnMemory() { // Utility function for producing a hex encoded sha1 hash for a string. func hashFor(name string) string { h := sha1.New() - h.Write([]byte(name)) //nolint + h.Write([]byte(name)) //nolint return hex.EncodeToString(h.Sum(nil)) } @@ -411,6 +412,11 @@ func newSearcher( ExcludeDotFiles: repo.ExcludeDotFiles, SpecialFiles: wd.SpecialFiles(), } + if repo.FallbackEncoding != "" { + if opt.FallbackEnc, err = htmlindex.Get(repo.FallbackEncoding); err != nil { + return nil, fmt.Errorf("%s.fallback-encoding=%q: %w", name, repo.FallbackEncoding, err) + } + } rev, err := wd.PullOrClone(vcsDir, repo.Url) if err != nil {