From e057ef0b0b2412afd0075587c4a11fc7bd0ad43a Mon Sep 17 00:00:00 2001 From: Shawn Hurley Date: Thu, 16 Nov 2023 12:08:08 -0500 Subject: [PATCH] :bug: using golang for filecontent searching (#430) Working on increasing the performance inside podman VM's on Mac and Windows. Signed-off-by: Shawn Hurley --- provider/internal/builtin/service_client.go | 127 ++++++++++++++------ rule-example.yaml | 4 +- 2 files changed, 95 insertions(+), 36 deletions(-) diff --git a/provider/internal/builtin/service_client.go b/provider/internal/builtin/service_client.go index 77be39da..8e476b73 100644 --- a/provider/internal/builtin/service_client.go +++ b/provider/internal/builtin/service_client.go @@ -1,14 +1,13 @@ package builtin import ( + "bufio" "context" "fmt" "io/fs" "os" - "os/exec" "path/filepath" "regexp" - "strconv" "strings" "sync" @@ -16,6 +15,7 @@ import ( "github.com/antchfx/xmlquery" "github.com/antchfx/xpath" "github.com/go-logr/logr" + "github.com/konveyor/analyzer-lsp/lsp/protocol" "github.com/konveyor/analyzer-lsp/provider" "github.com/konveyor/analyzer-lsp/tracing" "go.lsp.dev/uri" @@ -84,53 +84,30 @@ func (p *builtinServiceClient) Evaluate(ctx context.Context, cap string, conditi if c.Pattern == "" { return response, fmt.Errorf("could not parse provided regex pattern as string: %v", conditionInfo) } - var outputBytes []byte - grep := exec.Command("grep", "-o", "-n", "-R", "-P", c.Pattern, p.config.Location) - outputBytes, err := grep.Output() + patternRegex, err := regexp.Compile(c.Pattern) if err != nil { - if exitError, ok := err.(*exec.ExitError); ok && exitError.ExitCode() == 1 { - return response, nil - } - return response, fmt.Errorf("could not run grep with provided pattern %+v", err) + return response, err } - - matches := []string{} - outputString := strings.TrimSpace(string(outputBytes)) - if outputString != "" { - matches = append(matches, strings.Split(outputString, "\n")...) + matches, err := parallelWalk(p.config.Location, patternRegex) + if err != nil { + return response, err } for _, match := range matches { //TODO(fabianvf): This will not work if there is a `:` in the filename, do we care? - pieces := strings.SplitN(match, ":", 3) - if len(pieces) != 3 { - //TODO(fabianvf): Just log or return? - //(shawn-hurley): I think the return is good personally - return response, fmt.Errorf( - "malformed response from grep, cannot parse grep output '%s' with pattern {filepath}:{lineNumber}:{matchingText}", match) - } - - containsFile, err := provider.FilterFilePattern(c.FilePattern, pieces[0]) + containsFile, err := provider.FilterFilePattern(c.FilePattern, match.positionParams.TextDocument.URI) if err != nil { return response, err } if !containsFile { continue } - - ab, err := filepath.Abs(pieces[0]) - if err != nil { - ab = pieces[0] - } - lineNumber, err := strconv.Atoi(pieces[1]) - if err != nil { - return response, fmt.Errorf("cannot convert line number string to integer") - } + lineNumber := int(match.positionParams.Position.Line) response.Incidents = append(response.Incidents, provider.IncidentContext{ - FileURI: uri.File(ab), + FileURI: uri.URI(match.positionParams.TextDocument.URI), LineNumber: &lineNumber, Variables: map[string]interface{}{ - "matchingText": pieces[2], + "matchingText": match.match, }, CodeLocation: &provider.Location{ StartPosition: provider.Position{Line: float64(lineNumber)}, @@ -372,3 +349,85 @@ func findFilesMatchingPattern(root, pattern string) ([]string, error) { }) return matches, err } + +type walkResult struct { + positionParams protocol.TextDocumentPositionParams + match string +} + +func parallelWalk(location string, regex *regexp.Regexp) ([]walkResult, error) { + var positions []walkResult + positionsChan := make(chan walkResult) + wg := &sync.WaitGroup{} + + go func() { + err := filepath.Walk(location, func(path string, f os.FileInfo, err error) error { + if err != nil { + return err + } + + if f.Mode().IsRegular() { + wg.Add(1) + go processFile(path, regex, positionsChan, wg) + } + + return nil + }) + + if err != nil { + return + } + + wg.Wait() + close(positionsChan) + }() + + for pos := range positionsChan { + positions = append(positions, pos) + } + + return positions, nil +} + +func processFile(path string, regex *regexp.Regexp, positionsChan chan<- walkResult, wg *sync.WaitGroup) { + defer wg.Done() + + content, err := os.ReadFile(path) + if err != nil { + return + } + + // Must go through each line, + forceFullFileScan := false + if strings.Contains(regex.String(), "^") { + forceFullFileScan = true + } + + if regex.Match(content) || forceFullFileScan { + scanner := bufio.NewScanner(strings.NewReader(string(content))) + lineNumber := 1 + for scanner.Scan() { + matchLocations := regex.FindAllStringIndex(scanner.Text(), -1) + matchStrings := regex.FindAllString(scanner.Text(), -1) + for i, loc := range matchLocations { + absPath, err := filepath.Abs(path) + if err != nil { + return + } + positionsChan <- walkResult{ + positionParams: protocol.TextDocumentPositionParams{ + TextDocument: protocol.TextDocumentIdentifier{ + URI: fmt.Sprintf("file://%s", absPath), + }, + Position: protocol.Position{ + Line: uint32(lineNumber), + Character: uint32(loc[1]), + }, + }, + match: matchStrings[i], + } + } + lineNumber++ + } + } +} diff --git a/rule-example.yaml b/rule-example.yaml index 81c957e1..1ce511a1 100644 --- a/rule-example.yaml +++ b/rule-example.yaml @@ -184,7 +184,7 @@ ruleID: filecontent-codesnip-test when: builtin.filecontent: - pattern: "^FROM.*openjdk-11.*" + pattern: "^.*openjdk-11.*" filePattern: "Dockerfile" - message: python sample rule 001 @@ -201,4 +201,4 @@ ruleID: python-sample-rule-003 when: python.referenced: - pattern: "create_custom_resource_definition" \ No newline at end of file + pattern: "create_custom_resource_definition"