diff --git a/internal/staticanalysis/analyze.go b/internal/staticanalysis/analyze.go
index ed9a5dcb..9d8dbea7 100644
--- a/internal/staticanalysis/analyze.go
+++ b/internal/staticanalysis/analyze.go
@@ -30,10 +30,6 @@ func enumeratePackageFiles(extractDir string) ([]string, error) {
 	return paths, err
 }
 
-func getPathInArchive(path, extractDir string) string {
-	return strings.TrimPrefix(path, extractDir+string(os.PathSeparator))
-}
-
 /*
 AnalyzePackageFiles walks a tree of extracted package files and runs the
 analysis tasks listed in analysisTasks to produce the result data.
 */
@@ -76,14 +72,13 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,
 	result := Result{}
 
-	archivePath := map[string]string{}
-	for _, path := range fileList {
-		archivePath[path] = getPathInArchive(path, extractDir)
+	getPathInArchive := func(absolutePath string) string {
+		return strings.TrimPrefix(absolutePath, extractDir+string(os.PathSeparator))
 	}
 
 	if runTask[Basic] {
 		log.Info("run basic analysis")
-		basicData, err := GetBasicData(fileList, archivePath)
+		basicData, err := GetBasicData(fileList, getPathInArchive)
 		if err != nil {
 			log.Error("static analysis error", log.Label("task", string(Basic)), "error", err)
 		} else {
@@ -101,8 +96,8 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,
 			log.Error("static analysis error", log.Label("task", string(Parsing)), "error", err)
 		} else {
 			// change absolute path in parsingResults to package-relative path
-			for _, parseResult := range parsingResults {
-				parseResult.Filename = archivePath[parseResult.Filename]
+			for i, r := range parsingResults {
+				parsingResults[i].Filename = getPathInArchive(r.Filename)
 			}
 			result.ParsingData = parsingResults
 		}
diff --git a/internal/staticanalysis/basic_data.go b/internal/staticanalysis/basic_data.go
index 04c23ac3..adc499ed 100644
--- a/internal/staticanalysis/basic_data.go
+++ b/internal/staticanalysis/basic_data.go
@@ -107,7 +107,7 @@ some files should not prevent the analysis of other files.
 
 pathInArchive maps the absolute paths in fileList to relative paths in the
 package archive, to use for results.
 */
-func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPackageData, error) {
+func GetBasicData(fileList []string, pathInArchive func(absolutePath string) string) (*BasicPackageData, error) {
 	// First, run file in batch processing mode to get all the file types at once.
 	// Then, file size, hash and line lengths can be done in a simple loop
@@ -124,7 +124,7 @@ func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPac
 	}
 
 	for index, filePath := range fileList {
-		archivePath := pathInArchive[filePath]
+		archivePath := pathInArchive(filePath)
 		fileType := fileTypes[index]
 
 		var fileSize int64
diff --git a/internal/staticanalysis/basic_data_test.go b/internal/staticanalysis/basic_data_test.go
index 22311b66..ee14a924 100644
--- a/internal/staticanalysis/basic_data_test.go
+++ b/internal/staticanalysis/basic_data_test.go
@@ -4,55 +4,98 @@ import (
 	"os"
 	"path/filepath"
 	"reflect"
+	"strings"
 	"testing"
+
+	"github.com/ossf/package-analysis/internal/utils"
+	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
-func TestGetFileTypes(t *testing.T) {
-	testDir := t.TempDir()
-	fileName1 := filepath.Join(testDir, "test1.txt")
-	fileName2 := filepath.Join(testDir, "test2.txt")
+type testFile struct {
+	filename     string
+	contents     []byte
+	contentsHash string
+	fileType     string
+	lineLengths  valuecounts.ValueCounts
+}
 
-	if err := os.WriteFile(fileName1, []byte("hello test 1!\n"), 0o666); err != nil {
-		t.Fatalf("failed to write test file 1: %v", err)
-	}
-	if err := os.WriteFile(fileName2, []byte("#! /bin/bash\necho 'Hello test 2'\n"), 0o666); err != nil {
-		t.Fatalf("failed to write test file 2: %v", err)
-	}
+var testFiles = []testFile{
+	{
+		filename:     "test1.txt",
+		contents:     []byte("hello test 1!\n"),
+		contentsHash: "sha256:bd96959573979235b87180b0b7513c7f1d5cbf046b263f366f2f10fe1b966494",
+		fileType:     "ASCII text",
+		lineLengths:  valuecounts.Count([]int{13}),
+	},
+	{
+		filename:     "test2.txt",
+		contents:     []byte("#! /bin/bash\necho 'Hello test 2'\n"),
+		contentsHash: "sha256:6179db3c673ceddcdbd384116ae4d301d64e65fc2686db9ba64945677a5a893c",
+		fileType:     "Bourne-Again shell script, ASCII text executable",
+		lineLengths:  valuecounts.Count([]int{12, 19}),
+	},
+}
 
+func TestGetBasicData(t *testing.T) {
 	tests := []struct {
-		name     string
-		fileList []string
-		want     []string
-		wantErr  bool
+		name    string
+		files   []testFile
+		wantErr bool
 	}{
 		{
-			name:     "test no files",
-			fileList: []string{},
-			want:     []string{},
-			wantErr:  false,
+			name:    "test no files",
+			files:   nil,
+			wantErr: false,
 		},
 		{
-			name:     "test one file",
-			fileList: []string{fileName1},
-			want:     []string{"ASCII text"},
-			wantErr:  false,
+			name:    "test one file",
+			files:   []testFile{testFiles[0]},
+			wantErr: false,
 		},
 		{
-			name:     "test two files",
-			fileList: []string{fileName1, fileName2},
-			want:     []string{"ASCII text", "Bourne-Again shell script, ASCII text executable"},
-			wantErr:  false,
+			name:    "test two files",
+			files:   []testFile{testFiles[0], testFiles[1]},
+			wantErr: false,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := getFileTypes(tt.fileList)
+			testDir := t.TempDir()
+			paths := utils.Transform(tt.files, func(f testFile) string {
+				return filepath.Join(testDir, f.filename)
+			})
+
+			for i := range tt.files {
+				if err := os.WriteFile(paths[i], tt.files[i].contents, 0o666); err != nil {
+					t.Fatalf("failed to write test file %d: %v", i, err)
+				}
+			}
+
+			getArchivePath := func(absolutePath string) string {
+				return strings.TrimPrefix(absolutePath, testDir+string(os.PathSeparator))
+			}
+
+			got, err := GetBasicData(paths, getArchivePath)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("getFileTypes() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
-			if !reflect.DeepEqual(got, tt.want) {
-				t.Errorf("getFileTypes() got = %#v, want %#v", got, tt.want)
+
+			wantData := utils.Transform(tt.files, func(f testFile) BasicFileData {
+				return BasicFileData{
+					Filename:    f.filename,
+					FileType:    f.fileType,
+					Size:        int64(len(f.contents)),
+					Hash:        f.contentsHash,
+					LineLengths: f.lineLengths,
+				}
+			})
+
+			gotData := got.Files
+
+			if !reflect.DeepEqual(gotData, wantData) {
+				t.Errorf("TestGetBasicData() data mismatch:\n"+
+					"== got == \n%v\n== want ==\n%v", got, wantData)
 			}
 		})
 	}
diff --git a/internal/staticanalysis/obfuscation/file_signals_test.go b/internal/staticanalysis/obfuscation/file_signals_test.go
index f6f22baa..e443af4e 100644
--- a/internal/staticanalysis/obfuscation/file_signals_test.go
+++ b/internal/staticanalysis/obfuscation/file_signals_test.go
@@ -29,8 +29,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			FloatLiterals:  []token.Float{},
 		},
 		expectedSignals: FileSignals{
-			StringLengths:         valuecounts.ValueCounts{5: 1},
-			IdentifierLengths:     valuecounts.ValueCounts{1: 1},
+			StringLengths:         valuecounts.Count([]int{5}),
+			IdentifierLengths:     valuecounts.Count([]int{1}),
 			SuspiciousIdentifiers: []SuspiciousIdentifier{{Name: "a", Rule: "single"}},
 			EscapedStrings:        []EscapedString{},
 			Base64Strings:         []string{},
@@ -61,8 +61,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			FloatLiterals:  []token.Float{},
 		},
 		expectedSignals: FileSignals{
-			StringLengths:     valuecounts.ValueCounts{5: 2},
-			IdentifierLengths: valuecounts.ValueCounts{1: 3, 4: 1},
+			StringLengths:     valuecounts.Count([]int{5, 5}),
+			IdentifierLengths: valuecounts.Count([]int{4, 1, 1, 1}),
 			SuspiciousIdentifiers: []SuspiciousIdentifier{
 				{Name: "a", Rule: "single"},
 				{Name: "b", Rule: "single"},
@@ -76,6 +76,41 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			URLs:                  []string{},
 		},
 	},
+	{
+		name: "one of everything",
+		parseData: parsing.SingleResult{
+			Identifiers: []token.Identifier{
+				{Name: "_0x12414124", Type: token.Variable},
+				{Name: "a", Type: token.Parameter},
+				{Name: "d1912931", Type: token.Parameter},
+			},
+			StringLiterals: []token.String{
+				{Value: "hello@email.me", Raw: `"hello@email.me"`},
+				{Value: "https://this.is.a.website.com", Raw: `"https://this.is.a.website.com"`},
+				{Value: "aGVsbG8gd29ybGQK", Raw: `"aGVsbG8gd29ybGQK"`},
+				{Value: "8.8.8.8", Raw: `"8.8.8.8"`},
+				{Value: "e3fc:234a:2341::abcd", Raw: `"e3fc:234a:2341::abcd"`},
+				{Value: "0x21323492394", Raw: `"0x21323492394"`},
+			},
+			IntLiterals:   []token.Int{},
+			FloatLiterals: []token.Float{},
+		},
+		expectedSignals: FileSignals{
+			IdentifierLengths: valuecounts.Count([]int{11, 1, 8}),
+			StringLengths:     valuecounts.Count([]int{14, 29, 16, 7, 20, 13}),
+			SuspiciousIdentifiers: []SuspiciousIdentifier{
+				{Name: "_0x12414124", Rule: "hex"},
+				{Name: "a", Rule: "single"},
+				{Name: "d1912931", Rule: "numeric"},
+			},
+			EscapedStrings: []EscapedString{},
+			Base64Strings:  []string{"aGVsbG8gd29ybGQK"},
+			EmailAddresses: []string{"hello@email.me"},
+			HexStrings:     []string{"21323492394"},
+			IPAddresses:    []string{"8.8.8.8", "e3fc:234a:2341::abcd"},
+			URLs:           []string{"https://this.is.a.website.com"},
+		},
+	},
 }
 
 func TestComputeSignals(t *testing.T) {
diff --git a/internal/staticanalysis/obfuscation/stats/sample_statistics.go b/internal/staticanalysis/obfuscation/stats/sample_statistics.go
index 36bae5e7..4758f198 100644
--- a/internal/staticanalysis/obfuscation/stats/sample_statistics.go
+++ b/internal/staticanalysis/obfuscation/stats/sample_statistics.go
@@ -8,7 +8,6 @@ import (
 	"golang.org/x/exp/slices"
 
 	"github.com/ossf/package-analysis/internal/utils"
-	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
 type RealNumber interface {
@@ -209,11 +208,3 @@ func Summarise[T RealNumber](sample []T) SampleStatistics {
 	q := quartiles(sample)
 	return SampleStatistics{Size: l, Mean: m, Variance: v, Skewness: s, Quartiles: q}
 }
-
-func CountDistinct(sample []int) valuecounts.ValueCounts {
-	counts := valuecounts.New()
-	for _, t := range sample {
-		counts[t] += 1
-	}
-	return counts
-}
diff --git a/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go b/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go
index 315fa81a..d97ce6ad 100644
--- a/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go
+++ b/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go
@@ -2,10 +2,7 @@ package stats
 
 import (
 	"math"
-	"reflect"
 	"testing"
-
-	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
 func TestSummary(t *testing.T) {
@@ -115,30 +112,3 @@ func TestSummary7(t *testing.T) {
 		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
 	}
 }
-
-func TestCountDistinct1(t *testing.T) {
-	data := []int{1, 2, 3, 4, 3, 2, 1, 2}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{1: 2, 2: 3, 3: 2, 4: 1}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
-
-func TestCountDistinct2(t *testing.T) {
-	data := []int{1}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{1: 1}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
-
-func TestCountDistinct3(t *testing.T) {
-	data := []int{}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
diff --git a/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go b/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go
index 48549d2d..7bec4dc4 100644
--- a/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go
+++ b/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go
@@ -6,7 +6,7 @@ import (
 )
 
 /*
-CalculateEntropy calculates entropy of a string S of characters over an alphabet A, which is defined as
+Calculate finds the entropy of a string S of characters over an alphabet A, which is defined as
 
 E(S) = - sum(i in A) { (p(i)) * log(p(i)) },
 
@@ -24,7 +24,7 @@ the entropy approaches 0.
 Reference: https://link.springer.com/chapter/10.1007/978-3-642-10509-8_19
 */
-func CalculateEntropy(s string, prob map[rune]float64) float64 {
+func Calculate(s string, prob map[rune]float64) float64 {
 	if len(s) == 0 {
 		return 0
 	}
@@ -46,7 +46,7 @@ func CalculateEntropy(s string, prob map[rune]float64) float64 {
 }
 
 /*
-CalculateNormalisedEntropy returns the string entropy normalised by the log of the length of the string.
+CalculateNormalised returns the string entropy normalised by the log of the length of the string.
 This quantity is used because for log(N) is the maximum possible entropy out of all strings with length N,
 where N > 0. Special cases are empty strings (0) and single character strings (1).
 As a formula:
@@ -59,7 +59,7 @@ As a formula:
 */
 // TODO does this make sense when a general probability structure is used?
 // TODO calculate max string entropy for a given set of character counts.
-func CalculateNormalisedEntropy(s string, prob map[rune]float64) float64 {
+func CalculateNormalised(s string, prob map[rune]float64) float64 {
 	length := utf8.RuneCountInString(s)
 	switch length {
 	case 0:
@@ -67,7 +67,7 @@ func CalculateNormalisedEntropy(s string, prob map[rune]float64) float64 {
 	case 1:
 		return 1
 	default:
-		return CalculateEntropy(s, prob) / math.Log(float64(length))
+		return Calculate(s, prob) / math.Log(float64(length))
 	}
 }
diff --git a/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go b/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go
index 334e4913..70585f73 100644
--- a/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go
+++ b/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go
@@ -24,7 +24,7 @@ func TestStringEntropy(t *testing.T) {
 		{"aaA", -2*2.0/3.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)/3.0},
 	}
 	for index, test := range testCases {
-		actual := CalculateEntropy(test.s, nil)
+		actual := Calculate(test.s, nil)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)
@@ -53,7 +53,7 @@ func TestStringEntropyWithFixedProbs(t *testing.T) {
 		{" a \n", -a * math.Log(a)},
 	}
 	for index, test := range testCases {
-		actual := CalculateEntropy(test.s, probs)
+		actual := Calculate(test.s, probs)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)
@@ -73,7 +73,7 @@ func TestStringEntropyRatio(t *testing.T) {
 		{"aaA", (-2*2.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)) / (3.0 * math.Log(3))},
 	}
 	for index, test := range testCases {
-		actual := CalculateNormalisedEntropy(test.s, nil)
+		actual := CalculateNormalised(test.s, nil)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)
diff --git a/internal/staticanalysis/parsing/analyze.go b/internal/staticanalysis/parsing/analyze.go
index 82039eb5..bee7c58d 100644
--- a/internal/staticanalysis/parsing/analyze.go
+++ b/internal/staticanalysis/parsing/analyze.go
@@ -49,7 +49,7 @@ func processJsData(filename string, fileData singleParseData) *SingleResult {
 	}
 
 	for _, c := range fileData.Comments {
-		result.Comments = append(result.Comments, token.Comment{Value: c.Data})
+		result.Comments = append(result.Comments, token.Comment{Text: c.Data})
 	}
 	return result
 }
@@ -62,11 +62,11 @@ func computeEntropy(parseResults []*SingleResult) {
 	var identifiers []string
 
 	for _, result := range parseResults {
-		for _, sl := range result.StringLiterals {
-			strings = append(strings, sl.Value)
+		for _, str := range result.StringLiterals {
+			strings = append(strings, str.Value)
 		}
-		for _, id := range result.Identifiers {
-			identifiers = append(identifiers, id.Name)
+		for _, ident := range result.Identifiers {
+			identifiers = append(identifiers, ident.Name)
 		}
 	}
 
@@ -74,11 +74,11 @@ func computeEntropy(parseResults []*SingleResult) {
 	identifierCharDistribution := stringentropy.CharacterProbabilities(identifiers)
 
 	for _, result := range parseResults {
-		for _, sl := range result.StringLiterals {
-			sl.Entropy = stringentropy.CalculateEntropy(sl.Value, stringLiteralCharDistribution)
+		for i := range result.StringLiterals {
+			result.StringLiterals[i].ComputeEntropy(stringLiteralCharDistribution)
 		}
-		for _, id := range result.Identifiers {
-			id.Entropy = stringentropy.CalculateEntropy(id.Name, identifierCharDistribution)
+		for i := range result.Identifiers {
+			result.Identifiers[i].ComputeEntropy(identifierCharDistribution)
 		}
 	}
 }
diff --git a/internal/staticanalysis/parsing/analyze_test.go b/internal/staticanalysis/parsing/analyze_test.go
index edadee36..3a3f279a 100644
--- a/internal/staticanalysis/parsing/analyze_test.go
+++ b/internal/staticanalysis/parsing/analyze_test.go
@@ -4,18 +4,27 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/ossf/package-analysis/internal/log"
 	"github.com/ossf/package-analysis/internal/staticanalysis/externalcmd"
+	"github.com/ossf/package-analysis/internal/staticanalysis/obfuscation/stringentropy"
 	"github.com/ossf/package-analysis/internal/staticanalysis/token"
 )
 
-type collectDataTestCase struct {
+type analyzeTestcase struct {
 	name         string
 	jsSource     string
 	expectedData SingleResult
 }
 
-var collectDataTestCases = []collectDataTestCase{
+var literalCharProbs = []map[rune]float64{
+	stringentropy.CharacterProbabilities([]string{"hello"}),
+	stringentropy.CharacterProbabilities([]string{"hello", "apple"}),
+}
+var identifierCharProbs = []map[rune]float64{
+	stringentropy.CharacterProbabilities([]string{"a"}),
+	stringentropy.CharacterProbabilities([]string{"test", "a", "b", "c"}),
+}
+
+var analyzeTestcases = []analyzeTestcase{
 	{
 		name: "simple 1",
 		jsSource: `
@@ -23,10 +32,10 @@ var a = "hello"
 `,
 		expectedData: SingleResult{
 			Identifiers: []token.Identifier{
-				{Name: "a", Type: token.Variable},
+				{Name: "a", Type: token.Variable, Entropy: stringentropy.Calculate("a", identifierCharProbs[0])},
 			},
 			StringLiterals: []token.String{
-				{Value: "hello", Raw: `"hello"`},
+				{Value: "hello", Raw: `"hello"`, Entropy: stringentropy.Calculate("hello", literalCharProbs[0])},
 			},
 			IntLiterals:   []token.Int{},
 			FloatLiterals: []token.Float{},
@@ -47,14 +56,14 @@ function test(a, b = 2) {
 `,
 		expectedData: SingleResult{
 			Identifiers: []token.Identifier{
-				{Name: "test", Type: token.Function},
-				{Name: "a", Type: token.Parameter},
-				{Name: "b", Type: token.Parameter},
-				{Name: "c", Type: token.Variable},
+				{Name: "test", Type: token.Function, Entropy: stringentropy.Calculate("test", identifierCharProbs[1])},
+				{Name: "a", Type: token.Parameter, Entropy: stringentropy.Calculate("a", identifierCharProbs[1])},
+				{Name: "b", Type: token.Parameter, Entropy: stringentropy.Calculate("b", identifierCharProbs[1])},
+				{Name: "c", Type: token.Variable, Entropy: stringentropy.Calculate("c", identifierCharProbs[1])},
 			},
 			StringLiterals: []token.String{
-				{Value: "hello", Raw: `"hello"`},
-				{Value: "apple", Raw: `"apple"`},
+				{Value: "hello", Raw: `"hello"`, Entropy: stringentropy.Calculate("hello", literalCharProbs[1])},
+				{Value: "apple", Raw: `"apple"`, Entropy: stringentropy.Calculate("apple", literalCharProbs[1])},
 			},
 			IntLiterals: []token.Int{
 				{Value: 2, Raw: "2"},
@@ -66,17 +75,13 @@ function test(a, b = 2) {
 	},
 }
 
-func init() {
-	log.Initialize("")
-}
-
-func TestCollectData(t *testing.T) {
+func TestAnalyze(t *testing.T) {
 	parserConfig, err := InitParser(t.TempDir())
 	if err != nil {
 		t.Fatalf("failed to init parser: %v", err)
 	}
 
-	for _, tt := range collectDataTestCases {
+	for _, tt := range analyzeTestcases {
 		t.Run(tt.name, func(t *testing.T) {
 			result, err := Analyze(parserConfig, externalcmd.StringInput(tt.jsSource), false)
 			if err != nil {
diff --git a/internal/staticanalysis/parsing/result.go b/internal/staticanalysis/parsing/result.go
index 6781041b..ce06bdc2 100644
--- a/internal/staticanalysis/parsing/result.go
+++ b/internal/staticanalysis/parsing/result.go
@@ -17,6 +17,7 @@ type SingleResult struct {
 	IntLiterals    []token.Int     `json:"int_literals"`
 	FloatLiterals  []token.Float   `json:"float_literals"`
 	Comments       []token.Comment `json:"comments"`
+	// future: external function calls / references (e.g. eval)
 }
 
 func (r SingleResult) String() string {
diff --git a/internal/staticanalysis/token/tokens.go b/internal/staticanalysis/token/tokens.go
index b10a9a7a..2c767f75 100644
--- a/internal/staticanalysis/token/tokens.go
+++ b/internal/staticanalysis/token/tokens.go
@@ -1,27 +1,41 @@
 package token
 
+import "github.com/ossf/package-analysis/internal/staticanalysis/obfuscation/stringentropy"
+
 type Identifier struct {
-	Name    string
-	Type    IdentifierType
-	Entropy float64
+	Name    string         `json:"name"`
+	Type    IdentifierType `json:"type"`
+	Entropy float64        `json:"entropy"`
 }
 
-type Comment struct {
-	Value string
+// ComputeEntropy computes the entropy of this identifier under the given
+// character distribution and sets its Entropy field to the result value
+func (i *Identifier) ComputeEntropy(probs map[rune]float64) {
+	i.Entropy = stringentropy.Calculate(i.Name, probs)
 }
 
 type String struct {
-	Value   string
-	Raw     string
-	Entropy float64
+	Value   string  `json:"value"`
+	Raw     string  `json:"raw"`
+	Entropy float64 `json:"entropy"`
+}
+
+// ComputeEntropy computes the entropy of this string literal under the given
+// character distribution and sets its Entropy field to the result value
+func (s *String) ComputeEntropy(probs map[rune]float64) {
+	s.Entropy = stringentropy.Calculate(s.Value, probs)
 }
 
 type Int struct {
-	Value int64
-	Raw   string
+	Value int64  `json:"value"`
+	Raw   string `json:"raw"`
 }
 
 type Float struct {
-	Value float64
-	Raw   string
+	Value float64 `json:"value"`
+	Raw   string  `json:"raw"`
+}
+
+type Comment struct {
+	Text string `json:"text"`
 }
diff --git a/internal/utils/valuecounts/value_counts.go b/internal/utils/valuecounts/value_counts.go
index a1d20053..3d035fcd 100644
--- a/internal/utils/valuecounts/value_counts.go
+++ b/internal/utils/valuecounts/value_counts.go
@@ -12,7 +12,9 @@ import (
 // ValueCounts stores unordered counts of integer values as a map
 // from value (int) to count (int). It can be serialized to JSON
 // as an array of (value, count) pairs.
-type ValueCounts map[int]int
+type ValueCounts struct {
+	data map[int]int
+}
 
 // Aside: I know using 'value' to refer to map keys is not great, but the
 // other names I came up with like 'size' and 'length' were all usage-specific.
@@ -23,23 +25,41 @@ type Pair struct {
 	Count int `json:"count"`
 }
 
+// New creates a new empty ValueCounts object
 func New() ValueCounts {
-	return ValueCounts{}
+	return ValueCounts{
+		data: map[int]int{},
+	}
+}
+
+// FromMap creates a new ValueCounts object and initialises its counts from the given map
+func FromMap(data map[int]int) ValueCounts {
+	vc := New()
+	for value, count := range data {
+		vc.data[value] = count
+	}
+	return vc
 }
 
 // Count produces a new ValueCounts by counting repetitions of values in the input data
 func Count(data []int) ValueCounts {
 	vc := New()
 	for _, value := range data {
-		vc[value] += 1
+		vc.data[value] += 1
 	}
 	return vc
 }
 
+// Len returns the number of values stored by this ValueCounts.
+// It is equivalent to the length of the slice returned by ToPairs()
+func (vc ValueCounts) Len() int {
+	return len(vc.data)
+}
+
 // String() returns a string representation of this ValueCounts
 // with values sorted in ascending order
 func (vc ValueCounts) String() string {
-	pairStrings := make([]string, 0, len(vc))
+	pairStrings := make([]string, 0, len(vc.data))
 	for _, pair := range vc.ToPairs() {
 		pairStrings = append(pairStrings, fmt.Sprintf("%d: %d", pair.Value, pair.Count))
 	}
@@ -50,14 +70,14 @@
 // The values are sorted in increasing order so that the output is deterministic.
 // If this ValueCounts is empty, returns an empty slice.
 func (vc ValueCounts) ToPairs() []Pair {
-	pairs := make([]Pair, 0, len(vc))
+	pairs := make([]Pair, 0, len(vc.data))
 
 	// sort the values so that the output is in a deterministic order
-	values := maps.Keys(vc)
+	values := maps.Keys(vc.data)
 	slices.Sort(values)
 
 	for _, value := range values {
-		count := vc[value]
+		count := vc.data[value]
 		pairs = append(pairs, Pair{Value: value, Count: count})
 	}
@@ -71,10 +91,10 @@ func FromPairs(pairs []Pair) (ValueCounts, error) {
 	valueCounts := New()
 
 	for _, item := range pairs {
-		if _, seen := valueCounts[item.Value]; seen {
-			return nil, fmt.Errorf("value occurs multiple times: %d", item.Value)
+		if _, seen := valueCounts.data[item.Value]; seen {
+			return ValueCounts{}, fmt.Errorf("value occurs multiple times: %d", item.Value)
 		}
-		valueCounts[item.Value] = item.Count
+		valueCounts.data[item.Value] = item.Count
 	}
 
 	return valueCounts, nil
diff --git a/internal/utils/valuecounts/value_counts_test.go b/internal/utils/valuecounts/value_counts_test.go
index e1e1f66b..5bcab0e5 100644
--- a/internal/utils/valuecounts/value_counts_test.go
+++ b/internal/utils/valuecounts/value_counts_test.go
@@ -7,7 +7,7 @@ import (
 	"github.com/ossf/package-analysis/internal/utils"
 )
 
-func TestCountData_ToValueCountPairs(t *testing.T) {
+func TestValueCounts_ToValueCountPairs(t *testing.T) {
 	tests := []struct {
 		name string
 		vc   ValueCounts
 		want []Pair
 	}{
 		{
 			"nil",
-			nil,
+			New(),
 			[]Pair{},
 		},
 		{
 			"empty",
-			ValueCounts{},
+			New(),
 			[]Pair{},
 		},
 		{
 			"single item",
-			ValueCounts{0: 1},
+			FromMap(map[int]int{0: 1}),
 			[]Pair{{0, 1}},
 		},
 		{
 			"multiple items",
-			ValueCounts{0: 1, 1: 2, 2: 3},
+			FromMap(map[int]int{0: 1, 1: 2, 2: 3}),
 			[]Pair{{0, 1}, {1, 2}, {2, 3}},
 		},
 	}
@@ -53,31 +53,31 @@ func TestFromValueCountPairs(t *testing.T) {
 		{
 			"nil",
 			nil,
-			ValueCounts{},
+			New(),
 			false,
 		},
 		{
 			"empty non-nil",
 			[]Pair{},
-			ValueCounts{},
+			New(),
 			false,
 		},
 		{
 			"single item",
 			[]Pair{{0, 1}},
-			ValueCounts{0: 1},
+			FromMap(map[int]int{0: 1}),
 			false,
 		},
 		{
 			"multiple items",
 			[]Pair{{0, 1}, {1, 2}},
-			ValueCounts{0: 1, 1: 2},
+			FromMap(map[int]int{0: 1, 1: 2}),
 			false,
 		},
 		{
 			"repeated items",
 			[]Pair{{0, 1}, {0, 1}},
-			nil,
+			New(),
 			true,
 		},
 	}
@@ -88,8 +88,11 @@ func TestFromValueCountPairs(t *testing.T) {
 				t.Errorf("FromPairs() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
+			if err != nil {
+				return
+			}
 			if !reflect.DeepEqual(got, tt.want) {
-				t.Errorf("FromPairs() got = %v, want %v", got, tt.want)
+				t.Errorf("FromPairs() got %v, want %v", got, tt.want)
 			}
 		})
 	}
@@ -104,25 +107,25 @@ func TestCountData_MarshalJSON(t *testing.T) {
 	}{
 		{
 			"nil",
-			nil,
+			ValueCounts{},
 			"[]",
 			false,
 		},
 		{
 			"empty",
-			ValueCounts{},
+			New(),
 			"[]",
 			false,
 		},
 		{
 			"single item",
-			ValueCounts{0: 1},
+			FromMap(map[int]int{0: 1}),
 			`[ {"value": 0, "count": 1} ]`,
 			false,
 		},
 		{
 			"multiple items",
-			ValueCounts{0: 1, 1: 2, 2: 3},
+			FromMap(map[int]int{0: 1, 1: 2, 2: 3}),
 			`[ {"value":0, "count": 1}, {"value": 1, "count": 2}, {"value": 2, "count": 3} ]`,
 			false,
 		},
@@ -139,7 +142,7 @@ func TestCountData_MarshalJSON(t *testing.T) {
 			if equal, err := utils.JSONEquals(gotBytes, []byte(tt.want)); err != nil {
 				t.Errorf("MarshalJSON() error decoding JSON: %v", err)
 			} else if !equal {
-				t.Errorf("MarshalJSON() got = %s, want %s", got, tt.want)
+				t.Errorf("MarshalJSON() got %s, want %s", got, tt.want)
 			}
 		})
 	}
@@ -155,25 +158,25 @@ func TestCountData_UnmarshalJSON(t *testing.T) {
 		{
 			"null",
 			"null",
-			ValueCounts{},
+			New(),
 			false,
 		},
 		{
 			"empty",
 			"[]",
-			ValueCounts{},
+			New(),
 			false,
 		},
 		{
 			"single item",
 			`[{"value": 0, "count": 1}]`,
-			ValueCounts{0: 1},
+			FromMap(map[int]int{0: 1}),
 			false,
 		},
 		{
 			"multiple items",
 			`[{"value":0,"count":1},{"value":1,"count":2},{"value":2,"count":3}]`,
-			ValueCounts{0: 1, 1: 2, 2: 3},
+			FromMap(map[int]int{0: 1, 1: 2, 2: 3}),
 			false,
 		},
 		// TODO: Add test cases.
@@ -192,3 +195,72 @@ func TestCountData_UnmarshalJSON(t *testing.T) {
 		})
 	}
 }
+
+func TestFromMap(t *testing.T) {
+	tests := []struct {
+		name string
+		data map[int]int
+		want ValueCounts
+	}{
+		{
+			"nil",
+			nil,
+			New(),
+		},
+		{
+			"empty",
+			map[int]int{},
+			New(),
+		},
+		{
+			"basic",
+			map[int]int{-1: 210, 10: 102, 0: 34, 3: 0},
+			ValueCounts{
+				data: map[int]int{-1: 210, 0: 34, 3: 0, 10: 102},
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := FromMap(tt.data); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("FromMap() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestCount(t *testing.T) {
+	tests := []struct {
+		name string
+		data []int
+		want ValueCounts
+	}{
+		{
+			"nil",
+			nil,
+			New(),
+		},
+		{
+			"empty",
+			[]int{},
+			New(),
+		},
+		{
+			"single",
+			[]int{1},
+			FromMap(map[int]int{1: 1}),
+		},
+		{
+			"multiple",
+			[]int{1, 2, 3, 4, 3, 2, 1, 2},
+			FromMap(map[int]int{1: 2, 2: 3, 3: 2, 4: 1}),
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := Count(tt.data); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("Count() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/scripts/format-static-analysis-json.py b/scripts/format-static-analysis-json.py
index be7d5267..877b9f9d 100755
--- a/scripts/format-static-analysis-json.py
+++ b/scripts/format-static-analysis-json.py
@@ -14,50 +14,30 @@
 # Changes JSON structs that are formatted like:
 # {
-#   "Name": "...",
-#   "Type": "..."
+#   "key1": ...,
+#   "key2": ...
 # }
 # into ones like
-# { "Name": "...", "Type": "..." }
-name_type_substitution = (
-    re.compile('{$\\n^\\s*"Name": ?"(.*)",$\\n^\\s*"Type": ?"(.*)"$\\n^\\s*}', re.MULTILINE),
-    '{ "Name": "\\1", "Type": "\\2" }'
+# { "key1": ..., "key2": ... }
+struct_pair_substitution = (
+    re.compile('{$\\n^\\s*"(.+)": ?(.*),$\\n^\\s*"(.+)": ?(.*)$\\n^\\s*}', re.MULTILINE),
+    '{ "\\1": \\2, "\\3": \\4 }'
 )
 
 # Changes JSON structs that are formatted like:
 # {
-#   "Value": ..., (may not be a string)
-#   "Raw": "..."
+#   "key1": ...,
+#   "key2": ...,
+#   "key3": ...
 # }
 # into ones like
-# { "Value": ..., "Raw": "..." }
-value_raw_substitution = (
-    re.compile('{$\\n^\\s*"Value": ?(.*),$\\n^\\s*"Raw": ?"(.*)"$\\n^\\s*}', re.MULTILINE),
-    '{ "Value": \\1, "Raw": "\\2" }'
+# { "key1": ..., "key2": ..., "key3": ... }
+struct_triple_substitution = (
+    re.compile('{$\\n^\\s*"(.+)": ?(.*),$\\n^\\s*"(.+)": ?(.*),$\\n^\\s*"(.+)": ?(.*)$\\n^\\s*}', re.MULTILINE),
+    '{ "\\1": \\2, "\\3": \\4, "\\5": \\6 }'
 )
 
-# Changes JSON arrays that are formatted like:
-# "Quartiles": [
-#   0.1762,
-#   1.3075,
-#   1.4424,
-#   1.4766,
-#   1.6646
-# ]
-# into ones like
-# "Quartiles": [ 0.1762, 1.3075, 1.4424, 1.4766, 1.6646 ]
-quartile_substitution = (
-    re.compile('"Quartiles": \\[$\\n'
-               '^\\s*(\\d+\\.?\\d*),$\\n'
-               '^\\s*(\\d+\\.?\\d*),$\\n'
-               '^\\s*(\\d+\\.?\\d*),$\\n'
-               '^\\s*(\\d+\\.?\\d*),$\\n'
-               '^\\s*(\\d+\\.?\\d*)$\\n'
-               '^\\s*]', re.MULTILINE),
-    '"Quartiles": [ \\1, \\2, \\3, \\4, \\5 ]'
-)
-
-all_substitutions = (name_type_substitution, value_raw_substitution, quartile_substitution)
+all_substitutions = (struct_pair_substitution, struct_triple_substitution)
 
 # Pretty prints a JSON object with newlines and indentation, then applies