improve test coverage for static analysis GetBasicData and ComputeFileSignals functions, fix bug in string entropy field population, encapsulate ValueCounts map and add more tests, minor refactoring (#843)

Signed-off-by: Max Fisher <[email protected]>
ossf · Aug 29, 2023 · 039fb69 · 039fb69
1 parent 149529b
commit 039fb69
Show file tree

Hide file tree

Showing 15 changed files with 320 additions and 194 deletions.
diff --git a/internal/staticanalysis/analyze.go b/internal/staticanalysis/analyze.go
@@ -30,10 +30,6 @@ func enumeratePackageFiles(extractDir string) ([]string, error) {
 	return paths, err
 }
 
-func getPathInArchive(path, extractDir string) string {
-	return strings.TrimPrefix(path, extractDir+string(os.PathSeparator))
-}
-
 /*
 AnalyzePackageFiles walks a tree of extracted package files and runs the analysis tasks
 listed in analysisTasks to produce the result data.
@@ -76,14 +72,13 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,
 
 	result := Result{}
 
-	archivePath := map[string]string{}
-	for _, path := range fileList {
-		archivePath[path] = getPathInArchive(path, extractDir)
+	getPathInArchive := func(absolutePath string) string {
+		return strings.TrimPrefix(absolutePath, extractDir+string(os.PathSeparator))
 	}
 
 	if runTask[Basic] {
 		log.Info("run basic analysis")
-		basicData, err := GetBasicData(fileList, archivePath)
+		basicData, err := GetBasicData(fileList, getPathInArchive)
 		if err != nil {
 			log.Error("static analysis error", log.Label("task", string(Basic)), "error", err)
 		} else {
@@ -101,8 +96,8 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,
 			log.Error("static analysis error", log.Label("task", string(Parsing)), "error", err)
 		} else {
 			// change absolute path in parsingResults to package-relative path
-			for _, parseResult := range parsingResults {
-				parseResult.Filename = archivePath[parseResult.Filename]
+			for i, r := range parsingResults {
+				parsingResults[i].Filename = getPathInArchive(r.Filename)
 			}
 			result.ParsingData = parsingResults
 		}

diff --git a/internal/staticanalysis/basic_data.go b/internal/staticanalysis/basic_data.go
@@ -107,7 +107,7 @@ some files should not prevent the analysis of other files.
 pathInArchive maps the absolute paths in fileList to relative paths
 in the package archive, to use for results.
 */
-func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPackageData, error) {
+func GetBasicData(fileList []string, pathInArchive func(absolutePath string) string) (*BasicPackageData, error) {
 	// First, run file in batch processing mode to get all the file types at once.
 	// Then, file size, hash and line lengths can be done in a simple loop
 
@@ -124,7 +124,7 @@ func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPac
 	}
 
 	for index, filePath := range fileList {
-		archivePath := pathInArchive[filePath]
+		archivePath := pathInArchive(filePath)
 		fileType := fileTypes[index]
 
 		var fileSize int64

diff --git a/internal/staticanalysis/basic_data_test.go b/internal/staticanalysis/basic_data_test.go
@@ -4,55 +4,98 @@ import (
 	"os"
 	"path/filepath"
 	"reflect"
+	"strings"
 	"testing"
+
+	"github.com/ossf/package-analysis/internal/utils"
+	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
-func TestGetFileTypes(t *testing.T) {
-	testDir := t.TempDir()
-	fileName1 := filepath.Join(testDir, "test1.txt")
-	fileName2 := filepath.Join(testDir, "test2.txt")
+type testFile struct {
+	filename     string
+	contents     []byte
+	contentsHash string
+	fileType     string
+	lineLengths  valuecounts.ValueCounts
+}
 
-	if err := os.WriteFile(fileName1, []byte("hello test 1!\n"), 0o666); err != nil {
-		t.Fatalf("failed to write test file 1: %v", err)
-	}
-	if err := os.WriteFile(fileName2, []byte("#! /bin/bash\necho 'Hello test 2'\n"), 0o666); err != nil {
-		t.Fatalf("failed to write test file 2: %v", err)
-	}
+var testFiles = []testFile{
+	{
+		filename:     "test1.txt",
+		contents:     []byte("hello test 1!\n"),
+		contentsHash: "sha256:bd96959573979235b87180b0b7513c7f1d5cbf046b263f366f2f10fe1b966494",
+		fileType:     "ASCII text",
+		lineLengths:  valuecounts.Count([]int{13}),
+	},
+	{
+		filename:     "test2.txt",
+		contents:     []byte("#! /bin/bash\necho 'Hello test 2'\n"),
+		contentsHash: "sha256:6179db3c673ceddcdbd384116ae4d301d64e65fc2686db9ba64945677a5a893c",
+		fileType:     "Bourne-Again shell script, ASCII text executable",
+		lineLengths:  valuecounts.Count([]int{12, 19}),
+	},
+}
 
+func TestGetBasicData(t *testing.T) {
 	tests := []struct {
-		name     string
-		fileList []string
-		want     []string
-		wantErr  bool
+		name    string
+		files   []testFile
+		wantErr bool
 	}{
 		{
-			name:     "test no files",
-			fileList: []string{},
-			want:     []string{},
-			wantErr:  false,
+			name:    "test no files",
+			files:   nil,
+			wantErr: false,
 		},
 		{
-			name:     "test one file",
-			fileList: []string{fileName1},
-			want:     []string{"ASCII text"},
-			wantErr:  false,
+			name:    "test one file",
+			files:   []testFile{testFiles[0]},
+			wantErr: false,
 		},
 		{
-			name:     "test two files",
-			fileList: []string{fileName1, fileName2},
-			want:     []string{"ASCII text", "Bourne-Again shell script, ASCII text executable"},
-			wantErr:  false,
+			name:    "test two files",
+			files:   []testFile{testFiles[0], testFiles[1]},
+			wantErr: false,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := getFileTypes(tt.fileList)
+			testDir := t.TempDir()
+			paths := utils.Transform(tt.files, func(f testFile) string {
+				return filepath.Join(testDir, f.filename)
+			})
+
+			for i := range tt.files {
+				if err := os.WriteFile(paths[i], tt.files[i].contents, 0o666); err != nil {
+					t.Fatalf("failed to write test file %d: %v", i, err)
+				}
+			}
+
+			getArchivePath := func(absolutePath string) string {
+				return strings.TrimPrefix(absolutePath, testDir+string(os.PathSeparator))
+			}
+
+			got, err := GetBasicData(paths, getArchivePath)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("getFileTypes() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
-			if !reflect.DeepEqual(got, tt.want) {
-				t.Errorf("getFileTypes() got = %#v, want %#v", got, tt.want)
+
+			wantData := utils.Transform(tt.files, func(f testFile) BasicFileData {
+				return BasicFileData{
+					Filename:    f.filename,
+					FileType:    f.fileType,
+					Size:        int64(len(f.contents)),
+					Hash:        f.contentsHash,
+					LineLengths: f.lineLengths,
+				}
+			})
+
+			gotData := got.Files
+
+			if !reflect.DeepEqual(gotData, wantData) {
+				t.Errorf("TestGetBasicData() data mismatch:\n"+
+					"== got == \n%v\n== want ==\n%v", got, wantData)
 			}
 		})
 	}

diff --git a/internal/staticanalysis/obfuscation/file_signals_test.go b/internal/staticanalysis/obfuscation/file_signals_test.go
@@ -29,8 +29,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			FloatLiterals: []token.Float{},
 		},
 		expectedSignals: FileSignals{
-			StringLengths:         valuecounts.ValueCounts{5: 1},
-			IdentifierLengths:     valuecounts.ValueCounts{1: 1},
+			StringLengths:         valuecounts.Count([]int{5}),
+			IdentifierLengths:     valuecounts.Count([]int{1}),
 			SuspiciousIdentifiers: []SuspiciousIdentifier{{Name: "a", Rule: "single"}},
 			EscapedStrings:        []EscapedString{},
 			Base64Strings:         []string{},
@@ -61,8 +61,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			FloatLiterals: []token.Float{},
 		},
 		expectedSignals: FileSignals{
-			StringLengths:     valuecounts.ValueCounts{5: 2},
-			IdentifierLengths: valuecounts.ValueCounts{1: 3, 4: 1},
+			StringLengths:     valuecounts.Count([]int{5, 5}),
+			IdentifierLengths: valuecounts.Count([]int{4, 1, 1, 1}),
 			SuspiciousIdentifiers: []SuspiciousIdentifier{
 				{Name: "a", Rule: "single"},
 				{Name: "b", Rule: "single"},
@@ -76,6 +76,41 @@ var fileSignalsTestCases = []fileSignalsTestCase{
 			URLs:           []string{},
 		},
 	},
+	{
+		name: "one of everything",
+		parseData: parsing.SingleResult{
+			Identifiers: []token.Identifier{
+				{Name: "_0x12414124", Type: token.Variable},
+				{Name: "a", Type: token.Parameter},
+				{Name: "d1912931", Type: token.Parameter},
+			},
+			StringLiterals: []token.String{
+				{Value: "[email protected]", Raw: `"[email protected]"`},
+				{Value: "https://this.is.a.website.com", Raw: `"https://this.is.a.website.com"`},
+				{Value: "aGVsbG8gd29ybGQK", Raw: `"aGVsbG8gd29ybGQK"`},
+				{Value: "8.8.8.8", Raw: `"8.8.8.8"`},
+				{Value: "e3fc:234a:2341::abcd", Raw: `"e3fc:234a:2341::abcd"`},
+				{Value: "0x21323492394", Raw: `"0x21323492394"`},
+			},
+			IntLiterals:   []token.Int{},
+			FloatLiterals: []token.Float{},
+		},
+		expectedSignals: FileSignals{
+			IdentifierLengths: valuecounts.Count([]int{11, 1, 8}),
+			StringLengths:     valuecounts.Count([]int{14, 29, 16, 7, 20, 13}),
+			SuspiciousIdentifiers: []SuspiciousIdentifier{
+				{Name: "_0x12414124", Rule: "hex"},
+				{Name: "a", Rule: "single"},
+				{Name: "d1912931", Rule: "numeric"},
+			},
+			EscapedStrings: []EscapedString{},
+			Base64Strings:  []string{"aGVsbG8gd29ybGQK"},
+			EmailAddresses: []string{"[email protected]"},
+			HexStrings:     []string{"21323492394"},
+			IPAddresses:    []string{"8.8.8.8", "e3fc:234a:2341::abcd"},
+			URLs:           []string{"https://this.is.a.website.com"},
+		},
+	},
 }
 
 func TestComputeSignals(t *testing.T) {

diff --git a/internal/staticanalysis/obfuscation/stats/sample_statistics.go b/internal/staticanalysis/obfuscation/stats/sample_statistics.go
@@ -8,7 +8,6 @@ import (
 	"golang.org/x/exp/slices"
 
 	"github.com/ossf/package-analysis/internal/utils"
-	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
 type RealNumber interface {
@@ -209,11 +208,3 @@ func Summarise[T RealNumber](sample []T) SampleStatistics {
 	q := quartiles(sample)
 	return SampleStatistics{Size: l, Mean: m, Variance: v, Skewness: s, Quartiles: q}
 }
-
-func CountDistinct(sample []int) valuecounts.ValueCounts {
-	counts := valuecounts.New()
-	for _, t := range sample {
-		counts[t] += 1
-	}
-	return counts
-}
diff --git a/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go b/internal/staticanalysis/obfuscation/stats/sample_statistics_test.go
@@ -2,10 +2,7 @@ package stats
 
 import (
 	"math"
-	"reflect"
 	"testing"
-
-	"github.com/ossf/package-analysis/internal/utils/valuecounts"
 )
 
 func TestSummary(t *testing.T) {
@@ -115,30 +112,3 @@ func TestSummary7(t *testing.T) {
 		t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
 	}
 }
-
-func TestCountDistinct1(t *testing.T) {
-	data := []int{1, 2, 3, 4, 3, 2, 1, 2}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{1: 2, 2: 3, 3: 2, 4: 1}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
-
-func TestCountDistinct2(t *testing.T) {
-	data := []int{1}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{1: 1}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
-
-func TestCountDistinct3(t *testing.T) {
-	data := []int{}
-	actual := CountDistinct(data)
-	expected := valuecounts.ValueCounts{}
-	if !reflect.DeepEqual(actual, expected) {
-		t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
-	}
-}
diff --git a/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go b/internal/staticanalysis/obfuscation/stringentropy/string_entropy.go
@@ -6,7 +6,7 @@ import (
 )
 
 /*
-CalculateEntropy calculates entropy of a string S of characters over an alphabet A, which is defined as
+Calculate finds the entropy of a string S of characters over an alphabet A, which is defined as
 
 	E(S) = - sum(i in A) { (p(i)) * log(p(i)) },
 
@@ -24,7 +24,7 @@ the entropy approaches 0.
 
 Reference: https://link.springer.com/chapter/10.1007/978-3-642-10509-8_19
 */
-func CalculateEntropy(s string, prob map[rune]float64) float64 {
+func Calculate(s string, prob map[rune]float64) float64 {
 	if len(s) == 0 {
 		return 0
 	}
@@ -46,7 +46,7 @@ func CalculateEntropy(s string, prob map[rune]float64) float64 {
 }
 
 /*
-CalculateNormalisedEntropy returns the string entropy normalised by the log of the length of the string.
+CalculateNormalised returns the string entropy normalised by the log of the length of the string.
 This quantity is used because log(N) is the maximum possible entropy out of all strings with length N,
 where N > 0. Special cases are empty strings (0) and single character strings (1).
 As a formula:
@@ -59,15 +59,15 @@ As a formula:
 */
 // TODO does this make sense when a general probability structure is used?
 // TODO calculate max string entropy for a given set of character counts.
-func CalculateNormalisedEntropy(s string, prob map[rune]float64) float64 {
+func CalculateNormalised(s string, prob map[rune]float64) float64 {
 	length := utf8.RuneCountInString(s)
 	switch length {
 	case 0:
 		return 0
 	case 1:
 		return 1
 	default:
-		return CalculateEntropy(s, prob) / math.Log(float64(length))
+		return Calculate(s, prob) / math.Log(float64(length))
 	}
 }
 

diff --git a/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go b/internal/staticanalysis/obfuscation/stringentropy/string_entropy_test.go
@@ -24,7 +24,7 @@ func TestStringEntropy(t *testing.T) {
 		{"aaA", -2*2.0/3.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)/3.0},
 	}
 	for index, test := range testCases {
-		actual := CalculateEntropy(test.s, nil)
+		actual := Calculate(test.s, nil)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)
@@ -53,7 +53,7 @@ func TestStringEntropyWithFixedProbs(t *testing.T) {
 		{" a \n", -a * math.Log(a)},
 	}
 	for index, test := range testCases {
-		actual := CalculateEntropy(test.s, probs)
+		actual := Calculate(test.s, probs)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)
@@ -73,7 +73,7 @@ func TestStringEntropyRatio(t *testing.T) {
 		{"aaA", (-2*2.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)) / (3.0 * math.Log(3))},
 	}
 	for index, test := range testCases {
-		actual := CalculateNormalisedEntropy(test.s, nil)
+		actual := CalculateNormalised(test.s, nil)
 		if !utils.FloatEquals(test.expected, actual, tolerance) {
 			t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
 				index+1, test.s, test.expected, actual)