Skip to content

Commit

Permalink
improve test coverage for static analysis GetBasicData and ComputeFil…
Browse files Browse the repository at this point in the history
…eSignals functions, fix bug in string entropy field population, encapsulate ValueCounts map and add more tests, minor refactoring (#843)

Signed-off-by: Max Fisher <[email protected]>
  • Loading branch information
maxfisher-g authored Aug 29, 2023
1 parent 149529b commit 039fb69
Show file tree
Hide file tree
Showing 15 changed files with 320 additions and 194 deletions.
15 changes: 5 additions & 10 deletions internal/staticanalysis/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ func enumeratePackageFiles(extractDir string) ([]string, error) {
return paths, err
}

// getPathInArchive converts a path under extractDir into the corresponding
// path relative to the archive root by stripping the leading
// "<extractDir><separator>" portion. Paths that do not start with that
// prefix are returned unchanged.
func getPathInArchive(path, extractDir string) string {
	dirPrefix := extractDir + string(os.PathSeparator)
	relative := strings.TrimPrefix(path, dirPrefix)
	return relative
}

/*
AnalyzePackageFiles walks a tree of extracted package files and runs the analysis tasks
listed in analysisTasks to produce the result data.
Expand Down Expand Up @@ -76,14 +72,13 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,

result := Result{}

archivePath := map[string]string{}
for _, path := range fileList {
archivePath[path] = getPathInArchive(path, extractDir)
getPathInArchive := func(absolutePath string) string {
return strings.TrimPrefix(absolutePath, extractDir+string(os.PathSeparator))
}

if runTask[Basic] {
log.Info("run basic analysis")
basicData, err := GetBasicData(fileList, archivePath)
basicData, err := GetBasicData(fileList, getPathInArchive)
if err != nil {
log.Error("static analysis error", log.Label("task", string(Basic)), "error", err)
} else {
Expand All @@ -101,8 +96,8 @@ func AnalyzePackageFiles(extractDir string, jsParserConfig parsing.ParserConfig,
log.Error("static analysis error", log.Label("task", string(Parsing)), "error", err)
} else {
// change absolute path in parsingResults to package-relative path
for _, parseResult := range parsingResults {
parseResult.Filename = archivePath[parseResult.Filename]
for i, r := range parsingResults {
parsingResults[i].Filename = getPathInArchive(r.Filename)
}
result.ParsingData = parsingResults
}
Expand Down
4 changes: 2 additions & 2 deletions internal/staticanalysis/basic_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ some files should not prevent the analysis of other files.
pathInArchive maps the absolute paths in fileList to relative paths
in the package archive, to use for results.
*/
func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPackageData, error) {
func GetBasicData(fileList []string, pathInArchive func(absolutePath string) string) (*BasicPackageData, error) {
// First, run file in batch processing mode to get all the file types at once.
// Then, file size, hash and line lengths can be done in a simple loop

Expand All @@ -124,7 +124,7 @@ func GetBasicData(fileList []string, pathInArchive map[string]string) (*BasicPac
}

for index, filePath := range fileList {
archivePath := pathInArchive[filePath]
archivePath := pathInArchive(filePath)
fileType := fileTypes[index]

var fileSize int64
Expand Down
101 changes: 72 additions & 29 deletions internal/staticanalysis/basic_data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,55 +4,98 @@ import (
"os"
"path/filepath"
"reflect"
"strings"
"testing"

"github.com/ossf/package-analysis/internal/utils"
"github.com/ossf/package-analysis/internal/utils/valuecounts"
)

func TestGetFileTypes(t *testing.T) {
testDir := t.TempDir()
fileName1 := filepath.Join(testDir, "test1.txt")
fileName2 := filepath.Join(testDir, "test2.txt")
// testFile describes one fixture file that TestGetBasicData writes to disk,
// together with the values the test expects to be reported for that file.
type testFile struct {
// filename is joined onto the test's temporary directory to form the path written to disk.
filename string
// contents holds the raw bytes written to the file; its length is the expected size.
contents []byte
// contentsHash is the expected hash string for the file (fixtures use a "sha256:" prefix).
contentsHash string
// fileType is the expected file type description.
fileType string
// lineLengths is the expected line-length counts for the file.
lineLengths valuecounts.ValueCounts
}

if err := os.WriteFile(fileName1, []byte("hello test 1!\n"), 0o666); err != nil {
t.Fatalf("failed to write test file 1: %v", err)
}
if err := os.WriteFile(fileName2, []byte("#! /bin/bash\necho 'Hello test 2'\n"), 0o666); err != nil {
t.Fatalf("failed to write test file 2: %v", err)
}
// testFiles is the shared pool of fixture files; individual test cases
// select entries from it by index.
var testFiles = []testFile{
// Single-line plain text file.
{
filename: "test1.txt",
contents: []byte("hello test 1!\n"),
contentsHash: "sha256:bd96959573979235b87180b0b7513c7f1d5cbf046b263f366f2f10fe1b966494",
fileType: "ASCII text",
lineLengths: valuecounts.Count([]int{13}),
},
// Two-line shell script starting with a shebang line.
{
filename: "test2.txt",
contents: []byte("#! /bin/bash\necho 'Hello test 2'\n"),
contentsHash: "sha256:6179db3c673ceddcdbd384116ae4d301d64e65fc2686db9ba64945677a5a893c",
fileType: "Bourne-Again shell script, ASCII text executable",
lineLengths: valuecounts.Count([]int{12, 19}),
},
}

func TestGetBasicData(t *testing.T) {
tests := []struct {
name string
fileList []string
want []string
wantErr bool
name string
files []testFile
wantErr bool
}{
{
name: "test no files",
fileList: []string{},
want: []string{},
wantErr: false,
name: "test no files",
files: nil,
wantErr: false,
},
{
name: "test one file",
fileList: []string{fileName1},
want: []string{"ASCII text"},
wantErr: false,
name: "test one file",
files: []testFile{testFiles[0]},
wantErr: false,
},
{
name: "test two files",
fileList: []string{fileName1, fileName2},
want: []string{"ASCII text", "Bourne-Again shell script, ASCII text executable"},
wantErr: false,
name: "test two files",
files: []testFile{testFiles[0], testFiles[1]},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := getFileTypes(tt.fileList)
testDir := t.TempDir()
paths := utils.Transform(tt.files, func(f testFile) string {
return filepath.Join(testDir, f.filename)
})

for i := range tt.files {
if err := os.WriteFile(paths[i], tt.files[i].contents, 0o666); err != nil {
t.Fatalf("failed to write test file %d: %v", i, err)
}
}

getArchivePath := func(absolutePath string) string {
return strings.TrimPrefix(absolutePath, testDir+string(os.PathSeparator))
}

got, err := GetBasicData(paths, getArchivePath)
if (err != nil) != tt.wantErr {
t.Errorf("getFileTypes() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("getFileTypes() got = %#v, want %#v", got, tt.want)

wantData := utils.Transform(tt.files, func(f testFile) BasicFileData {
return BasicFileData{
Filename: f.filename,
FileType: f.fileType,
Size: int64(len(f.contents)),
Hash: f.contentsHash,
LineLengths: f.lineLengths,
}
})

gotData := got.Files

if !reflect.DeepEqual(gotData, wantData) {
t.Errorf("TestGetBasicData() data mismatch:\n"+
"== got == \n%v\n== want ==\n%v", got, wantData)
}
})
}
Expand Down
43 changes: 39 additions & 4 deletions internal/staticanalysis/obfuscation/file_signals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
FloatLiterals: []token.Float{},
},
expectedSignals: FileSignals{
StringLengths: valuecounts.ValueCounts{5: 1},
IdentifierLengths: valuecounts.ValueCounts{1: 1},
StringLengths: valuecounts.Count([]int{5}),
IdentifierLengths: valuecounts.Count([]int{1}),
SuspiciousIdentifiers: []SuspiciousIdentifier{{Name: "a", Rule: "single"}},
EscapedStrings: []EscapedString{},
Base64Strings: []string{},
Expand Down Expand Up @@ -61,8 +61,8 @@ var fileSignalsTestCases = []fileSignalsTestCase{
FloatLiterals: []token.Float{},
},
expectedSignals: FileSignals{
StringLengths: valuecounts.ValueCounts{5: 2},
IdentifierLengths: valuecounts.ValueCounts{1: 3, 4: 1},
StringLengths: valuecounts.Count([]int{5, 5}),
IdentifierLengths: valuecounts.Count([]int{4, 1, 1, 1}),
SuspiciousIdentifiers: []SuspiciousIdentifier{
{Name: "a", Rule: "single"},
{Name: "b", Rule: "single"},
Expand All @@ -76,6 +76,41 @@ var fileSignalsTestCases = []fileSignalsTestCase{
URLs: []string{},
},
},
{
name: "one of everything",
parseData: parsing.SingleResult{
Identifiers: []token.Identifier{
{Name: "_0x12414124", Type: token.Variable},
{Name: "a", Type: token.Parameter},
{Name: "d1912931", Type: token.Parameter},
},
StringLiterals: []token.String{
{Value: "[email protected]", Raw: `"[email protected]"`},
{Value: "https://this.is.a.website.com", Raw: `"https://this.is.a.website.com"`},
{Value: "aGVsbG8gd29ybGQK", Raw: `"aGVsbG8gd29ybGQK"`},
{Value: "8.8.8.8", Raw: `"8.8.8.8"`},
{Value: "e3fc:234a:2341::abcd", Raw: `"e3fc:234a:2341::abcd"`},
{Value: "0x21323492394", Raw: `"0x21323492394"`},
},
IntLiterals: []token.Int{},
FloatLiterals: []token.Float{},
},
expectedSignals: FileSignals{
IdentifierLengths: valuecounts.Count([]int{11, 1, 8}),
StringLengths: valuecounts.Count([]int{14, 29, 16, 7, 20, 13}),
SuspiciousIdentifiers: []SuspiciousIdentifier{
{Name: "_0x12414124", Rule: "hex"},
{Name: "a", Rule: "single"},
{Name: "d1912931", Rule: "numeric"},
},
EscapedStrings: []EscapedString{},
Base64Strings: []string{"aGVsbG8gd29ybGQK"},
EmailAddresses: []string{"[email protected]"},
HexStrings: []string{"21323492394"},
IPAddresses: []string{"8.8.8.8", "e3fc:234a:2341::abcd"},
URLs: []string{"https://this.is.a.website.com"},
},
},
}

func TestComputeSignals(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"golang.org/x/exp/slices"

"github.com/ossf/package-analysis/internal/utils"
"github.com/ossf/package-analysis/internal/utils/valuecounts"
)

type RealNumber interface {
Expand Down Expand Up @@ -209,11 +208,3 @@ func Summarise[T RealNumber](sample []T) SampleStatistics {
q := quartiles(sample)
return SampleStatistics{Size: l, Mean: m, Variance: v, Skewness: s, Quartiles: q}
}

// CountDistinct tallies how many times each distinct value occurs in sample.
// The returned counts are keyed by value; when sample has no elements the
// result is the counts object exactly as produced by valuecounts.New.
func CountDistinct(sample []int) valuecounts.ValueCounts {
	tally := valuecounts.New()
	for i := range sample {
		tally[sample[i]]++
	}
	return tally
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@ package stats

import (
"math"
"reflect"
"testing"

"github.com/ossf/package-analysis/internal/utils/valuecounts"
)

func TestSummary(t *testing.T) {
Expand Down Expand Up @@ -115,30 +112,3 @@ func TestSummary7(t *testing.T) {
t.Errorf("Expected summary: %v\nactual summary %v\n", expected, actual)
}
}

func TestCountDistinct1(t *testing.T) {
data := []int{1, 2, 3, 4, 3, 2, 1, 2}
actual := CountDistinct(data)
expected := valuecounts.ValueCounts{1: 2, 2: 3, 3: 2, 4: 1}
if !reflect.DeepEqual(actual, expected) {
t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
}
}

func TestCountDistinct2(t *testing.T) {
data := []int{1}
actual := CountDistinct(data)
expected := valuecounts.ValueCounts{1: 1}
if !reflect.DeepEqual(actual, expected) {
t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
}
}

func TestCountDistinct3(t *testing.T) {
data := []int{}
actual := CountDistinct(data)
expected := valuecounts.ValueCounts{}
if !reflect.DeepEqual(actual, expected) {
t.Errorf("Expected counts: %v\nactual counts %v\n", expected, actual)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
)

/*
CalculateEntropy calculates entropy of a string S of characters over an alphabet A, which is defined as
Calculate finds the entropy of a string S of characters over an alphabet A, which is defined as
E(S) = - sum(i in A) { (p(i)) * log(p(i)) },
Expand All @@ -24,7 +24,7 @@ the entropy approaches 0.
Reference: https://link.springer.com/chapter/10.1007/978-3-642-10509-8_19
*/
func CalculateEntropy(s string, prob map[rune]float64) float64 {
func Calculate(s string, prob map[rune]float64) float64 {
if len(s) == 0 {
return 0
}
Expand All @@ -46,7 +46,7 @@ func CalculateEntropy(s string, prob map[rune]float64) float64 {
}

/*
CalculateNormalisedEntropy returns the string entropy normalised by the log of the length of the string.
CalculateNormalised returns the string entropy normalised by the log of the length of the string.
This quantity is used because log(N) is the maximum possible entropy out of all strings of length N,
where N > 0. Special cases are empty strings (0) and single character strings (1).
As a formula:
Expand All @@ -59,15 +59,15 @@ As a formula:
*/
// TODO does this make sense when a general probability structure is used?
// TODO calculate max string entropy for a given set of character counts.
func CalculateNormalisedEntropy(s string, prob map[rune]float64) float64 {
func CalculateNormalised(s string, prob map[rune]float64) float64 {
length := utf8.RuneCountInString(s)
switch length {
case 0:
return 0
case 1:
return 1
default:
return CalculateEntropy(s, prob) / math.Log(float64(length))
return Calculate(s, prob) / math.Log(float64(length))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func TestStringEntropy(t *testing.T) {
{"aaA", -2*2.0/3.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)/3.0},
}
for index, test := range testCases {
actual := CalculateEntropy(test.s, nil)
actual := Calculate(test.s, nil)
if !utils.FloatEquals(test.expected, actual, tolerance) {
t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
index+1, test.s, test.expected, actual)
Expand Down Expand Up @@ -53,7 +53,7 @@ func TestStringEntropyWithFixedProbs(t *testing.T) {
{" a \n", -a * math.Log(a)},
}
for index, test := range testCases {
actual := CalculateEntropy(test.s, probs)
actual := Calculate(test.s, probs)
if !utils.FloatEquals(test.expected, actual, tolerance) {
t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
index+1, test.s, test.expected, actual)
Expand All @@ -73,7 +73,7 @@ func TestStringEntropyRatio(t *testing.T) {
{"aaA", (-2*2.0*math.Log(2.0/3.0) - math.Log(1.0/3.0)) / (3.0 * math.Log(3))},
}
for index, test := range testCases {
actual := CalculateNormalisedEntropy(test.s, nil)
actual := CalculateNormalised(test.s, nil)
if !utils.FloatEquals(test.expected, actual, tolerance) {
t.Errorf("Test case %d failed (str: %s, expected: %f, actual: %f\n",
index+1, test.s, test.expected, actual)
Expand Down
Loading

0 comments on commit 039fb69

Please sign in to comment.