aquasecurity · namandf · Aug 7, 2023 · Aug 7, 2023 · Aug 8, 2023 · Aug 8, 2023
diff --git a/go.mod b/go.mod
@@ -8,6 +8,8 @@ require (
 	github.com/hashicorp/go-retryablehttp v0.7.2
 	github.com/spf13/cobra v1.6.1
 	github.com/stretchr/testify v1.8.1
+	github.com/vifraa/gopom v0.2.2
+	golang.org/x/net v0.0.0-20201021035429-f5854403a974
 	golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9
 	golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2
 	k8s.io/utils v0.0.0-20230115233650-391b47cb4029
@@ -32,8 +34,8 @@ require (
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	golang.org/x/mod v0.3.0 // indirect
-	golang.org/x/net v0.0.0-20201021035429-f5854403a974 // indirect
 	golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect
+	golang.org/x/text v0.3.3 // indirect
 	golang.org/x/tools v0.0.0-20201124115921-2c860bdd6e78 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	lukechampine.com/uint128 v1.2.0 // indirect

diff --git a/go.sum b/go.sum
@@ -53,10 +53,13 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/vifraa/gopom v0.2.2 h1:zrqoCUVIplcsETouv3xxHPvfI/WV1GUPrdX2+Diahzo=
+github.com/vifraa/gopom v0.2.2/go.mod h1:oPa1dcrGrtlO37WPDBm5SqHAT+wTgF8An1Q71Z6Vv4o=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@@ -81,6 +84,7 @@ golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=

diff --git a/pkg/builder/builder.go b/pkg/builder/builder.go
@@ -56,6 +56,7 @@ func (b *Builder) Build(cacheDir string) error {
 				Version:     ver.Version,
 				SHA1:        ver.SHA1,
 				ArchiveType: index.ArchiveType,
+				License:     ver.License,
 			})
 		}
 		bar.Increment()

diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
@@ -5,17 +5,20 @@ import (
 	"encoding/hex"
 	"encoding/xml"
 	"fmt"
-	"github.com/aquasecurity/trivy-java-db/pkg/fileutil"
-	"github.com/aquasecurity/trivy-java-db/pkg/types"
 	"io"
 	"log"
 	"net/http"
 	"path/filepath"
 	"strings"
 	"sync"
 
+	"github.com/aquasecurity/trivy-java-db/pkg/fileutil"
+	"github.com/aquasecurity/trivy-java-db/pkg/types"
+
 	"github.com/PuerkitoBio/goquery"
 	"github.com/hashicorp/go-retryablehttp"
+	"github.com/vifraa/gopom"
+	"golang.org/x/net/html/charset"
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/xerrors"
 )
@@ -176,9 +179,19 @@ func (c *Crawler) crawlSHA1(baseURL string, meta *Metadata) error {
 			return err
 		}
 		if len(sha1) != 0 {
+			// fetch licenses
+			pomFileName := fmt.Sprintf("/%s-%s.pom", meta.ArtifactID, version)
+			pomURL := baseURL + version + pomFileName
+			pomLicense, err := c.fetchPOMLicense(pomURL)
+			if err != nil {
+				// TODO: Check if we can change this log to a warning or ignore it
+				log.Println(err)
+			}
+
 			v := Version{
 				Version: version,
 				SHA1:    sha1,
+				License: pomLicense,
 			}
 			versions = append(versions, v)
 		}
@@ -262,3 +275,36 @@ func (c *Crawler) fetchSHA1(url string) ([]byte, error) {
 	}
 	return sha1b, nil
 }
+
+// fetchPOMLicense downloads the POM at url and returns its license names
+// joined with ",". It returns "" when the POM is missing (404) or declares no
+// licenses, and an error when the request or the XML decoding fails.
+func (c *Crawler) fetchPOMLicense(url string) (string, error) {
+	resp, err := c.http.Get(url)
+	// Check err before touching resp: resp may be nil when the request failed.
+	if err != nil {
+		return "", xerrors.Errorf("can't get pom xml from %s: %w", url, err)
+	}
+	defer resp.Body.Close() // registered before the 404 return so the body is always closed
+	if resp.StatusCode == http.StatusNotFound {
+		return "", nil
+	}
+
+	var pomProject gopom.Project
+	decoder := xml.NewDecoder(resp.Body)
+	// Let the decoder handle POMs whose XML declaration names a non-UTF-8 charset.
+	decoder.CharsetReader = charset.NewReaderLabel
+	if err = decoder.Decode(&pomProject); err != nil {
+		return "", xerrors.Errorf("can't parse pom xml from %s: %w", url, err)
+	}
+	if len(pomProject.Licenses) == 0 {
+		return "", nil
+	}
+
+	licenses := make([]string, 0, len(pomProject.Licenses))
+	for _, l := range pomProject.Licenses {
+		licenses = append(licenses, l.Name)
+	}
+	// TODO: Check if we can limit the length of license string i.e trim and save
+	return strings.Join(licenses, ","), nil
+}
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
@@ -2,13 +2,14 @@ package crawler_test
 
 import (
 	"context"
-	"github.com/stretchr/testify/assert"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"testing"
 
+	"github.com/stretchr/testify/assert"
+
 	"github.com/aquasecurity/trivy-java-db/pkg/crawler"
 )
 
@@ -27,8 +28,11 @@ func TestCrawl(t *testing.T) {
 				"/maven2/abbot/abbot/":                   "testdata/abbot_abbot.html",
 				"/maven2/abbot/abbot/maven-metadata.xml": "testdata/maven-metadata.xml",
 				"/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar.sha1": "testdata/abbot-0.12.3.jar.sha1",
+				"/maven2/abbot/abbot/0.12.3/abbot-0.12.3.pom":      "testdata/abbot-0.12.3.pom",
 				"/maven2/abbot/abbot/0.13.0/abbot-0.13.0.jar.sha1": "testdata/abbot-0.13.0.jar.sha1",
+				"/maven2/abbot/abbot/0.13.0/abbot-0.13.0.pom":      "testdata/abbot-0.13.0.pom",
 				"/maven2/abbot/abbot/1.4.0/abbot-1.4.0.jar.sha1":   "testdata/abbot-1.4.0.jar.sha1",
+				"/maven2/abbot/abbot/1.4.0/abbot-1.4.0.pom":        "testdata/abbot-1.4.0.pom",
 			},
 			goldenPath: "testdata/golden/abbot.json",
 			filePath:   "indexes/abbot/abbot.json",

diff --git a/pkg/crawler/testdata/abbot-0.12.3.pom b/pkg/crawler/testdata/abbot-0.12.3.pom
@@ -0,0 +1 @@
+invalid pom
diff --git a/pkg/crawler/testdata/abbot-0.13.0.pom b/pkg/crawler/testdata/abbot-0.13.0.pom
@@ -0,0 +1 @@
+invalid pom
diff --git a/pkg/crawler/testdata/abbot-1.4.0.pom b/pkg/crawler/testdata/abbot-1.4.0.pom
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>abbot</groupId>
+    <artifactId>abbot</artifactId>
+    <version>1.4.0</version>
+    <name>Abbot Java GUI Test Library</name>
+    <description>Abbot provides a wrapper around java.awt.Robot to make testing AWT and Swing Applications easier</description>
+    <url>http://abbot.sf.net/</url>
+
+
+    <licenses>
+        <license>
+            <name>EPL</name>
+            <url>https://www.eclipse.org/legal/epl-v10.html</url>
+        </license>
+    </licenses>
+
+
+ <developers>
+    <developer>
+      <name>Gerard Davisonr</name>
+      <email>[email protected]</email>
+      <organization>Oralce</organization>
+      <organizationUrl>http://www.oracle.com</organizationUrl>
+    </developer>
+  </developers>
+
+
+<scm>
+  <connection>scm:svn://svn.code.sf.net/p/abbot/svn/trunkabbot/trunk/</connection>
+  <developerConnection>scm:svn://svn.code.sf.net/p/abbot/svn/trunkabbot/trunk/</developerConnection>
+  <url>http://sourceforge.net/p/abbot/svn/HEAD/tree/abbot/trunk/</url>
+</scm>
+
+     <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.8.2</version>
+        </dependency>
+    </dependencies>
+
+
+
+</project>
diff --git a/pkg/crawler/testdata/golden/abbot.json b/pkg/crawler/testdata/golden/abbot.json
@@ -4,15 +4,18 @@
   "Versions": [
     {
       "Version": "0.12.3",
-      "SHA1": "UdKKJ9kZzoaQpA9PM1udWRzrFuk="
+      "SHA1": "UdKKJ9kZzoaQpA9PM1udWRzrFuk=",
+      "License": ""
     },
     {
       "Version": "0.13.0",
-      "SHA1": "WW2R5nYxsN6wX7aF2NG2c18+T2A="
+      "SHA1": "WW2R5nYxsN6wX7aF2NG2c18+T2A=",
+      "License": ""
     },
     {
       "Version": "1.4.0",
-      "SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM="
+      "SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM=",
+      "License": "EPL"
     }
   ],
   "ArchiveType": "jar"

diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
@@ -24,4 +24,5 @@ type Index struct {
 type Version struct {
 	Version string
 	SHA1    []byte
+	License string // license names from the artifact's POM joined with ","; empty when none was obtained
 }
diff --git a/pkg/db/db.go b/pkg/db/db.go
@@ -59,7 +59,7 @@ func (db *DB) Init() error {
 	if _, err := db.client.Exec("CREATE TABLE artifacts(id INTEGER PRIMARY KEY, group_id TEXT, artifact_id TEXT)"); err != nil {
 		return xerrors.Errorf("unable to create 'artifacts' table: %w", err)
 	}
-	if _, err := db.client.Exec("CREATE TABLE indices(artifact_id INTEGER, version TEXT, sha1 BLOB, archive_type TEXT, foreign key (artifact_id) references artifacts(id))"); err != nil {
+	if _, err := db.client.Exec("CREATE TABLE indices(artifact_id INTEGER, version TEXT, sha1 BLOB, archive_type TEXT, license TEXT, foreign key (artifact_id) references artifacts(id))"); err != nil {
 		return xerrors.Errorf("unable to create 'indices' table: %w", err)
 	}
 
@@ -110,13 +110,13 @@ func (db *DB) InsertIndexes(indexes []types.Index) error {
 
 	for _, index := range indexes {
 		_, err = tx.Exec(`
-			INSERT INTO indices(artifact_id, version, sha1, archive_type)
+			INSERT INTO indices(artifact_id, version, sha1, archive_type, license)
 			VALUES (
 			        (SELECT id FROM artifacts 
 			            WHERE group_id=? AND artifact_id=?), 
-			        ?, ?, ?
+			        ?, ?, ?, ? 
 			) ON CONFLICT(sha1) DO NOTHING`,
-			index.GroupID, index.ArtifactID, index.Version, index.SHA1, index.ArchiveType)
+			index.GroupID, index.ArtifactID, index.Version, index.SHA1, index.ArchiveType, index.License)
 		if err != nil {
 			return xerrors.Errorf("unable to insert to 'indices' table: %w", err)
 		}
@@ -147,12 +147,12 @@ func (db *DB) SelectIndexBySha1(sha1 string) (types.Index, error) {
 		return index, xerrors.Errorf("sha1 decode error: %w", err)
 	}
 	row := db.client.QueryRow(`
-		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type 
+		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type, i.license
 		FROM indices i
 		JOIN artifacts a ON a.id = i.artifact_id
         WHERE i.sha1 = ?`,
 		sha1b)
-	err = row.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType)
+	err = row.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType, &index.License)
 	if err != nil && !errors.Is(err, sql.ErrNoRows) {
 		return index, xerrors.Errorf("select index error: %w", err)
 	}
@@ -162,12 +162,12 @@ func (db *DB) SelectIndexBySha1(sha1 string) (types.Index, error) {
 func (db *DB) SelectIndexByArtifactIDAndGroupID(artifactID, groupID string) (types.Index, error) {
 	var index types.Index
 	row := db.client.QueryRow(`
-		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type
+		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type, i.license
 		FROM indices i 
 		JOIN artifacts a ON a.id = i.artifact_id
         WHERE a.group_id = ? AND a.artifact_id = ?`,
 		groupID, artifactID)
-	err := row.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType)
+	err := row.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType, &index.License)
 	if err != nil && !errors.Is(err, sql.ErrNoRows) {
 		return index, xerrors.Errorf("select index error: %w", err)
 	}
@@ -178,7 +178,7 @@ func (db *DB) SelectIndexesByArtifactIDAndFileType(artifactID string, fileType t
 	error) {
 	var indexes []types.Index
 	rows, err := db.client.Query(`
-		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type
+		SELECT a.group_id, a.artifact_id, i.version, i.sha1, i.archive_type, i.license
 		FROM indices i 
 		JOIN artifacts a ON a.id = i.artifact_id
         WHERE a.artifact_id = ? AND i.archive_type = ?`,
@@ -188,7 +188,7 @@ func (db *DB) SelectIndexesByArtifactIDAndFileType(artifactID string, fileType t
 	}
 	for rows.Next() {
 		var index types.Index
-		if err = rows.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType); err != nil {
+		if err = rows.Scan(&index.GroupID, &index.ArtifactID, &index.Version, &index.SHA1, &index.ArchiveType, &index.License); err != nil {
 			return nil, xerrors.Errorf("scan row error: %w", err)
 		}
 		indexes = append(indexes, index)

diff --git a/pkg/db/db_test.go b/pkg/db/db_test.go
@@ -17,6 +17,7 @@ import (
 var (
 	jstlSha1b, _         = hex.DecodeString("9c581de633e94be1e7a955bd4e8292f16e554387")
 	javaxServletSha1b, _ = hex.DecodeString("bca201e52333629c59e459e874e5ecd8f9899e15")
+	junitSHA, _          = hex.DecodeString("1013627e3993319870863a020034004717505815")
 	indexJstl            = types.Index{
 		GroupID:     "jstl",
 		ArtifactID:  "jstl",
@@ -31,6 +32,14 @@ var (
 		SHA1:        javaxServletSha1b,
 		ArchiveType: types.JarType,
 	}
+	indexJunit = types.Index{
+		GroupID:     "junit",
+		ArtifactID:  "junit",
+		Version:     "4.9",
+		SHA1:        junitSHA,
+		ArchiveType: types.JarType,
+		License:     "Common Public License Version 1.0",
+	}
 )
 
 func TestSelectIndexBySha1(t *testing.T) {
@@ -52,12 +61,19 @@ func TestSelectIndexBySha1(t *testing.T) {
 			want:      types.Index{},
 			assertErr: assert.NoError,
 		},
+		{
+			name:      "index with license using sha",
+			sha1:      "1013627e3993319870863a020034004717505815",
+			want:      indexJunit,
+			assertErr: assert.NoError,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			dbc, err := dbtest.InitDB(t, []types.Index{
 				indexJstl,
 				indexJavaxServlet,
+				indexJunit,
 			})
 			require.NoError(t, err)
 
@@ -97,12 +113,20 @@ func TestSelectIndexByArtifactIDAndGroupID(t *testing.T) {
 			want:       types.Index{},
 			assertErr:  assert.NoError,
 		},
+		{
+			name:       "index with license using groupid and artifactid",
+			groupID:    "junit",
+			artifactID: "junit",
+			want:       indexJunit,
+			assertErr:  assert.NoError,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			dbc, err := dbtest.InitDB(t, []types.Index{
 				indexJstl,
 				indexJavaxServlet,
+				indexJunit,
 			})
 			require.NoError(t, err)
 
@@ -139,12 +163,21 @@ func TestSelectIndexesByArtifactIDAndFileType(t *testing.T) {
 			artifactID:  "jstl",
 			archiveType: "wrong",
 		},
+		{
+			name:        "index with license using artifactid and archivetype",
+			artifactID:  "junit",
+			archiveType: types.JarType,
+			want: []types.Index{
+				indexJunit,
+			},
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			dbc, err := dbtest.InitDB(t, []types.Index{
 				indexJstl,
 				indexJavaxServlet,
+				indexJunit,
 			})
 			require.NoError(t, err)