From db2d8a9d0cd03e10cbf6ce47b1478fc9d5182d5d Mon Sep 17 00:00:00 2001 From: Ian Kretz <44385082+ikretz@users.noreply.github.com> Date: Wed, 16 Oct 2024 18:41:08 +0200 Subject: [PATCH 1/4] Compute SHA-256 in bundled_binary --- guarddog/analyzer/metadata/bundled_binary.py | 25 ++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/guarddog/analyzer/metadata/bundled_binary.py b/guarddog/analyzer/metadata/bundled_binary.py index a0d252c8..87c5c8e1 100644 --- a/guarddog/analyzer/metadata/bundled_binary.py +++ b/guarddog/analyzer/metadata/bundled_binary.py @@ -4,6 +4,7 @@ import os from functools import reduce import logging +import hashlib log = logging.getLogger("guarddog") @@ -28,6 +29,14 @@ def detect( name: Optional[str] = None, version: Optional[str] = None, ) -> tuple[bool, str]: + def format_file(file: str, kind: str) -> str: + return f"{file} ({kind})" + + def sha256(file: str) -> str: + with open(file, "rb") as f: + hasher = hashlib.sha256() + hasher.update(f.read()) + return hasher.hexdigest() log.debug( f"Running bundled binary heuristic on package {name} version {version}" @@ -35,14 +44,22 @@ def detect( if not path: raise ValueError("path is needed to run heuristic " + self.get_name()) - bin_files = [] + bin_files = {} for root, _, files in os.walk(path): for f in files: - kind = self.is_binary(os.path.join(root, f)) + path = os.path.join(root, f) + kind = self.is_binary(path) if kind: - bin_files.append(f"{f} type {kind}") + digest = sha256(path) + if digest not in bin_files: + bin_files[digest] = [format_file(f, kind)] + else: + bin_files[digest].append(format_file(f, kind)) if bin_files: - return True, "Binary file/s detected in package: " + reduce(lambda x, y: f"{x}, {y}", bin_files) + output_lines = '\n'.join( + f"{digest}: {', '.join(files)}" for digest, files in bin_files.items() + ) + return True, f"Binary file/s detected in package:\n{output_lines}" return False, "" def is_binary(self, path: str) -> Optional[str]: From 391f0bcd3e651ef7428088b2f24af6f6cb8f016e Mon Sep 17 00:00:00 2001 From: Ian Kretz <44385082+ikretz@users.noreply.github.com> Date: Wed, 16 Oct 2024 18:54:06 +0200 Subject: [PATCH 2/4] Remove unused import --- guarddog/analyzer/metadata/bundled_binary.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/guarddog/analyzer/metadata/bundled_binary.py b/guarddog/analyzer/metadata/bundled_binary.py index 87c5c8e1..a8744523 100644 --- a/guarddog/analyzer/metadata/bundled_binary.py +++ b/guarddog/analyzer/metadata/bundled_binary.py @@ -1,10 +1,10 @@ -from guarddog.analyzer.metadata.detector import Detector from abc import abstractmethod -from typing import Optional -import os -from functools import reduce -import logging import hashlib +import logging +import os +from typing import Optional + +from guarddog.analyzer.metadata.detector import Detector log = logging.getLogger("guarddog") From 76436d35a053355c0a02605afbd5993e947c8538 Mon Sep 17 00:00:00 2001 From: Ian Kretz <44385082+ikretz@users.noreply.github.com> Date: Wed, 16 Oct 2024 23:45:28 +0200 Subject: [PATCH 3/4] Don't read the whole file in at once --- guarddog/analyzer/metadata/bundled_binary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/guarddog/analyzer/metadata/bundled_binary.py b/guarddog/analyzer/metadata/bundled_binary.py index a8744523..5323f68f 100644 --- a/guarddog/analyzer/metadata/bundled_binary.py +++ b/guarddog/analyzer/metadata/bundled_binary.py @@ -35,7 +35,8 @@ def format_file(file: str, kind: str) -> str: def sha256(file: str) -> str: with open(file, "rb") as f: hasher = hashlib.sha256() - hasher.update(f.read()) + while (chunk := f.read(4096)): + hasher.update(chunk) return hasher.hexdigest() log.debug( From e7020d0d0e49c990a7b061a347a85605afc10620 Mon Sep 17 00:00:00 2001 From: Ian Kretz <44385082+ikretz@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:29:03 +0200 Subject: [PATCH 4/4] Incorporate review comments --- guarddog/analyzer/metadata/bundled_binary.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/guarddog/analyzer/metadata/bundled_binary.py b/guarddog/analyzer/metadata/bundled_binary.py index 5323f68f..5b6e8757 100644 --- a/guarddog/analyzer/metadata/bundled_binary.py +++ b/guarddog/analyzer/metadata/bundled_binary.py @@ -56,12 +56,14 @@ def sha256(file: str) -> str: bin_files[digest] = [format_file(f, kind)] else: bin_files[digest].append(format_file(f, kind)) - if bin_files: - output_lines = '\n'.join( - f"{digest}: {', '.join(files)}" for digest, files in bin_files.items() - ) - return True, f"Binary file/s detected in package:\n{output_lines}" - return False, "" + + if not bin_files: + return False, "" + + output_lines = '\n'.join( + f"{digest}: {', '.join(files)}" for digest, files in bin_files.items() + ) + return True, f"Binary file/s detected in package:\n{output_lines}" def is_binary(self, path: str) -> Optional[str]: max_head = len(max(self.magic_bytes.values()))