diff --git a/guarddog/analyzer/metadata/bundled_binary.py b/guarddog/analyzer/metadata/bundled_binary.py index a0d252c8..5b6e8757 100644 --- a/guarddog/analyzer/metadata/bundled_binary.py +++ b/guarddog/analyzer/metadata/bundled_binary.py @@ -1,9 +1,10 @@ -from guarddog.analyzer.metadata.detector import Detector from abc import abstractmethod -from typing import Optional -import os -from functools import reduce +import hashlib import logging +import os +from typing import Optional + +from guarddog.analyzer.metadata.detector import Detector log = logging.getLogger("guarddog") @@ -28,6 +29,15 @@ def detect( name: Optional[str] = None, version: Optional[str] = None, ) -> tuple[bool, str]: + def format_file(file: str, kind: str) -> str: + return f"{file} ({kind})" + + def sha256(file: str) -> str: + with open(file, "rb") as f: + hasher = hashlib.sha256() + while (chunk := f.read(4096)): + hasher.update(chunk) + return hasher.hexdigest() log.debug( f"Running bundled binary heuristic on package {name} version {version}" @@ -35,15 +45,25 @@ def detect( if not path: raise ValueError("path is needed to run heuristic " + self.get_name()) - bin_files = [] + bin_files = {} for root, _, files in os.walk(path): for f in files: - kind = self.is_binary(os.path.join(root, f)) + path = os.path.join(root, f) + kind = self.is_binary(path) if kind: - bin_files.append(f"{f} type {kind}") - if bin_files: - return True, "Binary file/s detected in package: " + reduce(lambda x, y: f"{x}, {y}", bin_files) - return False, "" + digest = sha256(path) + if digest not in bin_files: + bin_files[digest] = [format_file(f, kind)] + else: + bin_files[digest].append(format_file(f, kind)) + + if not bin_files: + return False, "" + + output_lines = '\n'.join( + f"{digest}: {', '.join(files)}" for digest, files in bin_files.items() + ) + return True, f"Binary file/s detected in package:\n{output_lines}" def is_binary(self, path: str) -> Optional[str]: max_head = len(max(self.magic_bytes.values()))