Merge branch 'main' of github.com:Imageomics/sum-buddy into auto-test
Check that the fix for system-specific hash availability works for the GH Actions test
egrace479 committed Jul 29, 2024
2 parents dcd0b9f + cd5eac8 commit 0126139
Showing 8 changed files with 181 additions and 146 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -36,6 +36,8 @@ options:
Hash algorithm to use (default: md5; available: ripemd160, sha3_224, sha512_224, blake2b, sha384, sha256, sm3, sha3_256, shake_256, sha512, sha1, sha224, md5, md5-sha1, sha3_384, sha3_512, sha512_256, shake_128, blake2s)
```

> Note: The available algorithms are determined by those available to `hashlib` and may vary depending on your system and OpenSSL version, so the set shown on your system with `sum-buddy -h` may be different from above. At a minimum, it should include: `{blake2s, blake2b, md5, sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256}`, which is given by `hashlib.algorithms_guaranteed`.
#### CLI Examples

- **Basic Usage:**
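The note added to the README above can be verified directly against `hashlib`. A minimal snippet (not part of this diff) to compare the guaranteed set with whatever the local OpenSSL build actually exposes:

```python
import hashlib

# Always present, regardless of platform or OpenSSL build
print(sorted(hashlib.algorithms_guaranteed))

# Guaranteed set plus whatever the local OpenSSL provides;
# this larger set is what varies between systems
print(sorted(hashlib.algorithms_available))
```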
20 changes: 15 additions & 5 deletions src/sumbuddy/__main__.py
@@ -2,13 +2,14 @@
from sumbuddy.hasher import Hasher
from sumbuddy.mapper import Mapper
from sumbuddy.filter import Filter
from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError, LengthUsedForFixedLengthHashError
import csv
import hashlib
from tqdm import tqdm
import sys
import os

def get_checksums(input_directory, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5'):
def get_checksums(input_directory, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5', length=None):
"""
Generate a CSV file with the filepath, filename, and checksum of all files in the input directory according to patterns to ignore. Checksum column is labeled by the selected algorithm (e.g., 'md5' or 'sha256').
@@ -19,10 +20,14 @@ def get_checksums(input_directory, output_filepath=None, ignore_file=None, inclu
ignore_file - String [optional]. Filepath for the ignore patterns file.
include_hidden - Boolean [optional]. Whether to include hidden files. Default is False.
algorithm - String. Algorithm to use for checksums. Default: 'md5', see options with 'hashlib.algorithms_available'.
length - Integer [conditionally optional]. Length of the digest for SHAKE (required) and BLAKE (optional) algorithms in bytes.
"""
mapper = Mapper()
file_paths = mapper.gather_file_paths(input_directory, ignore_file=ignore_file, include_hidden=include_hidden)

try:
file_paths = mapper.gather_file_paths(input_directory, ignore_file=ignore_file, include_hidden=include_hidden)
except (EmptyInputDirectoryError, NoFilesAfterFilteringError) as e:
sys.exit(str(e))

# Exclude the output file from being hashed
if output_filepath:
output_file_abs_path = os.path.abspath(output_filepath)
@@ -37,7 +42,7 @@ def get_checksums(input_directory, output_filepath=None, ignore_file=None, inclu

disable_tqdm = output_filepath is None
for file_path in tqdm(file_paths, desc=f"Calculating {algorithm} checksums on {input_directory}", disable=disable_tqdm):
checksum = hasher.checksum_file(file_path)
checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length)
writer.writerow([file_path, os.path.basename(file_path), checksum])

finally:
@@ -58,6 +63,7 @@ def main():
group.add_argument("-i", "--ignore-file", help="Filepath for the ignore patterns file")
group.add_argument("-H", "--include-hidden", action="store_true", help="Include hidden files")
parser.add_argument("-a", "--algorithm", default="md5", help=f"Hash algorithm to use (default: md5; available: {available_algorithms})")
parser.add_argument("-l", "--length", type=int, help="Length of the digest for SHAKE (required) or BLAKE (optional) algorithms in bytes")

args = parser.parse_args()

@@ -69,7 +75,11 @@ def main():
if overwrite.lower() != 'y':
sys.exit("Exited without executing")

get_checksums(args.input_dir, args.output_file, args.ignore_file, args.include_hidden, args.algorithm)
try:
get_checksums(args.input_dir, args.output_file, args.ignore_file, args.include_hidden, args.algorithm, args.length)
except (LengthUsedForFixedLengthHashError) as e:
sys.exit(str(e))


if __name__ == "__main__":
main()
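A minimal usage sketch of the new `length` parameter, assuming `get_checksums` is importable from the installed `sumbuddy` package (the function itself is defined in `__main__.py` above); the directory and output path are placeholders:

```python
from sumbuddy import get_checksums  # assumed export; function lives in sumbuddy/__main__.py

# SHAKE digests have no fixed size, so a digest length in bytes is required;
# fixed-length algorithms (md5, sha256, ...) must omit it or an exception is raised.
get_checksums(
    "data/images",                    # placeholder input directory
    output_filepath="checksums.csv",  # placeholder output path
    algorithm="shake_256",
    length=32,                        # 32-byte digest (64 hex characters)
)
```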
14 changes: 14 additions & 0 deletions src/sumbuddy/exceptions.py
@@ -0,0 +1,14 @@
class EmptyInputDirectoryError(Exception):
    def __init__(self, input_directory):
        message = f"The directory {input_directory} and subdirectories (if any) contain no files. \nPlease provide a directory with files."
        super().__init__(message)

class NoFilesAfterFilteringError(Exception):
    def __init__(self, input_directory, ignore_file):
        message = f"The directory {input_directory} contains files, but all are filtered out. \nCheck patterns in your {ignore_file} file and/or hidden files settings."
        super().__init__(message)

class LengthUsedForFixedLengthHashError(Exception):
    def __init__(self, algorithm):
        message = f"Length parameter is not applicable for fixed-length algorithm '{algorithm}'."
        super().__init__(message)
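Since each exception builds its message in `__init__`, `str()` on an instance reproduces what the CLI prints before exiting. A quick illustration (the arguments are placeholders):

```python
from sumbuddy.exceptions import EmptyInputDirectoryError, LengthUsedForFixedLengthHashError

# Placeholder arguments; the classes only format them into a message.
print(EmptyInputDirectoryError("data/empty"))
print(LengthUsedForFixedLengthHashError("md5"))
```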
3 changes: 2 additions & 1 deletion src/sumbuddy/hasher.py
@@ -1,4 +1,5 @@
import hashlib
from sumbuddy.exceptions import LengthUsedForFixedLengthHashError

class Hasher:
def __init__(self, algorithm='md5'):
@@ -51,7 +52,7 @@ def checksum_file(self, file_path, algorithm=None, length=None):
# Other algorithms
else:
if length is not None:
raise ValueError(f"Length parameter is not applicable for fixed-length algorithm '{algorithm}'")
raise LengthUsedForFixedLengthHashError(algorithm)
hash_func = hashlib.new(algorithm)

# Read the file and update the hash function
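Background for the check above: in `hashlib`, the SHAKE functions take the digest length at call time, while fixed-length algorithms do not accept one, which is why passing `length` with, say, md5 now raises the dedicated exception. A standalone illustration using only the standard library:

```python
import hashlib

data = b"example"

# SHAKE is an extendable-output function: the digest length (in bytes) is chosen at call time.
print(hashlib.shake_256(data).hexdigest(32))    # 64 hex characters

# Fixed-length algorithms have no length argument; their digest size is fixed.
print(hashlib.new("sha256", data).hexdigest())  # always 64 hex characters
```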
9 changes: 9 additions & 0 deletions src/sumbuddy/mapper.py
@@ -1,5 +1,6 @@
import os
from sumbuddy.filter import Filter
from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError

class Mapper:
def __init__(self):
@@ -42,11 +43,19 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa

file_paths = []
root_directory = os.path.abspath(input_directory)
has_files = False

for root, dirs, files in os.walk(input_directory):
if files:
has_files = True
for name in files:
file_path = os.path.join(root, name)
if self.filter_manager.should_include(file_path, root_directory):
file_paths.append(file_path)

if not has_files:
raise EmptyInputDirectoryError(input_directory)
if not file_paths:
raise NoFilesAfterFilteringError(input_directory, ignore_file)

return file_paths
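The two new guards distinguish a truly empty tree from one whose files were all filtered out. A sketch of how each surfaces to a caller (directory names are placeholders):

```python
from sumbuddy.mapper import Mapper
from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError

mapper = Mapper()

try:
    mapper.gather_file_paths("empty_dir")  # tree with no files at all
except EmptyInputDirectoryError as err:
    print(err)

try:
    # files exist, but every one is excluded by ignore patterns / hidden-file rules
    mapper.gather_file_paths("hidden_only_dir", include_hidden=False)
except NoFilesAfterFilteringError as err:
    print(err)
```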
10 changes: 5 additions & 5 deletions tests/test_getChecksums.py
@@ -21,7 +21,7 @@ def setUp(self):
@patch('os.path.exists', return_value=True)
@patch('builtins.open', new_callable=mock_open)
@patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt'])
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x: 'dummychecksum')
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum')
def test_get_checksums_to_file(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath):
get_checksums(self.input_directory, self.output_filepath, ignore_file=None, include_hidden=False, algorithm=self.algorithm)

@@ -35,7 +35,7 @@ def test_get_checksums_to_file(self, mock_checksum, mock_gather, mock_open, mock
@patch('os.path.exists', return_value=True)
@patch('builtins.open', new_callable=mock_open)
@patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt'])
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x: 'dummychecksum')
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum')
def test_get_checksums_to_stdout(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath):
output_stream = StringIO()
with patch('sys.stdout', new=output_stream):
@@ -50,7 +50,7 @@ def test_get_checksums_to_stdout(self, mock_checksum, mock_gather, mock_open, mo
@patch('os.path.exists', return_value=True)
@patch('builtins.open', new_callable=mock_open)
@patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt'])
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x: 'dummychecksum')
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum')
def test_get_checksums_with_ignore_file(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath):
get_checksums(self.input_directory, output_filepath=None, ignore_file=self.ignore_file, include_hidden=False, algorithm=self.algorithm)
mock_gather.assert_called_with(self.input_directory, ignore_file=self.ignore_file, include_hidden=False)
@@ -59,7 +59,7 @@ def test_get_checksums_with_ignore_file(self, mock_checksum, mock_gather, mock_o
@patch('os.path.exists', return_value=True)
@patch('builtins.open', new_callable=mock_open)
@patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt', '.hidden_file'])
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x: 'dummychecksum')
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum')
def test_get_checksums_include_hidden(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath):
get_checksums(self.input_directory, output_filepath=None, ignore_file=None, include_hidden=True, algorithm=self.algorithm)
mock_gather.assert_called_with(self.input_directory, ignore_file=None, include_hidden=True)
@@ -68,7 +68,7 @@ def test_get_checksums_include_hidden(self, mock_checksum, mock_gather, mock_ope
@patch('os.path.exists', return_value=True)
@patch('builtins.open', new_callable=mock_open)
@patch('sumbuddy.Mapper.gather_file_paths', return_value=['file1.txt', 'file2.txt'])
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x: 'dummychecksum')
@patch('sumbuddy.Hasher.checksum_file', side_effect=lambda x, **kwargs: 'dummychecksum')
def test_get_checksums_different_algorithm(self, mock_checksum, mock_gather, mock_open, mock_exists, mock_abspath):
algorithm = 'sha256'
get_checksums(self.input_directory, output_filepath=None, ignore_file=None, include_hidden=False, algorithm=algorithm)
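The only change to the test mocks is that the `side_effect` lambdas now accept keyword arguments, because `checksum_file` is now called with `algorithm=` and `length=`. A minimal illustration (names are hypothetical) of why the old single-argument form would break:

```python
def old_side_effect(file_path):
    return "dummychecksum"

def new_side_effect(file_path, **kwargs):
    return "dummychecksum"

new_side_effect("file1.txt", algorithm="md5", length=None)  # works
old_side_effect("file1.txt", algorithm="md5", length=None)  # raises TypeError: unexpected keyword argument
```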