From 691ad166e6fb04a9938301aa376ac0bb83856428 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 08:44:17 +1100 Subject: [PATCH 01/14] 0.4 beta: - now able to interpet files that are a simple list of checksums --- mhl-compare.py | 91 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index f3dc70e..3da9435 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -8,6 +8,7 @@ import os import argparse import codecs +import re from datetime import datetime import xmltodict @@ -35,8 +36,8 @@ else: LOG_APPTYPE = 'Python' -LOG_VERSION = '0.3' -LOG_AUTHOR_AND_LICENSE = '(Author: Sebastian Reategui) (MIT License)' +LOG_VERSION = '0.4' +LOG_AUTHOR_AND_LICENSE = '(Author: Sebastian Reategui) (MIT License) (2020-03-21)' LOG_STARTUP_LINE = 'mhl-compare (v{}) ({}) {}'.format( LOG_VERSION, LOG_APPTYPE, LOG_AUTHOR_AND_LICENSE) @@ -88,7 +89,8 @@ def __init__(self, listObj, filepath): self.hashes = {} self.duplicates = set() - self.hashlist_version = listObj['hashlist']['@version'] + if '@version' in listObj['hashlist']: + self.hashlist_version = listObj['hashlist']['@version'] if 'creatorinfo' in listObj['hashlist']: self.creatorinfo = listObj['hashlist']['creatorinfo'] @@ -167,7 +169,8 @@ def count(self): def totalSize(self): sum = 0 for h in self.hashes.values(): - sum += h.size + if h.sizeDefined: + sum += h.size return showSize(sum) @@ -195,17 +198,28 @@ def __init__(self, xmlObject, mhlIdentifier): # For some reason, the entry is missing a attribute # Probably should throw an error and let the user know their MHL is malformed self.filepath = False - self.size = int( xmlObject['size'] ) - self.sizeHuman = showSize(self.size) + xmlObjectKeys = xmlObject.keys() - # Try do the date parsing, hopefully without errors - modDate = dateutilParser.parse( xmlObject['lastmodificationdate'] ) - if modDate.tzinfo is None: - self.lastmodificationdate = 
modDate.replace(tzinfo=tzutc()) - else: - self.lastmodificationdate = modDate + if 'size' in xmlObjectKeys: + if xmlObject['size']: + self.sizeDefined = True + self.size = int( xmlObject['size'] ) + self.sizeHuman = showSize(self.size) + else: + # It's "None", unspecified + self.sizeDefined = False + self.size = None + self.sizeHuman = 'Not specified' + + if 'lastmodificationdate' in xmlObjectKeys: + # Try do the date parsing, hopefully without errors + modDate = dateutilParser.parse( xmlObject['lastmodificationdate'] ) + if modDate.tzinfo is None: + self.lastmodificationdate = modDate.replace(tzinfo=tzutc()) + else: + self.lastmodificationdate = modDate if 'creationdate' in xmlObjectKeys: self.creationdate = dateutilParser.parse( xmlObject['creationdate'] ) @@ -351,6 +365,11 @@ def checkCommon(self): logDetail( ' Hash: identical: {} ({})'.format( hashA.identifier, hashA.identifierType ) ) if 'size' in dChanged: + # First, check if the Size is simply "Not specified" + if hashA.sizeDefined == False or hashB.sizeDefined == False: + self.COUNT['MINOR'] += 1 + beenCounted = True + # It is an anomaly if the size has changed, but not the hash. # Report it as impossible, but also print it to the user anyway. 
if not beenCounted: @@ -616,7 +635,7 @@ def printCount(self): 'desc': 'matched perfectly' }, 'MINOR': { - 'desc': 'matched, but with differences in name, directory or modification date' + 'desc': 'matched (but with differences in name, directory or modification date)' }, 'HASH_TYPE_DIFFERENT': { 'desc': 'had incomparable hash types and could not be compared', @@ -721,13 +740,47 @@ def printCount(self): ##### +PATTERN_XXHASHLIST = re.compile('^([0-9a-fA-F]{16})\s{2}(.*)$') + +def parseFile(filepath): + + # (1) Try to parse it as XML + try: + with open(filepath, 'r') as f: + parsed = xmltodict.parse( f.read(), dict_constructor=dict ) + return parsed -fA = open(file_path_A, 'r') -fB = open(file_path_B, 'r') -PARSE_FILE_A = xmltodict.parse( fA.read(), dict_constructor=dict ) -PARSE_FILE_B = xmltodict.parse( fB.read(), dict_constructor=dict ) -fA.close() -fB.close() + except: + # Syntax error from xmldict + # (2) Try parsing this as an .xxhash simple list of sums + with open(filepath, 'r') as f: + lines = f.readlines() + f.close() + + fauxMHL = { + '_ORIGIN': os.path.basename(filepath), + 'hashlist': { + 'hash': [] + } + } + for line in lines: + match = PATTERN_XXHASHLIST.match(line) + if match: + hash = match[1] + hashFilepath = match[2] + + # Create a faux MHL line, imitating XML already parsed as a dict + fauxMHL_hash = { + 'file': hashFilepath, + 'size': None, + 'xxhash64be': hash, + } + fauxMHL['hashlist']['hash'].append(fauxMHL_hash) + + return fauxMHL + +PARSE_FILE_A = parseFile(file_path_A) +PARSE_FILE_B = parseFile(file_path_B) MHL_FILE_A = MHL(PARSE_FILE_A, file_path_A) MHL_FILE_B = MHL(PARSE_FILE_B, file_path_B) From 6116c7ea153ea0fd6c2c1987b30d802216e48b40 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 09:05:08 +1100 Subject: [PATCH 02/14] - By default, human sizing is shown in Decimal (1 KB = 1000 bytes) - Add CLI argument to show Binary instead (1 KiB = 1024 bytes), "-b" --- mhl-compare.py | 15 ++++++++++++++- 1 file changed, 14 
insertions(+), 1 deletion(-) diff --git a/mhl-compare.py b/mhl-compare.py index 3da9435..1baf57a 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -23,6 +23,7 @@ HASH_TYPES_ACCEPTABLE = [ 'xxhash64be', 'xxhash64', 'xxhash', 'md5', 'sha1' ] LOG_TIME_FORMAT = '%Y-%m-%d %H:%M:%S' +LOG_SIZE_FORMAT = 'decimal' # By default, 1000 bytes is 1 KB LOG_VERBOSE = False # By default, don't show detail about which files changed LOG_COLOR_MHL_A = 'green' @@ -58,7 +59,11 @@ def showSize(numBytes): if numBytes < 1024: return str(numBytes) + " bytes" else: - return humanize.naturalsize(numBytes, binary=True) + " ({} bytes)".format(numBytes) + if LOG_SIZE_FORMAT == 'binary': + humanize_binary_setting = True + else: + humanize_binary_setting = False + return humanize.naturalsize(numBytes, binary=humanize_binary_setting) + " ({} bytes)".format(numBytes) def logDetail(*args, **kwargs): @@ -712,6 +717,11 @@ def printCount(self): help="gives greater detail on all files affected", action="store_true" ) +parser.add_argument( + "-b", "--binary", + help="Shows sizes in binary format, appropriate for Windows (1024 bytes = 1 KiB)", + action="store_true" +) args = parser.parse_args() @@ -737,6 +747,9 @@ def printCount(self): if args.verbose: LOG_VERBOSE = True +if args.binary: + LOG_SIZE_FORMAT = 'binary' + ##### From 5716ac45b3b52945a207044146fbf96eee1d62d0 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 12:56:29 +1100 Subject: [PATCH 03/14] - move dictdiffer out of lib, and then hide lib (gets used by venv apparently) --- .gitignore | 2 ++ lib/dictdiffer.py => dictdiffer.py | 0 mhl-compare.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) rename lib/dictdiffer.py => dictdiffer.py (100%) diff --git a/.gitignore b/.gitignore index 1c42a86..4556805 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +lib/* + # Created by https://www.gitignore.io/api/macos,python # Edit at https://www.gitignore.io/?templates=macos,python diff --git a/lib/dictdiffer.py 
b/dictdiffer.py similarity index 100% rename from lib/dictdiffer.py rename to dictdiffer.py diff --git a/mhl-compare.py b/mhl-compare.py index 1baf57a..d253332 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -16,7 +16,7 @@ from dateutil.tz import tzutc from dateutil import parser as dateutilParser from termcolor import colored -from lib.dictdiffer import DictDiffer +from dictdiffer import DictDiffer # Program defaults HASH_TYPE_PREFERRED = 'xxhash64be' From 47da49c56c8909aff69e86ce86c5cc6555c684a1 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 13:24:31 +1100 Subject: [PATCH 04/14] - change to structure: parsing the MHL now takes place inside MHL object > origin (MHL versus plain list of sums) is defined > Size no longer showing as "0 bytes" for a list of sum (misleading) --- mhl-compare.py | 101 ++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index d253332..71f7aea 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -25,6 +25,7 @@ LOG_TIME_FORMAT = '%Y-%m-%d %H:%M:%S' LOG_SIZE_FORMAT = 'decimal' # By default, 1000 bytes is 1 KB LOG_VERBOSE = False # By default, don't show detail about which files changed +LOG_SHOW_DATES = False # By default, don't report on modification dates, hashdates, or creationdates LOG_COLOR_MHL_A = 'green' LOG_COLOR_MHL_B = 'yellow' @@ -87,13 +88,50 @@ def hashConvertEndian(hashString): return codecs.encode(codecs.decode(hashString, 'hex')[::-1], 'hex').decode() + class MHL: - def __init__(self, listObj, filepath): + def __init__(self, filepath): self.filepath = filepath self.mhlIdentifier = filepath self.hashes = {} self.duplicates = set() + PATTERN_XXHASHLIST = re.compile('^([0-9a-fA-F]{16})\s{2}(.*)$') + + # (1) Try to parse it as XML + try: + with open(self.filepath, 'r') as f: + listObj = xmltodict.parse( f.read(), dict_constructor=dict ) + self.originType = 'MHL' + except: + # Syntax error from xmldict + # (2) 
Try parsing this as an .xxhash simple list of sums + with open(filepath, 'r') as f: + lines = f.readlines() + f.close() + + fauxMHL = { + '_ORIGIN': os.path.basename(filepath), + 'hashlist': { + 'hash': [] + } + } + for line in lines: + match = PATTERN_XXHASHLIST.match(line) + if match: + hash = match[1] + hashFilepath = match[2] + + # Create a faux MHL line, imitating XML already parsed as a dict + fauxMHL_hash = { + 'file': hashFilepath, + 'size': None, + 'xxhash64be': hash, + } + fauxMHL['hashlist']['hash'].append(fauxMHL_hash) + listObj = fauxMHL + self.originType = 'HASHLIST_PLAIN' + if '@version' in listObj['hashlist']: self.hashlist_version = listObj['hashlist']['@version'] @@ -623,15 +661,26 @@ def printInfo(self): count_files_A = str( self.A.count() ) + " files" count_files_B = str( self.B.count() ) + " files" + + + if self.A.originType == 'HASHLIST_PLAIN': + displayed_size_A = 'Size not specified (file is a simple list of checksums)' + else: + displayed_size_A = self.A.totalSize() + if self.B.originType == 'HASHLIST_PLAIN': + displayed_size_B = 'Size not specified (file is a simple list of checksums)' + else: + displayed_size_B = self.B.totalSize() + print('') if LOG_VERBOSE: print('Summary:') print('1st MHL file:', color(self.A.filepath, LOG_COLOR_MHL_A) ) print(' ', color(count_files_A, LOG_COLOR_MHL_A) ) - print(' ', color(self.A.totalSize(), LOG_COLOR_MHL_A) ) + print(' ', color(displayed_size_A, LOG_COLOR_MHL_A) ) print('2nd MHL file:', color(self.B.filepath, LOG_COLOR_MHL_B) ) print(' ', color(count_files_B, LOG_COLOR_MHL_B) ) - print(' ', color(self.B.totalSize(), LOG_COLOR_MHL_B) ) + print(' ', color(displayed_size_B, LOG_COLOR_MHL_B) ) return def printCount(self): @@ -753,50 +802,8 @@ def printCount(self): ##### -PATTERN_XXHASHLIST = re.compile('^([0-9a-fA-F]{16})\s{2}(.*)$') - -def parseFile(filepath): - - # (1) Try to parse it as XML - try: - with open(filepath, 'r') as f: - parsed = xmltodict.parse( f.read(), dict_constructor=dict ) - return 
parsed - - except: - # Syntax error from xmldict - # (2) Try parsing this as an .xxhash simple list of sums - with open(filepath, 'r') as f: - lines = f.readlines() - f.close() - - fauxMHL = { - '_ORIGIN': os.path.basename(filepath), - 'hashlist': { - 'hash': [] - } - } - for line in lines: - match = PATTERN_XXHASHLIST.match(line) - if match: - hash = match[1] - hashFilepath = match[2] - - # Create a faux MHL line, imitating XML already parsed as a dict - fauxMHL_hash = { - 'file': hashFilepath, - 'size': None, - 'xxhash64be': hash, - } - fauxMHL['hashlist']['hash'].append(fauxMHL_hash) - - return fauxMHL - -PARSE_FILE_A = parseFile(file_path_A) -PARSE_FILE_B = parseFile(file_path_B) - -MHL_FILE_A = MHL(PARSE_FILE_A, file_path_A) -MHL_FILE_B = MHL(PARSE_FILE_B, file_path_B) +MHL_FILE_A = MHL(file_path_A) +MHL_FILE_B = MHL(file_path_B) compare = Comparison(MHL_FILE_A, MHL_FILE_B) compare.printInfo() From af6046db9878dd138370ee646790927f921695fa Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 13:48:06 +1100 Subject: [PATCH 05/14] - Exclude dates as a factor in the comparsion - Only report on dates if the user specifies with -d or --dates --- mhl-compare.py | 121 +++++++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 43 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index 71f7aea..76c4430 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -26,6 +26,7 @@ LOG_SIZE_FORMAT = 'decimal' # By default, 1000 bytes is 1 KB LOG_VERBOSE = False # By default, don't show detail about which files changed LOG_SHOW_DATES = False # By default, don't report on modification dates, hashdates, or creationdates +DATE_ATTRIBS_TO_FILTER = [ 'lastmodificationdate', 'creationdate', 'hashdate' ] LOG_COLOR_MHL_A = 'green' LOG_COLOR_MHL_B = 'yellow' @@ -376,8 +377,8 @@ def checkCommon(self): dChanged = diff.changed() dUnchanged = diff.unchanged() - if { 'filename', 'directory', 'size', 'lastmodificationdate' }.issubset(dUnchanged): 
- # If neither of these variables have changed, then we have a clean match. + if { 'filename', 'directory', 'size' }.issubset(dUnchanged): + # If neither of these variables have changed, then we have a perfect match. # Report it and move on. if not beenCounted: self.COUNT['PERFECT'] += 1 @@ -410,7 +411,7 @@ def checkCommon(self): if 'size' in dChanged: # First, check if the Size is simply "Not specified" if hashA.sizeDefined == False or hashB.sizeDefined == False: - self.COUNT['MINOR'] += 1 + self.COUNT['PERFECT'] += 1 beenCounted = True # It is an anomaly if the size has changed, but not the hash. @@ -424,30 +425,41 @@ def checkCommon(self): logDetail( ' ' + 'Size: identical: ' + hashA.sizeHuman ) if 'lastmodificationdate' in dChanged: - if not beenCounted: - self.COUNT['MINOR'] += 1 - beenCounted = True - logDetail( - ' Modified date: different (1st):', - color( hashA.lastmodificationdate, LOG_COLOR_MHL_A ) - ) - logDetail( - ' (2nd):', - color( hashB.lastmodificationdate, LOG_COLOR_MHL_B ) - ) + if LOG_SHOW_DATES: + if not beenCounted: + self.COUNT['MINOR'] += 1 + beenCounted = True + logDetail( + ' Modified date: different (1st):', + color( hashA.lastmodificationdate, LOG_COLOR_MHL_A ) + ) + logDetail( + ' (2nd):', + color( hashB.lastmodificationdate, LOG_COLOR_MHL_B ) + ) + else: + # Don't count date changes unless user wants it (LOG_SHOW_DATES is true) + pass # Briefly explain to the user what attributes were added/removed - if len(dAdded) > 0: - dAddedList = ', '.join( str(i) for i in dAdded ) + if LOG_SHOW_DATES == False: + dAddedFiltered = [ i for i in dAdded if i not in DATE_ATTRIBS_TO_FILTER ] + dRemovedFiltered = [ i for i in dRemoved if i not in DATE_ATTRIBS_TO_FILTER ] + else: + dAddedFiltered = dAdded + dRemovedFiltered = dRemoved + + if len(dAddedFiltered) > 0: + dAddedString = ', '.join( str(i) for i in dAddedFiltered ) logDetail( ' These attributes exist in 1st only:', - color(dAddedList, LOG_COLOR_MHL_A ) + color(dAddedString, LOG_COLOR_MHL_A ) ) 
- if len(dRemoved) > 0: - dRemovedList = ', '.join( str(i) for i in dRemoved ) + if len(dRemovedFiltered) > 0: + dRemovedString = ', '.join( str(i) for i in dRemovedFiltered ) logDetail( ' These attributes exist in 2nd only:', - color(dRemovedList, LOG_COLOR_MHL_B ) + color(dRemovedString, LOG_COLOR_MHL_B ) ) def checkDelta(self, letter): @@ -579,8 +591,8 @@ def checkDelta(self, letter): ) ) - if { 'filename', 'directory', 'size', 'lastmodificationdate' }.issubset(dUnchanged): - # If neither of these variables have changed, then we almost have a clean match. + if { 'filename', 'directory', 'size' }.issubset(dUnchanged): + # If neither of these variables have changed, then we almost have a perfect match. # EVEN THOUGH we used a slightly different preferred hash. if not beenCounted: self.COUNT['PERFECT'] += 1 @@ -608,39 +620,56 @@ def checkDelta(self, letter): logDetail( ' Path: identical:', hash.directory ) if 'size' in dChanged: - # It is an anomaly if the size has changed, but not the hash. - # Report it as impossible, but also print it to the user anyway. - if not beenCounted: - self.COUNT['IMPOSSIBLE'] += 1 + # First, check if the Size is simply "Not specified" + # This is not an anomaly if so. + if hashA.sizeDefined == False or hashB.sizeDefined == False: + self.COUNT['PERFECT'] += 1 beenCounted = True - logDetail( ' Size: different (1st):', color( hash.sizeHuman, LOG_COLOR_MHL_A ) ) - logDetail( ' (2nd):', color( hashPossible.sizeHuman, LOG_COLOR_MHL_B ) ) + else: + # It is an anomaly if the size has changed while the hash has not. + # Report it as impossible, but also print it to the user anyway. 
+ if not beenCounted: + self.COUNT['IMPOSSIBLE'] += 1 + beenCounted = True + logDetail( ' Size: different (1st):', color( hash.sizeHuman, LOG_COLOR_MHL_A ) ) + logDetail( ' (2nd):', color( hashPossible.sizeHuman, LOG_COLOR_MHL_B ) ) else: logDetail( ' ' + 'Size: identical: ' + hashPossible.sizeHuman ) if 'lastmodificationdate' in dChanged: - if not beenCounted: - self.COUNT['MINOR'] += 1 - beenCounted = True + if LOG_SHOW_DATES: + if not beenCounted: + self.COUNT['MINOR'] += 1 + beenCounted = True - hModDate = showDate(hash.lastmodificationdate) - hPModDate = showDate(hashPossible.lastmodificationdate) + hModDate = showDate(hash.lastmodificationdate) + hPModDate = showDate(hashPossible.lastmodificationdate) - logDetail( ' Modified date: different (1st):', color( hModDate, LOG_COLOR_MHL_A ) ) - logDetail( ' (2nd):', color( hPModDate, LOG_COLOR_MHL_B ) ) + logDetail( ' Modified date: different (1st):', color( hModDate, LOG_COLOR_MHL_A ) ) + logDetail( ' (2nd):', color( hPModDate, LOG_COLOR_MHL_B ) ) + else: + # Don't count date changes unless user wants it (LOG_SHOW_DATES is true) + pass # Briefly explain to the user what attributes were added/removed - if len(dAdded) > 0: - dAddedList = ', '.join( str(i) for i in dAdded ) + if LOG_SHOW_DATES == False: + dAddedFiltered = [ i for i in dAdded if i not in DATE_ATTRIBS_TO_FILTER ] + dRemovedFiltered = [ i for i in dRemoved if i not in DATE_ATTRIBS_TO_FILTER ] + else: + dAddedFiltered = dAdded + dRemovedFiltered = dRemoved + + if len(dAddedFiltered) > 0: + dAddedString = ', '.join( str(i) for i in dAddedFiltered ) logDetail( ' These attributes exist in 1st only:', - color(dAddedList, LOG_COLOR_MHL_A ) + color(dAddedString, LOG_COLOR_MHL_A ) ) - if len(dRemoved) > 0: - dRemovedList = ', '.join( str(i) for i in dRemoved ) + if len(dRemovedFiltered) > 0: + dRemovedString = ', '.join( str(i) for i in dRemovedFiltered ) logDetail( ' These attributes exist in 2nd only:', - color(dRemovedList, LOG_COLOR_MHL_B ) + 
color(dRemovedString, LOG_COLOR_MHL_B ) ) pass @@ -689,7 +718,7 @@ def printCount(self): 'desc': 'matched perfectly' }, 'MINOR': { - 'desc': 'matched (but with differences in name, directory or modification date)' + 'desc': 'matched (but with differences in name or directory)' }, 'HASH_TYPE_DIFFERENT': { 'desc': 'had incomparable hash types and could not be compared', @@ -771,6 +800,11 @@ def printCount(self): help="Shows sizes in binary format, appropriate for Windows (1024 bytes = 1 KiB)", action="store_true" ) +parser.add_argument( + "-d", "--dates", + help="Report on differences in modification date, creation date or hash date", + action="store_true" +) args = parser.parse_args() @@ -795,9 +829,10 @@ def printCount(self): if args.verbose: LOG_VERBOSE = True - if args.binary: LOG_SIZE_FORMAT = 'binary' +if args.dates: + LOG_SHOW_DATES = True ##### From a27eb6d6c1ee6f4ae20e3c1f1b8187b258fc8963 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 22:05:04 +1100 Subject: [PATCH 06/14] - When given 1 MHL file, print a summary of the files contained within it - Sizes are now 2 d.p. 
and bytes are excluded, unless explicitly defined - If one file is a Hashlist, return None when size is queried - Update arguments structure, get rid of redundant A and B code --- mhl-compare.py | 134 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 41 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index 76c4430..886c33f 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -9,6 +9,7 @@ import argparse import codecs import re +import itertools from datetime import datetime import xmltodict @@ -26,7 +27,7 @@ LOG_SIZE_FORMAT = 'decimal' # By default, 1000 bytes is 1 KB LOG_VERBOSE = False # By default, don't show detail about which files changed LOG_SHOW_DATES = False # By default, don't report on modification dates, hashdates, or creationdates -DATE_ATTRIBS_TO_FILTER = [ 'lastmodificationdate', 'creationdate', 'hashdate' ] +LIST_OF_DATE_ATTRIBUTES = [ 'lastmodificationdate', 'creationdate', 'hashdate' ] LOG_COLOR_MHL_A = 'green' LOG_COLOR_MHL_B = 'yellow' @@ -57,7 +58,7 @@ def showDate(dt): return dt.strftime(LOG_TIME_FORMAT) -def showSize(numBytes): +def humanSize(numBytes, showBytes=False): if numBytes < 1024: return str(numBytes) + " bytes" else: @@ -65,7 +66,18 @@ def showSize(numBytes): humanize_binary_setting = True else: humanize_binary_setting = False - return humanize.naturalsize(numBytes, binary=humanize_binary_setting) + " ({} bytes)".format(numBytes) + + display_human_size = humanize.naturalsize( + numBytes, + binary=humanize_binary_setting, + format="%.2f" # 2 decimal places + ) + + # If yes, display (1024 bytes) in brackets next to the human amount. 
+ if showBytes: + return display_human_size + ' ({} bytes)'.format(str(numBytes)) + else: + return display_human_size def logDetail(*args, **kwargs): @@ -215,7 +227,11 @@ def totalSize(self): for h in self.hashes.values(): if h.sizeDefined: sum += h.size - return showSize(sum) + if self.originType == 'HASHLIST_PLAIN': + # Then there is no record of sizes + return None + else: + return sum class Hash(MHL): @@ -250,7 +266,7 @@ def __init__(self, xmlObject, mhlIdentifier): if xmlObject['size']: self.sizeDefined = True self.size = int( xmlObject['size'] ) - self.sizeHuman = showSize(self.size) + self.sizeHuman = humanSize(self.size) else: # It's "None", unspecified self.sizeDefined = False @@ -443,8 +459,8 @@ def checkCommon(self): # Briefly explain to the user what attributes were added/removed if LOG_SHOW_DATES == False: - dAddedFiltered = [ i for i in dAdded if i not in DATE_ATTRIBS_TO_FILTER ] - dRemovedFiltered = [ i for i in dRemoved if i not in DATE_ATTRIBS_TO_FILTER ] + dAddedFiltered = [ i for i in dAdded if i not in LIST_OF_DATE_ATTRIBUTES ] + dRemovedFiltered = [ i for i in dRemoved if i not in LIST_OF_DATE_ATTRIBUTES ] else: dAddedFiltered = dAdded dRemovedFiltered = dRemoved @@ -653,8 +669,8 @@ def checkDelta(self, letter): # Briefly explain to the user what attributes were added/removed if LOG_SHOW_DATES == False: - dAddedFiltered = [ i for i in dAdded if i not in DATE_ATTRIBS_TO_FILTER ] - dRemovedFiltered = [ i for i in dRemoved if i not in DATE_ATTRIBS_TO_FILTER ] + dAddedFiltered = [ i for i in dAdded if i not in LIST_OF_DATE_ATTRIBUTES ] + dRemovedFiltered = [ i for i in dRemoved if i not in LIST_OF_DATE_ATTRIBUTES ] else: dAddedFiltered = dAdded dRemovedFiltered = dRemoved @@ -695,11 +711,11 @@ def printInfo(self): if self.A.originType == 'HASHLIST_PLAIN': displayed_size_A = 'Size not specified (file is a simple list of checksums)' else: - displayed_size_A = self.A.totalSize() + displayed_size_A = humanSize(self.A.totalSize(), showBytes=True) if 
self.B.originType == 'HASHLIST_PLAIN': displayed_size_B = 'Size not specified (file is a simple list of checksums)' else: - displayed_size_B = self.B.totalSize() + displayed_size_B = humanSize(self.B.totalSize(), showBytes=True) print('') if LOG_VERBOSE: @@ -788,8 +804,7 @@ def printCount(self): parser = argparse.ArgumentParser() -parser.add_argument( "PATH_A", help="path to list A", type=str) -parser.add_argument( "PATH_B", help="path to list B", type=str) +parser.add_argument( "FILEPATH", nargs='+', help="Path to the first file") parser.add_argument( "-v", "--verbose", "--info", help="gives greater detail on all files affected", @@ -808,25 +823,6 @@ def printCount(self): args = parser.parse_args() -if args.PATH_A and args.PATH_B: - pass -else: - raise Exception('Two files need to be included when you run the command') - -foundA = os.path.isfile(args.PATH_A) -foundB = os.path.isfile(args.PATH_B) - -if foundA is True and foundB is True: - file_path_A = args.PATH_A - file_path_B = args.PATH_B -else: - not_found_string = '' - if foundA is False: - not_found_string += " " + args.PATH_A + "\n" - if foundB is False: - not_found_string += " " + args.PATH_B + "\n" - raise FileNotFoundError('Could not find these MHL file(s). Check the path for typos?\n' + not_found_string) - if args.verbose: LOG_VERBOSE = True if args.binary: @@ -835,15 +831,71 @@ def printCount(self): LOG_SHOW_DATES = True -##### +if len(args.FILEPATH) == 1: + # Print a summary of just this file + filepath = args.FILEPATH[0] + if not os.path.isfile(filepath): + raise FileNotFoundError('\n\nCould not find this MHL file. 
Check the path for typos?\n{}'.format(filepath)) -MHL_FILE_A = MHL(file_path_A) -MHL_FILE_B = MHL(file_path_B) + MHL = MHL(filepath) + + def keyfunc(x): + return x.directory + + MHL_items = sorted(MHL.hashes.values()) + for dir, items in itertools.groupby(MHL_items, keyfunc): + print(color(dir, 'green', attrs=LOG_COLOR_BOLD) + ':') + for item in items: + print_filename = ' > ' + item.filename + print_log_detail_to_add = '\t{} {}'.format( + color('({})'.format(item.identifier), 'yellow'), + item.sizeHuman + ) + if LOG_VERBOSE == True: + print(print_filename + print_log_detail_to_add) + else: + print(print_filename) + + # Show date information, if user requests + if LOG_SHOW_DATES: + for attrib in LIST_OF_DATE_ATTRIBUTES: + if hasattr(item, attrib): + logDetail( ' {:<20}:'.format(attrib), getattr(item, attrib)) + # After each directory, line break + print() + print('--------------') + # Summarise the MHL + print('{} files, {} in total'.format( + MHL.count(), + humanSize( MHL.totalSize(), showBytes=True ) + ) + ) + + +elif len(args.FILEPATH) == 2: + # Our main comparison will take place with 2 files. + # Check the paths exist first. + for filepath in args.FILEPATH: + if not os.path.isfile(filepath): + raise FileNotFoundError('\n\nCould not find this MHL file. Check the path for typos?\n{}'.format(filepath)) + # Then define our A and B files. + filepath_A = args.FILEPATH[0] + filepath_B = args.FILEPATH[1] + + MHL_FILE_A = MHL(filepath_A) + MHL_FILE_B = MHL(filepath_B) + + compare = Comparison(MHL_FILE_A, MHL_FILE_B) + compare.printInfo() + compare.checkCommon() + compare.checkDelta('A') + compare.checkDelta('B') + compare.printCount() + +else: + raise Exception('\n\nYou have specified {} files. 
Only two at a time are supported for comparison.\nDouble check you have not included any erroneous spaces in the file path.'.format(len(args.FILEPATH))) + + +##### -compare = Comparison(MHL_FILE_A, MHL_FILE_B) -compare.printInfo() -compare.checkCommon() -compare.checkDelta('A') -compare.checkDelta('B') -compare.printCount() print('--------------') From 074484c2b3d88d071c10f35bee9c67cf41bb1e32 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 22:10:16 +1100 Subject: [PATCH 07/14] - Security fix: Use PyInstaller >= 3.6 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c8c5e5b..30617d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ future==0.17.1 humanize==0.5.1 macholib==1.11 pefile==2018.8.8 -PyInstaller==3.4 +PyInstaller>=3.6 python-dateutil==2.8.0 six==1.12.0 termcolor==1.1.0 From ebea3a34dc23902d6b5e0a3e1fd55e887c84d263 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Sat, 21 Mar 2020 22:27:40 +1100 Subject: [PATCH 08/14] Commenting improvements --- mhl-compare.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index 886c33f..5d767b3 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -516,9 +516,10 @@ def checkDelta(self, letter): beenCounted = False # If this hash has been counted yet # Look for a match by other hash + # E.g., if XXHASH and MD5 present, search by MD5 for otherHashType, otherHashValue in hash.recordedHashes.items(): if otherHashType == hash.identifierType: - pass # to next hash in the list + pass # Skip the hash type we are already using hashPossible = oppositeMHL.findByOtherHash( otherHashType, otherHashValue ) if isinstance(hashPossible, HashNonexistent): @@ -608,7 +609,7 @@ def checkDelta(self, letter): ) if { 'filename', 'directory', 'size' }.issubset(dUnchanged): - # If neither of these variables have changed, then we almost have a perfect match. 
+ # If neither of these variables have changed, then we have a perfect match. # EVEN THOUGH we used a slightly different preferred hash. if not beenCounted: self.COUNT['PERFECT'] += 1 @@ -638,7 +639,9 @@ def checkDelta(self, letter): if 'size' in dChanged: # First, check if the Size is simply "Not specified" # This is not an anomaly if so. - if hashA.sizeDefined == False or hashB.sizeDefined == False: + if hash.sizeDefined == False: + # If we have come this far (hash match, name, directory) but size can't be compared + # That is as good as we are gonna get. self.COUNT['PERFECT'] += 1 beenCounted = True else: @@ -690,8 +693,8 @@ def checkDelta(self, letter): pass - if foundHashPossible is False: - # Begin to print the results + else: + # Else if foundHashPossible was False. self.COUNT['MISSING'] += 1 logDetail(' ' + color(hash.filename, listColor, attrs=LOG_COLOR_BOLD)) logDetail( From 4940a2e05fa56eb5e87f898a864834a67b3a6b2f Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Tue, 24 Mar 2020 22:27:23 +1100 Subject: [PATCH 09/14] - Better text display when sizing is not specified (i.e. 
a simple list of sums) - Handle greater variety of simple list of sums, drawn from samples I have from TeraCopy --- mhl-compare.py | 60 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/mhl-compare.py b/mhl-compare.py index 5d767b3..6bbc799 100644 --- a/mhl-compare.py +++ b/mhl-compare.py @@ -59,6 +59,10 @@ def showDate(dt): def humanSize(numBytes, showBytes=False): + if not numBytes: + # If for some reason you can't do maths on this 'None' + # Avoid it + return None if numBytes < 1024: return str(numBytes) + " bytes" else: @@ -109,7 +113,12 @@ def __init__(self, filepath): self.hashes = {} self.duplicates = set() - PATTERN_XXHASHLIST = re.compile('^([0-9a-fA-F]{16})\s{2}(.*)$') + PATTERNS_HASHLIST_SIMPLE = [ + '^([0-9a-fA-F]{16})\s{2}(.*)$', + '^([0-9a-fA-F]{16})\s\?XXHASH64\*(.*)$', + '^([0-9a-fA-F]{16})\s\*(.*)$', + '^([0-9a-fA-F]{32})\s\*(.*)$' + ] # (1) Try to parse it as XML try: @@ -117,12 +126,15 @@ def __init__(self, filepath): listObj = xmltodict.parse( f.read(), dict_constructor=dict ) self.originType = 'MHL' except: - # Syntax error from xmldict - # (2) Try parsing this as an .xxhash simple list of sums + # (2) Try parsing this as an .xxhash simple hashlist + # This is a basic single-line per file list, with hash at the + # beginning, 2 or so spaces, then filename. + # Typically no other data attributes, such as found in an MHL. with open(filepath, 'r') as f: lines = f.readlines() - f.close() - + # We will create a "faux MHL" with XML structure. + # The rest of Class MHL() will then navigate this XML just like + # it would our regular .mhl files. fauxMHL = { '_ORIGIN': os.path.basename(filepath), 'hashlist': { @@ -130,18 +142,33 @@ def __init__(self, filepath): } } for line in lines: - match = PATTERN_XXHASHLIST.match(line) + # Now test each line against our identified patterns above. 
+ match = False + for pattern in PATTERNS_HASHLIST_SIMPLE: + match = re.compile(pattern).match(line) + if match: + # Found a match, work with it + break + else: + # Keep going until we match + continue if match: hash = match[1] hashFilepath = match[2] - - # Create a faux MHL line, imitating XML already parsed as a dict fauxMHL_hash = { 'file': hashFilepath, 'size': None, 'xxhash64be': hash, } fauxMHL['hashlist']['hash'].append(fauxMHL_hash) + else: + # This line doesn't match, keep moving on. + continue + if len(fauxMHL['hashlist']['hash']) == 0: + # If no lines matched, then no hashes were added. + # Tell the user we couldn't get anything useful from file. + raise Exception("\n\n Unrecognised file: not an MHL nor a simple list of checksums." + "\n " + filepath) + # Now introduce the imposter XML. listObj = fauxMHL self.originType = 'HASHLIST_PLAIN' @@ -850,9 +877,14 @@ def keyfunc(x): print(color(dir, 'green', attrs=LOG_COLOR_BOLD) + ':') for item in items: print_filename = ' > ' + item.filename + if item.sizeDefined: + print_size = item.sizeHuman + else: + # Don't tack on the size if it's not defined + print_size = "" print_log_detail_to_add = '\t{} {}'.format( color('({})'.format(item.identifier), 'yellow'), - item.sizeHuman + print_size ) if LOG_VERBOSE == True: print(print_filename + print_log_detail_to_add) @@ -868,11 +900,11 @@ def keyfunc(x): print() print('--------------') # Summarise the MHL - print('{} files, {} in total'.format( - MHL.count(), - humanSize( MHL.totalSize(), showBytes=True ) - ) - ) + if MHL.totalSize(): + total_size_display = humanSize( MHL.totalSize(), showBytes=true ) + ' in total' + else: + total_size_display = 'No filesize information was present' + print('{} files, {}'.format(MHL.count(), total_size_display)) elif len(args.FILEPATH) == 2: From 52af7515aefde8d776a530de0926f3ff11582f09 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Tue, 24 Mar 2020 23:02:38 +1100 Subject: [PATCH 10/14] - big documentation update > although 
pending because im yet to update the Installer and binary methods, thats more of a 'tomorrow' thing --- README.md | 85 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index aab988a..7e20044 100644 --- a/README.md +++ b/README.md @@ -9,43 +9,53 @@ Useful when comparing two copies of media files that are intended to be the same ### Installation -* **Download version 0.3** (latest): - * https://github.com/seb26/mhl-compare/releases/download/v0.3/mhl-compare-v0.3.zip +* **Download version 0.4** (latest): + * https://github.com/seb26/mhl-compare/releases/download/v0.4/mhl-compare-v0.4.zip * Extract the zip -* In Finder, right-click on `install.command` > 'Open'. Don't *double-click*. -* Observe the warning about unidenitifed developer, and choose 'Open' anyway +* Open a Terminal window +* Copy the binary to your `/usr/local/bin` + `cp ~/Downloads/mhl-compare-v0.4/bin/mhl-compare /usr/local/bin/` +* That completes the installation, now you can use `mhl-compare` from anywhere in a Terminal. -#### Note -* If you *double-click* on `install.command` in Finder, it will give the warning: "can't be opened because it is from an unidentified developer". -* Instead, you must right-click on it in Finder and choose 'Open' from the context menu. -* Then, you are presented with the same dialog warning but now with an 'Open' button. -* Alternatively, open a Terminal window and run `./install.command`. +### Usage: compare two files -The unidentified developer status is being looked at in [issue #7](https://github.com/seb26/mhl-compare/issues/7). +In a Terminal window, run: -**What does install.command do anyway?** +``` +mhl-compare first.mhl second.mhl +``` -It only copies the `bin/mhl-compare` program into `/usr/local/bin/` which is the standard location on macOS for a user's command-line programs. Then, you can run mhl-compare from anywhere in a Terminal. 
+Alternatively, you can type just `mhl-compare`, and from Finder, drag the files onto the Terminal window directly. -### Usage +This will insert the full file path(s) for you, saving you from typing them manually. -1. In a Terminal window, run: +Then hit enter and check out the result. -``` -mhl-compare first.mhl second.mhl -``` +### Usage: summarise just one file -Alternatively, you can type just: ``` -mhl-compare +mhl-compare file.mhl ``` -And from Finder, drag the MHL files one-by-one, or together, and place them on the Terminal line. This will copy their full path for you, so you don't have to type it. -#### Options +A summary of the files listed in the MHL will be output, including a total number and a total size. They will appear grouped by directory. + +Useful if you just want to review the contents of an MHL rapidly, without tediously navigating the XML manually with your eyes or Ctrl+F searching it with difficulty. + +By default, only the files' names are shown in a long list. Run this with options (below) to see more details, such as hash, size, or date information. + +### Options * `-v, --verbose, --info` * Shows detailed, file-by-file description of the differences in each file. - * By default, only a brief summary counting the number of issues is shown on screen. + * Default without this option: a short summary of the similarity is shown including number of clips in common. There is no per-file detail. + +* `-b, --binary` + * Sizes are specified in binary format (i.e. 1 KiB = 1,024 bytes) which is relevant on Windows platform. + * Default without this option: sizes are shown in decimal format (1 KB = 1,000 bytes), relevant for macOS. +* `-d, --dates` + * Shows date-related attributes contained within a file, if available. + * These may include a file's creation date (`creationdate`), modified date (`lastmodificationdate`) or date of hashing (`hashdate`). + * Default without this option: Dates are not shown at all. 
--- ### Example scenario @@ -57,23 +67,42 @@ The benefit is that you and your colleague would be able to see if you legitimat Additionally, MHL files are small (typically much less than 500 KB) and contain just XML, so it may be more practical to compare *them* instead when working with large media collections, where it is too time-consuming to read and verify the media files themselves, or they are stored in other physical locations. +### Example scenario 2 +You have a single MHL file and wish to understand which files are listed within it. Double-clicking or opening the file in any other program will prompt you to verify the contents; you don't wish to do so, as you only want to know the basics first. + +You can run mhl-compare on just this one file, and see a list of files contained within the MHL and other attributes about them. mhl-compare will spit out the number of files and also the total size. Learning the size will permit you to immediately perform a size-based comparison against the size of the folder of files itself. + +*Without mhl-compare*: while it is possible to open an MHL file in any text editor and view the list of files described within and Ctrl + F to find relevant files, it is not laid out in a very accessible or easy-to-read fashion. It is also completely impractical to quickly figure out the total size of the files, since the size is only shown individually, and you would have to use a calculator or some other automated means to combine all the individual sizes into a single sum. + --- ### Compatibility -#### With MHL files -Can open any MHL file that is in [the standard XML format for MHL files](https://mediahashlist.org/mhl-specification/). +#### Files it can open +Any MHL file that follows [the standard XML format for MHL files](https://mediahashlist.org/mhl-specification/) + * This includes MHL files from: Silverstack, ShotPut Pro, YoYotta, Hedge, TeraCopy, and others.
-#### With running the program itself (the regular download) +Also, simple lists of checksums are supported as well, such as `.md5` or `.xxhash` files, which are typically in a one-line-per-file structure like below: +``` +09ad6a59a9232f81 file.txt +``` + +#### Running the program itself (the regular download) Only runs on macOS. Tested only on macOS 10.14.3. It is likely to run successfully on older versions though, it's not a very complex program. -#### With running the program as a Python script -Has been tested on Python 3.7.2 on macOS 10.14.3. Written in Python 3, so in its source format, it is not compatible with Python 2 branch, and cannot be run on macOS. +#### Running the program as a Python script +Originally written on Python 3.7.2 on macOS 10.14.3, and developed today on Python 3.7.4 with macOS 10.14.6. Because it has been written with the Python 3 branch, it is not compatible with Python 2, and cannot be run on macOS without the additional installation of Python 3. -Has not been tested on Windows or Linux, but Python is generally pretty functional across OSs. +I understand this is a huge caveat since it would be great to install and use mhl-compare quickly on foreign machines, which you might not have admin access to or the time required to install additional software like Python 3. +##### Dependencies Dependency libraries: [`xmltodict`](https://github.com/martinblech/xmltodict), [`dateutil`](https://dateutil.readthedocs.io/en/stable/), [`humanize`](https://pypi.org/project/humanize/), [`termcolor`](https://pypi.org/project/termcolor/), [`dictdiffer`](https://github.com/hughdbrown/dictdiffer). +#### On other operating systems + +Has not been tested on Windows or Linux, but Python is generally pretty functional across OSs, so it is likely to work fine. 
+ + --- ### Development From cfe578fb163da1c80eef5c0b065f9bdcecde11f4 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Wed, 25 Mar 2020 10:25:48 +1100 Subject: [PATCH 11/14] organise folders a bit --- zResearch/MediaHashList_v1_1.xsd | 161 ++++++++++++++++++ zResearch/pomfort_xsd_observations.md | 42 +++++ {scraps => zScraps}/desired_console_output.md | 0 .../legacy_mhl_compare_script.py | 0 {scraps => zScraps}/mhl-compare-intersect.py | 0 {scraps => zScraps}/outcomes.csv | 0 {scraps => zScraps}/sandbox.py | 0 {scraps => zScraps}/sandbox2.py | 0 {scraps => zScraps}/test_recordResult.py | 0 9 files changed, 203 insertions(+) create mode 100644 zResearch/MediaHashList_v1_1.xsd create mode 100644 zResearch/pomfort_xsd_observations.md rename {scraps => zScraps}/desired_console_output.md (100%) rename {scraps => zScraps}/legacy_mhl_compare_script.py (100%) rename {scraps => zScraps}/mhl-compare-intersect.py (100%) rename {scraps => zScraps}/outcomes.csv (100%) rename {scraps => zScraps}/sandbox.py (100%) rename {scraps => zScraps}/sandbox2.py (100%) rename {scraps => zScraps}/test_recordResult.py (100%) diff --git a/zResearch/MediaHashList_v1_1.xsd b/zResearch/MediaHashList_v1_1.xsd new file mode 100644 index 0000000..a83900a --- /dev/null +++ b/zResearch/MediaHashList_v1_1.xsd @@ -0,0 +1,161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/zResearch/pomfort_xsd_observations.md b/zResearch/pomfort_xsd_observations.md new file mode 100644 index 0000000..10e7990 --- /dev/null +++ b/zResearch/pomfort_xsd_observations.md @@ -0,0 +1,42 @@ +My observations of the XSD format provided by Pomfort. 
+ +My words written in dot points, and code blocks show parts of the schema that I don't quite understand. + +## Types +* MD5, restricted to hexBinary, length: 16 +* SHA1, restricted to hexBinary, length 20 +* xxhash, restricted to integer, total digits 10 +* xxhash64, restricted to hexBinary, length 8 +* Version: +``` + + + + + + +``` + +## Simple elements +* `file` is a string +* Size is a positive integer +* All dates are dateTime (note: no apparent requirement for timezone information) +* Most MHL metadata, like name, computer name, program generator, are strings + +## Attributes +``` + + + +``` + +* `referencehashlist` is used to make reference to another MHL file, to allow for recursive searching and verifying of other directories as described by other MHL files (see Pomfort's [January 2012 proposal document, page 7](https://mediahashlist.org/wp-content/uploads/2012/01/Media-Hash-File-Format-Proposal-v1_3.pdf)). + +## Complex elements +* `` +* `` +* `` +* `` -- A comment describes this as "deprecated, little endian xxhash64" +* `` +* A null element, string, only used for "file size verification" +* `` -- which packages the metadata about who created the MHL file diff --git a/scraps/desired_console_output.md b/zScraps/desired_console_output.md similarity index 100% rename from scraps/desired_console_output.md rename to zScraps/desired_console_output.md diff --git a/scraps/legacy_mhl_compare_script.py b/zScraps/legacy_mhl_compare_script.py similarity index 100% rename from scraps/legacy_mhl_compare_script.py rename to zScraps/legacy_mhl_compare_script.py diff --git a/scraps/mhl-compare-intersect.py b/zScraps/mhl-compare-intersect.py similarity index 100% rename from scraps/mhl-compare-intersect.py rename to zScraps/mhl-compare-intersect.py diff --git a/scraps/outcomes.csv b/zScraps/outcomes.csv similarity index 100% rename from scraps/outcomes.csv rename to zScraps/outcomes.csv diff --git a/scraps/sandbox.py b/zScraps/sandbox.py similarity index 100% rename
from scraps/sandbox.py rename to zScraps/sandbox.py diff --git a/scraps/sandbox2.py b/zScraps/sandbox2.py similarity index 100% rename from scraps/sandbox2.py rename to zScraps/sandbox2.py diff --git a/scraps/test_recordResult.py b/zScraps/test_recordResult.py similarity index 100% rename from scraps/test_recordResult.py rename to zScraps/test_recordResult.py From 370dcfaf9eaf7c06aba432bc17140260730084d3 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Wed, 25 Mar 2020 10:26:21 +1100 Subject: [PATCH 12/14] more organise --- mhl-analysis/MediaHashList_v1_1.xsd | 161 ----------------------- mhl-analysis/pomfort_xsd_observations.md | 42 ------ 2 files changed, 203 deletions(-) delete mode 100644 mhl-analysis/MediaHashList_v1_1.xsd delete mode 100644 mhl-analysis/pomfort_xsd_observations.md diff --git a/mhl-analysis/MediaHashList_v1_1.xsd b/mhl-analysis/MediaHashList_v1_1.xsd deleted file mode 100644 index a83900a..0000000 --- a/mhl-analysis/MediaHashList_v1_1.xsd +++ /dev/null @@ -1,161 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/mhl-analysis/pomfort_xsd_observations.md b/mhl-analysis/pomfort_xsd_observations.md deleted file mode 100644 index 10e7990..0000000 --- a/mhl-analysis/pomfort_xsd_observations.md +++ /dev/null @@ -1,42 +0,0 @@ -My observations of the XSD format provided by Pomfort. - -My words written in dot points, and code blocks show parts of the schema that I don't quite understand. 
- -## Types -* MD5, restricted to hexBinary, length: 16 -* SHA1, restricted to hexBinary, length 20 -* xxhash, restricted to integer, total digits 10 -* xxhash64, restricted to hexBinary, length 8 -* Version: -``` - - - - - - -``` - -## Simple elements -* `file` is a string -* Size is a positive integer -* All dates are dateTime (note: no apparent requirement for timezone information) -* Most MHL metadata, like name, computer name, program generator, are strings - -## Attributes -``` - - - -``` - -* `referencehhashlist` is used to make reference to another MHL file, to allow for recursive searching and verifying of other directories as described by other MHL files (see Pomfort's [January 2012 proposal document, page 7](https://mediahashlist.org/wp-content/uploads/2012/01/Media-Hash-File-Format-Proposal-v1_3.pdf)). - -## Complex elements -* `` -* `` -* `` -* `` -- A comment describes this as "deprecated, little endian xxhash64" -* `` -* A null element, string, only used for "file size verification" -* `` -- which packages the metadata about who created the MHL file From e83699951c63e077fda0fc7e7ba1ac395dee951b Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Wed, 25 Mar 2020 10:27:52 +1100 Subject: [PATCH 13/14] overhaul installation - no more install.commands, which introduced silly security issues on macOS and are not very user friendly either --- build-zip.sh | 20 ++++++++++++-------- dist-include/install.command | 9 --------- dist-include/uninstall.command | 17 ----------------- 3 files changed, 12 insertions(+), 34 deletions(-) delete mode 100755 dist-include/install.command delete mode 100755 dist-include/uninstall.command diff --git a/build-zip.sh b/build-zip.sh index 986dc2b..43ebf6d 100755 --- a/build-zip.sh +++ b/build-zip.sh @@ -1,8 +1,12 @@ -rm -r .tmp -mkdir .tmp -mkdir .tmp/bin -cp dist/mhl-compare ./.tmp/bin/mhl-compare -cp dist-include/* ./.tmp/ -cp README.md ./.tmp/ -cd .tmp -zip -9r - * > ../downloads/mhl-compare-$1.zip +if [[ $1 ]]; then + 
rm -rf .tmp/ || true + mkdir .tmp/ + mkdir .tmp/bin/ + cp dist/mhl-compare .tmp/mhl-compare + cp README.md .tmp/ + cd .tmp + zip -9r -FS - * > ../downloads/mhl-compare-$1.zip +else + echo Please run again and specify the version number as first parameter. + exit 1 +fi diff --git a/dist-include/install.command b/dist-include/install.command deleted file mode 100755 index b02a116..0000000 --- a/dist-include/install.command +++ /dev/null @@ -1,9 +0,0 @@ -echo "--------------------------------" -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd "${DIR}" -echo "Installing mhl-compare to /usr/local/bin..." -cp bin/mhl-compare /usr/local/bin -echo "Done." -echo "Now you can run it anywhere in a Terminal by typing: mhl-compare" -echo "" -echo -e "\x1B[36mYou can now close this window.\x1B[0m" diff --git a/dist-include/uninstall.command b/dist-include/uninstall.command deleted file mode 100755 index bd21e3b..0000000 --- a/dist-include/uninstall.command +++ /dev/null @@ -1,17 +0,0 @@ -echo "--------------------------------" -echo "This will uninstall mhl-compare." -echo "It will simply delete mhl-compare from: /usr/local/bin" -echo "It won't modify anything else on the system." -echo "" -read -r -p "Are you sure you want to uninstall? [y/N] " response -case "$response" in - [yY][eE][sS]|[yY]) - echo "Uninstalling..." - rm /usr/local/bin/mhl-compare - ;; - *) - echo "OK, nothing done." 
- ;; -esac -echo "" -echo -e "\x1B[36mYou can now close this Terminal window.\x1B[0m" From df601a1baef473faae4c8cb373110915a5d098a5 Mon Sep 17 00:00:00 2001 From: Sebastian Reategui Date: Wed, 25 Mar 2020 10:28:10 +1100 Subject: [PATCH 14/14] clarify the readme a bit --- README.md | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7e20044..3096931 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,33 @@ Useful when comparing two copies of media files that are intended to be the same --- -### Installation +### Download -* **Download version 0.4** (latest): +**Download version 0.4** (latest): * https://github.com/seb26/mhl-compare/releases/download/v0.4/mhl-compare-v0.4.zip + +### Installation + +**Method via Finder** +* Extract the zip (double-click it) +* **Copy** the binary file `mhl-compare` to clipboard +* In Finder toolbar, choose Go > Go to Folder... or *Shift + Cmd + G* +* Type: `/usr/local/bin` +* Inside this folder /usr/local/bin, **paste** the file you just copied +* Done. To use, scroll down to section on Usage. + +**Method via Terminal** + * Extract the zip -* Open a Terminal window -* Copy the binary to your `/usr/local/bin` - `cp ~/Downloads/mhl-compare-v0.4/bin/mhl-compare /usr/local/bin/` -* That completes the installation, now you can use `mhl-compare` from anywhere in a Terminal. +* Then run: + +``` +cp ~/Downloads/mhl-compare-v0.4/mhl-compare /usr/local/bin/ +``` + +That completes the installation, now you can run `mhl-compare` from anywhere in a Terminal. + +---- ### Usage: compare two files