diff --git a/README.md b/README.md index 1f75a79..bc59592 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,6 @@ For technical details, please see the paper cited below. ## Coming soon - Dockerfile - - BinaryNinja plugin -------------------------------------------------------------------------------- @@ -24,9 +23,9 @@ For technical details, please see the paper cited below. - Requirements: - Tested on Debian 10 (Buster) - Python 3 (tested with Python 3.7.3) and pip - - virtualenvwrapper (optional, but recommended) - BinaryNinja 2.3 (used to extract features and function information from binaries) - PostgreSQL 11.10 (to store results) + - virtualenvwrapper (optional, but recommended) - parallel (optional, but recommended) - Setup: ``` diff --git a/autoencoder/mse_func.py b/autoencoder/mse_func.py index c719676..8ea9d3e 100644 --- a/autoencoder/mse_func.py +++ b/autoencoder/mse_func.py @@ -37,12 +37,12 @@ def _main(): X_addr = np.load(roiAddr) # Read in score data (for storing in database, not for clustering) - sample = list() + sample = set() for mseFN in X_fn: base = '/'.join(mseFN.split('/')[-2:]) funcFN = os.path.join(funcFolder,base[:-3]+'txt') featFN = os.path.join(featFolder,base) - sample.append([mseFN,funcFN,featFN]) + sample.add((mseFN,funcFN,featFN)) data = RoI(sample,threshold,None) @@ -50,7 +50,7 @@ def _main(): count = 0 func_score = dict() for mse_func,mseFN in data.function_highlight_generator(): - sys.stderr.write('Processing functions: {0}/{1}\r'.format(count+1,len(X_addr))) + sys.stderr.write('Processing functions: {0}/{1}\r'.format(count+1,len(sample))) sys.stderr.flush() for f_addr,t in mse_func.items(): @@ -72,9 +72,9 @@ def _main(): for e,mseFN in enumerate(X_fn): addr = int(X_addr[e],16) - #TODO - why does this happen? + # NOTE: not sure why this happens sometimes if addr not in func_score[mseFN]: - sys.stderr.write('{0} {1} {2}\n'.format(e, mseFN, hex(addr))) + sys.stderr.write('Wasn\'t found in original mse file: {0} {1} {2}\n'.format(e, mseFN, hex(addr))) X_score.append(-1) continue diff --git a/binaryninja/deepreflect/README.md b/binaryninja/deepreflect/README.md index 8e3129e..34c4121 100644 --- a/binaryninja/deepreflect/README.md +++ b/binaryninja/deepreflect/README.md @@ -11,5 +11,3 @@ Thank you to [Scott Bergstresser](https://github.com/sab4tg) for creating the al ``` $ cp -r ./binaryninja/deepreflect ~/.binaryninja/plugins/ ``` - -## Usage diff --git a/binaryninja/deepreflect/__init__.py b/binaryninja/deepreflect/__init__.py index 66ab8af..6161189 100644 --- a/binaryninja/deepreflect/__init__.py +++ b/binaryninja/deepreflect/__init__.py @@ -30,8 +30,8 @@ def connect_db(bv): try: conn = psycopg2.connect("dbname='{0}' user='{1}' host='localhost' password='{2}'".format(dbName,dbUser,dbPass)) except Exception as e: - sys.stdout.write('{0}\n'.format(str(e))) - sys.stdout.write('No connection made to db. See log window for details.\n') + sys.stderr.write('{0}\n'.format(str(e))) + sys.stderr.write('No connection made to db. See log window for details.\n') return conn,hash_val @@ -94,15 +94,17 @@ def display_all(bv): # Get all functions in database functions = get_all_functions(cur,hash_val) - sys.stdout.write('{0} | {1} | {2} | {3}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16))) + sample_hash = functions[0][1] + sample_family = functions[0][2] + sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16))) + + sys.stdout.write('{0} | {1}\n'.format('function address'.ljust(18),'function label'.ljust(16))) for row in functions: - sample_hash = row[1] - sample_family = row[2] func_label = row[3] func_addr = row[4] - cid = row[5] - sys.stdout.write('{0} | {1} | {2} | {3}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(func_addr.ljust(18),func_label.ljust(16))) sys.stdout.write('\n') conn.commit() @@ -125,15 +127,17 @@ def display_highlight(bv): # Get highlighted functions in database functions = get_highlight_functions(cur,hash_val) - sys.stdout.write('{0} | {1} | {2} | {3}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16))) + sample_hash = functions[0][1] + sample_family = functions[0][2] + sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16))) + + sys.stdout.write('{0} | {1}\n'.format('function address'.ljust(18),'function label'.ljust(16))) for row in functions: - sample_hash = row[1] - sample_family = row[2] func_label = row[3] func_addr = row[4] - cid = row[5] - sys.stdout.write('{0} | {1} | {2} | {3}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(func_addr.ljust(18),func_label.ljust(16))) sys.stdout.write('\n') conn.commit() @@ -196,7 +200,6 @@ def modify_label(bv, function): # Close connection conn.close() -#TODO # Sort functions by DeepReflect score def sort_score(bv): # Connect to database @@ -220,7 +223,7 @@ def sort_score(bv): func_label = row[3] func_addr = row[4] cid = row[5] - score = row[8] + score = row[7] sort_func.append((sample_hash,sample_family,func_addr,func_label,score)) @@ -229,11 +232,13 @@ def sort_score(bv): # Close connection conn.close() - # Get function callees + sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16))) + # Sort functions by number of callees - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'score')) - for func_addr in sorted(count, key=lambda x: x[-1], reverse=True): - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16),score)) + sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'score')) + for sample_hash,sample_family,func_addr,func_label,score in sorted(sort_func, key=lambda x: x[-1], reverse=True): + sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),func_label.ljust(16),score)) sys.stdout.write('\n') # Sort highlighted functions by number of basic blocks @@ -279,10 +284,13 @@ def sort_size(bv): num_bb = len(function.basic_blocks) count[func_addr] = num_bb + sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16))) + # Sort functions by number of callees - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'number of basic blocks')) + sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'number of basic blocks')) for func_addr,_ in sorted(count.items(), key=lambda x: x[1], reverse=True): - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr])) + sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr])) sys.stdout.write('\n') # Sort highlighted functions by number of callees @@ -332,10 +340,13 @@ def sort_callee(bv): if symbol_type in [0,1,2,4,5,6]: count[func_addr] += 1 + sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16))) + sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16))) + # Sort functions by number of callees - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'number of callees')) + sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'number of callees')) for func_addr,_ in sorted(count.items(), key=lambda x: x[1], reverse=True): - sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr])) + sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr])) sys.stdout.write('\n') # Register plugin options diff --git a/cluster/pca_hdbscan.py b/cluster/pca_hdbscan.py index e4cdf82..0ea4f64 100644 --- a/cluster/pca_hdbscan.py +++ b/cluster/pca_hdbscan.py @@ -116,14 +116,14 @@ def _main(): sample_hash = fn.split('/')[-1][:-4] family = fn.split('/')[-2] + sys.stdout.write('{0} {1} {2} {3}\n'.format(fn,addr,k,p)) + # Construct unique identifier unique_string = str(sample_hash + family + addr).encode('utf-8') unique_ID = hashlib.sha256(unique_string).hexdigest() # If entry already exists, just update cluster ID - cur.execute("INSERT INTO dr(unique_ID,hash,family,func_addr,cid,score) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT UPDATE cid=%s, score=%s", (unique_ID, sample_hash, family, addr, cid, score, cid, score)) - - sys.stdout.write('{0} {1} {2} {3}\n'.format(fn,addr,k,p)) + cur.execute("INSERT INTO dr(unique_ID,hash,family,func_addr,cid,score) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (unique_ID) DO UPDATE SET cid=%s, score=%s", (unique_ID, sample_hash, family, addr, cid, score, cid, score)) # Commit transactions conn.commit() diff --git a/db/create.sql b/db/create.sql index 4c2be05..4a29dd2 100644 --- a/db/create.sql +++ b/db/create.sql @@ -4,7 +4,7 @@ CREATE TABLE dr( /* SHA256 hash of sample */ hash char(64), /* family label */ - family varchar(64) DEFAULT 'unknown', + family varchar(128) DEFAULT 'unknown', /* function label */ label varchar(128) DEFAULT 'unlabeled', /* function address */ diff --git a/dr.py b/dr.py index 95b917d..053da5f 100644 --- a/dr.py +++ b/dr.py @@ -243,7 +243,6 @@ def roi_generator(self): continue sys.stdout.write('Number of RoIs (basic blocks): {0}\n'.format(len(addr))) -# sys.stderr.write('BB HIGHLIGHT: {0} {1}\n'.format(mseFN,addr)) # Get mapping between basic blocks and the functions they belong to bb_map,func_map = self.get_mapping(funcFN)