Skip to content

Commit

Permalink
update binaryninja plugin to display and sort highlighted functions
Browse files Browse the repository at this point in the history
  • Loading branch information
evandowning committed Mar 12, 2021
1 parent bb6a817 commit 3609359
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 36 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,16 @@ For technical details, please see the paper cited below.

## Coming soon
- Dockerfile
- BinaryNinja plugin

--------------------------------------------------------------------------------

## Setup
- Requirements:
- Tested on Debian 10 (Buster)
- Python 3 (tested with Python 3.7.3) and pip
- virtualenvwrapper (optional, but recommended)
- BinaryNinja 2.3 (used to extract features and function information from binaries)
- PostgreSQL 11.10 (to store results)
- virtualenvwrapper (optional, but recommended)
- parallel (optional, but recommended)
- Setup:
```
Expand Down
10 changes: 5 additions & 5 deletions autoencoder/mse_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,20 @@ def _main():
X_addr = np.load(roiAddr)

# Read in score data (for storing in database, not for clustering)
sample = list()
sample = set()
for mseFN in X_fn:
base = '/'.join(mseFN.split('/')[-2:])
funcFN = os.path.join(funcFolder,base[:-3]+'txt')
featFN = os.path.join(featFolder,base)
sample.append([mseFN,funcFN,featFN])
sample.add((mseFN,funcFN,featFN))

data = RoI(sample,threshold,None)

# Get MSE values of highlighted functions
count = 0
func_score = dict()
for mse_func,mseFN in data.function_highlight_generator():
sys.stderr.write('Processing functions: {0}/{1}\r'.format(count+1,len(X_addr)))
sys.stderr.write('Processing functions: {0}/{1}\r'.format(count+1,len(sample)))
sys.stderr.flush()

for f_addr,t in mse_func.items():
Expand All @@ -72,9 +72,9 @@ def _main():
for e,mseFN in enumerate(X_fn):
addr = int(X_addr[e],16)

#TODO - why does this happen?
# NOTE: some addresses are missing from func_score for their MSE file; the cause is still unknown
if addr not in func_score[mseFN]:
sys.stderr.write('{0} {1} {2}\n'.format(e, mseFN, hex(addr)))
sys.stderr.write('Wasn\'t found in original mse file: {0} {1} {2}\n'.format(e, mseFN, hex(addr)))
X_score.append(-1)
continue

Expand Down
2 changes: 0 additions & 2 deletions binaryninja/deepreflect/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,3 @@ Thank you to [Scott Bergstresser](https://github.com/sab4tg) for creating the al
```
$ cp -r ./binaryninja/deepreflect ~/.binaryninja/plugins/
```

## Usage
55 changes: 33 additions & 22 deletions binaryninja/deepreflect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def connect_db(bv):
try:
conn = psycopg2.connect("dbname='{0}' user='{1}' host='localhost' password='{2}'".format(dbName,dbUser,dbPass))
except Exception as e:
sys.stdout.write('{0}\n'.format(str(e)))
sys.stdout.write('No connection made to db. See log window for details.\n')
sys.stderr.write('{0}\n'.format(str(e)))
sys.stderr.write('No connection made to db. See log window for details.\n')

return conn,hash_val

Expand Down Expand Up @@ -94,15 +94,17 @@ def display_all(bv):
# Get all functions in database
functions = get_all_functions(cur,hash_val)

sys.stdout.write('{0} | {1} | {2} | {3}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16)))
sample_hash = functions[0][1]
sample_family = functions[0][2]
sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16)))

sys.stdout.write('{0} | {1}\n'.format('function address'.ljust(18),'function label'.ljust(16)))
for row in functions:
sample_hash = row[1]
sample_family = row[2]
func_label = row[3]
func_addr = row[4]
cid = row[5]

sys.stdout.write('{0} | {1} | {2} | {3}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(func_addr.ljust(18),func_label.ljust(16)))
sys.stdout.write('\n')

conn.commit()
Expand All @@ -125,15 +127,17 @@ def display_highlight(bv):
# Get highlighted functions in database
functions = get_highlight_functions(cur,hash_val)

sys.stdout.write('{0} | {1} | {2} | {3}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16)))
sample_hash = functions[0][1]
sample_family = functions[0][2]
sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16)))

sys.stdout.write('{0} | {1}\n'.format('function address'.ljust(18),'function label'.ljust(16)))
for row in functions:
sample_hash = row[1]
sample_family = row[2]
func_label = row[3]
func_addr = row[4]
cid = row[5]

sys.stdout.write('{0} | {1} | {2} | {3}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(func_addr.ljust(18),func_label.ljust(16)))
sys.stdout.write('\n')

conn.commit()
Expand Down Expand Up @@ -196,7 +200,6 @@ def modify_label(bv, function):
# Close connection
conn.close()

#TODO
# Sort functions by DeepReflect score
def sort_score(bv):
# Connect to database
Expand All @@ -220,7 +223,7 @@ def sort_score(bv):
func_label = row[3]
func_addr = row[4]
cid = row[5]
score = row[8]
score = row[7]

sort_func.append((sample_hash,sample_family,func_addr,func_label,score))

Expand All @@ -229,11 +232,13 @@ def sort_score(bv):
# Close connection
conn.close()

# Get function callees
sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16)))

# Sort functions by DeepReflect score
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'score'))
for func_addr in sorted(count, key=lambda x: x[-1], reverse=True):
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),func_label.ljust(16),score))
sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'score'))
for sample_hash,sample_family,func_addr,func_label,score in sorted(sort_func, key=lambda x: x[-1], reverse=True):
sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),func_label.ljust(16),score))
sys.stdout.write('\n')

# Sort highlighted functions by number of basic blocks
Expand Down Expand Up @@ -279,10 +284,13 @@ def sort_size(bv):
num_bb = len(function.basic_blocks)
count[func_addr] = num_bb

sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16)))

# Sort functions by number of basic blocks
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'number of basic blocks'))
sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'number of basic blocks'))
for func_addr,_ in sorted(count.items(), key=lambda x: x[1], reverse=True):
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr]))
sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr]))
sys.stdout.write('\n')

# Sort highlighted functions by number of callees
Expand Down Expand Up @@ -332,10 +340,13 @@ def sort_callee(bv):
if symbol_type in [0,1,2,4,5,6]:
count[func_addr] += 1

sys.stdout.write('{0} | {1}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16)))
sys.stdout.write('{0} | {1}\n'.format(sample_hash.ljust(64),sample_family.ljust(16)))

# Sort functions by number of callees
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format('sample hash'.ljust(64),'sample family'.ljust(16),'function address'.ljust(18),'function label'.ljust(16),'number of callees'))
sys.stdout.write('{0} | {1} | {2}\n'.format('function address'.ljust(18),'function label'.ljust(16),'number of callees'))
for func_addr,_ in sorted(count.items(), key=lambda x: x[1], reverse=True):
sys.stdout.write('{0} | {1} | {2} | {3} | {4}\n'.format(sample_hash.ljust(64),sample_family.ljust(16),func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr]))
sys.stdout.write('{0} | {1} | {2}\n'.format(func_addr.ljust(18),label[func_addr].ljust(16),count[func_addr]))
sys.stdout.write('\n')

# Register plugin options
Expand Down
6 changes: 3 additions & 3 deletions cluster/pca_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,14 @@ def _main():
sample_hash = fn.split('/')[-1][:-4]
family = fn.split('/')[-2]

sys.stdout.write('{0} {1} {2} {3}\n'.format(fn,addr,k,p))

# Construct unique identifier
unique_string = str(sample_hash + family + addr).encode('utf-8')
unique_ID = hashlib.sha256(unique_string).hexdigest()

# If entry already exists, just update cluster ID
cur.execute("INSERT INTO dr(unique_ID,hash,family,func_addr,cid,score) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT UPDATE cid=%s, score=%s", (unique_ID, sample_hash, family, addr, cid, score, cid, score))

sys.stdout.write('{0} {1} {2} {3}\n'.format(fn,addr,k,p))
cur.execute("INSERT INTO dr(unique_ID,hash,family,func_addr,cid,score) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (unique_ID) DO UPDATE SET cid=%s, score=%s", (unique_ID, sample_hash, family, addr, cid, score, cid, score))

# Commit transactions
conn.commit()
Expand Down
2 changes: 1 addition & 1 deletion db/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CREATE TABLE dr(
/* SHA256 hash of sample */
hash char(64),
/* family label */
family varchar(64) DEFAULT 'unknown',
family varchar(128) DEFAULT 'unknown',
/* function label */
label varchar(128) DEFAULT 'unlabeled',
/* function address */
Expand Down
1 change: 0 additions & 1 deletion dr.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ def roi_generator(self):
continue

sys.stdout.write('Number of RoIs (basic blocks): {0}\n'.format(len(addr)))
# sys.stderr.write('BB HIGHLIGHT: {0} {1}\n'.format(mseFN,addr))

# Get mapping between basic blocks and the functions they belong to
bb_map,func_map = self.get_mapping(funcFN)
Expand Down

0 comments on commit 3609359

Please sign in to comment.