fixPairwiseWorker.py
# One update was not inserted in the update list,
# making all subsequent ids incorrect when using get_precomp_feats...
# This script removes similar images where the query or similar ids are higher than the last correct id.
# http://happybase.readthedocs.org/en/latest/user.html
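# Strategy:
#   - rows whose key is beyond biggest_correct_htid get their columbia_near_dups columns deleted
#   - valid rows get out-of-range neighbors (and their distances) pruned, and biggest_dbid reset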
import happybase
import sys
import numpy as np
import pickle
import csv
import time
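# Connect to the HBase Thrift server (host is hard-coded) and open the HT images table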
connection = happybase.Connection('10.1.94.57')
# This fails...
# alltables = connection.tables()
tab = connection.table('aaron_memex_ht-images')
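# Last row id (htid) and unique id known to be correct before the failed update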
biggest_correct_htid = 89830995
biggest_correct_uniqueid = 22541770
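# Number of consecutive row ids assigned to each worker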
step = 4000000
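# The worker index is taken from the first command-line argument (defaults to 0)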
if len(sys.argv) > 1:
    worker_id = int(sys.argv[1])
else:
    worker_id = 0
print "I am worker", str(worker_id)
time.sleep(5)
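# Walk this worker's id range in chunks of our_batch_size rows, reading only the
# three columbia_near_dups columns and queuing all fixes through an HBase batch.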
if __name__ == '__main__':
    our_batch_size = 1000
    b = tab.batch()
    start_row = step * worker_id
    end_row = step * (worker_id + 1)
    print "Getting rows in range", str(start_row) + "-" + str(end_row)
    for one_start in range(start_row, end_row, our_batch_size):
        rows_ids = [str(row) for row in range(one_start, one_start + our_batch_size)]
        batch_nbmod = 0
        #for key, data in tab.scan(row_start=str(start_row), row_stop=str(end_row), batch_size=our_batch_size, columns=('meta:columbia_near_dups', 'meta:columbia_near_dups_dist', 'meta:columbia_near_dups_biggest_dbid')):
        for key, data in tab.rows(rows_ids, columns=('meta:columbia_near_dups', 'meta:columbia_near_dups_dist', 'meta:columbia_near_dups_biggest_dbid')):
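            # tab.rows() only returns rows that actually exist, so missing ids in the range are skipped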
            #print key, data
            if 'meta:columbia_near_dups' in data.keys():
                if int(key) > biggest_correct_htid:  # remove all columbia_near_dups columns
                    print "key is too big, we should delete all columbia_near_dups", key
                    b.delete(key, columns=('meta:columbia_near_dups', 'meta:columbia_near_dups_dist', 'meta:columbia_near_dups_biggest_dbid'))
                    batch_nbmod = batch_nbmod + 1
                else:  # key is ok, but some of its near_dups may not be
                    # update biggest id
                    b.put(key, {'meta:columbia_near_dups_biggest_dbid': str(biggest_correct_uniqueid)})
                    neighbors_list = data['meta:columbia_near_dups'].rsplit(',')
                    dist_list = data['meta:columbia_near_dups_dist'].rsplit(',')
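                    # Prune neighbors whose id is past the last correct id, keeping the distance list aligned;
                    # iterate over a copy since entries are popped from neighbors_list below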
#print key, "Initially we have these neighbors", neighbors_list
del_neighs = []
for neigh in neighbors_list:
#print neigh
if int(neigh) > biggest_correct_htid:
#print "We should remove neighbor", neigh
# we need to deal with biggest_id and dist fields too
del_neigh = neighbors_list.index(neigh)
neighbors_list.pop(del_neigh)
dist_list.pop(del_neigh)
del_neighs.append(neigh)
# should we delete the neigh row too? We might delete twice...
#b.delete(neigh,columns=('meta:columbia_near_dups','meta:columbia_near_dups_dist','meta:columbia_near_dups_biggest_dbid'))
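                    # Rewrite the neighbor and distance columns only if at least one neighbor was removed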
                    if len(del_neighs) > 0:
                        print key, "We have deleted these similar images:", del_neighs
                        neighs_str = ','.join(map(str, neighbors_list))
                        dist_str = ','.join(map(str, dist_list))
                        #print len(neighbors_list), neighs_str
                        #print len(dist_list), dist_str
                        b.put(key, {'meta:columbia_near_dups': neighs_str})
                        b.put(key, {'meta:columbia_near_dups_dist': dist_str})
                        batch_nbmod = batch_nbmod + 1
        if batch_nbmod > 0:
            # push the modifications queued for this chunk of rows
            print "Pushing batch of modifications"
            b.send()
            batch_nbmod = 0
    #quit()