-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfindCCHTImages.py
141 lines (137 loc) · 4.63 KB
/
findCCHTImages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Build connected components over near-duplicate image links stored in HBase.
# HappyBase client docs: http://happybase.readthedocs.org/en/latest/user.html
import happybase,sys
import numpy as np
import pickle
import csv
# NOTE(review): hard-coded HBase Thrift host — consider making it configurable.
# Connecting at import time means any module import hits the network.
connection = happybase.Connection('10.1.94.57')
# This fails...
#alltables = connection.tables()
# Table holding one row per image; edges live in 'meta:columbia_near_dups'.
tab = connection.table('aaron_memex_ht-images')
def getCC(images_ht): # only for undirected graph
    """Return the connected components of the image graph.

    Parameters:
        images_ht: dict with two parallel lists —
            'id': node ids (ints or numeric strings), and
            'edges': comma-separated neighbor-id strings, one per node.

    Returns:
        list of components, each a list of int node ids.
    """
    # Map each node id (normalized to int) to its position once, so
    # neighbor lookups are O(1) instead of a list.index() scan per edge.
    # Normalizing also fixes the old mixed-type bug: ids were compared as
    # raw (possibly string) values against stored ints, so string row keys
    # never matched and components were duplicated.
    pos_of = {}
    for pos, one_id in enumerate(images_ht['id']):
        pos_of.setdefault(int(one_id), pos)  # first occurrence wins, like .index()
    visited = set()  # O(1) membership; the old list made this quadratic
    CC = []
    for pos, one_id in enumerate(images_ht['id']):
        node = int(one_id)
        if node in visited:
            continue
        visited.add(node)
        component = [node]
        CC.append(component)
        # Flood-fill from this node over the comma-separated adjacency lists.
        queue = images_ht['edges'][pos].split(',')
        while queue:
            tmp_id = int(queue.pop())
            if tmp_id in visited:
                continue
            visited.add(tmp_id)
            component.append(tmp_id)
            # Neighbor may not be in the list if the initial scan was limited.
            tmp_pos = pos_of.get(tmp_id)
            if tmp_pos is not None:
                queue.extend(images_ht['edges'][tmp_pos].split(','))
    return CC
def clean_htimages(images_ht,max_edges,max_id=None):
    # Clean the near-duplicate graph in place (and return it): drop rows
    # whose id exceeds max_id (assumed corrupted) or that carry more than
    # max_edges edges; prune edges pointing at out-of-range or missing
    # nodes; and try to make the edge relation symmetric (undirected).
    #
    # NOTE(review): rows are popped from images_ht['id']/'edges' while
    # enumerate() is walking them, so the element right after every deleted
    # row is skipped (and later positions shift) — confirm this partial
    # clean is acceptable before relying on it.
    for pos,key in enumerate(images_ht['id']):
        if max_id:
            # Cleaning corrupted rows
            if int(key)>max_id:
                print "Deleting row because id is too big",key
                images_ht['id'].pop(pos)
                images_ht['edges'].pop(pos)
                continue
        edges_list=images_ht['edges'][pos].rsplit(",")
        # Discard too many edges
        if len(edges_list)>max_edges:
            print "Deleting row because too many edges",key
            images_ht['id'].pop(pos)
            images_ht['edges'].pop(pos)
            continue
        mod=0
        # Cleaning corrupted edges
        # NOTE(review): popping from edges_list inside the loop over
        # edges_list skips the edge following each removal — confirm.
        if max_id:
            for one_edge in edges_list:
                if int(one_edge)>max_id:
                    pos_edge=edges_list.index(one_edge)
                    edges_list.pop(pos_edge)
                    mod=mod+1
        # Tricky, make sure graph is indirected and that there is no link with non existing nodes...
        # NOTE(review): .index(one_edge) compares the edge string against
        # whatever type the ids were stored as — a type mismatch makes every
        # edge look like it points to a missing node. Verify against the data.
        for one_edge in edges_list:
            try:
                neigh_pos=images_ht['id'].index(one_edge)
                # Is there the returning edge?
                if one_edge not in neigh_edges_list:
                    # NOTE(review): this appends the neighbor's OWN id to its
                    # edge list (a self-loop); the back-edge to `key` was
                    # presumably intended — confirm before fixing.
                    neigh_edges_list=images_ht['edges'][neigh_pos].rsplit(",")
                    neigh_edges_list.append(one_edge)
                    images_ht['edges'][neigh_pos]=",".join(map(str,neigh_edges_list))
            except: # link is pointing to non existing node
                pos_edge=edges_list.index(one_edge)
                edges_list.pop(pos_edge)
                mod=mod+1
        if mod>0:
            # Persist the pruned edge list for this row.
            images_ht['edges'][pos]=",".join(map(str,edges_list))
    return images_ht
if __name__ == '__main__':
    # End-to-end pipeline: pull the near-duplicate image graph from HBase
    # (or a cached pickle), clean it, compute connected components, and
    # write one CSV row per component.
    max_images=-1         # >0 caps the scan and is embedded in the pickle name
    save_images=False     # NOTE(review): assigned but never read below
    our_batch_size=10000  # HBase scan batch size
    max_edges=100         # rows with more edges than this are discarded
    max_id=89830995       # ids above this are treated as corrupted
    # Small synthetic graph kept around for debugging getCC():
    #images_ht={}
    #images_ht['id']=range(10)
    #images_ht['edges']=[]
    #images_ht['edges'].append('2,3')
    #images_ht['edges'].append('7,8,9')
    #images_ht['edges'].append('0,2')
    #images_ht['edges'].append('2,0')
    #images_ht['edges'].append('5,6')
    #images_ht['edges'].append('4,6')
    #images_ht['edges'].append('4,5')
    #images_ht['edges'].append('9,8,1')
    #images_ht['edges'].append('7,9,1')
    #images_ht['edges'].append('7,8,1')
    try:
        # Prefer a previously scanned graph cached on disk.
        if max_images>0:
            images_ht=pickle.load(open("ht_images"+str(max_images)+".pkl","rb"))
        else:
            images_ht=pickle.load(open("ht_images.pkl","rb"))
        print "Loaded precomputed list of images"
        #ok=True
    # NOTE(review): bare except treats a corrupt pickle the same as a missing
    # one and silently rescans; the open() handles above are left for the GC.
    except:
        if max_images>0:
            print "Getting",str(max_images),"images."
        else:
            print "Getting all images."
        images_ht={}
        images_ht['id']=[]
        images_ht['edges']=[]
        #images_ht['URL']=[]
        # Scan only the near-duplicate column; rows lacking it are skipped.
        for key, data in tab.scan(batch_size=our_batch_size,columns=('meta:columbia_near_dups',)):
            #print key, data
            if 'meta:columbia_near_dups' in data.keys():
                print len(images_ht['id']), key
                #print len(images_ht['id']), key, data['meta:columbia_near_dups'], data['meta:location']
                images_ht['id'].append(key)
                images_ht['edges'].append(data['meta:columbia_near_dups'])
                #images_ht['URL'].append(data['meta:location'])
            #else:
            # print key,"not indexed."
            if len(images_ht['id'])>=max_images and max_images>0:
                print "Reached max images number",str(max_images),"Saving."
                pickle.dump(images_ht,open("ht_images"+str(max_images)+".pkl","wb"),2)
                break
        #pickle.dump(images_ht,open("ht_images.pkl","wb"),2)
    #print images_ht['id'],images_ht['edges']#,images_ht['URL']
    # SHOULD CHECK EDGES REALLY DEFINE AN UNDIRECTED GRAPH...
    images_ht=clean_htimages(images_ht,max_edges,max_id)
    pickle.dump(images_ht,open("ht_images_clean.pkl","wb"),2)
    CC=getCC(images_ht)
    #print CC
    #pickle.dump(CC,open("CC_ht_images"+str(max_images)+".pkl","wb"),2)
    # One CSV row per connected component ('wb' is the Python 2 csv mode).
    with open('CC_ht_images.csv', 'wb') as csvfile:
        CCwriter = csv.writer(csvfile, delimiter=',')
        for oneCC in CC:
            CCwriter.writerow(oneCC)
        csvfile.close() # NOTE(review): redundant — the with block already closes it
#create