fetch_image.py
import urllib2
from sgmllib import SGMLParser
import os

url_head = 'http://www.chictopia.com'

def log(message, level='INFO'):
    print '[' + level + ']', message
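
# ItemParser handles a single photo page. It collects the thumbnail URLs
# inside the 'subphoto_items' div, the tag text inside 'left px10' divs,
# and the host digit N (imagesN.chictopia.com) from the 'image_wrap' image.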
class ItemParser(SGMLParser):
    start_get_urls = False
    start_get_tags = False
    start_get_num = False
    tags = []
    urls = []
    number = None

    def __init__(self):
        SGMLParser.__init__(self)
        self.urls = []
        self.tags = []

    def convert_url(self, url):
        url = 'http://images' + self.number + url[14:]
        index = url.index('_sm.jpg')
        url = url[:index] + '_400.jpg'
        return url

    def start_div(self, attrs):
        for k, v in attrs:
            if k == 'class' and v == 'subphoto_items':
                self.start_get_urls = True
            if k == 'class' and v == 'left px10':
                self.start_get_tags = True
            if k == 'id' and v == 'image_wrap':
                self.start_get_num = True

    def end_div(self):
        self.start_get_urls = False
        self.start_get_num = False

    def start_img(self, attrs):
        if self.start_get_num:
            for k, v in attrs:
                if k == 'src':
                    self.number = v[13]
                    self.start_get_num = False
        if self.start_get_urls:
            for k, v in attrs:
                if k == 'src':
                    try:
                        self.urls.append(self.convert_url(v))
                    except Exception:
                        log('convert url fail:' + v, 'ERROR')

    def handle_data(self, data):
        if self.start_get_tags:
            self.tags.append(data)
            self.start_get_tags = False

    def clean(self):
        self.urls = []
        self.tags = []
        ItemParser.urls = []
        ItemParser.tags = []
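
# Item represents one photo post: its page URL, the tags scraped from the
# listing page ('tags') and from the photo page itself ('new_tags'), the
# converted 400px image URLs, and the downloaded image bytes.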
class Item:
    def __init__(self):
        self.tags = []
        self.new_tags = []
        self.image_urls = []
        self.url = ''
        self.images = []

    def add_tag(self, tag):
        self.tags.append(tag)

    def set_url(self, url):
        self.url = url

    def show(self):
        print self.url
        print self.tags
        print self.new_tags
        print self.image_urls

    def parse(self):
        try:
            log("start parsing " + self.url)
            content = urllib2.urlopen(self.url).read()
            parser = ItemParser()
            parser.feed(content)
            self.new_tags = parser.tags
            self.image_urls = parser.urls
            parser.clean()
            return True
        except Exception:
            return False

    def get_images(self):
        if len(self.image_urls) <= 1:
            return False
        try:
            for url in self.image_urls:
                log('fetch image ' + url)
                content = urllib2.urlopen(url).read()
                self.images.append(content)
            return True
        except Exception:
            return False

    def clean_images(self):
        self.images = []
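
# Tag hrefs are expected to look like '/<tag>/info...'; keep only the tag
# name between the leading slash and '/info'.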
def convert_tag(tag):
    index = tag.index('/info')
    return tag[1:index]
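
# ItemListParser handles a browse page. Each 'lg_photo photo_hover' div
# starts a new Item; the first <a> inside it carries the photo-page href,
# and <a> tags inside a 'white px10 ellipsis' div carry the item's tags.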
class ItemListParser(SGMLParser):
    start = False
    get_a = False
    to_end = False
    start_get_tags = False
    items = []
    cur_item = None

    def __init__(self):
        SGMLParser.__init__(self)
        self.items = []

    def start_div(self, attrs):
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'lg_photo photo_hover':
                self.start = True
                self.get_a = False
                self.to_end = False
                self.start_get_tags = False
                self.cur_item = Item()
                return
            if v == 'white px10 ellipsis':
                if not self.get_a:
                    return
                self.start_get_tags = True
                return
            if v == 'white px10':
                self.start_get_tags = False
                self.to_end = True

    def start_a(self, attrs):
        if not self.start:
            return
        if not self.get_a:
            for k, v in attrs:
                if k == 'href':
                    self.get_a = True
                    self.cur_item.set_url(url_head + v)
                    self.items.append(self.cur_item)
        if self.start_get_tags:
            for k, v in attrs:
                if k == 'href':
                    try:
                        self.cur_item.add_tag(convert_tag(v))
                    except Exception:
                        log('convert tag fail', 'ERROR')

    def end_div(self):
        if self.to_end:
            self.to_end = False

    def clean_items(self):
        self.items = []
        ItemListParser.items = []
#url1='http://www.chictopia.com/photo/show/1059006-Like+The+Cool+Kids-brown-winners-sandals-black-nina-ricci-bag-dark-brown-firmoo-sunglasses'
#url2='http://images2.chictopia.com/photos/KristaniA/4220878207/black-nina-ricci-bag-dark-brown-firmoo-sunglasses_400.jpg'
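
# Item directories are numbered 1..max_count under save_path; get_new_id
# hands out the next directory number.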
def get_new_id():
    global start_id
    start_id += 1
    return start_id

def complete_dir(dir):
    if dir[-1] != '/':
        dir += '/'
    return dir
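
# Each saved item directory holds url.txt (image URLs), tag.txt (tags from
# the listing page), ntag.txt (tags from the photo page), and the images
# themselves, numbered from 1.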
def write_single_img(dir, filename, data):
    dir = complete_dir(dir)
    try:
        img = open(dir + filename, 'wb')
        img.write(data)
        img.close()
    except Exception:
        log('write image fail:' + dir + filename, 'ERROR')

def write_txt(dir, filename, elements):
    dir = complete_dir(dir)
    try:
        f = open(dir + filename, 'w')
        for i in range(len(elements)):
            f.write(str(i + 1) + ' ' + elements[i] + '\n')
        f.close()
    except Exception:
        log('write txt fail:' + dir + filename, 'ERROR')

def write_images(dir, images):
    for i in range(len(images)):
        write_single_img(dir, str(i + 1), images[i])

def save_item(item):
    global save_path
    id = get_new_id()
    dir = save_path + str(id)
    if not os.path.exists(dir):
        os.makedirs(dir)
    write_txt(dir, 'url.txt', item.image_urls)
    write_txt(dir, 'tag.txt', item.tags)
    write_txt(dir, 'ntag.txt', item.new_tags)
    write_images(dir, item.images)
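
# Save an item only if both the page parse and the image downloads succeed;
# items with at most one image URL are skipped by get_images().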
def parse_and_save_item(item):
    if not item.parse():
        return
    if not item.get_images():
        return
    save_item(item)

def parse_and_save_all_item(items):
    for item in items:
        if already_fetched(item.url):
            log(item.url + ' is already fetched')
            continue
        parse_and_save_item(item)
        record_fetched(item.url)
        item.clean_images()
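
# Processed page URLs are kept in the 'fetched' file, one per line, so the
# crawl can be interrupted and resumed without re-downloading. A URL is
# recorded even when parsing or downloading failed, so it is never retried.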
fetched = set()
url = 'http://www.chictopia.com/browse/people'

def load_fetched_urls():
    global fetched
    try:
        f = open('fetched', 'r')
        while True:
            line = f.readline()
            if not line:
                break
            fetched.add(line.rstrip('\n'))
        f.close()
    except IOError:
        pass

def save_fetched_url(url):
    try:
        f = open('fetched', 'a+')
        f.write(url + '\n')
        f.close()
    except IOError:
        pass

def record_fetched(url):
    global fetched
    save_fetched_url(url)
    fetched.add(url)

def already_fetched(url):
    return url in fetched
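
# Walk the browse pages url/1, url/2, ... up to end_page_index, saving every
# new item, and stop early once more than max_count item directories exist.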
start_id = 0
end_page_index = 50000

def fetch():
    global url
    global start_page_index
    global end_page_index
    global start_id
    global max_count
    for i in range(start_page_index, end_page_index + 1):
        local_url = url + '/' + str(i)
        print '\nprocessing the page ' + str(i)
        print 'url', local_url
        response = urllib2.urlopen(local_url).read()
        parser = ItemListParser()
        parser.feed(response)
        items = parser.items
        parser.clean_items()
        print 'elements number:', len(items), '\n'
        parse_and_save_all_item(items)
        if start_id > max_count:
            log('finished')
            break

def init_start_id():
    global start_id
    global max_count
    global save_path
    for i in range(max_count):
        if not os.path.exists(save_path + str(i + 1)):
            start_id = i
            return
    start_id = max_count + 1

save_path = './images/'
start_page_index = 1
max_count = 20000 + 1000

if __name__ == '__main__':
    init_start_id()
    if start_id >= max_count:
        log('already have ' + str(max_count) + ' directories')
        log('finished')
    else:
        load_fetched_urls()
        fetch()
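
# Usage (Python 2 only; urllib2 and sgmllib were removed in Python 3):
#   $ python fetch_image.py
# The script resumes automatically: init_start_id() finds the first missing
# numbered directory under ./images/, and load_fetched_urls() skips any
# photo-page URL already recorded in ./fetched.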