-
Notifications
You must be signed in to change notification settings - Fork 2
/
1024.py
148 lines (116 loc) · 3.92 KB
/
1024.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/bin/env python
#encoding=utf-8
# Author: Aaron
# Last modified: 2014-10-22 23:42
# Filename: 1024.py
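# Description: Download the JPEG images linked from a web page and bundle
#              them into a PDF. Usage: 1024.py <url> <output name>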
import sys
import time
import socks
import socket
import urllib2
import urllib
import re
import os
import threading
# Port of the SOCKS5 proxy on 127.0.0.1 that is configured when USING_PROXY
# is enabled.
SOCKS_PORT = 1080
# When True, get_html() and down_load_single_image() install the SOCKS-aware
# socket class before opening connections; when False they leave the socket
# module untouched.
USING_PROXY = False
def get_url():
    """Return the page URL passed as the first command-line argument.

    The script requires exactly two arguments (the URL and the output name).
    With any other argument count an error line is written to stderr and the
    process exits with status -1.
    """
    if len(sys.argv) == 3:
        return sys.argv[1]
    usage_error = "need a url and output name"
    sys.stderr.write(usage_error + "\n")
    sys.exit(-1)
def get_outfname():
    """Return the output name passed as the second command-line argument.

    Exactly two arguments (URL, output name) must be present; otherwise an
    error line goes to stderr and the process exits with status -1.
    """
    argument_count = len(sys.argv) - 1
    if argument_count != 2:
        complaint = "need a url and output name"
        sys.stderr.write(complaint + "\n")
        sys.exit(-1)
    output_name = sys.argv[2]
    return output_name
def get_html(url):
    """Fetch *url* and return the raw response body.

    When the module-level USING_PROXY flag is set, the default SOCKS5 proxy
    is pointed at 127.0.0.1:SOCKS_PORT and ``socket.socket`` is replaced with
    the SOCKS-aware socket class. That replacement is process-wide, so it
    also affects every connection opened afterwards.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        Nothing is caught here: any error raised while opening or reading
        the URL propagates to the caller.
    """
    if USING_PROXY:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
        socket.socket = socks.socksocket
    # Send an explicit User-Agent header instead of the library default.
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails part-way.
        response.close()
def _unwrap_viidii_redirect(url):
    """Return the target of a www.viidii.info redirect link, else *url* itself."""
    if 'www.viidii.info' not in url:
        return url
    decoded = urllib.unquote_plus(url)
    marker = decoded.find('url=')
    if marker == -1:
        # No url= parameter to extract: keep the link as found rather than
        # slicing from an arbitrary offset.
        return url
    # An empty target falls back to the redirect link itself.
    return decoded[marker + len('url='):] or url


def _base_domain(url):
    """Return the last two dot-separated labels of the host part of *url*."""
    # Tolerates a missing scheme, which can happen after unwrapping a redirect.
    host = url.split('://', 1)[-1].split('/', 1)[0]
    return '.'.join(host.split('.')[-2:])


def find_image_urls(html):
    """Collect the JPEG links in *html* that live on the most common domain.

    Every http(s) URL ending in .jpg/.jpeg is extracted, www.viidii.info
    redirect links are replaced by their ``url=`` target, and only the URLs
    containing the most frequent base domain (last two host labels) are
    kept. The surviving URLs are written to ``download_list.txt`` in the
    current directory, one per line, replacing any previous content.

    Args:
        html: Page source to scan.

    Returns:
        The list of kept image URLs, in page order. When the page contains
        no image links a message is written to stderr, an empty list file is
        written and an empty list is returned.
    """
    urls = re.findall(r'https?://\S+\.jpe?g', html)
    if not urls:
        sys.stderr.write("Can not find image urls\n")
    urls = [_unwrap_viidii_redirect(url) for url in urls]

    if urls:
        domain_count = {}
        for url in urls:
            domain = _base_domain(url)
            domain_count[domain] = domain_count.get(domain, 0) + 1
        top_domain = max(domain_count, key=domain_count.get)
        urls = [url for url in urls if top_domain in url]

    # The with-block guarantees the list is flushed and closed before any
    # downloader reads it.
    with open('download_list.txt', 'w') as out_f:
        out_f.writelines(url + '\n' for url in urls)
    return urls
def download_images_wget(image_urls):
    """Download the URLs listed in download_list.txt into a fresh tmp/ using wget.

    *image_urls* is accepted but not read; wget takes its input from
    ../download_list.txt (relative to tmp/). Any existing tmp/ directory is
    removed first.
    """
    shell_steps = (
        'rm -rf tmp; mkdir tmp',
        "cd tmp; wget -i ../download_list.txt; cd -",
    )
    for step in shell_steps:
        os.system(step)
def download_images(image_urls, max_workers=11):
    """Download every URL in *image_urls* into a freshly created tmp/ directory.

    Any existing tmp/ is removed first. One thread per image runs
    down_load_single_image(url, index), with at most *max_workers* of them
    alive at the same time. The call returns once all of them have finished.

    Args:
        image_urls: Sequence of image URLs; the position in the sequence is
            passed to the worker as the image index.
        max_workers: Upper bound on concurrently running download threads
            (values below 1 are treated as 1).
    """
    max_workers = max(1, max_workers)
    total_images_count = len(image_urls)
    os.system('rm -rf tmp; mkdir tmp')
    sys.stderr.write("Downloading %d images ...\n" % total_images_count)

    workers = [
        threading.Thread(target=down_load_single_image, args=(image_url, index))
        for index, image_url in enumerate(image_urls)
    ]

    # Throttle on this function's own threads only, so unrelated threads in
    # the process can neither block the start of a download nor keep the
    # final wait spinning.
    running = []
    for worker in workers:
        running = [t for t in running if t.is_alive()]
        while len(running) >= max_workers:
            time.sleep(1)
            running = [t for t in running if t.is_alive()]
        worker.start()
        running.append(worker)

    for worker in workers:
        # join() with a timeout keeps the main thread responsive to Ctrl-C.
        while worker.is_alive():
            worker.join(1)
class AppURLopener(urllib.FancyURLopener):
    """FancyURLopener subclass that overrides the opener's ``version`` string.

    down_load_single_image() installs an instance of this class as
    ``urllib._urlopener`` before retrieving each image.
    """

    version = "Mozilla/5.0"
def down_load_single_image(image_url, index):
    """Retrieve *image_url* into tmp/<index+1>.jpg, trying up to three times.

    Progress, errors and the final outcome are reported on stderr. When
    USING_PROXY is set, each attempt first points the default SOCKS5 proxy
    at 127.0.0.1:SOCKS_PORT, sets the default socket timeout to 20 seconds
    and swaps in the SOCKS-aware socket class.

    Args:
        image_url: Address of the image to fetch.
        index: Zero-based position of the image; used in log messages and,
            plus one, as the output file name.
    """
    attempts_allowed = 3
    for attempt in range(1, attempts_allowed + 1):
        sys.stderr.write(
            "Start downloading image %d attempt %d, url: %s" % (index, attempt, image_url)
            + "\n"
        )
        if USING_PROXY:
            socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
            socket.setdefaulttimeout(20)
            socket.socket = socks.socksocket
        try:
            # Install the custom opener so urlretrieve uses its version string.
            urllib._urlopener = AppURLopener()
            urllib.urlretrieve(image_url, 'tmp/%d.jpg' % (index + 1))
        except Exception as error:
            # Report the problem and move on to the next attempt.
            sys.stderr.write(str(error) + "\n")
        else:
            sys.stderr.write(
                "Finish downloading image %d on attempt %d, url: %s" % (index, attempt, image_url)
                + "\n"
            )
            return
    # Only reached when every attempt raised.
    sys.stderr.write("Failed downloading image %d, url: %s" % (index, image_url) + "\n")
def _numbered_jpgs(directory):
    """Return the .jpg paths in *directory*, numeric file names in numeric order."""
    if not os.path.isdir(directory):
        return []

    def page_order(name):
        stem = name[:-len('.jpg')]
        try:
            # 1.jpg, 2.jpg, ..., 10.jpg: order by value, not by text.
            return (0, int(stem), name)
        except ValueError:
            # Non-numeric names go after the numbered ones, alphabetically.
            return (1, 0, name)

    names = [name for name in os.listdir(directory)
             if name.endswith('.jpg') and not name.startswith('.')]
    return [os.path.join(directory, name)
            for name in sorted(names, key=page_order)]


def create_pdf(fname):
    """Combine the images in tmp/ into ``<fname>.pdf`` by running ./convert.

    Images are passed to ./convert in numeric file-name order (1.jpg, 2.jpg,
    ..., 10.jpg) so the pages follow the download order. The command is run
    without a shell: *fname* is passed through literally (after ``~``
    expansion), so spaces or shell metacharacters in it are safe.

    Args:
        fname: Output path without the ``.pdf`` extension.

    Returns:
        None. Problems (no images, ./convert missing or failing) are
        reported on stderr rather than raised.
    """
    import subprocess  # local import: not among the file's top-level imports

    pages = _numbered_jpgs('tmp')
    if not pages:
        sys.stderr.write("No images found in tmp/, skipping PDF creation\n")
        return
    command = ['./convert'] + pages + ["%s.pdf" % os.path.expanduser(fname)]
    try:
        status = subprocess.call(command)
    except OSError as error:
        sys.stderr.write("Can not run ./convert: %s\n" % (error,))
        return
    if status != 0:
        sys.stderr.write("./convert exited with status %d\n" % status)
if __name__ == '__main__':
    # Pipeline: fetch the page named on the command line, collect its image
    # links, download them into tmp/, then bundle them into <output name>.pdf.
    page_source = get_html(get_url())
    download_images(find_image_urls(page_source))
    create_pdf(get_outfname())
    sys.stdout.write("Thanks to Aaron, good person one life flat safe, 1024 Amen!" + "\n")