-
Notifications
You must be signed in to change notification settings - Fork 0
/
brandonscanincremental.py
executable file
·113 lines (107 loc) · 3.27 KB
/
brandonscanincremental.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import urllib2, httplib
import re
import sys
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that tags the followed response with redirect details.

    After the base class has handled a 301/302, the object it returned gets
    two extra attributes:

    * ``chain``  -- list of request URLs, one appended per redirect handled.
      The append happens after the base handler returns, so later hops are
      recorded before earlier ones.
    * ``status`` -- the redirect code, or 777 when a 301 ended on a result
      that was already marked 404 at a different URL (the scan loop in this
      file labels 777 results as 'hidden').
    """

    def _record_hop(self, result, url):
        """Append ``url`` to ``result.chain``, creating the list if needed."""
        if not hasattr(result, 'chain'):
            result.chain = []
        result.chain.append(url)

    def http_error_301(self, req, fp, code, msg, headers):
        """Handle a 301 via the base class, then annotate what it returned.

        Returns the base handler's result with ``chain`` and ``status`` set,
        or ``None`` unchanged when the base handler declined to redirect.
        """
        requested_url = req.get_full_url()
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        print("301")
        if result is None:
            # The base handler returns None when it will not redirect (for
            # example, no Location header); there is nothing to annotate.
            return None
        self._record_hop(result, requested_url)
        # NOTE(review): the nesting of the two else branches below was
        # reconstructed; confirm a result without a status should get `code`.
        if not hasattr(result, 'status'):
            # Plain response that nothing has marked yet.
            result.status = code
        elif result.status != 404:
            result.status = code
        elif requested_url != result.url:
            # Redirected to a different URL that was marked 404.
            result.status = 777
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a 302 via the base class, then annotate what it returned.

        Returns the base handler's result with the request URL appended to
        ``chain`` and ``status`` set to ``code``, or ``None`` unchanged when
        the base handler declined to redirect.
        """
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        print("302")
        if result is None:
            return None
        self._record_hop(result, req.get_full_url())
        result.status = code
        return result
class SmartErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Default error handler that hands itself back as the result.

    The base implementation is not called. The handler instance is reused
    as the returned object, so ``url``, ``chain`` and ``status`` live on the
    handler and ``chain`` keeps growing until a caller resets it.
    """

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Record the failed request on this handler and return the handler.

        Sets ``url`` to the request's full URL, appends that URL to ``chain``
        (created on first use) and stores ``code`` in ``status``.
        """
        failed_url = req.get_full_url()
        if not hasattr(self, 'chain'):
            self.chain = []
        self.url = failed_url
        self.chain.append(failed_url)
        self.status = code
        return self
def _last_recorded_id(path):
    """Return the id of the last record in the data file, or None.

    Records look like ``<id>:{{<text>}} {{<text>}}``. ``None`` is returned
    when the file cannot be opened or read, or when it holds no record.
    """
    try:
        # The with-block closes the handle even if read() fails.
        with open(path, 'r') as fl:
            dat = fl.read()
    except (IOError, OSError):
        return None
    k = re.findall(r"(\d+?):\{\{(.+?)\}\} \{\{(.+?)\}\}", dat)
    if not k:
        return None
    return int(k[-1][0]) + 0


# Resume one past the last recorded id; start from 1 when there is no usable
# data file yet.
_last_id = _last_recorded_id('brandondata.txt')
if _last_id is None:
    fromnum = 1
else:
    fromnum = _last_id + 1
    print("Starting from " + str(fromnum))
# Probe every post id from the resume point up to 11999 and append one record
# per hit to brandondata.txt: "<id>:{{<title or 'hidden'>}} {{<url trail>}}".
opener = urllib2.build_opener(SmartRedirectHandler(), SmartErrorHandler())
outfile = open('brandondata.txt', 'a')
title_pat = re.compile(r'<title>(.+?)</title>', re.MULTILINE | re.I)
try:
    for page_id in range(fromnum, 12000):
        try:
            p = opener.open("http://brandonsanderson.com/?p=" + str(page_id))
        except KeyboardInterrupt:
            # Ctrl-C during a request ends the scan; the finally clause
            # below still closes the output file.
            sys.exit()
        except Exception as err:
            print(str(page_id) + ": Error: " + str(err))
            continue

        # The handlers append one request URL per hop, with the URL requested
        # here appended last. Drop that one and reverse the rest so the trail
        # reads in the order the hops were followed.
        redirchain = []
        if hasattr(p, "chain"):
            if p.chain:
                p.chain.pop()
            while p.chain:
                redirchain.append(p.chain.pop())
            # SmartErrorHandler returns itself as the result, so its chain
            # has to be cleared before the next request.
            p.chain = []
        if not redirchain or redirchain[-1] != p.url:
            redirchain.append(p.url)
        strp = ' :-> '.join(redirchain)

        # Not every result can be read (SmartErrorHandler has no read()).
        try:
            body = p.read()
        except Exception:
            body = None

        # A response that no handler annotated has no `status`; treat it as
        # a miss instead of failing with AttributeError.
        status = getattr(p, 'status', None)
        label = None
        if status in (301, 302) and body is not None:
            found = title_pat.search(body)
            if found:
                label = found.group(1)
        elif status == 777:
            label = 'hidden'

        if label is not None:
            strg = str(page_id) + ':{{' + label + '}} {{' + strp + '}}'
            outfile.write(strg + "\r\n")
            # Flush per record so an interrupted run loses nothing.
            outfile.flush()
            print(strg)
finally:
    outfile.close()