This repository has been archived by the owner on Oct 31, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathokcupid.py
347 lines (296 loc) · 10.1 KB
/
okcupid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/env python
# Scrape OkCupid match-search results (99% matches) and summarize them.
# Python 2 script (urllib2, mechanize, print statements).
# Tag-cloud inspiration:
#http://sujitpal.blogspot.com/2007/04/building-tag-cloud-with-python.html
import pickle
import os
from getpass import getpass
from urllib2 import URLError
from time import sleep, time
from random import choice
from collections import Counter
from itertools import count
from mechanize import Browser
import lxml.html as html
try:
    # Plotting is optional: inform() prints a hint and bails out of the
    # graphing section when matplotlib is unavailable.
    from matplotlib import pyplot as plot
    matplotlib = True
except ImportError:
    matplotlib = False

# Stop scraping after this many consecutive searches that add no new users.
NUM_ZERO_GROWTH_SEARCHES = 15
LOGIN_URL = "http://www.okcupid.com/login"
PROFILE_URL_BASE = "http://www.okcupid.com/profile/"
# Canned match-search query string.  The filterN values were presumably
# captured from the site's own search form -- their exact meanings are not
# documented here; verify against the site before changing them.
SEARCH_URL = ("http://www.okcupid.com/match?"
              "filter1=0,63&"
              "filter2=2,100,18&"
              "filter3=5,31536000&"
              "filter4=1,0&"
              "filter5=35,0&"
              "locid=0&"
              "timekey=1&"
              "matchOrderBy=MATCH&"
              "custom_search=0&"
              "fromWhoOnline=0&"
              "mygender=m&"
              "update_prefs=0&"
              "sort_type=0&"
              "sa=1&"
              )
# Earlier variant of the search query, kept for reference:
# SEARCH_URL = ("http://www.okcupid.com/match"
#               "?filter1=0,34"
#               "&filter2=2,100,18" # age
#               "&filter3=5,2678400"
#               "&filter4=1,1"
#               "&filter5=35,0"
#               "&locid=0"
#               "&timekey=1"
#               "&matchOrderBy=MATCH"
#               "&custom_search=0"
#               "&fromWhoOnline=0"
#               "&mygender=m"
#               "&update_prefs=0"
#               "&sort_type=0"
#               "&sa=1")
class User(object):
    """Lightweight record for one user from a search-results page.

    Identity is the username: two User objects with the same username
    compare equal and hash identically, so they de-duplicate in dicts
    and sets.
    """

    def __init__(self, username, percents, age, img_url, sexuality, status,
                 location, reply_rate):
        self.username = username
        # percents is a (match, friend, enemy) triple of ints.
        self.match, self.friend, self.enemy = percents
        self.age = age
        self.img = img_url
        self.orientation = sexuality
        self.status = status
        # Keep only the last two comma-separated parts, e.g.
        # "Neighborhood, City, State" -> city="City", state="State".
        self.city, self.state = location.split(", ")[-2:]
        self.reply_rate = reply_rate

    def __eq__(self, other):
        # Bug fix: __hash__ existed without __eq__, so equal-username
        # users still compared by identity and never de-duplicated.
        return isinstance(other, User) and self.username == other.username

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.username)
def merge(lists):
    ''' function to merge many sorted lists with unique elements

    Each input list is assumed to be sorted (here: by match quality) and
    to contain unique elements; lists may share elements.  The result
    starts as a copy of the first list and elements from later lists are
    spliced in so that relative orderings seen in each list are respected.

    NOTE(review): the algorithm mutates ``lists[0]`` in place and relies on
    repeated ``list.index`` scans, so it is O(n^2)-ish -- acceptable for the
    small username lists this script produces.

    used with permission from Hardy Jones III
    released under an Apache Version 2.0 license
    '''
    # Get the first sorted list to work with.
    result, rest = lists[0], lists[1:]
    # Iterate each of the rest of the sorted lists.
    for each_list in rest:
        # We'll need the index and item in the current list.
        for i, element in enumerate(each_list):
            # If we've already seen the element,
            # we need to check some stuff.
            if element in result:
                pos = result.index(element)
                # First make sure we're checking within bounds.
                # If the previous element is already before
                # the present element, then as far as we know,
                # the elements in the result are sorted.
                if i > 0 and result.index(each_list[i-1]) < pos:
                    continue
                # Otherwise, we need to make sure everything we've
                # entered so far is in the sorted position.
                else:
                    for each in each_list[:i]:
                        # If an element isn't sorted, then sort it.
                        if result.index(each) > pos:
                            result.remove(each)
                            result.insert(pos, each)
            # If we made it here, then we've never seen this element.
            # Just throw it on the back.
            else:
                result.append(element)
    return result
def login(br, username, password):
    """Log *br* (a mechanize Browser) into the site.

    Opens the login page, fills the first form's username/password
    fields, and submits it.
    """
    br.open(LOGIN_URL)
    br.select_form(nr=0)
    for field_name, field_value in (("username", username),
                                    ("password", password)):
        br[field_name] = field_value
    br.submit()
def do_search(br):
    """Run one match search and return (users, usernames).

    ``users`` maps the raw (unicode) username to its User object;
    ``usernames`` is the UTF-8-encoded usernames in result order.
    Only users with a match percentage of 99 or more are kept -- results
    come back sorted by match, so the first sub-99 entry stops the scan.
    """
    # Retry the request until it succeeds; URLError here is treated as a
    # transient network failure.  NOTE(review): no backoff/limit -- this
    # loop spins forever if the site is unreachable.
    while True:
        try:
            resp = br.open(SEARCH_URL)
            break
        except URLError:
            pass
    page = html.parse(resp)
    match_results = page.getroot().get_element_by_id("match_results")
    users = {}
    usernames = []
    # The unpacking below mirrors the page's DOM layout at the time this
    # was written (result row -> info/percentages/actions, etc.);
    # a markup change on the site will raise ValueError here.
    for _wrap, percentages, actions in match_results:
        _user_info, essay = _wrap
        user_image, match_row_screenname, aso, location, activity = _user_info
        # Percentages render as e.g. "99% Match" -- keep the integer part.
        percents = [int(percent.text_content().split('%')[0])
                    for percent in percentages]
        # Results are sorted by match, so stop at the first sub-99 user.
        if percents[0] < 99: break
        username = match_row_screenname[0][0].text_content()
        # "25 / M / Straight / Single" -> every other token is a value.
        age, sex, sexuality, status = (aso_element.encode("utf-8")
            for aso_element in aso.text_content().split()[::2])
        # Reply rate lives in a nested element when present; fall back to
        # the container's own text otherwise.
        try:
            reply_rate = activity[0][1].text_content()
        except IndexError:
            reply_rate = activity.text_content()
        user = User(username.encode("utf-8"), percents,
                    int(age), user_image[0].get("src"), sexuality, status,
                    location.text_content().encode("utf-8"), reply_rate
                    )
        users[username] = user
        usernames.append(user.username)
    return users, usernames
def get_data(br):
if os.path.isfile("okcupid.pickle"):
with open("okcupid.pickle", "rb") as okc_file:
all_users = pickle.load(okc_file)
username_lists = pickle.load(okc_file)
return all_users, username_lists
all_users = {}
username_lists = []
num_users = 0
for search in count(1):
users, usernames = do_search(br)
username_lists.append(usernames)
all_users.update(users)
growth = len(all_users) - num_users
print "Growth:", growth
print "99%ers so far:", len(all_users)
if growth == 0:
give_up_yet -= 1 # not init'd; but will be
print "Remaining 0 growth searches:", give_up_yet
if give_up_yet == 0:
print "stopped after", search, "searches"
break
else:
give_up_yet = NUM_ZERO_GROWTH_SEARCHES
num_users = len(all_users)
sleep(choice(range(7, 14))) # to look less suspicious
try:
with open("okcupid.pickle", "wb") as okc_file:
pickle.dump(all_users, okc_file)
pickle.dump(username_lists, okc_file)
except TypeError:
print "It looks like pickling failed because of a TypeError"
return all_users, username_lists
def inform(all_users, username_lists):
    """Print a text summary of the scraped users and show charts.

    Returns False early (after printing a hint) when matplotlib is not
    installed; otherwise displays a series of pie/bar charts one at a
    time via plot.show().
    """
    # Group each user's city under its state.
    states = {}
    for city, state in ((u.city, u.state) for u in all_users.values()):
        if state in states:
            states[state].append(city)
        else:
            states[state] = [city]
    # Sort states by how many users live there (ascending).
    key = lambda x: len(x[1])
    for state, cities in sorted(states.iteritems(), key=key):
        print state, "({}, {:.2f}%)".format(
            len(cities), 100 * float(len(cities)) / len(all_users))
        # Cities within a state, most populous first.  NOTE(review): the
        # loop variable "count" shadows itertools.count inside this
        # function -- harmless here, but rename if count() is ever needed.
        for city, count in sorted(Counter(cities).iteritems(),
                                  key=lambda x: x[1], reverse=True):
            print " ", city,
            print "({})".format(count) if count > 1 else ""
    print
    print "Top 10 users (out of {}):".format(len(all_users))
    # merge() combines the per-search orderings into one ranking.
    for place, username in enumerate(merge(username_lists)[:10], 1):
        print "{:2}) {}".format(place, username)
    if not matplotlib:
        print "Install matplotlib for some pretty graphs"
        return False
    # Orientation breakdown.
    orientation = Counter(user.orientation for user in all_users.values())
    plot.title("Orientation")
    labels, fracs = zip(*orientation.iteritems())
    plot.pie(fracs, labels=labels, shadow=True, autopct='%1.1f%%')
    plot.show()
    # Relationship status, with the "Single" wedge pulled out slightly.
    status = Counter(user.status for user in all_users.values())
    plot.title("Relationship Status")
    labels, fracs = zip(*status.iteritems())
    explode = [0.05 if label == 'Single' else 0 for label in labels]
    plot.pie(fracs, explode, labels, shadow=True, autopct='%1.1f%%')
    plot.show()
    # Reply-rate breakdown; labels match the site's known reply-rate strings.
    reply_rates = Counter(user.reply_rate for user in all_users.values())
    plot.title("Replies...")
    labels = ("Go for it.", "often", "selectively", "very selectively")
    fracs = [reply_rates[label] for label in labels]
    plot.pie(fracs, labels=labels, shadow=True, autopct='%1.1f%%')
    plot.show()
    # Histograms of friend %, enemy %, and age.
    plot.bar(*zip(
        *Counter(user.friend for user in all_users.values()).items())
    )
    plot.title("Friend percents")
    plot.xlabel("Percent")
    plot.ylabel("Number of People")
    plot.show()
    plot.bar(*zip(
        *Counter(user.enemy for user in all_users.values()).items())
    )
    plot.title("Enemy percents")
    plot.xlabel("Percent")
    plot.ylabel("Number of People")
    plot.show()
    plot.bar(*zip(
        *Counter(user.age for user in all_users.values()).items())
    )
    plot.xlabel("Years")
    plot.ylabel("Number of People")
    plot.title("Age")
    plot.show()
def scrape_user(br, username):
    """Fetch one user's profile page and extract its sections.

    Returns a dict mapping "essay_text_0" .. "essay_text_9" to the lxml
    element for that essay, or None when the user left that essay blank.
    Raises ValueError if the profile-details table no longer unpacks into
    the expected 17 fields (i.e. the site's markup changed).
    """
    resp = br.open(PROFILE_URL_BASE + username)
    page = html.parse(resp).getroot()
    # Positional unpacking of the details table; doubles as a layout check.
    last_online, ethnicity, height, body_type, diet, looking_for, smokes, \
        drinks, drugs, religion, sign, education, job, income, offspring, \
        pets, speaks = page.get_element_by_id("profile_details")
    essays = {}
    for essay_num in xrange(10):
        key = "essay_text_{}".format(essay_num)
        try:
            # Bug fix: the old code referenced the undefined name
            # "self_summary" inside this try, so any present essay raised
            # NameError (masked as a generic failure, not KeyError).
            essays[key] = page.get_element_by_id(key)
        except KeyError:
            # No element with this id: the user left the essay empty.
            essays[key] = None
    # The original version unpacked the essays into ten named variables
    # and then did nothing with them (an if/else pyramid of "pass"
    # branches, plus an inverted None check that would have crashed);
    # that dead scaffolding is removed and the dict is returned instead.
    return essays
if __name__ == "__main__":
    # Interactive entry point: prompt for credentials, log in, then run
    # searches until growth stalls, reporting how long the scrape took.
    username = raw_input("Enter username: ")
    password = getpass()
    br = Browser()
    login(br, username, password)
    start = time()
    all_users, username_lists = get_data(br)
    # Elapsed time as minutes:seconds (both floats from divmod).
    print "{}:{:2}".format(*divmod(time() - start, 60))
    #sorted_usernames = merge(username_lists)
    #inform(all_users, username_lists)