-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata.py
39 lines (32 loc) · 1.41 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_extraction import text
# Extra tokens to ignore on top of scikit-learn's built-in English stop words.
# NOTE(review): these look like mailing-list boilerplate and HTML/MIME markup
# residue -- confirm against the actual email corpus.
# TfidfVectorizer documents ``stop_words`` as 'english', a list, or None, so
# the combined frozenset is converted to a (sorted, hence deterministic) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union([
    "carpediem", "Carpediem", "olin", "lists", "students", "listinfo", "edu",
    "3d", "font", "style", "mso", "xmlns", "skipped", "multipart", "alternative",
    "content", "type", "bounces", "sent", "mailto", "subject", "fw",
    "_______________________________________________",
]))

# Connection string comes from the MONGODB_URI environment variable
# (empty string when it is unset).
CLIENT = MongoClient(os.environ.get('MONGODB_URI', ''))
EMAIL_COLLECTION = CLIENT.futureboard.emails
# Cursor over at most 30000 email documents.
EMAILS = EMAIL_COLLECTION.find().limit(30000)

# Subject line of every email that has one. The original filter
# (``if not False``) was always true, so emails without a subject contributed
# a literal ``False`` to the corpus; missing/empty subjects are now skipped.
DOCUMENTS = [
    subject
    for subject in (email.get("subject") for email in EMAILS)
    if subject
]

# TF-IDF features over the subjects; terms appearing in fewer than 10
# documents are dropped (min_df=10).
vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=10)
X = vectorizer.fit_transform(DOCUMENTS)
def test_k(k_val):
    """Cluster the module-level matrix ``X`` into ``k_val`` groups and print
    the five highest-weighted terms of every cluster centroid.

    Args:
        k_val: number of clusters to ask KMeans for.

    Returns:
        None. Results are written to standard output.
    """
    model = KMeans(n_clusters=k_val, init='k-means++', max_iter=1000, n_init=1)
    model.fit(X)
    print("\n")
    print("Top terms per cluster (" + str(k_val) + " clusters!):")
    # For each centroid, feature indices ordered from largest to smallest weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    terms = get_names()
    for i in range(k_val):
        top_terms = " ".join(terms[ind] for ind in order_centroids[i, :5])
        # Single print() call: same output as the old Python 2
        # ``print ...,`` / ``print ...`` pair, but valid on Python 3 as well.
        print("Cluster %d: %s" % (i, top_terms))
if __name__ == "__main__":
    # Run the clustering report for every cluster count from 5 through 14.
    # A plain loop is used because test_k is called only for its printed
    # output; a list comprehension would just build a throwaway list of None.
    for k in range(5, 15):
        test_k(k)