forked from m-cahana/rapcast
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcitydata.py
55 lines (43 loc) · 2.06 KB
/
citydata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
# Load the artist metadata table (needs at least 'artist' and 'city' columns)
# and the raw lyrics, a JSON object keyed by artist name.
rappers = pd.read_csv('./input/rappers.csv')
with open('./input/raw_lyrics.json') as f:
    raw = json.load(f)

# Collapse each artist's sequence of lyric strings into one space-separated
# document so that every artist contributes exactly one corpus.
raw = {artist: " ".join(lyrics) for artist, lyrics in raw.items()}

# One row per artist: the artist name and the combined corpus.
df = pd.DataFrame.from_dict(raw, orient='index')
df = df.reset_index()
df.columns = ['artist', 'corpus']

# The original filter used `rapper_list`, a name that is never defined in this
# file (NameError at runtime). Keep only artists that appear in the metadata
# table, which is the set the merge below can attach a city to.
# NOTE(review): confirm that "artists listed in rappers.csv" is the intended
# meaning of the missing `rapper_list`.
df = df[df['artist'].isin(rappers['artist'])]
df = df.merge(rappers[['artist', 'city']], how='left', on='artist')

# Encode each distinct city as an integer id and build lookups both ways.
df['city_id'] = df['city'].factorize()[0]
region_id_df = df[['city', 'city_id']].drop_duplicates().sort_values('city_id')
region_to_id = dict(region_id_df.values)
id_to_region = dict(region_id_df[['city_id', 'city']].values)
# Build TF-IDF features over unigrams and bigrams of each artist's corpus.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.corpus).toarray()
labels = df.city_id

# Number of top unigrams / bigrams to print per city.
N = 2

# The vocabulary does not change between cities, so fetch it once.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

# For each city, run a one-vs-rest chi-squared test and report the terms
# with the highest statistic.
for region, region_id in sorted(region_to_id.items()):
    features_chi2 = chi2(features, labels == region_id)
    # argsort is ascending, so the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(region))
    print(" . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
# Hold out part of the data; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['corpus'],
    df['city'],
    random_state=0,
)

# Turn the training corpora into raw term counts ...
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# ... then re-weight those counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Fit a multinomial Naive Bayes classifier that predicts the city name.
# `fit` returns the estimator itself, so `clf` is the fitted model.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)