-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocessing.py
69 lines (57 loc) · 2.31 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF
def preprocess(filename, col_name, data_frac, random_sample, num_max_feature, print_enabled = False):
    '''
    Preprocess a text column of a csv file for nonnegative matrix factorization.

    The selected rows of the column are turned into a tf-idf weighted
    term-document matrix, restricted to the most frequent terms and with
    English stop words removed.

    Parameters:
        filename: str
            - path of the csv file to read
        col_name: str
            - name of the text column to vectorize
        data_frac: float
            - float in [0, 1], specifying the fraction of data we will be using
        random_sample: boolean
            - if true, draw the rows at random (fixed seed 42, so the draw is
              reproducible); if false, use the leading data_frac of the rows
              in file order
        num_max_feature: int
            - integer specifying the maximum number of extracted features
        print_enabled: boolean
            - if true, output print statements

    Returns:
        A: ndarray
            - (# of features) by (# of datapoints) ndarray that will be used for NMF
        features: list
            - list with length of (# of features) containing strings of extracted features

    Raises:
        ValueError: if data_frac is outside [0, 1].
        KeyError: if col_name is not a column of the csv file.
    '''
    if not 0 <= data_frac <= 1:
        raise ValueError('data_frac must be in [0, 1], got ' + repr(data_frac))

    separator = '---------------------------------------------------------------------'

    # Load the dataset and keep only the requested fraction of its rows.
    df = pd.read_csv(filename)
    if random_sample:
        df = df.sample(frac=data_frac, random_state=42)
    else:
        # Non-random selection: take the first rows deterministically, using
        # the same rounding that DataFrame.sample applies to ``frac``.
        num_rows = int(round(data_frac * len(df)))
        df = df.iloc[:num_rows]

    if print_enabled:
        print(f'Prerpocessing {col_name} column of {filename} file...')
        print(separator)
        print(f'First few lines of {filename}\n')
        print(df.head())
        print('')

    # Missing cells are read as NaN floats, which CountVectorizer cannot
    # tokenize; treat them as empty documents so that the columns of A stay
    # aligned with the selected rows.
    documents = df[col_name].fillna('').astype(str)

    # Term counts -> tf-idf weights, transposed so that rows are features and
    # columns are datapoints.
    vectorizer = CountVectorizer(max_features=num_max_feature, stop_words='english')
    count_mat = vectorizer.fit_transform(documents)
    transformer = TfidfTransformer(smooth_idf=False)
    A = transformer.fit_transform(count_mat).toarray().T

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out().tolist()
    else:
        features = list(vectorizer.get_feature_names())

    if print_enabled:
        # Print out results.
        print(separator)
        print(f'{A.shape[0]} by {A.shape[1]} matrix representing {col_name} column of {filename} file\n')
        print(A)
        print('')
        print(separator)
        print(f'{len(features)} features of {col_name} column of {filename} file\n')
        print(features)
        print('')

    return A, features