-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathtest_set.py
executable file
·140 lines (113 loc) · 4.7 KB
/
test_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys, os, argparse
import pandas as pd
########################
### Parse Input Args ###
########################
# The description is built with implicit string concatenation instead of
# backslash continuations so that sentences stay separated by a space and no
# source indentation leaks into the help text.
parser = argparse.ArgumentParser(
    description=(
        'Define a test set that will be held out during feature selection '
        'and model training. For regression models (-type r), test will be '
        'a random X percent or n. For classification models (-type c), test '
        'will be X percent or number from each class'),
    epilog='https://github.com/ShiuLab')

### Input arguments ###
# Required
req_group = parser.add_argument_group(title='REQUIRED INPUT')
req_group.add_argument(
    '-df', required=True,
    help='Feature & class dataframe for ML, (example: example_binary.txt) ')
req_group.add_argument(
    '-type', required=True,
    help='c/r (classification vs. regression)')
# One of -p / -n has to be given; both default to 0, which means "not set".
# argparse %-formats help strings, so a literal percent sign must be
# written as '%%' or printing the help with -h fails.
req_group.add_argument(
    '-p', '-percent', required=False, type=float, default=0,
    help='Percent of instances to hold out (0.1 = 10%%), can also use -n')
req_group.add_argument(
    '-n', '-num', required=False, type=int, default=0,
    help='Number of instances to hold out, can also use -p')

# Optional
inp_group = parser.add_argument_group(title='OPTIONAL INPUT')
inp_group.add_argument(
    '-y_name', default='Class',
    help='Name of column to predict')
inp_group.add_argument(
    '-df2', default='',
    help='Class data (if not in -df). Need to provide -y_name')
inp_group.add_argument(
    '-sep', default='\t',
    help='Deliminator')
inp_group.add_argument(
    '-use', default='all',
    help='List of classes to include in test set')
inp_group.add_argument(
    '-skip', default='',
    help='List of classes to not include in test set (i.e. unknown)')
inp_group.add_argument(
    '-drop_na', default='f',
    help='T/F to drop rows with NAs')
inp_group.add_argument(
    '-save', default='default',
    help='Name of the output file. Default = [df]_test.txt')

a = parser.parse_args()

# Comma-separated -skip value as a list; empty when -skip was not given so
# the name is always defined.
skip = a.skip.split(',') if a.skip != "" else []

# The output file defaults to the input dataframe path plus "_test.txt".
if a.save == 'default':
    a.save = a.df + "_test.txt"
#########################
### Read in dataframe ###
#########################
# The first column of the input file becomes the dataframe index.
df = pd.read_csv(a.df, sep=a.sep, index_col=0)

# If features and class info are in separate files, merge them:
if a.df2 != '':
    # Shape of the feature table before merging. a.df is only the file path
    # (a string), so the shape has to come from the loaded dataframe.
    start_dim = df.shape
    df_class = pd.read_csv(a.df2, sep=a.sep, index_col=0)
    # Inner join on the index: only instances present in both files are kept,
    # with the label column placed first.
    df = pd.concat([df_class[a.y_name], df], axis=1, join='inner')
    print('Merging the feature & class dataframes changed the dimensions from %s to %s (instance, features).'
          % (str(start_dim), str(df.shape)))
# Specify Y column - default = Class
# The label column is renamed to a fixed name: 'Class' for classification
# and 'Y' for regression.
model_type = a.type.lower()
# 'classificaton' (sic) is still accepted because it was the only long
# spelling this script recognised before the typo was fixed.
if model_type in ('c', 'classification', 'classificaton'):
    if a.y_name != 'Class':
        df = df.rename(columns={a.y_name: 'Class'})
elif model_type in ('r', 'regression'):
    if a.y_name != 'Y':
        df = df.rename(columns={a.y_name: 'Y'})
else:
    print('Model type not recognized, define as classification (c) or regression (r)')
    # Exit with a non-zero status so callers can detect the failure.
    sys.exit(1)
# Remove instances whose label is listed in -skip (comma separated).
if a.skip != '':
    # isin() requires a list-like argument; the raw -skip string is rejected
    # by pandas, so split it into individual labels first.
    skip_labels = a.skip.split(',')
    # The label column is named 'Class' for classification and 'Y' for
    # regression; pick whichever is present instead of relying on a bare
    # except to switch between them.
    y_col = 'Class' if 'Class' in df.columns else 'Y'
    # NOTE(review): labels from the command line are strings; if the label
    # column is numeric they will not match - confirm the expected dtype.
    df = df[~df[y_col].isin(skip_labels)]
# Check for Nas
# Rows with missing values are dropped only when -drop_na is t/true;
# otherwise the columns containing NAs are listed and the script stops.
if df.isnull().values.any():
    wants_drop = a.drop_na.lower() in ('t', 'true')
    if not wants_drop:
        print(df.columns[df.isnull().any()].tolist())
        print('There are Na values in your dataframe.\n Impute them or add -drop_na True to remove rows with nas')
        quit()
    start_dim = df.shape
    df = df.dropna(axis=0)
    print('Dropping rows with NA values changed the dimensions from %s to %s.'
          % (str(start_dim), str(df.shape)))
# A hold-out size is mandatory: stop when neither -p nor -n was set.
if a.p == 0.0 and a.n == 0:
    print('Either -p or -n is required!')
    quit()

# Report the hold-out size; -p takes precedence when both are given.
if a.p != 0.0:
    print('Holding out %.1f percent' % (a.p*100))
else:
    print('Holding out %i instances per class' % (a.n))
#######################
### Define test set ###
#######################
def pull_sample(temp, p, n):
    """Draw a random subset of rows from a dataframe.

    Args:
        temp: Object exposing a pandas-style ``sample`` method (a DataFrame
            in this script).
        p: Fraction of rows to draw (e.g. 0.1 for 10%). Used whenever it is
            non-zero, taking precedence over ``n``.
        n: Number of rows to draw; only consulted when ``p`` is zero.

    Returns:
        The result of ``temp.sample`` for the requested fraction or count.

    Raises:
        ValueError: If both ``p`` and ``n`` are zero, i.e. no sample size
            was requested.
    """
    if p != 0.0:
        return temp.sample(frac=p)
    if n != 0:
        return temp.sample(n=n)
    # Fail with a clear message instead of an UnboundLocalError on return.
    raise ValueError('Either p or n must be non-zero to pull a sample')
# Index labels of every instance selected for the held-out test set.
test = []

model_type = a.type.lower()
# 'classificaton' (sic) is still accepted because it was the only long
# spelling this script recognised before the typo was fixed.
if model_type in ('c', 'classification', 'classificaton'):
    if a.use == 'all':
        use_list = df.Class.unique()
    else:
        # NOTE(review): classes given via -use are strings; if the Class
        # column is numeric they will not match any row - confirm dtype.
        use_list = a.use.strip().split(',')
    print('Pulling test set from classes: %s' % str(use_list))

    # Every class is first downsampled (fixed seed) to one less than the
    # smallest class size, so a given -p draws the same count per class.
    min_size = (df.groupby('Class').size()).min() - 1
    for cl in use_list:
        temp = df[df['Class'] == cl].sample(min_size, random_state=42)
        temp_sample = pull_sample(temp, a.p, a.n)
        test.extend(list(temp_sample.index))

elif model_type in ('r', 'regression'):
    # Regression: draw once from the whole dataframe, using the same
    # -p / -n precedence as the per-class draw.
    temp_sample = pull_sample(df, a.p, a.n)
    test.extend(list(temp_sample.index))

else:
    print('Model type not recognized, define as c or r')
    # Exit with a non-zero status so callers can detect the failure.
    sys.exit(1)

print('%i instances in test set' % len(test))

# Write one held-out instance label per line; the context manager closes
# the file even if a write fails.
with open(a.save, 'w') as out:
    out.writelines('%s\n' % ho for ho in test)

print('finished!')