-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscikit_train_predict_supervised.py
292 lines (241 loc) · 12.3 KB
/
scikit_train_predict_supervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
##% This file is part of scikit-from-matlab.
##%
##% scikit-from-matlab is free software: you can redistribute it and/or modify
##% it under the terms of the GNU General Public License as published by
##% the Free Software Foundation, either version 3 of the License, or
##% (at your option) any later version.
##%
##% scikit-from-matlab is distributed in the hope that it will be useful,
##% but WITHOUT ANY WARRANTY; without even the implied warranty of
##% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
##% GNU General Public License for more details.
##%
##% You should have received a copy of the GNU General Public License
##% along with scikit-from-matlab. If not, see <https://www.gnu.org/licenses/>.
##
##% Author: Abhishek Jaiantilal ([email protected])
## scikit-from-matlab 0.0.1
## If you run this script directly in Python, it builds numpy arrays (in the same form that MATLAB passes to the script)
## and runs all the algorithms listed below to test them.
## If your favorite algorithm is missing (either missing from scikit-learn or simply not listed below), it is easy to add:
## - Missing from scikit-learn: you need (I think) an estimator that defines fit, score and predict functions; see http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/
## - Missing below: the algorithms are kept in a dict that maps algorithm name -> library to import it from.
##   For example, to add RandomForestRegressor & RandomForestClassifier (already added, used here only as an example), define
##     rf = ['RandomForestRegressor', 'RandomForestClassifier']
##     rf_lib = ['sklearn.ensemble'] #<- the library from which both algorithms can be imported
##   then modify the __return_external_libs__() function and add: external_libs.update( __construct_mapping_algo_to_lib__(rf, rf_lib) )
## To add a new CV search algorithm the same idea applies: just add it to the dict CV_search_algorithms.
try:
import numpy as np
except ImportError:
print('Install Numpy/Scipy (https://scipy.org/install.html) it can be as easy as pip install numpy scipy --user on the command line')
try:
import sklearn
from sklearn.model_selection import cross_validate, GridSearchCV #Additional scklearn functions
from sklearn import datasets
except ImportError:
print('Install scikit-learn (https://scikit-learn.org/stable/install.html) it can be as easy as pip install scikit-learn --user on the command line')
import importlib, warnings, sys, traceback,math
#disable deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Registry of estimator names, grouped by the library that provides them.
# Every family is a list of class names plus a one-element list holding the
# module they are imported from (e.g. the GLM family comes from
# sklearn.linear_model). The classes are imported at run time, so extend the
# lists below if an algorithm you need is missing.

# generalized linear model family
GLM = [
    "ARDRegression",
    "BayesianRidge",
    "ElasticNet",
    "ElasticNetCV",
    "HuberRegressor",
    "Lars",
    "LarsCV",
    "Lasso",
    "LassoCV",
    "LassoLars",
    "LassoLarsCV",
    "LassoLarsIC",
    "LinearRegression",
    "LogisticRegression",
    "LogisticRegressionCV",
    "OrthogonalMatchingPursuit",
    "OrthogonalMatchingPursuitCV",
    "PassiveAggressiveClassifier",
    "PassiveAggressiveRegressor",
    "Perceptron",
    "Ridge",
    "RidgeCV",
    "RidgeClassifier",
    "RidgeClassifierCV",
    "SGDClassifier",
    "SGDRegressor",
    "TheilSenRegressor",
    # left out: RANSACRegressor (was failing when tried)
    # left out: MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV,
    #           MultiTaskLasso (seem not to work with the twonorm dataset)
]
GLM_lib = ["sklearn.linear_model"]

# discriminant analysis family
Discriminant = [
    "LinearDiscriminantAnalysis",
    "QuadraticDiscriminantAnalysis",
]
Discriminant_lib = ["sklearn.discriminant_analysis"]

# ensemble family
Ensemble = [
    "AdaBoostClassifier",
    "AdaBoostRegressor",
    "BaggingClassifier",
    "BaggingRegressor",
    "ExtraTreesClassifier",
    "ExtraTreesRegressor",
    "GradientBoostingClassifier",
    "GradientBoostingRegressor",
    "IsolationForest",
    "RandomForestClassifier",
    "RandomForestRegressor",
    # left out: VotingClassifier, VotingRegressor, RandomTreesEmbedding
    # not found: HistGradientBoostingRegressor, HistGradientBoostingClassifier
]
Ensemble_lib = ["sklearn.ensemble"]

# XGBoost is separate from the scikit-learn base, so it gets its own family
XGboost = ["XGBRegressor", "XGBClassifier"]
XGboost_lib = ["xgboost.sklearn"]

# gaussian process family
Gaussian_processes = [
    "GaussianProcessClassifier",
    "GaussianProcessRegressor",
]
Gaussian_processes_lib = ["sklearn.gaussian_process"]

# kernel ridge family
Kernel_ridge = ["KernelRidge"]
Kernel_ridge_lib = ["sklearn.kernel_ridge"]

# svm family
SVM = [
    "LinearSVC",
    "LinearSVR",
    "NuSVC",
    "NuSVR",
    "SVC",
    "SVR",
]
SVM_lib = ["sklearn.svm"]

# decision tree family
DecisionTrees = [
    "DecisionTreeClassifier",
    "DecisionTreeRegressor",
    "ExtraTreeClassifier",
    "ExtraTreeRegressor",
]
DecisionTrees_lib = ["sklearn.tree"]

# CV search strategies and the library each one is imported from
CV_search_algorithms = {
    "GridSearchCV": "sklearn.model_selection",
    "RandomizedSearchCV": "sklearn.model_selection",
}
def __construct_mapping_algo_to_lib__(algo_list, lib_family):
    """Return a dict that maps each algorithm name to its library.

    ``lib_family`` is repeated ``len(algo_list)`` times and paired
    positionally with ``algo_list``, so a one-element ``lib_family`` assigns
    that single library to every algorithm of the family.
    """
    repeated_libs = lib_family * len(algo_list)
    return {algo: lib for algo, lib in zip(algo_list, repeated_libs)}
def __return_external_libs__():
    """Build the algorithm-name -> library mapping for all known families.

    Each (algorithm list, library list) pair is converted with
    __construct_mapping_algo_to_lib__ and merged, in order, into one dict.
    """
    families = [
        (GLM, GLM_lib),
        (Discriminant, Discriminant_lib),
        (Ensemble, Ensemble_lib),
        (XGboost, XGboost_lib),
        (Gaussian_processes, Gaussian_processes_lib),
        (Kernel_ridge, Kernel_ridge_lib),
        (SVM, SVM_lib),
        (DecisionTrees, DecisionTrees_lib),
    ]
    external_libs = {}
    for algo_names, library in families:
        external_libs.update(__construct_mapping_algo_to_lib__(algo_names, library))
    # if you want to add an existing algorithm from a library, add its family
    # to the list above (or add another external_libs.update(...) call here)
    return external_libs
# Module-level registry (algorithm name -> library to import it from),
# built once when the module is loaded.
external_libs = __return_external_libs__()
def list_of_algorithms():
    """Return the names of all registered algorithms as a new list."""
    return [algo_name for algo_name in external_libs]
def create_algo_object_with_params(algo_name, params):
    '''
    Instantiate the estimator registered under ``algo_name``.

    The library that provides the algorithm is looked up in ``external_libs``
    and imported at run time; the class named ``algo_name`` is then fetched
    from that library and constructed with ``params`` expanded as keyword
    arguments.

    Parameters:
        algo_name: name of a registered algorithm (a key of ``external_libs``).
        params: mapping of keyword arguments passed to the constructor.

    Returns:
        The newly constructed algorithm object.

    Raises:
        ValueError: if ``algo_name`` is not registered in ``external_libs``.
        Exception: anything raised while importing the library or constructing
            the object is re-raised unchanged.

    On any failure the traceback is also written to stdout before the
    exception is re-raised.
    '''
    try:
        if algo_name not in external_libs:
            # Fail with a clear message instead of hitting an unbound local
            # variable at the return statement.
            raise ValueError(
                "Unknown algorithm '%s': it is not registered in external_libs" % (algo_name,)
            )
        algo_module = importlib.import_module(external_libs[algo_name])
        algo_object = getattr(algo_module, algo_name)(**params)
    except Exception:
        sys.stdout.write(__file__ + traceback.format_exc())
        raise
    return algo_object
def create_CV_object_with_params(CV_name, algo_object, CV_params_for_algo, CV_params):
    '''
    Wrap ``algo_object`` in the cross-validation search named ``CV_name``.

    The library that provides the search strategy is looked up in
    ``CV_search_algorithms`` and imported at run time; the class named
    ``CV_name`` is then constructed around ``algo_object``.

    Parameters:
        CV_name: name of a registered search strategy (a key of
            ``CV_search_algorithms``).
        algo_object: the estimator to be tuned.
        CV_params_for_algo: the search space for the estimator's parameters.
        CV_params: mapping of extra keyword arguments for the search object.

    Returns:
        The constructed search object wrapping ``algo_object``.

    Raises:
        ValueError: if ``CV_name`` is not registered in
            ``CV_search_algorithms``.
        Exception: anything raised while importing the library or constructing
            the search object is re-raised unchanged.

    On any failure the traceback is also written to stdout before the
    exception is re-raised.
    '''
    try:
        if CV_name not in CV_search_algorithms:
            # Previously an unknown strategy silently returned the bare
            # estimator, so no cross-validation was performed at all.
            raise ValueError(
                "Unknown CV strategy '%s': it is not registered in CV_search_algorithms" % (CV_name,)
            )
        CV_module = importlib.import_module(CV_search_algorithms[CV_name])
        try:
            # The search space is passed positionally because its keyword
            # differs between strategies (param_grid for GridSearchCV,
            # param_distributions for RandomizedSearchCV).
            cv_object = getattr(CV_module, CV_name)(algo_object, CV_params_for_algo, **CV_params)
        except (TypeError, ValueError):
            warnings.warn('Ensure that the parameter passed as CV parameters are correct')
            raise
    except Exception:
        sys.stdout.write(__file__ + traceback.format_exc())
        raise
    return cv_object
def __reshape_np_array(x):
    """Turn the triple sent by MATLAB back into a numpy array.

    ``x`` is a sequence of (flattened values, dim_1 size, dim_2 size). When
    dim_2 is 1 (labels/targets/y) the values are returned as they are, without
    reshaping; otherwise they are reshaped to (dim_1, dim_2).
    """
    single_column = x[2] == 1
    values = np.array(x[0][:])
    if single_column:
        return values
    return values.reshape(x[1], x[2])
def train(xtrn, ytrn, algo_name, algo_params):
    """Fit the algorithm named ``algo_name`` on the given training data.

    ``xtrn`` and ``ytrn`` are converted to numpy arrays with
    __reshape_np_array, the algorithm object is created from ``algo_name`` and
    ``algo_params``, and its ``fit`` function is called on the data.

    Returns the fitted algorithm object. On any failure the traceback is
    written to stdout and the exception is re-raised.
    """
    try:
        features = __reshape_np_array(xtrn)
        targets = __reshape_np_array(ytrn)
        model = create_algo_object_with_params(algo_name, algo_params)
        model.fit(features, targets)
        return model
    except Exception:
        sys.stdout.write(__file__ + traceback.format_exc())
        raise
def trainCV(xtrn, ytrn, algo_name, algo_params, CV_strategy, CV_params_for_algo, CV_params):
    """Fit the algorithm named ``algo_name`` inside a cross-validation search.

    ``xtrn`` and ``ytrn`` are converted to numpy arrays with
    __reshape_np_array, the algorithm object is created from ``algo_name`` and
    ``algo_params``, wrapped in the search strategy ``CV_strategy``, and the
    resulting object's ``fit`` function is called on the data.

    Parameters:
        xtrn, ytrn: training inputs and targets in the form accepted by
            __reshape_np_array.
        algo_name: name of the algorithm to train.
        algo_params: keyword arguments for the algorithm's constructor.
        CV_strategy: name of the CV search strategy.
        CV_params_for_algo: mapping of parameter name -> candidate values.
            Values that provide ``tolist()`` are converted to plain lists;
            other values are passed through unchanged. The mapping itself is
            not modified.
        CV_params: keyword arguments for the CV search object.

    Returns the fitted CV search object. On any failure the traceback is
    written to stdout and the exception is re-raised.
    """
    try:
        reshaped_Xtrn = __reshape_np_array(xtrn)
        reshaped_Ytrn = __reshape_np_array(ytrn)
        # Build a fresh dict instead of overwriting the caller's entries, and
        # only call tolist() where it exists, so that plain Python lists (or
        # a dict reused from an earlier call) do not raise AttributeError.
        search_space = {}
        for key in CV_params_for_algo:
            candidates = CV_params_for_algo[key]
            if hasattr(candidates, 'tolist'):
                candidates = candidates.tolist()
            search_space[key] = candidates
        algo_object = create_algo_object_with_params(algo_name, algo_params)
        clf = create_CV_object_with_params(CV_strategy, algo_object, search_space, CV_params)
        clf.fit(reshaped_Xtrn, reshaped_Ytrn)
    except Exception:
        sys.stdout.write(__file__ + traceback.format_exc())
        raise
    return clf
def predict(xtst, clf):
    """Return the predictions of ``clf`` for the inputs ``xtst``.

    ``xtst`` is converted to a numpy array with __reshape_np_array and handed
    to ``clf.predict``. On any failure the traceback is written to stdout and
    the exception is re-raised.
    """
    try:
        return clf.predict(__reshape_np_array(xtst))
    except Exception:
        sys.stdout.write(__file__ + traceback.format_exc())
        raise
def TestMe():
    """Run every registered algorithm on the twonorm data and print a report.

    The data is read from ``data/X_twonorm.txt`` and ``data/Y_twonorm.txt``
    and packed into the same [values, dim_1 size, dim_2 size] form that MATLAB
    sends. Each algorithm is first trained with default parameters and then
    once more inside every tested CV search strategy; predictions are made on
    the training data itself and the results are printed sorted by the norm of
    the prediction error scaled by 1/sqrt(number of predictions).
    """

    def reshape_to_mimic_matlab_inputs(data):
        # Pack a numpy array as [values, dim_1 size, dim_2 size].
        data_shape = list(data.shape)
        if len(data_shape) == 1:
            # 1-D data is reported as a single column and passed on unchanged
            data_shape.append(1)
            values = data
        else:
            values = data.reshape(data_shape[0], data_shape[1])
        return [values, data_shape[0], data_shape[1]]

    with open("data/X_twonorm.txt") as f:
        data = np.loadtxt(f)
    with open("data/Y_twonorm.txt") as f:
        label = np.loadtxt(f)

    # reshape to make it the same format as the matlab call
    data_list = reshape_to_mimic_matlab_inputs(data)
    label_list = reshape_to_mimic_matlab_inputs(label)
    list_algorithms = list_of_algorithms()

    # Without CV, just the default parameters
    res = []
    for algo in list_algorithms:
        clf = train(data_list, label_list, algo, dict())
        ypred = predict(data_list, clf)
        res.append(np.linalg.norm((ypred - label)/math.sqrt(len(ypred))))
    print('Testing algorithms with default Parameters')
    print('%30s %s' %('Algorithm','norm diff'))
    for idx in np.argsort(res):
        print('%30s %0.3f' %(list_algorithms[idx], res[idx]))

    # With CV
    res = []
    cv_type = []
    # Name of the algorithm behind each entry of res. Some algorithms are
    # skipped below, so positions in res do not line up with list_algorithms
    # and the names have to be recorded alongside the results.
    cv_algo = []
    print('\n\nTesting algorithms with CV and default Parameters, technically just testing if CV is working correctly')
    print('RandomizedSearchCV requires a param grid so omitting in testing below')
    for CV_type in CV_search_algorithms:
        if CV_type == 'RandomizedSearchCV':
            continue
        for algo in list_algorithms:
            if algo == 'IsolationForest':
                print('IsolationForest was requiring a score so omitting in testing below')
                continue
            clf = trainCV(data_list, label_list, algo, dict(), CV_type, dict(), dict())
            ypred = predict(data_list, clf)
            res.append(np.linalg.norm((ypred - label)/math.sqrt(len(ypred))))
            cv_type.append(CV_type)
            cv_algo.append(algo)
    print('%30s %s %s' %('Algorithm','norm diff', 'CV strategy'))
    for idx in np.argsort(res):
        print('%30s %0.3f %15s' %(cv_algo[idx], res[idx], cv_type[idx]))
# Run the self-test only when this file is executed as a script.
if __name__ == "__main__":
    TestMe()