# -*- coding: utf-8 -*-
"""
@author : Zhaoqing Liu
@email : [email protected]
"""
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
"""
Functions to include in this module are for preprocessing data:
1. Transform numerical variables to fuzzy degree of membership.
2. Process missing values.
3. Process outliers.
4. Transform descriptive variables to numerical variables.
5. Partitioning training sets and testing sets.
6. Normalising data.
"""
# =============================================================================
# Fuzzy-related functions
# =============================================================================
def degree_of_membership_build(X_df, r_seed, n_conv, fuzzy_reg):
    """
    Build the degree-of-membership set of a feature, mapping the feature
    onto the specified number of fuzzy sets. This is the process of
    transforming a crisp set into a fuzzy set.
    @author : Anjin Liu
    @email : [email protected]
    Parameters
    ----------
    X_df : DataFrame
        Values of one feature of the training input samples.
        NB: All features must be normalised by feature scaling.
    r_seed : int
        The random seed.
    n_conv : int
        The number of fuzzy sets to partition the feature into, i.e. the
        number of k-means clusters.
    fuzzy_reg : float
        The regularisation coefficient of the membership degree; only used
        by the commented-out alternative below that supports tuning `theta`.
    Returns
    -------
    x_new : array-like of shape (n_samples, n_fuzzy_sets)
        Transformed degree-of-membership set.
    centroids : array-like of shape (n_fuzzy_sets, 1)
        The k-means cluster centres, one per fuzzy set.
    degree_of_membership_theta : array-like of shape (n_fuzzy_sets,)
        The distance threshold of each fuzzy set, i.e. the distance from
        each centroid to its nearest neighbouring centroid.
    """
    x_np = X_df.values
    x_np = x_np.reshape(-1, 1)
    # Cluster the feature values; each cluster centre becomes the core of one fuzzy set.
    kmeans = KMeans(n_clusters=n_conv, random_state=r_seed).fit(x_np)
    x_new = kmeans.transform(x_np)  # Distance from each sample to each cluster centre.
    centroids = kmeans.cluster_centers_
    # Get the degree-of-membership distance threshold theta, DOM_theta:
    # the distance from each centroid to its nearest neighbouring centroid.
    centroids_pair_dist = pairwise_distances(centroids)
    # Mask the zero diagonal so it cannot be picked as the minimum.
    centroids_pair_dist[centroids_pair_dist == 0] = 9999
    degree_of_membership_theta = centroids_pair_dist.min(axis=1)
    # Convert distances to degrees of membership, clipped to [0, 1].
    for idx, item in enumerate(degree_of_membership_theta):
        x_new[:, idx] = 1 - x_new[:, idx] / item
    x_new[x_new < 0] = 0
    # An alternative to the above three lines of code, which supports tuning
    # `theta` through `fuzzy_reg`:
    # theta_f = np.log(fuzzy_reg) - np.log(1 - fuzzy_reg)
    # for idx, item in enumerate(degree_of_membership_theta):
    #     if fuzzy_reg == 0 or fuzzy_reg == 0.0 or fuzzy_reg == 1 or fuzzy_reg == 1.0:
    #         x_new[:, idx] = 1 - x_new[:, idx] / item
    #     else:
    #         x_new[:, idx] = 1 - x_new[:, idx] / item * theta_f
    # x_new[x_new < 0] = 0
    return x_new, centroids, degree_of_membership_theta
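
# A minimal usage sketch (illustrative, not part of the module): the feature
# must already be scaled, as the docstring requires, and with n_conv=3 the
# result carries one membership column per fuzzy set:
#     x = pd.DataFrame(np.random.RandomState(0).rand(100))
#     dms, centres, theta = degree_of_membership_build(x, r_seed=0, n_conv=3, fuzzy_reg=0.0)
#     # dms.shape == (100, 3); centres.shape == (3, 1); theta.shape == (3,)
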
def extract_fuzzy_features(X, n_conv=5, fuzzy_reg=0.0):
    """
    Extract fuzzy features in feature fuzzification to generate the
    degree-of-membership sets of each feature.
    Attention
    ---------
    Feature fuzzification must be done in the data preprocessing, that is,
    before training the model and before predicting new samples.
    @author: Anjin Liu
    @email: [email protected]
    """
    # logging.debug("************* X's shape: %s", np.shape(X))
    n_samples, n_features = np.shape(X)
    X_fuzzy_dms = np.empty([n_samples, 0])
    # Fuzzify each feature independently and stack the membership columns.
    for feature_idx in range(n_features):
        X_fuzzy_dm, _, _ = degree_of_membership_build(r_seed=0, X_df=pd.DataFrame(X[:, feature_idx]),
                                                      n_conv=n_conv, fuzzy_reg=fuzzy_reg)
        X_fuzzy_dms = np.concatenate((X_fuzzy_dms, X_fuzzy_dm), axis=1)
    # logging.debug("************* X_fuzzy_dms's shape: %s", np.shape(X_fuzzy_dms))
    return X_fuzzy_dms
# X_df = pd.DataFrame(X)
# X_fuzzy_dms, _, _ = degree_of_membership_build(r_seed=0, X_df=X_df.iloc[:, 0], n_conv=5)
# Another try: ====================================
# X_fuzzy_dms = []
# _, n_features = np.shape(X)
# for feature_idx in range(n_features):
# X_fuzzy_dm, _, _ = degree_of_membership_build(r_seed=0, X_df=pd.DataFrame(X[:, feature_idx]), n_conv=5)
# X_fuzzy_dms.append(X_fuzzy_dm)
# logging.debug("************* X_fuzzy_dms's elements: %s", np.asarray(X_fuzzy_dms).shape)
# return np.asarray(X_fuzzy_dms)
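
# Example (illustrative): fuzzifying a 2-feature matrix with n_conv=5 yields
# one block of 5 membership columns per feature, i.e. shape (n_samples, 10):
#     X = np.random.RandomState(0).rand(100, 2)
#     X_dms = extract_fuzzy_features(X, n_conv=5)
#     # X_dms.shape == (100, 10)
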
# =============================================================================
# Encoder
# =============================================================================
"""
Encoder class that encodes categorical variables.
Or, use the existing encoder, sklearn.preprocessing.
Feature encoding includes:
- Ordinal Encoder (OE): Label Encoder, which is used for label coding;
and Ordinal Encoder, which is used for n-dimensional feature vectors,
namely n-column feature values.
- One-Hot Encoder (OHE): encode numerical values.
- Target Encoder (Mean Encoder): is used when the cardinality of a
feature (e.g. the number of categories of this feature) is large.
- CatBoost Encoder: is similar to the Target Encoder.
- Cyclic Features: is used for uniformly cyclic features.
- Others, e.g. Feature Hashing, LeaveOut Encoder, etc.
"""
def one_hot_encode(y, n_ohe_col=None):
    """
    One-hot encode nominal target values.
    Parameters
    ----------
    y : array-like, 1-dimensional; elements must be numerical values starting from 0
    n_ohe_col : int, default=None
        The number of one-hot columns, i.e. the number of classes. If None,
        it is inferred as the maximum value in y plus one.
    Returns
    -------
    one_hot : array of shape (y.shape[0], n_ohe_col)
        The one-hot encoded target values.
    """
    y = y.astype(int)
    if n_ohe_col is None:
        n_ohe_col = np.amax(y) + 1  # np.max is just an alias for np.amax.
    one_hot = np.zeros((y.shape[0], n_ohe_col))  # Or, use y.size instead of y.shape[0].
    one_hot[np.arange(y.shape[0]), y] = 1
    return one_hot

def to_nominal(x):
    """
    Transform values from one-hot encoding back to nominal.
    """
    return np.argmax(x, axis=1)
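
# Round-trip example (illustrative): to_nominal() inverts one_hot_encode():
#     y = np.array([0, 2, 1])
#     one_hot_encode(y)              # [[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]
#     to_nominal(one_hot_encode(y))  # [0, 2, 1]
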
def make_diagonal(x):
    """
    Transform a vector into a diagonal matrix.
    """
    m = np.zeros((len(x), len(x)))
    for i in range(len(m[0])):
        m[i, i] = x[i]
    return m
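
# Equivalent one-liner (illustrative): numpy already provides this.
#     make_diagonal(np.array([1., 2., 3.]))  # same as np.diag(np.array([1., 2., 3.]))
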
# =============================================================================
# Functions for Sampling
# =============================================================================
"""
## Definitions
- A population can be defined as including all people or items with the
characteristic one wishes to understand.
- Each observation in a population measures one or more properties (such as
weight, location, colour) of observable bodies distinguished as independent
objects or individuals.
"""

def resample_simple_random(X, y, n_subsets, n_samples_sub=None, replace=False):
    """
    Randomly draw a specified number of independent sample subsets from the
    original sample set by the simple random sampling method.
    Parameters
    ----------
    X : sequence of array-like of shape (n_samples, n_features)
        The input samples in the format of an indexable data structure, which
        can be arrays, lists, dataframes or scipy sparse matrices with a
        consistent first dimension.
    y : array-like of shape (n_samples,)
        The target values (class labels), which are non-negative integers in
        classification and real numbers in regression. Its first dimension
        has to be the same as that of X.
    n_subsets : int
        The number of subsets to generate.
    n_samples_sub : int, default=None
        The sample size of each subset to generate. If None, it is
        automatically set to the first dimension of X.
    replace : bool, default=False
        Whether to resample with replacement. If False, each sample subset is
        drawn by a (sliced) random permutation, which is a simple sampling
        scheme without replacement ('WOR' - no element can be selected more
        than once in the same sample). If True, each sample subset is drawn
        by a bootstrapping sampling scheme with replacement ('WR' - an
        element may appear multiple times in the one sample).
    Returns
    -------
    X_subsets, y_subsets : tuple, where each element is a list of array-like
        The tuple of the generated lists of sample subsets. The first element
        in the tuple is the list of the sample subsets from X and the second
        is the list of the sample subsets from y.
    """
    n_samples_super = X.shape[0]
    y = y.reshape(n_samples_super, 1)  # Equivalent to: np.expand_dims(y, axis=1)
    X_y = np.concatenate((X, y), axis=1)  # Equivalent to: np.hstack((X, y))
    # NB: No random_state is fixed here, i.e. never np.random.seed(random_state)
    # beforehand, so repeated calls draw different subsets.
    np.random.shuffle(X_y)
    if n_samples_sub is None:
        n_samples_sub = n_samples_super
    elif n_samples_sub > n_samples_super:
        raise ValueError("The sample is larger than the population.")
    X_subsets = []
    y_subsets = []
    for _ in range(n_subsets):
        # The larger the sample size (starting from > 10,000), the faster
        # numpy.random.choice() is compared to random.sample(); when the
        # sample size is < 10,000, the reverse applies.
        idxs = np.random.choice(n_samples_super, n_samples_sub, replace=replace)
        X_y_sub = X_y[idxs, :]
        X_subsets.append(X_y_sub[:, :-1])
        y_subsets.append(X_y_sub[:, -1])
    return X_subsets, y_subsets
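
# Example (illustrative): draw 3 subsets of 5 samples each, without replacement:
#     X = np.arange(20).reshape(10, 2)
#     y = np.arange(10)
#     X_subs, y_subs = resample_simple_random(X, y, n_subsets=3, n_samples_sub=5)
#     # len(X_subs) == 3; X_subs[0].shape == (5, 2); y_subs[0].shape == (5,)
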
def resample_bootstrap(X, y, n_subsets, n_samples_sub=None):
    """
    Draw a specified number of independent sample subsets from the original
    sample set by the bootstrapping sampling method.
    This is a sampling scheme with replacement ('WR' - an element may appear
    multiple times in the one sample).
    Parameters
    ----------
    X : sequence of array-like of shape (n_samples, n_features)
        The input samples in the format of an indexable data structure, which
        can be arrays, lists, dataframes or scipy sparse matrices with a
        consistent first dimension.
    y : array-like of shape (n_samples,)
        The target values (class labels), which are non-negative integers in
        classification and real numbers in regression. Its first dimension
        has to be the same as that of X.
    n_subsets : int
        The number of subsets to generate.
    n_samples_sub : int, default=None
        The sample size of each subset to generate. If None, it is
        automatically set to the first dimension of X.
    Returns
    -------
    X_subsets, y_subsets : tuple, where each element is a list of array-like
        The tuple of the generated lists of sample subsets. The first element
        in the tuple is the list of the sample subsets from X and the second
        is the list of the sample subsets from y.
    """
    return resample_simple_random(X=X, y=y, n_subsets=n_subsets, n_samples_sub=n_samples_sub, replace=True)
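
# Example (illustrative): with X of shape (10, 2) and y of shape (10,) as above,
# bootstrap subsets default to the original sample size and may repeat rows:
#     X_subs, y_subs = resample_bootstrap(X, y, n_subsets=10)
#     # each X_subs[i].shape == (10, 2)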