-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
58 lines (49 loc) · 3.37 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
""" Convert data from pandas DataFrame to numeric matrix (required for autoML methods like auto-sklearn, TPOT) """
import warnings
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
def processData(data, label_column=None, ag_predictor=None, problem_type=None, eval_metric=None,
                predictor_path='/home/zhangxj/program/AUTOML/AutoGluonModels'):
    """Convert a pandas DataFrame into a matrix of numerical values (stored in a DataFrame).

    Performs the same data preprocessing as used for AutoGluon's tabular neural network model,
    to deal with issues such as: missing value imputation, one-hot encoding of categoricals,
    handling of high-cardinality categoricals, handling unknown categorical feature-levels
    at test-time, etc.

    To process training data, leave ``ag_predictor`` as None: a dummy predictor (neural network
    trained for zero epochs, embedding layers disabled) is fit just to obtain the data processor.
    To process test data, pass the predictor returned from the training call.

    Args:
        data: DataFrame to convert. It is not modified.
        label_column: Name of the label column. Required when ``ag_predictor`` is None;
            optional otherwise.
        ag_predictor: Existing AutoGluon predictor whose first model is the tabular neural
            network, or None to fit a new one on ``data``.
        problem_type: Forwarded to ``TabularPredictor`` when a new predictor is fit.
        eval_metric: Forwarded to ``TabularPredictor`` when a new predictor is fit.
        predictor_path: Directory handed to ``TabularPredictor`` as ``path`` when a new
            predictor is fit. Ignored if ``ag_predictor`` is given.

    Returns:
        Tuple ``(X, ag_predictor)``. ``X`` holds the processed features and, when
        ``label_column`` is present in ``data``, the transformed labels in a column of that
        name. Rows whose transformed label is missing are dropped (with a warning). When no
        labels are available, ``X`` contains the features only.

    Raises:
        ValueError: If a new predictor must be fit but ``label_column`` is None or absent from
            ``data``, or if the predictor's first model is not the neural network.
    """
    # Fit dummy neural network model just to preprocess data. Here we ensure no embedding layers are used.
    if ag_predictor is None:
        if label_column is None:
            raise ValueError("when processing training data, label_column cannot be None")
        if label_column not in data.columns:
            raise ValueError("label_column cannot be missing from training data")
        ag_predictor = TabularPredictor(
            label=label_column, problem_type=problem_type, eval_metric=eval_metric,
            path=predictor_path,
        ).fit(
            train_data=data, tuning_data=data,
            hyperparameters={'NN': {'num_epochs': 0, 'proc.embed_min_categories': np.inf}},
            num_bag_folds=0, num_stack_levels=0, verbosity=0,
        )

    # The first model must be the neural net, since it contains the data processor.
    trainer = ag_predictor._trainer
    model = trainer.load_model(trainer.get_model_names()[0])
    if 'NeuralNetMXNet' not in model.name:
        raise ValueError("Data preprocessing error. This model should be the NeuralNet, not the: %s" % model.name)

    bad_inds = []  # row-indices to remove from dataset
    y = None
    if label_column is not None and label_column in data.columns:
        # Pass the Series (not its raw values) so the transformed labels keep the row index
        # of `data`; the index is needed below to drop rows and to re-attach labels to X.
        y = ag_predictor._learner.label_cleaner.transform(data[label_column])
        data = data.drop(columns=[label_column])
        missing = y.isna()
        if missing.any():
            # remove these inds as label is NaN (due to very rare classes)
            bad_inds = y.index[missing].tolist()
            warnings.warn("Dropped these rows from data in preprocessing, due to missing labels: " + str(bad_inds))

    # General autogluon data processing.
    data_initial_processed = ag_predictor._learner.transform_features(data)
    # Neural net-specific autogluon data processing required to turn tabular data into numerical matrix.
    X = model.preprocess(data_initial_processed, is_test=True)

    if y is not None:
        if bad_inds:
            y = y.drop(index=bad_inds)
            X = X.drop(index=bad_inds)
        # Only attach labels when they exist; otherwise X stays purely numeric features.
        X[label_column] = y
    return X, ag_predictor