
Commit

Add binary classification model.
enjoysport2022 committed Aug 11, 2021
1 parent 373c58e commit fab33ba
Showing 5 changed files with 285 additions and 9 deletions.
13 changes: 9 additions & 4 deletions README.md
@@ -49,9 +49,14 @@ AutoX is an efficient automated machine learning tool, mainly targeting tabular data
 ```
 
 # Quick Start
-- Fully automatic: run_oneclick.py
+- Fully automatic: suited to users who want a good result quickly. Only the minimal data information needs to be configured to build the complete machine-learning pipeline.
 ```
-Suited to users who want a good result quickly. Only the minimal data information needs to be configured to build the complete machine-learning pipeline.
+from autox import AutoX
+path = data_dir
+autox = AutoX(target = 'loss', train_name = 'train.csv', test_name = 'test.csv',
+              id = ['id'], path = path)
+sub = autox.get_submit()
+sub.to_csv("submission.csv", index = False)
 ```
 - Semi-automatic: run_demo.ipynb
 ```
@@ -61,9 +66,9 @@ AutoX is an efficient automated machine learning tool, mainly targeting tabular data
 # Benchmark comparison:
 | index | data_type | data_name(link) | AutoX | AutoGluon | H2o |
 | ----- | ----- | ------------- | ---------------- | ---------------- | ---------------- |
-| 1 | regression | [zhidemai](https://www.automl.ai/competitions/19) | 1.1426 | 1.9466 | 1.1927 |
+| 1 | regression | [zhidemai](https://www.automl.ai/competitions/19) | 1.1267 | 1.9466 | 1.1927 |
 | 2 | regression | [Tabular Playground Series - Aug 2021](https://www.kaggle.com/c/tabular-playground-series-aug-2021) | x | 10.3944 | 7.8895 |
-| 3 | binary classification | [x](x) | x | x | x |
+| 3 | binary classification | [Titanic](https://www.kaggle.com/c/titanic/) | x | 0.78229 | 0.79186 |


 # Data types
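Row 3 of the comparison table above pairs with the binary branch added to autox/autox.py below. A minimal sketch of what a binary-classification quick start might look like, assuming the same constructor as the regression example and that AutoX infers the binary task from the target column; 'Survived' and 'PassengerId' are the Kaggle Titanic column names, not identifiers defined by this commit:

```
from autox import AutoX
path = '../input/titanic'
autox = AutoX(target = 'Survived', train_name = 'train.csv', test_name = 'test.csv',
              id = ['PassengerId'], path = path)
sub = autox.get_submit()
sub.to_csv("submission.csv", index = False)
```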
15 changes: 11 additions & 4 deletions autox/autox.py
@@ -2,8 +2,9 @@
 from .feature_engineer.fe_stat import FeatureStat
 from .file_io.read_data import read_data_from_path
 from .models.regressor import CrossLgbRegression, CrossXgbRegression
+from .models.classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
 from .process_data import feature_combination, train_test_divide, clip_label
-from .process_data import feature_filter
+from .process_data import feature_filter, auto_label_encoder
 from .process_data.feature_type_recognition import Feature_type_recognition
 from .util import log, reduce_mem_usage
 
@@ -79,6 +80,9 @@ def get_submit(self):
         self.dfs_['FE_count'] = featureCount.transform(df)
         log(f"featureCount ops: {featureCount.get_ops()}")
 
+        # label_encoder
+        df = auto_label_encoder(df, feature_type)
+
         # combine features
         log("feature combination")
         df_list = [df, self.dfs_['FE_count'], self.dfs_['FE_stat']]
@@ -105,9 +109,12 @@

             model_xgb = CrossXgbRegression()
             model_xgb.fit(train[used_features], train[target], tuning=True)
-        elif self.data_type == 'binary_classification':
-            # todo: implement the binary classification model
-            pass
+        elif self.data_type == 'binary':
+            model_lgb = CrossLgbBiClassifier()
+            model_lgb.fit(train[used_features], train[target], tuning=True)
+
+            model_xgb = CrossXgbBiClassifier()
+            model_xgb.fit(train[used_features], train[target], tuning=True)
 
         # feature importance
         fimp = model_lgb.feature_importances_
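The get_submit hunk above calls auto_label_encoder(df, feature_type), whose implementation is not part of this diff. A rough sketch of what an encoder with that call signature typically does, assuming feature_type maps column names to type strings and that 'cat' marks categorical columns (both assumptions, not confirmed by this commit):

```
import pandas as pd

def auto_label_encoder(df, feature_type):
    # Hypothetical sketch, not the AutoX implementation: replace each
    # categorical column's values with integer codes so the tree models
    # below can consume them.
    df = df.copy()
    for col, ftype in feature_type.items():
        if ftype == 'cat' and col in df.columns:
            codes = {v: i for i, v in enumerate(df[col].astype(str).unique())}
            df[col] = df[col].astype(str).map(codes)
    return df
```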
3 changes: 2 additions & 1 deletion autox/models/__init__.py
@@ -1 +1,2 @@
-from .regressor import CrossLgbRegression
+from .regressor import CrossLgbRegression, CrossXgbRegression
+from .classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
257 changes: 257 additions & 0 deletions autox/models/classifier.py
@@ -0,0 +1,257 @@
import datetime
from time import time
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from ..util import log
from sklearn.model_selection import train_test_split
import optuna
from optuna.samplers import TPESampler
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier

class CrossTabnetBiClassifier(object):
    # placeholder for a TabNet-based classifier:
    # https://pypi.org/project/pytorch-tabnet/
    pass



class CrossXgbBiClassifier(object):
    def __init__(self, params=None, n_fold=10):
        self.models = []
        self.scaler = None
        self.feature_importances_ = pd.DataFrame()
        self.n_fold = n_fold
        self.params_ = {
            'eta': 0.01,
            'max_depth': 5,
            'subsample': 0.6,
            'n_estimators': 1700,
            'reg_alpha': 40,
            'reg_lambda': 18,
            'min_child_weight': 16,
            'tree_method': 'gpu_hist'
        }
        if params is not None:
            self.params_ = params

    def get_params(self):
        return self.params_

    def set_params(self, params):
        self.params_ = params

    def optuna_tuning(self, X, y):
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.4)

        def objective(trial):
            param_grid = {
                'max_depth': trial.suggest_int('max_depth', 4, 15),
                'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 500, 2000, 100),
                'eta': 0.01,
                'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
                'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
                'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
            }
            reg = xgb.XGBClassifier(tree_method='gpu_hist', **param_grid)
            reg.fit(X_train, y_train,
                    eval_set=[(X_valid, y_valid)], eval_metric='auc',
                    verbose=False)
            # score AUC on positive-class probabilities rather than hard labels
            return roc_auc_score(y_valid, reg.predict_proba(X_valid)[:, 1])

        train_time = 1 * 10 * 60  # 1 * 10 * 60 seconds = 10 minutes
        study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='XGBClassifier')
        study.optimize(objective, timeout=train_time)

        log(f'Number of finished trials: {len(study.trials)}')
        log('Best trial:')
        trial = study.best_trial

        log(f'\tValue: {trial.value}')
        log('\tParams: ')
        for key, value in trial.params.items():
            log('\t\t{}: {}'.format(key, value))

        self.params_ = trial.params
        self.params_['eta'] = 0.01
        self.params_['tree_method'] = 'gpu_hist'

    def fit(self, X, y, tuning=True):
        log(X.shape)
        self.feature_importances_['feature'] = X.columns
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(X)

        if tuning:
            log("[+]tuning params")
            self.optuna_tuning(X, y)

        folds = KFold(n_splits=self.n_fold, shuffle=True)
        AUCs = []

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

            start_time = time()
            print('Training on fold {}'.format(fold_n + 1))
            X_train, y_train = X[train_index], y.iloc[train_index]
            X_valid, y_valid = X[valid_index], y.iloc[valid_index]
            model = xgb.XGBClassifier(**self.params_)
            model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_metric='auc', verbose=False)

            self.models.append(model)
            self.feature_importances_['fold_{}'.format(fold_n + 1)] = model.feature_importances_
            # evaluate AUC on positive-class probabilities rather than hard labels
            val = model.predict_proba(X_valid)[:, 1]
            auc_ = roc_auc_score(y_valid, val)
            print('AUC: {}'.format(auc_))
            AUCs.append(auc_)
            print('Fold {} finished in {}'.format(fold_n + 1, str(datetime.timedelta(
                seconds=time() - start_time))))
        log(f'Average KFold AUC: {np.mean(AUCs)}')
        self.feature_importances_['average'] = self.feature_importances_[
            [x for x in self.feature_importances_.columns if x != "feature"]].mean(axis=1)
        self.feature_importances_ = self.feature_importances_.sort_values(by="average", ascending=False)
        self.feature_importances_.index = range(len(self.feature_importances_))

    def predict(self, test):
        test = self.scaler.transform(test)
        # average positive-class probabilities across fold models, so the
        # output matches the probabilities returned by CrossLgbBiClassifier
        for idx, model in enumerate(self.models):
            if idx == 0:
                result = model.predict_proba(test)[:, 1]
            else:
                result += model.predict_proba(test)[:, 1]
        result /= self.n_fold
        return result

class CrossLgbBiClassifier(object):
    def __init__(self, params=None, n_fold=5):
        self.models = []
        self.feature_importances_ = pd.DataFrame()
        self.n_fold = n_fold
        self.params_ = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'learning_rate': 0.01,
            'num_leaves': 2 ** 5,
            'bagging_fraction': 0.95,
            'bagging_freq': 1,
            'bagging_seed': 66,
            'feature_fraction': 0.7,
            'feature_fraction_seed': 66,
            'max_bin': 100,
            'max_depth': 5,
            'verbose': -1
        }
        if params is not None:
            self.params_ = params
        self.Early_Stopping_Rounds = 150
        self.N_round = 5000
        self.Verbose = 100

    def get_params(self):
        return self.params_

    def set_params(self, params):
        self.params_ = params

    def optuna_tuning(self, X, y):
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

        def objective(trial):
            param_grid = {
                'num_leaves': trial.suggest_int('num_leaves', 2 ** 3, 2 ** 9),
                'num_boost_round': trial.suggest_int('num_boost_round', 100, 8000),
                'max_depth': trial.suggest_int('max_depth', 3, 9),
                'objective': 'binary',
                'metric': 'auc',
                'boosting_type': 'gbdt',
                'learning_rate': 0.01,
                'bagging_fraction': 0.95,
                'bagging_freq': 1,
                'bagging_seed': 66,
                'feature_fraction': 0.7,
                'feature_fraction_seed': 66,
                'max_bin': 100,
                'verbose': -1
            }
            trn_data = lgb.Dataset(X_train, label=y_train, categorical_feature="")
            val_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature="")
            clf = lgb.train(param_grid, trn_data, valid_sets=[trn_data, val_data], verbose_eval=False,
                            early_stopping_rounds=self.Early_Stopping_Rounds)
            pred_val = clf.predict(X_valid)
            auc_ = roc_auc_score(y_valid, pred_val)

            return auc_

        train_time = 1 * 10 * 60  # 1 * 10 * 60 seconds = 10 minutes
        study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='LgbClassifier')
        study.optimize(objective, timeout=train_time)

        log(f'Number of finished trials: {len(study.trials)}')
        log('Best trial:')
        trial = study.best_trial

        log(f'\tValue: {trial.value}')
        log('\tParams: ')
        for key, value in trial.params.items():
            log('\t\t{}: {}'.format(key, value))

        self.params_['num_leaves'] = trial.params['num_leaves']
        self.params_['max_depth'] = trial.params['max_depth']
        self.N_round = trial.params['num_boost_round']

    def fit(self, X, y, Early_Stopping_Rounds=None, N_round=None, Verbose=None, tuning=True):
        log(X.shape)

        if tuning:
            log("[+]tuning params")
            self.optuna_tuning(X, y)

        if Early_Stopping_Rounds is not None:
            self.Early_Stopping_Rounds = Early_Stopping_Rounds
        if N_round is not None:
            self.N_round = N_round
        if Verbose is not None:
            self.Verbose = Verbose

        folds = KFold(n_splits=self.n_fold, shuffle=True, random_state=889)
        AUCs = []
        self.feature_importances_['feature'] = X.columns

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

            start_time = time()
            print('Training on fold {}'.format(fold_n + 1))

            trn_data = lgb.Dataset(X.iloc[train_index],
                                   label=y.iloc[train_index], categorical_feature="")
            val_data = lgb.Dataset(X.iloc[valid_index],
                                   label=y.iloc[valid_index], categorical_feature="")
            model = lgb.train(self.params_, trn_data, num_boost_round=self.N_round, valid_sets=[trn_data, val_data],
                              verbose_eval=self.Verbose,
                              early_stopping_rounds=self.Early_Stopping_Rounds)
            self.models.append(model)
            self.feature_importances_['fold_{}'.format(fold_n + 1)] = model.feature_importance()
            val = model.predict(X.iloc[valid_index])
            auc_ = roc_auc_score(y.iloc[valid_index], val)
            print('AUC: {}'.format(auc_))
            AUCs.append(auc_)
            print('Fold {} finished in {}'.format(fold_n + 1, str(datetime.timedelta(
                seconds=time() - start_time))))
        self.feature_importances_['average'] = self.feature_importances_[
            [x for x in self.feature_importances_.columns if x != "feature"]].mean(axis=1)
        self.feature_importances_ = self.feature_importances_.sort_values(by="average", ascending=False)
        self.feature_importances_.index = range(len(self.feature_importances_))

    def predict(self, test):
        # average positive-class probabilities across fold models
        for idx, model in enumerate(self.models):
            if idx == 0:
                result = model.predict(test) / self.n_fold
            else:
                result += model.predict(test) / self.n_fold
        return result
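To try the new classifiers outside the full AutoX pipeline, a minimal usage sketch (illustrative only; it assumes the package and its LightGBM dependency are installed, and passes tuning=False to skip the ten-minute Optuna search):

```
import pandas as pd
from sklearn.datasets import make_classification
from autox.models import CrossLgbBiClassifier

# synthetic binary task; fit() expects a DataFrame X and a Series y
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X = pd.DataFrame(X, columns=['f{}'.format(i) for i in range(20)])
y = pd.Series(y)

model = CrossLgbBiClassifier(n_fold=5)
model.fit(X, y, tuning=False)  # defaults: 5000 rounds, early stopping at 150
probs = model.predict(X)       # fold-averaged positive-class probabilities
```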
6 changes: 6 additions & 0 deletions demo/kaggle_TabularPlaygroundSeriesAug2021.py
@@ -0,0 +1,6 @@
from autox import AutoX
path = '../input/tabular-playground-series-aug-2021'
autox = AutoX(target = 'loss', train_name = 'train.csv', test_name = 'test.csv',
              id = ['id'], path = path)
sub = autox.get_submit()
sub.to_csv("submission.csv", index = False)
