From fab33ba59407c96ac146c4ad6865a32f06b8fa34 Mon Sep 17 00:00:00 2001
From: poteman <946691288@qq.com>
Date: Wed, 11 Aug 2021 10:53:16 +0800
Subject: [PATCH] Add binary classification models.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |  13 +-
 autox/autox.py                                |  15 +-
 autox/models/__init__.py                      |   3 +-
 autox/models/classifier.py                    | 257 ++++++++++++++++++
 demo/kaggle_TabularPlaygroundSeriesAug2021.py |   6 +
 5 files changed, 285 insertions(+), 9 deletions(-)
 create mode 100644 autox/models/classifier.py
 create mode 100644 demo/kaggle_TabularPlaygroundSeriesAug2021.py

diff --git a/README.md b/README.md
index d87e58c..88f1653 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,14 @@ AutoX is an efficient automated machine learning tool, mainly targeting tabular data
 ```
 
 # Quick start
-- Fully automatic: run_oneclick.py
+- Fully automatic: for users who want a decent result quickly; only the minimal data information needs to be configured to build the whole machine learning pipeline.
 ```
-For users who want a decent result quickly; only the minimal data information needs to be configured to build the whole machine learning pipeline.
+from autox import AutoX
+path = data_dir
+autox = AutoX(target = 'loss', train_name = 'train.csv', test_name = 'test.csv',
+              id = ['id'], path = path)
+sub = autox.get_submit()
+sub.to_csv("submission.csv", index = False)
 ```
 - Semi-automatic: run_demo.ipynb
 ```
@@ -61,9 +66,9 @@ AutoX is an efficient automated machine learning tool, mainly targeting tabular data
 
 # Benchmark comparison:
 | index |data_type | data_name(link) | AutoX | AutoGluon | H2o |
 | ----- |----- | ------------- | ---------------- | ----------------|----------------|
-| 1 |regression | [zhidemai](https://www.automl.ai/competitions/19) | 1.1426 | 1.9466 | 1.1927|
+| 1 |regression | [zhidemai](https://www.automl.ai/competitions/19) | 1.1267 | 1.9466 | 1.1927|
 | 2 |regression | [Tabular Playground Series - Aug 2021](https://www.kaggle.com/c/tabular-playground-series-aug-2021) | x | 10.3944 | 7.8895|
-| 3 |binary classification | [x](x) | x | x | x|
+| 3 |binary classification | [Titanic](https://www.kaggle.com/c/titanic/) | x | 0.78229 | 0.79186 |
 
 # Data types
diff --git a/autox/autox.py b/autox/autox.py
index 1cd837d..94a1779 100644
--- a/autox/autox.py
+++ b/autox/autox.py
@@ -2,8 +2,9 @@
 from .feature_engineer.fe_stat import FeatureStat
 from .file_io.read_data import read_data_from_path
 from .models.regressor import CrossLgbRegression, CrossXgbRegression
+from .models.classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
 from .process_data import feature_combination, train_test_divide, clip_label
-from .process_data import feature_filter
+from .process_data import feature_filter, auto_label_encoder
 from .process_data.feature_type_recognition import Feature_type_recognition
 from .util import log, reduce_mem_usage
 
@@ -79,6 +80,9 @@ def get_submit(self):
         self.dfs_['FE_count'] = featureCount.transform(df)
         log(f"featureCount ops: {featureCount.get_ops()}")
 
+        # label_encoder
+        df = auto_label_encoder(df, feature_type)
+
         # feature combination
         log("feature combination")
         df_list = [df, self.dfs_['FE_count'], self.dfs_['FE_stat']]
@@ -105,9 +109,12 @@
             model_xgb = CrossXgbRegression()
             model_xgb.fit(train[used_features], train[target], tuning=True)
 
-        elif self.data_type == 'binary_classification':
-            # todo: develop binary classification models
-            pass
+        elif self.data_type == 'binary':
+            model_lgb = CrossLgbBiClassifier()
+            model_lgb.fit(train[used_features], train[target], tuning=True)
+
+            model_xgb = CrossXgbBiClassifier()
+            model_xgb.fit(train[used_features], train[target], tuning=True)
 
         # feature importance
         fimp = model_lgb.feature_importances_
diff --git a/autox/models/__init__.py b/autox/models/__init__.py
index 969d4fd..f735be6 100644
--- a/autox/models/__init__.py
+++ b/autox/models/__init__.py
@@ -1 +1,2 @@
-from .regressor import CrossLgbRegression
\ No newline at end of file
+from .regressor import CrossLgbRegression, CrossXgbRegression
+from .classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
\ No newline at end of file
diff --git a/autox/models/classifier.py b/autox/models/classifier.py
new file mode 100644
index 0000000..f439128
--- /dev/null
+++ b/autox/models/classifier.py
@@ -0,0 +1,257 @@
+import datetime
+from time import time
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import KFold
+from ..util import log
+from sklearn.model_selection import train_test_split
+import optuna
+from optuna.samplers import TPESampler
+import xgboost as xgb
+from sklearn.preprocessing import StandardScaler
+from pytorch_tabnet.tab_model import TabNetClassifier
+
+class CrossTabnetBiClassifier(object):
+    pass
+    # https://pypi.org/project/pytorch-tabnet/
+
+
+
+class CrossXgbBiClassifier(object):
+    def __init__(self, params=None, n_fold=10):
+        self.models = []
+        self.scaler = None
+        self.feature_importances_ = pd.DataFrame()
+        self.n_fold = n_fold
+        self.params_ = {
+            'eta': 0.01,
+            'max_depth': 5,
+            'subsample': 0.6,
+            'n_estimators': 1700,
+            'reg_alpha': 40,
+            'reg_lambda': 18,
+            'min_child_weight': 16,
+            'tree_method': 'gpu_hist'
+        }
+        if params is not None:
+            self.params_ = params
+
+    def get_params(self):
+        return self.params_
+
+    def set_params(self, params):
+        self.params_ = params
+
+    def optuna_tuning(self, X, y):
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.4)
+        def objective(trial):
+            param_grid = {
+                'max_depth': trial.suggest_int('max_depth', 4, 15),
+                'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.1),
+                'n_estimators': trial.suggest_int('n_estimators', 500, 2000, 100),
+                'eta': 0.01,
+                'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
+                'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
+                'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
+            }
+            reg = xgb.XGBClassifier(tree_method='gpu_hist', **param_grid)
+            reg.fit(X_train, y_train,
+                    eval_set=[(X_valid, y_valid)], eval_metric='auc',
+                    verbose=False)
+            return roc_auc_score(y_valid, reg.predict_proba(X_valid)[:, 1])
+
+        train_time = 1 * 10 * 60  # h * m * s
+        study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='XGBClassifier')
+        study.optimize(objective, timeout=train_time)
+
+        log(f'Number of finished trials: {len(study.trials)}')
+        log('Best trial:')
+        trial = study.best_trial
+
+        log(f'\tValue: {trial.value}')
+        log('\tParams: ')
+        for key, value in trial.params.items():
+            log('\t\t{}: {}'.format(key, value))
+
+        self.params_ = trial.params
+        self.params_['eta'] = 0.01
+        self.params_['tree_method'] = 'gpu_hist'
+
+    def fit(self, X, y, tuning=True):
+        log(X.shape)
+        self.feature_importances_['feature'] = X.columns
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(X)
+
+        if tuning:
+            log("[+]tuning params")
+            self.optuna_tuning(X, y)
+
+        folds = KFold(n_splits=self.n_fold, shuffle=True)
+        AUCs = []
+
+        for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
+
+            start_time = time()
+            print('Training on fold {}'.format(fold_n + 1))
+            X_train, y_train = X[train_index], y.iloc[train_index]
+            X_valid, y_valid = X[valid_index], y.iloc[valid_index]
+            model = xgb.XGBClassifier(**self.params_)
+            model.fit(X_train, y_train,
+                      eval_set=[(X_valid, y_valid)],
+                      eval_metric='auc',
+                      verbose=False)
+
+            self.models.append(model)
+            self.feature_importances_['fold_{}'.format(fold_n + 1)] = model.feature_importances_
+            val = model.predict_proba(X[valid_index])[:, 1]
+            auc_ = roc_auc_score(y.iloc[valid_index], val)
+            print('AUC: {}'.format(auc_))
+            AUCs.append(auc_)
+            print('Fold {} finished in {}'.format(fold_n + 1, str(datetime.timedelta(
+                seconds=time() - start_time))))
+        log(f'Average KFold AUC: {np.mean(AUCs)}')
+        self.feature_importances_['average'] = self.feature_importances_[
+            [x for x in self.feature_importances_.columns if x != "feature"]].mean(axis=1)
+        self.feature_importances_ = self.feature_importances_.sort_values(by="average", ascending=False)
+        self.feature_importances_.index = range(len(self.feature_importances_))
+
+    def predict(self, test):
+        test = self.scaler.transform(test)
+        for idx, model in enumerate(self.models):
+            if idx == 0:
+                result = model.predict(test)
+            else:
+                result += model.predict(test)
+        result /= self.n_fold
+        return result
+
+class CrossLgbBiClassifier(object):
+    def __init__(self, params=None, n_fold=5):
+        self.models = []
+        self.feature_importances_ = pd.DataFrame()
+        self.n_fold = n_fold
+        self.params_ = {
+            'objective': 'binary',
+            'metric': 'auc',
+            'boosting_type': 'gbdt',
+            'learning_rate': 0.01,
+            'num_leaves': 2 ** 5,
+            'bagging_fraction': 0.95,
+            'bagging_freq': 1,
+            'bagging_seed': 66,
+            'feature_fraction': 0.7,
+            'feature_fraction_seed': 66,
+            'max_bin': 100,
+            'max_depth': 5,
+            'verbose': -1
+        }
+        if params is not None:
+            self.params_ = params
+        self.Early_Stopping_Rounds = 150
+        self.N_round = 5000
+        self.Verbose = 100
+
+    def get_params(self):
+        return self.params_
+
+    def set_params(self, params):
+        self.params_ = params
+
+    def optuna_tuning(self, X, y):
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
+
+        def objective(trial):
+            param_grid = {
+                'num_leaves': trial.suggest_int('num_leaves', 2 ** 3, 2 ** 9),
+                'num_boost_round': trial.suggest_int('num_boost_round', 100, 8000),
+                'max_depth': trial.suggest_int('max_depth', 3, 9),
+                'objective': 'binary',
+                'metric': 'auc',
+                'boosting_type': 'gbdt',
+                'learning_rate': 0.01,
+                'bagging_fraction': 0.95,
+                'bagging_freq': 1,
+                'bagging_seed': 66,
+                'feature_fraction': 0.7,
+                'feature_fraction_seed': 66,
+                'max_bin': 100,
+                'verbose': -1
+            }
+            trn_data = lgb.Dataset(X_train, label=y_train, categorical_feature="")
+            val_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature="")
+            clf = lgb.train(param_grid, trn_data, valid_sets=[trn_data, val_data], verbose_eval=False,
+                            early_stopping_rounds=self.Early_Stopping_Rounds)
+            pred_val = clf.predict(X_valid)
+            auc_ = roc_auc_score(y_valid, pred_val)
+
+            return auc_
+
+        train_time = 1 * 10 * 60  # h * m * s
+        study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='LgbClassifier')
+        study.optimize(objective, timeout=train_time)
+
+        log(f'Number of finished trials: {len(study.trials)}')
+        log('Best trial:')
+        trial = study.best_trial
+
+        log(f'\tValue: {trial.value}')
+        log('\tParams: ')
+        for key, value in trial.params.items():
+            log('\t\t{}: {}'.format(key, value))
+
+        self.params_['num_leaves'] = trial.params['num_leaves']
+        self.params_['max_depth'] = trial.params['max_depth']
+        self.N_round = trial.params['num_boost_round']
+
+    def fit(self, X, y, Early_Stopping_Rounds=None, N_round=None, Verbose=None, tuning=True):
+        log(X.shape)
+
+        if tuning:
+            log("[+]tuning params")
+            self.optuna_tuning(X, y)
+
+        if Early_Stopping_Rounds is not None:
+            self.Early_Stopping_Rounds = Early_Stopping_Rounds
+        if N_round is not None:
+            self.N_round = N_round
+        if Verbose is not None:
+            self.Verbose = Verbose
+
+        folds = KFold(n_splits=self.n_fold, shuffle=True, random_state=889)
+        AUCs = []
+        self.feature_importances_['feature'] = X.columns
+
+        for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
+
+            start_time = time()
+            print('Training on fold {}'.format(fold_n + 1))
+
+            trn_data = lgb.Dataset(X.iloc[train_index],
+                                   label=y.iloc[train_index], categorical_feature="")
+            val_data = lgb.Dataset(X.iloc[valid_index],
+                                   label=y.iloc[valid_index], categorical_feature="")
+            model = lgb.train(self.params_, trn_data, num_boost_round=self.N_round, valid_sets=[trn_data, val_data],
+                              verbose_eval=self.Verbose,
+                              early_stopping_rounds=self.Early_Stopping_Rounds)
+            self.models.append(model)
+            self.feature_importances_['fold_{}'.format(fold_n + 1)] = model.feature_importance()
+            val = model.predict(X.iloc[valid_index])
+            auc_ = roc_auc_score(y.iloc[valid_index], val)
+            print('AUC: {}'.format(auc_))
+            AUCs.append(auc_)
+            print('Fold {} finished in {}'.format(fold_n + 1, str(datetime.timedelta(
+                seconds=time() - start_time))))
+        self.feature_importances_['average'] = self.feature_importances_[
+            [x for x in self.feature_importances_.columns if x != "feature"]].mean(axis=1)
+        self.feature_importances_ = self.feature_importances_.sort_values(by="average", ascending=False)
+        self.feature_importances_.index = range(len(self.feature_importances_))
+
+    def predict(self, test):
+        for idx, model in enumerate(self.models):
+            if idx == 0:
+                result = model.predict(test) / self.n_fold
+            else:
+                result += model.predict(test) / self.n_fold
+        return result
\ No newline at end of file
diff --git a/demo/kaggle_TabularPlaygroundSeriesAug2021.py b/demo/kaggle_TabularPlaygroundSeriesAug2021.py
new file mode 100644
index 0000000..9052307
--- /dev/null
+++ b/demo/kaggle_TabularPlaygroundSeriesAug2021.py
@@ -0,0 +1,6 @@
+from autox import AutoX
+path = '../input/tabular-playground-series-aug-2021'
+autox = AutoX(target = 'loss', train_name = 'train.csv', test_name = 'test.csv',
+              id = ['id'], path = path)
+sub = autox.get_submit()
+sub.to_csv("submission.csv", index = False)
\ No newline at end of file
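
A quick way to sanity-check the new classifier outside the full AutoX pipeline is to run it directly on a small synthetic dataset. The sketch below is illustrative rather than part of the patch: the dataset, `tuning=False`, and the reduced `N_round` are arbitrary choices, and it assumes the patched `autox` package is importable together with its dependencies (a pre-4.0 `lightgbm` that still accepts `verbose_eval`/`early_stopping_rounds`, `optuna`, and `pytorch-tabnet`, which `classifier.py` imports at module level). The LightGBM variant is used because `CrossXgbBiClassifier` defaults to `tree_method='gpu_hist'` and therefore needs a GPU.
```
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from autox.models.classifier import CrossLgbBiClassifier

# Synthetic binary-classification data in the DataFrame/Series form fit() expects
# (it relies on X.columns, X.iloc and y.iloc).
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X = pd.DataFrame(X, columns=[f'f{i}' for i in range(X.shape[1])])
y = pd.Series(y, name='target')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# tuning=False skips the 10-minute Optuna search; N_round is kept small for a fast check.
model = CrossLgbBiClassifier(n_fold=5)
model.fit(X_train, y_train, N_round=300, Verbose=100, tuning=False)

# predict() returns per-fold probability predictions averaged over the 5 boosters.
pred = model.predict(X_test)
print('holdout AUC:', roc_auc_score(y_test, pred))
print(model.feature_importances_.head())
```
Since `predict()` yields fold-averaged probabilities in [0, 1], turning them into hard 0/1 labels (for example by thresholding at 0.5) is left to the caller.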