From 02d06e60242918e21e63382caea84b1431a51855 Mon Sep 17 00:00:00 2001 From: <> Date: Thu, 15 Feb 2024 23:08:11 +0000 Subject: [PATCH] Deployed 3c771d4 with MkDocs version: 1.5.2 --- .nojekyll | 0 404.html | 2292 ++++++ Tutorial/1_Estimators_Overview/index.html | 4954 ++++++++++++ .../index.html | 4234 +++++++++++ .../index.html | 5396 +++++++++++++ .../index.html | 3819 ++++++++++ Tutorial/5_GraphPipeline/index.html | 3507 +++++++++ .../6_SH_and_early_termination/index.html | 4144 ++++++++++ Tutorial/7_dask_parallelization/index.html | 4056 ++++++++++ .../8_Genetic_Algorithm_Overview/index.html | 3926 ++++++++++ Tutorial/simple_fss.csv | 3 + assets/_mkdocstrings.css | 64 + assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.78eede0e.min.js | 29 + assets/javascripts/bundle.78eede0e.min.js.map | 8 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 18 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + assets/javascripts/lunr/min/lunr.sa.min.js | 1 + 
.../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 +++++++++++++++++ .../workers/search.dfff1995.min.js | 42 + .../workers/search.dfff1995.min.js.map | 8 + assets/stylesheets/main.0e669242.min.css | 1 + assets/stylesheets/main.0e669242.min.css.map | 1 + assets/stylesheets/palette.85d0ee34.min.css | 1 + .../stylesheets/palette.85d0ee34.min.css.map | 1 + cite/index.html | 2326 ++++++ contribute/index.html | 2327 ++++++ css/extra.css | 3 + documentation/tpot2/_version/index.html | 2403 ++++++ .../arithmetictransformer/index.html | 2405 ++++++ .../column_one_hot_encoder/index.html | 3079 ++++++++ .../index.html | 3040 ++++++++ .../feature_set_selector/index.html | 2928 +++++++ .../feature_transformers/index.html | 3470 +++++++++ .../genetic_encoders/index.html | 4318 +++++++++++ .../tpot2/builtin_modules/imputer/index.html | 3007 ++++++++ .../one_hot_encoder/index.html | 3956 ++++++++++ .../builtin_modules/passthrough/index.html | 2405 ++++++ .../selector_wrappers/index.html | 2405 ++++++ .../builtin_modules/zero_count/index.html | 2768 +++++++ .../config/all_single_modules/index.html | 2389 ++++++ .../tpot2/config/autoqtl_builtins/index.html | 2405 ++++++ .../tpot2/config/classifiers/index.html | 2405 ++++++ .../config/classifiers_sklearnex/index.html | 2405 ++++++ .../config/hyperparametersuggestor/index.html | 2405 ++++++ .../tpot2/config/mdr_configs/index.html | 2405 ++++++ .../tpot2/config/regressors/index.html | 2405 ++++++ .../config/regressors_sklearnex/index.html | 2405 ++++++ .../tpot2/config/selectors/index.html | 2405 ++++++ 
.../tpot2/config/special_configs/index.html | 2748 +++++++ .../tpot2/config/transformers/index.html | 2405 ++++++ .../tpot2/evolvers/base_evolver/index.html | 5311 +++++++++++++ .../evolvers/steady_state_evolver/index.html | 2405 ++++++ documentation/tpot2/graphsklearn/index.html | 2981 ++++++++ .../individual/index.html | 5145 +++++++++++++ .../optuna_optimize/index.html | 2407 ++++++ .../templates/index.html | 2407 ++++++ .../individual/index.html | 2405 ++++++ .../subset_selector/subsetselector/index.html | 2407 ++++++ documentation/tpot2/logbook/index.html | 2403 ++++++ .../objectives/average_path_length/index.html | 2405 ++++++ .../tpot2/objectives/complexity/index.html | 2405 ++++++ .../objectives/number_of_leaves/index.html | 2405 ++++++ .../objectives/number_of_nodes/index.html | 2405 ++++++ documentation/tpot2/population/index.html | 3888 ++++++++++ .../selectors/lexicase_selection/index.html | 2491 ++++++ .../max_weighted_average_selector/index.html | 2405 ++++++ .../tpot2/selectors/nsgaii/index.html | 2584 +++++++ .../selectors/random_selector/index.html | 2405 ++++++ .../selectors/tournament_selection/index.html | 2502 ++++++ .../tournament_selection_dominated/index.html | 2554 +++++++ .../tpot_estimator/cross_val_utils/index.html | 2405 ++++++ .../tpot2/tpot_estimator/estimator/index.html | 6577 ++++++++++++++++ .../tpot_estimator/estimator_utils/index.html | 2454 ++++++ .../steady_state_estimator/index.html | 6556 ++++++++++++++++ .../templates/tpottemplates/index.html | 3139 ++++++++ .../tpot2/utils/eval_utils/index.html | 2491 ++++++ documentation/tpot2/utils/utils/index.html | 2495 ++++++ index.html | 2727 +++++++ installation/index.html | 2423 ++++++ objects.inv | Bin 0 -> 1620 bytes related/index.html | 2427 ++++++ requirements_docs.txt | 7 + scripts/build_docs_sources.sh | 25 + scripts/build_mkdocs.sh | 125 + scripts/build_tutorial_toc_not_used.sh | 6 + search/search_index.json | 1 + sitemap.xml | 343 + sitemap.xml.gz | Bin 0 -> 961 bytes 
support/index.html | 2328 ++++++ tpot2_api/classifier/index.html | 2715 +++++++ tpot2_api/estimator/index.html | 6573 ++++++++++++++++ tpot2_api/regressor/index.html | 2699 +++++++ using/index.html | 2326 ++++++ 123 files changed, 218470 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 Tutorial/1_Estimators_Overview/index.html create mode 100644 Tutorial/2_Defining_Search_Space_(config_dicts)/index.html create mode 100644 Tutorial/3_Genetic_Feature_Set_Selectors/index.html create mode 100644 Tutorial/4_Symbolic_Regression_and_Classification/index.html create mode 100644 Tutorial/5_GraphPipeline/index.html create mode 100644 Tutorial/6_SH_and_early_termination/index.html create mode 100644 Tutorial/7_dask_parallelization/index.html create mode 100644 Tutorial/8_Genetic_Algorithm_Overview/index.html create mode 100644 Tutorial/simple_fss.csv create mode 100644 assets/_mkdocstrings.css create mode 100644 assets/images/favicon.png create mode 100644 assets/javascripts/bundle.78eede0e.min.js create mode 100644 assets/javascripts/bundle.78eede0e.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js create mode 100644 
assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.dfff1995.min.js create mode 100644 assets/javascripts/workers/search.dfff1995.min.js.map create mode 100644 assets/stylesheets/main.0e669242.min.css create mode 100644 assets/stylesheets/main.0e669242.min.css.map create mode 100644 assets/stylesheets/palette.85d0ee34.min.css create mode 100644 assets/stylesheets/palette.85d0ee34.min.css.map create mode 100644 cite/index.html create mode 100644 contribute/index.html create mode 100644 css/extra.css create mode 100644 documentation/tpot2/_version/index.html create mode 100644 documentation/tpot2/builtin_modules/arithmetictransformer/index.html create mode 100644 documentation/tpot2/builtin_modules/column_one_hot_encoder/index.html create mode 100644 
documentation/tpot2/builtin_modules/feature_encoding_frequency_selector/index.html create mode 100644 documentation/tpot2/builtin_modules/feature_set_selector/index.html create mode 100644 documentation/tpot2/builtin_modules/feature_transformers/index.html create mode 100644 documentation/tpot2/builtin_modules/genetic_encoders/index.html create mode 100644 documentation/tpot2/builtin_modules/imputer/index.html create mode 100644 documentation/tpot2/builtin_modules/one_hot_encoder/index.html create mode 100644 documentation/tpot2/builtin_modules/passthrough/index.html create mode 100644 documentation/tpot2/builtin_modules/selector_wrappers/index.html create mode 100644 documentation/tpot2/builtin_modules/zero_count/index.html create mode 100644 documentation/tpot2/config/all_single_modules/index.html create mode 100644 documentation/tpot2/config/autoqtl_builtins/index.html create mode 100644 documentation/tpot2/config/classifiers/index.html create mode 100644 documentation/tpot2/config/classifiers_sklearnex/index.html create mode 100644 documentation/tpot2/config/hyperparametersuggestor/index.html create mode 100644 documentation/tpot2/config/mdr_configs/index.html create mode 100644 documentation/tpot2/config/regressors/index.html create mode 100644 documentation/tpot2/config/regressors_sklearnex/index.html create mode 100644 documentation/tpot2/config/selectors/index.html create mode 100644 documentation/tpot2/config/special_configs/index.html create mode 100644 documentation/tpot2/config/transformers/index.html create mode 100644 documentation/tpot2/evolvers/base_evolver/index.html create mode 100644 documentation/tpot2/evolvers/steady_state_evolver/index.html create mode 100644 documentation/tpot2/graphsklearn/index.html create mode 100644 documentation/tpot2/individual_representations/graph_pipeline_individual/individual/index.html create mode 100644 documentation/tpot2/individual_representations/graph_pipeline_individual/optuna_optimize/index.html create mode 
100644 documentation/tpot2/individual_representations/graph_pipeline_individual/templates/index.html create mode 100644 documentation/tpot2/individual_representations/individual/index.html create mode 100644 documentation/tpot2/individual_representations/subset_selector/subsetselector/index.html create mode 100644 documentation/tpot2/logbook/index.html create mode 100644 documentation/tpot2/objectives/average_path_length/index.html create mode 100644 documentation/tpot2/objectives/complexity/index.html create mode 100644 documentation/tpot2/objectives/number_of_leaves/index.html create mode 100644 documentation/tpot2/objectives/number_of_nodes/index.html create mode 100644 documentation/tpot2/population/index.html create mode 100644 documentation/tpot2/selectors/lexicase_selection/index.html create mode 100644 documentation/tpot2/selectors/max_weighted_average_selector/index.html create mode 100644 documentation/tpot2/selectors/nsgaii/index.html create mode 100644 documentation/tpot2/selectors/random_selector/index.html create mode 100644 documentation/tpot2/selectors/tournament_selection/index.html create mode 100644 documentation/tpot2/selectors/tournament_selection_dominated/index.html create mode 100644 documentation/tpot2/tpot_estimator/cross_val_utils/index.html create mode 100644 documentation/tpot2/tpot_estimator/estimator/index.html create mode 100644 documentation/tpot2/tpot_estimator/estimator_utils/index.html create mode 100644 documentation/tpot2/tpot_estimator/steady_state_estimator/index.html create mode 100644 documentation/tpot2/tpot_estimator/templates/tpottemplates/index.html create mode 100644 documentation/tpot2/utils/eval_utils/index.html create mode 100644 documentation/tpot2/utils/utils/index.html create mode 100644 index.html create mode 100644 installation/index.html create mode 100644 objects.inv create mode 100644 related/index.html create mode 100644 requirements_docs.txt create mode 100644 scripts/build_docs_sources.sh create mode 
100644 scripts/build_mkdocs.sh create mode 100644 scripts/build_tutorial_toc_not_used.sh create mode 100644 search/search_index.json create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz create mode 100644 support/index.html create mode 100644 tpot2_api/classifier/index.html create mode 100644 tpot2_api/estimator/index.html create mode 100644 tpot2_api/regressor/index.html create mode 100644 using/index.html diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..f9027a73 --- /dev/null +++ b/404.html @@ -0,0 +1,2292 @@ + + + +
+ + + + + + + + + + + + + + +There are two evolutionary algorithms built into TPOT2, which correspond to two different estimator classes.
+The tpot2.TPOTEstimator
uses a standard evolutionary algorithm that evaluates exactly population_size individuals each generation. This is similar to the algorithm in TPOT1. The next generation does not start until the previous is completely finished evaluating. This leads to underutilized CPU time as the cores are waiting for the last individuals to finish training, but may preserve diversity in the population.
The tpot2.TPOTEstimatorSteadyState
differs in that it will generate and evaluate the next individual as soon as an individual finishes evaluation. The number of individuals being evaluated is determined by the n_jobs parameter. There is no longer a concept of generations. The population_size parameter now refers to the size of the list of evaluated parents. When an individual is evaluated, the selection method updates the list of parents. This allows more efficient utilization when using multiple cores.
Additionally, two other simplified estimators are provided. These have a simplified set of hyperparameters with default values set for classification and regression problems. Currently, both of these use the standard evolutionary algorithm in the tpot2.TPOTEstimator
class.
tpot2.TPOTClassifier
for classification tasks
tpot2.TPOTRegressor
for regression tasks
When running TPOT2 from a .py script, it is important to protect the code with if __name__=="__main__":
#my_analysis.py
+
+from dask.distributed import Client, LocalCluster
+import tpot2
+import sklearn
+import sklearn.datasets
+import numpy as np
+
+if __name__=="__main__":
+ scorer = sklearn.metrics.get_scorer('roc_auc_ovr')
+ X, y = sklearn.datasets.load_digits(return_X_y=True)
+ X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+ est = tpot2.TPOTEstimatorSteadyState(
+ scorers=['roc_auc_ovr'], #scorers can be a list of strings or a list of scorers. These get evaluated during cross validation.
+ scorers_weights=[1],
+
+ classification=True,
+
+ max_eval_time_seconds=15,
+ max_time_seconds=30,
+ verbose=2)
+ est.fit(X_train, y_train)
+ print(scorer(est, X_test, y_test))
+
Evaluations: : 19it [00:30, 1.59s/it] ++
0.9996046124012956 ++
scorers : (list, scorer)
+ A scorer or list of scorers to be used in the cross-validation process.
+ see https://scikit-learn.org/stable/modules/model_evaluation.html
+
+ scorers_weights : list
+ A list of weights to be applied to the scorers during the optimization process.
+
+ classification : bool
+ If True, the problem is treated as a classification problem. If False, the problem is treated as a regression problem.
+ Used to determine the CV strategy.
+
+ cv : int, cross-validator
+ - (int): Number of folds to use in the cross-validation process. By default, uses the sklearn.model_selection.KFold cross-validator for regression and StratifiedKFold for classification. In both cases, shuffle is set to True.
+ - (sklearn.model_selection.BaseCrossValidator): A cross-validator to use in the cross-validation process.
+ - max_depth (int): The maximum depth from any node to the root of the pipelines to be generated.
+
+ other_objective_functions : list, default=[tpot2.objectives.estimator_objective_functions.average_path_length_objective]
+ A list of other objective functions to apply to the pipeline.
+
+ other_objective_functions_weights : list, default=[-1]
+ A list of weights to be applied to the other objective functions.
+
+ objective_function_names : list, default=None
+ A list of names to be applied to the objective functions. If None, will use the names of the objective functions.
+
+ bigger_is_better : bool, default=True
+ If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction.
+
+
+ max_size : int, default=np.inf
+ The maximum number of nodes of the pipelines to be generated.
+
+ linear_pipeline : bool, default=False
+ If True, the pipelines generated will be linear. If False, the pipelines generated will be directed acyclic graphs.
+
+ generations : int, default=50
+ Number of generations to run
+
+ max_time_seconds : float, default=float("inf")
+ Maximum time to run the optimization. If none or inf, will run until the end of the generations.
+
+ max_eval_time_seconds : float, default=60*5
+ Maximum time to evaluate a single individual. If none or inf, there will be no time limit per evaluation.
+
+ n_jobs : int, default=1
+ Number of processes to run in parallel.
+
+ memory_limit : str, default="4GB"
+ Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information.
+
+
+ verbose : int, default=1
+ How much information to print during the optimization process. Higher values include the information from lower values.
+ 0. nothing
+ 1. progress bar
+
+ 3. best individual
+ 4. warnings
+ >=5. full warnings trace
+ 6. evaluations progress bar. (Temporary: This used to be 2. Currently, using the evaluation progress bar may prevent some instances where we terminate a generation early due to it reaching max_time_seconds in the middle of a generation OR a pipeline failed to be terminated normally and we need to manually terminate it.)
+
+The following configuration dictionaries are covered in the next tutorial:
+root_config_dict
+inner_config_dict
+leaf_config_dict
+import tpot2
+import sklearn
+import sklearn.datasets
+
+est = tpot2.TPOTEstimatorSteadyState(
+ scorers=['roc_auc_ovr'], #scorers can be a list of strings or a list of scorers. These get evaluated during cross validation.
+ scorers_weights=[1],
+
+ classification=True,
+
+ max_eval_time_seconds=15,
+ max_time_seconds=30,
+ verbose=2)
+
+
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+
Evaluations: : 117it [00:30, 3.85it/s] ++
0.9974747474747474 ++
fitted_pipeline = est.fitted_pipeline_ # access best pipeline directly
+fitted_pipeline.plot()
+
#view the summary of all evaluated individuals as a pandas dataframe
+est.evaluated_individuals
+
+ | roc_auc_score | +Parents | +Variation_Function | +Individual | +Submitted Timestamp | +Completed Timestamp | +Pareto_Front | +Instance | +
---|---|---|---|---|---|---|---|---|
0 | +0.994405 | +NaN | +NaN | +['LogisticRegression_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['LogisticRegression_1'] | +
1 | +0.954484 | +NaN | +NaN | +['DecisionTreeClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['DecisionTreeClassifier_1'] | +
2 | +1.000000 | +NaN | +NaN | +['KNeighborsClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +1.0 | +['KNeighborsClassifier_1'] | +
3 | +0.994048 | +NaN | +NaN | +['GradientBoostingClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['GradientBoostingClassifier_1'] | +
4 | +0.989841 | +NaN | +NaN | +['ExtraTreesClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['ExtraTreesClassifier_1'] | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +
112 | +0.997540 | +(105, 106) | +crossover | +['MLPClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['MLPClassifier_1'] | +
113 | +0.998214 | +(15,) | +mutate | +['KNeighborsClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['KNeighborsClassifier_1'] | +
114 | +0.997619 | +(67, 67) | +crossover | +[('MLPClassifier_1', 'StandardScaler_1')] | +1.692231e+09 | +1.692231e+09 | +NaN | +[('MLPClassifier_1', 'StandardScaler_1')] | +
115 | +0.996944 | +(81,) | +mutate | +[('ExtraTreesClassifier_1', 'RBFSampler_1'), (... | +1.692231e+09 | +1.692231e+09 | +NaN | +[('ExtraTreesClassifier_1', 'RBFSampler_1'), (... | +
116 | +1.000000 | +(90, 73) | +crossover | +[('MLPClassifier_1', 'MinMaxScaler_1')] | +1.692231e+09 | +1.692231e+09 | +NaN | +[('MLPClassifier_1', 'MinMaxScaler_1')] | +
117 rows × 8 columns
+There are two ways of passing objectives into TPOT2.
+scorers
: Scorers are functions that have the signature (estimator, X, y). These can be produced with the sklearn.metrics.make_scorer function. This function is used to evaluate the test folds during cross validation. These are passed into TPOT2 via the scorers parameter. This can take in the scorer itself or the string corresponding to a scoring function (as listed here). TPOT2 also supports passing in a list of several scorers for multiobjective optimization.
other_objective_functions
: Other objective functions in TPOT2 have the signature (estimator) and return a float or a list of floats. These get passed an unfitted estimator (in the case of TPOT2, a tpot2.GraphPipeline
).
Each scorer and objective function must be accompanied by a list of weights corresponding to the list of objectives. By default, TPOT2 maximizes objective functions (this can be changed by bigger_is_better=False
). Positive weights mean that TPOT2 will seek to maximize that objective, and negative weights correspond to minimization.
Here is an example of using two scorers
+scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+scorers_weights=[1,-1],
+Here is an example with a scorer and a secondary objective function
+scorers=['roc_auc_ovr'],
+scorers_weights=[1],
+other_objective_functions=[tpot2.objectives.number_of_leaves_objective],
+other_objective_functions_weights=[-1],
+import tpot2
+import sklearn
+import sklearn.datasets
+
+est = tpot2.TPOTEstimatorSteadyState(
+ scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+ scorers_weights=[1,-1],
+
+ classification=True,
+
+ max_eval_time_seconds=15,
+ max_time_seconds=30,
+ verbose=2)
+
+
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+
Evaluations: : 143it [00:30, 4.74it/s] +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge + warnings.warn( ++
0.9934232434232434 ++
fitted_pipeline = est.fitted_pipeline_ # access best pipeline directly
+fitted_pipeline.plot() #plot the best pipeline
+
view the results of all evaluated individuals as a pandas dataframe
+est.evaluated_individuals
+
+ | roc_auc_score | +complexity_scorer | +Parents | +Variation_Function | +Individual | +Submitted Timestamp | +Completed Timestamp | +Pareto_Front | +Instance | +
---|---|---|---|---|---|---|---|---|---|
0 | +1.0 | +15.0 | +NaN | +NaN | +['LogisticRegression_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['LogisticRegression_1'] | +
1 | +0.96619 | +45.0 | +NaN | +NaN | +['DecisionTreeClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['DecisionTreeClassifier_1'] | +
2 | +0.99746 | +7.0 | +NaN | +NaN | +['KNeighborsClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['KNeighborsClassifier_1'] | +
3 | +0.996429 | +15064.0 | +NaN | +NaN | +['GradientBoostingClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['GradientBoostingClassifier_1'] | +
4 | +0.995714 | +2802.0 | +NaN | +NaN | +['ExtraTreesClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['ExtraTreesClassifier_1'] | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
138 | +0.5 | +6.0 | +(98,) | +mutate | +[('BernoulliNB_1', 'SelectFromModel_ExtraTrees... | +1.692231e+09 | +1.692231e+09 | +NaN | +[('BernoulliNB_1', 'SelectFromModel_ExtraTrees... | +
139 | +0.85496 | +1.0 | +(82, 87) | +crossover | +['MultinomialNB_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['MultinomialNB_1'] | +
140 | +0.997579 | +8210.0 | +(86,) | +mutate | +['ExtraTreesClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['ExtraTreesClassifier_1'] | +
141 | +0.979008 | +14.8 | +(98,) | +mutate | +['SGDClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['SGDClassifier_1'] | +
142 | +0.5 | +1500.0 | +(2,) | +mutate | +['XGBClassifier_1'] | +1.692231e+09 | +1.692231e+09 | +NaN | +['XGBClassifier_1'] | +
143 rows × 9 columns
+view pareto front as a pandas dataframe
+est.pareto_front
+
+ | roc_auc_score | +complexity_scorer | +Parents | +Variation_Function | +Individual | +Submitted Timestamp | +Completed Timestamp | +Pareto_Front | +Instance | +
---|---|---|---|---|---|---|---|---|---|
57 | +0.5 | +0.0 | +(56,) | +mutate | +['LogisticRegression_1'] | +1.692231e+09 | +1.692231e+09 | +1.0 | +['LogisticRegression_1'] | +
137 | +1.0 | +1.0 | +(82,) | +mutate | +[('MultinomialNB_1', 'SelectFromModel_ExtraTre... | +1.692231e+09 | +1.692231e+09 | +1.0 | +[('MultinomialNB_1', 'SelectFromModel_ExtraTre... | +
pareto_front = est.pareto_front
+
+#plot the pareto front of complexity_scorer vs roc_auc_score
+
+import matplotlib.pyplot as plt
+plt.scatter(pareto_front['complexity_scorer'], pareto_front['roc_auc_score'])
+plt.xlabel('complexity_scorer')
+plt.ylabel('roc_auc_score')
+plt.show()
+
import tpot2
+import sklearn
+import sklearn.datasets
+
+est = tpot2.TPOTEstimator( population_size=30,
+ generations=5,
+ scorers=['roc_auc_ovr'], #scorers can be a list of strings or a list of scorers. These get evaluated during cross validation.
+ scorers_weights=[1],
+ classification=True,
+ n_jobs=1,
+ early_stop=5, #how many generations with no improvement to stop after
+
+ #List of other objective functions. All objective functions take in an untrained GraphPipeline and return a score or a list of scores
+ other_objective_functions= [ ],
+
+ #List of weights for the other objective functions. Must be the same length as other_objective_functions. By default, bigger is better is set to True.
+ other_objective_functions_weights=[],
+ verbose=2)
+
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+
Generation: 100%|██████████| 5/5 [00:36<00:00, 7.20s/it] +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. + warnings.warn( ++
0.998015873015873 ++
The TPOTClassifier and TPOTRegressor set default parameters for the TPOTEstimator for classification and regression, respectively. +In the future, a metalearner will be used to predict the best values for a given dataset.
+import tpot2
+import sklearn
+import sklearn.metrics
+import sklearn.datasets
+
+est = tpot2.tpot_estimator.templates.TPOTRegressor(n_jobs=4, max_time_seconds=10)
+
+
+scorer = sklearn.metrics.get_scorer('neg_mean_squared_error')
+X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+
terminating parallel evaluation due to timeout ++
/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 177.4640355714364, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 185.86338704440277, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 277.49028848926537, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 464.01662831846625, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 554.9558355270419, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 1480.8552755513228, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2355.5063150407514, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2081.571493001771, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 3868.126368656056, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5331.3651033417555, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 6862.873289547279, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 8656.98141344823, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 4311.308985096635, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5839.020132572099, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5923.854209526442, tolerance: 143.10199053030306 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 280.8815573984757, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 643.0934690993745, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 565.7529498867225, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 639.0793324268889, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 796.3080264698947, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2132.9185444641626, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2674.6467641871423, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2568.991994333919, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 1767.4389212469105, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1605.1388315662043, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1771.0119939564029, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1812.8362937605707, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2090.934535113978, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2720.6381011917256, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 3694.640494319028, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5819.918714194559, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 8499.700911721331, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 9747.96645780711, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 8925.452311816742, tolerance: 168.2528 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 242.2927812706912, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 520.11185573088, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 270.07291585509665, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 312.4193137688562, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 334.48251612263266, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 406.0909651533002, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 393.6330031697871, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 419.26211581844836, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1105.061883097398, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1492.2850051816786, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1359.714203708456, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1543.3692570256535, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 686.7691507576965, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 567.2123847292969, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 575.1844139498426, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1196.4656488135224, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 2136.7159360550577, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 3161.7749671411, tolerance: 166.10352603773586 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 158.83327397913672, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 168.01972272712737, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 529.342575648101, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 811.6219812278869, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 601.5064170324476, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 459.8468100364553, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 171.07939504506066, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 475.0977421862772, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1609.3130913197529, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 3371.636877565179, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 4893.275803661207, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5689.945571509306, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 6327.594264068524, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 8071.667983187712, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 9214.471518416074, tolerance: 158.8069449056604 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 221.34985516022425, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 409.0736092341831, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 576.086710276315, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 759.3069202784682, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 788.3264070701553, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1851.77406217705, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1982.0810699927388, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1990.0643707137788, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1123.845644916175, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 466.2079415132757, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Duality gap: 1319.3072104484309, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1714.5370268148836, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1605.753956191009, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 5471.587720631971, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:617: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 10655.474709162605, tolerance: 159.7256437735849 + model = cd_fast.enet_coordinate_descent_gram( ++
-2514.9527497535055 ++
# Quick-start example: fit a TPOTClassifier on the digits data set and report
# its one-vs-one ROC AUC on a held-out split.
import tpot2
import sklearn
import sklearn.datasets
# `import sklearn` alone does not guarantee that these submodules are loaded;
# import them explicitly instead of relying on another package to do so.
import sklearn.metrics
import sklearn.model_selection

est = tpot2.tpot_estimator.templates.TPOTClassifier(n_jobs=4, max_time_seconds=10)

scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
X, y = sklearn.datasets.load_digits(return_X_y=True)
# 75% of the samples are used for the search, 25% are held out for scoring.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25)
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
+
terminating parallel evaluation due to timeout +0.9999694758670971 ++
Everything can be done with the TPOTEstimator class. All other classes (TPOTRegressor, TPOTClassifier, TPOTSymbolicClassifier, TPOTSymbolicRegression, TPOTGeneticFeatureSetSelector, etc.) are actually just different default settings for TPOTEstimator.
+By default, TPOT will generate pipelines with a default set of classifiers or regressors as roots (this depends on whether classification is set to true or false). All other nodes are selected from a default list of selectors and transformers. Note: This differs from the TPOT1 behavior where by default classifiers and regressors can appear in locations other than the root. You can modify the search space for leaves, inner nodes, and roots (final classifiers) separately through built-in options or custom configuration dictionaries.
+In this tutorial we will walk through using the built in configurations, creating custom configurations, and using nested configurations.
+The default configuration includes several machine learning estimators from sklearn. Sometimes we may want to change or restrict what is allowed.
+In TPOT2, we specify three different configuration dictionaries to indicate which modules can go where on the graph
+Note: TPOT1 internally divided the methods inside the configuration dictionary into selectors/transformers/estimators and treated them differently. TPOT2 does not.
+Each configuration dictionary parameter has access to the same default parameters. The default parameters can also be grouped into a list to combine their search spaces.
+Configuration dictionaries are python dictionaries where the keys are the method types and the values are optuna-compatible functions that take in a trial and return a hyperparameter dictionary.
+Configuration dictionaries can also be nested, meaning that the search space for that node will be a graph defined by the nested dictionary. More on that later in the tutorial.
+With these three types of configuration dictionaries plus nesting, one can define very specific search spaces. More on nesting later.
# A Linear pipeline starting with a selector, followed by 0 to 4 transformers, and ending with a classifier.

import tpot2
import sklearn
import sklearn.datasets
# Import the submodules used below explicitly; `import sklearn` alone does not
# guarantee that they are loaded.
import sklearn.metrics
import sklearn.model_selection

est = tpot2.TPOTEstimator(
    population_size=10,
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    root_config_dict="classifiers",
    inner_config_dict="transformers",
    leaf_config_dict="selectors",
    linear_pipeline=True,
    max_size=6,
    early_stop=5,
    verbose=0,
)

scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
X, y = sklearn.datasets.load_iris(return_X_y=True)
# 75% of the samples are used for the search, 25% are held out for scoring.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25)
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
est.fitted_pipeline_.plot()
+
1.0 ++
# A Graph pipeline starting with at least one selector as a leaf, potentially followed by a series
# of stacking classifiers or transformers, and ending with a classifier. The graph will have at most 15 nodes.

import tpot2
import sklearn
import sklearn.datasets
# Import the submodules used below explicitly; `import sklearn` alone does not
# guarantee that they are loaded.
import sklearn.metrics
import sklearn.model_selection
import numpy as np

est = tpot2.TPOTEstimator(
    population_size=10,
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    root_config_dict="classifiers",
    # A list of built-in names combines their search spaces for inner nodes.
    inner_config_dict=["classifiers", "transformers"],
    leaf_config_dict="selectors",
    max_size=15,
    early_stop=5,
    verbose=0,
)

scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
X, y = sklearn.datasets.load_iris(return_X_y=True)
# 75% of the samples are used for the search, 25% are held out for scoring.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25)
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))

est.fitted_pipeline_.plot()
+
0.9941520467836257 ++
Next, we will show how to use these features to define a graph pipeline search space similar to symbolic classification.
+The following defines a pipeline where leaves select a single feature, inner nodes perform arithmetic, and logistic regression is used as a final classifier.
+The arithmetic transformer and feature set selection of single columns are built in configurations with the "arithmetic_transformer" and "feature_set_selector" options respectively.
+There is not a built in configuration for a single logistic regression so we have to manually define one.
+To start, we create a function that takes in a trial object. This object takes in a search space, and outputs a parameter. This is designed to be compatible with the optuna trial class. More information on available functions within trial can be found here: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html
+The suggested parameters should be put into a dictionary that has the model parameters as keys with their corresponding values.
+Note: For optuna optimization to work, it is important to append '_{name}' to each of the parameter names. With large graphs, names of parameters will likely clash. The name parameter here allows TPOT2 to make sure each parameter for each node has a unique label.
+Note: This will be simplified in a future release.
+import tpot2
+import numpy as np
def params_LogisticRegression(trial, name=None):
    """Sample a hyperparameter dictionary for a logistic regression node.

    Parameters
    ----------
    trial : object
        Optuna-compatible trial; only ``suggest_categorical`` and
        ``suggest_float`` are called on it.
    name : str, optional
        Suffix appended to every suggested parameter name ('solver_{name}',
        'C_{name}', ...) so that parameters of different nodes do not clash.

    Returns
    -------
    dict
        Keyword arguments with the keys 'solver', 'penalty', 'dual',
        'multi_class', 'l1_ratio', 'C' and 'class_weight'.
    """
    solver = trial.suggest_categorical(
        name=f'solver_{name}',
        choices=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    C = trial.suggest_float(f'C_{name}', 1e-4, 1e4, log=True)

    # Defaults used by every solver except 'liblinear'.
    penalty = 'l2'
    dual = False
    if solver == 'liblinear':
        # Only 'liblinear' gets to choose its penalty, and the dual
        # formulation is only sampled together with the 'l2' penalty.
        penalty = trial.suggest_categorical(name=f'penalty_{name}', choices=['l1', 'l2'])
        if penalty == 'l2':
            dual = trial.suggest_categorical(name=f'dual_{name}', choices=[True, False])

    class_weight = trial.suggest_categorical(
        name=f'class_weight_{name}', choices=['balanced'])

    return {
        'solver': solver,
        'penalty': penalty,
        'dual': dual,
        'multi_class': 'auto',
        'l1_ratio': None,
        'C': C,
        # Previously sampled but dropped from the result; forward it so the
        # suggested value actually reaches the estimator.
        'class_weight': class_weight,
    }
+
A configuration dictionary has the Python types of the desired estimators as keys, and the corresponding hyperparameter functions as values.
+from sklearn.linear_model import LogisticRegression
+root_config_dict = { LogisticRegression : params_LogisticRegression }
+
# Symbolic-classification-style search: leaves select features, inner nodes
# apply arithmetic, and the root is taken from the custom root_config_dict.
import sklearn
import sklearn.datasets
# Import the submodules used below explicitly; `import sklearn` alone does not
# guarantee that they are loaded.
import sklearn.metrics
import sklearn.model_selection
import tpot2

est = tpot2.TPOTEstimator(
    population_size=20,
    generations=10,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    inner_config_dict="arithmetic_transformer",
    leaf_config_dict="feature_set_selector",
    root_config_dict=root_config_dict,
)

#load iris
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
X, y = sklearn.datasets.load_iris(return_X_y=True)
# 75% of the samples are used for the search, 25% are held out for scoring.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25)
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
est.fitted_pipeline_.plot()
+
/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge + warnings.warn( ++
1.0 ++
Configuration dictionaries can also be nested. If the string "Recursive" is used in place of a type, the node that would go in that place will now represent a graph with those restrictions.
+All inputs to the recursive node will be merged and input to all the leaves within the recursive graph. The output of the graph will be sent to the outputs of the node that represents it.
+This is handy for restricting the search space of the model as well as setting specific ensembling templates.
+(Currently) These are all flattened and merged into a single graph when exported as a graph pipeline. In the future these could be used for ensemble methods such as boosting/stacking/etc.
+Note that this is not a new instance of the TPOT2 estimator, and it does not independently run GP. Rather this recursive node just sets a search space restriction for that node.
+transformer_config_dictionary = "transformers"
+selector_config_dictionary = "feature_set_selector"
+classifier_config_dictionary = root_config_dict
+
+#Some example search spaces with nested graphs
+
+#pipelines of the shape selector->transformer
+st_params = {
+ 'root_config_dict':transformer_config_dictionary,
+ 'leaf_config_dict':selector_config_dictionary,
+ 'inner_config_dict': None,
+ 'max_size' : 2,
+ 'linear_pipeline' : True}
+
+#pipelines of the shape (selector->transformer) -> classifier.
+# This is equivalent to setting TPOT1 to use the 'Selector-Transformer-Classifier' template
+st_c_params = {
+ 'root_config_dict': classifier_config_dictionary,
+ 'leaf_config_dict': {"Recursive" : st_params},
+ 'inner_config_dict': None,
+ 'max_size' : 2,
+ 'linear_pipeline' : True}
+
+#pipelines of the shape (((selector->transformer) -> classifier)*N) -> classifier
+#This is like having an ensemble of 'Selector-Transformer-Classifier' models with a final meta classifier
+st_c_ensemble_params = {
+ 'root_config_dict': classifier_config_dictionary,
+ 'leaf_config_dict': {"Recursive" : st_c_params},
+ 'inner_config_dict': None,
+ 'max_size' : 6,
+ 'linear_pipeline' : True}
+
# linear pipelines of the shape selector->transformer->classifier
+est = tpot2.TPOTEstimator(population_size=20,generations=10,
+ scorers=['roc_auc_ovr'],
+ scorers_weights=[1],
+ classification=True,
+ **st_c_params,
+ )
+
+#load iris
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+est.fitted_pipeline_.plot()
+
0.9880174291938998 ++
# ensembles of linear selector->transformer->classifier pipelines, combined by a final meta classifier
+est = tpot2.TPOTEstimator(population_size=20,generations=10,
+ scorers=['roc_auc_ovr'],
+ scorers_weights=[1],
+ classification=True,
+ **st_c_ensemble_params,
+ )
+
+#load iris
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+est.fitted_pipeline_.plot()
+
/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/svm/_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. + warnings.warn( ++
0.9976851851851851 ++
The FeatureSetSelector is a subclass of sklearn.feature_selection.SelectorMixin that simply returns the manually specified columns. The parameter sel_subset specifies the names or indexes of the columns that it selects. The transform function then simply indexes and returns the selected columns. You can also optionally name the group with the name parameter, though this is only for note keeping and is not used by the class.
+sel_subset: list or int +If X is a dataframe, items in sel_subset list must correspond to column names +If X is a numpy array, items in sel_subset list must correspond to column indexes +int: index of a single column
+import tpot2
+import pandas as pd
+import numpy as np
+#make a dataframe with columns a,b,c,d,e,f
+
+#numpy array with 10 rows, each holding the values 0,1,2,3,4,5
+data = np.repeat([np.arange(6)],10,0)
+
+df = pd.DataFrame(data,columns=['a','b','c','d','e','f'])
+fss = tpot2.builtin_modules.FeatureSetSelector(name='test',sel_subset=['a','b','c'])
+
+print("original DataFrame")
+print(df)
+print("Transformed Data")
+print(fss.fit_transform(df))
+
original DataFrame + a b c d e f +0 0 1 2 3 4 5 +1 0 1 2 3 4 5 +2 0 1 2 3 4 5 +3 0 1 2 3 4 5 +4 0 1 2 3 4 5 +5 0 1 2 3 4 5 +6 0 1 2 3 4 5 +7 0 1 2 3 4 5 +8 0 1 2 3 4 5 +9 0 1 2 3 4 5 +Transformed Data +[[0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2] + [0 1 2]] ++
To use the FSS with TPOT2, you can simply pass it in to the configuration dictionary. Note that the FSS is only well defined when used in the leaf nodes of the graph. This is because downstream nodes will receive different transformations of the data such that the original indexes no longer correspond to the same columns in the raw data.
+Including the string "feature_set_selector" in the leaf_config_dict parameter will include the FSS in the search space of the pipeline. By default, each FSS node will select a single column. You can also group columns into sets so that each node selects a set of features rather than a single feature.
+subsets : str or list, default=None +Sets the subsets that the FeatureSetSelector will select from if set as an option in one of the configuration dictionaries. +- str : If a string, it is assumed to be a path to a csv file with the subsets. +The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. +- list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. +- None : If None, each column will be treated as a subset. One column will be selected per subset. +If subsets is None, each column will be treated as a subset. One column will be selected per subset.
+Let's say you want to have three groups of features, each with three columns. The following examples are equivalent:
+sel_subsets=simple_fss.csv
+# simple_fss.csv
+group_one, 1,2,3
+group_two, 4,5,6
+group_three, 7,8,9
+sel_subsets = { "group_one" : [1,2,3], +"group_two" : [4,5,6], +"group_three" : [7,8,9], +}
+sel_subsets = [[1,2,3],[4,5,6],[7,8,9]]
+(As the FSS is just another transformer, you could also pass it in with the standard configuration dictionary format (described in tutorial 2), in which case you would have to define your own function that returns the hyperparameters, similar to the params_LogisticRegression function below.)
+(In the future, FSS will be treated as a special case node with its own mutation/crossover functions to make it more efficient when there are large numbers of features.)
+import tpot2
+import sklearn.datasets
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+import pandas as pd
+
+n_features = 6
+X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=n_features, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
+X = np.hstack([X, np.random.rand(X.shape[0],3)]) #add three uninformative features
+X = pd.DataFrame(X, columns=['a','b','c','d','e','f','g','h','i'])
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+
+X.head()
+
+ | a | +b | +c | +d | +e | +f | +g | +h | +i | +
---|---|---|---|---|---|---|---|---|---|
0 | +-1.290879 | +-2.012016 | +-1.009434 | +0.083251 | +2.350751 | +-0.192295 | +0.266530 | +0.989323 | +0.207050 | +
1 | +-2.329471 | +-1.033893 | +-2.656589 | +-1.025489 | +3.015554 | +-1.106947 | +0.500059 | +0.853473 | +0.596733 | +
2 | +0.948998 | +-0.123783 | +0.530650 | +-3.025307 | +1.391029 | +1.176166 | +0.662410 | +0.945252 | +0.861687 | +
3 | +-3.265866 | +2.101229 | +5.141677 | +0.500888 | +0.613011 | +-1.470835 | +0.734725 | +0.718854 | +0.751557 | +
4 | +-2.232187 | +-0.825902 | +-1.430346 | +2.341929 | +0.845866 | +0.342470 | +0.261221 | +0.977495 | +0.732266 | +
def params_LogisticRegression(trial, name=None):
    """Sample one LogisticRegression hyperparameter set from ``trial``.

    Parameters
    ----------
    trial : object
        Hyperparameter source. It must provide
        ``suggest_categorical(name=..., choices=...)`` and
        ``suggest_float(name, low, high, log=...)``.
    name : str, optional
        Suffix appended to every suggested parameter name
        (``solver_<name>``, ``C_<name>``, ...) so that several nodes can
        draw from the same trial without their names clashing.

    Returns
    -------
    dict
        Keyword arguments with the keys ``solver``, ``penalty``, ``dual``,
        ``multi_class``, ``l1_ratio`` and ``C``.
    """
    solver = trial.suggest_categorical(
        name=f'solver_{name}',
        choices=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    )
    C = trial.suggest_float(f'C_{name}', 1e-4, 1e4, log=True)

    # Every solver except liblinear keeps the fixed l2 / primal setting;
    # penalty and dual are only searched for liblinear.
    penalty = 'l2'
    dual = False
    if solver == 'liblinear':
        penalty = trial.suggest_categorical(name=f'penalty_{name}', choices=['l1', 'l2'])
        # dual is only searched together with the l2 penalty.
        if penalty == 'l2':
            dual = trial.suggest_categorical(name=f'dual_{name}', choices=[True, False])

    # class_weight is still drawn so the trial sees the same sequence of
    # suggestions as before, but it is intentionally not part of the
    # returned dictionary.
    trial.suggest_categorical(name=f'class_weight_{name}', choices=['balanced'])

    return {
        'solver': solver,
        'penalty': penalty,
        'dual': dual,
        'multi_class': 'auto',
        'l1_ratio': None,
        'C': C,
    }
+
+
+
+root_config_dict = {LogisticRegression: params_LogisticRegression}
+
In this configuration, each FSS node considers a single column.
+The root node is a logistic regression and there are no other intermediate transformers. An additional objective function is included that seeks to minimize the number of leaf nodes (i.e., the number of selected features).
+import tpot2
+import sklearn.datasets
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+
+
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr'],
+ scorers_weights=[1],
+ other_objective_functions=[tpot2.objectives.number_of_leaves_objective],
+ other_objective_functions_weights=[-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict=None,
+ subsets=None,
+ verbose=1,
+ )
+
+
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:13<00:00, 1.52it/s] ++
0.9074667008196723 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='3', sel_subset=['d']) +FeatureSetSelector_2 : FeatureSetSelector(name='4', sel_subset=['e']) +FeatureSetSelector_3 : FeatureSetSelector(name='5', sel_subset=['f']) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=3371.8568398103916, solver='saga') +FeatureSetSelector_1 : FeatureSetSelector(name='3', sel_subset=['d']) +FeatureSetSelector_2 : FeatureSetSelector(name='4', sel_subset=['e']) +FeatureSetSelector_3 : FeatureSetSelector(name='5', sel_subset=['f']) ++
pareto_front = est.evaluated_individuals[est.evaluated_individuals['Pareto_Front'] == 1]
+
+#plot the pareto front of number_of_leaves_objective vs roc_auc_score
+
+import matplotlib.pyplot as plt
+plt.scatter(pareto_front['number_of_leaves_objective'], pareto_front['roc_auc_score'])
+plt.xlabel('Number of Selected Features')
+plt.ylabel('roc_auc_score')
+plt.show()
+
Here we include arithmetic operators in the inner nodes that can combine and transform the selected features.
+We now use the number of nodes objective to minimize the complexity of the resulting equation. This minimizes both the number of selected features and the number of arithmetic operators.
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr'],
+ scorers_weights=[1],
+ other_objective_functions=[tpot2.objectives.number_of_nodes_objective],
+ other_objective_functions_weights=[-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict="arithmetic_transformer",
+ subsets = None,
+ verbose=1,
+ )
+
+
+est.fit(X_train,y_train)
+print(sklearn.metrics.get_scorer('roc_auc_ovr')(est, X_test, y_test))
+
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:13<00:00, 1.44it/s] ++
0.9307120901639344 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='5', sel_subset=['f']) +FeatureSetSelector_2 : FeatureSetSelector(name='1', sel_subset=['b']) +FeatureSetSelector_3 : FeatureSetSelector(name='4', sel_subset=['e']) +FeatureSetSelector_4 : FeatureSetSelector(name='3', sel_subset=['d']) +FeatureSetSelector_5 : FeatureSetSelector(name='0', sel_subset=['a']) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=1.3234861148420467, solver='liblinear') +FeatureSetSelector_1 : FeatureSetSelector(name='5', sel_subset=['f']) +FeatureSetSelector_2 : FeatureSetSelector(name='1', sel_subset=['b']) +FeatureSetSelector_3 : FeatureSetSelector(name='4', sel_subset=['e']) +mul_neg_1_Transformer_1 : mul_neg_1_Transformer() +EQTransformer_1 : EQTransformer() +FeatureSetSelector_4 : FeatureSetSelector(name='3', sel_subset=['d']) +NETransformer_1 : NETransformer() +FeatureSetSelector_5 : FeatureSetSelector(name='0', sel_subset=['a']) ++
pareto_front = est.evaluated_individuals[est.evaluated_individuals['Pareto_Front'] == 1]
+
+#plot the pareto front of number_of_nodes_objective vs roc_auc_score
+
+plt.scatter(pareto_front['number_of_nodes_objective'], pareto_front['roc_auc_score'])
+plt.xlabel('Number of Nodes')
+plt.ylabel('roc_auc_score')
+plt.show()
+
import tpot2
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import sklearn
+
+subsets = { "group_one" : ['a','b','c'],
+ "group_two" : ['d','e','f'],
+ "group_three" : ['g','h','i'],
+ }
+
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+ scorers_weights=[1,-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict="transformers",
+ subsets = subsets,
+ verbose=1,
+ )
+
+
+est.fit(X_train,y_train)
+print(sklearn.metrics.get_scorer('roc_auc_ovr')(est, X_test, y_test))
+
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:26<00:00, 1.31s/it] +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge + warnings.warn( ++
0.9699667008196722 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='group_one', sel_subset=['a', 'b', 'c']) +FeatureSetSelector_2 : FeatureSetSelector(name='group_two', sel_subset=['d', 'e', 'f']) +FeatureSetSelector_3 : FeatureSetSelector(name='group_three', sel_subset=['g', 'h', 'i']) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=0.06776401610163652, solver='saga') +FeatureSetSelector_1 : FeatureSetSelector(name='group_one', sel_subset=['a', 'b', 'c']) +PolynomialFeatures_1 : PolynomialFeatures(include_bias=False) +FeatureSetSelector_2 : FeatureSetSelector(name='group_two', sel_subset=['d', 'e', 'f']) +MaxAbsScaler_1 : MaxAbsScaler() +PCA_1 : PCA(n_components=0.9574868087370769) +FeatureSetSelector_3 : FeatureSetSelector(name='group_three', sel_subset=['g', 'h', 'i']) +MaxAbsScaler_2 : MaxAbsScaler() ++
import tpot2
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import sklearn
+
+subsets = [['a','b','c'],['d','e','f'],['g','h','i']]
+
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+ scorers_weights=[1,-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict="transformers",
+ subsets = subsets,
+ verbose=1,
+ )
+
+
+est.fit(X_train,y_train)
+print(sklearn.metrics.get_scorer('roc_auc_ovr')(est, X_test, y_test))
+
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:21<00:00, 1.07s/it] ++
0.9712474385245903 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='1', sel_subset=['d', 'e', 'f']) +FeatureSetSelector_2 : FeatureSetSelector(name='0', sel_subset=['a', 'b', 'c']) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=0.01924346331466653) +PolynomialFeatures_1 : PolynomialFeatures(include_bias=False) +OneHotEncoder_1 : OneHotEncoder() +FeatureSetSelector_1 : FeatureSetSelector(name='1', sel_subset=['d', 'e', 'f']) +FeatureSetSelector_2 : FeatureSetSelector(name='0', sel_subset=['a', 'b', 'c']) +FastICA_1 : FastICA(whiten='unit-variance') +PolynomialFeatures_2 : PolynomialFeatures(include_bias=False) ++
note: watch for spaces in the csv file!
+import tpot2
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import sklearn
+
+subsets = 'simple_fss.csv'
+'''
+# simple_fss.csv
+one,a,b,c
+two,d,e,f
+three,g,h,i
+'''
+
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+ scorers_weights=[1,-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict="transformers",
+ subsets = subsets,
+ verbose=1,
+ )
+
+
+est.fit(X_train,y_train)
+print(sklearn.metrics.get_scorer('roc_auc_ovr')(est, X_test, y_test))
+
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:46<00:00, 2.34s/it] +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge + warnings.warn( ++
0.9678534836065574 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='two', sel_subset=['d', 'e', 'f']) +FeatureSetSelector_2 : FeatureSetSelector(name='one', sel_subset=['a', 'b', 'c']) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=90.92104183243647, solver='saga') +FeatureSetSelector_1 : FeatureSetSelector(name='two', sel_subset=['d', 'e', 'f']) +FeatureSetSelector_2 : FeatureSetSelector(name='one', sel_subset=['a', 'b', 'c']) +RBFSampler_1 : RBFSampler(gamma=0.9480907031133559) +Binarizer_1 : Binarizer(threshold=0.5204447023562712) +RBFSampler_2 : RBFSampler(gamma=0.07182739023710172) +MaxAbsScaler_1 : MaxAbsScaler() ++
Note that all of the above works the same way when X is a numpy array, except that the columns are now referred to by integer indices.
+import tpot2
+import sklearn.datasets
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+import pandas as pd
+
+n_features = 6
+X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=n_features, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
+X = np.hstack([X, np.random.rand(X.shape[0],3)]) #add three uninformative features
+
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+
+print(X)
+
[[ 0.03418023 1.85703799 1.3321493 ... 0.61740176 0.03615026 + 0.73457701] + [ 0.00655906 0.3495084 -2.86361395 ... 0.27195435 0.52330367 + 0.47208072] + [ 1.84952258 -0.98538028 0.60941956 ... 0.14054112 0.77081219 + 0.17160637] + ... + [ 0.02282946 0.55489649 -2.89758703 ... 0.04122268 0.66234341 + 0.76367281] + [-1.34268913 2.73488335 -1.82542106 ... 0.59224411 0.94857147 + 0.20810423] + [-0.46791145 2.53228934 -2.08802875 ... 0.82326686 0.23363656 + 0.77884819]] ++
import tpot2
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import sklearn
+
+subsets = { "group_one" : [0,1,2],
+ "group_two" : [3,4,5],
+ "group_three" : [6,7,8],
+ }
+
+est = tpot2.TPOTEstimator(population_size=40,generations=20,
+ scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer],
+ scorers_weights=[1,-1],
+ n_jobs=32,
+ classification=True,
+ leaf_config_dict="feature_set_selector",
+ root_config_dict=root_config_dict,
+ inner_config_dict="transformers",
+ subsets = subsets,
+ verbose=1,
+ )
+
+
+est.fit(X_train,y_train)
+print(sklearn.metrics.get_scorer('roc_auc_ovr')(est, X_test, y_test))
+
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 20/20 [00:44<00:00, 2.22s/it] +/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge + warnings.warn( ++
0.9830226151579218 ++
# print the selected features for each FSS
+
+#get leaves
+leaves = [v for v, d in est.fitted_pipeline_.graph.out_degree() if d == 0]
+for l in leaves:
+ print(l, " : ", est.fitted_pipeline_.graph.nodes[l]['instance'])
+
FeatureSetSelector_1 : FeatureSetSelector(name='group_one', sel_subset=[0, 1, 2]) +FeatureSetSelector_2 : FeatureSetSelector(name='group_two', sel_subset=[3, 4, 5]) +FeatureSetSelector_3 : FeatureSetSelector(name='group_three', sel_subset=[6, 7, 8]) ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=0.13013559430004598, solver='sag') +FeatureSetSelector_1 : FeatureSetSelector(name='group_one', sel_subset=[0, 1, 2]) +PCA_1 : PCA(n_components=0.9988096714708292) +PolynomialFeatures_1 : PolynomialFeatures(include_bias=False) +FeatureSetSelector_2 : FeatureSetSelector(name='group_two', sel_subset=[3, 4, 5]) +FeatureSetSelector_3 : FeatureSetSelector(name='group_three', sel_subset=[6, 7, 8]) +Normalizer_1 : Normalizer(norm='max') +RBFSampler_1 : RBFSampler(gamma=0.17772815448977386) ++
The following configurations allow TPOT2 to learn a symbolic classification or regression model.
+Leaves: leaf nodes can either select individual columns or output 1's or 0's.
+Inner nodes: arithmetic operators
+Root: logistic regression
+Symbolic Classification
+import tpot2
+import sklearn.datasets
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+from tpot2.builtin_modules import ZeroTransformer, OneTransformer
+from tpot2.config.classifiers import params_LogisticRegression
+
+root_config_dict = {LogisticRegression: params_LogisticRegression}
+leaf_config_dict = ["feature_set_selector", {ZeroTransformer: {}, OneTransformer: {}}]
+
+
+est = tpot2.TPOTEstimator(population_size=100,generations=50,
+ scorers=['roc_auc'],
+ scorers_weights=[1],
+ other_objective_functions=[tpot2.objectives.number_of_nodes_objective],
+ other_objective_functions_weights=[-1],
+ classification=True,
+ inner_config_dict= "arithmetic_transformer",
+ leaf_config_dict=leaf_config_dict,
+ root_config_dict=root_config_dict,
+ n_jobs=32,
+ verbose=1,
+ )
+
+#generate a synthetic binary classification dataset
+scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=100, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 50/50 [01:59<00:00, 2.39s/it] ++
0.8397174152569836 ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
LogisticRegression_1 : LogisticRegression(C=282.83015030119856, max_iter=1000, n_jobs=1, solver='sag') +FeatureSetSelector_1 : FeatureSetSelector(name='50', sel_subset=[50]) +FeatureSetSelector_2 : FeatureSetSelector(name='16', sel_subset=[16]) +MaxTransformer_1 : MaxTransformer() +LTTransformer_1 : LTTransformer() +FeatureSetSelector_3 : FeatureSetSelector(name='42', sel_subset=[42]) +FeatureSetSelector_4 : FeatureSetSelector(name='21', sel_subset=[21]) +MaxTransformer_2 : MaxTransformer() +LTTransformer_2 : LTTransformer() +MulTransformer_1 : MulTransformer() ++
pareto_front = est.evaluated_individuals[est.evaluated_individuals['Pareto_Front'] == 1]
+
+#plot the pareto front of number_of_nodes_objective vs roc_auc_score
+import matplotlib.pyplot as plt
+plt.scatter(pareto_front['number_of_nodes_objective'], pareto_front['roc_auc_score'])
+plt.xlabel('Number of Nodes')
+plt.ylabel('roc_auc_score')
+plt.show()
+
Symbolic Regression
+import tpot2
+import sklearn.datasets
+from sklearn.linear_model import SGDRegressor
+import numpy as np
+from tpot2.builtin_modules import ZeroTransformer, OneTransformer
+from tpot2.config.regressors import params_SGDRegressor
+
+root_config_dict = {SGDRegressor: params_SGDRegressor}
+leaf_config_dict = ["feature_set_selector", {ZeroTransformer: {}, OneTransformer: {}}]
+
+
+est = tpot2.TPOTEstimator(population_size=100,generations=50,
+ scorers=['neg_mean_squared_error'],
+ scorers_weights=[1],
+ other_objective_functions=[tpot2.objectives.number_of_nodes_objective],
+ other_objective_functions_weights=[-1],
+ n_jobs=32,
+ classification=False,
+ inner_config_dict= "arithmetic_transformer",
+ leaf_config_dict=leaf_config_dict,
+ root_config_dict=root_config_dict,
+ verbose=1,
+ processes=False,
+ )
+
+
+scorer = sklearn.metrics.get_scorer('neg_mean_squared_error')
+X, y = sklearn.datasets.make_regression(n_samples=1000, n_features=100, n_informative=6)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+est.fit(X_train, y_train)
+print(scorer(est, X_test, y_test))
+est.fitted_pipeline_.plot()
+
Generation: 100%|██████████| 50/50 [02:24<00:00, 2.89s/it] ++
-53.572578179092396 ++
# print all hyperparameters
+for n in est.fitted_pipeline_.graph.nodes:
+ print(n, " : ", est.fitted_pipeline_.graph.nodes[n]['instance'])
+
SGDRegressor_1 : SGDRegressor(alpha=1.6814005088136593e-05, eta0=0.6868335822696461, + fit_intercept=False, l1_ratio=0.5144783118066449, + learning_rate='constant', loss='huber', penalty='elasticnet', + power_t=5.487407069184651) +FeatureSetSelector_1 : FeatureSetSelector(name='34', sel_subset=[34]) +FeatureSetSelector_2 : FeatureSetSelector(name='17', sel_subset=[17]) +FeatureSetSelector_3 : FeatureSetSelector(name='16', sel_subset=[16]) +FeatureSetSelector_4 : FeatureSetSelector(name='3', sel_subset=[3]) +FeatureSetSelector_5 : FeatureSetSelector(name='19', sel_subset=[19]) +ZeroTransformer_1 : ZeroTransformer() ++
pareto_front = est.evaluated_individuals[est.evaluated_individuals['Pareto_Front'] == 1]
+
+#plot the pareto front of number_of_nodes_objective vs mean_squared_error
+import matplotlib.pyplot as plt
+plt.scatter(pareto_front['number_of_nodes_objective'], pareto_front['mean_squared_error'])
+plt.xlabel('Number of Nodes')
+plt.ylabel('neg_mean_squared_error')
+plt.show()
+
GraphPipelines work similarly to the sklearn Pipeline class. Rather than provide a list of steps, in GraphPipeline you provide a graph of steps using networkx. In GraphPipeline, parents get their inputs from their children. Leaves get the raw inputs (X,y).
+The label of the nodes can be anything, but is unique per instance of an sklearn estimator. Each node has an attribute "instance" for the instance of the step.
+By default, the root of the resulting tree will become the final estimator/classifier/transformer.
+from sklearn.svm import SVC
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+import networkx as nx
+from tpot2 import GraphPipeline
+import sklearn.metrics
+
+X, y = make_classification(random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+ random_state=0)
+
+
+g = nx.DiGraph()
+
+g.add_node("scaler", instance=StandardScaler())
+g.add_node("svc", instance=SVC())
+g.add_node("LogisticRegression", instance=LogisticRegression())
+g.add_node("LogisticRegression2", instance=LogisticRegression())
+
+g.add_edge("svc","scaler")
+g.add_edge("LogisticRegression", "scaler")
+g.add_edge("LogisticRegression2", "LogisticRegression")
+g.add_edge("LogisticRegression2", "svc")
+
+
+est = GraphPipeline(g)
+est.plot()
+
+est.fit(X_train, y_train)
+print("score")
+print(sklearn.metrics.roc_auc_score(y_test, est.predict_proba(X_test)[:,1]))
+
score +0.8974358974358974 ++
access nodes through their labels
+svc = est.graph.nodes["svc"]["instance"]
+
Welcome to this Jupyter Notebook tutorial on parameters relating to computational resources. In this tutorial, we will cover the following parameters:
+population_size
initial_population_size
population_scaling
generations_until_end_population
budget_range
generations_until_end_budget
budget_scaling
stepwise_steps
Population size is the number of individuals evaluated each generation. Budget refers to the proportion of data to sample. By manipulating these parameters, we can control how quickly the budget increases and how population size changes over time. Most often, this will be used to start the algorithm by evaluating a large number of pipelines on small subsets of the data to quickly narrow down the best models, before later getting a better estimate with larger samples on fewer pipelines. This can reduce overall computational cost by not spending as much time evaluating poor performing pipelines.
+population_size
determines the number of individuals to evaluate each generation. Sometimes we may want to evaluate more or fewer individuals in the earlier generations. The initial_population_size
parameter specifies the starting size of the population. The population size will gradually move from initial_population_size
to population_size
over the course of generations_until_end_population
generations. population_scaling
dictates how fast that scaling takes place. The interpolation over generations_until_end_population
is done stepwise with the number of steps specified by stepwise_steps
.
The same process goes for the budget scaling.
+The following cell illustrates how the population size and budget change over time with the given settings.
import matplotlib.pyplot as plt
import tpot2

# Population schedule: start at initial_population_size and move to
# population_size over generations_until_end_population generations.
population_size = 60
initial_population_size = 100
population_scaling = .5
generations_until_end_population = 50

# Budget schedule: proportion of the data to sample, moving from
# budget_range[0] to budget_range[1] over generations_until_end_budget
# generations.
budget_range = [.3, 1]
generations_until_end_budget = 50
budget_scaling = .5

# Both interpolations are done stepwise with this many steps.
stepwise_steps = 5

# Population size on the left axis, budget on a second y-axis that
# shares the same generation x-axis.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

interpolated_values_population = tpot2.utils.beta_interpolation(
    start=initial_population_size,
    end=population_size,
    n=generations_until_end_population,
    n_steps=stepwise_steps,
    scale=population_scaling,
)
interpolated_values_budget = tpot2.utils.beta_interpolation(
    start=budget_range[0],
    end=budget_range[1],
    n=generations_until_end_budget,
    n_steps=stepwise_steps,
    scale=budget_scaling,
)

ax1.step(list(range(len(interpolated_values_population))), interpolated_values_population, label="population size")
ax2.step(list(range(len(interpolated_values_budget))), interpolated_values_budget, label="budget", color='r')
ax1.set_xlabel("generation")
ax1.set_ylabel("population size")
ax2.set_ylabel("budget")

# Place both legends outside the plot area, to the right of the second axis.
ax1.legend(loc='center left', bbox_to_anchor=(1.1, 0.4))
ax2.legend(loc='center left', bbox_to_anchor=(1.1, 0.3))
plt.show()
+
# A Graph pipeline starting with at least one selector as a leaf, potentially followed by a series
+# of stacking classifiers or transformers, and ending with a classifier. The graph will have at most 15 nodes and a max depth of 6.
+
+import tpot2
+import sklearn
+import sklearn.datasets
+import numpy as np
+import time
+import tpot2
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import sklearn
+
X, y = sklearn.datasets.load_iris(return_X_y=True)

est = tpot2.TPOTEstimator(
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    # Search space: selectors at the leaves, transformers in between and a
    # classifier at the root.
    root_config_dict="classifiers",
    inner_config_dict=["transformers"],
    leaf_config_dict="selectors",
    n_jobs=32,
    cv=2,
    max_eval_time_seconds=30,

    # Population size schedule.
    population_size=population_size,
    initial_population_size=initial_population_size,
    population_scaling=population_scaling,
    generations_until_end_population=generations_until_end_population,

    # Budget schedule. budget_scaling and stepwise_steps are forwarded
    # explicitly so the run follows the same schedule that was plotted,
    # instead of relying on the estimator's defaults happening to match.
    budget_range=budget_range,
    generations_until_end_budget=generations_until_end_budget,
    budget_scaling=budget_scaling,
    stepwise_steps=stepwise_steps,
    verbose=0,
)

# Time the whole search.
start = time.time()
est.fit(X, y)
print(f"total time: {time.time()-start}")
+
2023-06-14 11:49:45,920 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-q6nay1zr', purging +2023-06-14 11:49:45,921 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-wni1q2fv', purging +2023-06-14 11:49:45,921 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-kunoeg91', purging +2023-06-14 11:49:45,921 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-40sr99dr', purging +2023-06-14 11:49:45,922 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_b9njy2q', purging +2023-06-14 11:49:45,922 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-qft6b6eq', purging +2023-06-14 11:49:45,922 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-cgnqe8s_', purging +2023-06-14 11:49:45,922 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-mcu4ugbz', purging +2023-06-14 11:49:45,923 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-za145tll', purging +2023-06-14 11:49:45,923 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-3qdbpmh_', purging +2023-06-14 11:49:45,923 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-54ch2nwd', purging +2023-06-14 11:49:45,923 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-0zc92jfw', purging +2023-06-14 11:49:45,923 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-ub9p6598', purging +2023-06-14 11:49:45,924 - distributed.diskutils - INFO - Found stale lock file and directory 
'/tmp/dask-worker-space/worker-8peu6bbu', purging +2023-06-14 11:49:45,924 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-1qp5dr29', purging +2023-06-14 11:49:45,924 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-hfh3inka', purging +2023-06-14 11:49:45,924 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-b1yl5oa1', purging +2023-06-14 11:49:45,924 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-epp_nuw_', purging +2023-06-14 11:49:45,925 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-q1qdqc8g', purging +2023-06-14 11:49:45,925 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-ek1b28f4', purging +2023-06-14 11:49:45,925 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-1806jovl', purging +2023-06-14 11:49:45,925 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-p0cuouft', purging +2023-06-14 11:49:45,925 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-wh0g6edf', purging +2023-06-14 11:49:45,926 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-1o1ws1of', purging +2023-06-14 11:49:45,926 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_zh96wch', purging +2023-06-14 11:49:45,926 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-wd9vzw4h', purging +2023-06-14 11:49:45,926 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-jy7obwb9', purging +2023-06-14 11:49:45,926 - distributed.diskutils - INFO - Found stale lock 
file and directory '/tmp/dask-worker-space/worker-f6ildiiw', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-4ddayasf', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-fn6vfz6t', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-eyc403bk', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-fr7a5y2z', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-9kejqh6s', purging +2023-06-14 11:49:45,927 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_xaoujzg', purging +2023-06-14 11:49:45,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-zimc_s51', purging +2023-06-14 11:49:45,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-vtsv2zit', purging +2023-06-14 11:49:45,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-hj0s47vd', purging +2023-06-14 11:49:45,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-dpti5p3r', purging +2023-06-14 11:49:45,928 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-4cplddft', purging +2023-06-14 11:49:45,929 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-poszaeet', purging +2023-06-14 11:49:45,929 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-cjx6kkgn', purging +2023-06-14 11:49:45,929 - distributed.diskutils - INFO - 
Found stale lock file and directory '/tmp/dask-worker-space/worker-u096a9iq', purging +2023-06-14 11:49:45,929 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-0k3omqwi', purging +2023-06-14 11:49:45,929 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-zk0s6ywn', purging +2023-06-14 11:49:45,930 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-bwn757sx', purging +2023-06-14 11:49:45,930 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-2nu35fgy', purging +2023-06-14 11:49:45,930 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-w6b4di6m', purging +2023-06-14 11:49:45,930 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-asj0iobm', purging +2023-06-14 11:49:45,930 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-gxxzxsyi', purging +2023-06-14 11:49:45,931 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-qa8099ky', purging +2023-06-14 11:49:45,931 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_uypy41h', purging +2023-06-14 11:49:45,931 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-a4ujlka7', purging +2023-06-14 11:49:45,931 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-dwhz05x3', purging +2023-06-14 11:49:45,931 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-kgug_o6d', purging +2023-06-14 11:49:45,932 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-rnbpw5ka', purging +2023-06-14 11:49:45,932 - 
distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-i52qfiid', purging +2023-06-14 11:49:45,932 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_5el2wab', purging +2023-06-14 11:49:45,932 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-mqhhdxip', purging +2023-06-14 11:49:45,932 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-i6xplvqh', purging +2023-06-14 11:49:45,933 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_dmc4eb5', purging +2023-06-14 11:49:45,933 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-mok5p0dw', purging +2023-06-14 11:49:45,933 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-ugwiqoc3', purging +2023-06-14 11:49:45,933 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-e97he6cf', purging +2023-06-14 11:49:45,933 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-an5jredd', purging ++
total time: 17.05474090576172 ++
Tutorial on early termination of evaluating CV scores.
+We can further reduce computational load by terminating the evaluation of individual pipelines early if the first few CV scores are not promising. Note that this is different than early stopping of the full algorithm. In this section we will cover:
+threshold_evaluation_early_stop
threshold_evaluation_scaling
min_history_threshold
selection_evaluation_early_stop
selection_evaluation_scaling
Threshold early stopping uses previous scores to identify and terminate the cross validation evaluation of poorly performing pipelines. We calculate the percentile scores from the previously evaluated pipelines. A pipeline must reach the given percentile each fold for the next to be evaluated, otherwise the pipeline is discarded.
+The threshold_evaluation_early_stop
parameter is a list that specifies the starting and ending percentiles to use as a threshold for the evaluation early stopping. The threshold_evaluation_scaling
parameter is a float that controls the rate at which the threshold moves from the start to end percentile. The min_history_threshold
parameter specifies the minimum number of previous scores needed before using threshold early stopping. This ensures that the algorithm has enough historical data to make an informed decision about when to stop evaluating pipelines.
Selection early stopping uses a selection algorithm after each fold to select which algorithms will be evaluated for the next fold. For example, after evaluating 100 individuals on fold 1, we may want to only evaluate the best 50 for the remaining folds.
+The selection_evaluation_early_stop
parameter is a list that specifies the lower and upper percentage of the population size to select each round of CV. This is used to determine which individuals to evaluate in the next generation. The selection_evaluation_scaling
parameter is a float that controls the rate at which the selection threshold moves from the start to end percentile.
By manipulating these parameters, we can control how the algorithm selects individuals to evaluate in the next generation and when to stop evaluating pipelines that are not performing well.
+In practice, the values of these parameters will depend on the specific problem and the available computational resources.
+In the following sections, we will show you how to set and manipulate these parameters using Python code in a Jupyter Notebook. We will also provide examples of how these parameters can affect the performance of the algorithm.
+import matplotlib.pyplot as plt
+import tpot2
+
# Threshold early stopping: the percentile a pipeline has to reach on each
# fold moves from the first to the last entry of the list.
threshold_evaluation_early_stop = [30, 90]
threshold_evaluation_scaling = .5
cv = 5

# One interpolation step per fold.
interpolated_values = tpot2.utils.beta_interpolation(
    start=threshold_evaluation_early_stop[0],
    end=threshold_evaluation_early_stop[-1],
    n=cv,
    n_steps=cv,
    scale=threshold_evaluation_scaling,
)

# Plot the threshold percentile against the fold index.
fig, ax1 = plt.subplots()
ax1.step(
    list(range(len(interpolated_values))),
    interpolated_values,
    label="threshold",
)
ax1.set_xlabel("fold")
ax1.set_ylabel("percentile")
plt.show()
+
# Same search space as before, now with threshold early stopping enabled.
est = tpot2.TPOTEstimator(
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    root_config_dict="classifiers",
    inner_config_dict=["transformers"],
    leaf_config_dict="selectors",
    n_jobs=32,
    cv=cv,
    # Budget scaling is left off here; uncomment to combine it with the
    # threshold early stopping:
    # budget_range = [.3,1],
    # generations_until_end_budget=4,
    threshold_evaluation_early_stop=threshold_evaluation_early_stop,
    threshold_evaluation_scaling=threshold_evaluation_scaling,
    verbose=0,
)

# Time the whole search.
start = time.time()
est.fit(X, y)
elapsed = time.time() - start
print(f"total time: {elapsed}")
+
total time: 23.97980833053589 ++
import matplotlib.pyplot as plt
+import tpot2
+
# Selection early stopping: the fraction of the population kept after each
# fold moves from the first to the last entry of the list.
selection_evaluation_early_stop = [.1, 1]
selection_evaluation_scaling = .5
cv = 5

# One interpolation step per fold.
interpolated_values = tpot2.utils.beta_interpolation(
    start=selection_evaluation_early_stop[0],
    end=selection_evaluation_early_stop[-1],
    n=cv,
    n_steps=cv,
    scale=selection_evaluation_scaling,
)

# Plot the selected fraction against the fold index.
fig, ax1 = plt.subplots()
ax1.step(
    list(range(len(interpolated_values))),
    interpolated_values,
    label="threshold",
)
ax1.set_xlabel("fold")
ax1.set_ylabel("percent to select")
plt.show()
+
# Same search space as before, now with selection early stopping enabled.
est = tpot2.TPOTEstimator(
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    root_config_dict="classifiers",
    inner_config_dict=["transformers"],
    leaf_config_dict="selectors",
    n_jobs=32,
    cv=cv,
    selection_evaluation_early_stop=selection_evaluation_early_stop,
    selection_evaluation_scaling=selection_evaluation_scaling,
    verbose=0,
)

# Time the whole search.
start = time.time()
est.fit(X, y)
elapsed = time.time() - start
print(f"total time: {elapsed}")
+
total time: 23.03678798675537 ++
All of the above methods can be used independently or simultaneously as done below:
+import math
+np.array([1.2,3.4,1])
+
array([1.2, 3.4, 1. ])+
# All of the load-reduction techniques from this tutorial combined.
est = tpot2.TPOTEstimator(
    generations=5,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    classification=True,
    root_config_dict="classifiers",
    inner_config_dict=["transformers"],
    leaf_config_dict="selectors",
    n_jobs=32,
    cv=cv,

    # Population size schedule.
    population_size=population_size,
    initial_population_size=initial_population_size,
    population_scaling=population_scaling,
    generations_until_end_population=generations_until_end_population,

    # Budget schedule. budget_scaling and stepwise_steps are forwarded
    # explicitly so the run follows the schedule configured earlier instead
    # of relying on the estimator's defaults happening to match.
    budget_range=budget_range,
    generations_until_end_budget=generations_until_end_budget,
    budget_scaling=budget_scaling,
    stepwise_steps=stepwise_steps,

    # Threshold early stopping within cross validation.
    threshold_evaluation_early_stop=threshold_evaluation_early_stop,
    threshold_evaluation_scaling=threshold_evaluation_scaling,

    # Selection early stopping within cross validation.
    selection_evaluation_early_stop=selection_evaluation_early_stop,
    selection_evaluation_scaling=selection_evaluation_scaling,

    verbose=0,
)

# Time the whole search.
start = time.time()
est.fit(X, y)
print(f"total time: {time.time()-start}")
+
/home/ribeirop/miniconda3/envs/tpot2env/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. + warnings.warn( ++
total time: 36.7981653213501 ++
This tutorial covers advanced setups for parallelizing TPOT2 with Dask. If you just want to parallelize TPOT2 within a single computer with multiple processes, set the n_jobs parameter to the number of threads you want to use and skip this tutorial.
+TPOT2 uses Dask for parallelization and defaults to using a dask.distributed.LocalCluster for local parallelization. A user can pass in a custom Dask client or cluster for advanced usage. For example, a multi-node parallelization is possible using the dask-jobqueue package.
+When running TPOT2 from a .py script, it is important to protect code with if __name__=="__main__":
This is due to how parallelization is handled in Python. In short, when Python spawns new processes, each new process reimports code from the relevant .py files, including rerunning code. The context under if __name__=="__main__":
ensures the code under it is executed only by the main process, and only once. More info here.
#my_analysis.py
+
+from dask.distributed import Client, LocalCluster
+import tpot2
+import sklearn
+import sklearn.datasets
+import numpy as np
+
if __name__ == "__main__":
    # Everything that launches workers sits under this guard so that it runs
    # only in the main process and not again in every spawned worker.
    scorer = sklearn.metrics.get_scorer('roc_auc_ovr')

    # Hold out a quarter of the digits data for the final score.
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X,
        y,
        train_size=0.75,
        test_size=0.25,
    )

    est = tpot2.TPOTEstimatorSteadyState(
        n_jobs=10,
        memory_limit="4GB",
        classification=True,
        max_eval_time_seconds=60,
        max_time_seconds=120,
        scorers=['roc_auc_ovr'],
        scorers_weights=[1],
        verbose=1,
    )
    est.fit(X_train, y_train)

    held_out_score = scorer(est, X_test, y_test)
    print(held_out_score)
+
Evaluations: : 242it [02:01, 1.99it/s] ++
0.9995194086144522 ++
TPOT2 can be easily parallelized on a local computer by setting the n_jobs and memory_limit parameters.
+n_jobs
dictates how many dask workers to launch. In TPOT2 this corresponds to the number of pipelines to evaluate in parallel.
memory_limit
is the amount of RAM to use per worker.
import tpot2
+import sklearn
+import sklearn.datasets
+import numpy as np
scorer = sklearn.metrics.get_scorer('roc_auc_ovr')

# Hold out a quarter of the digits data for the final score.
X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
)

# n_jobs sets how many dask workers are launched; memory_limit is the RAM
# available to each of them.
est = tpot2.TPOTEstimatorSteadyState(
    n_jobs=10,
    memory_limit="4GB",
    classification=True,
    max_eval_time_seconds=60,
    max_time_seconds=120,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    verbose=1,
)
est.fit(X_train, y_train)

held_out_score = scorer(est, X_test, y_test)
print(held_out_score)
+
Evaluations: : 224it [02:00, 1.86it/s] ++
0.9996005895289903 ++
You can also manually initialize a dask client. This can be useful to gain additional control over the parallelization, debugging, as well as viewing a dashboard of the live performance of TPOT2.
+You can find more details in the official documentation here.
+Dask Python Tutorial +Dask Dashboard
+Note that if a client is passed in manually, TPOT will ignore n_jobs and memory_limit. +If there is no client passed in, TPOT will ignore any global/existing client and create its own.
+Initializing a basic dask local cluster
+from dask.distributed import Client, LocalCluster
+
n_jobs = 4
memory_limit = "4GB"

# Start a local cluster with n_jobs single-threaded workers, each limited to
# memory_limit, and connect a client to it.
cluster = LocalCluster(
    n_workers=n_jobs,
    threads_per_worker=1,
    memory_limit=memory_limit,
)
client = Client(cluster)
+
Get the link to view the dask Dashboard.
+client.dashboard_link
+
'http://127.0.0.1:8787/status'+
try:
    # Parallelism is taken from the supplied client, so n_jobs and
    # memory_limit are not passed to the estimator.
    est = tpot2.TPOTEstimatorSteadyState(
        client=client,
        classification=True,
        max_eval_time_seconds=60,
        max_time_seconds=120,
        scorers=['roc_auc_ovr'],
        scorers_weights=[1],
        verbose=1,
    )
    est.fit(X_train, y_train)
    print(scorer(est, X_test, y_test))
finally:
    # Close the client and cluster when done with them, even if fitting
    # failed, so no worker processes are left behind.
    client.close()
    cluster.close()
+
Evaluations: : 119it [02:01, 1.02s/it] ++
0.9988827327847432 ++
2023-08-23 13:49:06,747 - distributed.nanny - WARNING - Worker process still alive after 3.1999992370605472 seconds, killing +2023-08-23 13:49:06,748 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing +2023-08-23 13:49:06,748 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing ++
Option 2
+You can initialize the cluster and client with a context manager that will automatically close them.
+from dask.distributed import Client, LocalCluster
+import tpot2
+import sklearn
+import sklearn.datasets
+import numpy as np
+
scorer = sklearn.metrics.get_scorer('roc_auc_ovr')

# Hold out a quarter of the digits data for the final score.
X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
)

n_jobs = 4
memory_limit = "4GB"

# The context managers close the client and the cluster automatically, also
# when an exception is raised inside the block.
with LocalCluster(
    n_workers=n_jobs,
    threads_per_worker=1,
    memory_limit=memory_limit,  # use the variable instead of repeating '4GB'
) as cluster, Client(cluster) as client:
    # n_jobs / memory_limit are deliberately not passed to the estimator:
    # they are ignored when a client is supplied, and the previous n_jobs=10
    # contradicted the 4-worker cluster configured above.
    est = tpot2.TPOTEstimatorSteadyState(
        client=client,
        classification=True,
        max_eval_time_seconds=60,
        max_time_seconds=120,
        scorers=['roc_auc_ovr'],
        scorers_weights=[1],
        verbose=1,
    )
    est.fit(X_train, y_train)
    print(scorer(est, X_test, y_test))
+
Evaluations: : 132it [02:00, 1.10it/s] ++
0.999973663151898 ++
2023-08-23 13:51:14,527 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing +2023-08-23 13:51:14,528 - distributed.nanny - WARNING - Worker process still alive after 3.19999984741211 seconds, killing ++
Dask can parallelize across multiple nodes via job queueing systems. This is done using the Dask-Jobqueue package. More information can be found in the official documentation here.
+To parallelize TPOT2 with Dask-Jobqueue, simply pass in a client based on a Jobqueue cluster with desired settings into the client parameter. Each job will evaluate a single pipeline.
+Note that TPOT will ignore n_jobs and memory_limit as these should be set inside the Dask cluster.
+The following example is specific to the Sun Grid Engine. Other supported clusters can be found in the Dask-Jobqueue documentation here
+from dask.distributed import Client, LocalCluster
+import sklearn
+import sklearn.datasets
+import sklearn.metrics
+import sklearn.model_selection
+import tpot2
+from dask_jobqueue import SGECluster # or SLURMCluster, PBSCluster, etc. Replace SGE with your scheduler.
+import os
+
import shutil

# shutil.which() looks qsub up on PATH without spawning a shell and without
# echoing the resolved path, unlike os.system("which qsub").
if shutil.which("qsub") is None:
    print("Sun Grid Engine is not installed. This example requires Sun Grid Engine to be installed.")
else:
    print("Sun Grid Engine is installed.")

    # The context managers close the client and the cluster when the block
    # is left, including when fitting raises.
    with SGECluster(
        queue='all.q',
        cores=2,
        memory="50 GB",
    ) as cluster:
        cluster.adapt(minimum_jobs=10, maximum_jobs=100)  # auto-scale between 10 and 100 jobs

        with Client(cluster) as client:
            scorer = sklearn.metrics.get_scorer('roc_auc_ovr')
            X, y = sklearn.datasets.load_digits(return_X_y=True)
            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                X,
                y,
                train_size=0.75,
                test_size=0.25,
            )

            # Parallelism is configured on the cluster, so n_jobs and
            # memory_limit are not passed to the estimator.
            est = tpot2.TPOTEstimatorSteadyState(
                client=client,
                classification=True,
                max_eval_time_seconds=60,
                max_time_seconds=120,
                scorers=['roc_auc_ovr'],
                scorers_weights=[1],
                verbose=1,
            )
            est.fit(X_train, y_train)
            print(scorer(est, X_test, y_test))
+
Sun Grid Engine is not installed. This example requires Sun Grid Engine to be installed. ++
Objective functions can optionally take in step, budget, and generations.
+step - The same objective function will be run for #evaluation_early_stop_steps, and the current step will be passed into the function as an integer. (This is useful for getting a single fold of cross validation, for example.)
+budget - A parameter that varies over the course of the generations. Gets passed into the objective function as a float between 0 and 1. If the budget of the previous evaluation is less than the current budget, it will get re-evaluated. Useful for using smaller datasets earlier in training.
+generations - an int corresponding to the current generation number.
+#knapsack problem
+import numpy as np
+import tpot2
+import random
+import matplotlib.pyplot as plt
+from dask.distributed import Client, LocalCluster
+
class SubsetSelector(tpot2.individual_representations.BaseIndividual):
    """Individual that represents a subset of a fixed pool of items.

    Parameters
    ----------
    values : int or iterable
        Pool of selectable items. An int ``n`` is shorthand for ``range(n)``.
    initial_set : iterable, optional
        Items the individual starts with. When omitted, ``k`` items are drawn
        from the pool with replacement, so the starting subset can contain
        fewer than ``k`` distinct items.
    k : int, default 1
        Maximum number of items added or removed by a single mutation.
    """

    def __init__(self, values, initial_set=None, k=1):
        if isinstance(values, int):
            self.values = set(range(values))
        else:
            self.values = set(values)

        if initial_set is None:
            # Sample from the normalized pool, not the raw argument, so the
            # integer form of ``values`` works as well.
            self.subsets = set(random.choices(list(self.values), k=k))
        else:
            self.subsets = set(initial_set)

        self.k = k

        self.mutation_list = [self._mutate_add, self._mutate_remove]
        self.crossover_list = [self._crossover_swap]

    def mutate(self):
        """Apply the first mutation, in random order, that succeeds.

        Returns True if a mutation was applied and False if none could be.
        """
        mutation_list_copy = self.mutation_list.copy()
        random.shuffle(mutation_list_copy)
        for func in mutation_list_copy:
            if func():
                return True
        return False

    def crossover(self, ind2):
        """Apply the first crossover with ``ind2``, in random order, that succeeds.

        Returns True if a crossover was applied and False otherwise.
        """
        crossover_list_copy = self.crossover_list.copy()
        random.shuffle(crossover_list_copy)
        for func in crossover_list_copy:
            if func(ind2):
                return True
        return False

    def _mutate_add(self):
        """Add up to ``k`` items that are not yet selected; False if none are left."""
        not_included = list(self.values.difference(self.subsets))
        if not not_included:
            return False
        # A single remaining item can still be added (the previous ``> 1``
        # check made the full pool unreachable).
        n_add = min(self.k, len(not_included))
        self.subsets.update(random.sample(not_included, k=n_add))
        return True

    def _mutate_remove(self):
        """Remove up to ``k`` items while always keeping at least one.

        Returns True when items were removed and False when the subset is
        already at its minimum size, so that ``mutate`` can tell whether the
        mutation actually happened.
        """
        if len(self.subsets) <= 1:
            return False
        n_remove = min(self.k, len(self.subsets) - 1)
        removed = set(random.sample(list(self.subsets), k=n_remove))
        self.subsets = self.subsets - removed
        return True

    def _crossover_swap(self, ss2):
        """Randomly reassign every item held by exactly one of the two individuals.

        Both individuals are modified in place. Returns False when the two
        subsets are identical and there is nothing to exchange.
        """
        diffs = self.subsets.symmetric_difference(ss2.subsets)

        if len(diffs) == 0:
            return False
        for v in diffs:
            # Take the item away from whoever holds it, then hand it to one
            # of the two individuals at random.
            self.subsets.discard(v)
            ss2.subsets.discard(v)
            random.choice([self.subsets, ss2.subsets]).add(v)

        return True

    def unique_id(self):
        """Return a string key that is identical for individuals with equal subsets."""
        return str(tuple(sorted(self.subsets)))
+
def individual_generator():
    """Yield an endless stream of new SubsetSelector individuals.

    Each individual selects among the positions ``0 .. len(values) - 1`` of
    the module-level ``values`` array.
    """
    while True:
        candidate_indices = np.arange(len(values))
        yield SubsetSelector(values=candidate_indices)
+
+
# Random knapsack instance: one value and one weight per item. The weights
# array is sized from ``values`` so both always describe the same items
# (it previously held 200 weights for 100 items).
values = np.random.randint(200, size=100)
weights = np.random.random(len(values)) * 10
max_weight = 50


def simple_objective(ind, **kwargs):
    """Score a subset for the knapsack problem.

    Parameters
    ----------
    ind : object with a ``subsets`` attribute
        Collection of item indices into ``values`` and ``weights``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    tuple
        ``(total_value, total_weight)``. The value is 0 when the subset is
        empty or its weight exceeds ``max_weight``; the weight is reported
        unchanged in the overweight case.
    """
    subset = np.array(list(ind.subsets))
    if len(subset) == 0:
        return 0, 0

    total_weight = np.sum(weights[subset])
    total_value = np.sum(values[subset])

    # An overweight knapsack is infeasible and therefore worth nothing.
    if total_weight > max_weight:
        total_value = 0

    return total_value, total_weight
+
# simple_objective returns (value, weight). With bigger_is_better=True the
# weights below reward value (+1) and penalize weight (-1).
objective_names = ["Value", "Weight"]
objective_function_weights = [1, -1]

evolver = tpot2.evolvers.BaseEvolver(
    individual_generator=individual_generator(),
    objective_functions=[simple_objective],
    objective_function_weights=objective_function_weights,
    objective_names=objective_names,
    bigger_is_better=True,
    population_size=100,
    generations=100,
    n_jobs=1,
    verbose=1,
)

evolver.optimize()
+
Generation: 100%|██████████| 100/100 [02:22<00:00, 1.42s/it] ++
# reset_index() returns a new frame, so the evolver's own
# evaluated_individuals table is left untouched. The previous in-place reset
# modified that table and made this cell fail when run a second time.
final_population_results = (
    evolver.population.evaluated_individuals
    .reset_index()
    .rename(columns={'index': 'Selected Index'})
)

# Pick the evaluated individual with the highest value.
best_idx = final_population_results["Value"].idxmax()
best_individual = final_population_results.loc[best_idx, 'Individual']
best_value = final_population_results.loc[best_idx, "Value"]
best_weight = final_population_results.loc[best_idx, "Weight"]

print("best subset", best_individual.subsets)
print(f"Best value {best_value}, weight {best_weight}")
print()

print("All results")
final_population_results
+
best subset {0, 96, 34, 35, 36, 5, 9, 75, 18, 50, 84, 20, 22, 23, 87, 26, 27, 28} +Best value 2422.0, weight 49.38389605974704 + +All results ++
+ | Selected Index | +Value | +Weight | +Parents | +Variation_Function | +Individual | +Generation | +Pareto_Front | +
---|---|---|---|---|---|---|---|---|
0 | +(60,) | +11.0 | +5.586633 | +NaN | +NaN | +<__main__.SubsetSelector object at 0x7fc0b1d2d... | +0.0 | +NaN | +
1 | +(66,) | +192.0 | +5.407096 | +NaN | +NaN | +<__main__.SubsetSelector object at 0x7fc0b1d61... | +0.0 | +NaN | +
2 | +(2,) | +50.0 | +5.002992 | +NaN | +NaN | +<__main__.SubsetSelector object at 0x7fc0b1d2c... | +0.0 | +NaN | +
3 | +(53,) | +27.0 | +4.088630 | +NaN | +NaN | +<__main__.SubsetSelector object at 0x7fc0b1d2c... | +0.0 | +NaN | +
4 | +(18,) | +165.0 | +4.886466 | +NaN | +NaN | +<__main__.SubsetSelector object at 0x7fc0b1d2d... | +0.0 | +NaN | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +
9995 | +(0, 26, 27, 28, 75, 84, 87, 98) | +1016.0 | +18.517959 | +((0, 26, 27, 28, 75, 84, 87),) | +mutate | +<__main__.SubsetSelector object at 0x7fc09452a... | +99.0 | +NaN | +
9996 | +(0, 26, 27, 28, 31, 65, 75, 87) | +1058.0 | +16.186587 | +((0, 26, 27, 28, 31, 75, 84, 87),) | +mutate | +<__main__.SubsetSelector object at 0x7fc09452a... | +99.0 | +NaN | +
9997 | +(0, 26, 27, 28, 31, 55, 75, 84, 87, 96) | +1264.0 | +20.441256 | +((0, 26, 27, 28, 31, 75, 84, 85, 87, 96),) | +mutate | +<__main__.SubsetSelector object at 0x7fc09452a... | +99.0 | +NaN | +
9998 | +(26, 27, 31, 75, 84, 87, 94) | +772.0 | +8.488381 | +((0, 26, 27, 31, 75, 84, 87),) | +mutate | +<__main__.SubsetSelector object at 0x7fc094529... | +99.0 | +NaN | +
9999 | +(0, 27, 29, 31, 75, 84, 87, 93) | +1060.0 | +21.392753 | +((0, 27, 28, 31, 75, 84, 85, 87),) | +mutate | +<__main__.SubsetSelector object at 0x7fc09452b... | +99.0 | +NaN | +
10000 rows × 8 columns
+from scipy.stats import binned_statistic_2d
+
# Heat map of the evaluated individuals: weight on x, value on y, coloured by
# the mean generation of the individuals that fall in each bin.
x = final_population_results["Weight"]
y = final_population_results["Value"]
c = final_population_results["Generation"]

x_bins = np.linspace(0, 100, 100)
y_bins = np.linspace(0, 3000, 100)

ret = binned_statistic_2d(
    x,
    y,
    c,
    statistic=np.mean,
    bins=[x_bins, y_bins],
)

fig, ax1 = plt.subplots(1, 1, figsize=(12, 4))

# The statistic is indexed [x_bin, y_bin]; transpose it so weight runs along
# the horizontal image axis.
im = ax1.imshow(
    ret.statistic.T,
    origin='lower',
    extent=(0, 100, 0, 3000),
    vmin=0,
    vmax=100,
    aspect=.03,
)
ax1.set_xlabel("Weight")
ax1.set_ylabel("Value")
ax1.set_title("Binned Average Generation")

cbar = fig.colorbar(im)
cbar.set_label('Generation')
plt.tight_layout()
+
{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var _a=/["'&<>]/;Pn.exports=Aa;function Aa(e){var t=""+e,r=_a.exec(t);if(!r)return t;var o,n="",i=0,s=0;for(i=r.index;i