From 39d42a29c2693d5ce40cdc94eaa0f6aa29087d12 Mon Sep 17 00:00:00 2001
From: spiros
How many minutes TPOT has to optimize the pipeline.

max_eval_time_mins: float, optional (default=5)

From 1998df3e96a4094fc44a88abd7f1b65f4951109a Mon Sep 17 00:00:00 2001
From: Jan-Hendrik Menke
-If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse.
+If not None, this setting will allow TPOT to run until max_time_mins minutes have elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated.
-Number of iterations to the run pipeline optimization process. Must be a positive number.
+Number of iterations to run the pipeline optimization process. Must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit.
Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
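To make the new interplay between generations and max_time_mins concrete, here is a minimal sketch of a run driven purely by a time budget. It assumes a TPOT build that includes these patches; the dataset and parameter values are illustrative only.

```python
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42)

# generations=None is now allowed as long as max_time_mins is set;
# omitting both raises a ValueError during _fit_init().
tpot = TPOTClassifier(generations=None, max_time_mins=10,
                      population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

# If both limits are given, the run ends at whichever is hit first.
print(tpot.score(X_test, y_test))
```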
diff --git a/tpot/base.py b/tpot/base.py index 32a3b9e9..24186c75 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -566,9 +566,11 @@ def _fit_init(self): self.operators.append(op_class) self.arguments += arg_types + if self.max_time_mins is None and self.generations is None: + raise ValueError("Either the parameter generations should bet set or a maximum evaluation time should be defined via max_time_mins") + # Schedule TPOT to run for many generations if the user specifies a - # run-time limit TPOT will automatically interrupt itself when the timer - # runs out + # run-time limit TPOT will automatically interrupt itself when the timer runs out if self.max_time_mins is not None and self.generations is None : self.generations = 1000000 @@ -1261,7 +1263,7 @@ def _stop_by_max_time_mins(self): if self.max_time_mins: total_mins_elapsed = (datetime.now() - self._start_datetime).total_seconds() / 60. if total_mins_elapsed >= self.max_time_mins: - raise KeyboardInterrupt('{} minutes have elapsed. TPOT will close down.'.format(total_mins_elapsed)) + raise KeyboardInterrupt('{:.2f} minutes have elapsed. TPOT will close down.'.format(total_mins_elapsed)) def _combine_individual_stats(self, operator_count, cv_score, individual_stats): """Combine the stats with operator count and cv score and preprare to be written to _evaluated_individuals From 356115f6cdc39d3eb2d3e90d318f7ff608bda8a4 Mon Sep 17 00:00:00 2001 From: UvADate: Sun, 3 Nov 2019 11:43:58 +0100 Subject: [PATCH 21/44] export always returns pipeline as string and writing to file is optional --- tpot/base.py | 19 ++++++++----------- tpot/export_utils.py | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 83a25e8d..f6276d2c 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1088,24 +1088,21 @@ def _create_periodic_checkpoint_folder(self): raise ValueError('Failed creating the periodic_checkpoint_folder:\n{}'.format(e)) - def export(self, output_file_name='tpot_pipeline.py', data_file_path='', to_screen=False): + def export(self, output_file_name='', data_file_path=''): """Export the optimized pipeline as Python code. Parameters ---------- - output_file_name: string (default: 'tpot_pipeline.py') - String containing the path and file name of the desired output file + output_file_name: string (default: '') + String containing the path and file name of the desired output file. If left empty, writing to file will be skipped. data_file_path: string (default: '') By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. - to_screen: boolean (default: False) - If set to True, the full text of the export is printed to screen instead of to file. Returns ------- - False if it skipped writing the pipeline to file - True if the pipeline was actually written - + to_write: str + The whole pipeline text as a string. """ if self._optimized_pipeline is None: raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') @@ -1116,11 +1113,11 @@ def export(self, output_file_name='tpot_pipeline.py', data_file_path='', to_scre self.random_state, data_file_path=data_file_path) - if to_screen: - print(to_write) - else: + if output_file_name is not '': with open(output_file_name, 'w') as output_file: output_file.write(to_write) + return to_write + def _impute_values(self, features): """Impute missing values in a feature set. 
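The export() change above can be exercised roughly as follows. This is a sketch against the patched API, in which export() always returns the pipeline source as a string and only writes a file when output_file_name is non-empty; the fitted tpot object and the file name are assumed for illustration.

```python
# Assumes `tpot` has already been fitted, e.g. as in the example above.
pipeline_code = tpot.export()                    # returns the code as a string, writes nothing
tpot.export(output_file_name='my_pipeline.py')   # also writes the same code to disk
print(pipeline_code)
```

Note that the guard `output_file_name is not ''` in the patch relies on identity comparison with a string literal, which CPython happens to make work through interning; `output_file_name != ''` would be the more robust test.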
diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 5464a05b..0260a384 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -113,7 +113,7 @@ def export_pipeline(exported_pipeline, """ if pipeline_score is not None: - pipeline_text += '\n# Average CV score on the training set was:{}'.format(pipeline_score) + pipeline_text += '\n# Average CV score on the training set was: {}'.format(pipeline_score) pipeline_text += '\n' # Replace the function calls with their corresponding Python code From 4167dad67fcf41ddbb75c5996da4859a0080a448 Mon Sep 17 00:00:00 2001 From: UvA Date: Sun, 3 Nov 2019 11:50:15 +0100 Subject: [PATCH 22/44] added space also added in expected code --- tests/export_tests.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/export_tests.py b/tests/export_tests.py index a77429c4..ab153a2a 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -104,6 +104,38 @@ def test_export(): remove("test_export.py") # clean up exported file +def test_export_2(): + """Assert that TPOT's export function returns the expected pipeline text as a string.""" + + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj._optimized_pipeline = pipeline + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=None) + +exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == tpot_obj.export() + + def test_generate_pipeline_code(): """Assert that generate_pipeline_code() returns the correct code given a specific pipeline.""" @@ -559,7 +591,7 @@ def test_pipeline_score_save(): training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=None) -# Average CV score on the training set was:0.929813743 +# Average CV score on the training set was: 0.929813743 exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) From 07852f1404364b354d075b76908bec5cecf6054e Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 4 Nov 2019 11:05:05 -0500 Subject: [PATCH 23/44] documents for new generations parameter #941 --- docs_sources/api.md | 10 +++++----- docs_sources/using.md | 6 +++--- tests/driver_tests.py | 24 +++++++++++++++++++++++- tpot/base.py | 9 ++++++--- tpot/driver.py | 38 +++++++++++++++++++++++++++++++++----- 5 files changed, 70 insertions(+), 17 deletions(-) diff --git a/docs_sources/api.md b/docs_sources/api.md index e4978b19..a1106417 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -32,9 +32,9 @@ Read more in the [User Guide](using/#tpot-with-code). 
Parameters:

-generations: int, optional (default=100)
+generations: int or None, optional (default=100)

-Number of iterations to the run pipeline optimization process. Must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit.
+Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit.
Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
@@ -524,9 +524,9 @@ Read more in the [User Guide](using/#tpot-with-code).

Parameters:

-generations: int, optional (default=100)
+generations: int or None, optional (default=100)

-Number of iterations to the run pipeline optimization process. Must be a positive number.
+Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit.
Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
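The same generations/max_time_mins contract applies to TPOTRegressor. A brief sketch under the same assumptions (a TPOT build carrying these patches, illustrative data and settings):

```python
from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

housing = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, train_size=0.75, test_size=0.25, random_state=42)

# A fixed number of generations and a wall-clock cap can be combined;
# the run ends at whichever limit is reached first.
tpot = TPOTRegressor(generations=20, max_time_mins=15, population_size=50,
                     verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
```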
@@ -608,7 +608,7 @@ Setting n_jobs=-1 will use as many cores as available on the computer.

How many minutes TPOT has to optimize the pipeline.

max_eval_time_mins: float, optional (default=5)

diff --git a/docs_sources/using.md b/docs_sources/using.md
index 41d3dfa2..a8a55785 100644
--- a/docs_sources/using.md
+++ b/docs_sources/using.md
@@ -170,8 +170,8 @@ Detailed descriptions of the command-line arguments are below.

-If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse.
+If not None, this setting will allow TPOT to run until max_time_mins minutes have elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated.

@@ -248,7 +248,7 @@ Assigning this to -1 will use as many cores as available on the computer. For n_

 -g GENERATIONS
-Any positive integer
-Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
+Any positive integer or None
+Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.

TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.

 Any positive integer
 How many minutes TPOT has to optimize the pipeline.
+How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes have elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated.
-If provided, this setting will override the "generations" parameter and allow TPOT to run until it runs out of time.-maxeval diff --git a/tests/driver_tests.py b/tests/driver_tests.py index 99cebb8f..5532d960 100644 --- a/tests/driver_tests.py +++ b/tests/driver_tests.py @@ -38,7 +38,9 @@ import pandas as pd import sklearn -from tpot.driver import positive_integer, float_range, _get_arg_parser, _print_args, _read_data_file, load_scoring_function, tpot_driver +from tpot.driver import positive_integer, float_range, _get_arg_parser, \ + _print_args, _read_data_file, load_scoring_function, tpot_driver, \ + positive_integer_or_none from nose.tools import assert_raises, assert_equal, assert_in from unittest import TestCase @@ -359,6 +361,26 @@ def test_positive_integer_3(): """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer.""" assert_raises(Exception, positive_integer, 'foobar') +def test_positive_integer_or_none(): + """Assert that the TPOT CLI interface's positive_integer_or_none parsing throws an exception when n < 0.""" + assert_raises(Exception, positive_integer_or_none, '-1') + + +def test_positive_integer_or_none_2(): + """Assert that the TPOT CLI interface's positive_integer_or_none parsing returns the integer value of a string encoded integer when n > 0.""" + assert 1 == positive_integer_or_none('1') + + +def test_positive_integer_or_none_3(): + """Assert that the TPOT CLI interface's positive_integer_or_none parsing throws an exception when n is not an integer and not None.""" + assert_raises(Exception, positive_integer_or_none, 'foobar') + + +def test_positive_integer_or_none_4(): + """Assert that the TPOT CLI interface's positive_integer_or_none parsing return None when value is string 'None' or 'none'.""" + assert positive_integer_or_none('none') is None + assert positive_integer_or_none('None') is None + def test_float_range(): """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0.""" diff --git a/tpot/base.py b/tpot/base.py index 8be71246..865f0620 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -119,8 +119,10 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, Parameters ---------- - generations: int, optional (default: 100) + generations: int or None, optional (default: 100) Number of iterations to the run pipeline optimization process. + It must be a positive number or None. If None, the parameter + max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. @@ -182,8 +184,9 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, Thus for n_jobs = -2, all CPUs but one are used. max_time_mins: int, optional (default: None) How many minutes TPOT has to optimize the pipeline. - If provided, this setting will override the "generations" parameter and allow - TPOT to run until it runs out of time. + If not None, this setting will allow TPOT to run until max_time_mins minutes + elapsed and then stop. TPOT will stop earlier if generationsis set and all + generations are already evaluated. max_eval_time_mins: float, optional (default: 5) How many minutes TPOT has to optimize a single pipeline. 
Setting this parameter to higher values will allow TPOT to explore more diff --git a/tpot/driver.py b/tpot/driver.py index 09e60f5e..480fe251 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -42,7 +42,7 @@ def positive_integer(value): Parameters ---------- - value: int + value: string The number to evaluate Returns @@ -59,6 +59,31 @@ def positive_integer(value): return value +def positive_integer_or_none(value): + """Ensure that the provided value is a positive integer or None. + + Parameters + ---------- + value: string + The number to evaluate + + Returns + ------- + value: int or None + Returns a positive integer or None + """ + if value.lower() == 'none': + value = None + else: + try: + value = int(value) + except Exception: + raise argparse.ArgumentTypeError('Invalid int value: \'{}\''.format(value)) + if value < 0: + raise argparse.ArgumentTypeError('Invalid positive int value: \'{}\''.format(value)) + return value + + def float_range(value): """Ensure that the provided value is a float integer in the range [0., 1.]. @@ -152,9 +177,11 @@ def _get_arg_parser(): action='store', dest='GENERATIONS', default=100, - type=positive_integer, + type=positive_integer_or_none, help=( 'Number of iterations to run the pipeline optimization process. ' + 'It must be a positive number or None. If None, the parameter ' + 'max_time_mins must be defined as the runtime limit. ' 'Generally, TPOT will work better when you give it more ' 'generations (and therefore time) to optimize the pipeline. TPOT ' 'will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE ' @@ -308,9 +335,10 @@ def _get_arg_parser(): default=None, type=int, help=( - 'How many minutes TPOT has to optimize the pipeline. This setting ' - 'will override the GENERATIONS parameter and allow TPOT to run ' - 'until it runs out of time.' + 'How many minutes TPOT has to optimize the pipeline. ' + 'If not None, this setting will allow TPOT to run until max_time_mins minutes ' + 'elapsed and then stop. TPOT will stop earlier if generationsis set and all ' + 'generations are already evaluated. 
' ) ) From 4391536db086dbb03a816a75bc55659db567c8a2 Mon Sep 17 00:00:00 2001 From: weixuanfuDate: Mon, 4 Nov 2019 12:46:51 -0500 Subject: [PATCH 24/44] add Stochastic Gradient Descent into default tpot config --- tpot/config/classifier.py | 11 +++++++++++ tpot/config/regressor.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/tpot/config/classifier.py b/tpot/config/classifier.py index 1441e3f6..9f1e9f2a 100644 --- a/tpot/config/classifier.py +++ b/tpot/config/classifier.py @@ -107,6 +107,17 @@ 'nthread': [1] }, + 'sklearn.linear_model.SGDClassifier': { + 'loss': ['log', 'hinge', 'modified_huber', 'squared_hinge', 'perceptron'], + 'penalty': ['elasticnet'], + 'alpha': [0.0, 0.01, 0.001], + 'learning_rate': ['invscaling', 'constant'], + 'fit_intercept': [True, False], + 'l1_ratio': [0.25, 0.0, 1.0, 0.75, 0.5], + 'eta0': [0.1, 1.0, 0.01], + 'power_t': [0.5, 0.0, 1.0, 0.1, 100.0, 10.0, 50.0] + }, + # Preprocesssors 'sklearn.preprocessing.Binarizer': { 'threshold': np.arange(0.0, 1.01, 0.05) diff --git a/tpot/config/regressor.py b/tpot/config/regressor.py index 6c7aa3da..33ec7478 100644 --- a/tpot/config/regressor.py +++ b/tpot/config/regressor.py @@ -105,6 +105,17 @@ 'objective': ['reg:squarederror'] }, + 'sklearn.linear_model.SGDRegressor': { + 'loss': ['squared_loss', 'huber', 'epsilon_insensitive'], + 'penalty': ['elasticnet'], + 'alpha': [0.0, 0.01, 0.001] , + 'learning_rate': ['invscaling', 'constant'] , + 'fit_intercept': [True, False], + 'l1_ratio': [0.25, 0.0, 1.0, 0.75, 0.5], + 'eta0': [0.1, 1.0, 0.01], + 'power_t': [0.5, 0.0, 1.0, 0.1, 100.0, 10.0, 50.0] + }, + # Preprocesssors 'sklearn.preprocessing.Binarizer': { 'threshold': np.arange(0.0, 1.01, 0.05) From b91c22aec7797c24155a2941af685925cf3c145b Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 4 Nov 2019 14:56:50 -0500 Subject: [PATCH 25/44] refine random state in exported codes #933 --- tests/export_tests.py | 80 ++++++++++++++++++++++++++++++++++++++++--- tests/tpot_tests.py | 61 +-------------------------------- tpot/base.py | 45 ++++-------------------- tpot/export_utils.py | 62 +++++++++++++++++++++++++++++---- 4 files changed, 138 insertions(+), 110 deletions(-) diff --git a/tests/export_tests.py b/tests/export_tests.py index ab153a2a..5b715e30 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -28,7 +28,8 @@ from os import remove, path from tpot import TPOTClassifier, TPOTRegressor -from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name +from tpot.export_utils import export_pipeline, generate_import_code, _indent, \ + generate_pipeline_code, get_by_name, set_param_recursive from tpot.operator_utils import TPOTOperatorClassFactory from tpot.config.classifier import classifier_config_dict @@ -70,6 +71,7 @@ def test_export_random_ind(): import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB +from tpot.export_utils import set_param_recursive # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -78,11 +80,15 @@ def test_export_random_ind(): train_test_split(features, tpot_data['target'].values, random_state=39) exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 39) exported_pipeline.fit(training_features, training_target) results = 
exported_pipeline.predict(testing_features) """ - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, random_state=tpot_obj.random_state) + exported_code = export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, random_state=tpot_obj.random_state) + + assert expected_code == exported_code def test_export(): @@ -493,6 +499,7 @@ def test_export_pipeline_6(): import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier +from tpot.export_utils import set_param_recursive # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -501,13 +508,17 @@ def test_export_pipeline_6(): train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """ - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, - tpot_obj._pset, random_state=42, - data_file_path='test_path') + exported_code = export_pipeline(pipeline, tpot_obj.operators, + tpot_obj._pset, random_state=42, + data_file_path='test_path') + + assert expected_code == exported_code def test_operator_export(): @@ -657,3 +668,62 @@ def test_imputer_in_export(): """ assert_equal(export_code, expected_code) + + +def test_set_param_recursive(): + tpot_obj = TPOTClassifier() + tpot_obj._fit_init() + """Assert that _set_param_recursive sets \"random_state\" to 42 in all steps in a simple pipeline.""" + pipeline_string = ( + 'DecisionTreeClassifier(PCA(input_matrix, PCA__iterated_power=5, PCA__svd_solver=randomized), ' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) + + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + # assert "random_state" of PCA at step 1 + assert getattr(sklearn_pipeline.steps[0][1], 'random_state') == 42 + # assert "random_state" of DecisionTreeClassifier at step 2 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 + + +def test_set_param_recursive_2(): + """Assert that set_param_recursive sets \"random_state\" to 42 in nested estimator in SelectFromModel.""" + pipeline_string = ( + 'DecisionTreeRegressor(SelectFromModel(input_matrix, ' + 'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, ' + 'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,' + 'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)' + ) + tpot_obj = TPOTRegressor() + tpot_obj._fit_init() + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + assert getattr(getattr(sklearn_pipeline.steps[0][1], 'estimator'), 'random_state') == 42 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 + + +def test_set_param_recursive_3(): + """Assert that 
set_param_recursive sets \"random_state\" to 42 in nested estimator in StackingEstimator in a complex pipeline.""" + pipeline_string = ( + 'DecisionTreeClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5),input_matrix) ' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) + tpot_obj = TPOTClassifier() + tpot_obj._fit_init() + + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + # StackingEstimator under the transformer_list of FeatureUnion + assert getattr(getattr(sklearn_pipeline.steps[0][1].transformer_list[0][1], 'estimator'), 'random_state') == 42 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index d7414c85..490628e4 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -556,7 +556,7 @@ def test_score_3(): """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline.""" tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error', random_state=72) tpot_obj._fit_init() - known_score = -11.682841148312662 + known_score = -11.708199875921563 # Reify pipeline with known score pipeline_string = ( @@ -576,7 +576,6 @@ def test_score_3(): # Get score from TPOT score = tpot_obj.score(testing_features_r, testing_target_r) - assert np.allclose(known_score, score) @@ -1332,61 +1331,6 @@ def test_summary_of_best_pipeline(): assert_raises(RuntimeError, tpot_obj._summary_of_best_pipeline, features=training_features, target=training_target) -def test_set_param_recursive(): - """Assert that _set_param_recursive sets \"random_state\" to 42 in all steps in a simple pipeline.""" - pipeline_string = ( - 'DecisionTreeClassifier(PCA(input_matrix, PCA__iterated_power=5, PCA__svd_solver=randomized), ' - 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' - ) - - deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - # assert "random_state" of PCA at step 1 - assert getattr(sklearn_pipeline.steps[0][1], 'random_state') == 42 - # assert "random_state" of DecisionTreeClassifier at step 2 - assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 - - -def test_set_param_recursive_2(): - """Assert that _set_param_recursive sets \"random_state\" to 42 in nested estimator in SelectFromModel.""" - pipeline_string = ( - 'DecisionTreeRegressor(SelectFromModel(input_matrix, ' - 'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, ' - 'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,' - 'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)' - ) - tpot_obj = TPOTRegressor() - tpot_obj._fit_init() - deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - sklearn_pipeline = 
tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - - assert getattr(getattr(sklearn_pipeline.steps[0][1], 'estimator'), 'random_state') == 42 - assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 - - -def test_set_param_recursive_3(): - """Assert that _set_param_recursive sets \"random_state\" to 42 in nested estimator in StackingEstimator in a complex pipeline.""" - pipeline_string = ( - 'DecisionTreeClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8, DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5),input_matrix) ' - 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' - ) - - deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - - # StackingEstimator under the transformer_list of FeatureUnion - assert getattr(getattr(sklearn_pipeline.steps[0][1].transformer_list[0][1], 'estimator'), 'random_state') == 42 - assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 - - def test_evaluated_individuals_(): """Assert that evaluated_individuals_ stores current pipelines and their CV scores.""" tpot_obj = TPOTClassifier( @@ -1402,7 +1346,6 @@ def test_evaluated_individuals_(): for pipeline_string in sorted(tpot_obj.evaluated_individuals_.keys()): deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) operator_count = tpot_obj._operator_count(deap_pipeline) try: @@ -1450,7 +1393,6 @@ def pareto_eq(ind1, ind2): for deap_pipeline, fitness_score in zip(pop, fitness_scores): operator_count = tpot_obj._operator_count(deap_pipeline) sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) try: cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) @@ -1485,7 +1427,6 @@ def pareto_eq(ind1, ind2): for deap_pipeline, fitness_score in zip(pop, fitness_scores): operator_count = tpot_obj._operator_count(deap_pipeline) sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) try: cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) diff --git a/tpot/base.py b/tpot/base.py index 865f0620..61ec6c39 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -63,7 +63,7 @@ from ._version import __version__ from .operator_utils import TPOTOperatorClassFactory, Operator, ARGType -from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code +from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code, set_param_recursive from .decorators import _pre_test from .builtins import CombineDFs, StackingEstimator @@ -121,7 +121,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, ---------- generations: int or None, optional (default: 100) Number of iterations to the run pipeline optimization 
process. - It must be a positive number or None. If None, the parameter + It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate @@ -1065,7 +1065,7 @@ def _save_periodic_pipeline(self, gen): self.operators, self._pset, self._imputed, pareto_front_pipeline_score, self.random_state) - # dont export a pipeline you had + # dont export a pipeline you had if self._exported_pipeline_text.count(sklearn_pipeline_str): self._update_pbar(pbar_num=0, pbar_msg='Periodic pipeline was not saved, probably saved before...') else: @@ -1146,6 +1146,7 @@ def _impute_values(self, features): return self._fitted_imputer.transform(features) + def _check_dataset(self, features, target, sample_weight=None): """Check if a dataset has a valid feature set and labels. @@ -1232,35 +1233,11 @@ def _compile_to_sklearn(self, expr): sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(expr, self._pset), self.operators) sklearn_pipeline = eval(sklearn_pipeline_str, self.operators_context) sklearn_pipeline.memory = self._memory + if self.random_state: + # Fix random state when the operator allows + set_param_recursive(sklearn_pipeline.steps, 'random_state', self.random_state) return sklearn_pipeline - def _set_param_recursive(self, pipeline_steps, parameter, value): - """Recursively iterate through all objects in the pipeline and set a given parameter. - - Parameters - ---------- - pipeline_steps: array-like - List of (str, obj) tuples from a scikit-learn pipeline or related object - parameter: str - The parameter to assign a value for in each pipeline object - value: any - The value to assign the parameter to in each pipeline object - Returns - ------- - None - - """ - for (_, obj) in pipeline_steps: - recursive_attrs = ['steps', 'transformer_list', 'estimators'] - for attr in recursive_attrs: - if hasattr(obj, attr): - self._set_param_recursive(getattr(obj, attr), parameter, value) - if hasattr(obj, 'estimator'): # nested estimator - est = getattr(obj, 'estimator') - if hasattr(est, parameter): - setattr(est, parameter, value) - if hasattr(obj, parameter): - setattr(obj, parameter, value) def _stop_by_max_time_mins(self): """Stop optimization process once maximum minutes have elapsed.""" @@ -1479,14 +1456,6 @@ def _preprocess_individuals(self, individuals): # Transform the tree expression into an sklearn pipeline sklearn_pipeline = self._toolbox.compile(expr=individual) - # Fix random state when the operator allows - self._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - # Setting the seed is needed for XGBoost support because XGBoost currently stores - # both a seed and random_state, and they're not synced correctly. - # XGBoost will raise an exception if random_state != seed. - if 'XGB' in sklearn_pipeline_str: - self._set_param_recursive(sklearn_pipeline.steps, 'seed', 42) - # Count the number of pipeline operators as a measure of pipeline complexity operator_count = self._operator_count(individual) operator_counts[individual_str] = max(1, operator_count) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 0260a384..b2e373f1 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -69,7 +69,7 @@ def export_pipeline(exported_pipeline, impute: bool (False): If impute = True, then adda a imputation step. random_state: integer - Random seed in train_test_split function. 
+ Random seed in train_test_split function and exported pipeline. data_file_path: string (default: '') By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. @@ -84,9 +84,9 @@ def export_pipeline(exported_pipeline, pipeline_tree = expr_to_tree(exported_pipeline, pset) # Have the exported code import all of the necessary modules and functions - pipeline_text = generate_import_code(exported_pipeline, operators, impute) + pipeline_text = generate_import_code(exported_pipeline, operators, impute, random_state) - pipeline_code = pipeline_code_wrapper(generate_export_pipeline_code(pipeline_tree, operators)) + pipeline_code = pipeline_code_wrapper(generate_export_pipeline_code(pipeline_tree, operators), random_state) if pipeline_code.count("FunctionTransformer(copy)"): pipeline_text += """from sklearn.preprocessing import FunctionTransformer @@ -165,7 +165,7 @@ def prim_to_list(prim, args): return tree -def generate_import_code(pipeline, operators, impute=False): +def generate_import_code(pipeline, operators, impute=False, random_state=None): """Generate all library import calls for use in TPOT.export(). Parameters @@ -176,6 +176,8 @@ def generate_import_code(pipeline, operators, impute=False): List of operator class from operator library impute : bool Whether to impute new values in the feature set. + random_state: integer or None + Random seed in train_test_split function and exported pipeline. Returns ------- @@ -220,6 +222,9 @@ def merge_imports(old_dict, new_dict): except ImportError: from sklearn.preprocessing import Imputer """ + if random_state is not None: + pipeline_text += """from tpot.export_utils import set_param_recursive +""" return pipeline_text @@ -256,24 +261,38 @@ def _starting_imports(operators, operators_used): } -def pipeline_code_wrapper(pipeline_code): +def pipeline_code_wrapper(pipeline_code, random_state=None): """Generate code specific to the execution of the sklearn pipeline. Parameters ---------- pipeline_code: str Code that defines the final sklearn pipeline + random_state: integer or None + Random seed in train_test_split function and exported pipeline. Returns ------- - Source code for the sklearn pipeline and calls to fit and predict + exported_code: str + Source code for the sklearn pipeline and calls to fit and predict """ - return """exported_pipeline = {} + if random_state is None: + exported_code = """exported_pipeline = {} exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """.format(pipeline_code) + else: + exported_code = """exported_pipeline = {} +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', {}) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""".format(pipeline_code, random_state) + + return exported_code def generate_pipeline_code(pipeline_tree, operators): @@ -390,3 +409,32 @@ def _make_branch(branch): return "make_union(\n{},\n{}\n)".\ format(_indent(_make_branch(left), 4), _indent(_make_branch(right), 4)) + + +def set_param_recursive(pipeline_steps, parameter, value): + """Recursively iterate through all objects in the pipeline and set a given parameter. 
+ + Parameters + ---------- + pipeline_steps: array-like + List of (str, obj) tuples from a scikit-learn pipeline or related object + parameter: str + The parameter to assign a value for in each pipeline object + value: any + The value to assign the parameter to in each pipeline object + Returns + ------- + None + + """ + for (_, obj) in pipeline_steps: + recursive_attrs = ['steps', 'transformer_list', 'estimators'] + for attr in recursive_attrs: + if hasattr(obj, attr): + set_param_recursive(getattr(obj, attr), parameter, value) + if hasattr(obj, 'estimator'): # nested estimator + est = getattr(obj, 'estimator') + if hasattr(est, parameter): + setattr(est, parameter, value) + if hasattr(obj, parameter): + setattr(obj, parameter, value) From 1bf37bdda630fb5d6f3a77653e12110d526558ca Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 4 Nov 2019 15:55:51 -0500 Subject: [PATCH 26/44] update examples for new exported codes format #933 and correct digits dataset names #943 --- README.md | 62 +++++++----- docs/api/index.html | 2 +- docs/examples/index.html | 4 +- docs/search/search_index.json | 2 +- docs/using/index.html | 6 +- docs_sources/api.md | 2 +- docs_sources/examples.md | 98 +++++++++++-------- docs_sources/using.md | 6 +- tests/export_tests.py | 44 ++++----- tests/tpot_tests.py | 8 +- tpot/base.py | 3 +- tpot/export_utils.py | 4 +- tutorials/{MNIST.ipynb => Digits.ipynb} | 4 +- .../MAGIC Gamma Telescope.ipynb | 4 +- .../tpot_MAGIC_Gamma_Telescope_pipeline.py | 4 +- .../Portuguese Bank Marketing Strategy.ipynb | 4 +- .../tpot_marketing_pipeline.py | 4 +- 17 files changed, 150 insertions(+), 111 deletions(-) rename tutorials/{MNIST.ipynb => Digits.ipynb} (98%) diff --git a/README.md b/README.md index 911b64c8..40251369 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Click on the corresponding links to find more information on TPOT usage in the d ### Classification -Below is a minimal working example with the practice MNIST data set. +Below is a minimal working example with the the optical recognition of handwritten digits dataset. 
```python from tpot import TPOTClassifier @@ -64,32 +64,43 @@ from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, - train_size=0.75, test_size=0.25) + train_size=0.75, test_size=0.25, random_state=42) -tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` -Running this code should discover a pipeline that achieves about 98% testing accuracy, and the corresponding Python code should be exported to the `tpot_mnist_pipeline.py` file and look similar to the following: +Running this code should discover a pipeline that achieves about 98% testing accuracy, and the corresponding Python code should be exported to the `tpot_digits_pipeline.py` file and look similar to the following: ```python import numpy as np import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline, make_union +from sklearn.preprocessing import PolynomialFeatures +from tpot.builtins import StackingEstimator +from tpot.export_utils import set_param_recursive # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['target'].values, random_state=None) - - -exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights="distance") - -exported_pipeline.fit(training_features, training_classes) + train_test_split(features, tpot_data['target'], random_state=42) + +# Average CV score on the training set was: 0.9799428471757372 +exported_pipeline = make_pipeline( + PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), + StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty="l1")), + RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.35000000000000003, min_samples_leaf=20, min_samples_split=19, n_estimators=100) +) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) + +exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) ``` @@ -104,9 +115,9 @@ from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, - train_size=0.75, test_size=0.25) + train_size=0.75, test_size=0.25, random_state=42) -tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2) +tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') @@ -117,20 +128,27 @@ which should result in a pipeline that achieves about 12.77 mean squared error ( ```python import numpy as np import pandas as pd -from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import 
ExtraTreesRegressor from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures +from tpot.export_utils import set_param_recursive # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=42) -exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls", - max_features=0.9, min_samples_leaf=5, - min_samples_split=6) +# Average CV score on the training set was: -10.812040755234403 +exported_pipeline = make_pipeline( + PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), + ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) +) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) -exported_pipeline.fit(training_features, training_classes) +exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) ``` diff --git a/docs/api/index.html b/docs/api/index.html index 58c5f027..1119dfc3 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -406,7 +406,7 @@ Classification
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py')Functions
diff --git a/docs/examples/index.html b/docs/examples/index.html index 61db571d..5ceffe11 100644 --- a/docs/examples/index.html +++ b/docs/examples/index.html @@ -272,10 +272,10 @@MNIST digit recognition
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') -Running this code should discover a pipeline (exported as
+tpot_mnist_pipeline.py
) that achieves about 98% test accuracy:Running this code should discover a pipeline (exported as
tpot_digits_pipeline.py
) that achieves about 98% test accuracy:import numpy as np from sklearn.model_selection import train_test_split diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 2d44b7d7..b0e5bcf8 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. 
mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. 
config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. 
periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. 
This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything Regression class tpot. TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. 
The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to run the pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations × offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to "breed" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with "error" or "loss" in the function name is meant to be minimized, whereas any other functions will be maximized. Passing a metric function in this way was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines.
Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. 
Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Regressor". By default, template is None and TPOT generates tree-based pipelines randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, the pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. More details about memory caching can be found in the scikit-learn documentation . Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimizations. This avoids re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the pipelines on the Pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it is still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks for improvement in the optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3.
evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in the pipeline, cross-validation score for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect TPOT's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'.
Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything
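As a worked illustration of the scoring, max_eval_time_mins and export options documented above, the following hedged sketch fits a TPOTRegressor with a custom scorer built via scikit-learn's make_scorer. The metric choice (mean absolute error), the time limits and the output file name are illustrative assumptions rather than defaults.

from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import train_test_split

# A custom scorer with the scorer(estimator, X, y) signature described above;
# greater_is_better=False negates the metric so that TPOT effectively minimizes MAE.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

housing = load_boston()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(
    generations=5,
    population_size=50,
    scoring=mae_scorer,    # custom scorer instead of the default 'neg_mean_squared_error'
    max_eval_time_mins=2,  # skip any single pipeline that takes longer than 2 minutes to evaluate
    verbosity=2,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))          # held-out score using the same scorer
print(len(tpot.evaluated_individuals_))    # number of pipelines TPOT evaluated
tpot.export('tpot_housing_pipeline.py')    # illustrative output file name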
Citing

If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI:

Contributing

We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it. Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code.
In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. 
If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) MNIST digit recognition Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Running this code should discover a pipeline (exported as tpot_mnist_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#mnist-digit-recognition","text":"Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Running this code should discover a pipeline (exported as tpot_mnist_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"MNIST digit recognition"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . 
This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"MAGIC Gamma Telescope"},{"location":"installing/","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. We also strongly recommend that you use Python 3 over Python 2 if you're given the choice. NumPy, SciPy, scikit-learn, pandas and joblib can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit For Windows users , the pywin32 module is required if Python is NOT installed via the Anaconda Python distribution and can be installed with pip for Python version <=3.3 or conda (e.g. miniconda) for any Python version: conda install pywin32 Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delayed] and dask-ml . pip install dask[delayed] dask-ml If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate Finally, to install TPOT itself, run the following command: pip install tpot Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. 
The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. 
Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. 
Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. 
Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. 
Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. 
This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. 
-o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. 
Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. 
--no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. 
To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of how to use the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on prior expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format.
In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in the set, and the 3rd column is a list of feature names (if the input X is a pandas.DataFrame) or indexes (if the input X is a numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT (a sketch of the subset list layout appears further below). Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. However, it may crash/freeze with n_jobs > 1 under OSX or Linux, as scikit-learn does, especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .
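As promised above, here is a hedged sketch of what a FeatureSetSelector subset list might look like; the column headers, feature set names, and gene identifiers are hypothetical and only illustrate the three-column layout described earlier (set name, feature count, \";\"-delimited feature names or indexes):

import pandas as pd

# Hypothetical subset list for FeatureSetSelector.
# Column 1: feature set name, column 2: number of features in the set,
# column 3: feature names (or column indexes) delimited by ';'.
subset_list = pd.DataFrame({
    'Subset': ['my_gene_set_1', 'my_gene_set_2'],   # hypothetical set names
    'Size': [3, 2],
    'Features': ['TP53;CASP3;BAX', 'TNF;IL6'],      # hypothetical feature names
})
subset_list.to_csv('my_subset_list.csv', index=False)

# The file path (or URL) can then be supplied to the 'subset_list' entry of the
# FeatureSetSelector configuration shown in the example above.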
Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10* n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation. Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. 
TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. 
See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
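As a concrete illustration of the greater_is_better note in the comment above, the sketch below builds a scorer from a hypothetical error metric (my_custom_error) that should be minimized; it assumes the same digits data split as the example above:

from tpot import TPOTClassifier
from sklearn.metrics import make_scorer
import numpy as np

# A hypothetical error metric: the fraction of misclassified samples.
def my_custom_error(y_true, y_pred):
    return float(np.sum(y_pred != y_true)) / len(y_true)

# greater_is_better=False tells scikit-learn (and therefore TPOT) that lower
# values of this metric are better, so the score is negated internally and
# TPOT effectively minimizes the error during pipeline evaluation.
my_custom_error_scorer = make_scorer(my_custom_error, greater_is_better=False)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      scoring=my_custom_error_scorer)
tpot.fit(X_train, y_train)   # X_train, y_train from the digits split above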
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). 
For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. 
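If you prefer to start from a built-in configuration rather than write a dictionary from scratch, you can also import one of TPOT's default configuration dictionaries and modify it before passing it to the config_dict parameter, as the FeatureSetSelector example elsewhere in this documentation does with classifier_config_dict. A minimal sketch, where the specific operators touched below are only illustrations:

from tpot import TPOTClassifier
from tpot.config import classifier_config_dict

# Start from a copy of TPOT's default classification configuration...
custom_config = dict(classifier_config_dict)

# ...drop an operator you do not want TPOT to consider
# (the operator path shown here is purely an illustration)...
custom_config.pop('sklearn.neighbors.KNeighborsClassifier', None)

# ...and/or narrow the search space for another operator.
custom_config['sklearn.naive_bayes.BernoulliNB'] = {
    'alpha': [1e-2, 1e-1, 1.],
    'fit_prior': [True, False],
}

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict=custom_config)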
Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use template option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of SelectorMixin ), 2nd step is a feature transformer (a subclass of TransformerMixin ) and 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is subclass of TransformerMixin in scikit-leawrn, but Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is a example how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them. 
This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following codes into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on the your Dask cluster. If n_jobs is specified, then it will control the chunk size (10* n_jobs if it is less then offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. 
You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('schedueler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"}]} \ No newline at end of file +{"config":{"lang":["en"],"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. 
mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. 
random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. 
This avoids re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the pipelines on the Pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it's still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in the optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data.
Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything Regression class tpot. 
TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold , as sketched below.
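A rough sketch, using synthetic placeholder data, of passing sample_weight and groups to fit() together with a group-aware cross-validation splitter; the sample sizes, group labels, and weights are arbitrary choices made only for illustration.

import numpy as np
from sklearn.model_selection import GroupKFold
from tpot import TPOTRegressor

# Synthetic placeholder data: 100 samples, 5 features, 4 groups of 25 samples each.
X = np.random.rand(100, 5)
y = np.random.rand(100)
groups = np.repeat(np.arange(4), 25)  # one group label per sample
weights = np.ones(100)                # uniform per-sample weights

# groups is only meaningful with a group-aware CV splitter such as GroupKFold.
tpot = TPOTRegressor(generations=2, population_size=10,
                     cv=GroupKFold(n_splits=4), verbosity=2)
tpot.fit(X, y, sample_weight=weights, groups=groups)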
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"TPOT API"},{"location":"api/#classification","text":"class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to run the pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0.
We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. 
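Before the config_dict option below, a small, hypothetical sketch of the custom scoring callables described above; it relies on sklearn's make_scorer to produce the scorer(estimator, X, y) signature mentioned in this section, and the chosen metric is an arbitrary example.

from sklearn.metrics import fbeta_score, make_scorer
from tpot import TPOTClassifier

# make_scorer wraps a metric function into the scorer(estimator, X, y) form.
f2_scorer = make_scorer(fbeta_score, beta=2)

tpot = TPOTClassifier(generations=5, population_size=20,
                      scoring=f2_scorer, verbosity=2)
# A built-in metric can also be selected by name, e.g. scoring='balanced_accuracy'.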
config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. 
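As a rough illustration of the custom config_dict described above, the sketch below restricts the search to two classifiers; the dictionary layout (operator import paths mapped to hyperparameter ranges) follows the custom-configuration docs, but the specific operators and value ranges are arbitrary examples rather than a recommended search space.

from tpot import TPOTClassifier

# Keys are operator import paths; values map hyperparameter names to candidate values.
custom_config = {
    'sklearn.naive_bayes.GaussianNB': {},
    'sklearn.tree.DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
    },
}

tpot = TPOTClassifier(generations=5, population_size=20,
                      config_dict=custom_config, verbosity=2)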
periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. 
This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Classification"},{"location":"api/#regression","text":"class tpot. 
TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
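Before the remaining attributes, a hedged sketch of the use_dask option described above; it assumes the optional dask and dask-ml packages mentioned in the installation notes are available, and the local Client settings are illustrative only, not requirements.

from dask.distributed import Client
from tpot import TPOTRegressor

# A local Dask cluster; in practice this could point at a remote scheduler instead.
client = Client(n_workers=4, threads_per_worker=1)

tpot = TPOTRegressor(generations=5, population_size=20,
                     use_dask=True, n_jobs=-1, verbosity=2)
# With use_dask=True, repeated fits on identical data splits are avoided and the
# Dask dashboard (client.dashboard_link) provides per-pipeline diagnostics.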
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold .
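Once fit() has returned (see below), the attributes documented earlier in this section can be inspected; the helper below is only a sketch and assumes a TPOT estimator that has already been fitted.

def summarize_tpot_run(tpot):
    """Print a short summary of a fitted TPOT estimator (sketch only)."""
    print(tpot.fitted_pipeline_)  # best pipeline, refit on the full training data
    # Every evaluated pipeline, keyed by its string representation.
    for pipeline_str, stats in list(tpot.evaluated_individuals_.items())[:5]:
        print(pipeline_str, stats)
    # Pareto-front pipelines are only collected when verbosity=3.
    if hasattr(tpot, 'pareto_front_fitted_pipelines_'):
        print(len(tpot.pareto_front_fitted_pipelines_), 'pipelines on the Pareto front')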
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Regression"},{"location":"citing/","text":"If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI:","title":"Citing"},{"location":"contributing/","text":"We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it.
Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. 
Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"Contributing"},{"location":"contributing/#project-layout","text":"The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.","title":"Project layout"},{"location":"contributing/#how-to-contribute","text":"The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. 
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) MNIST digit recognition Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . 
MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#mnist-digit-recognition","text":"Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"MNIST digit recognition"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . 
This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"MAGIC Gamma Telescope"},{"location":"installing/","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. We also strongly recommend that you use Python 3 over Python 2 if you're given the choice. NumPy, SciPy, scikit-learn, pandas and joblib can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit For Windows users, the pywin32 module is required if Python is NOT installed via the Anaconda Python distribution and can be installed with pip for Python version <=3.3 or conda (e.g. miniconda) for any Python version: conda install pywin32 Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delayed] and dask-ml : pip install dask[delayed] dask-ml If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate Finally, to install TPOT itself, run the following command: pip install tpot Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. 
The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. 
Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. 
Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. 
Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. 
Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. 
This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. 
-o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. 
Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. 
--no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. 
To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for use in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format.
In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in the set, and the 3rd column is a list of feature names (if the input X is a pandas.DataFrame) or indexes (if the input X is a numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .
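As a concrete illustration, a minimal sketch of a full TPOT run guarded this way might look like the following; the digits dataset and the specific parameter values are only illustrative assumptions, not a prescribed setup:

import multiprocessing
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    # switch from the default 'fork' to 'forkserver' before any parallel work starts
    multiprocessing.set_start_method('forkserver')
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25)
    # with forkserver enabled, n_jobs > 1 is less likely to freeze on OSX or Linux
    tpot = TPOTClassifier(generations=5, population_size=20, n_jobs=-1, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))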
Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTClassifier(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation. Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. 
TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. 
See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
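For illustration, here is a minimal sketch of this (deprecated) usage that passes a plain metric function straight to the scoring parameter; it reuses the my_custom_accuracy metric and digits data from the example above, and the generation and population sizes are arbitrary placeholders:
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                     train_size=0.75, test_size=0.25)

# A plain metric function with the score_func(y_true, y_pred) signature
def my_custom_accuracy(y_true, y_pred):
    return float(sum(y_pred == y_true)) / len(y_true)

# Passing the metric function directly; TPOT wraps it in a scorer internally and
# emits a deprecation warning, since this style is scheduled for removal in 0.11.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      scoring=my_custom_accuracy)
tpot.fit(X_train, y_train)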
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). 
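On the command line, the equivalent sketch (mirroring the example command shown earlier, with the data path and remaining flags as placeholders) passes the configuration name to -config:
tpot data/mnist.csv -is , -target class -config 'TPOT light' -g 5 -p 20 -cv 5 -s 42 -v 2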
For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. 
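For illustration, a minimal sketch of what such a tpot_classifier_config.py file could contain, reusing the simple naive Bayes configuration above; remember that the dictionary must be assigned to a variable named tpot_config so that TPOT can locate it:
# Contents of tpot_classifier_config.py
# The custom configuration must be named 'tpot_config' for the command-line interface.
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {
    },
    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}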
Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of how to use the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in the set and the 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of indices of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them.
This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux, as scikit-learn does, especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend.
You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"}]} \ No newline at end of file diff --git a/docs/using/index.html b/docs/using/index.html index 1a3e0f7a..94af489a 100644 --- a/docs/using/index.html +++ b/docs/using/index.html @@ -519,7 +519,7 @@
Scoring functions
scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py')@@ -595,7 +595,7 @@
Built-in TPOT configurations
config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') @@ -647,7 +647,7 @@Customizing TPOT's operators config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py')
Command-line users must create a separate
diff --git a/docs_sources/api.md b/docs_sources/api.md index a1106417..473509ac 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -268,7 +268,7 @@ X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` Functions diff --git a/docs_sources/examples.md b/docs_sources/examples.md index 9096fdfa..e83a7f00 100644 --- a/docs_sources/examples.md +++ b/docs_sources/examples.md @@ -6,7 +6,7 @@ belonging to a typical class of machine learning tasks. | Dataset | Task | Task class | Dataset description | Jupyter notebook | | ------- | ----------------------- | ---------------------- |:-------------------:|:------------------------------------------------------------------------------------------:| | Iris | flower classification | classification | [link](https://archive.ics.uci.edu/ml/datasets/iris) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/IRIS.ipynb) | -| MNIST | digit recognition | (image) classification | [link](https://yann.lecun.com/exdb/mnist/) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/MNIST.ipynb) | +| Optical Recognition of Handwritten Digits | digit recognition | (image) classification | [link](https://scikit-learn.org/stable/datasets/index.html#digits-dataset) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Digits.ipynb) | | Boston | housing prices modeling | regression | [link](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html) | N/A | | Titanic | survival analysis | classification | [link](https://www.kaggle.com/c/titanic/data) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb) | | Bank Marketing | subscription prediction | classification | [link](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Portuguese%20Bank%20Marketing/Portuguese%20Bank%20Marketing%20Stratergy.ipynb) | @@ -28,9 +28,9 @@ import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), - iris.target.astype(np.float64), train_size=0.75, test_size=0.25) + iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) -tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') @@ -40,31 +40,34 @@ Running this code should discover a pipeline (exported as `tpot_iris_pipeline.py ```Python import numpy as np - +import pandas as pd from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer +from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), - tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', 
sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=42) +# Average CV score on the training set was: 0.9826086956521738 exported_pipeline = make_pipeline( - Normalizer(), - GaussianNB() + Normalizer(norm="l2"), + KNeighborsClassifier(n_neighbors=5, p=2, weights="distance") ) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) ``` -## MNIST digit recognition +## Digits dataset -Below is a minimal working example with the practice MNIST dataset, which is an _image classification problem_. +Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an _image classification problem_. ```Python from tpot import TPOTClassifier @@ -73,30 +76,41 @@ from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, - train_size=0.75, test_size=0.25) + train_size=0.75, test_size=0.25, random_state=42) -tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` -Running this code should discover a pipeline (exported as `tpot_mnist_pipeline.py`) that achieves about 98% test accuracy: +Running this code should discover a pipeline (exported as `tpot_digits_pipeline.py`) that achieves about 98% test accuracy: ```Python import numpy as np - +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), - tpot_data.dtype.names.index('class'), axis=1) +from sklearn.pipeline import make_pipeline, make_union +from sklearn.preprocessing import PolynomialFeatures +from tpot.builtins import StackingEstimator +from tpot.export_utils import set_param_recursive + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=42) -exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights="distance") +# Average CV score on the training set was: 0.9799428471757372 +exported_pipeline = make_pipeline( + PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), + StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty="l1")), + RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.35000000000000003, min_samples_leaf=20, 
min_samples_split=19, n_estimators=100) +) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) @@ -113,9 +127,9 @@ from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, - train_size=0.75, test_size=0.25) + train_size=0.75, test_size=0.25, random_state=42) -tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) +tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') @@ -125,20 +139,26 @@ Running this code should discover a pipeline (exported as `tpot_boston_pipeline. ```Python import numpy as np - -from sklearn.ensemble import GradientBoostingRegressor +import pandas as pd +from sklearn.ensemble import ExtraTreesRegressor from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures +from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), - tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=42) -exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls", - max_features=0.9, min_samples_leaf=5, - min_samples_split=6) +# Average CV score on the training set was: -10.812040755234403 +exported_pipeline = make_pipeline( + PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), + ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) +) +# Fix random state for all the steps in exported pipeline +set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) diff --git a/docs_sources/using.md b/docs_sources/using.md index a8a55785..82ed0efc 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -384,7 +384,7 @@ tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` - You can pass a metric function with the signature `score_func(y_true, y_pred)` (e.g. `my_custom_accuracy` in the example above), where `y_true` are the true target values and `y_pred` are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with "error" or "loss" in the function name is meant to be minimized (`greater_is_better=False` in [`make_scorer`](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html)), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. @@ -461,7 +461,7 @@ tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` @@ -520,7 +520,7 @@ tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) -tpot.export('tpot_mnist_pipeline.py') +tpot.export('tpot_digits_pipeline.py') ``` Command-line users must create a separate `.py` file with the custom configuration and provide the path to the file to the `tpot` call. For example, if the simple example configuration above is saved in `tpot_classifier_config.py`, that configuration could be used on the command line with the command: diff --git a/tests/export_tests.py b/tests/export_tests.py index 5b715e30..59f9f27d 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -51,9 +51,9 @@ classifier_config_dict[test_operator_key_2] ) -mnist_data = load_digits() +digits_data = load_digits() training_features, testing_features, training_target, testing_target = \ - train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) + train_test_split(digits_data.data.astype(np.float64), digits_data.target.astype(np.float64), random_state=42) tpot_obj = TPOTClassifier() tpot_obj._fit_init() @@ -75,9 +75,9 @@ def test_export_random_ind(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=39) + train_test_split(features, tpot_data['target'], random_state=39) exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False) # Fix random state for all the steps in exported pipeline @@ -130,9 +130,9 @@ def test_export_2(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") @@ -323,9 +323,9 @@ def test_export_pipeline(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) 
exported_pipeline = make_pipeline( make_union( @@ -360,9 +360,9 @@ def test_export_pipeline_2(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") @@ -391,9 +391,9 @@ def test_export_pipeline_3(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), @@ -431,9 +431,9 @@ def test_export_pipeline_4(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( make_union( @@ -468,9 +468,9 @@ def test_export_pipeline_5(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.05, n_estimators=100), threshold=0.05), @@ -503,9 +503,9 @@ def test_export_pipeline_6(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=42) + train_test_split(features, tpot_data['target'], random_state=42) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") # Fix random state for all the steps in exported pipeline @@ -598,9 +598,9 @@ def test_pipeline_score_save(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, 
tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) # Average CV score on the training set was: 0.929813743 exported_pipeline = make_pipeline( @@ -652,9 +652,9 @@ def test_imputer_in_export(): # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) imputer = Imputer(strategy="median") imputer.fit(training_features) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 490628e4..0088833c 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -83,10 +83,10 @@ def closing(arg): else: from contextlib import closing -# Set up the MNIST data set for testing -mnist_data = load_digits() +# Set up the digits data set for testing +digits_data = load_digits() training_features, testing_features, training_target, testing_target = \ - train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) + train_test_split(digits_data.data.astype(np.float64), digits_data.target.astype(np.float64), random_state=42) # Set up test data with missing value features_with_nan = np.copy(training_features) @@ -713,7 +713,7 @@ def test_template_4(): def test_fit_GroupKFold(): """Assert that TPOT properly handles the group parameter when using GroupKFold.""" - # This check tests if the darker MNIST images would generalize to the lighter ones. + # This check tests if the darker digits images would generalize to the lighter ones. 
means = np.mean(training_features, axis=1) groups = means >= np.median(means) diff --git a/tpot/base.py b/tpot/base.py index 61ec6c39..7bfc56e5 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1122,7 +1122,8 @@ def export(self, output_file_name='', data_file_path=''): if output_file_name is not '': with open(output_file_name, 'w') as output_file: output_file.write(to_write) - return to_write + else: + return to_write def _impute_values(self, features): diff --git a/tpot/export_utils.py b/tpot/export_utils.py index b2e373f1..4cf8db47 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -98,9 +98,9 @@ def export_pipeline(exported_pipeline, pipeline_text += """ # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('{}', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['target'].values, random_state={}) + train_test_split(features, tpot_data['target'], random_state={}) """.format(data_file_path, random_state) # Add the imputation step if it was used by TPOT diff --git a/tutorials/MNIST.ipynb b/tutorials/Digits.ipynb similarity index 98% rename from tutorials/MNIST.ipynb rename to tutorials/Digits.ipynb index 8c4387c2..ff4992a5 100644 --- a/tutorials/MNIST.ipynb +++ b/tutorials/Digits.ipynb @@ -198,7 +198,7 @@ }, "outputs": [], "source": [ - "tpot.export('tpot_mnist_pipeline.py')" + "tpot.export('tpot_digits_pipeline.py')" ] }, { @@ -211,7 +211,7 @@ }, "outputs": [], "source": [ - "# %load tpot_mnist_pipeline.py\n", + "# %load tpot_digits_pipeline.py\n", "import numpy as np\n", "\n", "from sklearn.model_selection import train_test_split\n", diff --git a/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb b/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb index 3dee4074..4eb400af 100644 --- a/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb +++ b/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb @@ -934,9 +934,9 @@ "\n", "# NOTE: Make sure that the class is labeled 'target' in the data file\n", "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", - "features = tpot_data.drop('target', axis=1).values\n", + "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_target, testing_target = \\\n", - " train_test_split(features, tpot_data['target'].values, random_state=None)\n", + " train_test_split(features, tpot_data['target'], random_state=None)\n", "\n", "# Average CV score on the training set was:0.853347788745\n", "exported_pipeline = make_pipeline(\n", diff --git a/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py b/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py index 9fc55dae..388f04e3 100644 --- a/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py +++ b/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py @@ -8,9 +8,9 @@ # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, 
tpot_data['target'], random_state=None) # Average CV score on the training set was:0.853347788745 exported_pipeline = make_pipeline( diff --git a/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb b/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb index 005b2f99..cd4c9713 100644 --- a/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb +++ b/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb @@ -925,9 +925,9 @@ "\n", "# NOTE: Make sure that the class is labeled 'target' in the data file\n", "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", - "features = tpot_data.drop('target', axis=1).values\n", + "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_target, testing_target = \\\n", - " train_test_split(features, tpot_data['target'].values, random_state=None)\n", + " train_test_split(features, tpot_data['target'], random_state=None)\n", "\n", "# Average CV score on the training set was:0.913728927925\n", "exported_pipeline = DecisionTreeClassifier(criterion=\"gini\", max_depth=5, min_samples_leaf=16, min_samples_split=8)\n", diff --git a/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py b/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py index da8b3a78..5e737569 100644 --- a/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py +++ b/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py @@ -5,9 +5,9 @@ # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) -features = tpot_data.drop('target', axis=1).values +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['target'].values, random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) # Average CV score on the training set was:0.913728927925 exported_pipeline = DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_leaf=16, min_samples_split=8) From 474d15983f255906073e563932332c34eeb6e60d Mon Sep 17 00:00:00 2001 From: weixuanfu.py
file with the custom configuration and provide the path to the file to thetpot
call. For example, if the simple example configuration above is saved intpot_classifier_config.py
, that configuration could be used on the command line with the command:Date: Tue, 5 Nov 2019 09:51:30 -0500 Subject: [PATCH 27/44] update docs #947 --- README.md | 4 ++-- docs/examples/index.html | 18 ++++++++--------- docs/search/search_index.json | 2 +- docs_sources/examples.md | 6 +++--- tests/export_tests.py | 20 +++++++++---------- tpot/export_utils.py | 2 +- tutorials/Digits.ipynb | 10 +++++----- tutorials/IRIS.ipynb | 10 +++++----- .../MAGIC Gamma Telescope.ipynb | 2 +- .../tpot_MAGIC_Gamma_Telescope_pipeline.py | 2 +- .../Portuguese Bank Marketing Strategy.ipynb | 2 +- .../tpot_marketing_pipeline.py | 2 +- tutorials/Titanic_Kaggle.ipynb | 10 +++++----- tutorials/tpot_iris_pipeline.py | 10 +++++----- tutorials/tpot_mnist_pipeline.py | 10 +++++----- tutorials/tpot_titanic_pipeline.py | 10 +++++----- 16 files changed, 60 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 40251369..949af676 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ @@ -134,7 +134,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ diff --git a/docs/examples/index.html b/docs/examples/index.html index 5ceffe11..063c117a 100644 --- a/docs/examples/index.html +++ b/docs/examples/index.html @@ -243,12 +243,12 @@ Iris flower classification
from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), @@ -281,12 +281,12 @@MNIST digit recognition
from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights="distance") @@ -316,12 +316,12 @@Boston housing prices modeling
from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls", max_features=0.9, min_samples_leaf=5, diff --git a/docs/search/search_index.json b/docs/search/search_index.json index b0e5bcf8..ee592d04 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. 
TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. 
max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimizations. This avoids re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the best pipelines on the Pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it is still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks for improvement in the optimization process. TPOT ends the optimization process if there is no improvement within the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
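The configuration-related options above can be combined when constructing a TPOT estimator. The sketch below is illustrative only and relies solely on the documented parameters; the checkpoint folder name is a placeholder:
from tpot import TPOTClassifier

tpot = TPOTClassifier(
    config_dict='TPOT light',                            # built-in configuration with only fast models and preprocessors
    template='SelectPercentile-Transformer-Classifier',  # linear pipeline structure, steps delimited by '-'
    memory='auto',                                       # cache fitted transformers in a temporary directory
    periodic_checkpoint_folder='tpot_checkpoints',       # placeholder path for saving Pareto-front pipelines
    early_stop=10,                                       # stop if 10 generations pass without improvement
    verbosity=2,
)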
Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. 
Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything Regression class tpot. TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to run the pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. 
This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. 
Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default, the value of template is None and TPOT generates tree-based pipelines randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, the pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during the optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimizations. This avoids re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the best pipelines on the Pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. 
Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it is still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks for improvement in the optimization process. TPOT ends the optimization process if there is no improvement within the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split digits = load_boston() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. 
As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"TPOT API"},{"location":"api/#classification","text":"class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to run the pipeline optimization process. Must be a positive number. 
Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. 
Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. 
This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
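For the memory and warm_start options described above, one possible usage is sketched below; this assumes an explicit joblib.Memory instance is wanted, and the cache directory is a placeholder that TPOT will not clean up:
from tempfile import mkdtemp
from joblib import Memory
from tpot import TPOTClassifier

cachedir = mkdtemp()                           # placeholder caching directory; TPOT will NOT clean it up on shutdown
memory = Memory(location=cachedir, verbose=0)  # joblib.Memory instance passed straight to TPOT

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    memory=memory,       # reuse fitted transformers across identical pipeline steps
    warm_start=True,     # a later fit() call reuses the existing population
    verbosity=2,
)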
Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. 
Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Classification"},{"location":"api/#regression","text":"class tpot. TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. 
This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. 
Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. 
Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it is still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks for improvement in the optimization process. TPOT ends the optimization process if there is no improvement within the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split digits = load_boston() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. 
As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Regression"},{"location":"citing/","text":"If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. 
BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI:","title":"Citing"},{"location":"contributing/","text":"We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it. Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. 
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"Contributing"},{"location":"contributing/#project-layout","text":"The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. 
In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.","title":"Project layout"},{"location":"contributing/#how-to-contribute","text":"The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. 
If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. 
from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) MNIST digit recognition Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. 
from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#mnist-digit-recognition","text":"Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"MNIST digit recognition"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see TPOT applied to the Titanic Kaggle dataset, see the Jupyter notebook here. This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here.","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here.","title":"MAGIC Gamma Telescope"},{"location":"installing/","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution, which we strongly recommend that you use. We also strongly recommend that you use Python 3 over Python 2 if you're given the choice. NumPy, SciPy, scikit-learn, pandas and joblib can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit For Windows users, the pywin32 module is required if Python is NOT installed via the Anaconda Python distribution and can be installed with pip for Python version <=3.3 or conda (e.g. miniconda) for any Python version: conda install pywin32 Optionally, you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . 
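If you want to confirm that the optional XGBoost dependency is actually visible to the Python environment you will run TPOT in, a quick check like the sketch below can help; nothing here is TPOT-specific, it only tests that the package imports.

# Quick sanity check that the optional XGBoost dependency is importable.
try:
    import xgboost
    print('XGBoost version:', xgboost.__version__)
except ImportError:
    print('XGBoost is not installed; TPOT will simply not use XGBoost-based operators.')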
If you plan to use Dask for parallel training, make sure to install dask[delayed] and dask-ml. pip install dask[delayed] dask-ml If you plan to use the TPOT-MDR configuration, make sure to install scikit-mdr and scikit-rebate: pip install scikit-mdr skrebate Finally, to install TPOT itself, run the following command: pip install tpot Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder, that allows TPOT to periodically save the best pipeline so far to a local folder during the optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from. We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv. NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug where DEFAULT in the parameter(s) of a nested estimator raised a KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipelines containing SelectFromModel (with ExtraTreesClassifier as the nested estimator) or StackingEstimator if the nested estimator has a random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead of rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. 
TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample=0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations, including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_, pareto_front_fitted_pipelines_, and evaluated_individuals_. These attributes are described in the API documentation. Oh, TPOT now has thorough API documentation. Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 non-compliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process, which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation, along with a list of built-in configurations. TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in the optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm. This algorithm gives you more control over how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some of your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights). The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you no longer need to guess how long TPOT will take to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. 
TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. 
TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . 
This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. 
Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. 
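As a concrete illustration of the failure mode mentioned above, the following sketch (which assumes hypothetical X_train, y_train, X_test and y_test variables already exist) runs TPOT under a very tight time budget and falls back gracefully if no pipeline could be evaluated in time:

from tpot import TPOTClassifier

# Hypothetical data: X_train, y_train, X_test, y_test are assumed to be defined elsewhere.
tpot = TPOTClassifier(max_time_mins=2, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)

try:
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_short_run_pipeline.py')
except RuntimeError as e:
    # Raised when no pipeline finished evaluating within the time budget.
    print(e)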
AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. 
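For example (a minimal sketch that reuses the pipeline_optimizer fitted above; X_test is assumed to be a held-out feature matrix), the fitted instance can be used like any scikit-learn estimator, and the winning pipeline can be inspected directly:

# Predict with the fitted TPOT instance, just like a scikit-learn estimator.
predictions = pipeline_optimizer.predict(X_test)

# The best pipeline found by TPOT is exposed as a scikit-learn Pipeline object.
print(pipeline_optimizer.fitted_pipeline_)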
You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. 
-scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (greater_is_better=False in make_scorer), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from the sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. 
The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. 
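As a rough sketch of what such a file could contain (reusing the Naive Bayes configuration from the example above; the file name tpot_classifier_config.py matches the command-line example that follows), the configuration file only needs to assign the dictionary to a variable named tpot_config:

# tpot_classifier_config.py -- a custom TPOT configuration for the command-line interface.
# The dictionary must be assigned to a variable named tpot_config so TPOT can locate it.
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {
    },
    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}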
For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template but Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in that set and the 3rd column is a list of feature names (if input X is a pandas.DataFrame) or indexes (if input X is a numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details.
from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of indexes of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or a joblib.Memory object in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. However, it may crash/freeze with n_jobs > 1 under OSX or Linux, as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation . Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster.
If n_jobs is specified, then it will control the chunk size (10 * n_jobs , if that is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation. Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. 
TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. 
See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
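As a minimal sketch of that usage (reusing my_custom_accuracy from the example above), a bare metric function can be passed to the scoring parameter directly, without wrapping it in make_scorer; note the caveats about this scoring type in the paragraph that follows:

from tpot import TPOTClassifier

# A metric function with the score_func(y_true, y_pred) signature described above
def my_custom_accuracy(y_true, y_pred):
    return float(sum(y_pred == y_true)) / len(y_true)

# Passing the metric function directly; TPOT infers whether to minimize or maximize
# it from the function name (see the note on 'error'/'loss' below).
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      scoring=my_custom_accuracy)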
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). 
For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. 
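Custom configurations are not limited to scikit-learn's own estimators: any estimator or transformer that follows the scikit-learn API can be referenced by its full import path. As an illustrative sketch (the hyperparameter ranges below are examples chosen for this sketch, not TPOT's built-in defaults), an entry for XGBoost could look like:

tpot_config = {
    # Third-party estimator referenced by its full import path; it must follow
    # the scikit-learn API. Hyperparameter ranges here are illustrative only.
    'xgboost.XGBClassifier': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-2, 1e-1, 0.5, 1.]
    }
}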
Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template but Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in that set and the 3rd column is a list of feature names (if input X is a pandas.DataFrame) or indexes (if input X is a numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of indexes of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them.
This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or a joblib.Memory object in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. However, it may crash/freeze with n_jobs > 1 under OSX or Linux, as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs , if that is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend.
You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('schedueler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"}]} \ No newline at end of file +{"config":{"lang":["en"],"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. 
mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. 
random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. 
This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. 
Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything Regression class tpot. 
TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
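As an illustration of how this attribute can be reused once fit() has finished, here is a minimal sketch on a synthetic dataset (the synthetic data and the tiny search budget are assumptions made purely to keep the example short):

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Small synthetic regression dataset, purely for illustration.
X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=2, population_size=10, verbosity=0, random_state=42)
tpot.fit(X_train, y_train)

# fitted_pipeline_ is a plain scikit-learn Pipeline fitted on the full training set,
# so it can be inspected, used for prediction, or persisted like any other estimator.
best_pipeline = tpot.fitted_pipeline_
print(best_pipeline.steps)
print(best_pipeline.predict(X_test[:5]))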
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect TPOT's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold .
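For example, group-aware cross-validation can be set up by passing a GroupKFold object as cv and the per-sample group labels to fit(); the synthetic data and group assignment below are assumptions for illustration only:

import numpy as np
from sklearn.model_selection import GroupKFold
from tpot import TPOTRegressor

X = np.random.rand(100, 5)
y = np.random.rand(100)
groups = np.repeat(np.arange(10), 10)  # 10 groups of 10 samples each

# With GroupKFold, samples that share a group label never end up in both the
# training and validation folds of the internal cross-validation.
tpot = TPOTRegressor(generations=2, population_size=10, cv=GroupKFold(n_splits=5), verbosity=0)
tpot.fit(X, y, groups=groups)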
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: score : float The estimated test set score according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"TPOT API"},{"location":"api/#classification","text":"class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to run the pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations × offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0.
We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. 
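Putting several of the parameters above together, a time-budgeted, parallel, and reproducible search might be configured as in the following sketch (the specific values are illustrative, not recommendations):

from tpot import TPOTClassifier

tpot = TPOTClassifier(
    max_time_mins=60,       # stop the whole search after roughly one hour
    max_eval_time_mins=5,   # give up on any single pipeline that takes longer than 5 minutes
    n_jobs=-1,              # evaluate candidate pipelines on all available cores
    random_state=42,        # same seed + same data -> same resulting pipeline
    verbosity=2,
)
# tpot.fit(X_train, y_train) would then run within the configured time budget.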
config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. 
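A sketch of the use_dask workflow is shown below; it assumes that dask, distributed, and dask-ml are installed, and the LocalCluster settings are arbitrary illustrative choices:

from dask.distributed import Client, LocalCluster
from tpot import TPOTClassifier

# Start a local Dask scheduler; with use_dask=True TPOT delegates pipeline
# evaluation to the active Dask client.
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

tpot = TPOTClassifier(generations=5, population_size=20, use_dask=True, n_jobs=-1, verbosity=2)
# tpot.fit(X_train, y_train) would then distribute the cross-validation work via Dask.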
periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. 
This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Classification"},{"location":"api/#regression","text":"class tpot. 
TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) Number of iterations to the run pipeline optimization process. Must be a positive number. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split digits = load_boston() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . 
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file Returns: Does not return anything","title":"Regression"},{"location":"citing/","text":"If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI:","title":"Citing"},{"location":"contributing/","text":"We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it. 
Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is set up correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running a script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check that your changes haven't broken any existing tests and that any new tests you've added pass, run the following (note: you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check that all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes.
Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"Contributing"},{"location":"contributing/#project-layout","text":"The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.","title":"Project layout"},{"location":"contributing/#how-to-contribute","text":"The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. 
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) MNIST digit recognition Below is a minimal working example with the practice MNIST dataset, which is an image classification problem .
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves a mean squared error (MSE) of about 10 or lower on the test set: import numpy as np import pandas as pd from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see TPOT applied to the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks.
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link MNIST digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = make_pipeline( Normalizer(), GaussianNB() ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#mnist-digit-recognition","text":"Below is a minimal working example with the practice MNIST dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights=\"distance\") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"MNIST digit recognition"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import train_test_split # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['class'], random_state=None) exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss=\"ls\", max_features=0.9, min_samples_leaf=5, min_samples_split=6) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . 
This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"MAGIC Gamma Telescope"},{"location":"installing/","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. We also strongly recommend that you use Python 3 over Python 2 if you're given the choice. NumPy, SciPy, scikit-learn, pandas and joblib can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit For Windows users , the pywin32 module is required if Python is NOT installed via the Anaconda Python distribution; it can be installed with pip for Python version <=3.3 or with conda (e.g. miniconda) for any Python version: conda install pywin32 Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delayed] and dask-ml . pip install dask[delayed] dask-ml If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate Finally, to install TPOT itself, run the following command: pip install tpot Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. 
The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. 
Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. 
Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. 
Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. 
Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. 
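As a concrete aside (a minimal sketch, not taken verbatim from the TPOT documentation; the dataset and parameter values are only illustrative), the random_state and warm_start parameters are how you control this behaviour in code: fixing random_state makes a single run reproducible, while warm_start lets a later call to fit() continue from the population of the previous call instead of restarting the search from scratch.
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42)

# Fixing random_state makes this run repeatable; leaving it unset is why repeated runs can recommend different pipelines.
tpot = TPOTClassifier(generations=2, population_size=20, random_state=42, warm_start=True, verbosity=2)
tpot.fit(X_train, y_train)  # a first, short run

# Because warm_start=True, this second call continues the search from the existing population rather than restarting it.
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Either way, with the same seed and data you should get the same pipeline back, and without a seed you should expect some run-to-run variation in the recommended pipeline.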
This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. 
-o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. 
Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. 
--no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. 
To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of how to use the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template and Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for use in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format.
In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in that set, and the 3rd column is a list of feature names (if input X is a pandas.DataFrame) or indexes (if input X is a numpy.ndarray) delimited by \";\" (a short illustrative sketch of such a file is shown further below, after the multiprocessing note). Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # each pipeline selects only one feature set; this is a list of indexes of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does, especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .
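Returning briefly to the FeatureSetSelector subset file described earlier in this section: below is a purely illustrative sketch of what such a csv file might look like and how it could be written with pandas. The file name, feature-set names and feature names are all made up, and the column headers (Subset, Size, Features) are an assumption modeled on the subset_test.csv file linked in the example above; check that file for the exact format TPOT expects.
import pandas as pd

# Hypothetical subset file with the three columns described earlier:
# the feature set name, the number of features in that set, and a ";"-delimited list of feature names.
subsets = pd.DataFrame({
    'Subset': ['immune_genes', 'metabolic_genes'],            # made-up feature set names
    'Size': [3, 2],                                           # number of features in each set
    'Features': ['GENE_A;GENE_B;GENE_C', 'GENE_D;GENE_E'],    # ";"-delimited feature names (or column indexes)
})
subsets.to_csv('my_feature_subsets.csv', index=False)

# This local path could then be used in place of the subset_test.csv URL shown earlier, e.g.
# 'subset_list': ['my_feature_subsets.csv'], 'sel_subset': [0, 1]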
Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation. Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. 
TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. 
See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') You can pass a metric function with the signature score_func(y_true, y_pred) (e.g. my_custom_accuracy in the example above), where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
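To make the scorer(estimator, X, y) form described above concrete, here is a minimal sketch; the function name and the choice of balanced accuracy are our own illustrative assumptions, not part of TPOT's documentation. The callable receives an already-fitted estimator plus data and returns a single float that TPOT will maximize.

from tpot import TPOTClassifier
from sklearn.metrics import balanced_accuracy_score

# A scorer with the signature scorer(estimator, X, y): predict with the
# fitted estimator and return one float score to be maximized.
def balanced_accuracy_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return balanced_accuracy_score(y, y_pred)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      scoring=balanced_accuracy_scorer)
# tpot.fit(X_train, y_train) would then optimize pipelines against this scorer.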
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized ( greater_is_better=False in make_scorer ), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). 
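On the command line, the equivalent is to pass the configuration name as a string to -config (quoted here because it contains a space). The data path and remaining options below are placeholders reused from the earlier command-line example, so treat this as a sketch rather than a prescribed invocation:

tpot data/mnist.csv -is , -target class -config 'TPOT light' -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py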
For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. 
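As a concrete sketch of the command-line workflow described above, the file tpot_classifier_config.py could contain nothing more than the example dictionary, bound to the required name tpot_config (the file name matches the earlier example; the contents are the same naive Bayes configuration):

# Contents of tpot_classifier_config.py; the dictionary must be named tpot_config
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {
    },
    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}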
Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of how to use the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template and Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for use in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in that set and the 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set; a list of candidate indexes of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them.
This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does, especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend.
You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('schedueler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"}]} \ No newline at end of file diff --git a/docs_sources/examples.md b/docs_sources/examples.md index e83a7f00..b3dcb702 100644 --- a/docs_sources/examples.md +++ b/docs_sources/examples.md @@ -47,7 +47,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ @@ -97,7 +97,7 @@ from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ @@ -146,7 +146,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ diff --git a/tests/export_tests.py b/tests/export_tests.py index 59f9f27d..bb39b90f 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -73,7 +73,7 @@ def test_export_random_ind(): from sklearn.naive_bayes import BernoulliNB from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -128,7 +128,7 @@ def test_export_2(): from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -321,7 +321,7 @@ def test_export_pipeline(): from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator -# NOTE: Make 
sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -358,7 +358,7 @@ def test_export_pipeline_2(): from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -389,7 +389,7 @@ def test_export_pipeline_3(): from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -429,7 +429,7 @@ def test_export_pipeline_4(): from sklearn.preprocessing import FunctionTransformer from copy import copy -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -466,7 +466,7 @@ def test_export_pipeline_5(): from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeRegressor -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -501,7 +501,7 @@ def test_export_pipeline_6(): from sklearn.neighbors import KNeighborsClassifier from tpot.export_utils import set_param_recursive -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -596,7 +596,7 @@ def test_pipeline_score_save(): from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ @@ -650,7 +650,7 @@ def test_imputer_in_export(): except ImportError: from sklearn.preprocessing import Imputer -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is 
labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 4cf8db47..51a37305 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -96,7 +96,7 @@ def export_pipeline(exported_pipeline, data_file_path = 'PATH/TO/DATA/FILE' pipeline_text += """ -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('{}', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ diff --git a/tutorials/Digits.ipynb b/tutorials/Digits.ipynb index ff4992a5..5821c3e3 100644 --- a/tutorials/Digits.ipynb +++ b/tutorials/Digits.ipynb @@ -213,15 +213,15 @@ "source": [ "# %load tpot_digits_pipeline.py\n", "import numpy as np\n", - "\n", + "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.neighbors import KNeighborsClassifier\n", "\n", - "# NOTE: Make sure that the class is labeled 'class' in the data file\n", - "tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)\n", - "features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", + "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_classes, testing_classes = \\\n", - " train_test_split(features, tpot_data['class'], random_state=None)\n", + " train_test_split(features, tpot_data['target'], random_state=None)\n", "\n", "exported_pipeline = KNeighborsClassifier(n_neighbors=4, p=2, weights=\"distance\")\n", "\n", diff --git a/tutorials/IRIS.ipynb b/tutorials/IRIS.ipynb index 8b1a88cf..9ce9729a 100644 --- a/tutorials/IRIS.ipynb +++ b/tutorials/IRIS.ipynb @@ -403,17 +403,17 @@ "source": [ "# %load tpot_iris_pipeline.py\n", "import numpy as np\n", - "\n", + "import pandas as pd\n", "from sklearn.kernel_approximation import RBFSampler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", - "# NOTE: Make sure that the class is labeled 'class' in the data file\n", - "tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)\n", - "features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", + "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_classes, testing_classes = \\\n", - " train_test_split(features, tpot_data['class'], random_state=None)\n", + " train_test_split(features, tpot_data['target'], random_state=None)\n", "\n", "exported_pipeline = make_pipeline(\n", " RBFSampler(gamma=0.8500000000000001),\n", diff --git a/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb b/tutorials/MAGIC 
Gamma Telescope/MAGIC Gamma Telescope.ipynb index 4eb400af..4d857f1a 100644 --- a/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb +++ b/tutorials/MAGIC Gamma Telescope/MAGIC Gamma Telescope.ipynb @@ -932,7 +932,7 @@ "from sklearn.tree import DecisionTreeClassifier\n", "from tpot.builtins import StackingEstimator\n", "\n", - "# NOTE: Make sure that the class is labeled 'target' in the data file\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_target, testing_target = \\\n", diff --git a/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py b/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py index 388f04e3..208553c0 100644 --- a/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py +++ b/tutorials/MAGIC Gamma Telescope/tpot_MAGIC_Gamma_Telescope_pipeline.py @@ -6,7 +6,7 @@ from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ diff --git a/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb b/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb index cd4c9713..b2a5f1d8 100644 --- a/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb +++ b/tutorials/Portuguese Bank Marketing/Portuguese Bank Marketing Strategy.ipynb @@ -923,7 +923,7 @@ "from sklearn.model_selection import train_test_split\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", - "# NOTE: Make sure that the class is labeled 'target' in the data file\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_target, testing_target = \\\n", diff --git a/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py b/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py index 5e737569..8f8bed8c 100644 --- a/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py +++ b/tutorials/Portuguese Bank Marketing/tpot_marketing_pipeline.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier -# NOTE: Make sure that the class is labeled 'target' in the data file +# NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ diff --git a/tutorials/Titanic_Kaggle.ipynb b/tutorials/Titanic_Kaggle.ipynb index 7b3c29f7..acc690f9 100644 --- a/tutorials/Titanic_Kaggle.ipynb +++ b/tutorials/Titanic_Kaggle.ipynb @@ -1065,15 +1065,15 @@ "source": [ "# %load tpot_titanic_pipeline.py\n", "import numpy as np\n", - "\n", + "import pandas as pd\n", "from sklearn.ensemble import 
RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# NOTE: Make sure that the class is labeled 'class' in the data file\n", - "tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)\n", - "features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", + "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = tpot_data.drop('target', axis=1)\n", "training_features, testing_features, training_classes, testing_classes = \\\n", - " train_test_split(features, tpot_data['class'], random_state=None)\n", + " train_test_split(features, tpot_data['target'], random_state=None)\n", "\n", "exported_pipeline = RandomForestClassifier(bootstrap=False, max_features=0.4, min_samples_leaf=1, min_samples_split=9)\n", "\n", diff --git a/tutorials/tpot_iris_pipeline.py b/tutorials/tpot_iris_pipeline.py index c85875f2..2ded0bd5 100644 --- a/tutorials/tpot_iris_pipeline.py +++ b/tutorials/tpot_iris_pipeline.py @@ -1,15 +1,15 @@ import numpy as np - +import pandas as pd from sklearn.kernel_approximation import RBFSampler from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_classes, testing_classes = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( RBFSampler(gamma=0.8500000000000001), diff --git a/tutorials/tpot_mnist_pipeline.py b/tutorials/tpot_mnist_pipeline.py index afa65109..1659c833 100644 --- a/tutorials/tpot_mnist_pipeline.py +++ b/tutorials/tpot_mnist_pipeline.py @@ -1,13 +1,13 @@ import numpy as np - +import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_classes, testing_classes = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=4, p=2, weights="distance") diff --git a/tutorials/tpot_titanic_pipeline.py b/tutorials/tpot_titanic_pipeline.py index 1a0fa8d0..81f3e89c 100644 --- a/tutorials/tpot_titanic_pipeline.py +++ 
b/tutorials/tpot_titanic_pipeline.py @@ -1,13 +1,13 @@ import numpy as np - +import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the outcome column is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1) training_features, testing_features, training_classes, testing_classes = \ - train_test_split(features, tpot_data['class'], random_state=None) + train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = RandomForestClassifier(bootstrap=False, max_features=0.4, min_samples_leaf=1, min_samples_split=9) From 5bbd944ea22ec7735139b4899fe19a53e4650066 Mon Sep 17 00:00:00 2001 From: weixuanfuDate: Tue, 5 Nov 2019 10:27:36 -0500 Subject: [PATCH 28/44] fix the bug that warm_start is not working when max_time_mins is not default #946 --- tests/tpot_tests.py | 25 ++++++++++++++++++++++++- tpot/base.py | 9 +++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 0088833c..c2bc18cd 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -964,12 +964,35 @@ def test_fit_4(): tpot_obj.generations == 20 tpot_obj.fit(training_features, training_target) - + assert tpot_obj._pop == [] assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) assert not (tpot_obj._start_datetime is None) def test_fit_5(): + """Assert that the TPOT fit function provides an optimized pipeline with max_time_mins of 2 second.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=2, + generations=None, + verbosity=0, + max_time_mins=2/60., + config_dict='TPOT light', + warm_start=True + ) + tpot_obj._fit_init() + assert tpot_obj.generations == 1000000 + + # reset generations to 20 just in case that the failed test may take too much time + tpot_obj.generations == 20 + + tpot_obj.fit(training_features, training_target) + assert tpot_obj._pop != [] + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + +def test_fit_6(): """Assert that the TPOT fit function provides an optimized pipeline with pandas DataFrame""" tpot_obj = TPOTClassifier( random_state=42, diff --git a/tpot/base.py b/tpot/base.py index 7bfc56e5..efe1b6be 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -752,10 +752,6 @@ def pareto_eq(ind1, ind2): per_generation_function=self._check_periodic_pipeline ) - # store population for the next call - if self.warm_start: - self._pop = pop - # Allow for certain exceptions to signal a premature fit() cancellation except (KeyboardInterrupt, SystemExit, StopIteration) as e: if self.verbosity > 0: @@ -763,6 +759,9 @@ def pareto_eq(ind1, ind2): self._pbar.write('{}\nTPOT closed prematurely. 
Will use the current best pipeline.'.format(e), file=self._file) finally: + # clean population for the next call if warm_start=False + if not self.warm_start: + self._pop = [] # keep trying 10 times in case weird things happened like multiple CTRL+C or exceptions attempts = 10 for attempt in range(attempts): @@ -1383,6 +1382,8 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None ind.fitness.values = (5000.,-float('inf')) self._pareto_front.update(population) + print(111) + self._pop = population raise KeyboardInterrupt self._update_evaluated_individuals_(result_score_list, eval_individuals_str, operator_counts, stats_dicts) From 0def2d34ebffce9dd2d3d79e3fe6ba2d3b97184f Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 11:17:06 -0500 Subject: [PATCH 29/44] clean codes --- tpot/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index efe1b6be..9b003ce7 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1382,7 +1382,7 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None ind.fitness.values = (5000.,-float('inf')) self._pareto_front.update(population) - print(111) + self._pop = population raise KeyboardInterrupt From 73cec4e91b4d6ced8a987170349988d0cc0adf88 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 12:05:44 -0500 Subject: [PATCH 30/44] add rerun to test_fit_5 --- tests/tpot_tests.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index c2bc18cd..d8f878e8 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -970,13 +970,13 @@ def test_fit_4(): def test_fit_5(): - """Assert that the TPOT fit function provides an optimized pipeline with max_time_mins of 2 second.""" + """Assert that the TPOT fit function provides an optimized pipeline with max_time_mins of 2 second with warm_start=True.""" tpot_obj = TPOTClassifier( random_state=42, population_size=2, generations=None, verbosity=0, - max_time_mins=2/60., + max_time_mins=3/60., config_dict='TPOT light', warm_start=True ) @@ -990,6 +990,10 @@ def test_fit_5(): assert tpot_obj._pop != [] assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) assert not (tpot_obj._start_datetime is None) + # rerun it + tpot_obj.fit(training_features, training_target) + assert tpot_obj._pop != [] + def test_fit_6(): From cc42dfa0e2808a96e9022fac7da980fad13d124a Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 12:14:21 -0500 Subject: [PATCH 31/44] stacking_estimator should not stack nan/infinity prediction proba #893 --- tpot/builtins/stacking_estimator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tpot/builtins/stacking_estimator.py b/tpot/builtins/stacking_estimator.py index ee2abfb5..e742e5a5 100644 --- a/tpot/builtins/stacking_estimator.py +++ b/tpot/builtins/stacking_estimator.py @@ -84,7 +84,10 @@ def transform(self, X): X_transformed = np.copy(X) # add class probabilities as a synthetic feature if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'): - X_transformed = np.hstack((self.estimator.predict_proba(X), X)) + y_pred_proba = self.estimator.predict_proba(X) + # check all values that should be not infinity or not NAN + if np.all(np.isfinite(y_pred_proba)): + X_transformed = np.hstack((y_pred_proba, X)) # add class prodiction as a synthetic feature X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed)) From 
a0c0c40afaa1d0cf927e8eb62c90eddfb4a5059a Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 12:40:23 -0500 Subject: [PATCH 32/44] TPOT rasie ValueError when template parameter is invalid. #898 --- tests/tpot_tests.py | 11 +++++++++++ tpot/base.py | 32 +++++++++++++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index d8f878e8..2f8915dc 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -711,6 +711,17 @@ def test_template_4(): assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) +def test_template_5(): + """Assert that TPOT rasie ValueError when template parameter is invalid.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='SelectPercentile-Transformer-Classifie' # a typ in Classifier + ) + assert_raises(ValueError, tpot_obj._fit_init) + + def test_fit_GroupKFold(): """Assert that TPOT properly handles the group parameter when using GroupKFold.""" # This check tests if the darker digits images would generalize to the lighter ones. diff --git a/tpot/base.py b/tpot/base.py index 9b003ce7..819087d5 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -460,22 +460,28 @@ def _add_operators(self): ret_types.append(step_ret_type) else: step_ret_type = Output_Array + check_template = True if step == 'CombineDFs': self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type - for operator in self.operators: + ops = [op for op in self.operators if op.type() == step] + for operator in ops: arg_types = operator.parameter_types()[0][1:] - if operator.type() == step: - p_types = ([step_in_type] + arg_types, step_ret_type) - self._pset.addPrimitive(operator, *p_types) - self._import_hash_and_add_terminals(operator, arg_types) - else: # is the step is a specific operator - for operator in self.operators: - arg_types = operator.parameter_types()[0][1:] - if operator.__name__ == step: - p_types = ([step_in_type] + arg_types, step_ret_type) - self._pset.addPrimitive(operator, *p_types) - self._import_hash_and_add_terminals(operator, arg_types) + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) + else: # is the step is a specific operator or a wrong input + try: + operator = next(op for op in self.operators if op.__name__ == step) + except: + raise ValueError( + 'An error occured while attempting to read the specified ' + 'template. 
Please check a step named {}'.format(step) + ) + arg_types = operator.parameter_types()[0][1:] + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) self.ret_types = [np.ndarray, Output_Array] + ret_types @@ -1382,7 +1388,7 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None ind.fitness.values = (5000.,-float('inf')) self._pareto_front.update(population) - + self._pop = population raise KeyboardInterrupt From a41a4941bdae82817e1f9388385c1067493535d2 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 13:30:02 -0500 Subject: [PATCH 33/44] remove support for py2.7 and py version < 3.5 --- .appveyor.yml | 2 +- README.md | 3 +-- docs_sources/installing.md | 6 ------ requirements.txt | 18 +++++++++--------- setup.py | 22 +++++++++------------- tpot/_version.py | 2 +- tpot/base.py | 22 ---------------------- 7 files changed, 21 insertions(+), 54 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 8f3ddebb..dfa050c8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -18,7 +18,7 @@ install: - conda config --set always_yes yes --set changeps1 no - conda update -q conda - conda info -a - - conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy scikit-learn nose cython pandas pywin32 joblib + - conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy scikit-learn nose cython pandas joblib - activate test-environment - pip install deap tqdm update_checker stopit xgboost dask[delayed] dask[dataframe] cloudpickle==0.5.6 fsspec>=0.3.3 dask_ml==%DASK_ML_VERSION% diff --git a/README.md b/README.md index 949af676..02c6bc0b 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,7 @@ Development status: [![Development Build Status - Mac/Linux](https://travis-ci.o [![Development Build Status - Windows](https://ci.appveyor.com/api/projects/status/b7bmpwpkjhifrm7v/branch/development?svg=true)](https://ci.appveyor.com/project/weixuanfu/tpot?branch=development) [![Development Coverage Status](https://coveralls.io/repos/github/EpistasisLab/tpot/badge.svg?branch=development)](https://coveralls.io/github/EpistasisLab/tpot?branch=development) -Package information: [![Python 2.7](https://img.shields.io/badge/python-2.7-blue.svg)](https://www.python.org/download/releases/2.7/) -[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) +Package information: [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) [![License: LGPL v3](https://img.shields.io/badge/license-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) [![PyPI version](https://badge.fury.io/py/TPOT.svg)](https://badge.fury.io/py/TPOT) diff --git a/docs_sources/installing.md b/docs_sources/installing.md index 016ce433..e1014e2f 100644 --- a/docs_sources/installing.md +++ b/docs_sources/installing.md @@ -32,12 +32,6 @@ DEAP, update_checker, tqdm and stopit can be installed with `pip` via the comman pip install deap update_checker tqdm stopit ``` -**For the Windows users**, the pywin32 module is required if Python is NOT installed via the [Anaconda Python distribution](https://www.continuum.io/downloads) and can be installed with `pip` for Python version <=3.3 or `conda` (e.g. 
miniconda) for any Python version: - -```Shell -conda install pywin32 -``` - **Optionally**, you can install [XGBoost](https://github.com/dmlc/xgboost) if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. **Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors.** ```Shell diff --git a/requirements.txt b/requirements.txt index 616ce627..365b8e50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -deap==1.0.2.post2 +deap>=1.2 nose==1.3.7 -numpy==1.12.1 -scikit-learn==0.18.1 -scipy==0.19.0 -tqdm==4.26.0 -update-checker==0.16 -stopit==1.1.1 -pandas==0.20.2 -joblib==0.10.3 +numpy>=1.16.3 +scikit-learn>=0.21.3 +scipy>=1.3.1 +tqdm>=4.36.1 +update-checker>=0.16 +stopit>=1.1.1 +pandas>=0.24.2 +joblib>=0.13.2 diff --git a/setup.py b/setup.py index a6c0980e..ff075ea4 100644 --- a/setup.py +++ b/setup.py @@ -35,30 +35,26 @@ def calculate_version(): This project is hosted at https://github.com/EpistasisLab/tpot ''', zip_safe=True, - install_requires=['numpy>=1.12.1', - 'scipy>=0.19.0', - 'scikit-learn>=0.18.1', - 'deap>=1.0', + install_requires=['numpy>=1.16.3', + 'scipy>=1.3.1', + 'scikit-learn>=0.21.3', + 'deap>=1.2', 'update_checker>=0.16', - 'tqdm>=4.26.0', + 'tqdm>=4.36.1', 'stopit>=1.1.1', - 'pandas>=0.20.2', - 'joblib>=0.10.3'], + 'pandas>=0.24.2', + 'joblib>=0.13.2'], extras_require={ - 'xgboost': ['xgboost==0.6a2'], + 'xgboost': ['xgboost==0.90'], 'skrebate': ['skrebate>=0.3.4'], 'mdr': ['scikit-mdr>=0.4.4'], 'dask': ['dask>=0.18.2', 'distributed>=1.22.1', - 'dask-ml>=0.9.0'], + 'dask-ml>=1.0.0'], }, classifiers=[ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', diff --git a/tpot/_version.py b/tpot/_version.py index 8b320b7d..c56b024a 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -23,4 +23,4 @@ """ -__version__ = '0.10.2' +__version__ = '0.11' diff --git a/tpot/base.py b/tpot/base.py index 819087d5..3627309a 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -81,28 +81,6 @@ warnings.simplefilter('ignore') from tqdm.autonotebook import tqdm -# hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS -# https://github.com/ContinuumIO/anaconda-issues/issues/905 -if sys.platform.startswith('win'): - import win32api - - try: - import _thread - except ImportError: - import thread as _thread - - - def handler(dwCtrlType, hook_sigint=_thread.interrupt_main): - """SIGINT handler function.""" - if dwCtrlType == 0: # CTRL_C_EVENT - hook_sigint() - return 1 # don't chain to the next handler - return 0 - - - win32api.SetConsoleCtrlHandler(handler, 1) - - class TPOTBase(BaseEstimator): """Automatically creates and optimizes machine learning pipelines using GP.""" From a48297160dd053752d307f8cf340c4e702a0f5af Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 13:36:10 -0500 Subject: [PATCH 34/44] remove supports for scikit-learn < 0.21 --- tests/export_tests.py | 7 ++----- tpot/base.py | 7 ++----- tpot/export_utils.py | 7 ++----- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git 
a/tests/export_tests.py b/tests/export_tests.py index bb39b90f..0ffbbc27 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -645,10 +645,7 @@ def test_imputer_in_export(): import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -try: - from sklearn.impute import SimpleImputer as Imputer -except ImportError: - from sklearn.preprocessing import Imputer +from sklearn.impute import SimpleImputer # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -656,7 +653,7 @@ def test_imputer_in_export(): training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) -imputer = Imputer(strategy="median") +imputer = SimpleImputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) diff --git a/tpot/base.py b/tpot/base.py index 3627309a..6f256ff2 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -50,10 +50,7 @@ from sklearn.utils import check_X_y, check_consistent_length, check_array from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer -try: - from sklearn.impute import SimpleImputer as Imputer -except ImportError: - from sklearn.preprocessing import Imputer +from sklearn.impute import SimpleImputer from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer, _BaseScorer @@ -1125,7 +1122,7 @@ def _impute_values(self, features): print('Imputing missing values in feature set') if self._fitted_imputer is None: - self._fitted_imputer = Imputer(strategy="median") + self._fitted_imputer = SimpleImputer(strategy="median") self._fitted_imputer.fit(features) return self._fitted_imputer.transform(features) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 51a37305..a66e3da2 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -106,7 +106,7 @@ def export_pipeline(exported_pipeline, # Add the imputation step if it was used by TPOT if impute: pipeline_text += """ -imputer = Imputer(strategy="median") +imputer = SimpleImputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) @@ -217,10 +217,7 @@ def merge_imports(old_dict, new_dict): # Add the imputer if necessary if impute: - pipeline_text += """try: - from sklearn.impute import SimpleImputer as Imputer -except ImportError: - from sklearn.preprocessing import Imputer + pipeline_text += """from sklearn.impute import SimpleImputer """ if random_state is not None: pipeline_text += """from tpot.export_utils import set_param_recursive From cc475f41aff2f930ccc840b816cf139b3e759531 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 13:44:55 -0500 Subject: [PATCH 35/44] change sklearn requirement to 0.21.0 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 365b8e50..9fa9dca7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ deap>=1.2 nose==1.3.7 numpy>=1.16.3 -scikit-learn>=0.21.3 +scikit-learn>=0.21.0 scipy>=1.3.1 tqdm>=4.36.1 update-checker>=0.16 diff --git a/setup.py b/setup.py index ff075ea4..0c98684a 100644 --- a/setup.py +++ b/setup.py @@ 
-37,7 +37,7 @@ def calculate_version(): zip_safe=True, install_requires=['numpy>=1.16.3', 'scipy>=1.3.1', - 'scikit-learn>=0.21.3', + 'scikit-learn>=0.21.0', 'deap>=1.2', 'update_checker>=0.16', 'tqdm>=4.36.1', From 083adb25ea45204bac771a5eb9dfa7b4accc54cd Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 13:46:10 -0500 Subject: [PATCH 36/44] change version id to 0.11.0 --- tpot/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/_version.py b/tpot/_version.py index c56b024a..69737bab 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -23,4 +23,4 @@ """ -__version__ = '0.11' +__version__ = '0.11.0' From 318641facaae7a687cec6ed14b362b4038e07302 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 14:44:15 -0500 Subject: [PATCH 37/44] drop support for scoring functions with the signature score_func(y_true, y_pred) --- docs_sources/api.md | 4 --- docs_sources/releases.md | 56 ++++++++++++++++++++++++++++++++++++++++ docs_sources/using.md | 3 --- tests/tpot_tests.py | 42 +++++++++++++----------------- tpot/base.py | 30 +++++----------------- 5 files changed, 81 insertions(+), 54 deletions(-) diff --git a/docs_sources/api.md b/docs_sources/api.md index 473509ac..e31c2883 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -80,8 +80,6 @@ Function used to evaluate the quality of a given pipeline for the classification
If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y).
-If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred). TPOT assumes that any function with "error" or "loss" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. -
See the section on scoring functions for more details. @@ -573,8 +571,6 @@ Note that we recommend using the neg version of mean squared error and
If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y).
-If you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred). TPOT assumes that any function with "error" or "loss" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. -
See the section on scoring functions for more details. diff --git a/docs_sources/releases.md b/docs_sources/releases.md index 52c600eb..6aa77a11 100644 --- a/docs_sources/releases.md +++ b/docs_sources/releases.md @@ -1,3 +1,59 @@ +# Version 0.11.0 +- **Support for Python 3.4 and below has been officially dropped.** Support for scikit-learn 0.20 or below has also been dropped. +- Support for a metric function with the signature `score_func(y_true, y_pred)` for the `scoring` parameter has been dropped. +- Refine `StackingEstimator` so that it does not stack NaN/Infinity prediction probabilities. +- Fix a bug where the population did not persist with `warm_start=True` when `max_time_mins` was not the default value. +- The `random_state` parameter in TPOT is now used for pipeline evaluation instead of the fixed random seed of 42 used previously. The `set_param_recursive` function has been moved to export_utils.py and can be used in exported code to set `random_state` recursively in a scikit-learn Pipeline. It is used to set `random_state` in the `fitted_pipeline_` attribute and in exported pipelines. +- TPOT can now use `generations` and `max_time_mins` independently to limit the optimization process, via either parameter or both. +- The `.export()` function now returns the exported pipeline as a string if no output file name is specified. +- Add [`SGDClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) and [`SGDRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html) to the TPOT default configurations. +- Fix errors in documentation + +# Version 0.10.2 +- **TPOT v0.10.2 is the last version to support Python 2.7 and Python 3.4.** +- Minor updates to fix compatibility issues with the latest version of scikit-learn (version > 0.21) and xgboost (v0.90) +- The default value of the `template` parameter is changed to `None`. +- Fix errors in documentation + +# Version 0.10.1 + +- Add a `data_file_path` option to the `export` function for replacing `'PATH/TO/DATA/FILE'` with a customized dataset path in exported scripts. (Related issue #838) +- Change the Python version in CI tests to 3.7 +- Add CI tests for macOS. + +# Version 0.10.0 + +- Add a new `template` option to specify a desired structure for the machine learning pipeline in TPOT. Check the [TPOT API](https://epistasislab.github.io/tpot/api/) (it will be updated once it is merged into the master branch). +- Add the `FeatureSetSelector` operator to TPOT for feature selection based on *a priori* expert knowledge. Please check our [preprint paper](https://www.biorxiv.org/content/10.1101/502484v1.article-info) for more details (*Note: it was named `DatasetSelector` in the first version of the paper but will be renamed to `FeatureSetSelector` in the next version*) +- Refine the `n_jobs` parameter to accept values below -1. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. +- The `memory` parameter can now create the memory cache directory if it does not exist. +- Fix minor bugs. + +# Version 0.9.6 + +- Fix a bug that caused the `max_time_mins` parameter not to work when `use_dask=True` in TPOT 0.9.5 +- TPOT now saves the best Pareto values and the best Pareto pipelines in the checkpoint folder +- TPOT raises `ImportError` if operators in the TPOT configuration are not available when `verbosity>2` +- Thanks to @PGijsbers for the suggestions. TPOT can now save the scores of individuals already evaluated in any generation, even if the evaluation process of that generation is interrupted/stopped.
Note that, in this case, TPOT will raise this **warning message**: `WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.`, because the pipelines in an early generation (e.g. the 1st generation) have been evolved/modified only a limited number of times by the evolutionary algorithm. +- Fix bugs in the configuration of `TPOTRegressor` +- Error fixes in documentation + +# Version 0.9.5 + +- **TPOT now supports integration with Dask for parallelization + smart caching**. Big thanks to the Dask dev team for making this happen! + +- TPOT now supports imputation/sparse matrices in the `predict` and `predict_proba` functions. + +- `TPOTClassifier` and `TPOTRegressor` now follow the scikit-learn estimator API. + +- We refined the scoring parameter in the TPOT API to accept a [`Scorer` object](http://jaquesgrobler.github.io/online-sklearn-build/modules/generated/sklearn.metrics.Scorer.html). + +- We refined parameters in VarianceThreshold and FeatureAgglomeration. + +- TPOT now supports using memory caching within a Pipeline via an optional `memory` parameter. + +- We improved the documentation of TPOT. + # Version 0.9 * **TPOT now supports sparse matrices** with a new built-in TPOT configuration, "TPOT sparse". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. diff --git a/docs_sources/using.md b/docs_sources/using.md index 82ed0efc..f83a2652 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -387,9 +387,6 @@ print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') ``` -- You can pass a metric function with the signature `score_func(y_true, y_pred)` (e.g. `my_custom_accuracy` in the example above), where `y_true` are the true target values and `y_pred` are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. TPOT assumes that any function with "error" or "loss" in the function name is meant to be minimized (`greater_is_better=False` in [`make_scorer`](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html)), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. - - * **my_module.scorer_name**: You can also use a custom `score_func(y_true, y_pred)` or `scorer(estimator, X, y)` function through the command line by adding the argument `-scoring my_module.scorer` to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: `-scoring sklearn.metrics.auc` will use the function auc from sklearn.metrics module.
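The following sketch illustrates the scoring styles that remain supported after this change: a callable with the scorer(estimator, X, y) signature, or a metric wrapped with scikit-learn's make_scorer. It is an illustration only, not code from the patch; it assumes TPOT 0.11+ and scikit-learn are installed, and the scorer name is hypothetical.

```python
# Illustrative sketch only (assumes TPOT 0.11+ and scikit-learn are installed).
from sklearn.metrics import balanced_accuracy_score, make_scorer
from tpot import TPOTClassifier


def my_balanced_accuracy_scorer(estimator, X, y):
    """Hypothetical custom scorer using the scorer(estimator, X, y) signature."""
    return balanced_accuracy_score(y, estimator.predict(X))


# Either form below should be accepted by the scoring parameter:
tpot = TPOTClassifier(scoring=my_balanced_accuracy_scorer)
# tpot = TPOTClassifier(scoring=make_scorer(balanced_accuracy_score))

# Passing a bare metric such as balanced_accuracy_score directly is expected
# to raise a ValueError once fitting starts, per the change described above.
```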
diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 2f8915dc..e9be5b5e 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -175,14 +175,9 @@ def test_init_default_scoring(): def test_init_default_scoring_2(): - """Assert that TPOT intitializes with a valid customized metric function.""" - with warnings.catch_warnings(record=True) as w: - tpot_obj = TPOTClassifier(scoring=balanced_accuracy) - tpot_obj._fit_init() - assert len(w) == 1 # deap 1.2.2 warning message made this unit test failed - assert issubclass(w[-1].category, DeprecationWarning) # deap 1.2.2 warning message made this unit test failed - assert "This scoring type was deprecated" in str(w[-1].message) # deap 1.2.2 warning message made this unit test failed - assert tpot_obj.scoring_function._score_func == balanced_accuracy + """Assert that TPOT rasies ValueError with a invalid sklearn metric function.""" + tpot_obj = TPOTClassifier(scoring=balanced_accuracy) + assert_raises(ValueError, tpot_obj._fit_init) def test_init_default_scoring_3(): @@ -207,28 +202,27 @@ def my_scorer(clf, X, y): def test_init_default_scoring_5(): - """Assert that TPOT intitializes with a valid sklearn metric function roc_auc_score.""" - with warnings.catch_warnings(record=True) as w: - tpot_obj = TPOTClassifier(scoring=roc_auc_score) - tpot_obj._fit_init() - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert "This scoring type was deprecated" in str(w[-1].message) - assert tpot_obj.scoring_function._score_func == roc_auc_score + """Assert that TPOT rasies ValueError with a invalid sklearn metric function roc_auc_score.""" + tpot_obj = TPOTClassifier(scoring=roc_auc_score) + assert_raises(ValueError, tpot_obj._fit_init) def test_init_default_scoring_6(): - """Assert that TPOT intitializes with a valid customized metric function in __main__""" + """Assert that TPOT rasies ValueError with a invalid sklearn metric function from __main__.""" def my_scorer(y_true, y_pred): return roc_auc_score(y_true, y_pred) - with warnings.catch_warnings(record=True) as w: - tpot_obj = TPOTClassifier(scoring=my_scorer) - tpot_obj._fit_init() - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert "This scoring type was deprecated" in str(w[-1].message) - assert tpot_obj.scoring_function._score_func == my_scorer + tpot_obj = TPOTClassifier(scoring=my_scorer) + assert_raises(ValueError, tpot_obj._fit_init) + + +def test_init_default_scoring_7(): + """Assert that TPOT rasies ValueError with a valid sklearn metric function from __main__.""" + def my_scorer(estimator, X, y): + return make_scorer(balanced_accuracy) + + tpot_obj = TPOTClassifier(scoring=my_scorer) + tpot_obj._fit_init() def test_invalid_score_warning(): diff --git a/tpot/base.py b/tpot/base.py index 6f256ff2..b2f6b1f0 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -52,7 +52,7 @@ from sklearn.preprocessing import FunctionTransformer from sklearn.impute import SimpleImputer from sklearn.model_selection import train_test_split -from sklearn.metrics.scorer import make_scorer, _BaseScorer +from sklearn.metrics.scorer import _BaseScorer from joblib import Parallel, delayed, Memory @@ -137,13 +137,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, ['neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'] - - If you would like to use a custom scoring function, you can pass a callable - function to this parameter with the signature scorer(y_true, y_pred). 
- See the section on scoring functions in the documentation for more details. - - TPOT assumes that any custom scoring function with "error" or "loss" in the - name is meant to be minimized, whereas any other functions will be maximized. cv: int or cross-validation generator, optional (default: 5) If CV is a number, then it is the number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization @@ -308,25 +301,16 @@ def _setup_scoring_function(self, scoring): elif callable(scoring): # Heuristic to ensure user has not passed a metric module = getattr(scoring, '__module__', None) - if sys.version_info[0] < 3: - if inspect.isfunction(scoring): - args_list = inspect.getargspec(scoring)[0] - else: - args_list = inspect.getargspec(scoring.__call__)[0] - else: - args_list = inspect.getfullargspec(scoring)[0] + args_list = inspect.getfullargspec(scoring)[0] if args_list == ["y_true", "y_pred"] or (hasattr(module, 'startswith') and \ (module.startswith('sklearn.metrics.') or module.startswith('tpot.metrics')) and \ not module.startswith('sklearn.metrics.scorer') and \ not module.startswith('sklearn.metrics.tests.')): - scoring_name = scoring.__name__ - greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name - self.scoring_function = make_scorer(scoring, greater_is_better=greater_is_better) - warnings.simplefilter('always', DeprecationWarning) - warnings.warn('Scoring function {} looks like it is a metric function ' - 'rather than a scikit-learn scorer. This scoring type was deprecated ' - 'in version TPOT 0.9.1 and will be removed in version 0.11. ' - 'Please update your custom scoring function.'.format(scoring), DeprecationWarning) + raise ValueError( + 'Scoring function {} looks like it is a metric function ' + 'rather than a scikit-learn scorer. This scoring type was removed in version 0.11. 
' + 'Please update your custom scoring function.'.format(scoring) + ) else: self.scoring_function = scoring From b119d82e1147f4469be4e26a4f68130f5ee487ec Mon Sep 17 00:00:00 2001 From: weixuanfuDate: Tue, 5 Nov 2019 14:49:36 -0500 Subject: [PATCH 38/44] fix a bug in eaMuPlusLambda #946 --- tpot/gp_deap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 71566ce0..a68c95e1 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -224,7 +224,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, for ind in population: initialize_stats_dict(ind) - population = toolbox.evaluate(population) + population[:] = toolbox.evaluate(population) record = stats.compile(population) if stats is not None else {} logbook.record(gen=0, nevals=len(population), **record) From d0e87cc1e1ed084b3181b0d06871a118c6fdcced Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 15:05:00 -0500 Subject: [PATCH 39/44] refine self._pop #946 --- tpot/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index b2f6b1f0..d32d19cb 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -656,10 +656,8 @@ def fit(self, features, target, sample_weight=None, groups=None): self._toolbox.register('evaluate', self._evaluate_individuals, features=features, target=target, sample_weight=sample_weight, groups=groups) # assign population, self._pop can only be not None if warm_start is enabled - if self._pop: - pop = self._pop - else: - pop = self._toolbox.population(n=self.population_size) + if not self._pop: + self._pop = self._toolbox.population(n=self.population_size) def pareto_eq(ind1, ind2): """Determine whether two individuals are equal on the Pareto front. 
@@ -704,7 +702,7 @@ def pareto_eq(ind1, ind2): self._setup_memory() warnings.simplefilter('ignore') pop, _ = eaMuPlusLambda( - population=pop, + population=self._pop, toolbox=self._toolbox, mu=self.population_size, lambda_=self._lambda, From 96d6a61dc0123f98e85512aed714ffa2d50db993 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 5 Nov 2019 15:19:14 -0500 Subject: [PATCH 40/44] refine documentation --- docs/404.html | 24 +- docs/api/index.html | 38 +- docs/citing/index.html | 18 +- docs/contributing/index.html | 18 +- docs/css/highlight.css | 124 + docs/css/theme_extra.css | 5 +- docs/examples/index.html | 106 +- docs/index.html | 28 +- docs/installing/index.html | 26 +- docs/js/highlight.pack.js | 2 + docs/js/theme.js | 36 +- docs/related/index.html | 18 +- docs/releases/index.html | 114 +- docs/search.html | 44 +- docs/search/lunr.js | 2986 ------------------ docs/search/lunr.min.js | 7 + docs/search/main.js | 96 - docs/search/mustache.min.js | 1 + docs/search/require.js | 36 + docs/search/search-results-template.mustache | 4 + docs/search/search.js | 92 + docs/search/search_index.json | 250 +- docs/search/text.js | 390 +++ docs/search/worker.js | 128 - docs/sitemap.xml | 51 +- docs/sitemap.xml.gz | Bin 272 -> 0 bytes docs/support/index.html | 18 +- docs/using/index.html | 43 +- docs_sources/releases.md | 6 +- 29 files changed, 1259 insertions(+), 3450 deletions(-) create mode 100644 docs/css/highlight.css create mode 100644 docs/js/highlight.pack.js delete mode 100644 docs/search/lunr.js create mode 100644 docs/search/lunr.min.js delete mode 100644 docs/search/main.js create mode 100644 docs/search/mustache.min.js create mode 100644 docs/search/require.js create mode 100644 docs/search/search-results-template.mustache create mode 100644 docs/search/search.js create mode 100644 docs/search/text.js delete mode 100644 docs/search/worker.js delete mode 100644 docs/sitemap.xml.gz diff --git a/docs/404.html b/docs/404.html index c277b485..009af1cd 100644 --- a/docs/404.html +++ b/docs/404.html @@ -13,12 +13,11 @@ - + - - - - + + + @@ -29,10 +28,10 @@