diff --git a/.gitignore b/.gitignore index 0af97b41..332f1f28 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,5 @@ docs/sources/examples/.Rhistory .idea analyze-oj2-tpot-mdr.ipynb + +tpot-mdr-demo.ipynb diff --git a/README.md b/README.md index 684d4023..1d85e5e4 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Click on the corresponding links to find more information on TPOT usage in the d Below is a minimal working example with the practice MNIST data set. ```python -from tpot import TPOT +from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.cross_validation import train_test_split @@ -61,7 +61,7 @@ digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) -tpot = TPOT(generations=5, population_size=20, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') diff --git a/ci/.travis_install.sh b/ci/.travis_install.sh index b4b931f1..267d8e60 100755 --- a/ci/.travis_install.sh +++ b/ci/.travis_install.sh @@ -38,15 +38,17 @@ else conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ scikit-learn=$SKLEARN_VERSION \ - cython + cython fi source activate testenv if [[ "$LATEST" == "true" ]]; then pip install deap + pip install xgboost else pip install deap==$DEAP_VERSION + pip install xgboost==$XGBOOST_VERSION fi pip install update_checker @@ -62,6 +64,7 @@ python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" python -c "import deap; print('deap %s' % deap.__version__)" +python -c "import xgboost; print('xgboost %s ' % xgboost.__version__)" python -c "import update_checker; print('update_checker %s' % update_checker.__version__)" 
python -c "import tqdm; print('tqdm %s' % tqdm.__version__)" python setup.py build_ext --inplace diff --git a/ci/.travis_test.sh b/ci/.travis_test.sh index 748513e4..77b3733b 100755 --- a/ci/.travis_test.sh +++ b/ci/.travis_test.sh @@ -14,6 +14,7 @@ python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" python -c "import deap; print('deap %s' % deap.__version__)" +python -c "import xgboost; print('xgboost %s ' % xgboost.__version__)" python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)" python -c "import tqdm; print('tqdm %s' % tqdm.__version__)" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index ed55c4c8..1374ec57 100755 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -23,5 +23,6 @@ pages: - examples/IRIS_Example.md - examples/Titanic_Kaggle_Example.md - Contributing: contributing.md +- Release Notes: releases.md - Citing: citing.md - Support: support.md diff --git a/docs/sources/contributing.md b/docs/sources/contributing.md index f358035a..604d2333 100644 --- a/docs/sources/contributing.md +++ b/docs/sources/contributing.md @@ -1,5 +1,19 @@ We welcome you to [check the existing issues](https://github.com/rhiever/tpot/issues/) for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please [file a new issue](https://github.com/rhiever/tpot/issues/new) so we can discuss it. +## Project layout + +The latest stable release of TPOT is on the [master branch](https://github.com/rhiever/tpot/tree/master), whereas the latest version of TPOT in development is on the [development branch](https://github.com/rhiever/tpot/tree/development). Make sure you are looking at and working on the correct branch if you're looking to contribute code. 
+ +In terms of directory structure: + +* All of TPOT's code sources are in the `tpot` directory +* The documentation sources are in the `docs` directory +* Images in the documentation are in the `images` directory +* Tutorials for TPOT are in the `tutorials` directory +* Unit tests for TPOT are in the `tests.py` file + +Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the `development` branch. + ## How to contribute The preferred way to contribute to TPOT is to fork the @@ -27,9 +41,9 @@ GitHub: 6. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: - $ python -m tpot.tpot + $ python -m tpot.driver - or by running script that imports and uses the TPOT module with code similar to `from tpot import TPOT` + or by running a script that imports and uses the TPOT module with code similar to `from tpot import TPOTClassifier` 7. To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the `nose` package installed within your dev environment for this to work): diff --git a/docs/sources/examples/IRIS_Example.md b/docs/sources/examples/IRIS_Example.md index 2494e5a4..2178ec3d 100644 --- a/docs/sources/examples/IRIS_Example.md +++ b/docs/sources/examples/IRIS_Example.md @@ -1,7 +1,7 @@ The following code illustrates the usage of TPOT with the IRIS data set.
```python -from tpot import TPOT +from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.cross_validation import train_test_split import numpy as np @@ -10,7 +10,7 @@ iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25) -tpot = TPOT(generations=5, population_size=20, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') @@ -44,5 +44,4 @@ exported_pipeline = make_pipeline( exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) - ``` diff --git a/docs/sources/examples/MNIST_Example.md b/docs/sources/examples/MNIST_Example.md index 0c58de31..368a4b94 100644 --- a/docs/sources/examples/MNIST_Example.md +++ b/docs/sources/examples/MNIST_Example.md @@ -1,7 +1,7 @@ Below is a minimal working example with the practice MNIST data set. ```python -from tpot import TPOT +from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.cross_validation import train_test_split @@ -9,7 +9,7 @@ digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) -tpot = TPOT(generations=5, population_size=20, verbosity=2) +tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') diff --git a/docs/sources/installing.md b/docs/sources/installing.md index 50d5be51..eebe4396 100644 --- a/docs/sources/installing.md +++ b/docs/sources/installing.md @@ -26,6 +26,14 @@ DEAP, update_checker, and tqdm (used for verbose TPOT runs) can be installed wit pip install deap update_checker tqdm ``` +Optionally, install XGBoost if you would like TPOT to use XGBoost. 
XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. + +```Shell +pip install xgboost +``` + +If you have issues installing XGBoost, check the [XGBoost installation documentation](http://xgboost.readthedocs.io/en/latest/build.html). + Finally to install TPOT itself, run the following command: ```Shell diff --git a/docs/sources/releases.md b/docs/sources/releases.md new file mode 100644 index 00000000..73006e5c --- /dev/null +++ b/docs/sources/releases.md @@ -0,0 +1,86 @@ +# Version 0.6 + +* **TPOT now supports regression problems!** We have created two separate `TPOTClassifier` and `TPOTRegressor` classes to support classification and regression problems, respectively. The [command-line interface](/using/#tpot-on-the-command-line) also supports this feature through the `-mode` parameter. + +* TPOT now allows you to **specify a time limit** for the optimization process with the `max_time_mins` parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. + +* Added a new operator that performs feature selection using [ExtraTrees](http://scikit-learn.org/stable/modules/ensemble.html#extremely-randomized-trees) feature importance scores. + +* **[XGBoost](https://github.com/dmlc/xgboost) has been added as an optional dependency to TPOT.** If you have XGBoost installed, TPOT will automatically detect your installation and use the `XGBoostClassifier` and `XGBoostRegressor` in its pipelines. + +* TPOT now offers a verbosity level of 3 ("science mode"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. + +# Version 0.5 + +* Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! +* TPOT now **exports directly to scikit-learn Pipelines** instead of hacky code. 
+* Internal representation of individuals now uses scikit-learn pipelines. +* Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. +* We have removed pandas as a dependency and instead use numpy matrices to store the data. +* TPOT now uses **k-fold cross-validation** when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. +* Improved **scoring function support**: Even though TPOT uses balanced accuracy by default, you can now have TPOT use [any of the scoring functions](http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values) that `cross_val_score` supports. +* Added the scikit-learn [Normalizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html) preprocessor. +* [Minor text fixes.](http://knowyourmeme.com/memes/pokemon-go-updates-controversy) + +# Version 0.4 + +In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. + + + +# Version 0.3 + +* We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. + +# Version 0.2 + +* TPOT now has the ability to export the optimized pipelines to sklearn code. + +* Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. + +* TPOT can now use arbitrary scoring functions for the optimization process. + +* TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. + +# Version 0.1 + +* First public release of TPOT. + +* Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors. 
diff --git a/docs/sources/using.md b/docs/sources/using.md index 3b8f4253..e498681b 100644 --- a/docs/sources/using.md +++ b/docs/sources/using.md @@ -22,6 +22,18 @@ TPOT offers several arguments that can be provided at the command line: Character used to separate columns in the input file. +-target +TARGET_NAME +Any string +Name of the target column in the input file. + + +-mode +TPOT_MODE +['classification', 'regression'] +Whether TPOT is being used for a classification or regression problem. + + -o OUTPUT_FILE String path to a file @@ -54,14 +66,20 @@ TPOT offers several arguments that can be provided at the command line: -cv NUM_CV_FOLDS -[2, 10] +Any integer >1 The number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT pipeline optimization process. -scoring SCORING_FN -"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", "f1_micro", "f1_samples", "f1_weighted", "precision", "precision_macro", "precision_micro", "precision_samples", "precision_weighted", "recall", "recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc" -Function used to evaluate the goodness of a given pipeline for the classification problem. By default, balanced class accuracy is used. TPOT assumes that this scoring function should be maximized, i.e., higher is better. +'accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' +Function used to evaluate the quality of a given pipeline for the problem. By default, balanced accuracy is used for classification and mean squared error is used for regression.
TPOT assumes that any function with "error" or "loss" in the name is meant to be minimized, whereas any other functions will be maximized. See the section on scoring functions for more details. + + +-maxtime +MAX_TIME_MINS +Any positive integer +How many minutes TPOT has to optimize the pipeline. This setting will override the GENERATIONS parameter and allow TPOT to run until it runs out of time. -s @@ -72,8 +90,8 @@ TPOT offers several arguments that can be provided at the command line: -v VERBOSITY -{0,1,2} -How much information TPOT communicates while it is running: 0 = none, 1 = minimal, 2 = all. A setting of 2 will add a progress bar during the optimization procedure. +{0, 1, 2, 3} +How much information TPOT communicates while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check @@ -95,7 +113,7 @@ TPOT offers several arguments that can be provided at the command line: An example command-line call to TPOT may look like: ```Shell -tpot data/mnist.csv -is , -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 +tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 ``` # TPOT with code @@ -105,17 +123,19 @@ We've taken care to design the TPOT interface to be as similar as possible to sc TPOT can be imported just like any regular Python module. To import TPOT, type: ```Python -from tpot import TPOT +from tpot import TPOTClassifier ``` then create an instance of TPOT as follows: ```Python -from tpot import TPOT +from tpot import TPOTClassifier -pipeline_optimizer = TPOT() +pipeline_optimizer = TPOTClassifier() ``` +It's also possible to use TPOT for regression problems with the `TPOTRegressor` class. Other than the class name, a `TPOTRegressor` is used the same way as a `TPOTClassifier`.
+ Note that you can pass several parameters to the TPOT instantiation call: @@ -150,9 +170,14 @@ Note that you can pass several parameters to the TPOT instantiation call: - - - + + + + + + + + @@ -161,8 +186,8 @@ Note that you can pass several parameters to the TPOT instantiation call: - - + + @@ -174,17 +199,17 @@ Note that you can pass several parameters to the TPOT instantiation call: Some example code with custom TPOT parameters might look like: ```Python -from tpot import TPOT +from tpot import TPOTClassifier -pipeline_optimizer = TPOT(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) +pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) ``` Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the `fit` function: ```Python -from tpot import TPOT +from tpot import TPOTClassifier -pipeline_optimizer = TPOT(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) +pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(training_features, training_classes) ``` @@ -193,9 +218,9 @@ The `fit()` function takes in a training data set and uses k-fold cross-validati You can then proceed to evaluate the final pipeline on the testing set with the `score()` function: ```Python -from tpot import TPOT +from tpot import TPOTClassifier -pipeline_optimizer = TPOT(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) +pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(training_features, training_classes) print(pipeline_optimizer.score(testing_features, testing_classes)) ``` @@ -203,9 +228,9 @@ print(pipeline_optimizer.score(testing_features, testing_classes)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text 
file with the `export()` function: ```Python -from tpot import TPOT +from tpot import TPOTClassifier -pipeline_optimizer = TPOT(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) +pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(training_features, training_classes) print(pipeline_optimizer.score(testing_features, testing_classes)) pipeline_optimizer.export('tpot_exported_pipeline.py') @@ -214,3 +239,17 @@ pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, `tpot_exported_pipeline.py` will contain the Python code for the optimized pipeline. Check our [examples](examples/MNIST_Example/) to see TPOT applied to some specific data sets. + + +## Scoring functions + +TPOT makes use of `sklearn.cross_validation.cross_val_score`, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: + +1. You can pass in a string from the list described in the table above. Any other strings will cause internal issues that may break your code down the line. + +2. You can pass in a function with the signature `scorer(y_true, y_pred)`, where `y_true` are the true target values and `y_pred` are the predicted target values from an estimator. To do this, you should implement your own function. See the example below for further explanation. 
+ +```Python +def accuracy(y_true, y_pred): + return float(sum(y_pred == y_true)) / len(y_true) +``` diff --git a/setup.py b/setup.py index 88f0fee6..1bf14272 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def calculate_version(): packages=find_packages(), url='https://github.com/rhiever/tpot', license='GNU/GPLv3', - entry_points={'console_scripts': ['tpot=tpot:main', ]}, + entry_points={'console_scripts': ['tpot=tpot.driver:main', ]},  # target must be package-qualified: main() lives in tpot/driver.py description=('Tree-based Pipeline Optimization Tool'), long_description=''' A Python tool that automatically creates and optimizes machine learning pipelines using genetic programming. @@ -35,6 +35,7 @@ def calculate_version(): ''', zip_safe=True, install_requires=['numpy', 'scipy', 'scikit-learn', 'deap', 'update_checker', 'tqdm'], + extras_require={'xgboost': ['xgboost']}, classifiers=[ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', diff --git a/tests.py b/tests.py index 6ea2ee40..b4a2b7e9 100644 --- a/tests.py +++ b/tests.py @@ -4,8 +4,9 @@ TPOT Unit Tests """ -from tpot import TPOT -from tpot.tpot import positive_integer, float_range +from tpot import TPOTClassifier, TPOTRegressor +from tpot.base import TPOTBase +from tpot.driver import positive_integer, float_range from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code from tpot.decorators import _gp_new_generation from tpot.gp_types import Output_DF @@ -16,8 +17,9 @@ import numpy as np import inspect import random +from datetime import datetime -from sklearn.datasets import load_digits +from sklearn.datasets import load_digits, load_boston from sklearn.cross_validation import train_test_split from deap import creator @@ -28,53 +30,92 @@ training_features, testing_features, training_classes, testing_classes = \ train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) +# Set up the Boston data set for testing
+boston_data = load_boston() +training_features_r, testing_features_r, training_classes_r, testing_classes_r = \ + train_test_split(boston_data.data, boston_data.target, random_state=42) + np.random.seed(42) random.seed(42) -def test_init(): +def test_init_custom_parameters(): """Assert that the TPOT instantiator stores the TPOT variables properly""" - def dummy_scoring_func(foo, bar): - return - - tpot_obj = TPOT(population_size=500, generations=1000, scoring_function=dummy_scoring_func, - mutation_rate=0.05, crossover_rate=0.9, verbosity=1, random_state=42, + tpot_obj = TPOTClassifier(population_size=500, generations=1000, + mutation_rate=0.05, crossover_rate=0.9, + scoring='accuracy', num_cv_folds=10, + verbosity=1, random_state=42, disable_update_check=True) assert tpot_obj.population_size == 500 assert tpot_obj.generations == 1000 assert tpot_obj.mutation_rate == 0.05 assert tpot_obj.crossover_rate == 0.9 + assert tpot_obj.scoring_function == 'accuracy' + assert tpot_obj.num_cv_folds == 10 + assert tpot_obj.max_time_mins is None assert tpot_obj.verbosity == 1 assert tpot_obj._optimized_pipeline is None assert tpot_obj._fitted_pipeline is None - assert tpot_obj.scoring_function == dummy_scoring_func - assert tpot_obj._pset + assert not (tpot_obj._pset is None) + assert not (tpot_obj._toolbox is None) + + +def test_init_default_scoring(): + """Assert that TPOT initializes with the correct default scoring function""" + + tpot_obj = TPOTRegressor() + assert tpot_obj.scoring_function == 'mean_squared_error' + + +def test_init_max_time_mins(): + """Assert that the TPOT init stores max run time and sets generations to 1000000""" + + tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000) + + assert tpot_obj.generations == 1000000 + assert tpot_obj.max_time_mins == 30 def test_get_params(): """Assert that get_params returns the exact dictionary of parameters used by TPOT""" + kwargs = { 'population_size': 500, 'generations': 1000, 'verbosity': 1 } - tpot_obj =
TPOT(**kwargs) + tpot_obj = TPOTClassifier(**kwargs) # Get default parameters of TPOT and merge with our specified parameters - initializer = inspect.getargspec(TPOT.__init__) + initializer = inspect.getargspec(TPOTBase.__init__) default_kwargs = dict(zip(initializer.args[1:], initializer.defaults)) default_kwargs.update(kwargs) assert tpot_obj.get_params() == default_kwargs +def test_set_params(): + """Assert that set_params returns a reference to the TPOT instance""" + + tpot_obj = TPOTClassifier() + assert tpot_obj.set_params() is tpot_obj + + +def test_set_params_2(): + """Assert that set_params updates TPOT's instance variables""" + tpot_obj = TPOTClassifier(generations=2) + tpot_obj.set_params(generations=3) + + assert tpot_obj.generations == 3 + + def test_score(): """Assert that the TPOT score function raises a ValueError when no optimized pipeline exists""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() try: tpot_obj.score(testing_features, testing_classes) @@ -84,10 +125,10 @@ def test_score(): def test_score_2(): - """Assert that the TPOT score function outputs a known score for a fixed pipeline""" + """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline""" - tpot_obj = TPOT() - tpot_obj.pbar = tqdm(total=1, disable=True) + tpot_obj = TPOTClassifier() + tpot_obj._pbar = tqdm(total=1, disable=True) known_score = 0.986318199045 # Assumes use of the TPOT balanced_accuracy function # Reify pipeline with known score @@ -106,10 +147,32 @@ def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): assert isclose(known_score, score) +def test_score_3(): + """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline""" + + tpot_obj = TPOTRegressor(scoring='mean_squared_error') + tpot_obj._pbar = tqdm(total=1, disable=True) + known_score = 8.9673743407873712 # Assumes use of mse + # Reify pipeline with known score + tpot_obj._optimized_pipeline = creator.Individual.\ + 
from_string('ExtraTreesRegressor(GradientBoostingRegressor(input_matrix, 100.0, 0.11), 0.17999999999999999)', tpot_obj._pset) + tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r) + + # Get score from TPOT + score = tpot_obj.score(testing_features_r, testing_classes_r) + + # http://stackoverflow.com/questions/5595425/ + def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + assert isclose(known_score, score) + + def test_predict(): """Assert that the TPOT predict function raises a ValueError when no optimized pipeline exists""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() try: tpot_obj.predict(testing_features) @@ -121,7 +184,7 @@ def test_predict(): def test_predict_2(): """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() tpot_obj._optimized_pipeline = creator.Individual.\ from_string('DecisionTreeClassifier(input_matrix)', tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) @@ -134,19 +197,20 @@ def test_predict_2(): def test_fit(): """Assert that the TPOT fit function provides an optimized pipeline""" - tpot_obj = TPOT(random_state=42, population_size=1, generations=1, verbosity=0) + tpot_obj = TPOTClassifier(random_state=42, population_size=1, generations=1, verbosity=0) tpot_obj.fit(training_features, training_classes) assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert tpot_obj.gp_generation == 0 + assert tpot_obj._gp_generation == 0 + assert not (tpot_obj._start_datetime is None) def test_gp_new_generation(): """Assert that the gp_generation count gets incremented when _gp_new_generation is called""" - tpot_obj = TPOT() - tpot_obj.pbar = tqdm(total=1, disable=True) + tpot_obj = TPOTClassifier() + tpot_obj._pbar = 
tqdm(total=1, disable=True) - assert(tpot_obj.gp_generation == 0) + assert tpot_obj._gp_generation == 0 # Since _gp_new_generation is a decorator, and we dont want to run a full # fit(), decorate a dummy function and then call the dummy function. @@ -156,12 +220,12 @@ def dummy_function(self, foo): dummy_function(tpot_obj, None) - assert(tpot_obj.gp_generation == 1) + assert tpot_obj._gp_generation == 1 def check_export(op): """Assert that a TPOT operator exports as expected""" - tpot_obj = TPOT(random_state=42) + tpot_obj = TPOTClassifier(random_state=42) prng = np.random.RandomState(42) np.random.seed(42) @@ -185,7 +249,7 @@ def test_operators(): def test_export(): """Assert that TPOT's export function throws a ValueError when no optimized pipeline exists""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() try: tpot_obj.export("test_export.py") @@ -201,8 +265,7 @@ def test_generate_pipeline_code(): ['GradientBoostingClassifier', 'input_matrix', 38.0, - 0.87, - 0.5], + 0.87], ['GaussianNB', ['ZeroCount', 'input_matrix']]], @@ -211,10 +274,10 @@ def test_generate_pipeline_code(): expected_code = """make_pipeline( make_union( - make_union(VotingClassifier(estimators=[('branch', - GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500) + make_union(VotingClassifier([('branch', + GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500) )]), FunctionTransformer(lambda X: X)), - make_union(VotingClassifier(estimators=[('branch', + make_union(VotingClassifier([('branch', make_pipeline( ZeroCount(), GaussianNB() @@ -229,7 +292,7 @@ def test_generate_pipeline_code(): def test_generate_import_code(): """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() pipeline = creator.Individual.\ from_string('DecisionTreeClassifier(SelectKBest(input_matrix, 7), 0.5)', tpot_obj._pset) @@ -254,9 +317,9 @@ def 
test_generate_import_code(): def test_export_pipeline(): """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() pipeline = creator.Individual.\ - from_string("KNeighborsClassifier(CombineDFs(GradientBoostingClassifier(input_matrix, 38.0, 0.87, 0.5), RFE(input_matrix, 0.17999999999999999)), 18, 33)", tpot_obj._pset) + from_string("KNeighborsClassifier(CombineDFs(GradientBoostingClassifier(input_matrix, 38.0, 0.87), RFE(input_matrix, 0.17999999999999999)), 18, 33)", tpot_obj._pset) expected_code = """import numpy as np @@ -276,8 +339,8 @@ def test_export_pipeline(): exported_pipeline = make_pipeline( make_union( - make_union(VotingClassifier(estimators=[('branch', - GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500) + make_union(VotingClassifier([('branch', + GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500) )]), FunctionTransformer(lambda X: X)), RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='linear', @@ -328,7 +391,7 @@ def test_get_by_name(): def test_gen(): """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure""" - tpot_obj = TPOT() + tpot_obj = TPOTClassifier() pipeline = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3) diff --git a/tpot/__init__.py b/tpot/__init__.py index 8246c6e3..86bcc7df 100644 --- a/tpot/__init__.py +++ b/tpot/__init__.py @@ -19,4 +19,5 @@ """ from ._version import __version__ -from .tpot import TPOT, main +from .tpot import TPOTClassifier, TPOTRegressor +from .driver import main diff --git a/tpot/_version.py b/tpot/_version.py index 858d01cd..fb2d11c3 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -18,4 +18,4 @@ """ -__version__ = '0.5.2' +__version__ = '0.6.0' diff --git a/tpot/base.py b/tpot/base.py new file mode 100644 index 
00000000..a36f05df --- /dev/null +++ b/tpot/base.py @@ -0,0 +1,701 @@ +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Randal S. Olson + +This file is part of the TPOT library. + +The TPOT library is free software: you can redistribute it and/or +modify it under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your option) +any later version. + +The TPOT library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along +with the TPOT library. If not, see http://www.gnu.org/licenses/. + +""" + +from __future__ import print_function +import random +import inspect +import warnings +import sys +from functools import partial +from datetime import datetime + +import numpy as np +import deap +from deap import algorithms, base, creator, tools, gp +from tqdm import tqdm + +from sklearn.base import BaseEstimator +from sklearn.cross_validation import cross_val_score +from sklearn.pipeline import make_pipeline, make_union +from sklearn.preprocessing import FunctionTransformer +from sklearn.ensemble import VotingClassifier +from sklearn.metrics.scorer import make_scorer + +from update_checker import update_check + +from ._version import __version__ +from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code +from .decorators import _gp_new_generation +from . 
def __init__(self, population_size=100, generations=100,
             mutation_rate=0.9, crossover_rate=0.05,
             scoring=None, num_cv_folds=3, max_time_mins=None,
             random_state=None, verbosity=0,
             disable_update_check=False):
    """Set up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    population_size: int (default: 100)
        The number of pipelines in the genetic algorithm population. Must
        be > 0. The more pipelines in the population, the slower TPOT will
        run, but it's also more likely to find better pipelines.
    generations: int (default: 100)
        The number of generations to run pipeline optimization for. The
        more generations you give TPOT to run, the longer it takes, but
        it's also more likely to find better pipelines. Replaced by
        1000000 when `max_time_mins` is given.
    mutation_rate: float (default: 0.9)
        The mutation rate for the genetic programming algorithm in the range
        [0.0, 1.0]. This tells the genetic programming algorithm how many
        pipelines to apply random changes to every generation. We don't
        recommend that you tweak this parameter unless you know what you're
        doing.
    crossover_rate: float (default: 0.05)
        The crossover rate for the genetic programming algorithm in the
        range [0.0, 1.0]. This tells the genetic programming algorithm how
        many pipelines to "breed" every generation. We don't recommend that
        you tweak this parameter unless you know what you're doing.
    scoring: function or str (default: None)
        Function used to evaluate the quality of a given pipeline for the
        problem.

        A callable is wrapped with `make_scorer` and registered in
        `SCORERS` under its `__name__`; if that name contains 'loss' or
        'error' the scorer is treated as lower-is-better, otherwise as
        higher-is-better. A string is stored unchanged and must be one of
        the names understood by
        sklearn.cross_validation.cross_val_score:

        ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1',
        'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
        'precision', 'precision_macro', 'precision_micro', 'precision_samples',
        'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
        'recall_samples', 'recall_weighted', 'roc_auc']

        When omitted, this method does not assign `scoring_function`.
        NOTE(review): presumably TPOTClassifier / TPOTRegressor supply the
        default (balanced accuracy / mean squared error) -- confirm.
    num_cv_folds: int (default: 3)
        The number of folds to evaluate each pipeline over in k-fold
        cross-validation during the TPOT pipeline optimization process
    max_time_mins: int (default: None)
        How many minutes TPOT has to optimize the pipeline. If not None,
        this setting will override the `generations` parameter.
    random_state: int (default: None)
        The random number generator seed for TPOT. Use this to make sure
        that TPOT will give you the same results each time you run it
        against the same data set with that seed.
    verbosity: int (default: 0)
        How much information TPOT communicates while it's running.
        0 = none, 1 = minimal, 2 = high, 3 = all
    disable_update_check: bool (default: False)
        Flag indicating whether the TPOT version checker should be disabled.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If TPOTBase itself (rather than a subclass) is instantiated.

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                           'use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._hof = None
    self._optimized_pipeline = None
    self._fitted_pipeline = None
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins

    # Schedule TPOT to run for a very long time if the user specifies a
    # run-time limit; TPOT interrupts itself when the timer runs out
    if max_time_mins is not None:
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.verbosity = verbosity

    # Names made available to eval() when a pipeline string is compiled
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'VotingClassifier': VotingClassifier,
        'FunctionTransformer': FunctionTransformer
    }

    self._pbar = None
    self._gp_generation = 0

    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the
    # sklearn SCORERS dictionary so it can be referenced by name
    if scoring:
        if callable(scoring):
            scoring_name = scoring.__name__

            # Losses and errors are minimized; everything else is maximized
            greater_is_better = not ('loss' in scoring_name or 'error' in scoring_name)

            SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            self.scoring_function = scoring

    self.num_cv_folds = num_cv_folds

    self._setup_pset()
    self._setup_toolbox()
if self._ignore_operator(op): + continue + + if op.root: + # We need to add rooted primitives twice so that they can + # return both an Output_DF (and thus be the root of the tree), + # and return a np.ndarray so they can exist elsewhere in the + # tree. + p_types = (op.parameter_types()[0], Output_DF) + self._pset.addPrimitive(op, *p_types) + + self._pset.addPrimitive(op, *op.parameter_types()) + + # Import required modules into local namespace so that pipelines + # may be evaluated directly + for key in sorted(op.import_hash.keys()): + module_list = ', '.join(sorted(op.import_hash[key])) + + if key.startswith('tpot.'): + exec('from {} import {}'.format(key[4:], module_list)) + else: + exec('from {} import {}'.format(key, module_list)) + + for var in op.import_hash[key]: + self.operators_context[var] = eval(var) + + self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) + + # Terminals + int_terminals = np.concatenate(( + np.arange(0, 51, 1), + np.arange(60, 110, 10)) + ) + + for val in int_terminals: + self._pset.addTerminal(val, int) + + float_terminals = np.concatenate(( + [1e-6, 1e-5, 1e-4, 1e-3], + np.arange(0., 1.01, 0.01), + np.arange(2., 51., 1.), + np.arange(60., 101., 10.)) + ) + + for val in float_terminals: + self._pset.addTerminal(val, float) + + self._pset.addTerminal(True, Bool) + self._pset.addTerminal(False, Bool) + + def _setup_toolbox(self): + creator.create('FitnessMulti', base.Fitness, weights=(-1.0, 1.0)) + creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti) + + self._toolbox = base.Toolbox() + self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=1, max_=3) + self._toolbox.register('individual', tools.initIterate, creator.Individual, self._toolbox.expr) + self._toolbox.register('population', tools.initRepeat, list, self._toolbox.individual) + self._toolbox.register('compile', self._compile_to_sklearn) + self._toolbox.register('select', self._combined_selection_operator) + 
def fit(self, features, classes):
    """Fit an optimized machine learning pipeline to the provided data.

    Runs the genetic programming search (`algorithms.eaSimple`) over the
    population, keeps the Pareto front of evaluated pipelines in
    `self._hof`, selects the front member with the best score, compiles it
    and fits it on the full `features` / `classes`.

    A KeyboardInterrupt or SystemExit raised during the search (including
    the one raised when `max_time_mins` elapses) stops the search early;
    the best pipeline found so far is then used.

    Parameters
    ----------
    features: array-like {n_samples, n_features}
        Feature matrix; must provide `astype` (it is cast to float64)
    classes: array-like {n_samples}
        List of class labels for prediction

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the Pareto front is non-empty but contains no pipeline with a
        score above the crash sentinel of -5000.

    """
    features = features.astype(np.float64)

    # Set the seed for the GP run. Compare against None so that a seed of
    # 0 is honoured instead of being treated as "no seed".
    if self.random_state is not None:
        random.seed(self.random_state)
        np.random.seed(self.random_state)

    # Reference point for the max_time_mins budget check
    self._start_datetime = datetime.now()

    self._toolbox.register('evaluate', self._evaluate_individual, features=features, classes=classes)
    pop = self._toolbox.population(n=self.population_size)

    def pareto_eq(ind1, ind2):
        """Determine whether two individuals are equal on the Pareto front

        Parameters
        ----------
        ind1: DEAP individual from the GP population
            First individual to compare
        ind2: DEAP individual from the GP population
            Second individual to compare

        Returns
        ----------
        individuals_equal: bool
            Boolean indicating whether the two individuals are equal on
            the Pareto front

        """
        return np.all(ind1.fitness.values == ind2.fitness.values)

    self._hof = tools.ParetoFront(similar=pareto_eq)

    # Start the progress bar. With a time budget the total is only known
    # one generation at a time, so start with a single population.
    if self.max_time_mins:
        total_evals = self.population_size
    else:
        total_evals = self.population_size * (self.generations + 1)

    self._pbar = tqdm(total=total_evals, unit='pipeline', leave=False,
                      disable=not (self.verbosity >= 2), desc='GP Progress')

    try:
        pop, _ = algorithms.eaSimple(
            population=pop, toolbox=self._toolbox, cxpb=self.crossover_rate,
            mutpb=self.mutation_rate, ngen=self.generations,
            halloffame=self._hof, verbose=False)

    # Allow for certain exceptions to signal a premature fit() cancellation
    except (KeyboardInterrupt, SystemExit):
        if self.verbosity > 0:
            print('GP closed prematurely - will use current best pipeline')
    finally:
        # NOTE(review): the nesting of everything below under `finally`
        # was reconstructed; confirm against the original layout.

        # Close the progress bar
        # Standard truthiness checks won't work for tqdm
        if not isinstance(self._pbar, type(None)):
            self._pbar.close()

        # Reset gp_generation counter to restore initial state
        self._gp_generation = 0

        # Store the pipeline with the highest internal testing score
        if self._hof:
            # -5000. is the score assigned to pipelines that crashed
            top_score = -5000.

            for pipeline, pipeline_scores in zip(self._hof.items, reversed(self._hof.keys)):
                if pipeline_scores.wvalues[1] > top_score:
                    self._optimized_pipeline = pipeline
                    # Raise the bar so that only a strictly better
                    # pipeline can replace the current choice
                    top_score = pipeline_scores.wvalues[1]

            if not self._optimized_pipeline:
                raise ValueError('There was an error in the TPOT optimization '
                                 'process. This could be because the data was '
                                 'not formatted properly, or because data for '
                                 'a regression problem was provided to the '
                                 'TPOTClassifier object. Please make sure you '
                                 'passed the data to TPOT correctly.')

            self._fitted_pipeline = self._toolbox.compile(expr=self._optimized_pipeline)

            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                self._fitted_pipeline.fit(features, classes)

        if self.verbosity in [1, 2] and self._optimized_pipeline:
            # Add an extra line of spacing if the progress bar was used
            if self.verbosity >= 2:
                print('')
            print('Best pipeline: {}'.format(self._optimized_pipeline))

        # Store and fit the entire Pareto front if sciencing
        elif self.verbosity >= 3 and self._hof:
            self._hof_fitted_pipelines = {}

            for pipeline in self._hof.items:
                self._hof_fitted_pipelines[str(pipeline)] = self._toolbox.compile(expr=pipeline)

                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    self._hof_fitted_pipelines[str(pipeline)].fit(features, classes)
def set_params(self, **params):
    """Reconfigure this TPOT instance by re-running its initializer

    The keyword arguments are forwarded unchanged to `__init__`, so any
    constructor argument that is not supplied here takes whatever default
    `__init__` declares, not the value the instance held before.

    Returns
    -------
    self
    """
    initializer = self.__init__
    initializer(**params)

    return self
def _set_param_recursive(self, pipeline_steps, parameter, value):
    """Recursively set a parameter on every object in a pipeline that has it

    Containers are recognised by attribute: an object with `steps`,
    `transformer_list` or `estimators` is descended into (in that order of
    precedence) rather than modified itself. Any other object is updated
    only if it already has an attribute named `parameter`.

    Parameters
    ----------
    pipeline_steps: array-like
        List of (str, obj) tuples from a scikit-learn pipeline or related object
    parameter: str
        The parameter to assign a value for in each pipeline object
    value: any
        The value to assign the parameter to in each pipeline object

    Returns
    -------
    None

    """
    for (_, obj) in pipeline_steps:
        # `parameter` and `value` must be forwarded on every recursive
        # call; omitting them raises TypeError for any nested pipeline
        if hasattr(obj, 'steps'):
            self._set_param_recursive(obj.steps, parameter, value)
        elif hasattr(obj, 'transformer_list'):
            self._set_param_recursive(obj.transformer_list, parameter, value)
        elif hasattr(obj, 'estimators'):
            self._set_param_recursive(obj.estimators, parameter, value)
        elif hasattr(obj, parameter):
            setattr(obj, parameter, value)
indicating the `individual`'s fitness + according to its performance on the provided data + + """ + + try: + if self.max_time_mins: + total_mins_elapsed = (datetime.now() - self._start_datetime).total_seconds() / 60. + if total_mins_elapsed >= self.max_time_mins: + raise KeyboardInterrupt('{} minutes have elapsed. ' + 'TPOT will close down'.format(total_mins_elapsed)) + + # Transform the tree expression into an sklearn pipeline + sklearn_pipeline = self._toolbox.compile(expr=individual) + + # Fix random state when specified + self._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + # Count the number of pipeline operators as a measure of pipeline complexity + operator_count = 0 + for i in range(len(individual)): + node = individual[i] + if ((type(node) is deap.gp.Terminal) or + type(node) is deap.gp.Primitive and node.name == 'CombineDFs'): + continue + operator_count += 1 + + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + cv_scores = cross_val_score(sklearn_pipeline, features, classes, + cv=self.num_cv_folds, scoring=self.scoring_function) + + resulting_score = np.mean(cv_scores) + + except Exception: + # Catch-all: Do not allow one pipeline that crashes to cause TPOT + # to crash. Instead, assign the crashing pipeline a poor fitness + # import traceback + # traceback.print_exc() + return 5000., -5000. 
def _random_mutation_operator(self, individual):
    """Apply one randomly chosen mutation to an individual

    The candidates are DEAP's uniform (subtree replacement), insert and
    shrink mutations; one of them is drawn with `np.random.choice` and
    called on `individual`.

    Parameters
    ----------
    individual: DEAP individual
        A list of pipeline operators and model parameters that can be
        compiled by DEAP into a callable function

    Returns
    -------
    Whatever the selected DEAP mutation function returns for `individual`

    """
    replace_subtree = partial(gp.mutUniform, expr=self._toolbox.expr_mut, pset=self._pset)
    insert_node = partial(gp.mutInsert, pset=self._pset)
    shrink_tree = partial(gp.mutShrink)

    chosen_mutation = np.random.choice([replace_subtree, insert_node, shrink_tree])
    return chosen_mutation(individual)
# Generate function adapted from deap.gp.generate
def _generate(self, pset, min_, max_, condition, type_=None):
    """Generate a tree as a flat list of primitives and terminals.

    The tree is built from the root to the leaves; a branch stops growing
    as soon as `condition` is fulfilled for it.

    Parameters
    ----------
    pset: PrimitiveSetTyped
        Primitive set from which primitives are selected.
    min_: int
        Lower bound passed to `np.random.randint` when drawing the height.
    max_: int
        Upper bound passed to `np.random.randint` when drawing the height.
    condition: function
        Called as `condition(height, depth, type_)` with the height of
        the tree to build, the current depth and the type required at
        this position; a true result makes the node a terminal.
    type_: class
        The type that should return the tree when called, when
        :obj:`None` (default) the type of :pset: (pset.ret) is assumed.

    Returns
    -------
    individual: list
        A grown tree with leaves at possibly different depths
        depending on the condition function.

    Raises
    ------
    IndexError
        If no terminal or primitive of the required type is available.
    """
    if type_ is None:
        type_ = pset.ret
    expr = []
    height = np.random.randint(min_, max_)
    stack = [(0, type_)]
    while stack:
        depth, type_ = stack.pop()

        # We've added a type_ parameter to the condition function
        if condition(height, depth, type_):
            candidates = pset.terminals[type_]
            # Check emptiness explicitly: np.random.choice signals an
            # empty input with ValueError rather than IndexError
            if len(candidates) == 0:
                raise IndexError("The gp.generate function tried to add "
                                 "a terminal of type '%s', but there is "
                                 "none available." % (type_,))
            term = np.random.choice(candidates)
            if inspect.isclass(term):
                term = term()
            expr.append(term)
        else:
            candidates = pset.primitives[type_]
            if len(candidates) == 0:
                raise IndexError("The gp.generate function tried to add "
                                 "a primitive of type '%s', but there is "
                                 "none available." % (type_,))
            prim = np.random.choice(candidates)
            expr.append(prim)
            # Push arguments in reverse so they are expanded left to right
            for arg in reversed(prim.args):
                stack.append((depth + 1, arg))

    return expr
actual evaluated pipeline count does not match the # supposed count because DEAP can cache pipelines. Here any missed # evaluations are added back to the progress bar. - if self.pbar.n < self.gp_generation * self.population_size: - missing_pipelines = (self.gp_generation * self.population_size) - self.pbar.n - self.pbar.update(missing_pipelines) + if self._pbar.n < self._gp_generation * self.population_size: + missing_pipelines = (self._gp_generation * self.population_size) - self._pbar.n + self._pbar.update(missing_pipelines) + + if not (self.max_time_mins is None) and self._pbar.n >= self._pbar.total: + self._pbar.total += self.population_size return ret # Pass back return value of func diff --git a/tpot/driver.py b/tpot/driver.py new file mode 100644 index 00000000..93d64993 --- /dev/null +++ b/tpot/driver.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Randal S. Olson + +This file is part of the TPOT library. + +The TPOT library is free software: you can redistribute it and/or +modify it under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your option) +any later version. + +The TPOT library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along +with the TPOT library. If not, see http://www.gnu.org/licenses/. 
def float_range(value):
    """Ensures that the provided value is a float in the range [0., 1.]
    Throws an exception otherwise

    Parameters
    ----------
    value: float
        The number to evaluate; anything accepted by `float()`

    Returns
    -------
    value: float
        Returns a float in the closed range [0., 1.]

    Raises
    ------
    argparse.ArgumentTypeError
        If the value cannot be converted to a float, is NaN, or lies
        outside [0., 1.]
    """
    try:
        value = float(value)
    # `Exception` rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not reported as an invalid argument
    except Exception:
        raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.
                                         format(value))
    # Chained comparison so that NaN, which fails every comparison, is rejected
    if not (0.0 <= value <= 1.0):
        raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.
                                         format(value))
    return value
TPOT will evaluate ' + 'GENERATIONS x POPULATION_SIZE number of pipelines in total.') + + parser.add_argument('-mr', action='store', dest='MUTATION_RATE', default=0.9, + type=float_range, help='GP mutation rate in the range [0.0, 1.0]. We ' + 'recommend using the default parameter unless you ' + 'understand how the mutation rate affects GP algorithms.') + + parser.add_argument('-xr', action='store', dest='CROSSOVER_RATE', default=0.05, + type=float_range, help='GP crossover rate in the range [0.0, 1.0]. We ' + 'recommend using the default parameter unless you ' + 'understand how the crossover rate affects GP algorithms.') + + parser.add_argument('-cv', action='store', dest='NUM_CV_FOLDS', default=3, + type=int, help='The number of folds to evaluate each pipeline over in ' + 'k-fold cross-validation during the TPOT pipeline optimization process.') + + parser.add_argument('-scoring', action='store', dest='SCORING_FN', default=None, + type=str, help='Function used to evaluate the quality of a given pipeline for ' + 'the problem. By default, balanced accuracy is used for classification and mean ' + 'squared error is used for regression. ' + 'TPOT assumes that any function with "error" or "loss" in the name is meant to ' + 'be minimized, whereas any other functions will be maximized. ' + 'Offers the same options as cross_val_score: ' + '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", ' + '"f1_micro", "f1_samples", "f1_weighted", "log_loss", "mean_absolute_error", ' + '"mean_squared_error", "median_absolute_error", "precision", "precision_macro", ' + '"precision_micro", "precision_samples", "precision_weighted", "r2", "recall", ' + '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"') + + parser.add_argument('-maxtime', action='store', dest='MAX_TIME_MINS', default=None, + type=int, help='How many minutes TPOT has to optimize the pipeline. 
This ' + 'setting will override the GENERATIONS parameter ' + 'and allow TPOT to run until it runs out of time.') + + parser.add_argument('-s', action='store', dest='RANDOM_STATE', default=None, + type=int, help='Random number generator seed for reproducibility. Set ' + 'this seed if you want your TPOT run to be reproducible with the same ' + 'seed and data set in the future.') + + parser.add_argument('-v', action='store', dest='VERBOSITY', default=1, + choices=[0, 1, 2, 3], type=int, help='How much information TPOT ' + 'communicates while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all.') + + parser.add_argument('--no-update-check', action='store_true', + dest='DISABLE_UPDATE_CHECK', default=False, + help='Flag indicating whether the TPOT version checker should be disabled.') + + parser.add_argument('--version', action='version', + version='TPOT {version}'.format(version=__version__), + help='Show TPOT\'s version number and exit.') + + args = parser.parse_args() + + if args.VERBOSITY >= 2: + print('\nTPOT settings:') + for arg in sorted(args.__dict__): + arg_val = args.__dict__[arg] + if arg == 'DISABLE_UPDATE_CHECK': + continue + elif arg == 'SCORING_FN' and args.__dict__[arg] is None: + if args.TPOT_MODE == 'classification': + arg_val = 'balanced_accuracy' + else: + arg_val = 'mean_squared_error' + print('{}\t=\t{}'.format(arg, arg_val)) + print('') + + input_data = np.recfromcsv(args.INPUT_FILE, delimiter=args.INPUT_SEPARATOR, dtype=np.float64) + if args.TARGET_NAME not in input_data.dtype.names: + raise ValueError('The provided data file does not seem to have a target column. 
' + 'Please make sure to specify the target column using the -target parameter.') + + features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), + input_data.dtype.names.index(args.TARGET_NAME), axis=1) + + training_features, testing_features, training_classes, testing_classes = \ + train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) + + if args.TPOT_MODE == 'classification': + tpot_type = TPOTClassifier + else: + tpot_type = TPOTRegressor + + tpot = tpot_type(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, + mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, + num_cv_folds=args.NUM_CV_FOLDS, scoring=args.SCORING_FN, + max_time_mins=args.MAX_TIME_MINS, + random_state=args.RANDOM_STATE, verbosity=args.VERBOSITY, + disable_update_check=args.DISABLE_UPDATE_CHECK) + + tpot.fit(training_features, training_classes) + + if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline: + training_score = max([tpot._hof.keys[x].wvalues[1] for x in range(len(tpot._hof.keys))]) + print('\nTraining score: {}'.format(abs(training_score))) + print('Holdout score: {}'.format(tpot.score(testing_features, testing_classes))) + + elif args.VERBOSITY >= 3 and tpot._hof: + print('Final Pareto front testing scores:') + + for pipeline, pipeline_scores in zip(tpot._hof.items, reversed(tpot._hof.keys)): + tpot._fitted_pipeline = tpot._hof_fitted_pipelines[str(pipeline)] + print('{}\t{}\t{}'.format(int(abs(pipeline_scores.wvalues[0])), + tpot.score(testing_features, testing_classes), + pipeline)) + + if args.OUTPUT_FILE != '': + tpot.export(args.OUTPUT_FILE) + + +if __name__ == '__main__': + main() diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 779c1a6f..0da55a58 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -23,8 +23,7 @@ def export_pipeline(exported_pipeline): - """Generates the source code of a Python script that recreates the - functionality of a TPOT pipeline + 
"""Generates the source code of a TPOT Pipeline Parameters ---------- @@ -209,11 +208,11 @@ def process_operator(operator, depth=0): if input_name != 'input_matrix': steps.extend(process_operator(input_name, depth + 1)) - # If the step is a classifier and is not the last step then we must + # If the step is an estimator and is not the last step then we must # add its guess as a synthetic feature - if tpot_op.type == "Classifier" and depth > 0: + if tpot_op.root and depth > 0: steps.append( - "make_union(VotingClassifier(estimators=[(\"clf\", {})]), FunctionTransformer(lambda X: X))". + "make_union(VotingClassifier([(\"est\", {})]), FunctionTransformer(lambda X: X))". format(tpot_op.export(*args)) ) else: @@ -250,8 +249,8 @@ def _make_branch(branch): elif branch[1] == "input_matrix": # If depth of branch == 1 tpot_op = operators.Operator.get_by_name(branch[0]) - if tpot_op.type == "Classifier": - return """make_union(VotingClassifier(estimators=[('branch', + if tpot_op.root: + return """make_union(VotingClassifier([('branch', {} )]), FunctionTransformer(lambda X: X))""".format(_indent(process_operator(branch)[0], 4)) else: @@ -259,8 +258,8 @@ def _make_branch(branch): else: # We're going to have to make a pipeline tpot_op = operators.Operator.get_by_name(branch[0]) - if tpot_op.type == "Classifier": - return """make_union(VotingClassifier(estimators=[('branch', + if tpot_op.root: + return """make_union(VotingClassifier([('branch', {} )]), FunctionTransformer(lambda X: X))""".format(_indent(generate_pipeline_code(branch), 4)) else: diff --git a/tpot/gp_types.py b/tpot/gp_types.py index 9cc0af44..db1a95dc 100644 --- a/tpot/gp_types.py +++ b/tpot/gp_types.py @@ -20,10 +20,13 @@ class Bool(object): + """Boolean class used for deap due to deap's poor handling of booleans""" - pass + pass class Output_DF(object): + """Output data type of pipelines""" + pass diff --git a/tpot/metrics.py b/tpot/metrics.py new file mode 100644 index 00000000..303f91e9 --- /dev/null +++ 
def balanced_accuracy(y_true, y_pred):
    """Default scoring function: balanced accuracy

    Balanced accuracy computes each class' accuracy on a per-class basis using a
    one-vs-rest encoding, then computes an unweighted average of the class accuracies.
    The per-class accuracy is the mean of that class' sensitivity and specificity.

    Every label that occurs in either `y_true` or `y_pred` is scored. A rate whose
    denominator is zero (a class that never occurs in `y_true`, or a `y_true` that
    holds a single class only) is counted as 0.0 instead of raising
    ZeroDivisionError.

    Parameters
    ----------
    y_true: numpy.ndarray {n_samples}
        True class labels
    y_pred: numpy.ndarray {n_samples}
        Predicted class labels by the estimator

    Returns
    -------
    fitness: float
        Returns a float value indicating the `individual`'s balanced accuracy
        0.5 is as good as chance, and 1.0 is perfect predictive accuracy
    """
    # Accept plain sequences as well as arrays; the comparisons below need
    # element-wise semantics.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    all_class_accuracies = []
    for this_class in np.unique(np.append(y_true, y_pred)):
        true_is_class = (y_true == this_class)
        pred_is_class = (y_pred == this_class)

        n_positives = np.count_nonzero(true_is_class)
        n_negatives = true_is_class.size - n_positives

        # Guard both denominators: a label seen only in the predictions has
        # no positives, a single-class y_true has no negatives.
        this_class_sensitivity = 0.
        if n_positives:
            this_class_sensitivity = \
                np.count_nonzero(pred_is_class & true_is_class) / float(n_positives)

        this_class_specificity = 0.
        if n_negatives:
            this_class_specificity = \
                np.count_nonzero(~pred_is_class & ~true_is_class) / float(n_negatives)

        all_class_accuracies.append(
            (this_class_sensitivity + this_class_specificity) / 2.)

    return np.mean(all_class_accuracies)
def preprocess_args(self, learning_rate, max_features):
    """Limit the evolved arguments of the gradient boosting classifier

    Parameters
    ----------
    learning_rate: float
        Shrinks the contribution of each tree by learning_rate;
        limited to [0.0001, 1]
    max_features: float
        Maximum number of features to use (proportion of total features);
        limited to [0, 1]

    Returns
    -------
    dict
        Keyword arguments for the wrapped estimator; `n_estimators` is
        fixed at 500
    """
    learning_rate = min(1., max(learning_rate, 0.0001))
    # Clamp max_features itself. This used to clamp learning_rate a second
    # time, so the evolved max_features value was silently thrown away.
    max_features = min(1., max(0., max_features))

    return {
        'learning_rate': learning_rate,
        'max_features': max_features,
        'n_estimators': 500
    }
def preprocess_args(self):
    """Return the keyword arguments for the random forest classifier

    The operator takes no evolved arguments, so only the fixed number of
    estimators is supplied.
    """
    fixed_kwargs = dict(n_estimators=500)
    return fixed_kwargs
class TPOTXGBClassifier(Classifier):
    """Fits an XGBoost Classifier

    Parameters
    ----------
    max_depth: int
        Maximum tree depth for base learners; limited to [1, 10]
    min_child_weight: int
        Minimum sum of instance weight(hessian) needed in a child;
        limited to [1, 20]
    learning_rate: float
        Shrinks the contribution of each tree by learning_rate;
        limited to [0.0001, 1]
    subsample: float
        Subsample ratio of the training instance; limited to [0.05, 1]

    """
    import_hash = {'xgboost': ['XGBClassifier']}
    sklearn_class = XGBClassifier
    arg_types = (int, int, float, float)

    def __init__(self):
        pass

    def preprocess_args(self, max_depth, min_child_weight, learning_rate, subsample):
        """Bound every evolved argument and add the fixed number of estimators"""
        def bounded(value, lowest, highest):
            return min(highest, max(value, lowest))

        return dict(
            max_depth=bounded(max_depth, 1, 10),
            min_child_weight=bounded(min_child_weight, 1, 20),
            learning_rate=bounded(learning_rate, 0.0001, 1.),
            subsample=bounded(subsample, 0.05, 1.),
            n_estimators=500
        )
scikit-learn's MaxAbsScaler to transform all of the features by - scaling them to [0, 1] relative to the feature's maximum value. + """Uses scikit-learn's MaxAbsScaler to transform all of the features by scaling them to [0, 1] relative to the feature's maximum value. + Parameters ---------- None - """ + import_hash = {'sklearn.preprocessing': ['MaxAbsScaler']} sklearn_class = MaxAbsScaler arg_types = () @@ -39,6 +38,5 @@ def __init__(self): pass def preprocess_args(self): - return { - - } + """Preprocess the arguments in case they need to be limited to a certain value range""" + return { } diff --git a/tpot/operators/preprocessors/min_max_scalar.py b/tpot/operators/preprocessors/min_max_scalar.py index 5a128279..56b741e7 100644 --- a/tpot/operators/preprocessors/min_max_scalar.py +++ b/tpot/operators/preprocessors/min_max_scalar.py @@ -21,16 +21,15 @@ from .base import Preprocessor from sklearn.preprocessing import MinMaxScaler - class TPOTMaxAbsScaler(Preprocessor): - """Uses scikit-learn's MinMaxScaler to transform all of the features by - scaling them to the range [0, 1]. + """Uses scikit-learn's MinMaxScaler to transform all of the features by scaling them to the range [0, 1]. 
+ Parameters ---------- None - """ + import_hash = {'sklearn.preprocessing': ['MinMaxScaler']} sklearn_class = MinMaxScaler arg_types = () @@ -39,6 +38,5 @@ def __init__(self): pass def preprocess_args(self): - return { - - } + """Preprocess the arguments in case they need to be limited to a certain value range""" + return { } diff --git a/tpot/operators/preprocessors/pca.py b/tpot/operators/preprocessors/pca.py index dadd0ae9..1fa86eac 100644 --- a/tpot/operators/preprocessors/pca.py +++ b/tpot/operators/preprocessors/pca.py @@ -21,16 +21,16 @@ from .base import Preprocessor from sklearn.decomposition import RandomizedPCA - class TPOTRandomizedPCA(Preprocessor): + """Uses scikit-learn's RandomizedPCA to transform the feature set Parameters ---------- iterated_power: int Number of iterations for the power method. [1, 10] - """ + import_hash = {'sklearn.decomposition': ['RandomizedPCA']} sklearn_class = RandomizedPCA arg_types = (int, ) diff --git a/tpot/operators/preprocessors/zero_count.py b/tpot/operators/preprocessors/zero_count.py index 07880f18..fd37bd32 100644 --- a/tpot/operators/preprocessors/zero_count.py +++ b/tpot/operators/preprocessors/zero_count.py @@ -26,10 +26,9 @@ class ZeroCount(BaseEstimator): - """PreProcessor that adds two virtual features to the dataset, one for the - count of zero values in the feature set, and one for the count of non-zeros - in the feature set - """ + + """Preprocessor that adds two virtual features to the dataset, one for the count of zero values in the feature set, and one for the count of non-zeros in the feature set""" + def __init__(self): pass @@ -70,19 +69,17 @@ def transform(self, X, y=None): class TPOTZeroCount(Preprocessor): - """Uses TPOT's ZeroCount to transform the feature set - Parameters - ---------- - None + """Uses TPOT's ZeroCount to transform the feature set""" - """ import_hash = {'tpot.operators.preprocessors': ['ZeroCount']} sklearn_class = ZeroCount arg_types = () def __init__(self): + """Creates a 
# Regressor operators exposed by the package.
from .base import *
from .extra_trees import *
from .random_forest import *
from .knnr import *
from .gradient_boosting import *
from .passive_aggressive_r import *
from .linear_svr import *
from .elastic_net import *

# xgboost is treated as optional by tpot/operators/classifiers/__init__.py;
# an unguarded import here would make the whole operators package fail to
# import when xgboost is missing, so the regressor is skipped in that case.
try:
    from .xg_boost_r import *
except ImportError:
    pass

# NOTE(review): ada_boost.py and lasso_lars_cv.py exist in this package but
# are not imported here, so their operators are never exposed -- confirm
# whether that is intentional.
# NOTE(review): the class name says "Classifier" although it derives from
# Regressor and wraps AdaBoostRegressor; the name is kept because it is the
# public identifier of this operator.
class TPOTAdaBoostClassifier(Regressor):
    """Fits an AdaBoost Regressor

    Parameters
    ----------
    learning_rate: float
        Learning rate shrinks the contribution of each regressor by
        learning_rate; limited to [0.0001, 1]

    """
    import_hash = {'sklearn.ensemble': ['AdaBoostRegressor']}
    sklearn_class = AdaBoostRegressor
    arg_types = (float, )

    def __init__(self):
        pass

    def preprocess_args(self, learning_rate):
        """Bound the learning rate and add the fixed number of estimators"""
        lower_bounded = max(0.0001, learning_rate)
        return dict(
            learning_rate=min(1., lower_bounded),
            n_estimators=500
        )
class Regressor(Operator):
    """Parent class for regressors in TPOT

    The class attributes below are capability flags shared by every
    regressor operator that derives from this class.
    """

    root = True  # Whether this operator type can be the root of the tree
    regression = True  # Whether this operator can be used in a regression problem
    classification = False  # Whether the operator can be used for classification
def preprocess_args(self, alpha, l1_ratio):
    """Limit both ElasticNet arguments to the range [0.0001, 1]

    Parameters
    ----------
    alpha: float
        Constant that multiplies the penalty terms
    l1_ratio: float
        The ElasticNet mixing parameter

    Returns
    -------
    dict
        The bounded `alpha` and `l1_ratio` keyword arguments
    """
    floor, ceiling = 0.0001, 1.

    bounded = {}
    for key, raw_value in (('alpha', alpha), ('l1_ratio', l1_ratio)):
        bounded[key] = min(ceiling, max(floor, raw_value))
    return bounded
class TPOTExtraTreesRegressor(Regressor):
    """Fits an Extra Trees Regressor

    Parameters
    ----------
    max_features: float
        The number of features to consider when looking for the best split;
        limited to [0, 1]

    """
    # The docstring used to list a `criterion` argument ('gini'/'entropy');
    # this operator only takes `max_features` (see arg_types), so that entry
    # was dropped.
    import_hash = {'sklearn.ensemble': ['ExtraTreesRegressor']}
    sklearn_class = ExtraTreesRegressor
    arg_types = (float, )

    def __init__(self):
        pass

    def preprocess_args(self, max_features):
        """Bound max_features and add the fixed number of estimators"""
        max_features = min(1., max(0., max_features))

        return {
            'max_features': max_features,
            'n_estimators': 500
        }
class TPOTGradientBRegressor(Regressor):
    """Fits a Gradient Boosting Regressor

    Parameters
    ----------
    learning_rate: float
        Shrinks the contribution of each tree by learning_rate;
        limited to [0.0001, 1]
    max_features: float
        Maximum number of features to use (proportion of total features);
        limited to [0, 1]

    """
    import_hash = {'sklearn.ensemble': ['GradientBoostingRegressor']}
    sklearn_class = GradientBoostingRegressor
    arg_types = (float, float)

    def __init__(self):
        pass

    def preprocess_args(self, learning_rate, max_features):
        """Bound the evolved arguments and add the fixed number of estimators"""
        learning_rate = min(1., max(learning_rate, 0.0001))
        # Clamp max_features itself. This used to clamp learning_rate a
        # second time, so the evolved max_features value was silently
        # thrown away.
        max_features = min(1., max(0., max_features))

        return {
            'learning_rate': learning_rate,
            'max_features': max_features,
            'n_estimators': 500
        }
class TPOTKNeighborsRegressor(Regressor):
    """Fits a k-nearest neighbor Regressor

    Parameters
    ----------
    n_neighbors: int
        Number of neighbors to use by default for k_neighbors queries;
        limited to [2, 5]
    weights: int
        Selects a value from the list: ['uniform', 'distance']

    """
    import_hash = {'sklearn.neighbors': ['KNeighborsRegressor']}
    sklearn_class = KNeighborsRegressor
    arg_types = (int, int)

    def __init__(self):
        pass

    def preprocess_args(self, n_neighbors, weights):
        """Bound n_neighbors and map the weights integer onto a scheme name"""
        capped_neighbors = min(5, n_neighbors)
        weighting_schemes = ['uniform', 'distance']

        return dict(
            n_neighbors=max(capped_neighbors, 2),
            # Any integer is folded onto a valid index by the modulo.
            weights=weighting_schemes[weights % len(weighting_schemes)]
        )
class TPOTLassoLarsCV(Regressor):
    """Fits a LassoLarsCV Regressor

    Parameters
    ----------
    normalize: bool
        If True, the regressors X will be normalized before regression.

    """
    import_hash = {'sklearn.linear_model': ['LassoLarsCV']}
    sklearn_class = LassoLarsCV
    arg_types = (Bool, )

    def __init__(self):
        pass

    def preprocess_args(self, normalize):
        """Pass the normalize flag through unchanged"""
        return dict(normalize=normalize)
def preprocess_args(self, C, dual):
    """Limit the LinearSVR penalty and pass the dual flag through

    Parameters
    ----------
    C: float
        Penalty parameter C of the error term; limited to [0.0001, 25]
    dual: bool
        Select the algorithm to either solve the dual or primal
        optimization problem; forwarded unchanged

    Returns
    -------
    dict
        The `C` and `dual` keyword arguments
    """
    penalty = max(0.0001, C)
    penalty = min(25., penalty)
    return dict(C=penalty, dual=dual)
def preprocess_args(self, C, loss):
    """Limit the penalty and map the loss integer onto a loss name

    Parameters
    ----------
    C: float
        Penalty parameter C of the error term; limited to [0.0001, 1]
    loss: int
        Integer used to determine the loss function
        (either 'epsilon_insensitive' or 'squared_epsilon_insensitive')

    Returns
    -------
    dict
        The `C` and `loss` keyword arguments, with `fit_intercept`
        fixed to True
    """
    known_losses = ('epsilon_insensitive', 'squared_epsilon_insensitive')
    # Any integer is folded onto a valid index by the modulo.
    chosen_loss = known_losses[loss % len(known_losses)]

    penalty = max(0.0001, C)
    return dict(C=min(1., penalty), loss=chosen_loss, fit_intercept=True)
+ + Parameters + ---------- + None + """ + + import_hash = {'sklearn.ensemble': ['RandomForestRegressor']} + sklearn_class = RandomForestRegressor + arg_types = () + + def __init__(self): + pass + + def preprocess_args(self): + """Preprocess the arguments in case they need to be limited to a certain value range""" + return { + 'n_estimators': 500 + } diff --git a/tpot/operators/regressors/xg_boost_r.py b/tpot/operators/regressors/xg_boost_r.py new file mode 100644 index 00000000..2adb10db --- /dev/null +++ b/tpot/operators/regressors/xg_boost_r.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Randal S. Olson + +This file is part of the TPOT library. + +The TPOT library is free software: you can redistribute it and/or +modify it under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your option) +any later version. + +The TPOT library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along +with the TPOT library. If not, see http://www.gnu.org/licenses/. 
+ +""" + +from .base import Regressor +from xgboost import XGBRegressor + + +class TPOTXGBRegressor(Regressor): + """Fits an XGBoost Regressor + + Parameters + ---------- + max_depth: int + Maximum tree depth for base learners + min_child_weight: int + Minimum sum of instance weight(hessian) needed in a child + learning_rate: float + Shrinks the contribution of each tree by learning_rate + subsample: float + Subsample ratio of the training instance + """ + import_hash = {'xgboost': ['XGBRegressor']} + sklearn_class = XGBRegressor + arg_types = (int, int, float, float) + + def __init__(self): + pass + + def preprocess_args(self, max_depth, min_child_weight, learning_rate, subsample): + max_depth = min(10, max(max_depth, 1)) + min_child_weight = min(20, max(min_child_weight, 1)) + learning_rate = min(1., max(learning_rate, 0.0001)) + subsample = min(1., max(subsample, 0.05)) + + return { + 'max_depth': max_depth, + 'min_child_weight': min_child_weight, + 'learning_rate': learning_rate, + 'subsample': subsample, + 'n_estimators': 500 + } diff --git a/tpot/operators/selectors/__init__.py b/tpot/operators/selectors/__init__.py index 46f32de1..43296fdf 100644 --- a/tpot/operators/selectors/__init__.py +++ b/tpot/operators/selectors/__init__.py @@ -24,3 +24,5 @@ from .select_kbest import * from .select_percentile import * from .variance_threshold import * +from .select_from_model import * +from .select_from_model_r import * diff --git a/tpot/operators/selectors/base.py b/tpot/operators/selectors/base.py index 45e9b797..57034699 100644 --- a/tpot/operators/selectors/base.py +++ b/tpot/operators/selectors/base.py @@ -25,3 +25,5 @@ class Selector(Operator): """Parent class for Feature Selectors in TPOT""" root = False # Whether this operator type can be the root of the tree + regression = True # Whether this operator can be used in a regression problem + classification = True # Whether the operator can be used for classification diff --git a/tpot/operators/selectors/rfe.py 
b/tpot/operators/selectors/rfe.py index 32d93a32..e0da0e74 100644 --- a/tpot/operators/selectors/rfe.py +++ b/tpot/operators/selectors/rfe.py @@ -35,6 +35,7 @@ class TPOTRFE(Selector): import_hash = {'sklearn.feature_selection': ['RFE'], 'sklearn.svm': ['SVC']} sklearn_class = RFE arg_types = (float, ) + regression = False # Can not be used in regression due to SVC estimator def __init__(self): pass diff --git a/tpot/operators/selectors/select_from_model.py b/tpot/operators/selectors/select_from_model.py new file mode 100644 index 00000000..0417771d --- /dev/null +++ b/tpot/operators/selectors/select_from_model.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Randal S. Olson + +This file is part of the TPOT library. + +The TPOT library is free software: you can redistribute it and/or +modify it under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your option) +any later version. + +The TPOT library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along +with the TPOT library. If not, see http://www.gnu.org/licenses/. + +""" + +from .base import Selector +from sklearn.feature_selection import SelectFromModel +from sklearn.ensemble import ExtraTreesClassifier + + +class TPOTSelectFromModel(Selector): + """Uses scikit-learn's ExtraTreesClassifier combined with SelectFromModel + to transform the feature set. + + Parameters + ---------- + threshold: float + Features whose importance is greater or equal are kept while the others + are discarded. 
+ criterion: int + For the ExtraTreesClassifier: + Integer that is used to select from the list of valid criteria, + either 'gini', or 'entropy' + max_features: float + For the ExtraTreesClassifier: + The number of features to consider when looking for the best split + + """ + import_hash = { + 'sklearn.feature_selection': ['SelectFromModel'], + 'sklearn.ensemble': ['ExtraTreesClassifier'] + } + sklearn_class = SelectFromModel + arg_types = (float, int, float) + regression = False # Can not be used in regression due to ExtraTreesClassifier + + def __init__(self): + pass + + def preprocess_args(self, threshold, criterion, max_features): + threshold = min(1., max(0., threshold)) + + # Select criterion string from list of valid parameters + criterion_values = ['gini', 'entropy'] + criterion_selection = criterion_values[criterion % len(criterion_values)] + + max_features = min(1., max(0., max_features)) + + return { + 'estimator': ExtraTreesClassifier(criterion=criterion_selection, max_features=max_features), + 'threshold': threshold + } diff --git a/tpot/operators/selectors/select_from_model_r.py b/tpot/operators/selectors/select_from_model_r.py new file mode 100644 index 00000000..804fcafe --- /dev/null +++ b/tpot/operators/selectors/select_from_model_r.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Randal S. Olson + +This file is part of the TPOT library. + +The TPOT library is free software: you can redistribute it and/or +modify it under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your option) +any later version. + +The TPOT library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. You should have received a copy of the GNU General Public License along +with the TPOT library. 
If not, see http://www.gnu.org/licenses/. + +""" + +from .base import Selector +from sklearn.feature_selection import SelectFromModel +from sklearn.ensemble import ExtraTreesRegressor + + +class TPOTSelectFromModelR(Selector): + """Uses scikit-learn's ExtraTreesRegressor combined with SelectFromModel + to transform the feature set. + + Parameters + ---------- + threshold: float + Features whose importance is greater or equal are kept while the others + are discarded. + criterion: int + For the ExtraTreesRegressor: + Integer that is used to select from the list of valid criteria, + currently only 'mse' (mean squared error) + max_features: float + For the ExtraTreesRegressor: + The number of features to consider when looking for the best split + + """ + import_hash = { + 'sklearn.feature_selection': ['SelectFromModel'], + 'sklearn.ensemble': ['ExtraTreesRegressor'] + } + sklearn_class = SelectFromModel + arg_types = (float, int, float) + classification = False + + def __init__(self): + pass + + def preprocess_args(self, threshold, criterion, max_features): + threshold = min(1., max(0., threshold)) + + # Select criterion string from list of valid parameters + criterion_values = ['mse'] + criterion_selection = criterion_values[criterion % len(criterion_values)] + + max_features = min(1., max(0., max_features)) + + return { + 'estimator': ExtraTreesRegressor(criterion=criterion_selection, max_features=max_features), + 'threshold': threshold + } diff --git a/tpot/tpot.py b/tpot/tpot.py index 28dda914..cf6ba78e 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -18,803 +18,38 @@ """ -from __future__ import print_function -import argparse -import random -import inspect -import warnings -import sys -from functools import partial +from .base import TPOTBase -import numpy as np -import deap -from deap import algorithms, base, creator, tools, gp -from tqdm import tqdm -from sklearn.cross_validation import train_test_split, cross_val_score -from sklearn.pipeline import 
make_pipeline, make_union -from sklearn.preprocessing import FunctionTransformer -from sklearn.ensemble import VotingClassifier +class TPOTClassifier(TPOTBase): + """TPOT estimator for classification problems""" -from update_checker import update_check + scoring_function = 'balanced_accuracy' # Classification scoring -from ._version import __version__ -from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code -from .decorators import _gp_new_generation -from . import operators -from .operators import CombineDFs -from .gp_types import Bool, Output_DF - - -class TPOT(object): - """TPOT automatically creates and optimizes machine learning pipelines using - genetic programming - """ - - def __init__(self, population_size=100, generations=100, - mutation_rate=0.9, crossover_rate=0.05, - random_state=None, verbosity=0, - scoring_function=None, num_cv_folds=3, - disable_update_check=False): - """Sets up the genetic programming algorithm for pipeline optimization. - - Parameters - ---------- - population_size: int (default: 100) - The number of pipelines in the genetic algorithm population. Must - be > 0.The more pipelines in the population, the slower TPOT will - run, but it's also more likely to find better pipelines. - generations: int (default: 100) - The number of generations to run pipeline optimization for. Must - be > 0. The more generations you give TPOT to run, the longer it - takes, but it's also more likely to find better pipelines. - mutation_rate: float (default: 0.9) - The mutation rate for the genetic programming algorithm in the range - [0.0, 1.0]. This tells the genetic programming algorithm how many - pipelines to apply random changes to every generation. We don't - recommend that you tweak this parameter unless you know what you're - doing. - crossover_rate: float (default: 0.05) - The crossover rate for the genetic programming algorithm in the - range [0.0, 1.0]. 
This tells the genetic programming algorithm how - many pipelines to "breed" every generation. We don't recommend that - you tweak this parameter unless you know what you're doing. - random_state: int (default: 0) - The random number generator seed for TPOT. Use this to make sure - that TPOT will give you the same results each time you run it - against the same data set with that seed. - verbosity: int (default: 0) - How much information TPOT communicates while it's running. - 0 = none, 1 = minimal, 2 = all - scoring_function: str (default: balanced accuracy) - Function used to evaluate the goodness of a given pipeline for the - classification problem. By default, balanced class accuracy is used. - TPOT assumes that this scoring function should be maximized, i.e., - higher is better. - - Offers the same options as sklearn.cross_validation.cross_val_score: - - ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', - 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'precision', 'precision_macro', - 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', - 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'] - num_cv_folds: int (default: 3) - The number of folds to evaluate each pipeline over in k-fold cross-validation - during the TPOT pipeline optimization process - disable_update_check: bool (default: False) - Flag indicating whether the TPOT version checker should be disabled. 
- - Returns - ------- - None - - """ - # Save params to be recalled later by get_params() - self.params = locals() # Must be before any local variable definitions - self.params.pop('self') - - # Prompt the user if their version is out of date - if not disable_update_check: - update_check('tpot', __version__) - - self.hof = None - self._optimized_pipeline = None - self._fitted_pipeline = None - self.population_size = population_size - self.generations = generations - self.mutation_rate = mutation_rate - self.crossover_rate = crossover_rate - self.verbosity = verbosity - self.operators_context = { - 'make_pipeline': make_pipeline, - 'make_union': make_union, - 'VotingClassifier': VotingClassifier, - 'FunctionTransformer': FunctionTransformer - } - - self.pbar = None - self.gp_generation = 0 - self.random_state = random_state - - if scoring_function is None: - self.scoring_function = self._balanced_accuracy - else: - self.scoring_function = scoring_function - - self.num_cv_folds = num_cv_folds - - self._setup_pset() - self._setup_toolbox() - - def _setup_pset(self): - self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_DF) - - # Rename pipeline input to "input_df" - self._pset.renameArguments(ARG0='input_matrix') - - # Add all operators to the primitive set - for op in operators.Operator.inheritors(): - if op.root: - # We need to add rooted primitives twice so that they can - # return both an Output_DF (and thus be the root of the tree), - # and return a np.ndarray so they can exist elsewhere in the - # tree. 
- p_types = (op.parameter_types()[0], Output_DF) - self._pset.addPrimitive(op, *p_types) - - self._pset.addPrimitive(op, *op.parameter_types()) - - # Import required modules into local namespace so that pipelines - # may be evaluated directly - for key in sorted(op.import_hash.keys()): - module_list = ', '.join(sorted(op.import_hash[key])) - - if key.startswith("tpot."): - exec('from {} import {}'.format(key[4:], module_list)) - else: - exec('from {} import {}'.format(key, module_list)) - - for var in op.import_hash[key]: - self.operators_context[var] = eval(var) - - self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) - - # Terminals - int_terminals = np.concatenate(( - np.arange(0, 51, 1), - np.arange(60, 110, 10)) - ) - - for val in int_terminals: - self._pset.addTerminal(val, int) - - float_terminals = np.concatenate(( - [1e-6, 1e-5, 1e-4, 1e-3], - np.arange(0., 1.01, 0.01), - np.arange(2., 51., 1.), - np.arange(60., 101., 10.)) - ) - - for val in float_terminals: - self._pset.addTerminal(val, float) - - self._pset.addTerminal(True, Bool) - self._pset.addTerminal(False, Bool) - - def _setup_toolbox(self): - creator.create('FitnessMulti', base.Fitness, weights=(-1.0, 1.0)) - creator.create('Individual', - gp.PrimitiveTree, fitness=creator.FitnessMulti) - - self._toolbox = base.Toolbox() - self._toolbox.register('expr', - self._gen_grow_safe, pset=self._pset, min_=1, max_=3) - self._toolbox.register('individual', - tools.initIterate, creator.Individual, self._toolbox.expr) - self._toolbox.register('population', - tools.initRepeat, list, self._toolbox.individual) - self._toolbox.register('compile', self._compile_to_sklearn) - self._toolbox.register('select', self._combined_selection_operator) - self._toolbox.register('mate', gp.cxOnePoint) - self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) - self._toolbox.register('mutate', self._random_mutation_operator) - - def fit(self, features, classes): - """Fits a machine 
learning pipeline that maximizes classification - accuracy on the provided data - - Uses genetic programming to optimize a machine learning pipeline that - maximizes classification accuracy on the provided features and classes. - Performs an internal stratified training/testing cross-validaton split - to avoid overfitting on the provided data. + def _ignore_operator(self, op): + """Filter that describes which operators are not used Parameters ---------- - features: array-like {n_samples, n_features} - Feature matrix - classes: array-like {n_samples} - List of class labels for prediction - - Returns - ------- - None + op: Operator + TPOT Pipeline operator being tested """ - try: - if self.random_state: - random.seed(self.random_state) - np.random.seed(self.random_state) - - features = features.astype(np.float64) - classes = classes.astype(np.float64) - - self._toolbox.register('evaluate', self._evaluate_individual, features=features, classes=classes) - pop = self._toolbox.population(n=self.population_size) - - def pareto_eq(ind1, ind2): - """Function used to determine whether two individuals are equal - on the Pareto front - - Parameters - ---------- - ind1: DEAP individual from the GP population - First individual to compare - ind2: DEAP individual from the GP population - Second individual to compare - - Returns - ---------- - individuals_equal: bool - Boolean indicating whether the two individuals are equal on - the Pareto front - - """ - return np.all(ind1.fitness.values == ind2.fitness.values) - - self.hof = tools.ParetoFront(similar=pareto_eq) - - verbose = (self.verbosity == 2) - - # Start the progress bar - num_evaluations = self.population_size * (self.generations + 1) - self.pbar = tqdm(total=num_evaluations, unit='pipeline', leave=False, - disable=(not verbose), desc='GP Progress') + return not op.classification - pop, _ = algorithms.eaSimple( - population=pop, toolbox=self._toolbox, cxpb=self.crossover_rate, - mutpb=self.mutation_rate, 
ngen=self.generations, - halloffame=self.hof, verbose=False) - # Allow for certain exceptions to signal a premature fit() cancellation - except (KeyboardInterrupt, SystemExit): - if self.verbosity > 0: - print('GP closed prematurely - will use current best pipeline') - finally: - # Close the progress bar - # Standard truthiness checks won't work for tqdm - if not isinstance(self.pbar, type(None)): - self.pbar.close() - - # Reset gp_generation counter to restore initial state - self.gp_generation = 0 - - # Store the pipeline with the highest internal testing accuracy - if self.hof: - top_score = 0. - for pipeline, pipeline_scores in zip(self.hof.items, reversed(self.hof.keys)): - if pipeline_scores.wvalues[1] > top_score: - self._optimized_pipeline = pipeline - if self._optimized_pipeline is None: - raise ValueError(('There was an error in the TPOT optimization process. ' - 'This could be because the data was not formatted properly, ' - 'or because data for a regression problem was provided to the TPOTClassifier object. ' - 'Please make sure you passed the data to TPOT correctly.')) - else: - self._fitted_pipeline = self._toolbox.compile(expr=self._optimized_pipeline) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self._fitted_pipeline.fit(features, classes) - - if self.verbosity >= 1 and self._optimized_pipeline: - # Add an extra line of spacing if the progress bar was used - if verbose: - print() - - print('Best pipeline: {}'.format(self._optimized_pipeline)) - - def predict(self, features): - """Uses the optimized pipeline to predict the classes for a feature set - - Parameters - ---------- - features: array-like {n_samples, n_features} - Feature matrix to predict on - - Returns - ---------- - array-like: {n_samples} - Predicted classes for the feature matrix - - """ - if not self._fitted_pipeline: - raise ValueError(('A pipeline has not yet been optimized. 
' - 'Please call fit() first.')) - return self._fitted_pipeline.predict(features.astype(np.float64)) - - def fit_predict(self, features, classes): - """Convenience function that fits a pipeline then predicts on the - provided features - - Parameters - ---------- - features: array-like {n_samples, n_features} - Feature matrix - classes: array-like {n_samples} - List of class labels for prediction - - Returns - ---------- - array-like: {n_samples} - Predicted classes for the provided features - - """ - self.fit(features, classes) - return self.predict(features) - - def score(self, testing_features, testing_classes): - """Estimates the balanced testing accuracy of the optimized pipeline. - - Parameters - ---------- - testing_features: array-like {n_samples, n_features} - Feature matrix of the testing set - testing_classes: array-like {n_samples} - List of class labels for prediction in the testing set - - Returns - ------- - accuracy_score: float - The estimated test set accuracy - - """ - if self._fitted_pipeline is None: - raise ValueError(('A pipeline has not yet been optimized. ' - 'Please call fit() first.')) - - return self._balanced_accuracy(self._fitted_pipeline, testing_features.astype(np.float64), testing_classes) - - def get_params(self, deep=None): - """Get parameters for this estimator - - This function is necessary for TPOT to work as a drop-in estimator in, - e.g., sklearn.cross_validation.cross_val_score - - Parameters - ---------- - deep: unused - Only implemented to maintain interface for sklearn - - Returns - ------- - params : mapping of string to any - Parameter names mapped to their values. 
- - """ - return self.params - - def export(self, output_file_name): - """Exports the current optimized pipeline as Python code - - Parameters - ---------- - output_file_name: str - String containing the path and file name of the desired output file - - Returns - ------- - None - - """ - if self._optimized_pipeline is None: - raise ValueError(('A pipeline has not yet been optimized. ' - 'Please call fit() first.')) +class TPOTRegressor(TPOTBase): + """TPOT estimator for regression problems""" - with open(output_file_name, 'w') as output_file: - output_file.write(export_pipeline(self._optimized_pipeline)) + scoring_function = 'mean_squared_error' # Regression scoring - def _compile_to_sklearn(self, expr): - """Compiles a DEAP pipeline into a sklearn pipeline + def _ignore_operator(self, op): + """Filter that describes which operators are not used Parameters ---------- - expr: DEAP individual - The DEAP pipeline to be compiled + op: Operator + TPOT Pipeline operator being tested - Returns - ------- - sklearn_pipeline: sklearn.pipeline.Pipeline """ - sklearn_pipeline = generate_pipeline_code(expr_to_tree(expr)) - - return eval(sklearn_pipeline, self.operators_context) - - def _evaluate_individual(self, individual, features, classes): - """Determines the `individual`'s fitness according to its performance on - the provided data - - Parameters - ---------- - individual: DEAP individual - A list of pipeline operators and model parameters that can be - compiled by DEAP into a callable function - features: numpy.ndarray {n_samples, n_features} - A numpy matrix containing the training and testing features for the - `individual`'s evaluation - classes: numpy.ndarray {n_samples, } - A numpy matrix containing the training and testing classes for the - `individual`'s evaluation - - Returns - ------- - fitness: float - Returns a float value indicating the `individual`'s fitness - according to its performance on the provided data - - """ - - try: - # Transform the tree expression 
in a callable function - sklearn_pipeline = self._toolbox.compile(expr=individual) - - # Count the number of pipeline operators as a measure of pipeline - # complexity - operator_count = 0 - for i in range(len(individual)): - node = individual[i] - if ((type(node) is deap.gp.Terminal) or - type(node) is deap.gp.Primitive and node.name == 'CombineDFs'): - continue - operator_count += 1 - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - cv_scores = cross_val_score(sklearn_pipeline, features, classes, cv=self.num_cv_folds, scoring=self.scoring_function) - - resulting_score = np.mean(cv_scores) - except MemoryError: - # Throw out GP expressions that are too large to be compiled - return 5000., 0. - except Exception: - # Catch-all: Do not allow one pipeline that crashes to cause TPOT - # to crash. Instead, assign the crashing pipeline a poor fitness - return 5000., 0. - finally: - if not self.pbar.disable: - self.pbar.update(1) # One more pipeline evaluated - - if type(resulting_score) in [float, np.float64, np.float32]: - return max(1, operator_count), resulting_score - else: - raise ValueError('Scoring function does not return a float') - - def _balanced_accuracy(self, estimator, X_test, y_test): - """Default scoring function: balanced accuracy - - Balanced accuracy computes each class' accuracy on a per-class basis using a - one-vs-rest encoding, then computes an unweighted average of the class accuracies. - - Parameters - ---------- - estimator: scikit-learn estimator - The estimator for which to evaluate the balanced accuracy - X_test: numpy.ndarray {n_samples, n_features} - Test data that will be fed to estimator.predict. - y_test: numpy.ndarray {n_samples, 1} - Target values for X_test. 
- - Returns - ------- - fitness: float - Returns a float value indicating the `individual`'s balanced accuracy - 0.5 is as good as chance, and 1.0 is perfect predictive accuracy - """ - y_pred = estimator.predict(X_test) - all_classes = list(set(np.append(y_test, y_pred))) - all_class_accuracies = [] - for this_class in all_classes: - this_class_sensitivity = \ - float(sum((y_pred == this_class) & (y_test == this_class))) /\ - float(sum((y_test == this_class))) - - this_class_specificity = \ - float(sum((y_pred != this_class) & (y_test != this_class))) /\ - float(sum((y_test != this_class))) - - this_class_accuracy = (this_class_sensitivity + this_class_specificity) / 2. - all_class_accuracies.append(this_class_accuracy) - - balanced_accuracy = np.mean(all_class_accuracies) - return balanced_accuracy - - @_gp_new_generation - def _combined_selection_operator(self, individuals, k): - """Perform NSGA2 selection on the population according to their Pareto - fitness - - Parameters - ---------- - individuals: list - A list of individuals to perform selection on - k: int - The number of individuals to return from the selection phase - - Returns - ------- - fitness: list - Returns a list of individuals that were selected - - """ - return tools.selNSGA2(individuals, int(k / 5.)) * 5 - - def _random_mutation_operator(self, individual): - """Perform a replacement, insert, or shrink mutation on an individual - - Parameters - ---------- - individual: DEAP individual - A list of pipeline operators and model parameters that can be - compiled by DEAP into a callable function - - Returns - ------- - fitness: list - Returns the individual with one of the mutations applied to it - - """ - mutation_techniques = [ - partial(gp.mutUniform, expr=self._toolbox.expr_mut, pset=self._pset), - partial(gp.mutInsert, pset=self._pset), - partial(gp.mutShrink) - ] - return np.random.choice(mutation_techniques)(individual) - - def _gen_grow_safe(self, pset, min_, max_, type_=None): - """Generate 
an expression where each leaf might have a different depth - between *min* and *max*. - - Parameters - ---------- - pset: PrimitiveSetTyped - Primitive set from which primitives are selected. - min_: int - Minimum height of the produced trees. - max_: int - Maximum Height of the produced trees. - type_: class - The type that should return the tree when called, when - :obj:`None` (default) the type of :pset: (pset.ret) - is assumed. - Returns - ------- - individual: list - A grown tree with leaves at possibly different depths. - """ - - def condition(height, depth, type_): - """Expression generation stops when the depth is equal to height - or when it is randomly determined that a a node should be a terminal - """ - return type_ not in [np.ndarray, Output_DF] or depth == height - - return self._generate(pset, min_, max_, condition, type_) - - # Generate function stolen straight from deap.gp.generate - def _generate(self, pset, min_, max_, condition, type_=None): - """Generate a Tree as a list of list. The tree is build - from the root to the leaves, and it stop growing when the - condition is fulfilled. - - Parameters - ---------- - pset: PrimitiveSetTyped - Primitive set from which primitives are selected. - min_: int - Minimum height of the produced trees. - max_: int - Maximum Height of the produced trees. - condition: function - The condition is a function that takes two arguments, - the height of the tree to build and the current - depth in the tree. - type_: class - The type that should return the tree when called, when - :obj:`None` (default) no return type is enforced. - - Returns - ------- - individual: list - A grown tree with leaves at possibly different depths - dependending on the condition function. 
- """ - if type_ is None: - type_ = pset.ret - expr = [] - height = np.random.randint(min_, max_) - stack = [(0, type_)] - while len(stack) != 0: - depth, type_ = stack.pop() - - # We've added a type_ parameter to the condition function - if condition(height, depth, type_): - try: - term = np.random.choice(pset.terminals[type_]) - except IndexError: - _, _, traceback = sys.exc_info() - raise IndexError("The gp.generate function tried to add " - "a terminal of type '%s', but there is " - "none available." % (type_,)).\ - with_traceback(traceback) - if inspect.isclass(term): - term = term() - expr.append(term) - else: - try: - prim = np.random.choice(pset.primitives[type_]) - except IndexError: - _, _, traceback = sys.exc_info() - raise IndexError("The gp.generate function tried to add " - "a primitive of type '%s', but there is " - "none available." % (type_,)).\ - with_traceback(traceback) - expr.append(prim) - for arg in reversed(prim.args): - stack.append((depth+1, arg)) - - return expr - - -def positive_integer(value): - """Ensures that the provided value is a positive integer; throws an - exception otherwise - - Parameters - ---------- - value: int - The number to evaluate - - Returns - ------- - value: int - Returns a positive integer - """ - try: - value = int(value) - except Exception: - raise argparse.ArgumentTypeError('Invalid int value: \'{}\''.format(value)) - if value < 0: - raise argparse.ArgumentTypeError('Invalid positive int value: \'{}\''.format(value)) - return value - - -def float_range(value): - """Ensures that the provided value is a float integer in the range (0., 1.) - throws an exception otherwise - - Parameters - ---------- - value: float - The number to evaluate - - Returns - ------- - value: float - Returns a float in the range (0., 1.) 
- """ - try: - value = float(value) - except: - raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value)) - if value < 0.0 or value > 1.0: - raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value)) - return value - - -def main(): - """Main function that is called when TPOT is run on the command line""" - parser = argparse.ArgumentParser(description='A Python tool that automatically creates and ' - 'optimizes machine learning pipelines using genetic programming.', - add_help=False) - - parser.add_argument('INPUT_FILE', type=str, help='Data file to optimize the pipeline on; ensure that the class label column is labeled as "class".') - - parser.add_argument('-h', '--help', action='help', help='Show this help message and exit.') - - parser.add_argument('-is', action='store', dest='INPUT_SEPARATOR', default='\t', - type=str, help='Character used to separate columns in the input file.') - - parser.add_argument('-o', action='store', dest='OUTPUT_FILE', default='', - type=str, help='File to export the final optimized pipeline.') - - parser.add_argument('-g', action='store', dest='GENERATIONS', default=100, - type=positive_integer, help='Number of generations to run pipeline optimization over.\nGenerally, TPOT will work better when ' - 'you give it more generations (and therefore time) to optimize over. TPOT will evaluate ' - 'GENERATIONS x POPULATION_SIZE number of pipelines in total.') - - parser.add_argument('-p', action='store', dest='POPULATION_SIZE', default=100, - type=positive_integer, help='Number of individuals in the GP population.\nGenerally, TPOT will work better when you give it ' - ' more individuals (and therefore time) to optimize over. TPOT will evaluate ' - 'GENERATIONS x POPULATION_SIZE number of pipelines in total.') - - parser.add_argument('-mr', action='store', dest='MUTATION_RATE', default=0.9, - type=float_range, help='GP mutation rate in the range [0.0, 1.0]. 
We recommend using the default parameter unless you ' - 'understand how the mutation rate affects GP algorithms.') - - parser.add_argument('-xr', action='store', dest='CROSSOVER_RATE', default=0.05, - type=float_range, help='GP crossover rate in the range [0.0, 1.0]. We recommend using the default parameter unless you ' - 'understand how the crossover rate affects GP algorithms.') - - parser.add_argument('-cv', action='store', dest='NUM_CV_FOLDS', default=3, - type=int, help='The number of folds to evaluate each pipeline over in k-fold cross-validation during the ' - 'TPOT pipeline optimization process.') - - parser.add_argument('-scoring', action='store', dest='SCORING_FN', default=None, - type=str, help='Function used to evaluate the goodness of a given pipeline for the ' - 'classification problem. By default, balanced class accuracy is used. ' - 'TPOT assumes that this scoring function should be maximized, i.e., ' - 'higher is better. Offers the same options as cross_val_score: ' - '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", ' - '"f1_micro", "f1_samples", "f1_weighted", "precision", "precision_macro", ' - '"precision_micro", "precision_samples", "precision_weighted", "recall", ' - '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"') - - parser.add_argument('-s', action='store', dest='RANDOM_STATE', default=None, - type=int, help='Random number generator seed for reproducibility. 
Set this seed if you want your TPOT run to be reproducible ' - 'with the same seed and data set in the future.') - - parser.add_argument('-v', action='store', dest='VERBOSITY', default=1, choices=[0, 1, 2], - type=int, help='How much information TPOT communicates while it is running: 0 = none, 1 = minimal, 2 = all.') - - parser.add_argument('--no-update-check', action='store_true', dest='DISABLE_UPDATE_CHECK', default=False, - help='Flag indicating whether the TPOT version checker should be disabled.') - - parser.add_argument('--version', action='version', version='TPOT {version}'.format(version=__version__), - help='Show TPOT\'s version number and exit.') - - args = parser.parse_args() - - if args.VERBOSITY >= 2: - print('\nTPOT settings:') - for arg in sorted(args.__dict__): - arg_val = args.__dict__[arg] - if arg == 'DISABLE_UPDATE_CHECK': - continue - elif arg == 'SCORING_FN' and args.__dict__[arg] is None: - arg_val = 'balanced_accuracy' - print('{}\t=\t{}'.format(arg, arg_val)) - print('') - - input_data = np.recfromcsv(args.INPUT_FILE, delimiter=args.INPUT_SEPARATOR, dtype=np.float64) - features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), - input_data.dtype.names.index('class'), axis=1) - - training_features, testing_features, training_classes, testing_classes = \ - train_test_split(features, input_data['class'], random_state=args.RANDOM_STATE) - - tpot = TPOT(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, - mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, - num_cv_folds=args.NUM_CV_FOLDS, scoring_function=args.SCORING_FN, - random_state=args.RANDOM_STATE, verbosity=args.VERBOSITY, - disable_update_check=args.DISABLE_UPDATE_CHECK) - - tpot.fit(training_features, training_classes) - - if args.VERBOSITY >= 1: - print('\nTraining accuracy: {}'.format(tpot.score(training_features, training_classes))) - print('Holdout accuracy: {}'.format(tpot.score(testing_features, testing_classes))) - - if 
args.OUTPUT_FILE != '': - tpot.export(args.OUTPUT_FILE) - - -if __name__ == '__main__': - main() + return not op.regression diff --git a/tutorials/IRIS.ipynb b/tutorials/IRIS.ipynb index 79b39474..cd4f3a87 100644 --- a/tutorials/IRIS.ipynb +++ b/tutorials/IRIS.ipynb @@ -8,7 +8,7 @@ }, "outputs": [], "source": [ - "from tpot import TPOT\n", + "from tpot import TPOTClassifier\n", "from sklearn.datasets import load_iris\n", "from sklearn.cross_validation import train_test_split" ] @@ -96,12 +96,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.918414918415\n" + "0.964285714286\n" ] } ], "source": [ - "tpot = TPOT(generations=5)\n", + "tpot = TPOTClassifier(generations=5)\n", "tpot.fit(X_train, y_train)\n", "print(tpot.score(X_test, y_test))" ] @@ -117,6 +117,37 @@ "tpot.export('tpot_iris_pipeline.py')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# %load tpot_iris_pipeline.py\n", + "import numpy as np\n", + "\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import VotingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.pipeline import make_pipeline, make_union\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "\n", + "# NOTE: Make sure that the class is labeled 'class' in the data file\n", + "tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)\n", + "training_features, testing_features, training_classes, testing_classes = \\\n", + " train_test_split(features, tpot_data['class'], random_state=42)\n", + "\n", + "exported_pipeline = make_pipeline(\n", + " LogisticRegression(C=7.0, dual=False, penalty=\"l1\")\n", + ")\n", + "\n", + "exported_pipeline.fit(training_features, training_classes)\n", + "results = 
exported_pipeline.predict(testing_features)\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -143,7 +174,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/tutorials/MNIST.ipynb b/tutorials/MNIST.ipynb index 6919e1d4..853cdf95 100644 --- a/tutorials/MNIST.ipynb +++ b/tutorials/MNIST.ipynb @@ -8,7 +8,7 @@ }, "outputs": [], "source": [ - "from tpot import TPOT\n", + "from tpot import TPOTClassifier\n", "from sklearn.datasets import load_digits\n", "from sklearn.cross_validation import train_test_split" ] @@ -49,12 +49,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.968636234802\n" + "0.990869380532\n" ] } ], "source": [ - "tpot = TPOT(generations=5)\n", + "tpot = TPOTClassifier(generations=5)\n", "tpot.fit(X_train, y_train)\n", "print(tpot.score(X_test, y_test))" ] @@ -70,6 +70,38 @@ "tpot.export('tpot_mnist_pipeline.py')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# %load tpot_mnist_pipeline.py\n", + "import numpy as np\n", + "\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier\n", + "from sklearn.feature_selection import VarianceThreshold\n", + "from sklearn.pipeline import make_pipeline, make_union\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "\n", + "# NOTE: Make sure that the class is labeled 'class' in the data file\n", + "tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)\n", + "training_features, testing_features, training_classes, testing_classes = \\\n", + " train_test_split(features, tpot_data['class'], random_state=42)\n", + "\n", + "exported_pipeline = 
make_pipeline(\n", + " VarianceThreshold(threshold=0.24),\n", + " ExtraTreesClassifier(criterion=\"entropy\", max_features=0.16, n_estimators=500)\n", + ")\n", + "\n", + "exported_pipeline.fit(training_features, training_classes)\n", + "results = exported_pipeline.predict(testing_features)\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -96,7 +128,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/tutorials/Titanic_Kaggle.ipynb b/tutorials/Titanic_Kaggle.ipynb index 710d2781..e6a9a958 100644 --- a/tutorials/Titanic_Kaggle.ipynb +++ b/tutorials/Titanic_Kaggle.ipynb @@ -23,7 +23,7 @@ "outputs": [], "source": [ "# Import required libraries\n", - "from tpot import TPOT\n", + "from tpot import TPOTClassifier\n", "from sklearn.cross_validation import train_test_split\n", "import pandas as pd \n", "import numpy as np" @@ -797,7 +797,7 @@ } ], "source": [ - "tpot = TPOT(generations=5, verbosity=2)\n", + "tpot = TPOTClassifier(generations=5, verbosity=2)\n", "tpot.fit(titanic_new[training_indices], titanic_class[training_indices])" ] }, @@ -1208,18 +1208,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" } }, "nbformat": 4, diff --git a/tutorials/tpot_iris_pipeline.py b/tutorials/tpot_iris_pipeline.py index 40610adb..71b5fe2a 100644 --- a/tutorials/tpot_iris_pipeline.py +++ b/tutorials/tpot_iris_pipeline.py @@ -1,28 +1,20 @@ -import pandas as pd +import numpy as np from sklearn.cross_validation import train_test_split -from sklearn.svm import SVC -from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import VotingClassifier +from sklearn.linear_model import 
LogisticRegression +from sklearn.pipeline import make_pipeline, make_union +from sklearn.preprocessing import FunctionTransformer # NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR') -training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25) +tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +training_features, testing_features, training_classes, testing_classes = \ + train_test_split(features, tpot_data['class'], random_state=42) +exported_pipeline = make_pipeline( + LogisticRegression(C=7.0, dual=False, penalty="l1") +) -result1 = tpot_data.copy() - -# Perform classification with a C-support vector classifier -svc1 = SVC(C=0.1) -svc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values) -result1['svc1-classification'] = svc1.predict(result1.drop('class', axis=1).values) - -# Subset the data columns -subset_df1 = result1[sorted(result1.columns.values)[4042:5640]] -subset_df2 = result1[[column for column in ['class'] if column not in subset_df1.columns.values]] -result2 = subset_df1.join(subset_df2) - -# Perform classification with a k-nearest neighbor classifier -knnc3 = KNeighborsClassifier(n_neighbors=min(131, len(training_indices))) -knnc3.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values) -result3 = result2 -result3['knnc3-classification'] = knnc3.predict(result3.drop('class', axis=1).values) +exported_pipeline.fit(training_features, training_classes) +results = exported_pipeline.predict(testing_features) diff --git a/tutorials/tpot_mnist_pipeline.py b/tutorials/tpot_mnist_pipeline.py index a4ea0d65..d7af621b 100644 --- 
a/tutorials/tpot_mnist_pipeline.py +++ b/tutorials/tpot_mnist_pipeline.py @@ -1,56 +1,21 @@ -import pandas as pd +import numpy as np from sklearn.cross_validation import train_test_split -from itertools import combinations -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier +from sklearn.feature_selection import VarianceThreshold +from sklearn.pipeline import make_pipeline, make_union +from sklearn.preprocessing import FunctionTransformer # NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR') -training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25) - - -result1 = tpot_data.copy() - -# Perform classification with a logistic regression classifier -lrc1 = LogisticRegression(C=0.1) -lrc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values) -result1['lrc1-classification'] = lrc1.predict(result1.drop('class', axis=1).values) - -# Decision-tree based feature selection -training_features = result1.loc[training_indices].drop('class', axis=1) -training_class_vals = result1.loc[training_indices, 'class'].values - -pair_scores = dict() -for features in combinations(training_features.columns.values, 2): - dtc = DecisionTreeClassifier() - training_feature_vals = training_features[list(features)].values - dtc.fit(training_feature_vals, training_class_vals) - pair_scores[features] = (dtc.score(training_feature_vals, training_class_vals), list(features)) - -best_pairs = [] -for pair in sorted(pair_scores, key=pair_scores.get, reverse=True)[:3870]: - best_pairs.extend(list(pair)) -best_pairs = sorted(list(set(best_pairs))) - -result2 = 
result1[sorted(list(set(best_pairs + ['class'])))] - -# Perform classification with a random forest classifier -rfc3 = RandomForestClassifier(n_estimators=1, max_features=min(64, len(result2.columns) - 1)) -rfc3.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values) -result3 = result2 -result3['rfc3-classification'] = rfc3.predict(result3.drop('class', axis=1).values) - -# Perform classification with a decision tree classifier -dtc4 = DecisionTreeClassifier(max_features=min(40, len(result3.columns) - 1), max_depth=7) -dtc4.fit(result3.loc[training_indices].drop('class', axis=1).values, result3.loc[training_indices, 'class'].values) -result4 = result3 -result4['dtc4-classification'] = dtc4.predict(result4.drop('class', axis=1).values) - -# Perform classification with a k-nearest neighbor classifier -knnc5 = KNeighborsClassifier(n_neighbors=1) -knnc5.fit(result4.loc[training_indices].drop('class', axis=1).values, result4.loc[training_indices, 'class'].values) -result5 = result4 -result5['knnc5-classification'] = knnc5.predict(result5.drop('class', axis=1).values) +tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +training_features, testing_features, training_classes, testing_classes = \ + train_test_split(features, tpot_data['class'], random_state=42) + +exported_pipeline = make_pipeline( + VarianceThreshold(threshold=0.24), + ExtraTreesClassifier(criterion="entropy", max_features=0.16, n_estimators=500) +) + +exported_pipeline.fit(training_features, training_classes) +results = exported_pipeline.predict(testing_features)
The number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT pipeline optimization process.
scoring_function | "accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", "f1_micro", "f1_samples", "f1_weighted", "log_loss", "precision", "precision_macro", "precision_micro", "precision_samples", "precision_weighted", "r2", "recall", "recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc" | Function used to evaluate the goodness of a given pipeline for the classification problem. By default, balanced class accuracy is used. TPOT assumes that this scoring function should be maximized, i.e., higher is better. | scoring | 'accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' or a callable function with signature scorer(y_true, y_pred) | Function used to evaluate the quality of a given pipeline for the problem. By default, balanced accuracy is used for classification and mean squared error is used for regression. TPOT assumes that any function with "error" or "loss" in the name is meant to be minimized, whereas any other functions will be maximized. See the section on scoring functions for more details.
max_time_mins | Any positive integer | How many minutes TPOT has to optimize the pipeline. This setting will override the generations parameter.
random_state
verbosity | [0, 1, 2] | How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = all. A setting of 2 will add a progress bar to calls to fit(). | {0, 1, 2, 3} | How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar to calls to fit().
disable_update_check