diff --git a/.appveyor.yml b/.appveyor.yml index 17ed4cc7..d5bb94fa 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -19,9 +19,9 @@ install: - conda config --set always_yes yes --set changeps1 no - conda update -q conda - conda info -a - - conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy scikit-learn nose cython py-xgboost + - conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy scikit-learn nose cython py-xgboost pandas - activate test-environment - - pip install deap tqdm update_checker pypiwin32 + - pip install deap tqdm update_checker pypiwin32 stopit test_script: diff --git a/ci/.travis_install.sh b/ci/.travis_install.sh index a51fdc2e..034713e4 100755 --- a/ci/.travis_install.sh +++ b/ci/.travis_install.sh @@ -33,13 +33,14 @@ conda update --yes conda # provided versions if [[ "$LATEST" == "true" ]]; then conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ - numpy scipy scikit-learn cython py-xgboost + numpy scipy scikit-learn cython py-xgboost pandas else conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ scikit-learn=$SKLEARN_VERSION \ py-xgboost=$XGBOOST_VERSION \ - cython + cython \ + pandas fi source activate testenv @@ -52,6 +53,7 @@ fi pip install update_checker pip install tqdm +pip install stopit if [[ "$COVERAGE" == "true" ]]; then pip install coverage coveralls @@ -66,4 +68,6 @@ python -c "import deap; print('deap %s' % deap.__version__)" python -c "import xgboost; print('xgboost %s ' % xgboost.__version__)" python -c "import update_checker; print('update_checker %s' % update_checker.__version__)" python -c "import tqdm; print('tqdm %s' % tqdm.__version__)" +python -c "import pandas; print('pandas %s' % pandas.__version__)" +python -c "import stopit; print('stopit %s' % stopit.__version__)" python setup.py build_ext --inplace diff --git a/docs/api/index.html b/docs/api/index.html index 1c6e5f2c..b673a0ab 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -101,6 +101,11 @@ Support +
  • + + Related +
  • +   @@ -144,7 +149,9 @@

    Classification

    subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None,
-   warm_start=False, verbosity=0,
+   warm_start=False,
+   periodic_checkpoint_folder=None,
+   verbosity=0,
    disable_update_check=False)
    source
    @@ -266,6 +273,7 @@

    Classification

  • Python dictionary, TPOT will use your custom configuration,
  • string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or
  • string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or
  • +
  • string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or
  • None, TPOT will use the default TPOTClassifier configuration.
  • See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
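As an illustration of the new 'TPOT sparse' option described above, a minimal sketch of passing it to the config_dict parameter (the generations, population_size, and verbosity values here are arbitrary example settings; the same string also works with TPOTRegressor):

    from tpot import TPOTClassifier

    # Use the built-in sparse-matrix-friendly configuration by its string name.
    # 'TPOT sparse' adds a one-hot encoder and restricts TPOT to operators that
    # also support sparse matrices.
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                          config_dict='TPOT sparse')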
@@ -278,6 +286,25 @@
    Classification

    Setting warm_start=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.
+periodic_checkpoint_folder: path string, optional (default: None)
+
+If supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.
+
+Currently, TPOT saves the pipeline once per generation, but no more often than once per 30 seconds.
+
+Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, progress tracking, and grabbing a pipeline while TPOT is still optimizing.
+
+early_stop: integer, optional (default: None)
+
+How many generations TPOT checks for improvement in the optimization process.
+
+Ends the optimization process if there is no improvement within the given number of generations.
    + verbosity: integer, optional (default=0)
    How much information TPOT communicates while it's running.
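To make the two new parameters above concrete, here is a minimal sketch using TPOTClassifier (the same parameters are also documented for TPOTRegressor below). The folder name, generation count, and the early_stop value of 5 are arbitrary choices for this example:

    import os

    from tpot import TPOTClassifier
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        train_size=0.75, test_size=0.25)

    # Checkpoints are written into an existing folder, so create it up front.
    os.makedirs('my_checkpoints', exist_ok=True)

    # Save the best pipeline found so far roughly once per generation (but not
    # more often than once per 30 seconds), and end the run early if 5
    # consecutive generations pass without improvement.
    tpot = TPOTClassifier(generations=20, population_size=20, verbosity=2,
                          periodic_checkpoint_folder='my_checkpoints',
                          early_stop=5)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))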
@@ -555,7 +582,9 @@
    Regression

    subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None,
-   warm_start=False, verbosity=0,
+   warm_start=False,
+   periodic_checkpoint_folder=None,
+   verbosity=0,
    disable_update_check=False)
    source
    @@ -679,6 +708,7 @@

    Regression

  • Python dictionary, TPOT will use your custom configuration,
  • string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or
  • string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or
  • +
  • string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or
  • None, TPOT will use the default TPOTRegressor configuration.
  • See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. @@ -691,6 +721,25 @@

    Regression

    Setting warm_start=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.
    +periodic_checkpoint_folder: path string, optional (default: None)
+
+If supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.
+
+Currently, TPOT saves the pipeline once per generation, but no more often than once per 30 seconds.
+
+Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, progress tracking, and grabbing a pipeline while TPOT is still optimizing.
+
+early_stop: integer, optional (default: None)
+
+How many generations TPOT checks for improvement in the optimization process.
+
+Ends the optimization process if there is no improvement within the given number of generations.
    + verbosity: integer, optional (default=0)
    How much information TPOT communicates while it's running. diff --git a/docs/citing/index.html b/docs/citing/index.html index aa02c20e..bf25e50f 100644 --- a/docs/citing/index.html +++ b/docs/citing/index.html @@ -95,6 +95,11 @@ Support +
  • + + Related +
  • +   diff --git a/docs/contributing/index.html b/docs/contributing/index.html index 9913b513..6124b8dd 100644 --- a/docs/contributing/index.html +++ b/docs/contributing/index.html @@ -110,6 +110,11 @@ Support +
  • + + Related +
  • +   diff --git a/docs/examples/index.html b/docs/examples/index.html index 41ec1ff7..e32b6ade 100644 --- a/docs/examples/index.html +++ b/docs/examples/index.html @@ -107,6 +107,11 @@ Support +
  • + + Related +
  • +   diff --git a/docs/index.html b/docs/index.html index 1468cfee..047d7cdf 100644 --- a/docs/index.html +++ b/docs/index.html @@ -95,6 +95,11 @@ Support +
  • + + Related +
  • +   @@ -205,5 +210,5 @@ diff --git a/docs/installing/index.html b/docs/installing/index.html index a281828c..f9349d0b 100644 --- a/docs/installing/index.html +++ b/docs/installing/index.html @@ -95,6 +95,11 @@ Support +
  • + + Related +
  • +   @@ -150,14 +155,20 @@
  • tqdm

  • + stopit
  • + pandas
  • Most of the necessary Python packages can be installed via the Anaconda Python distribution, which we strongly recommend that you use. We also strongly recommend that you use Python 3 over Python 2 if you're given the choice.

    -

    NumPy, SciPy, and scikit-learn can be installed in Anaconda via the command:

    -
    conda install numpy scipy scikit-learn
    +

    NumPy, SciPy, scikit-learn and pandas can be installed in Anaconda via the command:

    +
    conda install numpy scipy scikit-learn pandas
     
    -

    DEAP, update_checker, and tqdm can be installed with pip via the command:

    -
    pip install deap update_checker tqdm
    +

    DEAP, update_checker, tqdm and stopit can be installed with pip via the command:

    +
    pip install deap update_checker tqdm stopit
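To quickly confirm that the newly added stopit and pandas requirements are importable, a small check along the lines of what the CI scripts above do:

    import pandas
    import stopit

    # Print the installed versions, mirroring the version checks added to the CI scripts.
    print('pandas %s' % pandas.__version__)
    print('stopit %s' % stopit.__version__)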
     

    For the Windows users, the pywin32 module is required if Python is NOT installed via the Anaconda Python distribution and can be installed with pip:

    diff --git a/docs/mkdocs/search_index.json b/docs/mkdocs/search_index.json index 5d3fbf27..a32e704b 100644 --- a/docs/mkdocs/search_index.json +++ b/docs/mkdocs/search_index.json @@ -7,12 +7,12 @@ }, { "location": "/installing/", - "text": "TPOT is built on top of several existing Python libraries, including:\n\n\n\n\n\n\nNumPy\n\n\n\n\n\n\nSciPy\n\n\n\n\n\n\nscikit-learn\n\n\n\n\n\n\nDEAP\n\n\n\n\n\n\nupdate_checker\n\n\n\n\n\n\ntqdm\n\n\n\n\n\n\nMost of the necessary Python packages can be installed via the \nAnaconda Python distribution\n, which we strongly recommend that you use. We also strongly recommend that you use of Python 3 over Python 2 if you're given the choice.\n\n\nNumPy, SciPy, and scikit-learn can be installed in Anaconda via the command:\n\n\nconda install numpy scipy scikit-learn\n\n\n\n\nDEAP, update_checker, and tqdm can be installed with \npip\n via the command:\n\n\npip install deap update_checker tqdm\n\n\n\n\nFor the Windows users\n, the pywin32 module is required if Python is NOT installed via the \nAnaconda Python distribution\n and can be installed with \npip\n:\n\n\npip install pywin32\n\n\n\n\nOptionally\n, you can install \nXGBoost\n if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed.\n\n\nconda install py-xgboost\n\n\n\n\nIf you have issues installing XGBoost, check the \nXGBoost installation documentation\n.\n\n\nIf you plan to use the \nTPOT-MDR configuration\n, make sure to install \nscikit-mdr\n and \nscikit-rebate\n:\n\n\npip install scikit-mdr skrebate\n\n\n\n\nFinally to install TPOT itself, run the following command:\n\n\npip install tpot\n\n\n\n\nPlease \nfile a new issue\n if you run into installation problems.", + "text": "TPOT is built on top of several existing Python libraries, including:\n\n\n\n\n\n\nNumPy\n\n\n\n\n\n\nSciPy\n\n\n\n\n\n\nscikit-learn\n\n\n\n\n\n\nDEAP\n\n\n\n\n\n\nupdate_checker\n\n\n\n\n\n\ntqdm\n\n\n\n\n\n\nstopit\n\n\n\n\n\n\npandas\n\n\n\n\n\n\nMost of the necessary Python packages can be installed via the \nAnaconda Python distribution\n, which we strongly recommend that you use. We also strongly recommend that you use of Python 3 over Python 2 if you're given the choice.\n\n\nNumPy, SciPy, scikit-learn and pandas can be installed in Anaconda via the command:\n\n\nconda install numpy scipy scikit-learn pandas\n\n\n\n\nDEAP, update_checker, tqdm and stopit can be installed with \npip\n via the command:\n\n\npip install deap update_checker tqdm stopit\n\n\n\n\nFor the Windows users\n, the pywin32 module is required if Python is NOT installed via the \nAnaconda Python distribution\n and can be installed with \npip\n:\n\n\npip install pywin32\n\n\n\n\nOptionally\n, you can install \nXGBoost\n if you would like TPOT to use the eXtreme Gradient Boosting models. 
XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed.\n\n\nconda install py-xgboost\n\n\n\n\nIf you have issues installing XGBoost, check the \nXGBoost installation documentation\n.\n\n\nIf you plan to use the \nTPOT-MDR configuration\n, make sure to install \nscikit-mdr\n and \nscikit-rebate\n:\n\n\npip install scikit-mdr skrebate\n\n\n\n\nFinally to install TPOT itself, run the following command:\n\n\npip install tpot\n\n\n\n\nPlease \nfile a new issue\n if you run into installation problems.", "title": "Installation" }, { "location": "/using/", - "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for very long, it may not find the best pipeline possible for your dataset.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. 
When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function takes in a training data set and uses k-fold cross-validation when evaluating pipelines. It then\ninitializes the genetic programming algoritm to find the best pipeline based on average k-fold score.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. 
To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer >1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. 
Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nFile path or string\n\n\nA path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process.\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass a function with the signature \nscorer(y_true, y_pred)\n, where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_accuracy)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. 
Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). 
The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. 
For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.", + "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for very long, it may not find the best pipeline possible for your dataset.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. 
To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function takes in a training data set and uses k-fold cross-validation when evaluating pipelines. It then\ninitializes the genetic programming algoritm to find the best pipeline based on average k-fold score.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. 
Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. 
Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save the best pipeline so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. 
Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass a function with the signature \nscorer(y_true, y_pred)\n, where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_accuracy)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\nmy_module.scorer_name\n: you can also use your manual \nscorer(y_true, y_pred)\n function through the command line, just add an argument \n-scoring my_module.scorer\n and TPOT will import your module and take the function from there. TPOT will also include current workdir when importing the module, so you can just put it in the same folder where you are going to run.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. 
The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. 
For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nTPOT supports parallel computing for speeding up the optimization process, but it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following codes into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.", "title": "Using TPOT" }, { @@ -27,17 +27,17 @@ }, { "location": "/using/#tpot-on-the-command-line", - "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. 
Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nSee the section on scoring functions for more details. -cv CV Any integer >1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive integer How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. 
-s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE File path or string A path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.", + "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. 
-scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one. \nSee the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive integer How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -cf CHECKPOINT_FOLDER Folder path \nIf supplied, a folder you created, in which tpot will periodically save the best pipeline so far while optimizing. \nThis is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working \nExample: \nmkdir my_checkpoints \n-cf ./my_checkpoints -es EARLY_STOP Any positive integer \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnd optimization process if there is no improvement in the set number of generations. 
-v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.", "title": "TPOT on the command line" }, { "location": "/using/#scoring-functions", - "text": "TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass a function with the signature scorer(y_true, y_pred) , where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_accuracy)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')", + "text": "TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass a function with the signature scorer(y_true, y_pred) , where y_true are the true target values and y_pred are the predicted target values from an estimator. To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_accuracy)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') my_module.scorer_name : you can also use your manual scorer(y_true, y_pred) function through the command line, just add an argument -scoring my_module.scorer and TPOT will import your module and take the function from there. TPOT will also include current workdir when importing the module, so you can just put it in the same folder where you are going to run.\nExample: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.", "title": "Scoring functions" }, { "location": "/using/#built-in-tpot-configurations", - "text": "TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. 
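As a rough sketch of how the new -cf and -es command-line flags map onto the Python API, the snippet below uses the periodic_checkpoint_folder and early_stop parameters documented in the API section later in this file; the folder name and parameter values are made up, and the fit() call is left commented out because no data is prepared here.

import os
from tpot import TPOTClassifier

# Create the checkpoint folder up front, mirroring the "mkdir my_checkpoints"
# step shown for the command line (the folder name is hypothetical).
checkpoint_dir = './my_checkpoints'
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Periodically save the best pipeline found so far into that folder, and end
# the run early if 5 consecutive generations bring no improvement.
tpot = TPOTClassifier(generations=100, population_size=20, verbosity=2,
                      periodic_checkpoint_folder=checkpoint_dir,
                      early_stop=5)
# tpot.fit(X_train, y_train)  # prepare X_train/y_train as in the other examples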
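The my_module.scorer_name mechanism from the scoring-functions entry above can be illustrated with a small, self-contained module; the file name my_module.py and the function name error_rate are hypothetical and only demonstrate the scorer(y_true, y_pred) contract, they are not part of TPOT.

# my_module.py -- kept in the directory from which the tpot command is run,
# since TPOT also searches the current workdir when importing the module
# named in -scoring.
import numpy as np

def error_rate(y_true, y_pred):
    # Custom scorer with the scorer(y_true, y_pred) signature TPOT expects.
    # Because the name contains "error", TPOT treats it as a metric to
    # minimize rather than maximize.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean(y_true != y_pred))

The corresponding command-line call would then pass -scoring my_module.error_rate, in the same way the example above passes -scoring sklearn.metrics.auc.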
Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. \nThis configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . \nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')", + "text": "TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. \nThis configuration works for both the TPOTClassifier and TPOTRegressor. 
Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . \nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. \nThis configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')", "title": "Built-in TPOT configurations" }, { @@ -45,19 +45,24 @@ "text": "Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n} in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
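Before the custom-configuration example that follows, here is a minimal sketch of the new 'TPOT sparse' built-in configuration; the toy data is made up, and it assumes, as the description above suggests, that this configuration lets fit() accept a scipy.sparse feature matrix.

import numpy as np
from scipy import sparse
from tpot import TPOTClassifier

# Toy sparse feature matrix and binary target, purely illustrative.
rng = np.random.RandomState(42)
X_sparse = sparse.random(100, 20, density=0.1, format='csr', random_state=rng)
y = rng.randint(0, 2, size=100)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict='TPOT sparse')
tpot.fit(X_sparse, y)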
For example: from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.", "title": "Customizing TPOT's operators and parameters" }, + { + "location": "/using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux", + "text": "TPOT supports parallel computing for speeding up the optimization process, but it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. 
You can enable the forkserver mode globally for your program by putting the following codes into your main script: import multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .", + "title": "Crash/freeze issue with n_jobs > 1 under OSX or Linux" + }, { "location": "/api/", - "text": "Classification\n\n\nclass\n tpot.\nTPOTClassifier\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='accuracy', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False, \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised classification tasks.\n\n\nThe TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=100)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. 
This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature \nscorer(y_true, y_pred)\n. See the section on \nscoring functions\n for more details.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: integer, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n 
train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and classes. Performs internal stratified k-fold cross-validaton to avoid overfitting on the provided data.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights force TPOT to put more emphasis on those points.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. 
TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False, \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=100)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. 
This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature \nscorer(y_true, y_pred)\n. See the section on \nscoring functions\n for more details.\n\n\nTPOT assumes that any custom scoring function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: integer, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \n_pareto_front_fitted_pipelines\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n 
train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. Performs internal k-fold cross-validaton to avoid overfitting on the provided data.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. 
Higher weights force TPOT to put more emphasis on those points.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", + "text": "Classification\n\n\nclass\n tpot.\nTPOTClassifier\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='accuracy', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised classification tasks.\n\n\nThe TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. 
Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=100)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature \nscorer(y_true, y_pred)\n. See the section on \nscoring functions\n for more details.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. 
Beware that using multiple processes on the same machine may cause memory issues for large datasets.\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: integer, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save an optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT will wait without any improvement in the optimization process.\n\n\nIf no improvement is seen within that many generations, TPOT ends the optimization process.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has 
been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and classes. Performs internal stratified k-fold cross-validaton to avoid overfitting on the provided data.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. 
Higher weights force TPOT to put more emphasis on those points.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully 
customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=100)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature \nscorer(y_true, y_pred)\n. See the section on \nscoring functions\n for more details.\n\n\nTPOT assumes that any custom scoring function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. 
This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: integer, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more 
information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \n_pareto_front_fitted_pipelines\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. 
Performs internal k-fold cross-validaton to avoid overfitting on the provided data.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights force TPOT to put more emphasis on those points.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", "title": "TPOT API" }, { "location": "/api/#classification", - "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False, verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. 
By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=100) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature scorer(y_true, y_pred) . See the section on scoring functions for more details. \nTPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. 
n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : integer, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. 
evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and classes. Performs internal stratified k-fold cross-validaton to avoid overfitting on the provided data. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights force TPOT to put more emphasis on those points. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. 
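For illustration only (not part of the generated documentation text), a minimal sketch of the predict_proba caveat noted above: it runs a deliberately short optimization on the digits data and checks the final step of the fitted pipeline before calling predict_proba.

    # Hedged sketch: predict_proba only works when the final step of the
    # discovered pipeline exposes predict_proba; otherwise TPOT raises an error.
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    from tpot import TPOTClassifier

    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, train_size=0.75, test_size=0.25)

    tpot = TPOTClassifier(generations=2, population_size=10, verbosity=2)
    tpot.fit(X_train, y_train)

    final_estimator = tpot.fitted_pipeline_.steps[-1][1]
    if hasattr(final_estimator, 'predict_proba'):
        print(tpot.predict_proba(X_test)[:5])  # array of shape {n_samples, n_classes}
    else:
        print('The final classifier does not support predict_proba.')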
Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n periodic_checkpoint_folder =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=100) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. 
mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature scorer(y_true, y_pred) . See the section on scoring functions for more details. \nTPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : integer, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. 
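As a hedged illustration of the scoring parameter described above (the config_dict options continue below), any callable with the signature scorer(y_true, y_pred) can be supplied; the macro-F1 wrapper here is only an example.

    # Sketch: a custom scoring callable with the scorer(y_true, y_pred) signature.
    # Because the name contains neither "error" nor "loss", TPOT maximizes it.
    from sklearn.metrics import f1_score
    from tpot import TPOTClassifier

    def macro_f1(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    tpot = TPOTClassifier(generations=5, population_size=20,
                          scoring=macro_f1, verbosity=2)
    # tpot.fit(X_train, y_train) would then optimize macro-averaged F1.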
\nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
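Before the Example that follows, a hedged sketch of how the config_dict, periodic_checkpoint_folder, and early_stop parameters described above fit together; the two operators and the folder name are illustrative only, and the dictionary format follows the custom configuration section.

    # Illustrative only: a small custom search space plus periodic checkpointing
    # and early stopping. Operator choices and the folder path are placeholders.
    from tpot import TPOTClassifier

    tiny_config = {
        'sklearn.naive_bayes.GaussianNB': {},
        'sklearn.tree.DecisionTreeClassifier': {
            'max_depth': range(1, 11),
            'min_samples_split': range(2, 21),
        },
    }

    tpot = TPOTClassifier(
        generations=20,
        population_size=20,
        config_dict=tiny_config,                        # search only these operators
        periodic_checkpoint_folder='tpot_checkpoints',  # best-so-far pipeline saved here
        early_stop=5,                                   # stop after 5 generations without improvement
        verbosity=2,
    )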
Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and classes. Performs internal stratified k-fold cross-validaton to avoid overfitting on the provided data. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights force TPOT to put more emphasis on those points. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. 
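A hedged sketch of the sample_weight and groups arguments of fit() listed above, pairing groups with a GroupKFold object passed as cv; the synthetic data, group IDs, and weights below are placeholders.

    # Sketch: group-aware cross-validation plus per-sample weights in fit().
    import numpy as np
    from sklearn.model_selection import GroupKFold
    from tpot import TPOTClassifier

    rng = np.random.RandomState(42)
    X = rng.normal(size=(200, 10))
    y = rng.randint(0, 2, size=200)
    groups = rng.randint(0, 5, size=200)      # e.g. subject or site identifiers
    weights = np.where(y == 1, 2.0, 1.0)      # emphasize the positive class

    tpot = TPOTClassifier(generations=5, population_size=20,
                          cv=GroupKFold(n_splits=5), verbosity=2)
    tpot.fit(X, y, sample_weight=weights, groups=groups)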
Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Classification" }, { "location": "/api/#regression", - "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False, verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=100) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. 
The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature scorer(y_true, y_pred) . See the section on scoring functions for more details. \nTPOT assumes that any custom scoring function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : integer, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. 
verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. Performs internal k-fold cross-validaton to avoid overfitting on the provided data. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. 
Higher weights force TPOT to put more emphasis on those points. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n periodic_checkpoint_folder =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=100) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. 
This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scoring function, you can pass a callable function to this parameter with the signature scorer(y_true, y_pred) . See the section on scoring functions for more details. \nTPOT assumes that any custom scoring function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : integer, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. 
\nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
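Before the Example that follows, a minimal sketch of inspecting the attributes described above after a short run; note that pareto_front_fitted_pipelines_ is only populated when verbosity=3.

    # Sketch: inspecting fitted_pipeline_, evaluated_individuals_ and the Pareto
    # front after fitting. Run length is kept deliberately small.
    from sklearn.datasets import load_boston
    from tpot import TPOTRegressor

    X, y = load_boston(return_X_y=True)
    tpot = TPOTRegressor(generations=2, population_size=20, verbosity=3)
    tpot.fit(X, y)

    print(tpot.fitted_pipeline_)              # best pipeline, fitted on all of X, y
    print(len(tpot.evaluated_individuals_))   # number of pipelines evaluated
    for pipeline_str in tpot.pareto_front_fitted_pipelines_:
        print(pipeline_str)                   # string form of each Pareto-front pipeline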
Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. Performs internal k-fold cross-validaton to avoid overfitting on the provided data. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights force TPOT to put more emphasis on those points. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. 
Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Regression" }, { @@ -117,9 +122,14 @@ }, { "location": "/releases/", - "text": "Version 0.8\n\n\n\n\n\n\nTPOT now detects whether there are missing values in your dataset\n and replaces them with the median value of the column.\n\n\n\n\n\n\nTPOT now allows you to set a \ngroup\n parameter in the \nfit\n function so you can use the \nGroupKFold\n cross-validation strategy.\n\n\n\n\n\n\nTPOT now allows you to set a subsample ratio of the training instance with the \nsubsample\n parameter. For example, setting \nsubsample\n=0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation.\n\n\n\n\n\n\nTPOT now has more \nbuilt-in configurations\n, including TPOT MDR and TPOT light, for both classification and regression problems.\n\n\n\n\n\n\nTPOTClassifier\n and \nTPOTRegressor\n now expose three useful internal attributes, \nfitted_pipeline_\n, \npareto_front_fitted_pipelines_\n, and \nevaluated_individuals_\n. These attributes are described in the \nAPI documentation\n.\n\n\n\n\n\n\nOh, \nTPOT now has \nthorough API documentation\n. Check it out!\n\n\n\n\n\n\nFixed a reproducibility issue where setting \nrandom_seed\n didn't necessarily result in the same results every time. This bug was present since TPOT v0.7.\n\n\n\n\n\n\nRefined input checking in TPOT.\n\n\n\n\n\n\nRemoved Python 2 uncompliant code.\n\n\n\n\n\n\nVersion 0.7\n\n\n\n\n\n\nTPOT now has multiprocessing support.\n TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the \nn_jobs\n parameter.\n\n\n\n\n\n\nTPOT now allows you to \ncustomize the operators and parameters considered during the optimization process\n, which can be accomplished with the new \nconfig_dict\n parameter. The format of this customized dictionary can be found in the \nonline documentation\n, along with a list of \nbuilt-in configurations\n.\n\n\n\n\n\n\nTPOT now allows you to \nspecify a time limit for evaluating a single pipeline\n (default limit is 5 minutes) in optimization process with the \nmax_eval_time_mins\n parameter, so TPOT won't spend hours evaluating overly-complex pipelines.\n\n\n\n\n\n\nWe tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the \nmu+lambda algorithm\n. This algorithm gives you more control of how many pipelines are generated every iteration with the \noffspring_size\n parameter.\n\n\n\n\n\n\nRefined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6.\n\n\n\n\n\n\nTPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., \ntpot.fit(x_train, y_train, sample_weights=sample_weights)\n.\n\n\n\n\n\n\nThe default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. 
Balanced accuracy can still be used by setting \nscoring='balanced_accuracy'\n when creating a TPOT instance.\n\n\n\n\n\n\nVersion 0.6\n\n\n\n\n\n\nTPOT now supports regression problems!\n We have created two separate \nTPOTClassifier\n and \nTPOTRegressor\n classes to support classification and regression problems, respectively. The \ncommand-line interface\n also supports this feature through the \n-mode\n parameter.\n\n\n\n\n\n\nTPOT now allows you to \nspecify a time limit\n for the optimization process with the \nmax_time_mins\n parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you.\n\n\n\n\n\n\nAdded a new operator that performs feature selection using \nExtraTrees\n feature importance scores.\n\n\n\n\n\n\nXGBoost\n has been added as an optional dependency to TPOT.\n If you have XGBoost installed, TPOT will automatically detect your installation and use the \nXGBoostClassifier\n and \nXGBoostRegressor\n in its pipelines.\n\n\n\n\n\n\nTPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.\n\n\n\n\n\n\nVersion 0.5\n\n\n\n\nMajor refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code!\n\n\nTPOT now \nexports directly to scikit-learn Pipelines\n instead of hacky code.\n\n\nInternal representation of individuals now uses scikit-learn pipelines.\n\n\nParameters for each operator have been optimized so TPOT spends less time exploring useless parameters.\n\n\nWe have removed pandas as a dependency and instead use numpy matrices to store the data.\n\n\nTPOT now uses \nk-fold cross-validation\n when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance.\n\n\nImproved \nscoring function support\n: Even though TPOT uses balanced accuracy by default, you can now have TPOT use \nany of the scoring functions\n that \ncross_val_score\n supports.\n\n\nAdded the scikit-learn \nNormalizer\n preprocessor.\n\n\nMinor text fixes.\n\n\n\n\nVersion 0.4\n\n\nIn TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. 
We've summarized the changes below.\n\n\n\n\nAdded new sklearn models and preprocessors\n\n\n\n\nAdaBoostClassifier\n\n\nBernoulliNB\n\n\nExtraTreesClassifier\n\n\nGaussianNB\n\n\nMultinomialNB\n\n\nLinearSVC\n\n\nPassiveAggressiveClassifier\n\n\nGradientBoostingClassifier\n\n\nRBFSampler\n\n\nFastICA\n\n\nFeatureAgglomeration\n\n\nNystroem\n\n\n\n\nAdded operator that inserts virtual features for the count of features with values of zero\n\n\nReworked parameterization of TPOT operators\n\n\n\nReduced parameter search space with information from a scikit-learn benchmark\n\n\nTPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead\n\n\n\n\nRemoved XGBoost as a dependency\n\n\n\nToo many users were having install issues with XGBoost\n\n\nReplaced with scikit-learn's GradientBoostingClassifier\n\n\n\n\nImproved descriptiveness of TPOT command line parameter documentation\n\n\nRemoved min/max/avg details during fit() when verbosity > 1\n\n\n\n\nReplaced with tqdm progress bar\n\n\nAdded tqdm as a dependency\n\n\n\n\nAdded \nfit_predict()\n convenience function\n\n\nAdded \nget_params()\n function so TPOT can operate in scikit-learn's \ncross_val_score\n & related functions\n\n\n\n\n\nVersion 0.3\n\n\n\n\nWe revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.\n\n\n\n\nVersion 0.2\n\n\n\n\n\n\nTPOT now has the ability to export the optimized pipelines to sklearn code.\n\n\n\n\n\n\nLogistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers.\n\n\n\n\n\n\nTPOT can now use arbitrary scoring functions for the optimization process.\n\n\n\n\n\n\nTPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.\n\n\n\n\n\n\nVersion 0.1\n\n\n\n\n\n\nFirst public release of TPOT.\n\n\n\n\n\n\nOptimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.", + "text": "Version 0.9\n\n\n\n\n\n\nTPOT now supports sparse matrices\n with a new built-in TPOT configurations, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features.\n\n\n\n\n\n\nWe have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the \nearly_stop\n parameter to access this functionality.\n\n\n\n\n\n\nTPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process.\n\n\n\n\n\n\nTPOT now supports custom scoring functions via the command-line mode.\n\n\n\n\n\n\nWe have added a new optional argument, \nperiodic_checkpoint_folder\n, that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process.\n\n\n\n\n\n\nTPOT no longer uses \nsklearn.externals.joblib\n when \nn_jobs=1\n to avoid the potential freezing issue \nthat scikit-learn suffers from\n.\n\n\n\n\n\n\nWe have added \npandas\n as a dependency to read input datasets instead of \nnumpy.recfromcsv\n. 
NumPy's \nrecfromcsv\n function is unable to parse datasets with complex data types.\n\n\n\n\n\n\nFixed a bug that \nDEFAULT\n in the parameter(s) of nested estimator raises \nKeyError\n when exporting pipelines.\n\n\n\n\n\n\nFixed a bug related to setting \nrandom_state\n in nested estimators. The issue would happen with pipeline with \nSelectFromModel\n (\nExtraTreesClassifier\n as nested estimator) or \nStackingEstimator\n if nested estimator has \nrandom_state\n parameter.\n\n\n\n\n\n\nFixed a bug in the missing value imputation function in TPOT to impute along columns instead rows.\n\n\n\n\n\n\nRefined input checking for sparse matrices in TPOT.\n\n\n\n\n\n\nVersion 0.8\n\n\n\n\n\n\nTPOT now detects whether there are missing values in your dataset\n and replaces them with the median value of the column.\n\n\n\n\n\n\nTPOT now allows you to set a \ngroup\n parameter in the \nfit\n function so you can use the \nGroupKFold\n cross-validation strategy.\n\n\n\n\n\n\nTPOT now allows you to set a subsample ratio of the training instance with the \nsubsample\n parameter. For example, setting \nsubsample\n=0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation.\n\n\n\n\n\n\nTPOT now has more \nbuilt-in configurations\n, including TPOT MDR and TPOT light, for both classification and regression problems.\n\n\n\n\n\n\nTPOTClassifier\n and \nTPOTRegressor\n now expose three useful internal attributes, \nfitted_pipeline_\n, \npareto_front_fitted_pipelines_\n, and \nevaluated_individuals_\n. These attributes are described in the \nAPI documentation\n.\n\n\n\n\n\n\nOh, \nTPOT now has \nthorough API documentation\n. Check it out!\n\n\n\n\n\n\nFixed a reproducibility issue where setting \nrandom_seed\n didn't necessarily result in the same results every time. This bug was present since TPOT v0.7.\n\n\n\n\n\n\nRefined input checking in TPOT.\n\n\n\n\n\n\nRemoved Python 2 uncompliant code.\n\n\n\n\n\n\nVersion 0.7\n\n\n\n\n\n\nTPOT now has multiprocessing support.\n TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the \nn_jobs\n parameter.\n\n\n\n\n\n\nTPOT now allows you to \ncustomize the operators and parameters considered during the optimization process\n, which can be accomplished with the new \nconfig_dict\n parameter. The format of this customized dictionary can be found in the \nonline documentation\n, along with a list of \nbuilt-in configurations\n.\n\n\n\n\n\n\nTPOT now allows you to \nspecify a time limit for evaluating a single pipeline\n (default limit is 5 minutes) in optimization process with the \nmax_eval_time_mins\n parameter, so TPOT won't spend hours evaluating overly-complex pipelines.\n\n\n\n\n\n\nWe tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the \nmu+lambda algorithm\n. This algorithm gives you more control of how many pipelines are generated every iteration with the \noffspring_size\n parameter.\n\n\n\n\n\n\nRefined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6.\n\n\n\n\n\n\nTPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. 
The sample weights option works the same as in scikit-learn, e.g., \ntpot.fit(x_train, y_train, sample_weights=sample_weights)\n.\n\n\n\n\n\n\nThe default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting \nscoring='balanced_accuracy'\n when creating a TPOT instance.\n\n\n\n\n\n\nVersion 0.6\n\n\n\n\n\n\nTPOT now supports regression problems!\n We have created two separate \nTPOTClassifier\n and \nTPOTRegressor\n classes to support classification and regression problems, respectively. The \ncommand-line interface\n also supports this feature through the \n-mode\n parameter.\n\n\n\n\n\n\nTPOT now allows you to \nspecify a time limit\n for the optimization process with the \nmax_time_mins\n parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you.\n\n\n\n\n\n\nAdded a new operator that performs feature selection using \nExtraTrees\n feature importance scores.\n\n\n\n\n\n\nXGBoost\n has been added as an optional dependency to TPOT.\n If you have XGBoost installed, TPOT will automatically detect your installation and use the \nXGBoostClassifier\n and \nXGBoostRegressor\n in its pipelines.\n\n\n\n\n\n\nTPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.\n\n\n\n\n\n\nVersion 0.5\n\n\n\n\nMajor refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code!\n\n\nTPOT now \nexports directly to scikit-learn Pipelines\n instead of hacky code.\n\n\nInternal representation of individuals now uses scikit-learn pipelines.\n\n\nParameters for each operator have been optimized so TPOT spends less time exploring useless parameters.\n\n\nWe have removed pandas as a dependency and instead use numpy matrices to store the data.\n\n\nTPOT now uses \nk-fold cross-validation\n when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance.\n\n\nImproved \nscoring function support\n: Even though TPOT uses balanced accuracy by default, you can now have TPOT use \nany of the scoring functions\n that \ncross_val_score\n supports.\n\n\nAdded the scikit-learn \nNormalizer\n preprocessor.\n\n\nMinor text fixes.\n\n\n\n\nVersion 0.4\n\n\nIn TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. 
We've summarized the changes below.\n\n\n\n\nAdded new sklearn models and preprocessors\n\n\n\n\nAdaBoostClassifier\n\n\nBernoulliNB\n\n\nExtraTreesClassifier\n\n\nGaussianNB\n\n\nMultinomialNB\n\n\nLinearSVC\n\n\nPassiveAggressiveClassifier\n\n\nGradientBoostingClassifier\n\n\nRBFSampler\n\n\nFastICA\n\n\nFeatureAgglomeration\n\n\nNystroem\n\n\n\n\nAdded operator that inserts virtual features for the count of features with values of zero\n\n\nReworked parameterization of TPOT operators\n\n\n\nReduced parameter search space with information from a scikit-learn benchmark\n\n\nTPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead\n\n\n\n\nRemoved XGBoost as a dependency\n\n\n\nToo many users were having install issues with XGBoost\n\n\nReplaced with scikit-learn's GradientBoostingClassifier\n\n\n\n\nImproved descriptiveness of TPOT command line parameter documentation\n\n\nRemoved min/max/avg details during fit() when verbosity > 1\n\n\n\n\nReplaced with tqdm progress bar\n\n\nAdded tqdm as a dependency\n\n\n\n\nAdded \nfit_predict()\n convenience function\n\n\nAdded \nget_params()\n function so TPOT can operate in scikit-learn's \ncross_val_score\n & related functions\n\n\n\n\n\nVersion 0.3\n\n\n\n\nWe revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.\n\n\n\n\nVersion 0.2\n\n\n\n\n\n\nTPOT now has the ability to export the optimized pipelines to sklearn code.\n\n\n\n\n\n\nLogistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers.\n\n\n\n\n\n\nTPOT can now use arbitrary scoring functions for the optimization process.\n\n\n\n\n\n\nTPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.\n\n\n\n\n\n\nVersion 0.1\n\n\n\n\n\n\nFirst public release of TPOT.\n\n\n\n\n\n\nOptimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.", "title": "Release Notes" }, + { + "location": "/releases/#version-09", + "text": "TPOT now supports sparse matrices with a new built-in TPOT configurations, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. 
The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT.", + "title": "Version 0.9" + }, { "location": "/releases/#version-08", "text": "TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.", @@ -169,6 +179,11 @@ "location": "/support/", "text": "TPOT was developed in the \nComputational Genetics Lab\n with funding from the \nNIH\n under grant R01 AI117694. We're incredibly grateful for their support during the development of this project.\n\n\nThe TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.", "title": "Support" + }, + { + "location": "/related/", + "text": "Other Automated Machine Learning (AutoML) tools and related projects:\n\n\n\n\n\n\nName\n\n\nLanguage\n\n\nLicense\n\n\nDescription\n\n\n\n\n\n\nAuto-WEKA\n\n\nJava\n\n\nGPL-v3\n\n\nAutomated hyper-parameter tuning for WEKA models.\n\n\n\n\n\n\nauto-sklearn\n\n\nPython\n\n\nBSD-3-Clause\n\n\nAn automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.\n\n\n\n\n\n\nauto_ml\n\n\nPython\n\n\nMIT\n\n\nAutomated machine learning for analytics & production. Supports manual feature type declarations.\n\n\n\n\n\n\ndevol\n\n\nPython\n\n\nMIT\n\n\nAutomated deep neural network design via genetic programming.\n\n\n\n\n\n\nMLBox\n\n\nPython\n\n\nBSD-3-Clause\n\n\nAccurate hyper-parameter optimization in high-dimensional space with support for distributed computing.\n\n\n\n\n\n\nRecipe\n\n\nC\n\n\nGPL-v3\n\n\nMachine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure.\n\n\n\n\n\n\nXcessiv\n\n\nPython\n\n\nApache-2.0\n\n\nA web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.", + "title": "Related" } ] } \ No newline at end of file diff --git a/docs/related/index.html b/docs/related/index.html new file mode 100644 index 00000000..258d62d1 --- /dev/null +++ b/docs/related/index.html @@ -0,0 +1,235 @@ + + + + + + + + + + + Related - TPOT + + + + + + + + + + + + + + + + +
    + + + + +
    + + + + + +
    +
    +
    + +
    +
    +
    +
    + +

    Other Automated Machine Learning (AutoML) tools and related projects:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameLanguageLicenseDescription
    Auto-WEKAJavaGPL-v3Automated hyper-parameter tuning for WEKA models.
    auto-sklearnPythonBSD-3-ClauseAn automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.
    auto_mlPythonMITAutomated machine learning for analytics & production. Supports manual feature type declarations.
    devolPythonMITAutomated deep neural network design via genetic programming.
    MLBoxPythonBSD-3-ClauseAccurate hyper-parameter optimization in high-dimensional space with support for distributed computing.
    RecipeCGPL-v3Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure.
    XcessivPythonApache-2.0A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
    + +
    +
    + + +
    +
    + +
    + +
    + +
    + + + GitHub + + + « Previous + + + +
    + + + + diff --git a/docs/releases/index.html b/docs/releases/index.html index abaa6ec9..f56646f9 100644 --- a/docs/releases/index.html +++ b/docs/releases/index.html @@ -82,6 +82,9 @@ Release Notes   @@ -154,7 +162,43 @@
    -

    Version 0.8

    +

    Version 0.9

    +
      +
    • +

  TPOT now supports sparse matrices with a new built-in TPOT configuration, "TPOT sparse". We are using a custom OneHotEncoder implementation that supports missing values and continuous features.

      +
    • +
    • +

      We have added an "early stopping" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality.

      +
    • +
    • +

      TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process.

      +
    • +
    • +

      TPOT now supports custom scoring functions via the command-line mode.

      +
    • +
    • +

  We have added a new optional argument, periodic_checkpoint_folder, that allows TPOT to periodically save the best pipeline so far to a local folder during the optimization process.

      +
    • +
    • +

      TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from.

      +
    • +
    • +

      We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv. NumPy's recfromcsv function is unable to parse datasets with complex data types.

      +
    • +
    • +

  Fixed a bug where DEFAULT in the parameter(s) of a nested estimator raised a KeyError when exporting pipelines.

      +
    • +
    • +

  Fixed a bug related to setting random_state in nested estimators. The issue occurred in pipelines containing SelectFromModel (with ExtraTreesClassifier as the nested estimator) or StackingEstimator when the nested estimator has a random_state parameter.

      +
    • +
    • +

  Fixed a bug in the missing value imputation function in TPOT to impute along columns instead of rows.

      +
    • +
    • +

      Refined input checking for sparse matrices in TPOT.

      +
    • +
    +

    Version 0.8

    • TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column.

      diff --git a/docs/search.html b/docs/search.html index caea363e..23277c3f 100644 --- a/docs/search.html +++ b/docs/search.html @@ -88,6 +88,11 @@ Support
    • +
    • + + Related +
    • +
      diff --git a/docs/sitemap.xml b/docs/sitemap.xml index eb3bd656..6ee7a000 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ http://rhiever.github.io/tpot/ - 2017-06-17 + 2017-09-27 daily @@ -12,7 +12,7 @@ http://rhiever.github.io/tpot/installing/ - 2017-06-17 + 2017-09-27 daily @@ -20,7 +20,7 @@ http://rhiever.github.io/tpot/using/ - 2017-06-17 + 2017-09-27 daily @@ -28,7 +28,7 @@ http://rhiever.github.io/tpot/api/ - 2017-06-17 + 2017-09-27 daily @@ -36,7 +36,7 @@ http://rhiever.github.io/tpot/examples/ - 2017-06-17 + 2017-09-27 daily @@ -44,7 +44,7 @@ http://rhiever.github.io/tpot/contributing/ - 2017-06-17 + 2017-09-27 daily @@ -52,7 +52,7 @@ http://rhiever.github.io/tpot/releases/ - 2017-06-17 + 2017-09-27 daily @@ -60,7 +60,7 @@ http://rhiever.github.io/tpot/citing/ - 2017-06-17 + 2017-09-27 daily @@ -68,7 +68,15 @@ http://rhiever.github.io/tpot/support/ - 2017-06-17 + 2017-09-27 + daily + + + + + + http://rhiever.github.io/tpot/related/ + 2017-09-27 daily diff --git a/docs/support/index.html b/docs/support/index.html index 297af771..fc049784 100644 --- a/docs/support/index.html +++ b/docs/support/index.html @@ -95,6 +95,11 @@ +
  • + + Related +
  • +
      @@ -139,6 +144,8 @@ diff --git a/docs/using/index.html b/docs/using/index.html index 3291eeca..6c33f8b2 100644 --- a/docs/using/index.html +++ b/docs/using/index.html @@ -80,6 +80,9 @@
  • Customizing TPOT's operators and parameters
  • +
  • Crash/freeze issue with n_jobs > 1 under OSX or Linux
  • + + @@ -113,6 +116,11 @@ Support +
  • + + Related +
  • +   @@ -325,17 +333,19 @@

    TPOT on the command line

    'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted',
    'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', -'recall_weighted', 'roc_auc' +'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.

    TPOT assumes that any function with "error" or "loss" in the name is meant to be minimized, whereas any other functions will be maximized.

+my_module.scorer_name: You can also specify your own scoring function, or the full Python path to an existing one. +

    See the section on scoring functions for more details. -cv CV -Any integer >1 +Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub @@ -378,10 +388,46 @@

    TPOT on the command line

    -config CONFIG_FILE -File path or string -A path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process. +String or file path +Operators and parameter configurations in TPOT:

    -See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. +
      +
• Path to a configuration file: TPOT will use the configuration file at that path to customize the operators and parameters it considers during the optimization process
    • +
    • string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors
    • +
    • string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies
    • +
    • string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.
    • +
    +See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. + + + +-cf +CHECKPOINT_FOLDER +Folder path + +If supplied, a folder you created, in which tpot will periodically save the best pipeline so far while optimizing. +

    +This is useful in multiple cases: +
      +
• sudden death before TPOT could save an optimized pipeline
    • +
    • progress tracking
    • +
• grabbing a pipeline while TPOT is still optimizing
    • +
    +

    +Example: +
    +mkdir my_checkpoints +
    +-cf ./my_checkpoints + + +-es +EARLY_STOP +Any positive integer + +How many generations TPOT checks whether there is no improvement in optimization process. +

+End the optimization process if there is no improvement within the set number of generations. -v

    Scoring functions

    tpot.export('tpot_mnist_pipeline.py')
    +
      +
• my_module.scorer_name: you can also use your own scorer(y_true, y_pred) function through the command line; just add the argument -scoring my_module.scorer and TPOT will import your module and take the function from there. TPOT also adds the current working directory when importing the module, so you can simply put the module in the folder from which you run TPOT (a minimal, hypothetical module sketch follows this list). +Example: -scoring sklearn.metrics.auc will use the function auc from the sklearn.metrics module.
    • +
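For illustration only, here is a minimal sketch of such a module; the file name my_module.py and the function name my_custom_accuracy are hypothetical, and the only requirement taken from the text above is the scorer(y_true, y_pred) signature. Since the name contains neither "error" nor "loss", TPOT will maximize this score.

# my_module.py -- hypothetical custom scorer, referenced as "-scoring my_module.my_custom_accuracy"
import numpy as np

def my_custom_accuracy(y_true, y_pred):
    # fraction of exactly matching predictions; higher is better
    return float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))

Assuming the module sits in your working directory and your dataset is a CSV file with a class column, a call along the lines of python -m tpot.driver your_data.csv -is , -target class -scoring my_module.my_custom_accuracy would then use this scorer (the dataset name here is only a placeholder).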

    Built-in TPOT configurations

    TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.

    @@ -473,6 +523,17 @@

    Built-in TPOT configurations



    Regression + + + + + + +
    TPOT sparseTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. +

    +This configuration works for both the TPOTClassifier and TPOTRegressor.
    Classification +

    +Regression

    To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the "TPOT light" configuration:

    @@ -550,6 +611,20 @@

    Customizing TPOT's operators

    When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config. Otherwise, TPOT will not be able to locate the configuration dictionary.

    For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code.

    Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.

    +

    Crash/freeze issue with n_jobs > 1 under OSX or Linux

    +

    TPOT supports parallel computing for speeding up the optimization process, but it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does, especially with large datasets.

    +

One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script:

    +
    import multiprocessing
    +
    +# other imports, custom code, load data, define model...
    +
    +if __name__ == '__main__':
    +    multiprocessing.set_start_method('forkserver')
    +
    +    # call scikit-learn utils or tpot utils with n_jobs > 1 here
    +
    + +
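For instance, combining this snippet with the MNIST example used earlier on this page gives the following sketch; the dataset and the parameter values are only illustrative and not part of the original example.

import multiprocessing

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

if __name__ == '__main__':
    # use forkserver instead of the default fork to avoid the freeze described above
    multiprocessing.set_start_method('forkserver')

    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        train_size=0.75, test_size=0.25)

    # n_jobs=-1 evaluates pipelines on all available cores
    tpot = TPOTClassifier(generations=5, population_size=50, n_jobs=-1, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))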

    More information about these start methods can be found in the multiprocessing documentation.

    diff --git a/docs_sources/api.md b/docs_sources/api.md index fab57470..014adfd8 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -7,7 +7,9 @@ subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, - warm_start=False, verbosity=0, + warm_start=False, + periodic_checkpoint_folder=None, + verbosity=0, disable_update_check=False)
    source
    @@ -132,6 +134,7 @@ Possible inputs are:
  • Python dictionary, TPOT will use your custom configuration,
  • string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or
  • string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or
  • +
  • string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or
  • None, TPOT will use the default TPOTClassifier configuration.
  • See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. @@ -144,6 +147,25 @@ Flag indicating whether the TPOT instance will reuse the population from previou Setting warm_start=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.
    +periodic_checkpoint_folder: path string, optional (default: None) +
    +If supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.

    +Currently once per generation but not more often than once per 30 seconds.

    +Useful in multiple cases: + +
    + +early_stop: integer, optional (default: None) +
+The number of generations for which TPOT checks whether there is any improvement in the optimization process. +

    +Ends the optimization process if there is no improvement in the given number of generations. +
    + verbosity: integer, optional (default=0)
    How much information TPOT communicates while it's running. @@ -438,7 +460,9 @@ Does not return anything subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, - warm_start=False, verbosity=0, + warm_start=False, + periodic_checkpoint_folder=None, + verbosity=0, disable_update_check=False)
    source
    @@ -565,6 +589,7 @@ Possible inputs are:
  • Python dictionary, TPOT will use your custom configuration,
  • string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or
  • string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or
  • +
  • string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or
  • None, TPOT will use the default TPOTRegressor configuration.
  • See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. @@ -577,6 +602,25 @@ Flag indicating whether the TPOT instance will reuse the population from previou Setting warm_start=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.
    +periodic_checkpoint_folder: path string, optional (default: None) +
    +If supplied, a folder in which TPOT will periodically save the best pipeline so far while optimizing.

    +Currently once per generation but not more often than once per 30 seconds.

    +Useful in multiple cases: + +
    + +early_stop: integer, optional (default: None) +
+The number of generations for which TPOT checks whether there is any improvement in the optimization process. +

    +Ends the optimization process if there is no improvement in the given number of generations. +
    + verbosity: integer, optional (default=0)
    How much information TPOT communicates while it's running. diff --git a/docs_sources/installing.md b/docs_sources/installing.md index 53b50e5e..a971a79b 100644 --- a/docs_sources/installing.md +++ b/docs_sources/installing.md @@ -12,19 +12,22 @@ TPOT is built on top of several existing Python libraries, including: * [tqdm](https://github.com/tqdm/tqdm) +* [stopit](https://github.com/glenfant/stopit) + +* [pandas](http://pandas.pydata.org) Most of the necessary Python packages can be installed via the [Anaconda Python distribution](https://www.continuum.io/downloads), which we strongly recommend that you use. We also strongly recommend that you use of Python 3 over Python 2 if you're given the choice. -NumPy, SciPy, and scikit-learn can be installed in Anaconda via the command: +NumPy, SciPy, scikit-learn and pandas can be installed in Anaconda via the command: ```Shell -conda install numpy scipy scikit-learn +conda install numpy scipy scikit-learn pandas ``` -DEAP, update_checker, and tqdm can be installed with `pip` via the command: +DEAP, update_checker, tqdm and stopit can be installed with `pip` via the command: ```Shell -pip install deap update_checker tqdm +pip install deap update_checker tqdm stopit ``` **For the Windows users**, the pywin32 module is required if Python is NOT installed via the [Anaconda Python distribution](https://www.continuum.io/downloads) and can be installed with `pip`: diff --git a/docs_sources/related.md b/docs_sources/related.md new file mode 100644 index 00000000..8d257162 --- /dev/null +++ b/docs_sources/related.md @@ -0,0 +1,52 @@ +Other Automated Machine Learning (AutoML) tools and related projects: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameLanguageLicenseDescription
    Auto-WEKAJavaGPL-v3Automated hyper-parameter tuning for WEKA models.
    auto-sklearnPythonBSD-3-ClauseAn automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.
    auto_mlPythonMITAutomated machine learning for analytics & production. Supports manual feature type declarations.
    devolPythonMITAutomated deep neural network design via genetic programming.
    MLBoxPythonBSD-3-ClauseAccurate hyper-parameter optimization in high-dimensional space with support for distributed computing.
    RecipeCGPL-v3Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure.
    XcessivPythonApache-2.0A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
    diff --git a/docs_sources/releases.md b/docs_sources/releases.md index 5a5bf54a..cabe1064 100644 --- a/docs_sources/releases.md +++ b/docs_sources/releases.md @@ -1,3 +1,28 @@ +# Version 0.9 + +* **TPOT now supports sparse matrices** with a new built-in TPOT configurations, "TPOT sparse". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. + +* We have added an "early stopping" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the `early_stop` parameter to access this functionality. + +* TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. + +* TPOT now supports custom scoring functions via the command-line mode. + +* We have added a new optional argument, `periodic_checkpoint_folder`, that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. + +* TPOT no longer uses `sklearn.externals.joblib` when `n_jobs=1` to avoid the potential freezing issue [that scikit-learn suffers from](http://scikit-learn.org/stable/faq.html#why-do-i-sometime-get-a-crash-freeze-with-n-jobs-1-under-osx-or-linux). + +* We have added `pandas` as a dependency to read input datasets instead of `numpy.recfromcsv`. NumPy's `recfromcsv` function is unable to parse datasets with complex data types. + +* Fixed a bug that `DEFAULT` in the parameter(s) of nested estimator raises `KeyError` when exporting pipelines. + +* Fixed a bug related to setting `random_state` in nested estimators. The issue would happen with pipeline with `SelectFromModel` (`ExtraTreesClassifier` as nested estimator) or `StackingEstimator` if nested estimator has `random_state` parameter. + +* Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. + +* Refined input checking for sparse matrices in TPOT. + + # Version 0.8 * **TPOT now detects whether there are missing values in your dataset** and replaces them with the median value of the column. diff --git a/docs_sources/using.md b/docs_sources/using.md index 939aceb7..ef8e633a 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -212,17 +212,19 @@ We recommend using the default parameter unless you understand how the crossover 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted',
    'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', -'recall_weighted', 'roc_auc' +'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.

    TPOT assumes that any function with "error" or "loss" in the name is meant to be minimized, whereas any other functions will be maximized.

+my_module.scorer_name: You can also specify your own scoring function, or the full Python path to an existing one. +

    See the section on scoring functions for more details. -cv CV -Any integer >1 +Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub @@ -265,10 +267,46 @@ Set this seed if you want your TPOT run to be reproducible with the same seed an -config CONFIG_FILE -File path or string -A path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process. +String or file path +Operators and parameter configurations in TPOT: +

    + +See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. + + + +-cf +CHECKPOINT_FOLDER +Folder path + +If supplied, a folder you created, in which tpot will periodically save the best pipeline so far while optimizing. +

    +This is useful in multiple cases: + +

    +Example: +
    +mkdir my_checkpoints +
    +-cf ./my_checkpoints + + +-es +EARLY_STOP +Any positive integer + +How many generations TPOT checks whether there is no improvement in optimization process.

    -See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. +End optimization process if there is no improvement in the set number of generations. -v @@ -321,6 +359,9 @@ print(tpot.score(X_test, y_test)) tpot.export('tpot_mnist_pipeline.py') ``` +* **my_module.scorer_name**: you can also use your manual `scorer(y_true, y_pred)` function through the command line, just add an argument `-scoring my_module.scorer` and TPOT will import your module and take the function from there. TPOT will also include current workdir when importing the module, so you can just put it in the same folder where you are going to run. +Example: `-scoring sklearn.metrics.auc` will use the function auc from sklearn.metrics module. + # Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. @@ -361,6 +402,17 @@ Note that TPOT MDR may be slow to run because the feature selection routines are

    Regression + + +TPOT sparse +TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. +

    +This configuration works for both the TPOTClassifier and TPOTRegressor. +Classification +

    +Regression + + To use any of these configurations, simply pass the string name of the configuration to the `config_dict` parameter (or `-config` on the command line). For example, to use the "TPOT light" configuration: @@ -451,3 +503,23 @@ When using the command-line interface, the configuration file specified in the ` For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for [classification](https://github.com/rhiever/tpot/blob/master/tpot/config/classifier.py) and [regression](https://github.com/rhiever/tpot/blob/master/tpot/config/regressor.py) in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. + + +# Crash/freeze issue with n_jobs > 1 under OSX or Linux + +TPOT supports parallel computing for speeding up the optimization process, but it may crash/freeze with n_jobs > 1 under OSX or Linux [as scikit-learn does](http://scikit-learn.org/stable/faq.html#why-do-i-sometime-get-a-crash-freeze-with-n-jobs-1-under-osx-or-linux), especially with large datasets. + +One solution is to configure Python's `multiprocessing` module to use the `forkserver` start method (instead of the default `fork`) to manage the process pools. You can enable the `forkserver` mode globally for your program by putting the following codes into your main script: + +```Python +import multiprocessing + +# other imports, custom code, load data, define model... + +if __name__ == '__main__': + multiprocessing.set_start_method('forkserver') + + # call scikit-learn utils or tpot utils with n_jobs > 1 here +``` + +More information about these start methods can be found in the [multiprocessing documentation](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods). 
diff --git a/mkdocs.yml b/mkdocs.yml index f1583080..7407be2f 100755 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,3 +25,4 @@ pages: - Release Notes: releases.md - Citing: citing.md - Support: support.md +- Related: related.md diff --git a/requirements.txt b/requirements.txt index da0b5ab1..2a4e696b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ scikit-learn==0.18.1 scipy==0.19.0 tqdm==4.11.2 update-checker==0.16 +stopit==1.1.1 +pandas==0.20.2 diff --git a/setup.py b/setup.py index 2fa146eb..60628008 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,14 @@ def calculate_version(): This project is hosted at https://github.com/rhiever/tpot ''', zip_safe=True, - install_requires=['numpy>=1.12.1', 'scipy>=0.19.0', 'scikit-learn>=0.18.1', 'deap>=1.0', 'update_checker>=0.16', 'tqdm>=4.11.2'], + install_requires=['numpy>=1.12.1', + 'scipy>=0.19.0', + 'scikit-learn>=0.18.1', + 'deap>=1.0', + 'update_checker>=0.16', + 'tqdm>=4.11.2', + 'stopit>=1.1.1', + 'pandas>=0.20.2'], extras_require={ 'xgboost': ['xgboost==0.6a2'], 'skrebate': ['skrebate>=0.3.4'], diff --git a/test_config.py b/test_config.py deleted file mode 100644 index 6511c185..00000000 --- a/test_config.py +++ /dev/null @@ -1,14 +0,0 @@ -tpot_config = { - 'sklearn.naive_bayes.GaussianNB': { - }, - - 'sklearn.naive_bayes.BernoulliNB': { - 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], - 'fit_prior': [True, False] - }, - - 'sklearn.naive_bayes.MultinomialNB': { - 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], - 'fit_prior': [True, False] - } -} diff --git a/tests.py b/tests.py deleted file mode 100644 index 465c83bd..00000000 --- a/tests.py +++ /dev/null @@ -1,1319 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Copyright 2015-Present Randal S. Olson. - -This file is part of the TPOT library. - -TPOT is free software: you can redistribute it and/or modify -it under the terms of the GNU Lesser General Public License as -published by the Free Software Foundation, either version 3 of -the License, or (at your option) any later version. - -TPOT is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with TPOT. If not, see .
- -""" - -from tpot import TPOTClassifier, TPOTRegressor -from tpot.base import TPOTBase -from tpot.builtins import ZeroCount, StackingEstimator -from tpot.driver import positive_integer, float_range, _get_arg_parser, _print_args, main, _read_data_file -from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name -from tpot.gp_types import Output_Array -from tpot.gp_deap import mutNodeReplacement, _wrapped_cross_val_score -from tpot.metrics import balanced_accuracy - -from tpot.operator_utils import TPOTOperatorClassFactory, set_sample_weight -from tpot.config.classifier import classifier_config_dict -from tpot.config.classifier_light import classifier_config_dict_light -from tpot.config.regressor_light import regressor_config_dict_light -from tpot.config.classifier_mdr import tpot_mdr_classifier_config_dict -from tpot.config.regressor_mdr import tpot_mdr_regressor_config_dict - -import numpy as np -import inspect -import random -import subprocess -import sys -from multiprocessing import cpu_count - -from sklearn.datasets import load_digits, load_boston -from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold -from sklearn.linear_model import LogisticRegression, Lasso -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.pipeline import make_pipeline -from deap import creator -from tqdm import tqdm -from nose.tools import assert_raises, assert_equal, assert_not_equal -from unittest import TestCase -from contextlib import contextmanager -try: - from StringIO import StringIO -except: - from io import StringIO - -# Set up the MNIST data set for testing -mnist_data = load_digits() -training_features, testing_features, training_target, testing_target = \ - train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) - -# Set up the Boston data set for testing -boston_data = load_boston() -training_features_r, testing_features_r, training_target_r, testing_target_r = \ - train_test_split(boston_data.data, boston_data.target, random_state=42) - -np.random.seed(42) -random.seed(42) - -test_operator_key = 'sklearn.feature_selection.SelectPercentile' -TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory( - test_operator_key, - classifier_config_dict[test_operator_key] -) - - -@contextmanager -def captured_output(): - new_out, new_err = StringIO(), StringIO() - old_out, old_err = sys.stdout, sys.stderr - try: - sys.stdout, sys.stderr = new_out, new_err - yield sys.stdout, sys.stderr - finally: - sys.stdout, sys.stderr = old_out, old_err - - -def test_driver(): - """Assert that the TPOT driver outputs normal result in mode mode.""" - batcmd = "python -m tpot.driver tests.csv -is , -target class -g 2 -p 2 -os 4 -cv 5 -s 45 -v 1" - ret_stdout = subprocess.check_output(batcmd, shell=True) - try: - ret_val = float(ret_stdout.decode('UTF-8').split('\n')[-2].split(': ')[-1]) - except Exception: - ret_val = -float('inf') - assert ret_val > 0.0 - -def test_read_data_file(): - """Assert that _read_data_file raises ValueError when the targe column is missing.""" - # Mis-spelled target - args_list = [ - 'tests.csv', - '-is', ',', - '-target', 'clas' # typo for right target 'class' - ] - args = _get_arg_parser().parse_args(args_list) - assert_raises(ValueError, _read_data_file, args=args) - # Correctly spelled - args_list = [ - 'tests.csv', - '-is', ',', - '-target', 'class' - ] - args = _get_arg_parser().parse_args(args_list) - 
input_data = _read_data_file(args) - assert isinstance(input_data, np.recarray) - - -class ParserTest(TestCase): - def setUp(self): - self.parser = _get_arg_parser() - - def test_default_param(self): - """Assert that the TPOT driver stores correct default values for all parameters.""" - args = self.parser.parse_args(['tests.csv']) - self.assertEqual(args.CROSSOVER_RATE, 0.1) - self.assertEqual(args.DISABLE_UPDATE_CHECK, False) - self.assertEqual(args.GENERATIONS, 100) - self.assertEqual(args.INPUT_FILE, 'tests.csv') - self.assertEqual(args.INPUT_SEPARATOR, '\t') - self.assertEqual(args.MAX_EVAL_MINS, 5) - self.assertEqual(args.MUTATION_RATE, 0.9) - self.assertEqual(args.NUM_CV_FOLDS, 5) - self.assertEqual(args.NUM_JOBS, 1) - self.assertEqual(args.OFFSPRING_SIZE, None) - self.assertEqual(args.OUTPUT_FILE, '') - self.assertEqual(args.POPULATION_SIZE, 100) - self.assertEqual(args.RANDOM_STATE, None) - self.assertEqual(args.SUBSAMPLE, 1.0) - self.assertEqual(args.SCORING_FN, None) - self.assertEqual(args.TARGET_NAME, 'class') - self.assertEqual(args.TPOT_MODE, 'classification') - self.assertEqual(args.VERBOSITY, 1) - - - def test_print_args(self): - """Assert that _print_args prints correct values for all parameters.""" - args = self.parser.parse_args(['tests.csv']) - with captured_output() as (out, err): - _print_args(args) - output = out.getvalue() - expected_output = """ -TPOT settings: -CONFIG_FILE\t=\tNone -CROSSOVER_RATE\t=\t0.1 -GENERATIONS\t=\t100 -INPUT_FILE\t=\ttests.csv -INPUT_SEPARATOR\t=\t\t -MAX_EVAL_MINS\t=\t5 -MAX_TIME_MINS\t=\tNone -MUTATION_RATE\t=\t0.9 -NUM_CV_FOLDS\t=\t5 -NUM_JOBS\t=\t1 -OFFSPRING_SIZE\t=\t100 -OUTPUT_FILE\t=\t -POPULATION_SIZE\t=\t100 -RANDOM_STATE\t=\tNone -SCORING_FN\t=\taccuracy -SUBSAMPLE\t=\t1.0 -TARGET_NAME\t=\tclass -TPOT_MODE\t=\tclassification -VERBOSITY\t=\t1 - -""" - - self.assertEqual(_sort_lines(expected_output), _sort_lines(output)) - -def _sort_lines(text): - return '\n'.join(sorted(text.split('\n'))) - -def test_init_custom_parameters(): - """Assert that the TPOT instantiator stores the TPOT variables properly.""" - tpot_obj = TPOTClassifier( - population_size=500, - generations=1000, - offspring_size=2000, - mutation_rate=0.05, - crossover_rate=0.9, - scoring='accuracy', - cv=10, - verbosity=1, - random_state=42, - disable_update_check=True, - warm_start=True - ) - - assert tpot_obj.population_size == 500 - assert tpot_obj.generations == 1000 - assert tpot_obj.offspring_size == 2000 - assert tpot_obj.mutation_rate == 0.05 - assert tpot_obj.crossover_rate == 0.9 - assert tpot_obj.scoring_function == 'accuracy' - assert tpot_obj.cv == 10 - assert tpot_obj.max_time_mins is None - assert tpot_obj.warm_start is True - assert tpot_obj.verbosity == 1 - assert tpot_obj._optimized_pipeline is None - assert tpot_obj.fitted_pipeline_ is None - assert not (tpot_obj._pset is None) - assert not (tpot_obj._toolbox is None) - - -def test_init_default_scoring(): - """Assert that TPOT intitializes with the correct default scoring function.""" - tpot_obj = TPOTRegressor() - assert tpot_obj.scoring_function == 'neg_mean_squared_error' - - tpot_obj = TPOTClassifier() - assert tpot_obj.scoring_function == 'accuracy' - - -def test_init_default_scoring_2(): - """Assert that TPOT intitializes with the correct customized scoring function.""" - - tpot_obj = TPOTClassifier(scoring=balanced_accuracy) - assert tpot_obj.scoring_function == 'balanced_accuracy' - - -def test_invaild_score_warning(): - """Assert that the TPOT intitializes raises a ValueError when the 
scoring metrics is not available in SCORERS.""" - # Mis-spelled scorer - assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray') - # Correctly spelled - TPOTClassifier(scoring='balanced_accuracy') - - -def test_invaild_dataset_warning(): - """Assert that the TPOT fit function raises a ValueError when dataset is not in right format.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0 - ) - # common mistake in target - bad_training_target = training_target.reshape((1, len(training_target))) - assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_target) - - -def test_invaild_subsample_ratio_warning(): - """Assert that the TPOT intitializes raises a ValueError when subsample ratio is not in the range (0.0, 1.0].""" - # Invalid ratio - assert_raises(ValueError, TPOTClassifier, subsample=0.0) - # Valid ratio - TPOTClassifier(subsample=0.1) - - -def test_invaild_mut_rate_plus_xo_rate(): - """Assert that the TPOT intitializes raises a ValueError when the sum of crossover and mutation probabilities is large than 1.""" - # Invalid ratio - assert_raises(ValueError, TPOTClassifier, mutation_rate=0.8, crossover_rate=0.8) - # Valid ratio - TPOTClassifier(mutation_rate=0.8, crossover_rate=0.1) - - -def test_init_max_time_mins(): - """Assert that the TPOT init stores max run time and sets generations to 1000000.""" - tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000) - - assert tpot_obj.generations == 1000000 - assert tpot_obj.max_time_mins == 30 - - -def test_init_n_jobs(): - """Assert that the TPOT init stores current number of processes""" - tpot_obj = TPOTClassifier(n_jobs=2) - assert tpot_obj.n_jobs == 2 - - tpot_obj = TPOTClassifier(n_jobs=-1) - assert tpot_obj.n_jobs == cpu_count() - - -def test_timeout(): - """Assert that _wrapped_cross_val_score return Timeout in a time limit""" - tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') - # a complex pipeline for the test - pipeline_string = ( - "ExtraTreesRegressor(" - "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," - "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," - "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," - "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5," - "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," - "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," - "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " - "ExtraTreesRegressor__n_estimators=100)" - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - # test _wrapped_cross_val_score with cv=20 so that it is impossible to finish in 1 second - return_value = _wrapped_cross_val_score(tpot_obj.fitted_pipeline_, - training_features_r, - training_target_r, - cv=20, - scoring_function='neg_mean_squared_error', - sample_weight=None, - max_eval_time_mins=0.02, - groups=None) - assert return_value == "Timeout" - - -def test_balanced_accuracy(): - """Assert that the balanced_accuracy in TPOT returns correct accuracy.""" - y_true = np.array([1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,4,4,4]) - y_pred1 = np.array([1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,4,4,4]) - y_pred2 = 
np.array([3,3,3,3,3,2,2,2,2,2,2,2,3,3,3,3,3,4,4,4]) - accuracy_score1 = balanced_accuracy(y_true, y_pred1) - accuracy_score2 = balanced_accuracy(y_true, y_pred2) - assert np.allclose(accuracy_score1, 1.0) - assert np.allclose(accuracy_score2, 0.833333333333333) - - -def test_get_params(): - """Assert that get_params returns the exact dictionary of parameters used by TPOT.""" - kwargs = { - 'population_size': 500, - 'generations': 1000, - 'config_dict': 'TPOT light', - 'offspring_size': 2000, - 'verbosity': 1 - } - - tpot_obj = TPOTClassifier(**kwargs) - # Get default parameters of TPOT and merge with our specified parameters - initializer = inspect.getargspec(TPOTBase.__init__) - default_kwargs = dict(zip(initializer.args[1:], initializer.defaults)) - default_kwargs.update(kwargs) - # update to dictionary instead of input string - default_kwargs.update({'config_dict': classifier_config_dict_light}) - assert tpot_obj.get_params()['config_dict'] == default_kwargs['config_dict'] - assert tpot_obj.get_params() == default_kwargs - - -def test_set_params(): - """Assert that set_params returns a reference to the TPOT instance.""" - tpot_obj = TPOTClassifier() - assert tpot_obj.set_params() is tpot_obj - - -def test_set_params_2(): - """Assert that set_params updates TPOT's instance variables.""" - tpot_obj = TPOTClassifier(generations=2) - tpot_obj.set_params(generations=3) - - assert tpot_obj.generations == 3 - - -def test_TPOTBase(): - """Assert that TPOTBase class raises RuntimeError when using it directly.""" - assert_raises(RuntimeError, TPOTBase) - - -def test_conf_dict(): - """Assert that TPOT uses the pre-configured dictionary of operators when config_dict is 'TPOT light' or 'TPOT MDR'.""" - tpot_obj = TPOTClassifier(config_dict='TPOT light') - assert tpot_obj.config_dict == classifier_config_dict_light - - tpot_obj = TPOTClassifier(config_dict='TPOT MDR') - assert tpot_obj.config_dict == tpot_mdr_classifier_config_dict - - tpot_obj = TPOTRegressor(config_dict='TPOT light') - assert tpot_obj.config_dict == regressor_config_dict_light - - tpot_obj = TPOTRegressor(config_dict='TPOT MDR') - assert tpot_obj.config_dict == tpot_mdr_regressor_config_dict - - -def test_conf_dict_2(): - """Assert that TPOT uses a custom dictionary of operators when config_dict is Python dictionary.""" - tpot_obj = TPOTClassifier(config_dict=tpot_mdr_classifier_config_dict) - assert tpot_obj.config_dict == tpot_mdr_classifier_config_dict - - -def test_conf_dict_3(): - """Assert that TPOT uses a custom dictionary of operators when config_dict is the path of Python dictionary.""" - tpot_obj = TPOTRegressor(config_dict='test_config.py') - tested_config_dict = { - 'sklearn.naive_bayes.GaussianNB': { - }, - - 'sklearn.naive_bayes.BernoulliNB': { - 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], - 'fit_prior': [True, False] - }, - - 'sklearn.naive_bayes.MultinomialNB': { - 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], - 'fit_prior': [True, False] - } - } - assert isinstance(tpot_obj.config_dict, dict) - assert tpot_obj.config_dict == tested_config_dict - - -def test_random_ind(): - """Assert that the TPOTClassifier can generate the same pipeline with same random seed.""" - tpot_obj = TPOTClassifier(random_state=43) - pipeline1 = str(tpot_obj._toolbox.individual()) - tpot_obj = TPOTClassifier(random_state=43) - pipeline2 = str(tpot_obj._toolbox.individual()) - assert pipeline1 == pipeline2 - - -def test_random_ind_2(): - """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39.""" - 
tpot_obj = TPOTClassifier(random_state=39) - tpot_obj._pbar = tqdm(total=1, disable=True) - pipeline = tpot_obj._toolbox.individual() - expected_code = """import numpy as np - -from sklearn.feature_selection import SelectPercentile, f_classif -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.tree import DecisionTreeClassifier - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) - -exported_pipeline = make_pipeline( - SelectPercentile(score_func=f_classif, percentile=65), - DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) -) - -exported_pipeline.fit(training_features, training_target) -results = exported_pipeline.predict(testing_features) -""" - - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) - - -def test_score(): - """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists.""" - tpot_obj = TPOTClassifier() - assert_raises(RuntimeError, tpot_obj.score, testing_features, testing_target) - - -def test_score_2(): - """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline.""" - tpot_obj = TPOTClassifier(random_state=34) - known_score = 0.977777777778 # Assumes use of the TPOT accuracy function - - # Create a pipeline with a known score - pipeline_string = ( - 'KNeighborsClassifier(' - 'input_matrix, ' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1, ' - 'KNeighborsClassifier__weights=uniform' - ')' - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features, training_target) - # Get score from TPOT - score = tpot_obj.score(testing_features, testing_target) - - assert np.allclose(known_score, score) - - -def test_score_3(): - """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline.""" - tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error', random_state=72) - known_score = 12.1791953611 - - # Reify pipeline with known score - pipeline_string = ( - "ExtraTreesRegressor(" - "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," - "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," - "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," - "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5," - "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," - "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," - "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " - "ExtraTreesRegressor__n_estimators=100)" - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features_r, 
training_target_r) - - # Get score from TPOT - score = tpot_obj.score(testing_features_r, testing_target_r) - - assert np.allclose(known_score, score) - - - -def test_sample_weight_func(): - """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights.""" - tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') - - # Reify pipeline with known scor - pipeline_string = ( - "ExtraTreesRegressor(" - "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," - "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," - "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," - "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5," - "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," - "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," - "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " - "ExtraTreesRegressor__n_estimators=100)" - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r) - - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - - # make up a sample weight - training_target_r_weight = np.array(range(1, len(training_target_r)+1)) - training_target_r_weight_dict = set_sample_weight(tpot_obj.fitted_pipeline_.steps, training_target_r_weight) - - np.random.seed(42) - cv_score1 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error') - - np.random.seed(42) - cv_score2 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error') - - np.random.seed(42) - cv_score_weight = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_target_r_weight_dict) - - np.random.seed(42) - tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r, **training_target_r_weight_dict) - # Get score from TPOT - known_score = 11.5790430757 - score = tpot_obj.score(testing_features_r, testing_target_r) - - assert np.allclose(cv_score1, cv_score2) - assert not np.allclose(cv_score1, cv_score_weight) - assert np.allclose(known_score, score) - -def test_fit_GroupKFold(): - """Assert that TPOT properly handles the group parameter when using GroupKFold""" - # This check tests if the darker MNIST images would generalize to the lighter ones. 
- means = np.mean(training_features, axis=1) - groups = means >= np.median(means) - - tpot_obj = TPOTClassifier( - random_state=42, - population_size=2, - offspring_size=4, - generations=1, - verbosity=0, - config_dict='TPOT light', - cv = GroupKFold(n_splits=2), - ) - tpot_obj.fit(training_features, training_target, groups=groups) - assert tpot_obj.score(testing_features, testing_target) >= 0.97 - - -def test_predict(): - """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists.""" - tpot_obj = TPOTClassifier() - assert_raises(RuntimeError, tpot_obj.predict, testing_features) - - -def test_predict_2(): - """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,).""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'DecisionTreeClassifier(' - 'input_matrix, ' - 'DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, ' - 'DecisionTreeClassifier__min_samples_split=5' - ')' - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features, training_target) - result = tpot_obj.predict(testing_features) - - assert result.shape == (testing_features.shape[0],) - -def test_predict_proba(): - """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_target).""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'DecisionTreeClassifier(' - 'input_matrix, ' - 'DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, ' - 'DecisionTreeClassifier__min_samples_split=5)' - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features, training_target) - - result = tpot_obj.predict_proba(testing_features) - num_labels = np.amax(testing_target) + 1 - - assert result.shape == (testing_features.shape[0], num_labels) - - -def test_predict_proba2(): - """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float).""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'DecisionTreeClassifier(' - 'input_matrix, ' - 'DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, ' - 'DecisionTreeClassifier__min_samples_split=5)' - ) - tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj.fitted_pipeline_.fit(training_features, training_target) - - result = tpot_obj.predict_proba(testing_features) - rows, columns = result.shape - - for i in range(rows): - for j in range(columns): - float_range(result[i][j]) - - - -def test_warm_start(): - """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run.""" - tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True) - tpot_obj.fit(training_features, training_target) - - assert tpot_obj._pop is not None - assert tpot_obj._pareto_front is not None - - first_pop = tpot_obj._pop - tpot_obj.random_state 
= 21 - tpot_obj.fit(training_features, training_target) - - assert tpot_obj._pop == first_pop - - -def test_fit(): - """Assert that the TPOT fit function provides an optimized pipeline.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0 - ) - tpot_obj.fit(training_features, training_target) - - assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert not (tpot_obj._start_datetime is None) - - -def test_fit2(): - """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT light'.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0, - config_dict='TPOT light' - ) - tpot_obj.fit(training_features, training_target) - - assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert not (tpot_obj._start_datetime is None) - - -def test_fit3(): - """Assert that the TPOT fit function provides an optimized pipeline with subsample is 0.8""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - subsample=0.8, - verbosity=0 - ) - tpot_obj.fit(training_features, training_target) - - assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert not (tpot_obj._start_datetime is None) - - -def test_evaluated_individuals_(): - """Assert that evaluated_individuals_ stores corrent pipelines and their CV scores.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=2, - offspring_size=4, - generations=1, - verbosity=0, - config_dict='TPOT light' - ) - tpot_obj.fit(training_features, training_target) - assert isinstance(tpot_obj.evaluated_individuals_, dict) - for pipeline_string in sorted(tpot_obj.evaluated_individuals_.keys()): - deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - operator_count = tpot_obj._operator_count(deap_pipeline) - try: - cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) - mean_cv_scores = np.mean(cv_scores) - except: - mean_cv_scores = -float('inf') - assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][1], mean_cv_scores) - assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][0], operator_count) - - -def test_evaluate_individuals(): - """Assert that _evaluate_individuals returns operator_counts and CV scores in correct order.""" - tpot_obj = TPOTClassifier( - random_state=42, - verbosity=0, - config_dict='TPOT light' - ) - tpot_obj._pbar = tqdm(total=1, disable=True) - pop = tpot_obj._toolbox.population(n=10) - fitness_scores = tpot_obj._evaluate_individuals(pop, training_features, training_target) - for deap_pipeline, fitness_score in zip(pop, fitness_scores): - operator_count = tpot_obj._operator_count(deap_pipeline) - sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) - tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) - try: - cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) - mean_cv_scores = np.mean(cv_scores) - except: - mean_cv_scores = -float('inf') - assert isinstance(deap_pipeline, creator.Individual) - assert np.allclose(fitness_score[0], operator_count) - assert np.allclose(fitness_score[1], mean_cv_scores) - 
- -def test_imputer(): - """Assert that the TPOT fit function will not raise a ValueError in a dataset where NaNs are present.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0, - config_dict='TPOT light' - ) - features_with_nan = np.copy(training_features) - features_with_nan[0][0] = float('nan') - - tpot_obj.fit(features_with_nan, training_target) - - -def test_imputer2(): - """Assert that the TPOT predict function will not raise a ValueError in a dataset where NaNs are present.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0, - config_dict='TPOT light' - ) - features_with_nan = np.copy(training_features) - features_with_nan[0][0] = float('nan') - - tpot_obj.fit(features_with_nan, training_target) - tpot_obj.predict(features_with_nan) - - -def test_imputer3(): - """Assert that the TPOT _impute_values function returns a feature matrix with imputed NaN values.""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - verbosity=0, - config_dict='TPOT light' - ) - features_with_nan = np.copy(training_features) - features_with_nan[0][0] = float('nan') - - imputed_features = tpot_obj._impute_values(features_with_nan) - assert_not_equal(imputed_features[0][0], float('nan')) - - -def test_tpot_operator_factory_class(): - """Assert that the TPOT operators class factory.""" - test_config_dict = { - 'sklearn.svm.LinearSVC': { - 'penalty': ["l1", "l2"], - 'loss': ["hinge", "squared_hinge"], - 'dual': [True, False], - 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], - 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.] - }, - - 'sklearn.linear_model.LogisticRegression': { - 'penalty': ["l1", "l2"], - 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.], - 'dual': [True, False] - }, - - 'sklearn.preprocessing.Binarizer': { - 'threshold': np.arange(0.0, 1.01, 0.05) - } - } - - tpot_operator_list = [] - tpot_argument_list = [] - - for key in sorted(test_config_dict.keys()): - op, args = TPOTOperatorClassFactory(key, test_config_dict[key]) - tpot_operator_list.append(op) - tpot_argument_list += args - - assert len(tpot_operator_list) == 3 - assert len(tpot_argument_list) == 9 - assert tpot_operator_list[0].root is True - assert tpot_operator_list[1].root is False - assert tpot_operator_list[2].type() == "Classifier or Regressor" - assert tpot_argument_list[1].values == [True, False] - - -def check_export(op, tpot_obj): - """Assert that a TPOT operator exports as expected.""" - prng = np.random.RandomState(42) - np.random.seed(42) - - args = [] - for type_ in op.parameter_types()[0][1:]: - args.append(prng.choice(tpot_obj._pset.terminals[type_]).value) - export_string = op.export(*args) - - assert export_string.startswith(op.__name__ + "(") and export_string.endswith(")") - - -def test_operators(): - """Assert that the TPOT operators match the output of their sklearn counterparts.""" - tpot_obj = TPOTClassifier(random_state=42) - for op in tpot_obj.operators: - check_export.description = ("Assert that the TPOT {} operator exports " - "as expected".format(op.__name__)) - yield check_export, op, tpot_obj - - -def test_export(): - """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists.""" - tpot_obj = TPOTClassifier() - assert_raises(RuntimeError, tpot_obj.export, "test_export.py") - - -def test_generate_pipeline_code(): - """Assert that generate_pipeline_code() 
returns the correct code given a specific pipeline.""" - tpot_obj = TPOTClassifier() - pipeline = [ - 'KNeighborsClassifier', - [ - 'CombineDFs', - [ - 'GradientBoostingClassifier', - 'input_matrix', - 38.0, - 5, - 5, - 5, - 0.05, - 0.5], - [ - 'GaussianNB', - [ - 'ZeroCount', - 'input_matrix' - ] - ] - ], - 18, - 'uniform', - 2 - ] - - expected_code = """make_pipeline( - make_union( - StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), - StackingEstimator(estimator=make_pipeline( - ZeroCount(), - GaussianNB() - )) - ), - KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) -)""" - assert expected_code == generate_pipeline_code(pipeline, tpot_obj.operators) - - -def test_generate_import_code(): - """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline.""" - tpot_obj = TPOTClassifier() - pipeline = creator.Individual.from_string('GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset) - - expected_code = """import numpy as np - -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import GaussianNB -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import RobustScaler -""" - assert expected_code == generate_import_code(pipeline, tpot_obj.operators) - - -def test_generate_import_code_2(): - """Assert that generate_import_code() returns the correct set of dependancies and dependancies are importable.""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' - ) - - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - - import_code = generate_import_code(pipeline, tpot_obj.operators) - - expected_code = """import numpy as np - -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline, make_union -from sklearn.tree import DecisionTreeClassifier -from tpot.builtins import StackingEstimator, ZeroCount -""" - exec(import_code) # should not raise error - assert expected_code == import_code - - -def test_PolynomialFeatures_exception(): - """Assert that TPOT allows only one PolynomialFeatures operator in a pipeline""" - tpot_obj = TPOTClassifier() - tpot_obj._pbar = tqdm(total=1, disable=True) - # pipeline with one PolynomialFeatures operator - pipeline_string_1 = ('LogisticRegression(PolynomialFeatures' - '(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=DEFAULT, ' - 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' - 'LogisticRegression__dual=DEFAULT, LogisticRegression__penalty=DEFAULT)') - - # pipeline with two PolynomialFeatures operator - pipeline_string_2 = ('LogisticRegression(PolynomialFeatures' - '(PolynomialFeatures(input_matrix, PolynomialFeatures__degree=2, ' - 'PolynomialFeatures__include_bias=DEFAULT, PolynomialFeatures__interaction_only=False), ' - 'PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=DEFAULT, ' - 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' - 
'LogisticRegression__dual=DEFAULT, LogisticRegression__penalty=DEFAULT)') - - # make a list for _evaluate_individuals - pipelines = [] - pipelines.append(creator.Individual.from_string(pipeline_string_1, tpot_obj._pset)) - pipelines.append(creator.Individual.from_string(pipeline_string_2, tpot_obj._pset)) - fitness_scores = tpot_obj._evaluate_individuals(pipelines, training_features, training_target) - known_scores = [(2, 0.98068077235290885), (5000.0, -float('inf'))] - assert np.allclose(known_scores, fitness_scores) - -def test_mutNodeReplacement(): - """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline.""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, ' - 'DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8, ' - 'DecisionTreeClassifier__min_samples_leaf=5, ' - 'DecisionTreeClassifier__min_samples_split=5' - '), ' - 'SelectPercentile(' - 'input_matrix, ' - 'SelectPercentile__percentile=20' - ')' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1, ' - 'KNeighborsClassifier__weights=uniform' - ')' - ) - - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - pipeline[0].ret = Output_Array - old_ret_type_list = [node.ret for node in pipeline] - old_prims_list = [node for node in pipeline if node.arity != 0] - mut_ind = mutNodeReplacement(pipeline, pset=tpot_obj._pset) - new_ret_type_list = [node.ret for node in mut_ind[0]] - new_prims_list = [node for node in mut_ind[0] if node.arity != 0] - - if new_prims_list == old_prims_list: # Terminal mutated - assert new_ret_type_list == old_ret_type_list - else: # Primitive mutated - diff_prims = list(set(new_prims_list).symmetric_difference(old_prims_list)) - assert diff_prims[0].ret == diff_prims[1].ret - - assert mut_ind[0][0].ret == Output_Array - - -def test_export_pipeline(): - """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline.""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' - ) - - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - expected_code = """import numpy as np - -from sklearn.feature_selection import SelectPercentile, f_classif -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline, make_union -from sklearn.tree import DecisionTreeClassifier -from tpot.builtins import StackingEstimator - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) - -exported_pipeline = make_pipeline( - make_union( - StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", 
max_depth=8, min_samples_leaf=5, min_samples_split=5)), - SelectPercentile(score_func=f_classif, percentile=20) - ), - KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") -) - -exported_pipeline.fit(training_features, training_target) -results = exported_pipeline.predict(testing_features) -""" - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) - - -def test_export_pipeline_2(): - """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline (only one classifier).""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'KNeighborsClassifier(' - 'input_matrix, ' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1, ' - 'KNeighborsClassifier__weights=uniform' - ')' - ) - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - expected_code = """import numpy as np - -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) - -exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") - -exported_pipeline.fit(training_features, training_target) -results = exported_pipeline.predict(testing_features) -""" - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) - - -def test_export_pipeline_3(): - """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor.""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),' - 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,' - 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' - ) - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - - expected_code = """import numpy as np - -from sklearn.feature_selection import SelectPercentile, f_classif -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.tree import DecisionTreeClassifier - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) - -exported_pipeline = make_pipeline( - SelectPercentile(score_func=f_classif, percentile=20), - DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) -) - -exported_pipeline.fit(training_features, training_target) -results = exported_pipeline.predict(testing_features) -""" - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) - -def test_export_pipeline_4(): - """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with 
input_matrix in CombineDFs.""" - tpot_obj = TPOTClassifier() - pipeline_string = ( - 'KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' - 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5),input_matrix)' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' - ) - - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - expected_code = """import numpy as np - -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline, make_union -from sklearn.tree import DecisionTreeClassifier -from tpot.builtins import StackingEstimator -from sklearn.preprocessing import FunctionTransformer -from copy import copy - -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) - -exported_pipeline = make_pipeline( - make_union( - StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)), - FunctionTransformer(copy) - ), - KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") -) - -exported_pipeline.fit(training_features, training_target) -results = exported_pipeline.predict(testing_features) -""" - assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) - -def test_operator_export(): - """Assert that a TPOT operator can export properly with a function as a parameter to a classifier.""" - export_string = TPOTSelectPercentile.export(5) - assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)" - - -def test_indent(): - """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line.""" - multiline_string = """test -test1 -test2 -test3""" - - indented_multiline_string = """ test - test1 - test2 - test3""" - - assert indented_multiline_string == _indent(multiline_string, 4) - - -def test_operator_type(): - """Assert that TPOT operators return their type, e.g. 
'Classifier', 'Preprocessor'.""" - assert TPOTSelectPercentile.type() == "Preprocessor or Selector" - - -def test_get_by_name(): - """Assert that the Operator class returns operators by name appropriately.""" - tpot_obj = TPOTClassifier() - assert get_by_name("SelectPercentile", tpot_obj.operators).__class__ == TPOTSelectPercentile.__class__ - - -def test_gen(): - """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure.""" - tpot_obj = TPOTClassifier() - - pipeline = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3) - - assert len(pipeline) > 1 - assert pipeline[0].ret == Output_Array - - -def test_positive_integer(): - """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0.""" - assert_raises(Exception, positive_integer, '-1') - - -def test_positive_integer_2(): - """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0.""" - assert 1 == positive_integer('1') - - -def test_positive_integer_3(): - """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer.""" - assert_raises(Exception, positive_integer, 'foobar') - - -def test_float_range(): - """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0.""" - assert 0.5 == float_range('0.5') - - -def test_float_range_2(): - """Assert that the TPOT CLI interface's float range throws an exception when input it out of range.""" - assert_raises(Exception, float_range, '2.0') - - -def test_float_range_3(): - """Assert that the TPOT CLI interface's float range throws an exception when input is not a float.""" - assert_raises(Exception, float_range, 'foobar') - - -def test_StackingEstimator_1(): - """Assert that the StackingEstimator returns transformed X with synthetic features in classification.""" - clf = RandomForestClassifier(random_state=42) - stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42)) - # fit - clf.fit(training_features, training_target) - stack_clf.fit(training_features, training_target) - # get transformd X - X_clf_transformed = stack_clf.transform(training_features) - - assert np.allclose(clf.predict(training_features), X_clf_transformed[:,0]) - assert np.allclose(clf.predict_proba(training_features), X_clf_transformed[:,1:1+len(np.unique(training_target))]) - - -def test_StackingEstimator_2(): - """Assert that the StackingEstimator returns transformed X with a synthetic feature in regression.""" - reg = RandomForestRegressor(random_state=42) - stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42)) - # fit - reg.fit(training_features_r, training_target_r) - stack_reg.fit(training_features_r, training_target_r) - # get transformd X - X_reg_transformed = stack_reg.transform(training_features_r) - - assert np.allclose(reg.predict(training_features_r), X_reg_transformed[:,0]) - - -def test_StackingEstimator_3(): - """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in classification""" - stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42)) - meta_clf = LogisticRegression() - sklearn_pipeline = make_pipeline(stack_clf, meta_clf) - # fit in pipeline - sklearn_pipeline.fit(training_features, training_target) - # fit step by step - stack_clf.fit(training_features, training_target) - X_clf_transformed = stack_clf.transform(training_features) - meta_clf.fit(X_clf_transformed, training_target) - # scoring - score = 
meta_clf.score(X_clf_transformed, training_target) - pipeline_score = sklearn_pipeline.score(training_features, training_target) - assert np.allclose(score, pipeline_score) - - # test cv score - cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features, training_target, cv=3, scoring='accuracy')) - - known_cv_score = 0.947282375315 - - assert np.allclose(known_cv_score, cv_score) - -def test_StackingEstimator_4(): - """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in regression""" - stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42)) - meta_reg = Lasso(random_state=42) - sklearn_pipeline = make_pipeline(stack_reg, meta_reg) - # fit in pipeline - sklearn_pipeline.fit(training_features_r, training_target_r) - # fit step by step - stack_reg.fit(training_features_r, training_target_r) - X_reg_transformed = stack_reg.transform(training_features_r) - meta_reg.fit(X_reg_transformed, training_target_r) - # scoring - score = meta_reg.score(X_reg_transformed, training_target_r) - pipeline_score = sklearn_pipeline.score(training_features_r, training_target_r) - assert np.allclose(score, pipeline_score) - - # test cv score - cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features_r, training_target_r, cv=3, scoring='r2')) - known_cv_score = 0.795877470354 - - assert np.allclose(known_cv_score, cv_score) - - -def test_ZeroCount(): - """Assert that ZeroCount operator returns correct transformed X""" - X = np.array([[0, 1, 7, 0, 0], [3, 0, 0, 2, 19], [0, 1, 3, 4, 5], [5, 0, 0, 0, 0]]) - op = ZeroCount() - X_transformed = op.transform(X) - zero_col = np.array([3, 2, 1, 4]) - non_zero = np.array([2, 3, 4, 1]) - - assert np.allclose(zero_col, X_transformed[:, 0]) - assert np.allclose(non_zero, X_transformed[:, 1]) diff --git a/tests/driver_tests.py b/tests/driver_tests.py new file mode 100644 index 00000000..2be91315 --- /dev/null +++ b/tests/driver_tests.py @@ -0,0 +1,354 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . 
+ +""" + +import subprocess +import sys +from os import remove, path +from contextlib import contextmanager +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +import numpy as np +import pandas as pd + +from tpot.driver import positive_integer, float_range, _get_arg_parser, _print_args, _read_data_file, load_scoring_function, tpot_driver +from nose.tools import assert_raises, assert_equal, assert_in +from unittest import TestCase + + +@contextmanager +def captured_output(): + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +def test_scoring_function_argument(): + with captured_output() as (out, err): + # regular argument returns regular string + assert_equal(load_scoring_function("roc_auc"), "roc_auc") + + # bad function returns exception + assert_raises(Exception, load_scoring_function, scoring_func="tests.__fake_BAD_FUNC_NAME") + + # manual function loads the function + assert_equal(load_scoring_function('driver_tests.test_scoring_function_argument').__name__, test_scoring_function_argument.__name__) + + # installed-module function test + assert_equal(load_scoring_function('sklearn.metrics.auc').__name__, "auc") + + out, err = out.getvalue(), err.getvalue() + + assert_in("failed importing custom scoring function", out) + assert_in("manual scoring function: 0.0 + + +def test_driver_2(): + """Assert that the tpot_driver() in TPOT driver outputs normal result with verbosity = 1.""" + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'class', + '-g', '1', + '-p', '2', + '-cv', '2', + '-s',' 45', + '-config', 'TPOT light', + '-v', '1' + ] + args = _get_arg_parser().parse_args(args_list) + with captured_output() as (out, err): + tpot_driver(args) + ret_stdout = out.getvalue() + + assert "TPOT settings" not in ret_stdout + assert "Final Pareto front testing scores" not in ret_stdout + try: + ret_val = float(ret_stdout.split('\n')[-2].split(': ')[-1]) + except Exception: + ret_val = -float('inf') + assert ret_val > 0.0 + + +def test_driver_3(): + """Assert that the tpot_driver() in TPOT driver outputs normal result with verbosity = 2.""" + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'class', + '-g', '1', + '-p', '2', + '-cv', '3', + '-s',' 45', + '-config', 'TPOT light', + '-v', '2' + ] + args = _get_arg_parser().parse_args(args_list) + with captured_output() as (out, err): + tpot_driver(args) + ret_stdout = out.getvalue() + assert "TPOT settings" in ret_stdout + assert "Final Pareto front testing scores" not in ret_stdout + try: + ret_val = float(ret_stdout.split('\n')[-2].split(': ')[-1]) + except Exception: + ret_val = -float('inf') + assert ret_val > 0.0 + + +def test_driver_4(): + """Assert that the tpot_driver() in TPOT driver outputs normal result with verbosity = 3.""" + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'class', + '-g', '1', + '-p', '2', + '-cv', '3', + '-s', '42', + '-config', 'TPOT light', + '-v', '3' + ] + args = _get_arg_parser().parse_args(args_list) + with captured_output() as (out, err): + tpot_driver(args) + ret_stdout = out.getvalue() + + assert "TPOT settings" in ret_stdout + assert "Final Pareto front testing scores" in ret_stdout + try: + ret_val = float(ret_stdout.split('\n')[-2].split('\t')[1]) + except Exception: + ret_val = -float('inf') + assert ret_val > 0.0 + + +def test_driver_5(): + """Assert that 
the tpot_driver() in TPOT driver outputs normal result with exported python file and verbosity = 0.""" + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'class', + '-o', 'test_export.py', + '-g', '1', + '-p', '2', + '-cv', '3', + '-s', '42', + '-config', 'TPOT light', + '-v', '0' + ] + args = _get_arg_parser().parse_args(args_list) + with captured_output() as (out, err): + tpot_driver(args) + ret_stdout = out.getvalue() + + assert ret_stdout == "" + assert path.isfile("test_export.py") + remove("test_export.py") # clean up exported file + + +def test_read_data_file(): + """Assert that _read_data_file raises ValueError when the targe column is missing.""" + # Mis-spelled target + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'clas' # typo for right target 'class' + ] + args = _get_arg_parser().parse_args(args_list) + assert_raises(ValueError, _read_data_file, args=args) + + # Correctly spelled target + args_list = [ + 'tests/tests.csv', + '-is', ',', + '-target', 'class' + ] + args = _get_arg_parser().parse_args(args_list) + input_data = _read_data_file(args) + + assert isinstance(input_data, pd.DataFrame) + + +class ParserTest(TestCase): + def setUp(self): + self.parser = _get_arg_parser() + + + def test_default_param(self): + """Assert that the TPOT driver stores correct default values for all parameters.""" + args = self.parser.parse_args(['tests/tests.csv']) + self.assertEqual(args.CONFIG_FILE, None) + self.assertEqual(args.CROSSOVER_RATE, 0.1) + self.assertEqual(args.EARLY_STOP, None) + self.assertEqual(args.DISABLE_UPDATE_CHECK, False) + self.assertEqual(args.GENERATIONS, 100) + self.assertEqual(args.INPUT_FILE, 'tests/tests.csv') + self.assertEqual(args.INPUT_SEPARATOR, '\t') + self.assertEqual(args.MAX_EVAL_MINS, 5) + self.assertEqual(args.MUTATION_RATE, 0.9) + self.assertEqual(args.NUM_CV_FOLDS, 5) + self.assertEqual(args.NUM_JOBS, 1) + self.assertEqual(args.OFFSPRING_SIZE, None) + self.assertEqual(args.OUTPUT_FILE, '') + self.assertEqual(args.POPULATION_SIZE, 100) + self.assertEqual(args.RANDOM_STATE, None) + self.assertEqual(args.SUBSAMPLE, 1.0) + self.assertEqual(args.SCORING_FN, None) + self.assertEqual(args.TARGET_NAME, 'class') + self.assertEqual(args.TPOT_MODE, 'classification') + self.assertEqual(args.VERBOSITY, 1) + + + def test_print_args(self): + """Assert that _print_args prints correct values for all parameters in default settings.""" + args_list = [ + 'tests/tests.csv', + '-is', ',' + ] + args = self.parser.parse_args(args_list) + with captured_output() as (out, err): + _print_args(args) + output = out.getvalue() + expected_output = """ +TPOT settings: +CHECKPOINT_FOLDER = None +CONFIG_FILE = None +CROSSOVER_RATE = 0.1 +EARLY_STOP = None +GENERATIONS = 100 +INPUT_FILE = tests/tests.csv +INPUT_SEPARATOR = , +MAX_EVAL_MINS = 5 +MAX_TIME_MINS = None +MUTATION_RATE = 0.9 +NUM_CV_FOLDS = 5 +NUM_JOBS = 1 +OFFSPRING_SIZE = 100 +OUTPUT_FILE = +POPULATION_SIZE = 100 +RANDOM_STATE = None +SCORING_FN = accuracy +SUBSAMPLE = 1.0 +TARGET_NAME = class +TPOT_MODE = classification +VERBOSITY = 1 + +""" + print + + self.assertEqual(_sort_lines(expected_output), _sort_lines(output)) + + + def test_print_args_2(self): + """Assert that _print_args prints correct values for all parameters in regression mode.""" + args_list = [ + 'tests/tests.csv', + '-mode', 'regression', + '-is', ',' + ] + args = self.parser.parse_args(args_list) + with captured_output() as (out, err): + _print_args(args) + output = out.getvalue() + expected_output = """ +TPOT settings: 
+CHECKPOINT_FOLDER = None +CONFIG_FILE = None +CROSSOVER_RATE = 0.1 +EARLY_STOP = None +GENERATIONS = 100 +INPUT_FILE = tests/tests.csv +INPUT_SEPARATOR = , +MAX_EVAL_MINS = 5 +MAX_TIME_MINS = None +MUTATION_RATE = 0.9 +NUM_CV_FOLDS = 5 +NUM_JOBS = 1 +OFFSPRING_SIZE = 100 +OUTPUT_FILE = +POPULATION_SIZE = 100 +RANDOM_STATE = None +SCORING_FN = neg_mean_squared_error +SUBSAMPLE = 1.0 +TARGET_NAME = class +TPOT_MODE = regression +VERBOSITY = 1 + +""" + + self.assertEqual(_sort_lines(expected_output), _sort_lines(output)) + + +def _sort_lines(text): + return '\n'.join(sorted(text.split('\n'))) + + +def test_positive_integer(): + """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0.""" + assert_raises(Exception, positive_integer, '-1') + + +def test_positive_integer_2(): + """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0.""" + assert 1 == positive_integer('1') + + +def test_positive_integer_3(): + """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer.""" + assert_raises(Exception, positive_integer, 'foobar') + + +def test_float_range(): + """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0.""" + assert 0.5 == float_range('0.5') + + +def test_float_range_2(): + """Assert that the TPOT CLI interface's float range throws an exception when input it out of range.""" + assert_raises(Exception, float_range, '2.0') + + +def test_float_range_3(): + """Assert that the TPOT CLI interface's float range throws an exception when input is not a float.""" + assert_raises(Exception, float_range, 'foobar') diff --git a/tests/export_tests.py b/tests/export_tests.py new file mode 100644 index 00000000..559f4efb --- /dev/null +++ b/tests/export_tests.py @@ -0,0 +1,567 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . 
+ +""" + +from tqdm import tqdm +import numpy as np +from os import remove, path + +from tpot import TPOTClassifier, TPOTRegressor +from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name +from tpot.operator_utils import TPOTOperatorClassFactory +from tpot.config.classifier import classifier_config_dict + +from sklearn.datasets import load_digits +from sklearn.model_selection import train_test_split +from deap import creator + +from nose.tools import assert_raises, assert_equal + +test_operator_key = 'sklearn.feature_selection.SelectPercentile' + +TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory( + test_operator_key, + classifier_config_dict[test_operator_key] +) + +mnist_data = load_digits() +training_features, testing_features, training_target, testing_target = \ + train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) + + +def test_export_random_ind(): + """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39.""" + tpot_obj = TPOTClassifier(random_state=39) + tpot_obj._pbar = tqdm(total=1, disable=True) + pipeline = tpot_obj._toolbox.individual() + expected_code = """import numpy as np +import pandas as pd +from sklearn.feature_selection import SelectPercentile, f_classif +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.tree import DecisionTreeClassifier + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = make_pipeline( + SelectPercentile(score_func=f_classif, percentile=65), + DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) +) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + print(export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)) + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_export(): + """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists.""" + tpot_obj = TPOTClassifier() + assert_raises(RuntimeError, tpot_obj.export, "test_export.py") + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' + ) + + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj._optimized_pipeline = pipeline + tpot_obj.export("test_export.py") + assert path.isfile("test_export.py") + remove("test_export.py") # clean up exported file + + +def test_generate_pipeline_code(): + """Assert that generate_pipeline_code() returns the correct code given a specific pipeline.""" + tpot_obj = TPOTClassifier() + pipeline = [ + 'KNeighborsClassifier', + [ + 'CombineDFs', + [ + 'GradientBoostingClassifier', + 'input_matrix', + 38.0, + 5, + 5, + 5, + 0.05, + 
0.5], + [ + 'GaussianNB', + [ + 'ZeroCount', + 'input_matrix' + ] + ] + ], + 18, + 'uniform', + 2 + ] + + expected_code = """make_pipeline( + make_union( + StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), + StackingEstimator(estimator=make_pipeline( + ZeroCount(), + GaussianNB() + )) + ), + KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) +)""" + assert expected_code == generate_pipeline_code(pipeline, tpot_obj.operators) + + +def test_generate_pipeline_code_2(): + """Assert that generate_pipeline_code() returns the correct code given a specific pipeline with two CombineDFs.""" + tpot_obj = TPOTClassifier() + pipeline = [ + 'KNeighborsClassifier', + [ + 'CombineDFs', + [ + 'GradientBoostingClassifier', + 'input_matrix', + 38.0, + 5, + 5, + 5, + 0.05, + 0.5], + [ + 'CombineDFs', + [ + 'MinMaxScaler', + 'input_matrix' + ], + ['ZeroCount', + [ + 'MaxAbsScaler', + 'input_matrix' + ] + ] + ] + ], + 18, + 'uniform', + 2 + ] + + expected_code = """make_pipeline( + make_union( + StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), + make_union( + MinMaxScaler(), + make_pipeline( + MaxAbsScaler(), + ZeroCount() + ) + ) + ), + KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) +)""" + + assert expected_code == generate_pipeline_code(pipeline, tpot_obj.operators) + + +def test_generate_import_code(): + """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline.""" + tpot_obj = TPOTClassifier() + pipeline = creator.Individual.from_string('GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset) + + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import GaussianNB +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import RobustScaler +""" + assert expected_code == generate_import_code(pipeline, tpot_obj.operators) + + +def test_generate_import_code_2(): + """Assert that generate_import_code() returns the correct set of dependancies and dependancies are importable.""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' + ) + + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + import_code = generate_import_code(pipeline, tpot_obj.operators) + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline, make_union +from sklearn.tree import DecisionTreeClassifier +from tpot.builtins import StackingEstimator, ZeroCount +""" + exec(import_code) # should not raise error + assert expected_code == import_code + + +def test_operators(): + """Assert that the TPOT operators match the output of their sklearn counterparts.""" + tpot_obj = TPOTClassifier(random_state=42) + for op in tpot_obj.operators: + check_export.description = ("Assert that the TPOT {} 
operator exports " + "as expected".format(op.__name__)) + yield check_export, op, tpot_obj + + +def check_export(op, tpot_obj): + """Assert that a TPOT operator exports as a class constructor.""" + prng = np.random.RandomState(42) + np.random.seed(42) + + args = [] + for type_ in op.parameter_types()[0][1:]: + args.append(prng.choice(tpot_obj._pset.terminals[type_]).value) + export_string = op.export(*args) + + assert export_string.startswith(op.__name__ + "(") and export_string.endswith(")") + + +def test_export_pipeline(): + """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline.""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' + ) + + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + expected_code = """import numpy as np +import pandas as pd +from sklearn.feature_selection import SelectPercentile, f_classif +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline, make_union +from sklearn.tree import DecisionTreeClassifier +from tpot.builtins import StackingEstimator + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = make_pipeline( + make_union( + StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)), + SelectPercentile(score_func=f_classif, percentile=20) + ), + KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") +) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_export_pipeline_2(): + """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline (only one classifier).""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") + 
+exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_export_pipeline_3(): + """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor.""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + + expected_code = """import numpy as np +import pandas as pd +from sklearn.feature_selection import SelectPercentile, f_classif +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.tree import DecisionTreeClassifier + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = make_pipeline( + SelectPercentile(score_func=f_classif, percentile=20), + DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) +) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_export_pipeline_4(): + """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with input_matrix in CombineDFs.""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5),input_matrix)' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' + ) + + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline, make_union +from sklearn.tree import DecisionTreeClassifier +from tpot.builtins import StackingEstimator +from sklearn.preprocessing import FunctionTransformer +from copy import copy + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = make_pipeline( + make_union( + StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)), + FunctionTransformer(copy) + ), + KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") +) + 
+exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_export_pipeline_5(): + """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with SelectFromModel.""" + tpot_obj = TPOTRegressor() + pipeline_string = ( + 'DecisionTreeRegressor(SelectFromModel(input_matrix, ' + 'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, ' + 'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,' + 'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + expected_code = """import numpy as np +import pandas as pd +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.feature_selection import SelectFromModel +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.tree import DecisionTreeRegressor + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +exported_pipeline = make_pipeline( + SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.05, n_estimators=100), threshold=0.05), + DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, min_samples_split=5) +) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + + +def test_operator_export(): + """Assert that a TPOT operator can export properly with a function as a parameter to a classifier.""" + export_string = TPOTSelectPercentile.export(5) + assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)" + + +def test_get_by_name(): + """Assert that the Operator class returns operators by name appropriately.""" + tpot_obj = TPOTClassifier() + assert get_by_name("SelectPercentile", tpot_obj.operators).__class__ == TPOTSelectPercentile.__class__ + + +def test_get_by_name_2(): + """Assert that get_by_name raises TypeError with a incorrect operator name.""" + tpot_obj = TPOTClassifier() + assert_raises(TypeError, get_by_name, "RandomForestRegressor", tpot_obj.operators) + # use correct name + ret_op_class = get_by_name("RandomForestClassifier", tpot_obj.operators) + + +def test_get_by_name_3(): + """Assert that get_by_name raises ValueError with duplicate operators in operator dictionary.""" + tpot_obj = TPOTClassifier() + # no duplicate + ret_op_class = get_by_name("SelectPercentile", tpot_obj.operators) + # add a copy of TPOTSelectPercentile into operator list + tpot_obj.operators.append(TPOTSelectPercentile) + assert_raises(ValueError, get_by_name, "SelectPercentile", tpot_obj.operators) + + +def test_indent(): + """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line.""" + multiline_string = """test +test1 +test2 +test3""" + + indented_multiline_string = """ test + test1 + test2 + test3""" + + assert indented_multiline_string == _indent(multiline_string, 4) + + +def 
test_pipeline_score_save(): + """Assert that the TPOTClassifier can generate a scored pipeline export correctly.""" + tpot_obj = TPOTClassifier(random_state=39) + tpot_obj._pbar = tqdm(total=1, disable=True) + pipeline = tpot_obj._toolbox.individual() + expected_code = """import numpy as np +import pandas as pd +from sklearn.feature_selection import SelectPercentile, f_classif +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.tree import DecisionTreeClassifier + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +# Score on the training set was:0.929813743 +exported_pipeline = make_pipeline( + SelectPercentile(score_func=f_classif, percentile=65), + DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) +) + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + + assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743)) + + +def test_imputer_in_export(): + """Assert that TPOT exports a pipeline with an imputation step if imputation was used in fit().""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + features_with_nan = np.copy(training_features) + features_with_nan[0][0] = float('nan') + + tpot_obj.fit(features_with_nan, training_target) + # use fixed pipeline since the random.seed() performs differently in python 2.* and 3.* + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + + export_code = export_pipeline(tpot_obj._optimized_pipeline, tpot_obj.operators, tpot_obj._pset, tpot_obj._imputed) + + expected_code = """import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import Imputer + +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values +training_features, testing_features, training_target, testing_target = \\ + train_test_split(features, tpot_data['target'].values, random_state=42) + +imputer = Imputer(strategy="median") +imputer.fit(training_features) +training_features = imputer.transform(training_features) +testing_features = imputer.transform(testing_features) + +exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") + +exported_pipeline.fit(training_features, training_target) +results = exported_pipeline.predict(testing_features) +""" + + assert_equal(export_code, expected_code) diff --git a/tests/one_hot_encoder_tests.py b/tests/one_hot_encoder_tests.py new file mode 100644 index 00000000..b6769d0f --- /dev/null +++ b/tests/one_hot_encoder_tests.py @@ -0,0 +1,311 @@ +# -*- coding: utf-8 -*- + +""" +Copyright (c) 
2014, Matthias Feurer +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" + +import numpy as np +import scipy.sparse +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.datasets import load_iris, load_boston +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import make_pipeline +from sklearn.model_selection import cross_val_score, KFold +from nose.tools import assert_equal + +from tpot.builtins.one_hot_encoder import OneHotEncoder, _auto_select_categorical_features + + +iris_data = load_iris().data + +dense1 = np.array([[0, 1, 0], + [0, 0, 0], + [1, 1, 0]]) +dense1_1h = np.array([[1, 0, 0, 1, 1], + [1, 0, 1, 0, 1], + [0, 1, 0, 1, 1]]) +dense1_1h_minimum_fraction = np.array([[0, 1, 0, 1, 1], + [0, 1, 1, 0, 1], + [1, 0, 0, 1, 1]]) + +# Including NaNs +dense2 = np.array([[0, np.NaN, 0], + [np.NaN, 0, 2], + [1, 1, 1], + [np.NaN, 0, 1]]) +dense2_1h = np.array([[0, 1, 0, 1, 0, 0, 1, 0, 0], + [1, 0, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1, 0, 1, 0], + [1, 0, 0, 0, 1, 0, 0, 1, 0]]) + +dense2_1h_minimum_fraction = np.array([[1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 1, 0], + [1, 0, 1, 0, 0, 1], + [0, 1, 0, 1, 0, 1]]) + +dense2_partial_1h = np.array([[0., 1., 0., 1., 0., 0., 0.], + [1., 0., 0., 0., 1., 0., 2.], + [0., 0., 1., 0., 0., 1., 1.], + [1., 0., 0., 0., 1., 0., 1.]]) + +dense2_1h_minimum_fraction_as_sparse = np.array([[0, 0, 1, 0, 0, 0], + [0, 1, 0, 0, 1, 0], + [1, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1]]) + +# All NaN slice +dense3 = np.array([[0, 1, np.NaN], + [1, 0, np.NaN]]) +dense3_1h = np.array([[1, 0, 0, 1, 1], + [0, 1, 1, 0, 1]]) + +sparse1 = scipy.sparse.csc_matrix(([3, 2, 1, 1, 2, 3], + ((1, 4, 5, 2, 3, 5), + (0, 0, 0, 1, 1, 1))), shape=(6, 2)) +sparse1_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1], + ((5, 4, 1, 2, 3, 5), + (0, 1, 2, 3, 4, 5))), shape=(6, 6)) +sparse1_paratial_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 2, 3], + ((5, 4, 1, 2, 3, 5), + (0, 1, 2, 3, 3, 3))), + shape=(6, 4)) + +# All zeros slice +sparse2 = scipy.sparse.csc_matrix(([2, 1, 0, 0, 0, 0], + ((1, 4, 5, 2, 3, 5), + (0, 0, 0, 1, 1, 1))), shape=(6, 2)) +sparse2_1h = scipy.sparse.csc_matrix(([1, 1, 1, 1, 1, 1], + ((5, 4, 
1, 2, 3, 5), + (0, 1, 2, 3, 3, 3))), shape=(6, 4)) + +sparse2_csr = scipy.sparse.csr_matrix(([2, 1, 0, 0, 0, 0], + ((1, 4, 5, 2, 3, 5), + (0, 0, 0, 1, 1, 1))), shape=(6, 2)) +sparse2_csr_1h = scipy.sparse.csr_matrix(([1, 1, 1, 1, 1, 1], + ((5, 4, 1, 2, 3, 5), + (0, 1, 2, 3, 3, 3))), shape=(6, 4)) + + +def fit_then_transform(expected, input, categorical_features='all', + minimum_fraction=None): + # Test fit_transform + ohe = OneHotEncoder(categorical_features=categorical_features, + minimum_fraction=minimum_fraction) + transformation = ohe.fit_transform(input.copy()) + assert_array_almost_equal(expected.astype(float), + transformation.todense()) + + # Test fit, and afterwards transform + ohe2 = OneHotEncoder(categorical_features=categorical_features, + minimum_fraction=minimum_fraction) + ohe2.fit(input.copy()) + transformation = ohe2.transform(input.copy()) + assert_array_almost_equal(expected, transformation.todense()) + + +def fit_then_transform_dense(expected, input, + categorical_features='all', + minimum_fraction=None): + ohe = OneHotEncoder(categorical_features=categorical_features, + sparse=False, minimum_fraction=minimum_fraction) + transformation = ohe.fit_transform(input.copy()) + assert_array_almost_equal(expected, transformation) + + ohe2 = OneHotEncoder(categorical_features=categorical_features, + sparse=False, minimum_fraction=minimum_fraction) + ohe2.fit(input.copy()) + transformation = ohe2.transform(input.copy()) + assert_array_almost_equal(expected, transformation) + + +def test_auto_detect_categorical(): + """Assert that automatic selection of categorical features works as expected with a threshold of 10.""" + selected = _auto_select_categorical_features(iris_data[0:16, :], threshold=10) + expected = [False, False, True, True] + assert_equal(selected, expected) + + +def test_dense1(): + """Test fit_transform a dense matrix.""" + fit_then_transform(dense1_1h, dense1) + fit_then_transform_dense(dense1_1h, dense1) + + +def test_dense1_minimum_fraction(): + """Test fit_transform a dense matrix with minimum_fraction=0.5.""" + fit_then_transform(dense1_1h_minimum_fraction, dense1, minimum_fraction=0.5) + fit_then_transform_dense(dense1_1h_minimum_fraction, dense1, minimum_fraction=0.5) + + +def test_dense2(): + """Test fit_transform a dense matrix including NaNs.""" + fit_then_transform(dense2_1h, dense2) + fit_then_transform_dense(dense2_1h, dense2) + + +def test_dense2_minimum_fraction(): + """Test fit_transform a dense matrix including NaNs with minimum_fraction=0.5""" + fit_then_transform( + dense2_1h_minimum_fraction, + dense2, + minimum_fraction=0.3 + ) + fit_then_transform_dense( + dense2_1h_minimum_fraction, + dense2, + minimum_fraction=0.3 + ) + + +def test_dense2_with_non_sparse_components(): + """Test fit_transform a dense matrix including NaNs with specifying categorical_features.""" + fit_then_transform( + dense2_partial_1h, + dense2, + categorical_features=[True, True, False] + ) + fit_then_transform_dense( + dense2_partial_1h, + dense2, + categorical_features=[True, True, False] + ) + + +def test_sparse_on_dense2_minimum_fraction(): + """Test fit_transform a dense matrix with minimum_fraction as sparse""" + sparse = scipy.sparse.csr_matrix(dense2) + fit_then_transform( + dense2_1h_minimum_fraction_as_sparse, + sparse, + minimum_fraction=0.5 + ) + fit_then_transform_dense( + dense2_1h_minimum_fraction_as_sparse, + sparse, + minimum_fraction=0.5 + ) + + +# Minimum fraction is not too interesting here... 
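+
+# A rough reading of what the minimum_fraction cases above pin down (see
+# tpot/builtins/one_hot_encoder.py for the authoritative rule): values that
+# occur in less than the requested fraction of rows appear to be remapped to a
+# shared infrequent level before the indicator columns are built, e.g.
+#
+#     ohe = OneHotEncoder(minimum_fraction=0.5, sparse=False)
+#     ohe.fit_transform(dense1.copy())   # compare against dense1_1h_minimum_fraction
+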
+def test_dense3(): + """Test fit_transform a dense matrix including all NaN slice.""" + fit_then_transform(dense3_1h, dense3) + fit_then_transform_dense(dense3_1h, dense3) + + +def test_sparse1(): + """Test fit_transform a sparse matrix.""" + fit_then_transform(sparse1_1h.todense(), sparse1) + fit_then_transform_dense(sparse1_1h.todense(), sparse1) + + +def test_sparse1_minimum_fraction(): + """Test fit_transform a sparse matrix with minimum_fraction=0.5.""" + expected = np.array([[0, 1, 0, 0, 1, 1], + [0, 0, 1, 1, 0, 1]], dtype=float).transpose() + fit_then_transform( + expected, + sparse1, + minimum_fraction=0.5 + ) + fit_then_transform_dense( + expected, + sparse1, + minimum_fraction=0.5 + ) + + +def test_sparse1_with_non_sparse_components(): + """Test fit_transform a sparse matrix with specifying categorical_features.""" + fit_then_transform( + sparse1_paratial_1h.todense(), + sparse1, + categorical_features=[True, False] + ) + + +def test_sparse2(): + """Test fit_transform a sparse matrix including all zeros slice.""" + fit_then_transform(sparse2_1h.todense(), sparse2) + fit_then_transform_dense(sparse2_1h.todense(), sparse2) + + +def test_sparse2_minimum_fraction(): + """Test fit_transform a sparse matrix including all zeros slice with minimum_fraction=0.5.""" + expected = np.array([[0, 1, 0, 0, 1, 1], + [0, 0, 1, 1, 0, 1]], dtype=float).transpose() + fit_then_transform( + expected, + sparse2, + minimum_fraction=0.5 + ) + fit_then_transform_dense( + expected, + sparse2, + minimum_fraction=0.5 + ) + + +def test_sparse2_csr(): + """Test fit_transform another sparse matrix including all zeros slice.""" + fit_then_transform(sparse2_csr_1h.todense(), sparse2_csr) + fit_then_transform_dense(sparse2_csr_1h.todense(), sparse2_csr) + + +def test_transform(): + """Test OneHotEncoder with both dense and sparse matrixes.""" + input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose() + ohe = OneHotEncoder() + ohe.fit(input) + test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose() + output = ohe.transform(test_data).todense() + assert np.sum(output) == 5 + + input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose() + ips = scipy.sparse.csr_matrix(input) + ohe = OneHotEncoder() + ohe.fit(ips) + test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose() + tds = scipy.sparse.csr_matrix(test_data) + output = ohe.transform(tds).todense() + assert np.sum(output) == 3 + + +def test_k_fold_cv(): + """Test OneHotEncoder with categorical_features='auto'.""" + boston = load_boston() + + clf = make_pipeline( + OneHotEncoder( + categorical_features='auto', + sparse=False, + minimum_fraction=0.05 + ), + LinearRegression() + ) + + cross_val_score(clf, boston.data, boston.target, cv=KFold(n_splits=10, shuffle=True)) diff --git a/tests/stacking_estimator_tests.py b/tests/stacking_estimator_tests.py new file mode 100644 index 00000000..b57232ab --- /dev/null +++ b/tests/stacking_estimator_tests.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np +from tpot.builtins import StackingEstimator +from sklearn.linear_model import LogisticRegression, Lasso +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.pipeline import make_pipeline +from tpot_tests import training_features, training_target, training_features_r, training_target_r +from sklearn.model_selection import cross_val_score + +def test_StackingEstimator_1(): + """Assert that the StackingEstimator returns transformed X with synthetic features in classification.""" + clf = RandomForestClassifier(random_state=42) + stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42)) + # fit + clf.fit(training_features, training_target) + stack_clf.fit(training_features, training_target) + # get transformd X + X_clf_transformed = stack_clf.transform(training_features) + + assert np.allclose(clf.predict(training_features), X_clf_transformed[:, 0]) + assert np.allclose(clf.predict_proba(training_features), X_clf_transformed[:, 1:1 + len(np.unique(training_target))]) + + +def test_StackingEstimator_2(): + """Assert that the StackingEstimator returns transformed X with a synthetic feature in regression.""" + reg = RandomForestRegressor(random_state=42) + stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42)) + # fit + reg.fit(training_features_r, training_target_r) + stack_reg.fit(training_features_r, training_target_r) + # get transformd X + X_reg_transformed = stack_reg.transform(training_features_r) + + assert np.allclose(reg.predict(training_features_r), X_reg_transformed[:, 0]) + + +def test_StackingEstimator_3(): + """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in classification.""" + stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42)) + meta_clf = LogisticRegression() + sklearn_pipeline = make_pipeline(stack_clf, meta_clf) + # fit in pipeline + sklearn_pipeline.fit(training_features, training_target) + # fit step by step + stack_clf.fit(training_features, training_target) + X_clf_transformed = stack_clf.transform(training_features) + meta_clf.fit(X_clf_transformed, training_target) + # scoring + score = meta_clf.score(X_clf_transformed, training_target) + pipeline_score = sklearn_pipeline.score(training_features, training_target) + assert np.allclose(score, pipeline_score) + + # test cv score + cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features, training_target, cv=3, scoring='accuracy')) + + known_cv_score = 0.947282375315 + + assert np.allclose(known_cv_score, cv_score) + + +def test_StackingEstimator_4(): + """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in regression.""" + stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42)) + meta_reg = Lasso(random_state=42) + sklearn_pipeline = make_pipeline(stack_reg, meta_reg) + # fit in pipeline + sklearn_pipeline.fit(training_features_r, training_target_r) + # fit step by step + stack_reg.fit(training_features_r, training_target_r) + X_reg_transformed = stack_reg.transform(training_features_r) + meta_reg.fit(X_reg_transformed, training_target_r) + # scoring + score = meta_reg.score(X_reg_transformed, training_target_r) + pipeline_score = sklearn_pipeline.score(training_features_r, training_target_r) + assert 
np.allclose(score, pipeline_score) + + # test cv score + cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features_r, training_target_r, cv=3, scoring='r2')) + known_cv_score = 0.795877470354 + + assert np.allclose(known_cv_score, cv_score) diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 00000000..d1834e52 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +tpot_config = { + 'sklearn.naive_bayes.GaussianNB': { + }, + + 'sklearn.naive_bayes.BernoulliNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.naive_bayes.MultinomialNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + } +} diff --git a/tests/test_config.py.bad b/tests/test_config.py.bad new file mode 100644 index 00000000..528add77 --- /dev/null +++ b/tests/test_config.py.bad @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +tpot_config = { + 'sklearn.naive_bayes.GaussianNB': { + }, + + 'sklearn.naive_bayes.BernoulliNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.naive_bayes.MultinomialNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.]# miss a "," here + 'fit_prior': [True, False] + } +} diff --git a/tests/test_config_sparse.py b/tests/test_config_sparse.py new file mode 100644 index 00000000..29a635b8 --- /dev/null +++ b/tests/test_config_sparse.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . 
+ +""" + +import numpy as np + +tpot_config = { + 'sklearn.neighbors.KNeighborsClassifier': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform", "distance"], + 'p': [1, 2] + }, + + 'sklearn.ensemble.RandomForestClassifier': { + 'n_estimators': [100], + 'criterion': ["gini", "entropy"], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + } +} diff --git a/tests.csv b/tests/tests.csv similarity index 100% rename from tests.csv rename to tests/tests.csv diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py new file mode 100644 index 00000000..0090593c --- /dev/null +++ b/tests/tpot_tests.py @@ -0,0 +1,1654 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +from tpot import TPOTClassifier, TPOTRegressor +from tpot.base import TPOTBase +from tpot.driver import float_range +from tpot.gp_types import Output_Array +from tpot.gp_deap import mutNodeReplacement, _wrapped_cross_val_score, pick_two_individuals_eligible_for_crossover, cxOnePoint, varOr +from tpot.metrics import balanced_accuracy +from tpot.operator_utils import TPOTOperatorClassFactory, set_sample_weight + +from tpot.config.classifier import classifier_config_dict +from tpot.config.classifier_light import classifier_config_dict_light +from tpot.config.regressor_light import regressor_config_dict_light +from tpot.config.classifier_mdr import tpot_mdr_classifier_config_dict +from tpot.config.regressor_mdr import tpot_mdr_regressor_config_dict +from tpot.config.regressor_sparse import regressor_config_sparse +from tpot.config.classifier_sparse import classifier_config_sparse + +import numpy as np +from scipy import sparse +import inspect +import random +from multiprocessing import cpu_count +import os +from re import search +from datetime import datetime +from time import sleep + +from sklearn.datasets import load_digits, load_boston +from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold +from deap import creator +from deap.tools import ParetoFront +from nose.tools import assert_raises, assert_not_equal, assert_greater_equal, assert_equal, assert_in +from driver_tests import captured_output + +from tqdm import tqdm + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +# Ensure we can use `with closing(...) as ... 
:` syntax +if getattr(StringIO, '__exit__', False) and \ + getattr(StringIO, '__enter__', False): + def closing(arg): + return arg +else: + from contextlib import closing + +# Set up the MNIST data set for testing +mnist_data = load_digits() +training_features, testing_features, training_target, testing_target = \ + train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) + +# Set up the Boston data set for testing +boston_data = load_boston() +training_features_r, testing_features_r, training_target_r, testing_target_r = \ + train_test_split(boston_data.data, boston_data.target, random_state=42) + +# Set up the sparse matrix for testing +sparse_features = sparse.csr_matrix(training_features) +sparse_target = training_target + +np.random.seed(42) +random.seed(42) + +test_operator_key = 'sklearn.feature_selection.SelectPercentile' +TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory( + test_operator_key, + classifier_config_dict[test_operator_key] +) + + +def test_init_custom_parameters(): + """Assert that the TPOT instantiator stores the TPOT variables properly.""" + tpot_obj = TPOTClassifier( + population_size=500, + generations=1000, + offspring_size=2000, + mutation_rate=0.05, + crossover_rate=0.9, + scoring='accuracy', + cv=10, + verbosity=1, + random_state=42, + disable_update_check=True, + warm_start=True + ) + + assert tpot_obj.population_size == 500 + assert tpot_obj.generations == 1000 + assert tpot_obj.offspring_size == 2000 + assert tpot_obj.mutation_rate == 0.05 + assert tpot_obj.crossover_rate == 0.9 + assert tpot_obj.scoring_function == 'accuracy' + assert tpot_obj.cv == 10 + assert tpot_obj.max_time_mins is None + assert tpot_obj.warm_start is True + assert tpot_obj.verbosity == 1 + assert tpot_obj._optimized_pipeline is None + assert tpot_obj.fitted_pipeline_ is None + assert not (tpot_obj._pset is None) + assert not (tpot_obj._toolbox is None) + + +def test_init_default_scoring(): + """Assert that TPOT intitializes with the correct default scoring function.""" + tpot_obj = TPOTRegressor() + assert tpot_obj.scoring_function == 'neg_mean_squared_error' + + tpot_obj = TPOTClassifier() + assert tpot_obj.scoring_function == 'accuracy' + + +def test_init_default_scoring_2(): + """Assert that TPOT intitializes with the correct customized scoring function.""" + tpot_obj = TPOTClassifier(scoring=balanced_accuracy) + assert tpot_obj.scoring_function == 'balanced_accuracy' + + +def test_invalid_score_warning(): + """Assert that the TPOT intitializes raises a ValueError when the scoring metrics is not available in SCORERS.""" + # Mis-spelled scorer + assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray') + # Correctly spelled + TPOTClassifier(scoring='balanced_accuracy') + + +def test_invalid_dataset_warning(): + """Assert that the TPOT fit function raises a ValueError when dataset is not in right format.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0 + ) + # common mistake in target + bad_training_target = training_target.reshape((1, len(training_target))) + assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_target) + + +def test_invalid_subsample_ratio_warning(): + """Assert that the TPOT intitializes raises a ValueError when subsample ratio is not in the range (0.0, 1.0].""" + # Invalid ratio + assert_raises(ValueError, TPOTClassifier, subsample=0.0) + # Valid ratio + TPOTClassifier(subsample=0.1) + 
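+
+# For orientation: the tests in this module exercise, piece by piece, the basic
+# fit/score/export workflow sketched below (illustrative settings and output
+# filename only, chosen to keep a run short; this is not one of the test cases):
+#
+#     tpot = TPOTClassifier(generations=1, population_size=2, random_state=42,
+#                           config_dict='TPOT light', verbosity=0)
+#     tpot.fit(training_features, training_target)
+#     print(tpot.score(testing_features, testing_target))
+#     tpot.export('tpot_digits_pipeline.py')
+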
+
+def test_invalid_mut_rate_plus_xo_rate():
+    """Assert that TPOT initialization raises a ValueError when the sum of the crossover and mutation rates is larger than 1."""
+    # Invalid ratio
+    assert_raises(ValueError, TPOTClassifier, mutation_rate=0.8, crossover_rate=0.8)
+    # Valid ratio
+    TPOTClassifier(mutation_rate=0.8, crossover_rate=0.1)
+
+
+def test_init_max_time_mins():
+    """Assert that the TPOT init stores max run time and sets generations to 1000000."""
+    tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
+
+    assert tpot_obj.generations == 1000000
+    assert tpot_obj.max_time_mins == 30
+
+
+def test_init_n_jobs():
+    """Assert that the TPOT init stores the current number of processes."""
+    tpot_obj = TPOTClassifier(n_jobs=2)
+    assert tpot_obj.n_jobs == 2
+
+    tpot_obj = TPOTClassifier(n_jobs=-1)
+    assert tpot_obj.n_jobs == cpu_count()
+
+
+def test_timeout():
+    """Assert that _wrapped_cross_val_score returns "Timeout" when the evaluation exceeds the time limit."""
+    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
+    # a complex pipeline for the test
+    pipeline_string = (
+        "ExtraTreesRegressor("
+        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
+        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
+        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
+        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
+        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
+        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
+        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
+        "ExtraTreesRegressor__n_estimators=100)"
+    )
+    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
+    tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
+    # test _wrapped_cross_val_score with cv=20 so that it cannot finish within the 1-second timeout
+    return_value = _wrapped_cross_val_score(tpot_obj.fitted_pipeline_,
+                                            training_features_r,
+                                            training_target_r,
+                                            cv=20,
+                                            scoring_function='neg_mean_squared_error',
+                                            sample_weight=None,
+                                            groups=None,
+                                            timeout=1)
+    assert return_value == "Timeout"
+
+
+def test_invalid_pipeline():
+    """Assert that _wrapped_cross_val_score returns -float('inf') for an invalid pipeline."""
+    tpot_obj = TPOTClassifier()
+    # an invalid pipeline: the dual formulation of LogisticRegression is only
+    # implemented for the l2 penalty, so dual=True with penalty=l1 cannot be fit
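+    # For reference, the same failure can be reproduced directly in scikit-learn
+    # (hedged sketch; X, y stand for any classification data and the exact error
+    # message depends on the scikit-learn version):
+    #
+    #     from sklearn.linear_model import LogisticRegression
+    #     LogisticRegression(C=10.0, dual=True, penalty='l1').fit(X, y)
+    #     # raises ValueError because dual=True is only supported with penalty='l2'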
+ pipeline_string = ( + 'LogisticRegression(input_matrix, LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=True, LogisticRegression__penalty=l1)' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + # test _wrapped_cross_val_score with cv=20 so that it is impossible to finish in 1 second + return_value = _wrapped_cross_val_score(tpot_obj.fitted_pipeline_, + training_features, + training_target, + cv=5, + scoring_function='accuracy', + sample_weight=None, + groups=None, + timeout=300) + assert return_value == -float('inf') + + +def test_balanced_accuracy(): + """Assert that the balanced_accuracy in TPOT returns correct accuracy.""" + y_true = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4]) + y_pred1 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4]) + y_pred2 = np.array([3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4]) + accuracy_score1 = balanced_accuracy(y_true, y_pred1) + accuracy_score2 = balanced_accuracy(y_true, y_pred2) + assert np.allclose(accuracy_score1, 1.0) + assert np.allclose(accuracy_score2, 0.833333333333333) + + +def test_get_params(): + """Assert that get_params returns the exact dictionary of parameters used by TPOT.""" + kwargs = { + 'population_size': 500, + 'generations': 1000, + 'config_dict': 'TPOT light', + 'offspring_size': 2000, + 'verbosity': 1 + } + + tpot_obj = TPOTClassifier(**kwargs) + # Get default parameters of TPOT and merge with our specified parameters + initializer = inspect.getargspec(TPOTBase.__init__) + default_kwargs = dict(zip(initializer.args[1:], initializer.defaults)) + default_kwargs.update(kwargs) + # update to dictionary instead of input string + default_kwargs.update({'config_dict': classifier_config_dict_light}) + assert tpot_obj.get_params()['config_dict'] == default_kwargs['config_dict'] + assert tpot_obj.get_params() == default_kwargs + + +def test_set_params(): + """Assert that set_params returns a reference to the TPOT instance.""" + tpot_obj = TPOTClassifier() + assert tpot_obj.set_params() is tpot_obj + + +def test_set_params_2(): + """Assert that set_params updates TPOT's instance variables.""" + tpot_obj = TPOTClassifier(generations=2) + tpot_obj.set_params(generations=3) + + assert tpot_obj.generations == 3 + + +def test_TPOTBase(): + """Assert that TPOTBase class raises RuntimeError when using it directly.""" + assert_raises(RuntimeError, TPOTBase) + + +def test_conf_dict(): + """Assert that TPOT uses the pre-configured dictionary of operators when config_dict is 'TPOT light' or 'TPOT MDR'.""" + tpot_obj = TPOTClassifier(config_dict='TPOT light') + assert tpot_obj.config_dict == classifier_config_dict_light + + tpot_obj = TPOTClassifier(config_dict='TPOT MDR') + assert tpot_obj.config_dict == tpot_mdr_classifier_config_dict + + tpot_obj = TPOTClassifier(config_dict='TPOT sparse') + assert tpot_obj.config_dict == classifier_config_sparse + + tpot_obj = TPOTRegressor(config_dict='TPOT light') + assert tpot_obj.config_dict == regressor_config_dict_light + + tpot_obj = TPOTRegressor(config_dict='TPOT MDR') + assert tpot_obj.config_dict == tpot_mdr_regressor_config_dict + + tpot_obj = TPOTRegressor(config_dict='TPOT sparse') + assert tpot_obj.config_dict == regressor_config_sparse + + +def test_conf_dict_2(): + """Assert that TPOT uses a custom dictionary of operators when config_dict is Python dictionary.""" + tpot_obj = 
TPOTClassifier(config_dict=tpot_mdr_classifier_config_dict) + assert tpot_obj.config_dict == tpot_mdr_classifier_config_dict + + +def test_conf_dict_3(): + """Assert that TPOT uses a custom dictionary of operators when config_dict is the path of Python dictionary.""" + tpot_obj = TPOTRegressor(config_dict='tests/test_config.py') + tested_config_dict = { + 'sklearn.naive_bayes.GaussianNB': { + }, + + 'sklearn.naive_bayes.BernoulliNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.naive_bayes.MultinomialNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + } + } + assert isinstance(tpot_obj.config_dict, dict) + assert tpot_obj.config_dict == tested_config_dict + + +def test_read_config_file(): + """Assert that _read_config_file rasies FileNotFoundError with a wrong path.""" + tpot_obj = TPOTRegressor() + # typo for "tests/test_config.py" + assert_raises(ValueError, tpot_obj._read_config_file, "tests/test_confg.py") + + +def test_read_config_file_2(): + """Assert that _read_config_file rasies ValueError with wrong dictionary format""" + tpot_obj = TPOTRegressor() + assert_raises(ValueError, tpot_obj._read_config_file, "tests/test_config.py.bad") + + +def test_read_config_file_3(): + """Assert that _read_config_file rasies ValueError without a dictionary named 'tpot_config'.""" + tpot_obj = TPOTRegressor() + assert_raises(ValueError, tpot_obj._setup_config, "tpot/config/regressor_sparse.py") + + +def test_random_ind(): + """Assert that the TPOTClassifier can generate the same pipeline with same random seed.""" + tpot_obj = TPOTClassifier(random_state=43) + pipeline1 = str(tpot_obj._toolbox.individual()) + tpot_obj = TPOTClassifier(random_state=43) + pipeline2 = str(tpot_obj._toolbox.individual()) + assert pipeline1 == pipeline2 + + +def test_random_ind_2(): + """Assert that the TPOTRegressor can generate the same pipeline with same random seed.""" + tpot_obj = TPOTRegressor(random_state=43) + pipeline1 = str(tpot_obj._toolbox.individual()) + tpot_obj = TPOTRegressor(random_state=43) + pipeline2 = str(tpot_obj._toolbox.individual()) + + assert pipeline1 == pipeline2 + + +def test_score(): + """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists.""" + tpot_obj = TPOTClassifier() + + assert_raises(RuntimeError, tpot_obj.score, testing_features, testing_target) + + +def test_score_2(): + """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline.""" + tpot_obj = TPOTClassifier(random_state=34) + known_score = 0.977777777778 # Assumes use of the TPOT accuracy function + + # Create a pipeline with a known score + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_target) + # Get score from TPOT + score = tpot_obj.score(testing_features, testing_target) + + assert np.allclose(known_score, score) + + +def test_score_3(): + """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline.""" + tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error', random_state=72) + known_score = 12.1791953611 + + # Reify pipeline with known score + 
pipeline_string = ( + "ExtraTreesRegressor(" + "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," + "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," + "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," + "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5," + "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," + "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," + "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " + "ExtraTreesRegressor__n_estimators=100)" + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r) + + # Get score from TPOT + score = tpot_obj.score(testing_features_r, testing_target_r) + + assert np.allclose(known_score, score) + + +def test_sample_weight_func(): + """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights.""" + tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') + + # Reify pipeline with known scor + pipeline_string = ( + "ExtraTreesRegressor(" + "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," + "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," + "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," + "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5," + "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," + "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," + "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " + "ExtraTreesRegressor__n_estimators=100)" + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r) + + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + + # make up a sample weight + training_target_r_weight = np.array(range(1, len(training_target_r)+1)) + training_target_r_weight_dict = set_sample_weight(tpot_obj.fitted_pipeline_.steps, training_target_r_weight) + + np.random.seed(42) + cv_score1 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error') + + np.random.seed(42) + cv_score2 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error') + + np.random.seed(42) + cv_score_weight = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_target_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_target_r_weight_dict) + + np.random.seed(42) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r, **training_target_r_weight_dict) + # Get score from TPOT + known_score = 11.5790430757 + score = tpot_obj.score(testing_features_r, testing_target_r) + + assert np.allclose(cv_score1, cv_score2) + 
assert not np.allclose(cv_score1, cv_score_weight) + assert np.allclose(known_score, score) + + +def test_fit_GroupKFold(): + """Assert that TPOT properly handles the group parameter when using GroupKFold.""" + # This check tests if the darker MNIST images would generalize to the lighter ones. + means = np.mean(training_features, axis=1) + groups = means >= np.median(means) + + tpot_obj = TPOTClassifier( + random_state=42, + population_size=2, + offspring_size=4, + generations=1, + verbosity=0, + config_dict='TPOT light', + cv=GroupKFold(n_splits=2), + ) + tpot_obj.fit(training_features, training_target, groups=groups) + + assert_greater_equal(tpot_obj.score(testing_features, testing_target), 0.97) + + +def test_predict(): + """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists.""" + tpot_obj = TPOTClassifier() + + assert_raises(RuntimeError, tpot_obj.predict, testing_features) + + +def test_predict_2(): + """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,).""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5' + ')' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_target) + result = tpot_obj.predict(testing_features) + + assert result.shape == (testing_features.shape[0],) + + +def test_predict_proba(): + """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_target).""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_target) + + result = tpot_obj.predict_proba(testing_features) + num_labels = np.amax(testing_target) + 1 + + assert result.shape == (testing_features.shape[0], num_labels) + + +def test_predict_proba_2(): + """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float).""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_target) + + result = tpot_obj.predict_proba(testing_features) + rows, columns = result.shape + + for i in range(rows): + for j in range(columns): + float_range(result[i][j]) + + +def test_predict_proba_3(): + """Assert that the TPOT predict_proba function raises a RuntimeError when no optimized 
pipeline exists.""" + tpot_obj = TPOTClassifier() + + assert_raises(RuntimeError, tpot_obj.predict_proba, testing_features) + + + def test_predict_proba_4(): + """Assert that the TPOT predict_proba function raises a RuntimeError when the optimized pipeline does not have the predict_proba() function.""" + tpot_obj = TPOTRegressor() + pipeline_string = ( + "ExtraTreesRegressor(input_matrix, " + "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," + "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " + "ExtraTreesRegressor__n_estimators=100)" + ) + tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_target_r) + + assert_raises(RuntimeError, tpot_obj.predict_proba, testing_features) + + + def test_warm_start(): + """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run.""" + tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True) + tpot_obj.fit(training_features, training_target) + + assert tpot_obj._pop is not None + assert tpot_obj._pareto_front is not None + + first_pop = tpot_obj._pop + tpot_obj.random_state = 21 + tpot_obj.fit(training_features, training_target) + + assert tpot_obj._pop == first_pop + + + def test_fit(): + """Assert that the TPOT fit function provides an optimized pipeline.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0 + ) + tpot_obj.fit(training_features, training_target) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + + def test_fit_2(): + """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT light'.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + + def test_fit_3(): + """Assert that the TPOT fit function provides an optimized pipeline with subsample of 0.8.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + subsample=0.8, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + + def test_fit_4(): + """Assert that the TPOT fit function provides an optimized pipeline with max_time_mins of 2 seconds.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=2, + generations=1, + verbosity=0, + max_time_mins=2/60., + config_dict='TPOT light' + ) + assert tpot_obj.generations == 1000000 + + # reset generations to 20 so that a failing test does not take too much time + tpot_obj.generations = 20 + + tpot_obj.fit(training_features, training_target) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + + def test_check_periodic_pipeline(): + """Assert that the _check_periodic_pipeline exports the periodic pipeline.""" + tpot_obj = TPOTClassifier( + random_state=42, +
population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + with closing(StringIO()) as our_file: + tpot_obj._file = our_file + tpot_obj.verbosity = 3 + tpot_obj._last_pipeline_write = datetime.now() + sleep(0.11) + tpot_obj._output_best_pipeline_period_seconds = 0.1 + tpot_obj.periodic_checkpoint_folder = './' + tpot_obj._check_periodic_pipeline() + our_file.seek(0) + + assert_in('Saving best periodic pipeline to ./pipeline', our_file.read()) + # clean up + for f in os.listdir('./'): + if search('pipeline_', f): + os.remove(os.path.join('./', f)) + + +def test_check_periodic_pipeline_2(): + """Assert that the _check_periodic_pipeline does not export periodic pipeline if the pipeline has been saved before.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + with closing(StringIO()) as our_file: + tpot_obj._file = our_file + tpot_obj.verbosity = 3 + tpot_obj._last_pipeline_write = datetime.now() + sleep(0.11) + tpot_obj._output_best_pipeline_period_seconds = 0.1 + tpot_obj.periodic_checkpoint_folder = './' + # export once before + tpot_obj.export('./pipeline_test.py') + tpot_obj._check_periodic_pipeline() + our_file.seek(0) + + assert_in('Periodic pipeline was not saved, probably saved before...', our_file.read()) + # clean up + for f in os.listdir('./'): + if search('pipeline_', f): + os.remove(os.path.join('./', f)) + + +def test_check_periodic_pipeline_3(): + """Assert that the _check_periodic_pipeline rasie StopIteration if self._last_optimized_pareto_front_n_gens >= self.early_stop.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + tpot_obj.early_stop = 3 + # will pass + tpot_obj._check_periodic_pipeline() + tpot_obj._last_optimized_pareto_front_n_gens = 3 + assert_raises(StopIteration, tpot_obj._check_periodic_pipeline) + + +def test_save_periodic_pipeline(): + """Assert that the _save_periodic_pipeline does not export periodic pipeline if exception happened""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + with closing(StringIO()) as our_file: + tpot_obj._file = our_file + tpot_obj.verbosity = 3 + tpot_obj._last_pipeline_write = datetime.now() + sleep(0.11) + tpot_obj._output_best_pipeline_period_seconds = 0.1 + tpot_obj.periodic_checkpoint_folder = './' + # reset _optimized_pipeline to rasie exception + tpot_obj._optimized_pipeline = None + + tpot_obj._save_periodic_pipeline() + our_file.seek(0) + + assert_in('Failed saving periodic pipeline, exception', our_file.read()) + # clean up + for f in os.listdir('./'): + if search('pipeline_', f): + os.remove(os.path.join('./', f)) + + +def test_fit_predict(): + """Assert that the TPOT fit_predict function provides an optimized pipeline and correct output.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + result = tpot_obj.fit_predict(training_features, training_target) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime 
is None) + assert result.shape == (training_features.shape[0],) + + + def test_update_top_pipeline(): + """Assert that the TPOT _update_top_pipeline updates the optimized pipeline.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + tpot_obj._optimized_pipeline = None + tpot_obj.fitted_pipeline_ = None + tpot_obj._update_top_pipeline() + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + + + def test_update_top_pipeline_2(): + """Assert that the TPOT _update_top_pipeline raises RuntimeError when self._pareto_front is empty.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + + def pareto_eq(ind1, ind2): + return np.allclose(ind1.fitness.values, ind2.fitness.values) + + tpot_obj._pareto_front = ParetoFront(similar=pareto_eq) + + assert_raises(RuntimeError, tpot_obj._update_top_pipeline) + + + def test_update_top_pipeline_3(): + """Assert that the TPOT _update_top_pipeline raises RuntimeError when self._optimized_pipeline is not updated.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + tpot_obj._optimized_pipeline = None + # reset the fitness score to -float('inf') + for pipeline_scores in reversed(tpot_obj._pareto_front.keys): + pipeline_scores.wvalues = (5000., -float('inf')) + + assert_raises(RuntimeError, tpot_obj._update_top_pipeline) + + + def test_summary_of_best_pipeline(): + """Assert that the TPOT _summary_of_best_pipeline raises RuntimeError when self._optimized_pipeline is not updated.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + + assert_raises(RuntimeError, tpot_obj._summary_of_best_pipeline, features=training_features, target=training_target) + + + def test_set_param_recursive(): + """Assert that _set_param_recursive sets \"random_state\" to 42 in all steps in a simple pipeline.""" + pipeline_string = ( + 'DecisionTreeClassifier(PCA(input_matrix, PCA__iterated_power=5, PCA__svd_solver=randomized), ' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) + tpot_obj = TPOTClassifier() + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + # assert "random_state" of PCA at step 1 + assert getattr(sklearn_pipeline.steps[0][1], 'random_state') == 42 + # assert "random_state" of DecisionTreeClassifier at step 2 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 + + + def test_set_param_recursive_2(): + """Assert that _set_param_recursive sets \"random_state\" to 42 in the nested estimator in SelectFromModel.""" + pipeline_string = ( + 'DecisionTreeRegressor(SelectFromModel(input_matrix, ' + 'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, ' + 'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,' +
'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)' + ) + tpot_obj = TPOTRegressor() + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + assert getattr(getattr(sklearn_pipeline.steps[0][1], 'estimator'), 'random_state') == 42 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 + + +def test_set_param_recursive_3(): + """Assert that _set_param_recursive sets \"random_state\" to 42 in nested estimator in StackingEstimator in a complex pipeline.""" + pipeline_string = ( + 'DecisionTreeClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5),input_matrix) ' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) + tpot_obj = TPOTClassifier() + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + # StackingEstimator under the transformer_list of FeatureUnion + assert getattr(getattr(sklearn_pipeline.steps[0][1].transformer_list[0][1], 'estimator'), 'random_state') == 42 + assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 + + +def test_evaluated_individuals_(): + """Assert that evaluated_individuals_ stores current pipelines and their CV scores.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=2, + offspring_size=4, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + tpot_obj.fit(training_features, training_target) + assert isinstance(tpot_obj.evaluated_individuals_, dict) + for pipeline_string in sorted(tpot_obj.evaluated_individuals_.keys()): + deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + operator_count = tpot_obj._operator_count(deap_pipeline) + + try: + cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) + mean_cv_scores = np.mean(cv_scores) + except Exception as e: + mean_cv_scores = -float('inf') + assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][1], mean_cv_scores) + assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][0], operator_count) + + +def test_stop_by_max_time_mins(): + """Assert that _stop_by_max_time_mins raises KeyboardInterrupt when maximum minutes have elapsed.""" + tpot_obj = TPOTClassifier(config_dict='TPOT light') + tpot_obj._start_datetime = datetime.now() + sleep(0.11) + tpot_obj.max_time_mins = 0.1/60. 
+ assert_raises(KeyboardInterrupt, tpot_obj._stop_by_max_time_mins) + + +def test_update_evaluated_individuals_(): + """Assert that _update_evaluated_individuals_ raises ValueError when scoring function does not return a float.""" + tpot_obj = TPOTClassifier(config_dict='TPOT light') + assert_raises(ValueError, tpot_obj._update_evaluated_individuals_, ['Non-Float-Score'], ['Test_Pipeline'], [1]) + + +def test_evaluate_individuals(): + """Assert that _evaluate_individuals returns operator_counts and CV scores in correct order.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + + tpot_obj._pbar = tqdm(total=1, disable=True) + pop = tpot_obj._toolbox.population(n=10) + fitness_scores = tpot_obj._evaluate_individuals(pop, training_features, training_target) + + for deap_pipeline, fitness_score in zip(pop, fitness_scores): + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + try: + cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) + mean_cv_scores = np.mean(cv_scores) + except Exception as e: + mean_cv_scores = -float('inf') + + assert isinstance(deap_pipeline, creator.Individual) + assert np.allclose(fitness_score[0], operator_count) + assert np.allclose(fitness_score[1], mean_cv_scores) + + +def test_evaluate_individuals_2(): + """Assert that _evaluate_individuals returns operator_counts and CV scores in correct order with n_jobs=2""" + tpot_obj = TPOTClassifier( + n_jobs=2, + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + + tpot_obj._pbar = tqdm(total=1, disable=True) + pop = tpot_obj._toolbox.population(n=10) + fitness_scores = tpot_obj._evaluate_individuals(pop, training_features, training_target) + + for deap_pipeline, fitness_score in zip(pop, fitness_scores): + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) + + try: + cv_scores = cross_val_score(sklearn_pipeline, training_features, training_target, cv=5, scoring='accuracy', verbose=0) + mean_cv_scores = np.mean(cv_scores) + except Exception as e: + mean_cv_scores = -float('inf') + + assert isinstance(deap_pipeline, creator.Individual) + assert np.allclose(fitness_score[0], operator_count) + assert np.allclose(fitness_score[1], mean_cv_scores) + + +def test_update_pbar(): + """Assert that _update_pbar updates self._pbar with printing correct warning message.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + # reset verbosity = 3 for checking pbar message + tpot_obj.verbosity = 3 + with closing(StringIO()) as our_file: + tpot_obj._file=our_file + tpot_obj._pbar = tqdm(total=10, disable=False, file=our_file) + tpot_obj._update_pbar(pbar_num=2, pbar_msg="Test Warning Message") + our_file.seek(0) + assert_in("Test Warning Message", our_file.read()) + assert_equal(tpot_obj._pbar.n, 2) + + +def test_update_val(): + """Assert _update_val updates result score in list and prints timeout message.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + # reset verbosity = 3 for checking pbar message + tpot_obj.verbosity = 3 + with closing(StringIO()) as our_file: + tpot_obj._file=our_file + tpot_obj._pbar = tqdm(total=10, 
disable=False, file=our_file) + result_score_list = [] + result_score_list = tpot_obj._update_val(0.9999, result_score_list) + assert_equal(result_score_list, [0.9999]) + # check "Timeout" + result_score_list = tpot_obj._update_val("Timeout", result_score_list) + our_file.seek(0) + assert_in("Skipped pipeline #2 due to time out.", our_file.read()) + assert_equal(result_score_list, [0.9999, -float('inf')]) + + +def test_preprocess_individuals(): + """Assert _preprocess_individuals preprocess DEAP individuals including one evaluated individual""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0 + ) + + pipeline_string_1 = ( + 'LogisticRegression(PolynomialFeatures' + '(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + # a normal pipeline + pipeline_string_2 = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) + + individuals = [] + individuals.append(creator.Individual.from_string(pipeline_string_1, tpot_obj._pset)) + individuals.append(creator.Individual.from_string(pipeline_string_2, tpot_obj._pset)) + + # set pipeline 2 has been evaluated before + tpot_obj.evaluated_individuals_[pipeline_string_2] = (1, 0.99999) + + # reset verbosity = 3 for checking pbar message + tpot_obj.verbosity = 3 + with closing(StringIO()) as our_file: + tpot_obj._file=our_file + tpot_obj._pbar = tqdm(total=2, disable=False, file=our_file) + operator_counts, eval_individuals_str, sklearn_pipeline_list = \ + tpot_obj._preprocess_individuals(individuals) + our_file.seek(0) + assert_in("Pipeline encountered that has previously been evaluated", our_file.read()) + assert_in(pipeline_string_1, eval_individuals_str) + assert_equal(operator_counts[pipeline_string_1], 2) + assert_equal(len(sklearn_pipeline_list), 1) + + +def test_preprocess_individuals_2(): + """Assert _preprocess_individuals preprocess DEAP individuals with one invalid pipeline""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0 + ) + + # pipeline with two PolynomialFeatures operator + pipeline_string_1 = ( + 'LogisticRegression(PolynomialFeatures' + '(PolynomialFeatures(input_matrix, PolynomialFeatures__degree=2, ' + 'PolynomialFeatures__include_bias=False, PolynomialFeatures__interaction_only=False), ' + 'PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + # a normal pipeline + pipeline_string_2 = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) + + individuals = [] + individuals.append(creator.Individual.from_string(pipeline_string_1, tpot_obj._pset)) + individuals.append(creator.Individual.from_string(pipeline_string_2, tpot_obj._pset)) + + # reset verbosity = 3 for checking pbar message + tpot_obj.verbosity = 3 + with closing(StringIO()) as our_file: + tpot_obj._file=our_file + tpot_obj._pbar = tqdm(total=3, disable=False, file=our_file) + operator_counts, eval_individuals_str, sklearn_pipeline_list 
= \ + tpot_obj._preprocess_individuals(individuals) + our_file.seek(0) + + assert_in("Invalid pipeline encountered. Skipping its evaluation.", our_file.read()) + assert_in(pipeline_string_2, eval_individuals_str) + assert_equal(operator_counts[pipeline_string_2], 1) + assert_equal(len(sklearn_pipeline_list), 1) + + +def test_preprocess_individuals_3(): + """Assert _preprocess_individuals updatas self._pbar.total when max_time_mins is not None""" + tpot_obj = TPOTClassifier( + population_size=2, + offspring_size=4, + random_state=42, + max_time_mins=5, + verbosity=0 + ) + + pipeline_string_1 = ( + 'LogisticRegression(PolynomialFeatures' + '(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + # a normal pipeline + pipeline_string_2 = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) + + individuals = [] + individuals.append(creator.Individual.from_string(pipeline_string_1, tpot_obj._pset)) + individuals.append(creator.Individual.from_string(pipeline_string_2, tpot_obj._pset)) + + # reset verbosity = 3 for checking pbar message + + with closing(StringIO()) as our_file: + tpot_obj._file=our_file + tpot_obj._pbar = tqdm(total=2, disable=False, file=our_file) + tpot_obj._pbar.n = 2 + operator_counts, eval_individuals_str, sklearn_pipeline_list = \ + tpot_obj._preprocess_individuals(individuals) + assert tpot_obj._pbar.total == 6 + + +def test_imputer(): + """Assert that the TPOT fit function will not raise a ValueError in a dataset where NaNs are present.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + features_with_nan = np.copy(training_features) + features_with_nan[0][0] = float('nan') + + tpot_obj.fit(features_with_nan, training_target) + + +def test_imputer_2(): + """Assert that the TPOT predict function will not raise a ValueError in a dataset where NaNs are present.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + features_with_nan = np.copy(training_features) + features_with_nan[0][0] = float('nan') + + tpot_obj.fit(features_with_nan, training_target) + tpot_obj.predict(features_with_nan) + + +def test_imputer_3(): + """Assert that the TPOT _impute_values function returns a feature matrix with imputed NaN values.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=2, + config_dict='TPOT light' + ) + features_with_nan = np.copy(training_features) + features_with_nan[0][0] = float('nan') + with captured_output() as (out, err): + imputed_features = tpot_obj._impute_values(features_with_nan) + assert_in("Imputing missing values in feature set", out.getvalue()) + + assert_not_equal(imputed_features[0][0], float('nan')) + + +def test_sparse_matrix(): + """Assert that the TPOT fit function will raise a ValueError in a sparse matrix with config_dict='TPOT light'.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) + + assert_raises(ValueError, 
tpot_obj.fit, sparse_features, sparse_target) + + +def test_sparse_matrix_2(): + """Assert that the TPOT fit function will raise a ValueError in a sparse matrix with config_dict=None.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict=None + ) + + assert_raises(ValueError, tpot_obj.fit, sparse_features, sparse_target) + + +def test_sparse_matrix_3(): + """Assert that the TPOT fit function will raise a ValueError in a sparse matrix with config_dict='TPOT MDR'.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT MDR' + ) + + assert_raises(ValueError, tpot_obj.fit, sparse_features, sparse_target) + + +def test_sparse_matrix_4(): + """Assert that the TPOT fit function will not raise a ValueError in a sparse matrix with config_dict='TPOT sparse'.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT sparse' + ) + + tpot_obj.fit(sparse_features, sparse_target) + + +def test_sparse_matrix_5(): + """Assert that the TPOT fit function will not raise a ValueError in a sparse matrix with a customized config dictionary.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='tests/test_config_sparse.py' + ) + + tpot_obj.fit(sparse_features, sparse_target) + + +def test_tpot_operator_factory_class(): + """Assert that the TPOT operators class factory.""" + test_config_dict = { + 'sklearn.svm.LinearSVC': { + 'penalty': ["l1", "l2"], + 'loss': ["hinge", "squared_hinge"], + 'dual': [True, False], + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.] 
+ }, + + 'sklearn.linear_model.LogisticRegression': { + 'penalty': ["l1", "l2"], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.], + 'dual': [True, False] + }, + + 'sklearn.preprocessing.Binarizer': { + 'threshold': np.arange(0.0, 1.01, 0.05) + } + } + + tpot_operator_list = [] + tpot_argument_list = [] + + for key in sorted(test_config_dict.keys()): + op, args = TPOTOperatorClassFactory(key, test_config_dict[key]) + tpot_operator_list.append(op) + tpot_argument_list += args + + assert len(tpot_operator_list) == 3 + assert len(tpot_argument_list) == 9 + assert tpot_operator_list[0].root is True + assert tpot_operator_list[1].root is False + assert tpot_operator_list[2].type() == "Classifier or Regressor" + assert tpot_argument_list[1].values == [True, False] + + +def test_PolynomialFeatures_exception(): + """Assert that TPOT allows only one PolynomialFeatures operator in a pipeline.""" + tpot_obj = TPOTClassifier() + tpot_obj._pbar = tqdm(total=1, disable=True) + # pipeline with one PolynomialFeatures operator + pipeline_string_1 = ( + 'LogisticRegression(PolynomialFeatures' + '(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + # pipeline with two PolynomialFeatures operator + pipeline_string_2 = ( + 'LogisticRegression(PolynomialFeatures' + '(PolynomialFeatures(input_matrix, PolynomialFeatures__degree=2, ' + 'PolynomialFeatures__include_bias=False, PolynomialFeatures__interaction_only=False), ' + 'PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + # make a list for _evaluate_individuals + pipelines = [] + pipelines.append(creator.Individual.from_string(pipeline_string_1, tpot_obj._pset)) + pipelines.append(creator.Individual.from_string(pipeline_string_2, tpot_obj._pset)) + fitness_scores = tpot_obj._evaluate_individuals(pipelines, training_features, training_target) + known_scores = [(2, 0.98068077235290885), (5000.0, -float('inf'))] + assert np.allclose(known_scores, fitness_scores) + + +def test_pick_two_individuals_eligible_for_crossover(): + """Assert that pick_two_individuals_eligible_for_crossover() picks the correct pair of nodes to perform crossover with""" + tpot_obj = TPOTClassifier() + ind1 = creator.Individual.from_string( + 'BernoulliNB(input_matrix, BernoulliNB__alpha=1.0, BernoulliNB__fit_prior=True)', + tpot_obj._pset + ) + ind2 = creator.Individual.from_string( + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True)', + tpot_obj._pset + ) + ind3 = creator.Individual.from_string( + 'GaussianNB(input_matrix)', + tpot_obj._pset + ) + + pick1, pick2 = pick_two_individuals_eligible_for_crossover([ind1, ind2, ind3]) + assert ((str(pick1) == str(ind1) and str(pick2) == str(ind2)) or + str(pick1) == str(ind2) and str(pick2) == str(ind1)) + + ind4 = creator.Individual.from_string( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')', + tpot_obj._pset + ) + + # Eventhough ind4 does not have the same primitive at the root, the tree shares a primitive with ind1 + pick1, pick2 = 
pick_two_individuals_eligible_for_crossover([ind1, ind3, ind4]) + assert ((str(pick1) == str(ind1) and str(pick2) == str(ind4)) or + str(pick1) == str(ind4) and str(pick2) == str(ind1)) + + +def test_pick_two_individuals_eligible_for_crossover_bad(): + """Assert that pick_two_individuals_eligible_for_crossover() returns the right output when no pair is eligible""" + tpot_obj = TPOTClassifier() + ind1 = creator.Individual.from_string( + 'BernoulliNB(input_matrix, BernoulliNB__alpha=1.0, BernoulliNB__fit_prior=True)', + tpot_obj._pset + ) + ind2 = creator.Individual.from_string( + 'BernoulliNB(input_matrix, BernoulliNB__alpha=1.0, BernoulliNB__fit_prior=True)', + tpot_obj._pset + ) + ind3 = creator.Individual.from_string( + 'GaussianNB(input_matrix)', + tpot_obj._pset + ) + + # Ind1 and ind2 are not a pair because they are the same, ind3 shares no primitive + pick1, pick2 = pick_two_individuals_eligible_for_crossover([ind1, ind2, ind3]) + assert pick1 is None and pick2 is None + + # You can not do crossover with a population of only 1. + pick1, pick2 = pick_two_individuals_eligible_for_crossover([ind1]) + assert pick1 is None and pick2 is None + + # You can not do crossover with a population of 0. + pick1, pick2 = pick_two_individuals_eligible_for_crossover([]) + assert pick1 is None and pick2 is None + + +def test_mate_operator(): + """Assert that self._mate_operator returns offsprings as expected.""" + tpot_obj = TPOTClassifier() + ind1 = creator.Individual.from_string( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')', + tpot_obj._pset + ) + ind2 = creator.Individual.from_string( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=2, ' + 'KNeighborsClassifier__weights=uniform' + ')', + tpot_obj._pset + ) + + # set as evaluated pipelines in tpot_obj.evaluated_individuals_ + tpot_obj.evaluated_individuals_[str(ind1)] = (2, 0.99) + tpot_obj.evaluated_individuals_[str(ind2)] = (2, 0.99) + + offspring1, offspring2 = tpot_obj._mate_operator(ind1, ind2) + expected_offspring1 = ( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False), ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=2, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + expected_offspring1_alt = ( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True), ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + assert str(offspring1) in [expected_offspring1, expected_offspring1_alt] + + +def test_cxOnePoint(): + """Assert that cxOnePoint() returns the correct type of node between two fixed pipelines.""" + tpot_obj = TPOTClassifier() + ind1 = creator.Individual.from_string( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')', + tpot_obj._pset + ) + ind2 = creator.Individual.from_string( + 'KNeighborsClassifier(' + 'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),' + 'KNeighborsClassifier__n_neighbors=10, ' + 
'KNeighborsClassifier__p=2, ' + 'KNeighborsClassifier__weights=uniform' + ')', + tpot_obj._pset + ) + ind1[0].ret = Output_Array + ind2[0].ret = Output_Array + ind1_copy, ind2_copy = tpot_obj._toolbox.clone(ind1), tpot_obj._toolbox.clone(ind2) + offspring1, offspring2 = cxOnePoint(ind1_copy, ind2_copy) + + assert offspring1[0].ret == Output_Array + assert offspring2[0].ret == Output_Array + + + def test_mutNodeReplacement(): + """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline.""" + tpot_obj = TPOTClassifier() + pipeline_string = ( + 'LogisticRegression(PolynomialFeatures' + '(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, ' + 'PolynomialFeatures__interaction_only=False), LogisticRegression__C=10.0, ' + 'LogisticRegression__dual=False, LogisticRegression__penalty=l2)' + ) + + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + pipeline[0].ret = Output_Array + old_ret_type_list = [node.ret for node in pipeline] + old_prims_list = [node for node in pipeline if node.arity != 0] + + # test 10 times + for _ in range(10): + mut_ind = mutNodeReplacement(tpot_obj._toolbox.clone(pipeline), pset=tpot_obj._pset) + new_ret_type_list = [node.ret for node in mut_ind[0]] + new_prims_list = [node for node in mut_ind[0] if node.arity != 0] + + if new_prims_list == old_prims_list: # Terminal mutated + assert new_ret_type_list == old_ret_type_list + else: # Primitive mutated + diff_prims = list(set(new_prims_list).symmetric_difference(old_prims_list)) + if len(diff_prims) > 1: # Sometimes mutation randomly replaces an operator that is already in the pipeline + assert diff_prims[0].ret == diff_prims[1].ret + assert mut_ind[0][0].ret == Output_Array + + + def test_varOr(): + """Assert that varOr() applies crossover only and removes CV scores from the offspring.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + + tpot_obj._pbar = tqdm(total=1, disable=True) + pop = tpot_obj._toolbox.population(n=5) + for ind in pop: + ind.fitness.values = (2, 1.0) + + offspring = varOr(pop, tpot_obj._toolbox, 5, cxpb=1.0, mutpb=0.0) + invalid_ind = [ind for ind in offspring if not ind.fitness.valid] + + assert len(offspring) == 5 + assert len(invalid_ind) == 5 + + + def test_varOr_2(): + """Assert that varOr() applies mutation only and removes CV scores from the offspring.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + + tpot_obj._pbar = tqdm(total=1, disable=True) + pop = tpot_obj._toolbox.population(n=5) + for ind in pop: + ind.fitness.values = (2, 1.0) + + offspring = varOr(pop, tpot_obj._toolbox, 5, cxpb=0.0, mutpb=1.0) + invalid_ind = [ind for ind in offspring if not ind.fitness.valid] + + assert len(offspring) == 5 + assert len(invalid_ind) == 5 + + + def test_varOr_3(): + """Assert that varOr() applies reproduction only and does NOT remove CV scores from the offspring.""" + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + config_dict='TPOT light' + ) + + tpot_obj._pbar = tqdm(total=1, disable=True) + pop = tpot_obj._toolbox.population(n=5) + for ind in pop: + ind.fitness.values = (2, 1.0) + + offspring = varOr(pop, tpot_obj._toolbox, 5, cxpb=0.0, mutpb=0.0) + invalid_ind = [ind for ind in offspring if not ind.fitness.valid] + + assert len(offspring) == 5 + assert len(invalid_ind) == 0 + + + def test_operator_type(): + """Assert that TPOT operators return their type, e.g. 
'Classifier', 'Preprocessor'.""" + assert TPOTSelectPercentile.type() == "Preprocessor or Selector" + + +def test_gen(): + """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure.""" + tpot_obj = TPOTClassifier() + + pipeline = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3) + + assert len(pipeline) > 1 + assert pipeline[0].ret == Output_Array + + +def test_clean_pipeline_string(): + """Assert that clean_pipeline_string correctly returns a string without parameter prefixes""" + + with_prefix = 'BernoulliNB(input_matrix, BernoulliNB__alpha=1.0, BernoulliNB__fit_prior=True)' + without_prefix = 'BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)' + tpot_obj = TPOTClassifier() + ind1 = creator.Individual.from_string(with_prefix, tpot_obj._pset) + + pretty_string = tpot_obj.clean_pipeline_string(ind1) + assert pretty_string == without_prefix diff --git a/tests/zero_count_tests.py b/tests/zero_count_tests.py new file mode 100644 index 00000000..fbbff71e --- /dev/null +++ b/tests/zero_count_tests.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np +from tpot.builtins import ZeroCount + +X = np.array([[0, 1, 7, 0, 0], + [3, 0, 0, 2, 19], + [0, 1, 3, 4, 5], + [5, 0, 0, 0, 0]]) + +def test_ZeroCount(): + """Assert that ZeroCount operator returns correct transformed X.""" + op = ZeroCount() + X_transformed = op.transform(X) + zero_col = np.array([3, 2, 1, 4]) + non_zero = np.array([2, 3, 4, 1]) + + assert np.allclose(zero_col, X_transformed[:, 0]) + assert np.allclose(non_zero, X_transformed[:, 1]) diff --git a/tpot/_version.py b/tpot/_version.py index 41d65571..53c20e73 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -19,4 +19,4 @@ """ -__version__ = '0.8.3' +__version__ = '0.9.0' diff --git a/tpot/base.py b/tpot/base.py index 834980be..06d39ed5 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -18,7 +18,6 @@ License along with TPOT. If not, see . 
""" - from __future__ import print_function import random import inspect @@ -28,8 +27,11 @@ from functools import partial from datetime import datetime from multiprocessing import cpu_count +import os +import re import numpy as np +from scipy import sparse import deap from deap import base, creator, tools, gp from tqdm import tqdm @@ -50,16 +52,21 @@ from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code from .decorators import _pre_test from .builtins import CombineDFs, StackingEstimator + from .config.classifier_light import classifier_config_dict_light from .config.regressor_light import regressor_config_dict_light from .config.classifier_mdr import tpot_mdr_classifier_config_dict from .config.regressor_mdr import tpot_mdr_regressor_config_dict +from .config.regressor_sparse import regressor_config_sparse +from .config.classifier_sparse import classifier_config_sparse from .metrics import SCORERS from .gp_types import Output_Array from .gp_deap import eaMuPlusLambda, mutNodeReplacement, _wrapped_cross_val_score, cxOnePoint + # hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS +# https://github.com/ContinuumIO/anaconda-issues/issues/905 if sys.platform.startswith('win'): import win32api try: @@ -83,7 +90,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, scoring=None, cv=5, subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, - random_state=None, config_dict=None, warm_start=False, + random_state=None, config_dict=None, + warm_start=False, periodic_checkpoint_folder=None, early_stop=None, verbosity=0, disable_update_check=False): """Set up the genetic programming algorithm for pipeline optimization. @@ -176,9 +184,22 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT MDR': TPOT uses a list of TPOT-MDR operator configuration dictionary instead of the default one. + String 'TPOT sparse': + TPOT uses a configuration dictionary with a one-hot-encoder and the + operators normally included in TPOT that also support sparse matrices. warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). + periodic_checkpoint_folder: path string, optional (default: None) + If supplied, a folder in which tpot will periodically save the best pipeline so far while optimizing. + Currently once per generation but not more often than once per 30 seconds. + Useful in multiple cases: + Sudden death before tpot could save optimized pipeline + Track its progress + Grab pipelines while it's still optimizing + early_stop: int or None (default: None) + How many generations TPOT checks whether there is no improvement in optimization process. + End optimization process if there is no improvement in the set number of generations. verbosity: int, optional (default: 0) How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = high, 3 = all. 
@@ -201,14 +222,29 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._pareto_front = None self._optimized_pipeline = None + self._optimized_pipeline_score = None + self._exported_pipeline_text = "" self.fitted_pipeline_ = None self._fitted_imputer = None - self._pop = None + self._imputed = False + self._pop = [] self.warm_start = warm_start self.population_size = population_size self.generations = generations self.max_time_mins = max_time_mins self.max_eval_time_mins = max_eval_time_mins + self.max_eval_time_seconds = max(int(self.max_eval_time_mins * 60), 1) + self.periodic_checkpoint_folder = periodic_checkpoint_folder + self.early_stop = early_stop + self._last_optimized_pareto_front = None + self._last_optimized_pareto_front_n_gens = 0 + + # dont save periodic pipelines more often than this + self._output_best_pipeline_period_seconds = 30 + + # Try crossover and mutation at most this many times for + # any one given individual (or pair of individuals) + self._max_mut_loops = 50 # Set offspring_size equal to population_size by default if offspring_size: @@ -216,7 +252,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, else: self.offspring_size = population_size - self._setup_config(config_dict) + self.config_dict_params=config_dict + self._setup_config(self.config_dict_params) self.operators = [] self.arguments = [] @@ -254,6 +291,9 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, 'copy': copy } self._pbar = None + # Specifies where to output the progress messages (default: sys.stdout). + # Maybe open this API in future version of TPOT.(io.TextIOWrapper or io.StringIO) + self._file = sys.stdout # Dictionary of individuals that have already been evaluated in previous # generations @@ -283,17 +323,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, raise ValueError( 'The subsample ratio of the training instance must be in the range (0.0, 1.0].' ) - # If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module - if sys.platform.startswith('win') and n_jobs != 1: - print( - 'Warning: Although parallelization is currently supported in ' - 'TPOT for Windows, pressing Ctrl+C will freeze the optimization ' - 'process without saving the best pipeline! Thus, Please DO NOT ' - 'press Ctrl+C during the optimization procss if n_jobs is not ' - 'equal to 1. For quick test in Windows, please set n_jobs to 1 ' - 'for saving the best pipeline in the middle of the optimization ' - 'process via Ctrl+C.' - ) if n_jobs == -1: self.n_jobs = cpu_count() else: @@ -302,6 +331,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_pset() self._setup_toolbox() + def _setup_config(self, config_dict): if config_dict: if isinstance(config_dict, dict): @@ -316,36 +346,47 @@ def _setup_config(self, config_dict): self.config_dict = tpot_mdr_classifier_config_dict else: self.config_dict = tpot_mdr_regressor_config_dict + elif config_dict == 'TPOT sparse': + if self.classification: + self.config_dict = classifier_config_sparse + else: + self.config_dict = regressor_config_sparse else: - self.config_dict = self._read_config_file(config_dict) + config = self._read_config_file(config_dict) + if hasattr(config, 'tpot_config'): + self.config_dict = config.tpot_config + else: + raise ValueError( + 'Could not find "tpot_config" in configuration file {}. 
' + 'When using a custom config file for customizing operators ' + 'dictionary, the file must have a python dictionary with ' + 'the standardized name of "tpot_config"'.format(config_dict) + ) else: self.config_dict = self.default_config_dict - def _read_config_file(self, config_path): - try: - custom_config = imp.new_module('custom_config') - with open(config_path, 'r') as config_file: - file_string = config_file.read() - exec(file_string, custom_config.__dict__) - - return custom_config.tpot_config - except FileNotFoundError as e: - raise FileNotFoundError( + def _read_config_file(self, config_path): + if os.path.isfile(config_path): + try: + custom_config = imp.new_module('custom_config') + + with open(config_path, 'r') as config_file: + file_string = config_file.read() + exec(file_string, custom_config.__dict__) + return custom_config + except Exception as e: + raise ValueError( + 'An error occured while attempting to read the specified ' + 'custom TPOT operator configuration file: {}'.format(e) + ) + else: + raise ValueError( 'Could not open specified TPOT operator config file: ' - '{}'.format(e.filename) - ) - except AttributeError: - raise AttributeError( - 'The supplied TPOT operator config file does not contain ' - 'a dictionary named "tpot_config".' - ) - except Exception as e: - raise type(e)( - 'An error occured while attempting to read the specified ' - 'custom TPOT operator configuration file.' + '{}'.format(config_path) ) + def _setup_pset(self): if self.random_state is not None: random.seed(self.random_state) @@ -359,6 +400,7 @@ def _setup_pset(self): if self.verbosity > 2: print('{} operators have been imported by TPOT.'.format(len(self.operators))) + def _add_operators(self): for operator in self.operators: if operator.root: @@ -385,17 +427,16 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) + def _add_terminals(self): for _type in self.arguments: type_values = list(_type.values) - # This check prevents XGBoost from using multithreading, which breaks in TPOT - if 'nthread' not in _type.__name__: - type_values += ['DEFAULT'] for val in type_values: terminal_name = _type.__name__ + "=" + str(val) self._pset.addTerminal(val, _type, name=terminal_name) + def _setup_toolbox(self): creator.create('FitnessMulti', base.Fitness, weights=(-1.0, 1.0)) creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti) @@ -410,6 +451,7 @@ def _setup_toolbox(self): self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) self._toolbox.register('mutate', self._random_mutation_operator) + def fit(self, features, target, sample_weight=None, groups=None): """Fit an optimized machine learning pipeline. @@ -448,9 +490,23 @@ def fit(self, features, target, sample_weight=None, groups=None): # Resets the imputer to be fit for the new dataset self._fitted_imputer = None - - if np.any(np.isnan(features)): - features = self._impute_values(features) + self._imputed = False + # If features is a sparse matrix, do not apply imputation + if sparse.issparse(features): + if self.config_dict_params in [None, "TPOT light", "TPOT MDR"]: + raise ValueError( + 'Not all operators in {} supports sparse matrix. ' + 'Please use \"TPOT sparse\" for sparse matrix.'.format(self.config_dict_params) + ) + elif self.config_dict_params != "TPOT sparse": + print( + 'Warning: Since the input matrix is a sparse matrix, please makes sure all the operators in the ' + 'customized config dictionary supports sparse matriies.' 
+ ) + else: + if np.any(np.isnan(features)): + self._imputed = True + features = self._impute_values(features) self._check_dataset(features, target) @@ -472,6 +528,7 @@ def fit(self, features, target, sample_weight=None, groups=None): np.random.seed(self.random_state) self._start_datetime = datetime.now() + self._last_pipeline_write = self._start_datetime self._toolbox.register('evaluate', self._evaluate_individuals, features=features, target=target, sample_weight=sample_weight, groups=groups) # assign population, self._pop can only be not None if warm_start is enabled @@ -526,7 +583,7 @@ def pareto_eq(ind1, ind2): pbar=self._pbar, halloffame=self._pareto_front, verbose=self.verbosity, - max_time_mins=self.max_time_mins + per_generation_function=self._check_periodic_pipeline ) # store population for the next call @@ -534,10 +591,11 @@ def pareto_eq(ind1, ind2): self._pop = pop # Allow for certain exceptions to signal a premature fit() cancellation - except (KeyboardInterrupt, SystemExit): + except (KeyboardInterrupt, SystemExit, StopIteration) as e: if self.verbosity > 0: - self._pbar.write('') - self._pbar.write('TPOT closed prematurely. Will use the current best pipeline.') + self._pbar.write('', file=self._file) + self._pbar.write('{}\nTPOT closed prematurely. Will use the current best pipeline.'.format(e), + file=self._file) finally: # keep trying 10 times in case weird things happened like multiple CTRL+C or exceptions attempts = 10 @@ -548,56 +606,97 @@ def pareto_eq(ind1, ind2): if not isinstance(self._pbar, type(None)): self._pbar.close() - # Store the pipeline with the highest internal testing score - if self._pareto_front: - self._update_top_pipeline() - - # It won't raise error for a small test like in a unit test because a few pipeline sometimes - # may fail due to the training data does not fit the operator's requirement. - if not self._optimized_pipeline: - print('There was an error in the TPOT optimization ' - 'process. This could be because the data was ' - 'not formatted properly, or because data for ' - 'a regression problem was provided to the ' - 'TPOTClassifier object. Please make sure you ' - 'passed the data to TPOT correctly.') - else: - self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline) - - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.fitted_pipeline_.fit(features, target) - - if self.verbosity in [1, 2]: - # Add an extra line of spacing if the progress bar was used - if self.verbosity >= 2: - print('') - print('Best pipeline: {}'.format(self._optimized_pipeline)) - - # Store and fit the entire Pareto front as fitted models for convenience - self.pareto_front_fitted_pipelines_ = {} - - for pipeline in self._pareto_front.items: - self.pareto_front_fitted_pipelines_[str(pipeline)] = self._toolbox.compile(expr=pipeline) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.pareto_front_fitted_pipelines_[str(pipeline)].fit(features, target) + self._update_top_pipeline() + self._summary_of_best_pipeline(features, target) break except (KeyboardInterrupt, SystemExit, Exception) as e: # raise the exception if it's our last attempt if attempt == (attempts - 1): - raise + raise e return self + def _update_top_pipeline(self): - """Helper function to update the _optimized_pipeline field.""" + """Helper function to update the _optimized_pipeline field. 
+ """ + # Store the pipeline with the highest internal testing score if self._pareto_front: - top_score = -float('inf') + self._optimized_pipeline_score = -float('inf') for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)): - if pipeline_scores.wvalues[1] > top_score: + if pipeline_scores.wvalues[1] > self._optimized_pipeline_score: self._optimized_pipeline = pipeline - top_score = pipeline_scores.wvalues[1] + self._optimized_pipeline_score = pipeline_scores.wvalues[1] + + if not self._optimized_pipeline: + raise RuntimeError('There was an error in the TPOT optimization ' + 'process. This could be because the data was ' + 'not formatted properly, or because data for ' + 'a regression problem was provided to the ' + 'TPOTClassifier object. Please make sure you ' + 'passed the data to TPOT correctly.') + else: + pareto_front_wvalues = [pipeline_scores.wvalues[1] for pipeline_scores in self._pareto_front.keys] + if not self._last_optimized_pareto_front: + self._last_optimized_pareto_front = pareto_front_wvalues + elif self._last_optimized_pareto_front == pareto_front_wvalues: + self._last_optimized_pareto_front_n_gens += 1 + else: + self._last_optimized_pareto_front = pareto_front_wvalues + self._last_optimized_pareto_front_n_gens = 0 + else: + # If user passes CTRL+C in initial generation, self._pareto_front (halloffame) shoule be not updated yet. + # need raise RuntimeError because no pipeline has been optimized + raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') + + + def _summary_of_best_pipeline(self, features, target): + """Print out best pipeline at the end of optimization process. + + Parameters + ---------- + features: array-like {n_samples, n_features} + Feature matrix + + target: array-like {n_samples} + List of class labels for prediction + + Returns + ------- + self: object + Returns a copy of the fitted TPOT object + """ + if not self._optimized_pipeline: + raise RuntimeError('There was an error in the TPOT optimization ' + 'process. This could be because the data was ' + 'not formatted properly, or because data for ' + 'a regression problem was provided to the ' + 'TPOTClassifier object. Please make sure you ' + 'passed the data to TPOT correctly.') + else: + self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline) + + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.fitted_pipeline_.fit(features, target) + + if self.verbosity in [1, 2]: + # Add an extra line of spacing if the progress bar was used + if self.verbosity >= 2: + print('') + + optimized_pipeline_str = self.clean_pipeline_string(self._optimized_pipeline) + print('Best pipeline:', optimized_pipeline_str) + + # Store and fit the entire Pareto front as fitted models for convenience + self.pareto_front_fitted_pipelines_ = {} + + for pipeline in self._pareto_front.items: + self.pareto_front_fitted_pipelines_[str(pipeline)] = self._toolbox.compile(expr=pipeline) + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.pareto_front_fitted_pipelines_[str(pipeline)].fit(features, target) + def predict(self, features): """Use the optimized pipeline to predict the target for a feature set. 
@@ -619,11 +718,15 @@ def predict(self, features): features = features.astype(np.float64) if np.any(np.isnan(features)): + self._imputed = True features = self._impute_values(features) + else: + self._imputed = False return self.fitted_pipeline_.predict(features) - def fit_predict(self, features, target): + + def fit_predict(self, features, target, sample_weight=None, groups=None): """Call fit and predict in sequence. Parameters @@ -632,6 +735,12 @@ def fit_predict(self, features, target): Feature matrix target: array-like {n_samples} List of class labels for prediction + sample_weight: array-like {n_samples}, optional + Per-sample weights. Higher weights force TPOT to put more emphasis on those points + groups: array-like, with shape {n_samples, }, optional + Group labels for the samples used when performing cross-validation. + This parameter should only be used in conjunction with sklearn's Group cross-validation + functions, such as sklearn.model_selection.GroupKFold Returns ---------- @@ -639,11 +748,12 @@ def fit_predict(self, features, target): Predicted target for the provided features """ - self.fit(features, target) + self.fit(features, target, sample_weight=sample_weight, groups=groups) return self.predict(features) + def score(self, testing_features, testing_target): - """Returns the score on the given testing data using the user-specified scoring function. + """Return the score on the given testing data using the user-specified scoring function. Parameters ---------- @@ -670,6 +780,7 @@ def score(self, testing_features, testing_target): ) return abs(score) + def predict_proba(self, features): """Use the optimized pipeline to estimate the class probabilities for a feature set. @@ -691,6 +802,7 @@ def predict_proba(self, features): raise RuntimeError('The fitted pipeline does not have the predict_proba() function.') return self.fitted_pipeline_.predict_proba(features.astype(np.float64)) + def set_params(self, **params): """Set the parameters of TPOT. @@ -702,24 +814,94 @@ def set_params(self, **params): return self - def export(self, output_file_name): + + def clean_pipeline_string(self, individual): + """Provide a string of the individual without the parameter prefixes. + + Parameters + ---------- + individual: individual + Individual which should be represented by a pretty string + + Returns + ------- + A string like str(individual), but with parameter prefixes removed. + + """ + dirty_string = str(individual) + # There are many parameter prefixes in the pipeline strings, used solely for + # making the terminal name unique, eg. LinearSVC__. + parameter_prefixes = [(m.start(), m.end()) for m in re.finditer(', [\w]+__', dirty_string)] + # We handle them in reverse so we do not mess up indices + pretty = dirty_string + for (start, end) in reversed(parameter_prefixes): + pretty = pretty[:start+2] + pretty[end:] + + return pretty + + + def _check_periodic_pipeline(self): + """If enough time has passed, save a new optimized pipeline. + + Currently used in the per generation hook in the optimization loop. 
+ """ + self._update_top_pipeline() + if self.periodic_checkpoint_folder is not None: + total_since_last_pipeline_save = (datetime.now() - self._last_pipeline_write).total_seconds() + if total_since_last_pipeline_save > self._output_best_pipeline_period_seconds: + self._last_pipeline_write = datetime.now() + self._save_periodic_pipeline() + + if self.early_stop is not None: + if self._last_optimized_pareto_front_n_gens >= self.early_stop: + raise StopIteration("The optimized pipeline was not improved after evaluating {} more generations. " + "Will end the optimization process.\n".format(self.early_stop)) + + + def _save_periodic_pipeline(self): + try: + filename = os.path.join(self.periodic_checkpoint_folder, 'pipeline_{}.py'.format(datetime.now().strftime('%Y.%m.%d_%H-%M-%S'))) + did_export = self.export(filename, skip_if_repeated=True) + if not did_export: + self._update_pbar(pbar_num=0, pbar_msg='Periodic pipeline was not saved, probably saved before...') + else: + self._update_pbar(pbar_num=0, pbar_msg='Saving best periodic pipeline to {}'.format(filename)) + except Exception as e: + self._update_pbar(pbar_num=0, pbar_msg='Failed saving periodic pipeline, exception:\n{}'.format(str(e)[:250])) + + + def export(self, output_file_name, skip_if_repeated=False): """Export the optimized pipeline as Python code. Parameters ---------- output_file_name: string String containing the path and file name of the desired output file + skip_if_repeated: boolean + If True, skip the actual writing if a pipeline + code would be identical to the last pipeline exported Returns ------- - None + False if it skipped writing the pipeline to file + True if the pipeline was actually written """ if self._optimized_pipeline is None: raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') + to_write = export_pipeline(self._optimized_pipeline, self.operators, self._pset, self._imputed, self._optimized_pipeline_score) + + # dont export a pipeline you just had + if skip_if_repeated and (self._exported_pipeline_text == to_write): + return False + with open(output_file_name, 'w') as output_file: - output_file.write(export_pipeline(self._optimized_pipeline, self.operators, self._pset)) + output_file.write(to_write) + self._exported_pipeline_text = to_write + + return True + def _impute_values(self, features): """Impute missing values in a feature set. @@ -737,11 +919,12 @@ def _impute_values(self, features): print('Imputing missing values in feature set') if self._fitted_imputer is None: - self._fitted_imputer = Imputer(strategy="median", axis=1) + self._fitted_imputer = Imputer(strategy="median") self._fitted_imputer.fit(features) return self._fitted_imputer.transform(features) + def _check_dataset(self, features, target): """Check if a dataset has a valid feature set and labels. @@ -757,7 +940,7 @@ def _check_dataset(self, features, target): None """ try: - check_X_y(features, target, accept_sparse=False) + check_X_y(features, target, accept_sparse=True) except (AssertionError, ValueError): raise ValueError( 'Error: Input data is not in a valid format. Please confirm ' @@ -781,6 +964,7 @@ def _compile_to_sklearn(self, expr): sklearn_pipeline = generate_pipeline_code(expr_to_tree(expr, self._pset), self.operators) return eval(sklearn_pipeline, self.operators_context) + def _set_param_recursive(self, pipeline_steps, parameter, value): """Recursively iterate through all objects in the pipeline and set a given parameter. 
@@ -799,14 +983,23 @@ def _set_param_recursive(self, pipeline_steps, parameter, value): """ for (_, obj) in pipeline_steps: recursive_attrs = ['steps', 'transformer_list', 'estimators'] - for attr in recursive_attrs: if hasattr(obj, attr): self._set_param_recursive(getattr(obj, attr), parameter, value) - break - else: - if hasattr(obj, parameter): - setattr(obj, parameter, value) + if hasattr(obj, 'estimator'): # nested estimator + est = getattr(obj, 'estimator') + if hasattr(est, parameter): + setattr(est, parameter, value) + if hasattr(obj, parameter): + setattr(obj, parameter, value) + + + def _stop_by_max_time_mins(self): + """Stop optimization process once maximum minutes have elapsed.""" + if self.max_time_mins: + total_mins_elapsed = (datetime.now() - self._start_datetime).total_seconds() / 60. + if total_mins_elapsed >= self.max_time_mins: + raise KeyboardInterrupt('{} minutes have elapsed. TPOT will close down.'.format(total_mins_elapsed)) def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None): @@ -833,18 +1026,74 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non according to its performance on the provided data """ - if self.max_time_mins: - total_mins_elapsed = (datetime.now() - self._start_datetime).total_seconds() / 60. - if total_mins_elapsed >= self.max_time_mins: - raise KeyboardInterrupt('{} minutes have elapsed. TPOT will close down.'.format(total_mins_elapsed)) + operator_counts, eval_individuals_str, sklearn_pipeline_list = self._preprocess_individuals(individuals) + + # Make the partial function that will be called below + partial_wrapped_cross_val_score = partial( + _wrapped_cross_val_score, + features=features, + target=target, + cv=self.cv, + scoring_function=self.scoring_function, + sample_weight=sample_weight, + groups=groups, + timeout=self.max_eval_time_seconds + ) + + result_score_list = [] + # Don't use parallelization if n_jobs==1 + if self.n_jobs == 1: + for sklearn_pipeline in sklearn_pipeline_list: + self._stop_by_max_time_mins() + val = partial_wrapped_cross_val_score(sklearn_pipeline=sklearn_pipeline) + result_score_list = self._update_val(val, result_score_list) + else: + # chunk size for pbar update + for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4): + self._stop_by_max_time_mins() + parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs') + tmp_result_scores = parallel(delayed(partial_wrapped_cross_val_score)(sklearn_pipeline=sklearn_pipeline) + for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]) + # update pbar + for val in tmp_result_scores: + result_score_list = self._update_val(val, result_score_list) + + self._update_evaluated_individuals_(result_score_list, eval_individuals_str, operator_counts) + + return [self.evaluated_individuals_[str(individual)] for individual in individuals] + + + def _preprocess_individuals(self, individuals): + """Preprocess DEAP individuals before pipeline evaluation. 
+ + Parameters + ---------- + individuals: a list of DEAP individual + One individual is a list of pipeline operators and model parameters that can be + compiled by DEAP into a callable function + + Returns + ------- + operator_counts: dictionary + a dictionary of operator counts in individuals for evaluation + eval_individuals_str: list + a list of string of individuals for evaluation + sklearn_pipeline_list: list + a list of scikit-learn pipelines converted from DEAP individuals for evaluation + """ + # update self._pbar.total + if not (self.max_time_mins is None) and not self._pbar.disable and self._pbar.total <= self._pbar.n: + self._pbar.total += self.offspring_size # Check we do not evaluate twice the same individual in one pass. _, unique_individual_indices = np.unique([str(ind) for ind in individuals], return_index=True) unique_individuals = [ind for i, ind in enumerate(individuals) if i in unique_individual_indices] + # update number of duplicate pipelines + self._update_pbar(pbar_num=len(individuals)-len(unique_individuals)) - # return fitness scores + # a dictionary for storing operator counts operator_counts = {} - # 4 lists of DEAP individuals, their sklearn pipelines and their operator counts for parallel computing + # 2 lists of DEAP individuals' string, their sklearn pipelines for parallel computing eval_individuals_str = [] sklearn_pipeline_list = [] @@ -854,18 +1103,12 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non individual_str = str(individual) sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(individual, self._pset), self.operators) if sklearn_pipeline_str.count('PolynomialFeatures') > 1: - if self.verbosity > 2: - self._pbar.write('Invalid pipeline encountered. Skipping its evaluation.') self.evaluated_individuals_[individual_str] = (5000., -float('inf')) - if not self._pbar.disable: - self._pbar.update(1) + self._update_pbar(pbar_msg='Invalid pipeline encountered. Skipping its evaluation.') # Check if the individual was evaluated before elif individual_str in self.evaluated_individuals_: - if self.verbosity > 2: - self._pbar.write('Pipeline encountered that has previously been evaluated during the ' - 'optimization process. Using the score from the previous evaluation.') - if not self._pbar.disable: - self._pbar.update(1) + self._update_pbar(pbar_msg=('Pipeline encountered that has previously been evaluated during the ' + 'optimization process. 
Using the score from the previous evaluation.')) else: try: # Transform the tree expression into an sklearn pipeline @@ -884,58 +1127,72 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non operator_counts[individual_str] = max(1, operator_count) except Exception: self.evaluated_individuals_[individual_str] = (5000., -float('inf')) - if not self._pbar.disable: - self._pbar.update(1) + self._update_pbar() continue eval_individuals_str.append(individual_str) sklearn_pipeline_list.append(sklearn_pipeline) - # evalurate pipeline - resulting_score_list = [] - # chunk size for pbar update - for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4): - jobs = [] - for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]: - job = delayed(_wrapped_cross_val_score)( - sklearn_pipeline=sklearn_pipeline, - features=features, - target=target, - cv=self.cv, - scoring_function=self.scoring_function, - sample_weight=sample_weight, - max_eval_time_mins=self.max_eval_time_mins, - groups=groups - ) - jobs.append(job) - parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs') - tmp_result_score = parallel(jobs) - - # update pbar - for val in tmp_result_score: - if not self._pbar.disable: - self._pbar.update(1) - if val == 'Timeout': - if self.verbosity > 2: - self._pbar.write('Skipped pipeline #{0} due to time out. ' - 'Continuing to the next pipeline.'.format(self._pbar.n)) - resulting_score_list.append(-float('inf')) - else: - resulting_score_list.append(val) + return operator_counts, eval_individuals_str, sklearn_pipeline_list + + + def _update_evaluated_individuals_(self, result_score_list, eval_individuals_str, operator_counts): + """Update self.evaluated_individuals_ and error message during pipeline evaluation. - for resulting_score, individual_str in zip(resulting_score_list, eval_individuals_str): - if type(resulting_score) in [float, np.float64, np.float32]: - self.evaluated_individuals_[individual_str] = (operator_counts[individual_str], resulting_score) + Parameters + ---------- + result_score_list: list + A list of CV scores for evaluated pipelines + eval_individuals_str: list + A list of strings for evaluated pipelines + operator_counts: list + A list of operator counts for evaluated pipelines + + Returns + ------- + None + """ + for result_score, individual_str in zip(result_score_list, eval_individuals_str): + if type(result_score) in [float, np.float64, np.float32]: + self.evaluated_individuals_[individual_str] = (operator_counts[individual_str], result_score) else: raise ValueError('Scoring function does not return a float.') - return [self.evaluated_individuals_[str(individual)] for individual in individuals] + + def _update_pbar(self, pbar_num=1, pbar_msg=None): + """Update self._pbar and error message during pipeline evaluation. 
+ + Parameters + ---------- + pbar_num: int + How many pipelines has been processed + pbar_msg: None or string + Error message + + Returns + ------- + None + """ + if not isinstance(self._pbar, type(None)): + if self.verbosity > 2 and pbar_msg is not None: + self._pbar.write(pbar_msg, file=self._file) + if not self._pbar.disable: + self._pbar.update(pbar_num) + @_pre_test def _mate_operator(self, ind1, ind2): - return cxOnePoint(ind1, ind2) + for _ in range(self._max_mut_loops): + ind1_copy, ind2_copy = self._toolbox.clone(ind1),self._toolbox.clone(ind2) + offspring, offspring2 = cxOnePoint(ind1_copy, ind2_copy) + if str(offspring) not in self.evaluated_individuals_: + # We only use the first offspring, so we do not care to check uniqueness of the second. + break + + return offspring, offspring2 + @_pre_test - def _random_mutation_operator(self, individual): + def _random_mutation_operator(self, individual, allow_shrink=True): """Perform a replacement, insertion, or shrink mutation on an individual. Parameters @@ -944,6 +1201,11 @@ def _random_mutation_operator(self, individual): A list of pipeline operators and model parameters that can be compiled by DEAP into a callable function + allow_shrink: bool (True) + If True the `mutShrink` operator, which randomly shrinks the pipeline, + is allowed to be chosen as one of the random mutation operators. + If False, `mutShrink` will never be chosen as a mutation operator. + Returns ------- mut_ind: DEAP individual @@ -952,10 +1214,34 @@ def _random_mutation_operator(self, individual): """ mutation_techniques = [ partial(gp.mutInsert, pset=self._pset), - partial(mutNodeReplacement, pset=self._pset), - partial(gp.mutShrink) + partial(mutNodeReplacement, pset=self._pset) ] - return np.random.choice(mutation_techniques)(individual) + + # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives. + number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) + if number_of_primitives > 1 and allow_shrink: + mutation_techniques.append(partial(gp.mutShrink)) + + mutator = np.random.choice(mutation_techniques) + + unsuccesful_mutations = 0 + for _ in range(self._max_mut_loops): + # We have to clone the individual because mutator operators work in-place. + ind = self._toolbox.clone(individual) + offspring, = mutator(ind) + if str(offspring) not in self.evaluated_individuals_: + break + else: + unsuccesful_mutations += 1 + + # Sometimes you have pipelines for which every shrunk version has already been explored too. + # To still mutate the individual, one of the two other mutators should be applied instead. + if ((unsuccesful_mutations == 50) and + (type(mutator) is partial and mutator.func is gp.mutShrink)): + offspring, = self._random_mutation_operator(individual, allow_shrink=False) + + return offspring, + def _gen_grow_safe(self, pset, min_, max_, type_=None): """Generate an expression where each leaf might have a different depth between min_ and max_. @@ -983,8 +1269,21 @@ def condition(height, depth, type_): return self._generate(pset, min_, max_, condition, type_) - # Count the number of pipeline operators as a measure of pipeline complexity + def _operator_count(self, individual): + """Count the number of pipeline operators as a measure of pipeline complexity. + + Parameters + ---------- + individual: list + A grown tree with leaves at possibly different depths + dependending on the condition function. 
+ + Returns + ------- + operator_count: int + How many operators in a pipeline + """ operator_count = 0 for i in range(len(individual)): node = individual[i] @@ -992,7 +1291,31 @@ def _operator_count(self, individual): operator_count += 1 return operator_count - # Generate function stolen straight from deap.gp.generate + + def _update_val(self, val, result_score_list): + """Update values in the list of result scores and self._pbar during pipeline evaluation. + + Parameters + ---------- + val: float or "Timeout" + CV scores + result_score_list: list + A list of CV scores + + Returns + ------- + result_score_list: list + A updated list of CV scores + """ + self._update_pbar() + if val == 'Timeout': + self._update_pbar(pbar_msg=('Skipped pipeline #{0} due to time out. ' + 'Continuing to the next pipeline.'.format(self._pbar.n))) + result_score_list.append(-float('inf')) + else: + result_score_list.append(val) + return result_score_list + @_pre_test def _generate(self, pset, min_, max_, condition, type_=None): """Generate a Tree as a list of lists. diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py index ad11e150..12a7c6a3 100644 --- a/tpot/builtins/__init__.py +++ b/tpot/builtins/__init__.py @@ -22,3 +22,4 @@ from .zero_count import ZeroCount from .combine_dfs import CombineDFs from .stacking_estimator import StackingEstimator +from .one_hot_encoder import OneHotEncoder diff --git a/tpot/builtins/one_hot_encoder.py b/tpot/builtins/one_hot_encoder.py new file mode 100644 index 00000000..8fb5d6c3 --- /dev/null +++ b/tpot/builtins/one_hot_encoder.py @@ -0,0 +1,487 @@ +# -*- coding: utf-8 -*- + +"""Copyright (c) 2015 The auto-sklearn developers. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the auto-sklearn Developers nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. +""" + +import numpy as np +from scipy import sparse + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array + + +SPARSE_ENCODINGS = { + 'OTHER': 1, + 'NAN': 2, +} + + +def _auto_select_categorical_features(X, threshold=10): + """Make a feature mask of categorical features in X. 
+ + Features with less than 10 unique values are considered categorical. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + threshold : int + Maximum number of unique values per feature to consider the feature + to be categorical. + + Returns + ------- + feature_mask : array of booleans of size {n_features, } + """ + feature_mask = [] + + for column in range(X.shape[1]): + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + unique = np.unique(X.data[indptr_start:indptr_end]) + else: + unique = np.unique(X[:, column]) + + feature_mask.append(len(unique) <= threshold) + + return feature_mask + + +def _transform_selected(X, transform, selected, copy=True): + """Apply a transform function to portion of selected features. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + transform : callable + A callable transform(X) -> X_transformed + + copy : boolean, optional + Copy X even if it could be avoided. + + selected: "all", "auto" or array of indices or mask + Specify which features to apply the transform to. + + Returns + ------- + X : array or sparse matrix, shape=(n_samples, n_features_new) + """ + if selected == "all": + return transform(X) + if len(selected) == 0: + return X + + X = check_array(X, accept_sparse='csc', force_all_finite=False) + + n_features = X.shape[1] + ind = np.arange(n_features) + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(selected)] = True + not_sel = np.logical_not(sel) + n_selected = np.sum(sel) + + if n_selected == 0: + # No features selected. + return X + elif n_selected == n_features: + # All features selected. + return transform(X) + else: + X_sel = transform(X[:, ind[sel]]) + X_not_sel = X[:, ind[not_sel]] + + if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): + return sparse.hstack((X_sel, X_not_sel), format='csr') + else: + return np.hstack((X_sel, X_not_sel)) + + +class OneHotEncoder(BaseEstimator, TransformerMixin): + """Encode categorical integer features using a one-hot aka one-of-K scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by categorical (discrete) features. The output will be + a sparse matrix were each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Parameters + ---------- + + categorical_features: "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all': All features are treated as categorical. + - 'auto' (default): Select only features that have less than 10 unique values. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + dtype : number type, default=np.float + Desired dtype of output. + + sparse : boolean, default=True + Will return sparse matrix if set True else will return an array. + + threshold : int, default=10 + Maximum number of unique values per feature to consider the feature + to be categorical when categorical_features is 'auto' . 
+ + Attributes + ---------- + `active_features_` : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + `feature_indices_` : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + `n_values_` : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and two samples, we let the encoder + find the maximum value per feature and transform the data to a binary + one-hot encoding. + + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + OneHotEncoder(categorical_features='all', dtype=<... 'float'>, + sparse=True, minimum_fraction=None) + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 2, 5, 9]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + """ + + def __init__(self, categorical_features='auto', dtype=np.float, + sparse=True, minimum_fraction=None, threshold=10): + self.categorical_features = categorical_features + self.dtype = dtype + self.sparse = sparse + self.minimum_fraction = minimum_fraction + self.threshold = threshold + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. + + Returns + ------- + self + """ + self.fit_transform(X) + return self + + def _matrix_adjust(self, X): + """Adjust all values in X to encode for NaNs and infinities in the data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. + + Returns + ------- + X : array-like, shape=(n_samples, n_feature) + Input array without any NaNs or infinities. + """ + data_matrix = X.data if sparse.issparse(X) else X + + # Shift all values to specially encode for NAN/infinity/OTHER and 0 + # Old value New Value + # --------- --------- + # N (0..int_max) N + 3 + # np.NaN 2 + # infinity 2 + # *other* 1 + # + # A value of 0 is reserved, as that is specially handled in sparse + # matrices. + data_matrix += len(SPARSE_ENCODINGS) + 1 + data_matrix[~np.isfinite(data_matrix)] = SPARSE_ENCODINGS['NAN'] + + return X + + def _fit_transform(self, X): + """Assume X contains only categorical features. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. 
+ """ + X = self._matrix_adjust(X) + + X = check_array( + X, + accept_sparse='csc', + force_all_finite=False, + dtype=int + ) + + if X.min() < 0: + raise ValueError("X needs to contain only non-negative integers.") + + n_samples, n_features = X.shape + + # Remember which values should not be replaced by the value 'other' + if self.minimum_fraction is not None: + do_not_replace_by_other = list() + for column in range(X.shape[1]): + do_not_replace_by_other.append(list()) + + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + unique = np.unique(X.data[indptr_start:indptr_end]) + colsize = indptr_end - indptr_start + else: + unique = np.unique(X[:, column]) + colsize = X.shape[0] + + for unique_value in unique: + if np.isfinite(unique_value): + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + count = np.nansum(unique_value == + X.data[indptr_start:indptr_end]) + else: + count = np.nansum(unique_value == X[:, column]) + else: + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + count = np.nansum(~np.isfinite( + X.data[indptr_start:indptr_end])) + else: + count = np.nansum(~np.isfinite(X[:, column])) + + fraction = float(count) / colsize + if fraction >= self.minimum_fraction: + do_not_replace_by_other[-1].append(unique_value) + + for unique_value in unique: + if unique_value not in do_not_replace_by_other[-1]: + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + X.data[indptr_start:indptr_end][ + X.data[indptr_start:indptr_end] == + unique_value] = SPARSE_ENCODINGS['OTHER'] + else: + X[:, column][X[:, column] == unique_value] = SPARSE_ENCODINGS['OTHER'] + + self.do_not_replace_by_other_ = do_not_replace_by_other + + if sparse.issparse(X): + n_values = X.max(axis=0).toarray().flatten() + len(SPARSE_ENCODINGS) + else: + n_values = np.max(X, axis=0) + len(SPARSE_ENCODINGS) + + self.n_values_ = n_values + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + if sparse.issparse(X): + row_indices = X.indices + column_indices = [] + for i in range(len(X.indptr) - 1): + nbr = X.indptr[i+1] - X.indptr[i] + column_indices_ = [indices[i]] * nbr + column_indices_ += X.data[X.indptr[i]:X.indptr[i+1]] + column_indices.extend(column_indices_) + data = np.ones(X.data.size) + else: + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsc() + + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self.active_features_ = active_features + return out.tocsr() if self.sparse else out.toarray() + + def fit_transform(self, X, y=None): + """Fit OneHotEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. 
+ y: array-like {n_samples,} (Optional, ignored) + Feature labels + """ + if self.categorical_features == "auto": + self.categorical_features = _auto_select_categorical_features(X, threshold=self.threshold) + + return _transform_selected( + X, + self._fit_transform, + self.categorical_features, + copy=True + ) + + def _transform(self, X): + """Asssume X contains only categorical features. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + """ + X = self._matrix_adjust(X) + + X = check_array(X, accept_sparse='csc', force_all_finite=False, + dtype=int) + if X.min() < 0: + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + # Replace all indicators which were below `minimum_fraction` in the + # training set by 'other' + if self.minimum_fraction is not None: + for column in range(X.shape[1]): + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + unique = np.unique(X.data[indptr_start:indptr_end]) + else: + unique = np.unique(X[:, column]) + + for unique_value in unique: + if unique_value not in self.do_not_replace_by_other_[column]: + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + X.data[indptr_start:indptr_end][ + X.data[indptr_start:indptr_end] == + unique_value] = SPARSE_ENCODINGS['OTHER'] + else: + X[:, column][X[:, column] == unique_value] = SPARSE_ENCODINGS['OTHER'] + + if sparse.issparse(X): + n_values_check = X.max(axis=0).toarray().flatten() + 1 + else: + n_values_check = np.max(X, axis=0) + 1 + + # Replace all indicators which are out of bounds by 'other' (index 0) + if (n_values_check > self.n_values_).any(): + # raise ValueError("Feature out of bounds. Try setting n_values.") + for i, n_value_check in enumerate(n_values_check): + if (n_value_check - 1) >= self.n_values_[i]: + if sparse.issparse(X): + indptr_start = X.indptr[i] + indptr_end = X.indptr[i+1] + X.data[indptr_start:indptr_end][X.data[indptr_start:indptr_end] >= self.n_values_[i]] = 0 + else: + X[:, i][X[:, i] >= self.n_values_[i]] = 0 + + if sparse.issparse(X): + row_indices = X.indices + column_indices = [] + for i in range(len(X.indptr) - 1): + nbr = X.indptr[i + 1] - X.indptr[i] + column_indices_ = [indices[i]] * nbr + column_indices_ += X.data[X.indptr[i]:X.indptr[i + 1]] + column_indices.extend(column_indices_) + data = np.ones(X.data.size) + else: + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsc() + + out = out[:, self.active_features_] + return out.tocsr() if self.sparse else out.toarray() + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. 
+ """ + return _transform_selected( + X, self._transform, + self.categorical_features, + copy=True + ) diff --git a/tpot/config/__init__.py b/tpot/config/__init__.py index d57a7c6f..034defca 100644 --- a/tpot/config/__init__.py +++ b/tpot/config/__init__.py @@ -21,7 +21,9 @@ from .classifier_light import classifier_config_dict_light from .classifier_mdr import tpot_mdr_classifier_config_dict +from .classifier_sparse import classifier_config_sparse from .classifier import classifier_config_dict from .regressor_light import regressor_config_dict_light from .regressor_mdr import tpot_mdr_regressor_config_dict +from .regressor_sparse import regressor_config_sparse from .regressor import regressor_config_dict diff --git a/tpot/config/classifier.py b/tpot/config/classifier.py index e252ca5f..801db183 100644 --- a/tpot/config/classifier.py +++ b/tpot/config/classifier.py @@ -155,6 +155,11 @@ 'tpot.builtins.ZeroCount': { }, + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], + 'sparse': [False] + }, + # Selectors 'sklearn.feature_selection.SelectFwe': { 'alpha': np.arange(0, 0.05, 0.001), diff --git a/tpot/config/classifier_sparse.py b/tpot/config/classifier_sparse.py new file mode 100644 index 00000000..1d1788fc --- /dev/null +++ b/tpot/config/classifier_sparse.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . 
+""" + +import numpy as np + +classifier_config_sparse = { + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25] + }, + + 'sklearn.neighbors.KNeighborsClassifier': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform", "distance"], + 'p': [1, 2] + }, + + 'sklearn.ensemble.RandomForestClassifier': { + 'n_estimators': [100], + 'criterion': ["gini", "entropy"], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + }, + + 'sklearn.feature_selection.SelectFwe': { + 'alpha': np.arange(0, 0.05, 0.001), + 'score_func': { + 'sklearn.feature_selection.f_classif': None + } + }, + + 'sklearn.feature_selection.SelectPercentile': { + 'percentile': range(1, 100), + 'score_func': { + 'sklearn.feature_selection.f_classif': None + } + }, + + 'sklearn.feature_selection.VarianceThreshold': { + 'threshold': np.arange(0.05, 1.01, 0.05) + }, + + 'sklearn.feature_selection.RFE': { + 'step': np.arange(0.05, 1.01, 0.05), + 'estimator': { + 'sklearn.ensemble.ExtraTreesClassifier': { + 'n_estimators': [100], + 'criterion': ['gini', 'entropy'], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } + }, + + 'sklearn.feature_selection.SelectFromModel': { + 'threshold': np.arange(0, 1.01, 0.05), + 'estimator': { + 'sklearn.ensemble.ExtraTreesClassifier': { + 'n_estimators': [100], + 'criterion': ['gini', 'entropy'], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } + }, + + 'sklearn.linear_model.LogisticRegression': { + 'penalty': ["l1", "l2"], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.], + 'dual': [True, False] + }, + + 'sklearn.naive_bayes.BernoulliNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.naive_bayes.MultinomialNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.svm.LinearSVC': { + 'penalty': ["l1", "l2"], + 'loss': ["hinge", "squared_hinge"], + 'dual': [True, False], + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.] 
+ }, + + 'xgboost.XGBClassifier': { + 'n_estimators': [100], + 'max_depth': range(1, 11), + 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], + 'subsample': np.arange(0.05, 1.01, 0.05), + 'min_child_weight': range(1, 21), + 'nthread': [1] + } +} diff --git a/tpot/config/regressor.py b/tpot/config/regressor.py index a0e3cb4f..c64a45c2 100644 --- a/tpot/config/regressor.py +++ b/tpot/config/regressor.py @@ -23,7 +23,6 @@ regressor_config_dict = { - 'sklearn.linear_model.ElasticNetCV': { 'l1_ratio': np.arange(0.0, 1.01, 0.05), 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] @@ -91,7 +90,6 @@ 'sklearn.linear_model.RidgeCV': { }, - 'xgboost.XGBRegressor': { 'n_estimators': [100], 'max_depth': range(1, 11), @@ -155,18 +153,23 @@ 'tpot.builtins.ZeroCount': { }, + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], + 'sparse': [False] + }, + # Selectors 'sklearn.feature_selection.SelectFwe': { 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { - 'sklearn.feature_selection.f_classif': None + 'sklearn.feature_selection.f_regression': None } }, 'sklearn.feature_selection.SelectPercentile': { 'percentile': range(1, 100), 'score_func': { - 'sklearn.feature_selection.f_classif': None + 'sklearn.feature_selection.f_regression': None } }, diff --git a/tpot/config/regressor_light.py b/tpot/config/regressor_light.py index fa263b93..c259d293 100644 --- a/tpot/config/regressor_light.py +++ b/tpot/config/regressor_light.py @@ -105,14 +105,14 @@ 'sklearn.feature_selection.SelectFwe': { 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { - 'sklearn.feature_selection.f_classif': None + 'sklearn.feature_selection.f_regression': None } }, 'sklearn.feature_selection.SelectPercentile': { 'percentile': range(1, 100), 'score_func': { - 'sklearn.feature_selection.f_classif': None + 'sklearn.feature_selection.f_regression': None } }, diff --git a/tpot/config/regressor_sparse.py b/tpot/config/regressor_sparse.py new file mode 100644 index 00000000..34a93cec --- /dev/null +++ b/tpot/config/regressor_sparse.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- + +"""Copyright 2015-Present Randal S. Olson. + +This file is part of the TPOT library. + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . 
+""" + +import numpy as np + +regressor_config_sparse = { + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25] + }, + + 'sklearn.neighbors.KNeighborsRegressor': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform", "distance"], + 'p': [1, 2] + }, + + 'sklearn.ensemble.RandomForestRegressor': { + 'n_estimators': [100], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + }, + + 'sklearn.feature_selection.SelectFwe': { + 'alpha': np.arange(0, 0.05, 0.001), + 'score_func': { + 'sklearn.feature_selection.f_regression': None + } + }, + + 'sklearn.feature_selection.SelectPercentile': { + 'percentile': range(1, 100), + 'score_func': { + 'sklearn.feature_selection.f_regression': None + } + }, + + 'sklearn.feature_selection.VarianceThreshold': { + 'threshold': np.arange(0.05, 1.01, 0.05) + }, + + 'sklearn.feature_selection.SelectFromModel': { + 'threshold': np.arange(0, 1.01, 0.05), + 'estimator': { + 'sklearn.ensemble.ExtraTreesRegressor': { + 'n_estimators': [100], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } + }, + + 'sklearn.linear_model.ElasticNetCV': { + 'l1_ratio': np.arange(0.0, 1.01, 0.05), + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + }, + + 'sklearn.linear_model.RidgeCV': { + }, + + 'sklearn.svm.LinearSVR': { + 'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"], + 'dual': [True, False], + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.], + 'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1.] + }, + + 'xgboost.XGBRegressor': { + 'n_estimators': [100], + 'max_depth': range(1, 11), + 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], + 'subsample': np.arange(0.05, 1.01, 0.05), + 'min_child_weight': range(1, 21), + 'nthread': [1] + }} diff --git a/tpot/decorators.py b/tpot/decorators.py index 102b0c09..061d8c3f 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -83,17 +83,13 @@ def check_pipeline(self, *args, **kwargs): sklearn_pipeline.fit(pretest_X_reg, pretest_y_reg) bad_pipeline = False except BaseException as e: - if self.verbosity > 2: - message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( - n=num_test, - fname=func.__name__, - e=e - ) - # Use the pbar output stream if it's active - if not isinstance(self._pbar, type(None)): - self._pbar.write(message) - else: - print(message) + message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( + n=num_test, + fname=func.__name__, + e=e + ) + # Use the pbar output stream if it's active + self._update_pbar(pbar_num=0, pbar_msg=message) finally: num_test += 1 diff --git a/tpot/driver.py b/tpot/driver.py index d6385ee7..204ee5c0 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -20,9 +20,15 @@ """ import numpy as np +import pandas as pd import argparse from sklearn.model_selection import train_test_split +# for manual scoring function, see load_scoring_function +import sys +import os +from importlib import import_module + from .tpot import TPOTClassifier, TPOTRegressor from ._version import __version__ @@ -217,6 +223,11 @@ def _get_arg_parser(): 'Function used to evaluate the quality of a given pipeline for the ' 'problem. By default, accuracy is used for classification problems ' 'and mean squared error (mse) is used for regression problems. ' + + 'Note: If you wrote your own function, set this argument to mymodule.myfunction' + 'and TPOT will import your module and take the function from there.' 
+ 'TPOT will assume the module can be imported from the current workdir.' + 'TPOT assumes that any function with "error" or "loss" in the name ' 'is meant to be minimized, whereas any other functions will be ' 'maximized. Offers the same options as cross_val_score: ' @@ -271,6 +282,7 @@ def _get_arg_parser(): ) ) + parser.add_argument( '-njobs', action='store', @@ -323,6 +335,7 @@ def _get_arg_parser(): ) ) + parser.add_argument( '-config', action='store', @@ -337,6 +350,36 @@ def _get_arg_parser(): ) ) + + + parser.add_argument( + '-cf', + action='store', + dest='CHECKPOINT_FOLDER', + default=None, + type=str, + help=('If supplied, a folder in which tpot will periodically ' + 'save the best pipeline so far while optimizing. ' + 'This is useful in multiple cases: ' + 'sudden death before tpot could save an optimized pipeline, ' + 'progress tracking, ' + "grabbing a pipeline while it's still optimizing etc." + ) + ) + + parser.add_argument( + '-es', + action='store', + dest='EARLY_STOP', + default=None, + type=int, + help=( + 'How many generations TPOT checks whether there is no improvement ' + 'in optimization process. End optimization process if there is no improvement ' + 'in the set number of generations.' + ) + ) + parser.add_argument( '-v', action='store', @@ -383,19 +426,22 @@ def _print_args(args): arg_val = args.__dict__['POPULATION_SIZE'] else: arg_val = args.__dict__[arg] - print('{}\t=\t{}'.format(arg, arg_val)) + + # Pad the outputs with an even amount of space + arg = (arg + (' ') * 100)[:20] + arg_val = ((' ') * 5 + str(arg_val)) + print('{}={}'.format(arg, arg_val)) print('') def _read_data_file(args): - input_data = np.recfromcsv( + input_data = pd.read_csv( args.INPUT_FILE, - delimiter=args.INPUT_SEPARATOR, + sep=args.INPUT_SEPARATOR, dtype=np.float64, - case_sensitive=True ) - if args.TARGET_NAME not in input_data.dtype.names: + if args.TARGET_NAME not in input_data.columns.values: raise ValueError( 'The provided data file does not seem to have a target column. ' 'Please make sure to specify the target column using the -target ' @@ -404,22 +450,45 @@ def _read_data_file(args): return input_data -def main(): + +def load_scoring_function(scoring_func): + """ + converts mymodule.myfunc in the myfunc + object itself so tpot receives a scoring function + """ + if scoring_func and ("." 
in scoring_func): + try: + module_name, func_name = scoring_func.rsplit('.', 1) + + module_path = os.getcwd() + sys.path.insert(0, module_path) + scoring_func = getattr(import_module(module_name), func_name) + sys.path.pop(0) + + print('manual scoring function: {}'.format(scoring_func)) + print('taken from module: {}'.format(module_name)) + except Exception as e: + print('failed importing custom scoring function, error: {}'.format(str(e))) + raise ValueError(e) + + return scoring_func + + +def tpot_driver(args): """Perform a TPOT run.""" - args = _get_arg_parser().parse_args() if args.VERBOSITY >= 2: _print_args(args) input_data = _read_data_file(args) - features = np.delete( - input_data.view(np.float64).reshape(input_data.size, -1), - input_data.dtype.names.index(args.TARGET_NAME), - axis=1 - ) + features = input_data.drop(args.TARGET_NAME, axis=1).values training_features, testing_features, training_target, testing_target = \ - train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) + train_test_split(features, input_data[args.TARGET_NAME].values, random_state=args.RANDOM_STATE) + tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor + + scoring_func = load_scoring_function(args.SCORING_FN) + tpot_obj = tpot_type( generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, @@ -429,11 +498,13 @@ def main(): cv=args.NUM_CV_FOLDS, subsample=args.SUBSAMPLE, n_jobs=args.NUM_JOBS, - scoring=args.SCORING_FN, + scoring=scoring_func, max_time_mins=args.MAX_TIME_MINS, max_eval_time_mins=args.MAX_EVAL_MINS, random_state=args.RANDOM_STATE, config_dict=args.CONFIG_FILE, + periodic_checkpoint_folder=args.CHECKPOINT_FOLDER, + early_stop=args.EARLY_STOP, verbosity=args.VERBOSITY, disable_update_check=args.DISABLE_UPDATE_CHECK ) @@ -460,6 +531,9 @@ def main(): if args.OUTPUT_FILE != '': tpot_obj.export(args.OUTPUT_FILE) +def main(): + args = _get_arg_parser().parse_args() + tpot_driver(args) if __name__ == '__main__': main() diff --git a/tpot/export_utils.py b/tpot/export_utils.py index f262ca1e..79c26ddd 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -43,15 +43,15 @@ def get_by_name(opname, operators): if len(ret_op_classes) == 0: raise TypeError('Cannot found operator {} in operator dictionary'.format(opname)) elif len(ret_op_classes) > 1: - print( - 'Found multiple operator {} in operator dictionary. Please check ' + raise ValueError( + 'Found duplicate operators {} in operator dictionary. Please check ' 'your dictionary file.'.format(opname) ) ret_op_class = ret_op_classes[0] return ret_op_class -def export_pipeline(exported_pipeline, operators, pset): +def export_pipeline(exported_pipeline, operators, pset, impute=False, pipeline_score=None): """Generate source code for a TPOT Pipeline. 
Parameters @@ -60,6 +60,8 @@ def export_pipeline(exported_pipeline, operators, pset): The pipeline that is being exported operators: List of operator classes from operator library + pipeline_score: + Optional pipeline score to be saved to the exported file Returns ------- @@ -71,7 +73,7 @@ def export_pipeline(exported_pipeline, operators, pset): pipeline_tree = expr_to_tree(exported_pipeline, pset) # Have the exported code import all of the necessary modules and functions - pipeline_text = generate_import_code(exported_pipeline, operators) + pipeline_text = generate_import_code(exported_pipeline, operators, impute) pipeline_code = pipeline_code_wrapper(generate_export_pipeline_code(pipeline_tree, operators)) @@ -81,13 +83,26 @@ def export_pipeline(exported_pipeline, operators, pset): """ pipeline_text += """ -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +# NOTE: Make sure that the class is labeled 'target' in the data file +tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) +features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ - train_test_split(features, tpot_data['class'], random_state=42) + train_test_split(features, tpot_data['target'].values, random_state=42) """ + # Add the imputation step if it was used by TPOT + if impute: + pipeline_text += """ +imputer = Imputer(strategy="median") +imputer.fit(training_features) +training_features = imputer.transform(training_features) +testing_features = imputer.transform(testing_features) +""" + + if pipeline_score is not None: + pipeline_text += '\n# Score on the training set was:{}'.format(pipeline_score) + pipeline_text += '\n' + # Replace the function calls with their corresponding Python code pipeline_text += pipeline_code @@ -137,7 +152,7 @@ def prim_to_list(prim, args): return tree -def generate_import_code(pipeline, operators): +def generate_import_code(pipeline, operators, impute=False): """Generate all library import calls for use in TPOT.export(). Parameters @@ -146,6 +161,8 @@ def generate_import_code(pipeline, operators): List of operators in the current optimized pipeline operators: List of operator class from operator library + impute : bool + Whether to impute new values in the feature set. 
Returns ------- @@ -164,12 +181,16 @@ def merge_imports(old_dict, new_dict): old_dict[key] = set(new_dict[key]) operators_used = [x.name for x in pipeline if isinstance(x, deap.gp.Primitive)] - pipeline_text = 'import numpy as np\n\n' + pipeline_text = 'import numpy as np\nimport pandas as pd\n' pipeline_imports = _starting_imports(operators, operators_used) # Build dict of import requirments from list of operators import_relations = {op.__name__: op.import_hash for op in operators} + # Add the imputer if necessary + if impute: + pipeline_imports['sklearn.preprocessing'] = ['Imputer'] + # Build import dict from operators used for op in operators_used: try: @@ -200,7 +221,6 @@ def _starting_imports(operators, operators_used): else: num_op_root += 1 - if num_op_root > 1: return { 'sklearn.model_selection': ['train_test_split'], @@ -232,8 +252,7 @@ def pipeline_code_wrapper(pipeline_code): Source code for the sklearn pipeline and calls to fit and predict """ - return """ -exported_pipeline = {} + return """exported_pipeline = {} exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 4c31228d..dbfd82c1 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -28,14 +28,70 @@ from deap import tools, gp from inspect import isclass from .operator_utils import set_sample_weight -from sklearn.model_selection import cross_val_score -from sklearn.base import clone +from sklearn.utils import indexable +from sklearn.metrics.scorer import check_scoring +from sklearn.model_selection._validation import _fit_and_score +from sklearn.model_selection._split import check_cv + +from sklearn.base import clone, is_classifier from collections import defaultdict import warnings -import threading +from stopit import threading_timeoutable, TimeoutException + + +def pick_two_individuals_eligible_for_crossover(population): + """Pick two individuals from the population which can do crossover, that is, they share a primitive. + + Parameters + ---------- + population: array of individuals + + Returns + ---------- + tuple: (individual, individual) + Two individuals which are not the same, but share at least one primitive. + Alternatively, if no such pair exists in the population, (None, None) is returned instead. + """ + primitives_by_ind = [set([node.name for node in ind if isinstance(node, gp.Primitive)]) + for ind in population] + pop_as_str = [str(ind) for ind in population] -# Limit loops to generate a different individual by crossover/mutation -MAX_MUT_LOOPS = 50 + eligible_pairs = [(i, i+1+j) for i, ind1_prims in enumerate(primitives_by_ind) + for j, ind2_prims in enumerate(primitives_by_ind[i+1:]) + if not ind1_prims.isdisjoint(ind2_prims) and + pop_as_str[i] != pop_as_str[i+1+j]] + + # Pairs are eligible in both orders, this ensures that both orders are considered + eligible_pairs += [(j, i) for (i,j) in eligible_pairs] + + if not eligible_pairs: + # If there are no eligible pairs, the caller should decide what to do + return None, None + + pair = np.random.randint(0,len(eligible_pairs)) + idx1, idx2 = eligible_pairs[pair] + + return population[idx1], population[idx2] + + +def mutate_random_individual(population, toolbox): + """Picks a random individual from the population, and performs mutation on a copy of it. 
+ + Parameters + ---------- + population: array of individuals + + Returns + ---------- + individual: individual + An individual which is a mutated copy of one of the individuals in population, + the returned individual does not have fitness.values + """ + idx = np.random.randint(0,len(population)) + ind = population[idx] + ind, = toolbox.mutate(ind) + del ind.fitness.values + return ind def varOr(population, toolbox, lambda_, cxpb, mutpb): @@ -71,29 +127,21 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): 1 - *cxpb* - *mutpb*. """ offspring = [] + for _ in range(lambda_): op_choice = np.random.random() if op_choice < cxpb: # Apply crossover - idxs = np.random.randint(0, len(population), size=2) - ind1, ind2 = toolbox.clone(population[idxs[0]]), toolbox.clone(population[idxs[1]]) - ind_str = str(ind1) - num_loop = 0 - while ind_str == str(ind1) and num_loop < MAX_MUT_LOOPS: - ind1, ind2 = toolbox.mate(ind1, ind2) - num_loop += 1 - if ind_str != str(ind1): # check if crossover happened + ind1, ind2 = pick_two_individuals_eligible_for_crossover(population) + if ind1 is not None: + ind1, _ = toolbox.mate(ind1, ind2) del ind1.fitness.values + else: + # If there is no pair eligible for crossover, we still want to + # create diversity in the population, and do so by mutation instead. + ind1 = mutate_random_individual(population, toolbox) offspring.append(ind1) elif op_choice < cxpb + mutpb: # Apply mutation - idx = np.random.randint(0, len(population)) - ind = toolbox.clone(population[idx]) - ind_str = str(ind) - num_loop = 0 - while ind_str == str(ind) and num_loop < MAX_MUT_LOOPS: - ind, = toolbox.mutate(ind) - num_loop += 1 - if ind_str != str(ind): # check if mutation happened - del ind.fitness.values + ind = mutate_random_individual(population, toolbox) offspring.append(ind) else: # Apply reproduction idx = np.random.randint(0, len(population)) @@ -103,7 +151,7 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, - stats=None, halloffame=None, verbose=0, max_time_mins=None): + stats=None, halloffame=None, verbose=0, per_generation_function=None): """This is the :math:`(\mu + \lambda)` evolutionary algorithm. :param population: A list of individuals. :param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution @@ -119,6 +167,8 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, :param halloffame: A :class:`~deap.tools.HallOfFame` object that will contain the best individuals, optional. :param verbose: Whether or not to log the statistics. + :param per_generation_function: if supplied, call this function before each generation + used by tpot to save best pipeline before each new generation :returns: The final population :returns: A class:`~deap.tools.Logbook` with the statistics of the evolution. 
@@ -164,6 +214,9 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, # Begin the generational process for gen in range(1, ngen + 1): + # after each population save a periodic pipeline + if per_generation_function is not None: + per_generation_function() # Vary the population offspring = varOr(population, toolbox, lambda_, cxpb, mutpb) @@ -171,11 +224,9 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, # Evaluate the individuals with an invalid fitness invalid_ind = [ind for ind in offspring if not ind.fitness.valid] - # update pbar for valid_ind + # update pbar for valid individuals (with fitness values) if not pbar.disable: pbar.update(len(offspring)-len(invalid_ind)) - if not (max_time_mins is None) and pbar.n >= pbar.total: - pbar.total += lambda_ fitnesses = toolbox.evaluate(invalid_ind) for ind, fit in zip(invalid_ind, fitnesses): @@ -221,29 +272,17 @@ def cxOnePoint(ind1, ind2): :param ind2: Second tree participating in the crossover. :returns: A tuple of two trees. """ - # Define the name of type for any types. - __type__ = object - - if len(ind1) < 2 or len(ind2) < 2: - # No crossover on single node tree - return ind1, ind2 - # List all available primitive types in each individual types1 = defaultdict(list) types2 = defaultdict(list) - if ind1.root.ret == __type__: - # Not STGP optimization - types1[__type__] = range(1, len(ind1)) - types2[__type__] = range(1, len(ind2)) - common_types = [__type__] - else: - for idx, node in enumerate(ind1[1:], 1): - types1[node.ret].append(idx) - common_types = [] - for idx, node in enumerate(ind2[1:], 1): - if node.ret in types1 and node.ret not in types2: - common_types.append(node.ret) - types2[node.ret].append(idx) + + for idx, node in enumerate(ind1[1:], 1): + types1[node.ret].append(idx) + common_types = [] + for idx, node in enumerate(ind2[1:], 1): + if node.ret in types1 and node.ret not in types2: + common_types.append(node.ret) + types2[node.ret].append(idx) if len(common_types) > 0: type_ = np.random.choice(common_types) @@ -322,55 +361,56 @@ def mutNodeReplacement(individual, pset): return individual, -class Interruptable_cross_val_score(threading.Thread): - def __init__(self, *args, **kwargs): - threading.Thread.__init__(self) - self.args = args - self.kwargs = kwargs - self.result = -float('inf') - self._stopevent = threading.Event() - self.daemon = True - - def stop(self): - self._stopevent.set() - threading.Thread.join(self) - - def run(self): - # Note: changed name of the thread to "MainThread" to avoid such warning from joblib (maybe bugs) - # Note: Need attention if using parallel execution model of scikit-learn - threading.current_thread().name = 'MainThread' - try: - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.result = cross_val_score(*self.args, **self.kwargs) - except Exception as e: - pass - - +@threading_timeoutable(default="Timeout") def _wrapped_cross_val_score(sklearn_pipeline, features, target, - cv, scoring_function, sample_weight, - max_eval_time_mins, groups): - max_time_seconds = max(int(max_eval_time_mins * 60), 1) + cv, scoring_function, sample_weight=None, groups=None): + """Fit estimator and compute scores for a given dataset split. + Parameters + ---------- + sklearn_pipeline : pipeline object implementing 'fit' + The object to use to fit the data. + features : array-like of shape at least 2D + The data to fit. 
+ target : array-like, optional, default: None + The target variable to try to predict in the case of + supervised learning. + cv: int or cross-validation generator + If CV is a number, then it is the number of folds to evaluate each + pipeline over in k-fold cross-validation during the TPOT optimization + process. If it is an object then it is an object to be used as a + cross-validation generator. + scoring_function : callable + A scorer callable object / function with signature + ``scorer(estimator, X, y)``. + sample_weight : array-like, optional + List of sample weights to balance (or un-balanace) the dataset target as needed + groups: array-like {n_samples, }, optional + Group labels for the samples used while splitting the dataset into train/test set + """ sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) - # build a job for cross_val_score - tmp_it = Interruptable_cross_val_score( - clone(sklearn_pipeline), - features, - target, - scoring=scoring_function, - cv=cv, - n_jobs=1, - verbose=0, - fit_params=sample_weight_dict, - groups=groups - ) - tmp_it.start() - tmp_it.join(max_time_seconds) - - if tmp_it.isAlive(): - resulting_score = 'Timeout' - else: - resulting_score = np.mean(tmp_it.result) - - tmp_it.stop() - return resulting_score + + features, target, groups = indexable(features, target, groups) + + cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline)) + cv_iter = list(cv.split(features, target, groups)) + scorer = check_scoring(sklearn_pipeline, scoring=scoring_function) + + try: + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + scores = [_fit_and_score(estimator=clone(sklearn_pipeline), + X=features, + y=target, + scorer=scorer, + train=train, + test=test, + verbose=0, + parameters=None, + fit_params=sample_weight_dict) + for train, test in cv_iter] + CV_score = np.array(scores)[:, 0] + return np.nanmean(CV_score) + except TimeoutException: + return "Timeout" + except Exception as e: + return -float('inf') diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index f5228a52..3460d6d7 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -226,8 +226,6 @@ def export(cls, *args): dep_op_arguments = {} for arg_class, arg_value in zip(arg_types, args): - if arg_value == "DEFAULT": - continue aname_split = arg_class.__name__.split('__') if isinstance(arg_value, str): arg_value = '\"{}\"'.format(arg_value) @@ -236,19 +234,16 @@ def export(cls, *args): # Parameter of internal operator as a parameter in the # operator, usually in Selector else: - if not list(dep_op_list.values()).count(aname_split[1]): - raise TypeError('Warning: the operator {} is not in right format in the operator dictionary'.format(aname_split[0])) - else: - if aname_split[1] not in dep_op_arguments: - dep_op_arguments[aname_split[1]] = [] - dep_op_arguments[aname_split[1]].append("{}={}".format(aname_split[-1], arg_value)) + if aname_split[1] not in dep_op_arguments: + dep_op_arguments[aname_split[1]] = [] + dep_op_arguments[aname_split[1]].append("{}={}".format(aname_split[-1], arg_value)) tmp_op_args = [] if dep_op_list: # To make sure the inital operators is the first parameter just # for better persentation for dep_op_pname, dep_op_str in dep_op_list.items(): - if dep_op_str == 'f_classif': + if dep_op_pname == 'score_func': arg_value = dep_op_str else: arg_value = "{}({})".format(dep_op_str, ", ".join(dep_op_arguments[dep_op_str]))