Merge pull request #148 from EpistasisLab/dev

Dev
EpistasisLab · Sep 17, 2024 · 944699a · 944699a
2 parents 96ef8bb + 46f42bb
commit 944699a
Show file tree

Hide file tree

Showing 13 changed files with 2,293 additions and 66 deletions.
diff --git a/Tutorial/amltk_search_space_parser_example.ipynb b/Tutorial/amltk_search_space_parser_example.ipynb
diff --git a/setup.py b/setup.py
@@ -53,7 +53,8 @@ def calculate_version():
     extras_require={
         'skrebate': ['skrebate>=0.3.4'],
         'mdr': ['scikit-mdr>=0.4.4'],
-        'sklearnex' : ['scikit-learn-intelex>=2023.2.1']
+        'sklearnex' : ['scikit-learn-intelex>=2023.2.1'],
+        'amltk' : ['amltk>=1.12.1'],
     },
     classifiers=[
         'Intended Audience :: Science/Research',

diff --git a/tpot2/__init__.py b/tpot2/__init__.py
@@ -8,9 +8,9 @@
 from .population import Population
 
 from . import builtin_modules
-from . import utils
 from . import config
 from . import search_spaces
+from . import utils
 from . import evolvers
 from . import objectives
 from . import selectors

diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py
@@ -535,7 +535,7 @@ def MLPClassifier_hyperparameter_parser(params):
 def get_GaussianProcessClassifier_ConfigurationSpace(n_features, random_state):
     space = {
         'n_features': n_features,
-        'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
+        'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
         'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
         'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
     }

diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py
@@ -45,7 +45,8 @@
 from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these?
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
-from sklearn.impute import SimpleImputer
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
 
 all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor,  ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
                AdaBoostClassifier,MLPRegressor,
@@ -56,7 +57,7 @@
                GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
                Passthrough,SkipTransformer,
                PassKBinsDiscretizer,
-               SimpleImputer,
+               SimpleImputer, IterativeImputer, KNNImputer
                ]
 
 
@@ -124,7 +125,7 @@
         "all_transformers" : ["transformers", "scalers"],
 
         "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
-        "imputers": ["SimpleImputer"],
+        "imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
         "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
         "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
 
@@ -136,8 +137,6 @@
 
 def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_state=None):
     match name:
-        case "SimpleImputer":
-            return imputers.simple_imputer_cs
 
         #autoqtl_builtins.py
         case "FeatureEncodingFrequencySelector":
@@ -352,6 +351,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
             ) 
 
         #imputers.py
+        case "SimpleImputer":
+            return imputers.simple_imputer_cs
+        case "IterativeImputer":
+            return imputers.get_IterativeImputer_config_space(n_features=n_features, random_state=random_state)
+        case "KNNImputer":
+            return imputers.get_KNNImputer_config_space(n_samples=n_samples)
 
         #mdr_configs.py
         case "MDR":
@@ -401,12 +406,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
     raise ValueError(f"Could not find configspace for {name}")
 
 
-def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True):
+def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True, base_node=EstimatorNode):
 
 
     #if list of names, return a list of EstimatorNodes
     if isinstance(name, list) or isinstance(name, np.ndarray):
-        search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False) for n in name]
+        search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False, base_node=base_node) for n in name]
         #remove Nones
         search_spaces = [s for s in search_spaces if s is not None]
 
@@ -417,12 +422,12 @@ def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_st
 
     if name in GROUPNAMES:
         name_list = GROUPNAMES[name]
-        return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline)
+        return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline, base_node=base_node)
 
-    return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
+    return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node)
 
 
-def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None):
+def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode):
 
     #these are wrappers that take in another estimator as a parameter
     # TODO Add AdaBoostRegressor, AdaBoostClassifier as wrappers? wrap a decision tree with different params?
@@ -443,43 +448,52 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None
         sfm_sp = get_configspace(name="SelectFromModel", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
         ext = get_node("ExtraTreesRegressor", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
         return WrapperPipeline(estimator_search_space=ext, method=SelectFromModel, space=sfm_sp)
-
+    # TODO Add IterativeImputer with more estimator methods
+    '''
+    if name == "IterativeImputer_learnedestimators":
+        iterative_sp = get_configspace(name="IterativeImputer", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        regressor_searchspace = get_search_space(["LinearRegression", ...], n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        return WrapperPipeline(estimator_search_space=regressor_searchspace, method=IterativeImputer, space=iterative_sp)
+    '''
     #these are nodes that have special search spaces which require custom parsing of the hyperparameters
+    if name == "IterativeImputer":
+        configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=imputers.IterativeImputer_hyperparameter_parser)
     if name == "RobustScaler":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
     if name == "GradientBoostingClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
     if name == "HistGradientBoostingClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
     if name == "GradientBoostingRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
     if  name == "HistGradientBoostingRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
     if name == "MLPClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
     if name == "MLPRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
     if name == "GaussianProcessRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
     if name == "GaussianProcessClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
     if name == "FeatureAgglomeration":
         configspace = get_configspace(name, n_features=n_features)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)
 
     configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
     if configspace is None:
         #raise warning
         warnings.warn(f"Could not find configspace for {name}")
         return None
 
-    return EstimatorNode(STRING_TO_CLASS[name], configspace)
+    return base_node(STRING_TO_CLASS[name], configspace)
diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py
@@ -1,9 +1,80 @@
+import sklearn
+import sklearn.ensemble
+import sklearn.linear_model
+import sklearn.neighbors
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
+from ConfigSpace import EqualsCondition
+
 
 simple_imputer_cs = ConfigurationSpace(
     space = {
-        'strategy' : Categorical('strategy', ['mean','median', 'most_frequent', ]),
-        'add_indicator' : Categorical('add_indicator', [True, False]), 
+        # Only the imputation strategy is searched for SimpleImputer.
+        'strategy' : Categorical('strategy', 
+                                 ['mean','median', 'most_frequent', 'constant']
+                                 ),
+        # 'add_indicator' is deliberately excluded from the search space: it
+        # appends a missing-value mask next to the rest of the data and can
+        # cause errors. gk
+        #'add_indicator' : Categorical('add_indicator', [True, False]), 
+    }
+)
+
+def get_IterativeImputer_config_space(n_features, random_state):
+    """Build the ConfigurationSpace searched for sklearn's IterativeImputer.
+
+    Parameters
+    ----------
+    n_features
+        Upper bound of the 'n_nearest_features' integer hyperparameter
+        (the lower bound is 1).
+    random_state
+        When not None, stored in the space under 'random_state'; when None
+        the key is left out entirely.
+
+    Returns
+    -------
+    ConfigurationSpace
+        Space holding 'initial_strategy', 'n_nearest_features',
+        'imputation_order', 'estimator' and the conditional
+        'sample_posterior'. 'estimator' is a string label ('Bayesian', 'RFR',
+        'Ridge' or 'KNN'), not an estimator object.
+    """
+    space = { 'initial_strategy' : Categorical('initial_strategy', 
+                                             ['mean', 'median', 
+                                              'most_frequent', 'constant']),
+                'n_nearest_features' : Integer('n_nearest_features', 
+                                           bounds=(1, n_features)),
+                'imputation_order' : Categorical('imputation_order', 
+                                             ['ascending', 'descending', 
+                                              'roman', 'arabic', 'random']),
     }
-)
+
+    # 'estimator' and 'sample_posterior' are created separately from `space`
+    # so that a condition can be attached to them below.
+    estimator = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN'])  
+    sample_posterior = Categorical('sample_posterior', [True, False])
+    # 'sample_posterior' is only active when 'estimator' equals 'Bayesian'.
+    sampling_condition = EqualsCondition(sample_posterior, estimator, 'Bayesian')
+
+    if random_state is not None: 
+            # Only add random_state when it is set: ConfigSpace does not allow
+            # None as a value.
+            space['random_state'] = random_state
+
+    cs = ConfigurationSpace(space=space)
+    cs.add_hyperparameters([estimator, sample_posterior])
+    cs.add_conditions([sampling_condition])
+    return cs
+
+def get_KNNImputer_config_space(n_samples):
+    """Return the ConfigurationSpace searched for sklearn's KNNImputer.
+
+    'n_neighbors' ranges over the integers from 1 to max(n_samples, 100) and
+    'weights' is either 'uniform' or 'distance'.
+    """
+    # NOTE(review): max() lets the upper bound exceed n_samples on small
+    # datasets and grow without limit on large ones; confirm that min() was
+    # not intended.
+    neighbor_cap = max(n_samples, 100)
+    n_neighbors = Integer('n_neighbors', bounds=(1, neighbor_cap))
+    weights = Categorical('weights', ['uniform', 'distance'])
+    return ConfigurationSpace(space={'n_neighbors': n_neighbors, 'weights': weights})
+
+def IterativeImputer_hyperparameter_parser(params):
+    """Translate sampled hyperparameters into IterativeImputer keyword arguments.
+
+    The search space stores the estimator as a string label; this replaces the
+    label with a default-constructed sklearn regressor and copies the
+    remaining values through unchanged.
+
+    Parameters
+    ----------
+    params
+        Mapping that must contain 'estimator', 'initial_strategy',
+        'n_nearest_features' and 'imputation_order'. 'sample_posterior' and
+        'random_state' are copied only when present.
+
+    Returns
+    -------
+    dict
+        Keyword arguments in which 'estimator' holds an estimator instance.
+
+    Raises
+    ------
+    ValueError
+        If params['estimator'] is not one of 'Bayesian', 'RFR', 'Ridge', 'KNN'.
+    """
+    est = params['estimator']
+    match est:
+        case 'Bayesian':
+            estimator = sklearn.linear_model.BayesianRidge()
+        case 'RFR':
+            estimator = sklearn.ensemble.RandomForestRegressor()
+        case 'Ridge':
+            estimator = sklearn.linear_model.Ridge()
+        case 'KNN':
+            estimator = sklearn.neighbors.KNeighborsRegressor()
+        case _:
+            # Without this branch an unknown label left `estimator` unbound and
+            # only surfaced as an UnboundLocalError when the dict was built.
+            raise ValueError(f"Unknown IterativeImputer estimator: {est!r}")
+
+    final_params = {
+        'estimator': estimator,
+        'initial_strategy': params['initial_strategy'],
+        'n_nearest_features': params['n_nearest_features'],
+        'imputation_order': params['imputation_order'],
+    }
+
+    # 'sample_posterior' is conditional in the search space and 'random_state'
+    # is only set when a seed was supplied, so either may be missing.
+    for optional_key in ('sample_posterior', 'random_state'):
+        if optional_key in params:
+            final_params[optional_key] = params[optional_key]
+
+    return final_params
diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py
@@ -354,7 +354,7 @@ def get_ExtraTreesRegressor_ConfigurationSpace(random_state):
 def get_GaussianProcessRegressor_ConfigurationSpace(n_features, random_state):
     space = {
         'n_features': n_features,
-        'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
+        'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
         'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
         'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
     }

diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py
@@ -1,18 +1,10 @@
 import tpot2
-import numpy as np
-import pandas as pd
 import sklearn
-from tpot2 import config
-from typing import Generator, List, Tuple, Union
-import random
 from sklearn.base import BaseEstimator
 import sklearn
 import networkx as nx
 from . import graph_utils
 from typing import final
-from abc import ABC, abstractmethod
-
-