Merge pull request #132 from perib/new_search_space_def

New search space def
EpistasisLab · May 16, 2024 · 2a3ee23 · 2a3ee23
2 parents 8728fff + 0b1aa09
commit 2a3ee23
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 30 deletions.
diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py
@@ -10,7 +10,7 @@ def get_LogisticRegression_ConfigurationSpace(n_samples, n_features, random_stat
 
     dual = n_samples<=n_features
 
-    dual = TRUE_SPECIAL_STRING if dual else FALSE_SPECIAL_STRING
+    dual = FALSE_SPECIAL_STRING
 
     space = {"solver":"saga",
                     "max_iter":1000,
@@ -21,6 +21,7 @@ def get_LogisticRegression_ConfigurationSpace(n_samples, n_features, random_stat
     penalty = Categorical('penalty', ['l1', 'l2',"elasticnet"], default='l2')
     C = Float('C',  (0.01, 1e5), log=True)
     l1_ratio = Float('l1_ratio', (0.0, 1.0))
+    class_weight = Categorical('class_weight', [NONE_SPECIAL_STRING, 'balanced'])
 
     l1_ratio_condition = EqualsCondition(l1_ratio, penalty, 'elasticnet')
 
@@ -29,7 +30,7 @@ def get_LogisticRegression_ConfigurationSpace(n_samples, n_features, random_stat
 
 
     cs = ConfigurationSpace(space)
-    cs.add_hyperparameters([penalty, C, l1_ratio])
+    cs.add_hyperparameters([penalty, C, l1_ratio, class_weight])
     cs.add_conditions([l1_ratio_condition])
 
     return cs
@@ -84,6 +85,7 @@ def get_DecisionTreeClassifier_ConfigurationSpace(n_featues, random_state):
         'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)),
         'max_features': Categorical("max_features", [NONE_SPECIAL_STRING, 'sqrt', 'log2']),
         'min_weight_fraction_leaf': 0.0,
+        'class_weight' : Categorical('class_weight', [NONE_SPECIAL_STRING, 'balanced']),
     }
 
 
@@ -94,7 +96,7 @@ def get_DecisionTreeClassifier_ConfigurationSpace(n_featues, random_state):
         space = space
     )
 
-
+#TODO Does not support predict_proba
 def get_LinearSVC_ConfigurationSpace(random_state):
     space = {"dual":"auto"}
 
@@ -120,12 +122,13 @@ def get_SVC_ConfigurationSpace(random_state):
             'max_iter': 3000,
             'probability':TRUE_SPECIAL_STRING}
 
-    kernel = Categorical("kernel", ['poly', 'rbf', 'sigmoid'])
+    kernel = Categorical("kernel", ['poly', 'rbf', 'sigmoid', 'linear'])
     C = Float('C',  (0.01, 1e5), log=True)
     degree = Integer("degree", bounds=(1, 5))
     gamma = Float("gamma", bounds=(1e-5, 8), log=True)
     shrinking = Categorical("shrinking", [True, False])
     coef0 = Float("coef0", bounds=(-1, 1))
+    class_weight = Categorical('class_weight', [NONE_SPECIAL_STRING, 'balanced'])
 
     degree_condition = EqualsCondition(degree, kernel, 'poly')
     gamma_condition = InCondition(gamma, kernel, ['rbf', 'poly'])
@@ -136,7 +139,7 @@ def get_SVC_ConfigurationSpace(random_state):
 
 
     cs = ConfigurationSpace(space)
-    cs.add_hyperparameters([kernel, C, coef0, degree, gamma, shrinking])
+    cs.add_hyperparameters([kernel, C, coef0, degree, gamma, shrinking, class_weight])
     cs.add_conditions([degree_condition, gamma_condition, coef0_condition])
 
     return cs
@@ -187,12 +190,11 @@ def get_XGBClassifier_ConfigurationSpace(random_state,):
 def get_LGBMClassifier_ConfigurationSpace(random_state,):
 
     space = {
-            'objective': 'binary',
-            'metric': 'binary_logloss',
             'boosting_type': Categorical("boosting_type", ['gbdt', 'dart', 'goss']),
             'num_leaves': Integer("num_leaves", bounds=(2, 256)),
             'max_depth': Integer("max_depth", bounds=(1, 10)),
             'n_estimators': Integer("n_estimators", bounds=(10, 100)),
+            'class_weight': Categorical("class_weight", [NONE_SPECIAL_STRING, 'balanced']),
             'verbose':-1,
             'n_jobs': 1,
         }
@@ -213,6 +215,7 @@ def get_ExtraTreesClassifier_ConfigurationSpace(random_state):
             'min_samples_split': Integer("min_samples_split", bounds=(2, 20)),
             'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)),
             'bootstrap': Categorical("bootstrap", [True, False]),
+            'class_weight': Categorical("class_weight", [NONE_SPECIAL_STRING, 'balanced']),
             'n_jobs': 1,
         }
 
@@ -228,7 +231,7 @@ def get_ExtraTreesClassifier_ConfigurationSpace(random_state):
 def get_SGDClassifier_ConfigurationSpace(random_state):
 
     space = {
-            'loss': Categorical("loss", ['squared_hinge', 'modified_huber']), #don't include hinge because we have LinearSVC, don't include log because we have LogisticRegression
+            'loss': Categorical("loss", ['modified_huber']), #don't include hinge because we have LinearSVC, don't include log because we have LogisticRegression. TODO 'squared_hinge'? doesn't support predict proba
             'penalty': 'elasticnet',
             'alpha': Float("alpha", bounds=(1e-5, 0.01), log=True),
             'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)),

diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py
@@ -113,8 +113,8 @@
         "selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",],
         "selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"],
         "selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"],
-        "classifiers" :  ["LGBMRegressor", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB',  "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
-        "regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor',  'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],
+        "classifiers" :  ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB',  "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
+        "regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor',  'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],
 
 
         "transformers":  ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],

diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py
@@ -64,16 +64,62 @@ def _mutate_step(self, rng=None):
 
 
     def _crossover(self, other, rng=None):
+        # Apply the crossover strategies in random order; return True at the first success, else False.
+        # default_rng(rng) reuses a passed Generator/seed and only creates a fresh one when rng is None.
+        rng = np.random.default_rng(rng)
+        cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step]
+
+        rng.shuffle(cx_funcs)
+        for cx_func in cx_funcs:
+            if cx_func(other, rng):
+                return True
+
+        return False
+
+    def _crossover_swap_random_steps(self, other, rng):
         rng = np.random.default_rng()
 
-        if len(self.pipeline) < 2 or len(other.pipeline) < 2:
-            return False
+        max_steps = int(min(len(self.pipeline), len(other.pipeline))/2)
+        max_steps = max(max_steps, 1)
+
+        if max_steps == 1:
+            n_steps_to_swap = 1
+        else:
+            n_steps_to_swap = rng.integers(1, max_steps)
 
-        idx = rng.integers(1,len(self.pipeline))
-        idx2 = rng.integers(1,len(other.pipeline))
-        self.pipeline[idx:] = copy.deepcopy(other.pipeline[idx2:])
+        other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False)
+        self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False)
+
+        # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace]
+
+        for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take):
+            self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx]
+
+        return True
+
+    def _crossover_swap_step(self, other, rng):
+        if len(self.pipeline) != len(other.pipeline):
+            return False
 
+        if len(self.pipeline) < 2:
+            return False
+
+        rng = np.random.default_rng(rng)  # reuse the caller's Generator/seed; fresh only if None
+        idx = rng.integers(1,len(self.pipeline))
+        # idx is drawn from [1, len-1], so the first step is never exchanged
+        self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx]
         return True
+
+    def _crossover_inner_step(self, other, rng):
+        # Per aligned step pair, with probability 0.5, delegate to that step's own crossover.
+        # zip stops at the shorter pipeline, so unequal lengths cannot index past `other`.
+        rng = np.random.default_rng(rng)  # reuse the caller's Generator/seed; fresh only if None
+        crossover_success = False
+        for self_step, other_step in zip(self.pipeline, other.pipeline):
+            if rng.random() < 0.5 and self_step.crossover(other_step, rng):
+                crossover_success = True
+
+        return crossover_success
 
     def export_pipeline(self, **graph_pipeline_args):
         return [step.export_pipeline(**graph_pipeline_args) for step in self.pipeline]

diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py
@@ -107,12 +107,19 @@ def _crossover_swap_random_steps(self, other, rng):
         max_steps = int(min(len(self.pipeline), len(other.pipeline))/2)
         max_steps = max(max_steps, 1)
 
-        n_steps_to_swap = rng.integers(1, max_steps)
+        if max_steps == 1:
+            n_steps_to_swap = 1
+        else:
+            n_steps_to_swap = rng.integers(1, max_steps)
 
         other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False)
         self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False)
 
-        self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace]
+        # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace]
+
+        for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take):
+            self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx]
+
         return True
 
 

diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py
@@ -33,9 +33,6 @@ def _crossover(self, other, rng=None):
         #swap a random step in the pipeline with the corresponding step in the other pipeline
         if len(self.pipeline) != len(other.pipeline):
             return False
-
-        if len(self.pipeline) < 2:
-            return False
 
         rng = np.random.default_rng()
         cx_funcs = [self._crossover_swap_random_steps, self._crossover_swap_segment, self._crossover_inner_step]
@@ -51,8 +48,6 @@ def _crossover_swap_step(self, other, rng):
         if len(self.pipeline) != len(other.pipeline):
             return False
 
-        if len(self.pipeline) < 2:
-            return False
 
         rng = np.random.default_rng()
         idx = rng.integers(1,len(self.pipeline))
@@ -61,12 +56,29 @@ def _crossover_swap_step(self, other, rng):
         return True
 
     def _crossover_swap_random_steps(self, other, rng):
+
+        if len(self.pipeline) != len(other.pipeline):
+            return False
+
+        if len(self.pipeline) < 2:
+            return False
+
         rng = np.random.default_rng()
-        #selet steps idxs with probability 0.5
-        idxs = rng.random(len(self.pipeline)) < 0.5
-        #swap steps
-        self.pipeline[idxs], other.pipeline[idxs] = other.pipeline[idxs], self.pipeline[idxs]
 
+        max_steps = int(min(len(self.pipeline), len(other.pipeline))/2)
+        max_steps = max(max_steps, 1)
+
+        if max_steps == 1:
+            n_steps_to_swap = 1
+        else:
+            n_steps_to_swap = rng.integers(1, max_steps)
+
+        indexes_to_swap = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False)
+
+        for idx in indexes_to_swap:
+            self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx]
+
+
         return True
 
     def _crossover_swap_segment(self, other, rng):
@@ -105,6 +117,8 @@ def unique_id(self):
         l = [step.unique_id() for step in self.pipeline]
         l = ["SequentialPipeline"] + l
         return TupleIndex(tuple(l))
+
+
 
 
 class SequentialPipeline(SklearnIndividualGenerator):

diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py
@@ -51,11 +51,23 @@ def _crossover_swap_step(self, other, rng):
 
     def _crossover_swap_random_steps(self, other, rng):
         rng = np.random.default_rng()
-        #selet steps idxs with probability 0.5
-        idxs = rng.random(len(self.pipeline)) < 0.5
-        #swap steps
-        self.pipeline[idxs], other.pipeline[idxs] = other.pipeline[idxs], self.pipeline[idxs]
 
+        max_steps = int(min(len(self.pipeline), len(other.pipeline))/2)
+        max_steps = max(max_steps, 1)
+
+        if max_steps == 1:
+            n_steps_to_swap = 1
+        else:
+            n_steps_to_swap = rng.integers(1, max_steps)
+
+        other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False)
+        self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False)
+
+        # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace]
+
+        for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take):
+            self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx]
+
         return True
 
     def _crossover_inner_step(self, other, rng):