From 102cdd537cbf8d57d25ee65cf65cd6c832ac7fce Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 17:07:10 -0700 Subject: [PATCH 01/12] removed unused files, passed rng into default_rng --- .../nodes/estimator_node_custom_sampler.py | 59 ------- .../nodes/estimator_node_simple.py | 70 -------- .../nodes/genetic_feature_selection.py | 8 +- .../search_spaces/pipelines/dynamic_linear.py | 18 +- .../pipelines/dynamicunion copy.py | 165 ++++++++++++++++++ tpot2/search_spaces/pipelines/dynamicunion.py | 18 +- .../pipelines/genetic_sample_weight.py | 1 - .../pipelines/hierarchical_individual.py | 1 - tpot2/search_spaces/pipelines/sequential.py | 12 +- tpot2/search_spaces/pipelines/union.py | 10 +- 10 files changed, 198 insertions(+), 164 deletions(-) delete mode 100644 tpot2/search_spaces/nodes/estimator_node_custom_sampler.py delete mode 100644 tpot2/search_spaces/nodes/estimator_node_simple.py create mode 100644 tpot2/search_spaces/pipelines/dynamicunion copy.py delete mode 100644 tpot2/search_spaces/pipelines/genetic_sample_weight.py delete mode 100644 tpot2/search_spaces/pipelines/hierarchical_individual.py diff --git a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py b/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py deleted file mode 100644 index 93a55a4e..00000000 --- a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py +++ /dev/null @@ -1,59 +0,0 @@ -# try https://automl.github.io/ConfigSpace/main/api/hyperparameters.html -import tpot2 -import numpy as np -import pandas as pd -import sklearn -from tpot2 import config -from typing import Generator, List, Tuple, Union -import random -from ..base import SklearnIndividual, SklearnIndividualGenerator, check_same_subclass -from ConfigSpace import ConfigurationSpace - - -class EstimatorNodeCustomIndividual(SklearnIndividual): - def __init__(self, method: type, - sample_func : callable, - rng=None) -> None: - super().__init__() - self.method = method - self.sample_func = sample_func - - self.hyperparameters = self.sample_func(rng) - - def mutate(self, rng=None): - rng = np.random.default_rng(rng) - self.hyperparameters = self.sample_func(rng) - return True - - def _crossover(self, other, rng=None): - rng = np.random.default_rng(rng) - if self.method != other.method: - return False - - #loop through hyperparameters, randomly swap items in self.hyperparameters with items in other.hyperparameters - for hyperparameter in self.space: - if rng.choice([True, False]): - if hyperparameter in other.hyperparameters: - self.hyperparameters[hyperparameter] = other.hyperparameters[hyperparameter] - - def export_pipeline(self, **kwargs): - return self.method(**self.hyperparameters) - - def unique_id(self): - #return a dictionary of the method and the hyperparameters - method_str = self.method.__name__ - params = list(self.hyperparameters.keys()) - params = sorted(params) - - id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" - - return id_str - -class EstimatorNodeCustom(SklearnIndividualGenerator): - def __init__(self, method : type, - sample_func: callable): - self.method = method - self.sample_func = sample_func - - def generate(self, rng=None): - return EstimatorNodeCustomIndividual(self.method, self.sample_func) \ No newline at end of file diff --git a/tpot2/search_spaces/nodes/estimator_node_simple.py b/tpot2/search_spaces/nodes/estimator_node_simple.py deleted file mode 100644 index 8063526a..00000000 --- a/tpot2/search_spaces/nodes/estimator_node_simple.py +++ /dev/null @@ -1,70 +0,0 @@ -# try https://automl.github.io/ConfigSpace/main/api/hyperparameters.html -import tpot2 -import numpy as np -import pandas as pd -import sklearn -from tpot2 import config -from typing import Generator, List, Tuple, Union -import random -from ..base import SklearnIndividual, SklearnIndividualGenerator - -class EstimatorNodeIndividual(SklearnIndividual): - def __init__(self, method, space ) -> None: - super().__init__() - self.method = method - self.space = space #a dictionary. keys are hyperparameters, values are the space of the hyperparameter. If list, then hyperparameter is categorical. If tuple, then hyperparameter is continuous. If single value, then hyperparameter is fixed. - - self._mutate_hyperparameters() - - def mutate(self, rng=None): - rng = np.random.default_rng(rng) - return self._mutate_hyperparameters(rng) - - def _mutate_hyperparameters(self, rng=None): - rng = np.random.default_rng(rng) - self.hyperparameters = {} - #sample new hyperparameters from the space - for hyperparameter in self.space: - hyperparameter_space = self.space[hyperparameter] - if isinstance(hyperparameter_space, list): - hp = rng.choice(hyperparameter_space) - elif isinstance(hyperparameter_space, tuple): - hp = rng.uniform(hyperparameter_space[0], hyperparameter_space[1]) - else: - hp = hyperparameter_space - - self.hyperparameters[hyperparameter] = hp - - return True - - def _crossover(self, other, rng=None): - rng = np.random.default_rng(rng) - if self.method != other.method: - return False - - #loop through hyperparameters, randomly swap items in self.hyperparameters with items in other.hyperparameters - for hyperparameter in self.space: - if rng.choice([True, False]): - if hyperparameter in other.hyperparameters: - self.hyperparameters[hyperparameter] = other.hyperparameters[hyperparameter] - - def export_pipeline(self, **kwargs): - return self.method(**self.hyperparameters) - - def unique_id(self): - #return a dictionary of the method and the hyperparameters - method_str = self.method.__name__ - params = list(self.hyperparameters.keys()) - params = sorted(params) - - id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" - - return id_str - -class EstimatorNode(SklearnIndividualGenerator): - def __init__(self, method, space): - self.method = method - self.space = space - - def generate(self, rng=None): - return EstimatorNodeIndividual(self.method, self.space) \ No newline at end of file diff --git a/tpot2/search_spaces/nodes/genetic_feature_selection.py b/tpot2/search_spaces/nodes/genetic_feature_selection.py index 0fe16586..9e36e666 100644 --- a/tpot2/search_spaces/nodes/genetic_feature_selection.py +++ b/tpot2/search_spaces/nodes/genetic_feature_selection.py @@ -31,16 +31,16 @@ def __init__( self, start_p=0.2, mutation_rate = 0.5, crossover_rate = 0.5, - mutation_rate_rate = 0, - crossover_rate_rate = 0, rng=None, ): self.start_p = start_p self.mutation_rate = mutation_rate self.crossover_rate = crossover_rate - self.mutation_rate_rate = mutation_rate_rate - self.crossover_rate_rate = crossover_rate_rate + self.mutation_rate_rate = 0 + self.crossover_rate_rate = 0 + + rng = np.random.default_rng(rng) diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py index 6da90d81..79ccedef 100644 --- a/tpot2/search_spaces/pipelines/dynamic_linear.py +++ b/tpot2/search_spaces/pipelines/dynamic_linear.py @@ -26,7 +26,7 @@ def __init__(self, search_space : SklearnIndividualGenerator, max_length: int , self.pipeline = self._generate_pipeline(rng) def _generate_pipeline(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) pipeline = [] length = rng.integers(self.min_length, self.max_length) length = min(length, 3) @@ -37,7 +37,7 @@ def _generate_pipeline(self, rng=None): def mutate(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) options = [] if len(self.pipeline) > self.min_length: options.append(self._mutate_remove_node) @@ -48,19 +48,19 @@ def mutate(self, rng=None): return rng.choice(options)(rng) def _mutate_add_node(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) new_node = self.search_space.generate(rng) idx = rng.integers(len(self.pipeline)) self.pipeline.insert(idx, new_node) def _mutate_remove_node(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) idx = rng.integers(len(self.pipeline)) self.pipeline.pop(idx) def _mutate_step(self, rng=None): #choose a random step in the pipeline and mutate it - rng = np.random.default_rng() + rng = np.random.default_rng(rng) step = rng.choice(self.pipeline) return step.mutate(rng) @@ -68,7 +68,7 @@ def _mutate_step(self, rng=None): def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline - rng = np.random.default_rng() + rng = np.random.default_rng(rng) cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] rng.shuffle(cx_funcs) @@ -79,7 +79,7 @@ def _crossover(self, other, rng=None): return False def _crossover_swap_random_steps(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) max_steps = max(max_steps, 1) @@ -106,14 +106,14 @@ def _crossover_swap_step(self, other, rng): if len(self.pipeline) < 2: return False - rng = np.random.default_rng() + rng = np.random.default_rng(rng) idx = rng.integers(1,len(self.pipeline)) self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True def _crossover_inner_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) pipeline1_indexes= list(range(len(self.pipeline))) pipeline2_indexes= list(range(len(other.pipeline))) diff --git a/tpot2/search_spaces/pipelines/dynamicunion copy.py b/tpot2/search_spaces/pipelines/dynamicunion copy.py new file mode 100644 index 00000000..7951c25c --- /dev/null +++ b/tpot2/search_spaces/pipelines/dynamicunion copy.py @@ -0,0 +1,165 @@ +import tpot2 +import numpy as np +import pandas as pd +import sklearn +from tpot2 import config +from typing import Generator, List, Tuple, Union +import random +from ..base import SklearnIndividual, SklearnIndividualGenerator +from ..tuple_index import TupleIndex + +class DynamicUnionPipelineIndividual(SklearnIndividual): + """ + Takes in one search space. + Will produce a FeatureUnion of up to max_estimators number of steps. + The output of the FeatureUnion will the all of the steps concatenated together. + + """ + + def __init__(self, search_space : SklearnIndividualGenerator, max_estimators=None, rng=None) -> None: + super().__init__() + self.search_space = search_space + + if max_estimators is None: + self.max_estimators = np.inf + else: + self.max_estimators = max_estimators + + self.pipeline = [] + + if self.max_estimators == np.inf: + init_max = 3 + else: + init_max = self.max_estimators + + rng = np.random.default_rng(rng) + + for _ in range(rng.integers(1, init_max)): + self.pipeline.append(self.search_space.generate(rng)) + + def mutate(self, rng=None): + rng = np.random.default_rng(rng) + mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_inner_step] + rng.shuffle(mutation_funcs) + for mutation_func in mutation_funcs: + if mutation_func(rng): + return True + + def _mutate_add_step(self, rng): + rng = np.random.default_rng(rng) + if len(self.pipeline) < self.max_estimators: + self.pipeline.append(self.search_space.generate(rng)) + return True + return False + + def _mutate_remove_step(self, rng): + rng = np.random.default_rng(rng) + if len(self.pipeline) > 1: + self.pipeline.pop(rng.integers(0, len(self.pipeline))) + return True + return False + + def _mutate_replace_step(self, rng): + rng = np.random.default_rng(rng) + idx = rng.integers(0, len(self.pipeline)) + self.pipeline[idx] = self.search_space.generate(rng) + return True + + #TODO mutate one step or multiple? + def _mutate_inner_step(self, rng): + rng = np.random.default_rng(rng) + indexes = rng.random(len(self.pipeline)) < 0.5 + indexes = np.where(indexes)[0] + mutated = False + if len(indexes) > 0: + for idx in indexes: + if self.pipeline[idx].mutate(rng): + mutated = True + else: + mutated = self.pipeline[rng.integers(0, len(self.pipeline))].mutate(rng) + + return mutated + + + def _crossover(self, other, rng=None): + rng = np.random.default_rng(rng) + + cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] + rng.shuffle(cx_funcs) + for cx_func in cx_funcs: + if cx_func(other, rng): + return True + + return False + + def _crossover_swap_step(self, other, rng): + rng = np.random.default_rng(rng) + idx = rng.integers(1,len(self.pipeline)) + idx2 = rng.integers(1,len(other.pipeline)) + + self.pipeline[idx], other.pipeline[idx2] = other.pipeline[idx2], self.pipeline[idx] + # self.pipeline[idx] = other.pipeline[idx2] + return True + + def _crossover_swap_random_steps(self, other, rng): + rng = np.random.default_rng(rng) + + max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) + max_steps = max(max_steps, 1) + + if max_steps == 1: + n_steps_to_swap = 1 + else: + n_steps_to_swap = rng.integers(1, max_steps) + + other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False) + self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False) + + # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace] + + for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take): + self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx] + + return True + + + + def _crossover_inner_step(self, other, rng): + rng = np.random.default_rng(rng) + + #randomly select pairs of steps to crossover + indexes = list(range(1, len(self.pipeline))) + other_indexes = list(range(1, len(other.pipeline))) + #shuffle + rng.shuffle(indexes) + rng.shuffle(other_indexes) + + crossover_success = False + for idx, other_idx in zip(indexes, other_indexes): + if self.pipeline[idx].crossover(other.pipeline[other_idx], rng): + crossover_success = True + + return crossover_success + + def export_pipeline(self): + return sklearn.pipeline.make_union(*[step.export_pipeline() for step in self.pipeline]) + + def unique_id(self): + l = [step.unique_id() for step in self.pipeline] + # if all items are strings, then sort them + if all([isinstance(x, str) for x in l]): + l.sort() + l = ["FeatureUnion"] + l + return TupleIndex(tuple(l)) + + +class DynamicUnionPipeline(SklearnIndividualGenerator): + def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None: + """ + Takes in a list of search spaces. will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index. + """ + + self.search_spaces = search_spaces + + def generate(self, rng=None): + return DynamicUnionPipelineIndividual(self.search_spaces) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index 48fa9669..401c16ef 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -41,7 +41,7 @@ def __init__(self, search_space : SklearnIndividualGenerator, max_estimators=Non def mutate(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_inner_step] rng.shuffle(mutation_funcs) for mutation_func in mutation_funcs: @@ -49,7 +49,7 @@ def mutate(self, rng=None): return True def _mutate_add_step(self, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) max_attempts = 10 if len(self.union_dict) < self.max_estimators: for _ in range(max_attempts): @@ -60,20 +60,20 @@ def _mutate_add_step(self, rng): return False def _mutate_remove_step(self, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) if len(self.union_dict) > 1: self.union_dict.pop( rng.choice(list(self.union_dict.keys()))) return True return False def _mutate_replace_step(self, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) changed = self._mutate_remove_step(rng) or self._mutate_add_step(rng) return changed #TODO mutate one step or multiple? def _mutate_inner_step(self, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) changed = False values = list(self.union_dict.values()) for step in values: @@ -86,7 +86,7 @@ def _mutate_inner_step(self, rng): def _crossover(self, other, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] rng.shuffle(cx_funcs) @@ -97,7 +97,7 @@ def _crossover(self, other, rng=None): return False def _crossover_swap_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) changed = False self_step = rng.choice(list(self.union_dict.values())) @@ -118,7 +118,7 @@ def _crossover_swap_step(self, other, rng): def _crossover_swap_random_steps(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) self_values = list(self.union_dict.values()) other_values = list(other.union_dict.values()) @@ -137,7 +137,7 @@ def _crossover_swap_random_steps(self, other, rng): def _crossover_inner_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) changed = False self_values = list(self.union_dict.values()) diff --git a/tpot2/search_spaces/pipelines/genetic_sample_weight.py b/tpot2/search_spaces/pipelines/genetic_sample_weight.py deleted file mode 100644 index db731a85..00000000 --- a/tpot2/search_spaces/pipelines/genetic_sample_weight.py +++ /dev/null @@ -1 +0,0 @@ -from ..base import SklearnIndividual, SklearnIndividualGenerator \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/hierarchical_individual.py b/tpot2/search_spaces/pipelines/hierarchical_individual.py deleted file mode 100644 index db731a85..00000000 --- a/tpot2/search_spaces/pipelines/hierarchical_individual.py +++ /dev/null @@ -1 +0,0 @@ -from ..base import SklearnIndividual, SklearnIndividualGenerator \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py index da52222a..7a7e6a99 100644 --- a/tpot2/search_spaces/pipelines/sequential.py +++ b/tpot2/search_spaces/pipelines/sequential.py @@ -25,7 +25,7 @@ def __init__(self, search_spaces : List[SklearnIndividualGenerator], memory=None #TODO, mutate all steps or just one? def mutate(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) # mutated = False # for step in self.pipeline: @@ -43,7 +43,7 @@ def _crossover(self, other, rng=None): if len(self.pipeline) != len(other.pipeline): return False - rng = np.random.default_rng() + rng = np.random.default_rng(rng) cx_funcs = [self._crossover_swap_random_steps, self._crossover_swap_segment, self._crossover_inner_step] rng.shuffle(cx_funcs) @@ -58,7 +58,7 @@ def _crossover_swap_step(self, other, rng): return False - rng = np.random.default_rng() + rng = np.random.default_rng(rng) idx = rng.integers(1,len(self.pipeline)) self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] @@ -72,7 +72,7 @@ def _crossover_swap_random_steps(self, other, rng): if len(self.pipeline) < 2: return False - rng = np.random.default_rng() + rng = np.random.default_rng(rng) max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) max_steps = max(max_steps, 1) @@ -97,7 +97,7 @@ def _crossover_swap_segment(self, other, rng): if len(self.pipeline) < 2: return False - rng = np.random.default_rng() + rng = np.random.default_rng(rng) idx = rng.integers(1,len(self.pipeline)) left = rng.choice([True, False]) @@ -109,7 +109,7 @@ def _crossover_swap_segment(self, other, rng): return True def _crossover_inner_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) # crossover_success = False # for idx in range(len(self.pipeline)): diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py index 1e1a58c8..32f988e6 100644 --- a/tpot2/search_spaces/pipelines/union.py +++ b/tpot2/search_spaces/pipelines/union.py @@ -25,14 +25,14 @@ def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) - self.pipeline.append(space.generate(rng)) def mutate(self, rng=None): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) step = rng.choice(self.pipeline) return step.mutate(rng) def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline - rng = np.random.default_rng() + rng = np.random.default_rng(rng) cx_funcs = [self._crossover_inner_step] rng.shuffle(cx_funcs) @@ -43,14 +43,14 @@ def _crossover(self, other, rng=None): return False def _crossover_swap_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) idx = rng.integers(1,len(self.pipeline)) self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True def _crossover_swap_random_steps(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) max_steps = max(max_steps, 1) @@ -71,7 +71,7 @@ def _crossover_swap_random_steps(self, other, rng): return True def _crossover_inner_step(self, other, rng): - rng = np.random.default_rng() + rng = np.random.default_rng(rng) crossover_success = False for idx in range(len(self.pipeline)): From 2b59ec8c783cb87457cbad467bb32f6a98e5fcae Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 17:36:36 -0700 Subject: [PATCH 02/12] wrapper now crossover over hyperparametesr --- tpot2/search_spaces/pipelines/wrapper.py | 53 +++++++++++++++++++----- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 1cd33bf3..7b49e182 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -9,6 +9,11 @@ from ConfigSpace import ConfigurationSpace from ..tuple_index import TupleIndex +NONE_SPECIAL_STRING = "" +TRUE_SPECIAL_STRING = "" +FALSE_SPECIAL_STRING = "" + + class WrapperPipelineIndividual(SklearnIndividual): def __init__( self, @@ -18,14 +23,7 @@ def __init__( hyperparameter_parser: callable = None, wrapped_param_name: str = None, rng=None) -> None: - - - super().__init__() - - - - self.method = method self.space = space @@ -33,7 +31,6 @@ def __init__( self.hyperparameters_parser = hyperparameter_parser self.wrapped_param_name = wrapped_param_name - rng = np.random.default_rng(rng) self.node = self.estimator_search_space.generate(rng) @@ -44,8 +41,7 @@ def __init__( self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = dict(self.space.sample_configuration()) - - + self.check_hyperparameters_for_None() def mutate(self, rng=None): rng = np.random.default_rng(rng) @@ -60,14 +56,49 @@ def _mutate_hyperparameters(self, rng=None): rng = np.random.default_rng(rng) self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = dict(self.space.sample_configuration()) + self.check_hyperparameters_for_None() return True def _mutate_node(self, rng=None): return self.node.mutate(rng) def _crossover(self, other, rng=None): - return self.node.crossover(other.node, rng) + if rng.choice([True, False]): + return self._crossover_hyperparameters(other, rng) + else: + self.estimator_search_space.crossover(other.estimator_search_space, rng) + + def _crossover_hyperparameters(self, other, rng=None): + if isinstance(self.space, dict): + return False + + rng = np.random.default_rng(rng) + if self.method != other.method: + return False + + #loop through hyperparameters, randomly swap items in self.hyperparameters with items in other.hyperparameters + for hyperparameter in self.space: + if rng.choice([True, False]): + if hyperparameter in other.hyperparameters: + self.hyperparameters[hyperparameter] = other.hyperparameters[hyperparameter] + + self.check_hyperparameters_for_None() + + return True + + def check_hyperparameters_for_None(self): + for key, value in self.hyperparameters.items(): + #if string + if isinstance(value, str): + if value == NONE_SPECIAL_STRING: + self.hyperparameters[key] = None + elif value == TRUE_SPECIAL_STRING: + self.hyperparameters[key] = True + elif value == FALSE_SPECIAL_STRING: + self.hyperparameters[key] = False + + def export_pipeline(self): if self.hyperparameters_parser is not None: From 1b63414fdae8898d837aa582806219e61ea2edaa Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 17:36:49 -0700 Subject: [PATCH 03/12] made naming clearer --- .../search_spaces/pipelines/dynamic_linear.py | 8 +- .../pipelines/dynamicunion copy.py | 165 ------------------ tpot2/search_spaces/pipelines/dynamicunion.py | 12 +- tpot2/search_spaces/pipelines/sequential.py | 8 +- tpot2/search_spaces/pipelines/union.py | 27 +-- 5 files changed, 17 insertions(+), 203 deletions(-) delete mode 100644 tpot2/search_spaces/pipelines/dynamicunion copy.py diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py index 79ccedef..2ff2bf0b 100644 --- a/tpot2/search_spaces/pipelines/dynamic_linear.py +++ b/tpot2/search_spaces/pipelines/dynamic_linear.py @@ -69,7 +69,7 @@ def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline rng = np.random.default_rng(rng) - cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] + cx_funcs = [self._crossover_swap_multiple_nodes, self._crossover_node] rng.shuffle(cx_funcs) for cx_func in cx_funcs: @@ -78,7 +78,7 @@ def _crossover(self, other, rng=None): return False - def _crossover_swap_random_steps(self, other, rng): + def _crossover_swap_multiple_nodes(self, other, rng): rng = np.random.default_rng(rng) max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) @@ -99,7 +99,7 @@ def _crossover_swap_random_steps(self, other, rng): return True - def _crossover_swap_step(self, other, rng): + def _crossover_swap_node(self, other, rng): if len(self.pipeline) != len(other.pipeline): return False @@ -112,7 +112,7 @@ def _crossover_swap_step(self, other, rng): self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True - def _crossover_inner_step(self, other, rng): + def _crossover_node(self, other, rng): rng = np.random.default_rng(rng) pipeline1_indexes= list(range(len(self.pipeline))) diff --git a/tpot2/search_spaces/pipelines/dynamicunion copy.py b/tpot2/search_spaces/pipelines/dynamicunion copy.py deleted file mode 100644 index 7951c25c..00000000 --- a/tpot2/search_spaces/pipelines/dynamicunion copy.py +++ /dev/null @@ -1,165 +0,0 @@ -import tpot2 -import numpy as np -import pandas as pd -import sklearn -from tpot2 import config -from typing import Generator, List, Tuple, Union -import random -from ..base import SklearnIndividual, SklearnIndividualGenerator -from ..tuple_index import TupleIndex - -class DynamicUnionPipelineIndividual(SklearnIndividual): - """ - Takes in one search space. - Will produce a FeatureUnion of up to max_estimators number of steps. - The output of the FeatureUnion will the all of the steps concatenated together. - - """ - - def __init__(self, search_space : SklearnIndividualGenerator, max_estimators=None, rng=None) -> None: - super().__init__() - self.search_space = search_space - - if max_estimators is None: - self.max_estimators = np.inf - else: - self.max_estimators = max_estimators - - self.pipeline = [] - - if self.max_estimators == np.inf: - init_max = 3 - else: - init_max = self.max_estimators - - rng = np.random.default_rng(rng) - - for _ in range(rng.integers(1, init_max)): - self.pipeline.append(self.search_space.generate(rng)) - - def mutate(self, rng=None): - rng = np.random.default_rng(rng) - mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_inner_step] - rng.shuffle(mutation_funcs) - for mutation_func in mutation_funcs: - if mutation_func(rng): - return True - - def _mutate_add_step(self, rng): - rng = np.random.default_rng(rng) - if len(self.pipeline) < self.max_estimators: - self.pipeline.append(self.search_space.generate(rng)) - return True - return False - - def _mutate_remove_step(self, rng): - rng = np.random.default_rng(rng) - if len(self.pipeline) > 1: - self.pipeline.pop(rng.integers(0, len(self.pipeline))) - return True - return False - - def _mutate_replace_step(self, rng): - rng = np.random.default_rng(rng) - idx = rng.integers(0, len(self.pipeline)) - self.pipeline[idx] = self.search_space.generate(rng) - return True - - #TODO mutate one step or multiple? - def _mutate_inner_step(self, rng): - rng = np.random.default_rng(rng) - indexes = rng.random(len(self.pipeline)) < 0.5 - indexes = np.where(indexes)[0] - mutated = False - if len(indexes) > 0: - for idx in indexes: - if self.pipeline[idx].mutate(rng): - mutated = True - else: - mutated = self.pipeline[rng.integers(0, len(self.pipeline))].mutate(rng) - - return mutated - - - def _crossover(self, other, rng=None): - rng = np.random.default_rng(rng) - - cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] - rng.shuffle(cx_funcs) - for cx_func in cx_funcs: - if cx_func(other, rng): - return True - - return False - - def _crossover_swap_step(self, other, rng): - rng = np.random.default_rng(rng) - idx = rng.integers(1,len(self.pipeline)) - idx2 = rng.integers(1,len(other.pipeline)) - - self.pipeline[idx], other.pipeline[idx2] = other.pipeline[idx2], self.pipeline[idx] - # self.pipeline[idx] = other.pipeline[idx2] - return True - - def _crossover_swap_random_steps(self, other, rng): - rng = np.random.default_rng(rng) - - max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) - max_steps = max(max_steps, 1) - - if max_steps == 1: - n_steps_to_swap = 1 - else: - n_steps_to_swap = rng.integers(1, max_steps) - - other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False) - self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False) - - # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace] - - for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take): - self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx] - - return True - - - - def _crossover_inner_step(self, other, rng): - rng = np.random.default_rng(rng) - - #randomly select pairs of steps to crossover - indexes = list(range(1, len(self.pipeline))) - other_indexes = list(range(1, len(other.pipeline))) - #shuffle - rng.shuffle(indexes) - rng.shuffle(other_indexes) - - crossover_success = False - for idx, other_idx in zip(indexes, other_indexes): - if self.pipeline[idx].crossover(other.pipeline[other_idx], rng): - crossover_success = True - - return crossover_success - - def export_pipeline(self): - return sklearn.pipeline.make_union(*[step.export_pipeline() for step in self.pipeline]) - - def unique_id(self): - l = [step.unique_id() for step in self.pipeline] - # if all items are strings, then sort them - if all([isinstance(x, str) for x in l]): - l.sort() - l = ["FeatureUnion"] + l - return TupleIndex(tuple(l)) - - -class DynamicUnionPipeline(SklearnIndividualGenerator): - def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None: - """ - Takes in a list of search spaces. will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index. - """ - - self.search_spaces = search_spaces - - def generate(self, rng=None): - return DynamicUnionPipelineIndividual(self.search_spaces) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index 401c16ef..01651c29 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -42,7 +42,7 @@ def __init__(self, search_space : SklearnIndividualGenerator, max_estimators=Non def mutate(self, rng=None): rng = np.random.default_rng(rng) - mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_inner_step] + mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_note] rng.shuffle(mutation_funcs) for mutation_func in mutation_funcs: if mutation_func(rng): @@ -72,7 +72,7 @@ def _mutate_replace_step(self, rng): return changed #TODO mutate one step or multiple? - def _mutate_inner_step(self, rng): + def _mutate_note(self, rng): rng = np.random.default_rng(rng) changed = False values = list(self.union_dict.values()) @@ -88,7 +88,7 @@ def _mutate_inner_step(self, rng): def _crossover(self, other, rng=None): rng = np.random.default_rng(rng) - cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] + cx_funcs = [self._crossover_swap_multiple_nodes, self._crossover_node] rng.shuffle(cx_funcs) for cx_func in cx_funcs: if cx_func(other, rng): @@ -96,7 +96,7 @@ def _crossover(self, other, rng=None): return False - def _crossover_swap_step(self, other, rng): + def _crossover_swap_node(self, other, rng): rng = np.random.default_rng(rng) changed = False @@ -117,7 +117,7 @@ def _crossover_swap_step(self, other, rng): - def _crossover_swap_random_steps(self, other, rng): + def _crossover_swap_multiple_nodes(self, other, rng): rng = np.random.default_rng(rng) self_values = list(self.union_dict.values()) other_values = list(other.union_dict.values()) @@ -136,7 +136,7 @@ def _crossover_swap_random_steps(self, other, rng): return True - def _crossover_inner_step(self, other, rng): + def _crossover_node(self, other, rng): rng = np.random.default_rng(rng) changed = False diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py index 7a7e6a99..2fa15f9a 100644 --- a/tpot2/search_spaces/pipelines/sequential.py +++ b/tpot2/search_spaces/pipelines/sequential.py @@ -44,7 +44,7 @@ def _crossover(self, other, rng=None): return False rng = np.random.default_rng(rng) - cx_funcs = [self._crossover_swap_random_steps, self._crossover_swap_segment, self._crossover_inner_step] + cx_funcs = [self._crossover_swap_multiple_nodes, self._crossover_swap_segment, self._crossover_node] rng.shuffle(cx_funcs) for cx_func in cx_funcs: @@ -53,7 +53,7 @@ def _crossover(self, other, rng=None): return False - def _crossover_swap_step(self, other, rng): + def _crossover_swap_node(self, other, rng): if len(self.pipeline) != len(other.pipeline): return False @@ -64,7 +64,7 @@ def _crossover_swap_step(self, other, rng): self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True - def _crossover_swap_random_steps(self, other, rng): + def _crossover_swap_multiple_nodes(self, other, rng): if len(self.pipeline) != len(other.pipeline): return False @@ -108,7 +108,7 @@ def _crossover_swap_segment(self, other, rng): return True - def _crossover_inner_step(self, other, rng): + def _crossover_node(self, other, rng): rng = np.random.default_rng(rng) # crossover_success = False diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py index 32f988e6..a9f8215a 100644 --- a/tpot2/search_spaces/pipelines/union.py +++ b/tpot2/search_spaces/pipelines/union.py @@ -34,7 +34,7 @@ def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline rng = np.random.default_rng(rng) - cx_funcs = [self._crossover_inner_step] + cx_funcs = [self._crossover_node, self._crossover_swap_node] rng.shuffle(cx_funcs) for cx_func in cx_funcs: if cx_func(other, rng): @@ -42,35 +42,14 @@ def _crossover(self, other, rng=None): return False - def _crossover_swap_step(self, other, rng): + def _crossover_swap_node(self, other, rng): rng = np.random.default_rng(rng) idx = rng.integers(1,len(self.pipeline)) self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True - - def _crossover_swap_random_steps(self, other, rng): - rng = np.random.default_rng(rng) - - max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) - max_steps = max(max_steps, 1) - - if max_steps == 1: - n_steps_to_swap = 1 - else: - n_steps_to_swap = rng.integers(1, max_steps) - - other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False) - self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False) - - # self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace] - - for self_idx, other_idx in zip(self_indexes_to_replace, other_indexes_to_take): - self.pipeline[self_idx], other.pipeline[other_idx] = other.pipeline[other_idx], self.pipeline[self_idx] - - return True - def _crossover_inner_step(self, other, rng): + def _crossover_node(self, other, rng): rng = np.random.default_rng(rng) crossover_success = False From e3886513b627ce882ba0ad1fcf6d78f062752c77 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 18:21:48 -0700 Subject: [PATCH 04/12] wrapper for crossover functions so that subclasses can use crossover instead of _crossover --- tpot2/search_spaces/base.py | 22 ++++++--- tpot2/search_spaces/nodes/estimator_node.py | 2 +- tpot2/search_spaces/nodes/fss_node.py | 2 +- .../nodes/genetic_feature_selection.py | 2 +- tpot2/search_spaces/pipelines/choice.py | 2 +- .../search_spaces/pipelines/dynamic_linear.py | 2 +- tpot2/search_spaces/pipelines/dynamicunion.py | 25 ++-------- tpot2/search_spaces/pipelines/graph.py | 46 +++++++++---------- tpot2/search_spaces/pipelines/sequential.py | 2 +- tpot2/search_spaces/pipelines/union.py | 2 +- tpot2/search_spaces/pipelines/wrapper.py | 2 +- 11 files changed, 48 insertions(+), 61 deletions(-) diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py index 2977d491..3133057e 100644 --- a/tpot2/search_spaces/base.py +++ b/tpot2/search_spaces/base.py @@ -15,23 +15,31 @@ + class SklearnIndividual(tpot2.BaseIndividual): + def __init_subclass__(cls): + cls.crossover = cls.validate_same_type(cls.crossover) + + def __init__(self,) -> None: super().__init__() def mutate(self, rng=None): return - @final def crossover(self, other, rng=None, **kwargs): - if not isinstance(other, type(self)): - return False - return self._crossover(other, rng=rng, **kwargs) + return - @abstractmethod - def _crossover(self, other, rng=None): - return + @final + def validate_same_type(func): + + def wrapper(self, other, rng=None, **kwargs): + if not isinstance(other, type(self)): + return False + return func(self, other, rng=None, **kwargs) + + return wrapper def export_pipeline(self) -> BaseEstimator: return diff --git a/tpot2/search_spaces/nodes/estimator_node.py b/tpot2/search_spaces/nodes/estimator_node.py index 4724405e..50d698f3 100644 --- a/tpot2/search_spaces/nodes/estimator_node.py +++ b/tpot2/search_spaces/nodes/estimator_node.py @@ -60,7 +60,7 @@ def mutate(self, rng=None): self.check_hyperparameters_for_None() return True - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): if isinstance(self.space, dict): return False diff --git a/tpot2/search_spaces/nodes/fss_node.py b/tpot2/search_spaces/nodes/fss_node.py index 46aef024..4dda0d92 100644 --- a/tpot2/search_spaces/nodes/fss_node.py +++ b/tpot2/search_spaces/nodes/fss_node.py @@ -51,7 +51,7 @@ def mutate(self, rng=None): self.sel_subset = self.subset_dict[self.selected_subset_name] - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): self.selected_subset_name = other.selected_subset_name self.sel_subset = other.sel_subset diff --git a/tpot2/search_spaces/nodes/genetic_feature_selection.py b/tpot2/search_spaces/nodes/genetic_feature_selection.py index 9e36e666..f9c4892a 100644 --- a/tpot2/search_spaces/nodes/genetic_feature_selection.py +++ b/tpot2/search_spaces/nodes/genetic_feature_selection.py @@ -69,7 +69,7 @@ def mutate(self, rng=None): return rng.choice(self.mutation_list)(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): rng = np.random.default_rng(rng) if rng.uniform() < self.crossover_rate_rate: diff --git a/tpot2/search_spaces/pipelines/choice.py b/tpot2/search_spaces/pipelines/choice.py index ebe9c51c..25051aa0 100644 --- a/tpot2/search_spaces/pipelines/choice.py +++ b/tpot2/search_spaces/pipelines/choice.py @@ -29,7 +29,7 @@ def _mutate_select_new_node(self, rng=None): def _mutate_node(self, rng=None): return self.node.mutate(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): return self.node.crossover(other.node, rng) def export_pipeline(self): diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py index 2ff2bf0b..528ec7c4 100644 --- a/tpot2/search_spaces/pipelines/dynamic_linear.py +++ b/tpot2/search_spaces/pipelines/dynamic_linear.py @@ -65,7 +65,7 @@ def _mutate_step(self, rng=None): return step.mutate(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline rng = np.random.default_rng(rng) diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index 01651c29..8d8772eb 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -85,7 +85,7 @@ def _mutate_note(self, rng): return changed - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): rng = np.random.default_rng(rng) cx_funcs = [self._crossover_swap_multiple_nodes, self._crossover_node] @@ -95,28 +95,8 @@ def _crossover(self, other, rng=None): return True return False - - def _crossover_swap_node(self, other, rng): - rng = np.random.default_rng(rng) - changed = False - - self_step = rng.choice(list(self.union_dict.values())) - other_step = rng.choice(list(other.union_dict.values())) - - if other_step.unique_id() in self.union_dict: - self.union_dict[other_step.unique_id()] = other_step - self.union_dict.pop(self_step.unique_id()) - changed = True - - if self_step.unique_id() in other.union_dict: - other.union_dict[self_step.unique_id()] = self_step - other.union_dict.pop(other_step.unique_id()) - return changed - - - - + def _crossover_swap_multiple_nodes(self, other, rng): rng = np.random.default_rng(rng) self_values = list(self.union_dict.values()) @@ -128,6 +108,7 @@ def _crossover_swap_multiple_nodes(self, other, rng): self_idx = rng.integers(0,len(self_values)) other_idx = rng.integers(0,len(other_values)) + #Note that this is not one-point-crossover since the sequence doesn't matter. this is just a quick way to swap multiple random items self_values[:self_idx], other_values[:other_idx] = other_values[:other_idx], self_values[:self_idx] self.union_dict = {step.unique_id(): step for step in self_values} diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index e3e49a1b..fc769b1c 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -111,35 +111,33 @@ def __init__( def mutate(self, rng=None): rng = np.random.default_rng(rng) + rng.shuffle(self.mutate_methods_list) + for mutate_method in self.mutate_methods_list: + if mutate_method(rng=rng): + + if self.merge_duplicated_nodes_toggle: + self._merge_duplicated_nodes() - for i in range(0,random.randint(1,15)): - rng.shuffle(self.mutate_methods_list) - for mutate_method in self.mutate_methods_list: - if mutate_method(rng=rng): - - if self.merge_duplicated_nodes_toggle: - self._merge_duplicated_nodes() - - if self.__debug: - print(mutate_method) + if self.__debug: + print(mutate_method) - if self.root not in self.graph.nodes: - print('lost root something went wrong with ', mutate_method) + if self.root not in self.graph.nodes: + print('lost root something went wrong with ', mutate_method) - if len(self.graph.predecessors(self.root)) > 0: - print('root has parents ', mutate_method) + if len(self.graph.predecessors(self.root)) > 0: + print('root has parents ', mutate_method) - if any([n in nx.ancestors(self.graph,n) for n in self.graph.nodes]): - print('a node is connecting to itself...') + if any([n in nx.ancestors(self.graph,n) for n in self.graph.nodes]): + print('a node is connecting to itself...') - if self.__debug: - try: - nx.find_cycle(self.graph) - print('something went wrong with ', mutate_method) - except: - pass + if self.__debug: + try: + nx.find_cycle(self.graph) + print('something went wrong with ', mutate_method) + except: + pass - self.graphkey = None + self.graphkey = None return False @@ -323,7 +321,7 @@ def _mutate_insert_bypass_node(self, rng=None): return False - def _crossover(self, ind2, rng=None): + def crossover(self, ind2, rng=None): ''' self is the first individual, ind2 is the second individual If crossover_same_depth, it will select graphindividuals at the same recursive depth. diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py index 2fa15f9a..75bad8d2 100644 --- a/tpot2/search_spaces/pipelines/sequential.py +++ b/tpot2/search_spaces/pipelines/sequential.py @@ -38,7 +38,7 @@ def mutate(self, rng=None): return step.mutate(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline if len(self.pipeline) != len(other.pipeline): return False diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py index a9f8215a..811ef38b 100644 --- a/tpot2/search_spaces/pipelines/union.py +++ b/tpot2/search_spaces/pipelines/union.py @@ -30,7 +30,7 @@ def mutate(self, rng=None): return step.mutate(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline rng = np.random.default_rng(rng) diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 7b49e182..df504a89 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -62,7 +62,7 @@ def _mutate_hyperparameters(self, rng=None): def _mutate_node(self, rng=None): return self.node.mutate(rng) - def _crossover(self, other, rng=None): + def crossover(self, other, rng=None): if rng.choice([True, False]): return self._crossover_hyperparameters(other, rng) else: From a250ed2efc2ef671f63401baf61ca652bce273d4 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 18:25:16 -0700 Subject: [PATCH 05/12] tpot estimator documentation edit --- tpot2/tpot_estimator/estimator.py | 35 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index f4c0e954..9bc33e8a 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -112,7 +112,7 @@ def __init__(self, Parameters ---------- - default_search_space : (String, tpot2.search_spaces.SklearnIndividualGenerator) + search_space : (String, tpot2.search_spaces.SklearnIndividualGenerator) - String : The default search space to use for the optimization. This can be either "linear" or "graph". If "linear", will use the default linear pipeline search space. If "graph", will use the default graph pipeline search space. - SklearnIndividualGenerator : The search space to use for the optimization. This should be an instance of a SklearnIndividualGenerator. The search space to use for the optimization. This should be an instance of a SklearnIndividualGenerator. @@ -145,6 +145,7 @@ def __init__(self, bigger_is_better : bool, default=True If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction. + cross_val_predict_cv : int, default=0 Number of folds to use for the cross_val_predict function for inner classifiers and regressors. Estimators will still be fit on the full dataset, but the following node will get the outputs from cross_val_predict. @@ -152,20 +153,6 @@ def __init__(self, - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. However, the output to the next node will come from cross_val_predict with the specified number of folds. - categorical_features: list or None - Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. - - None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. - - List of categorical features. If X is a dataframe, this should be a list of column names. If X is a numpy array, this should be a list of column indices - - subsets : str or list, default=None - Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. - - str : If a string, it is assumed to be a path to a csv file with the subsets. - The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - - None : If None, each column will be treated as a subset. One column will be selected per subset. - If subsets is None, each column will be treated as a subset. One column will be selected per subset. - - memory: Memory object or string, default=None If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters @@ -180,7 +167,20 @@ def __init__(self, TPOT uses the instance of joblib.Memory for memory caching, and TPOT does NOT clean the caching directory up upon shutdown. - None: - TPOT does not use memory caching. + TPOT does not use memory caching. + + categorical_features: list or None + Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. + - None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. + - List of categorical features. If X is a dataframe, this should be a list of column names. If X is a numpy array, this should be a list of column indices + + subsets : str or list, default=None + Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. + - str : If a string, it is assumed to be a path to a csv file with the subsets. + The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. + - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. + - None : If None, each column will be treated as a subset. One column will be selected per subset. + If subsets is None, each column will be treated as a subset. One column will be selected per subset. preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL @@ -329,6 +329,9 @@ def __init__(self, >=5. full warnings trace 6. evaluations progress bar. (Temporary: This used to be 2. Currently, using evaluation progress bar may prevent some instances were we terminate a generation early due to it reaching max_time_seconds in the middle of a generation OR a pipeline failed to be terminated normally and we need to manually terminate it.) + scatter : bool, default=True + If True, will scatter the data to the dask workers. If False, will not scatter the data. This can be useful for debugging. + random_state : int, None, default=None A seed for reproducability of experiments. This value will be passed to numpy.random.default_rng() to create an instnce of the genrator to pass to other classes From 6905e6f91b6b86d52a0334a32c06d9462e23ada6 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 8 Jul 2024 19:12:00 -0700 Subject: [PATCH 06/12] update documentation --- tpot2/tpot_estimator/estimator.py | 48 ++++++++++--------- .../tpot_estimator/templates/tpottemplates.py | 8 ++-- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 9bc33e8a..bb1f3b3f 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -184,8 +184,8 @@ def __init__(self, preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL - A pipeline that will be used to preprocess the data before CV. - - bool : If True, will use a default preprocessing pipeline. + A pipeline that will be used to preprocess the data before CV. Note that the parameters for these steps are not optimized. Add them to the search space to be optimized. + - bool : If True, will use a default preprocessing pipeline which includes imputation followed by one hot encoding. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. population_size : int, default=50 @@ -562,7 +562,7 @@ def fit(self, X, y): if self.categorical_features is not None: #if categorical features are specified, use those pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'))) pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'))) - pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent'))) + pipeline_steps.append(("ColumnOneHotEncoder", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent'))) else: if isinstance(X, pd.DataFrame): @@ -570,7 +570,7 @@ def fit(self, X, y): if len(categorical_columns) > 0: pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'))) pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'))) - pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent'))) + pipeline_steps.append(("ColumnOneHotEncoder", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent'))) else: pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))) else: @@ -661,30 +661,34 @@ def objective_function(pipeline_individual, self._search_space = get_default_search_space(self.search_space, classification=True, inner_predictors=True, **get_search_space_params) - if check_empty_values(X): - from sklearn.experimental import enable_iterative_imputer + # TODO : Add check for empty values in X and if so, add imputation to the search space + # make this depend on self.preprocessing + # if check_empty_values(X): + # from sklearn.experimental import enable_iterative_imputer - from ConfigSpace import ConfigurationSpace - from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal - iterative_imputer_cs = ConfigurationSpace( - space = { - 'n_nearest_features' : Categorical('n_nearest_features', [100]), - 'initial_strategy' : Categorical('initial_strategy', ['mean','median', 'most_frequent', ]), - 'add_indicator' : Categorical('add_indicator', [True, False]), - } - ) + # from ConfigSpace import ConfigurationSpace + # from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal + # iterative_imputer_cs = ConfigurationSpace( + # space = { + # 'n_nearest_features' : Categorical('n_nearest_features', [100]), + # 'initial_strategy' : Categorical('initial_strategy', ['mean','median', 'most_frequent', ]), + # 'add_indicator' : Categorical('add_indicator', [True, False]), + # } + # ) - imputation_search = tpot2.search_spaces.pipelines.ChoicePipeline([ - tpot2.config.get_search_space("SimpleImputer"), - tpot2.search_spaces.nodes.EstimatorNode(sklearn.impute.IterativeImputer, iterative_imputer_cs) - ]) + # imputation_search = tpot2.search_spaces.pipelines.ChoicePipeline([ + # tpot2.config.get_search_space("SimpleImputer"), + # tpot2.search_spaces.nodes.EstimatorNode(sklearn.impute.IterativeImputer, iterative_imputer_cs) + # ]) - self.search_space_final = tpot2.search_spaces.pipelines.SequentialPipeline(search_spaces=[ imputation_search, self._search_space], memory="sklearn_pipeline_memory") - else: - self.search_space_final = self._search_space + # self.search_space_final = tpot2.search_spaces.pipelines.SequentialPipeline(search_spaces=[ imputation_search, self._search_space], memory="sklearn_pipeline_memory") + # else: + # self.search_space_final = self._search_space + + self.search_space_final = self._search_space def ind_generator(rng): rng = np.random.default_rng(rng) diff --git a/tpot2/tpot_estimator/templates/tpottemplates.py b/tpot2/tpot_estimator/templates/tpottemplates.py index d31dcc88..3871e6e1 100644 --- a/tpot2/tpot_estimator/templates/tpottemplates.py +++ b/tpot2/tpot_estimator/templates/tpottemplates.py @@ -104,8 +104,8 @@ def __init__( self, preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL - A pipeline that will be used to preprocess the data before CV. - - bool : If True, will use a default preprocessing pipeline. + A pipeline that will be used to preprocess the data before CV. Note that the parameters for these steps are not optimized. Add them to the search space to be optimized. + - bool : If True, will use a default preprocessing pipeline which includes imputation followed by one hot encoding. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. max_time_seconds : float, default=float("inf") @@ -358,8 +358,8 @@ def __init__( self, preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL - A pipeline that will be used to preprocess the data before CV. - - bool : If True, will use a default preprocessing pipeline. + A pipeline that will be used to preprocess the data before CV. Note that the parameters for these steps are not optimized. Add them to the search space to be optimized. + - bool : If True, will use a default preprocessing pipeline which includes imputation followed by one hot encoding. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. max_time_seconds : float, default=float("inf") From f7b4b270e72e8ac77fd6caa815a5d1ee3ed3266d Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 9 Jul 2024 13:45:26 -0700 Subject: [PATCH 07/12] fixed imputation tutorial --- .../Example_Search_Spaces/imputation.ipynb | 507 +++++++++++++++++- 1 file changed, 481 insertions(+), 26 deletions(-) diff --git a/Tutorial/Example_Search_Spaces/imputation.ipynb b/Tutorial/Example_Search_Spaces/imputation.ipynb index 07532532..b6de7ef8 100644 --- a/Tutorial/Example_Search_Spaces/imputation.ipynb +++ b/Tutorial/Example_Search_Spaces/imputation.ipynb @@ -2,16 +2,32 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Configuration(values={\n", + " 'add_indicator': False,\n", + " 'strategy': 'most_frequent',\n", + "})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from ConfigSpace import ConfigurationSpace\n", "from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal\n", + "import tpot2\n", + "from sklearn.impute import SimpleImputer\n", "\n", "simple_imputer = ConfigurationSpace(\n", " space = {\n", - " 'strategy' : Categorical('strategy', [['mean','median',], ['most_frequent'] ]),\n", + " 'strategy' : Categorical('strategy', ['mean','median','most_frequent']),\n", " 'add_indicator' : Categorical('add_indicator', [True, False]), \n", " }\n", ")\n", @@ -21,43 +37,482 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
Pipeline(steps=[('simpleimputer',\n",
+       "                 SimpleImputer(add_indicator=True, strategy='median')),\n",
+       "                ('selectpercentile',\n",
+       "                 SelectPercentile(percentile=44.546578384975824)),\n",
+       "                ('featureagglomeration',\n",
+       "                 FeatureAgglomeration(linkage='complete', metric='cosine',\n",
+       "                                      n_clusters=102,\n",
+       "                                      pooling_func=<function median at 0x711a67539830>)),\n",
+       "                ('extratreesclassifier',\n",
+       "                 ExtraTreesClassifier(bootstrap=True, class_weight='balanced',\n",
+       "                                      max_features=0.9974817877523433,\n",
+       "                                      min_samples_leaf=8, min_samples_split=20,\n",
+       "                                      n_jobs=1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], "text/plain": [ - "Configuration(values={\n", - " '2': 2,\n", - " 'a': 2,\n", - "})" + "Pipeline(steps=[('simpleimputer',\n", + " SimpleImputer(add_indicator=True, strategy='median')),\n", + " ('selectpercentile',\n", + " SelectPercentile(percentile=44.546578384975824)),\n", + " ('featureagglomeration',\n", + " FeatureAgglomeration(linkage='complete', metric='cosine',\n", + " n_clusters=102,\n", + " pooling_func=)),\n", + " ('extratreesclassifier',\n", + " ExtraTreesClassifier(bootstrap=True, class_weight='balanced',\n", + " max_features=0.9974817877523433,\n", + " min_samples_leaf=8, min_samples_split=20,\n", + " n_jobs=1))])" ] }, - "execution_count": 11, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from ConfigSpace import ConfigurationSpace, EqualsCondition\n", - "import ConfigSpace\n", - "\n", - "cs = ConfigurationSpace({\n", - "\n", - " \"1\": [1,2,3],\n", - " \"2\": ConfigSpace.Constant(\"2\", 2),\n", - "\n", - " \"a\": [1, 2, 3],\n", - "\n", - "})\n", + "imputation_node =tpot2.search_spaces.nodes.EstimatorNode(\n", + " method = SimpleImputer,\n", + " space = simple_imputer,\n", + ")\n", "\n", - "cond = EqualsCondition(cs['1'], cs['a'], 1)\n", - "cond2 = EqualsCondition(cs['2'], cs['a'], 2)\n", + "impute_classifier_space = tpot2.search_spaces.pipelines.SequentialPipeline([\n", + " imputation_node,\n", + " tpot2.config.get_search_space(\"selectors\"), \n", + " tpot2.config.get_search_space(\"transformers\"),\n", + " tpot2.config.get_search_space(\"classifiers\"),\n", + " \n", + "])\n", "\n", - "cs.add_condition(cond)\n", - "cs.add_condition(cond2)\n", "\n", - "cs.sample_configuration()" + "impute_classifier_space.generate().export_pipeline()" ] } ], @@ -77,7 +532,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.14" } }, "nbformat": 4, From 76f76faf4a59181813d361ef42aa8ae0be391d36 Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 9 Jul 2024 14:37:39 -0700 Subject: [PATCH 08/12] fix --- tpot2/search_spaces/pipelines/wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index df504a89..2c1ad138 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -63,6 +63,7 @@ def _mutate_node(self, rng=None): return self.node.mutate(rng) def crossover(self, other, rng=None): + rng = np.random.default_rng(rng) if rng.choice([True, False]): return self._crossover_hyperparameters(other, rng) else: From 334ca58319d767aecd097232c09ddc6596ca659a Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 9 Jul 2024 14:41:36 -0700 Subject: [PATCH 09/12] wrap fix --- tpot2/search_spaces/pipelines/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 2c1ad138..d61bc5f3 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -67,7 +67,7 @@ def crossover(self, other, rng=None): if rng.choice([True, False]): return self._crossover_hyperparameters(other, rng) else: - self.estimator_search_space.crossover(other.estimator_search_space, rng) + self.node.crossover(other.estimator_search_space, rng) def _crossover_hyperparameters(self, other, rng=None): From 8c0379c4c2cfca3bf414cd51174505ce5bde8561 Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 9 Jul 2024 17:46:10 -0700 Subject: [PATCH 10/12] added check for infinite max_eval_time_seconds --- tpot2/evolvers/steady_state_evolver.py | 19 ++++++++++--------- tpot2/utils/eval_utils.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tpot2/evolvers/steady_state_evolver.py b/tpot2/evolvers/steady_state_evolver.py index eecc2b29..e0b9b593 100644 --- a/tpot2/evolvers/steady_state_evolver.py +++ b/tpot2/evolvers/steady_state_evolver.py @@ -299,17 +299,18 @@ def optimize(self): eval_error = "INVALID" else: #if future is not done - #check if the future has been running for too long, cancel the future - if time.time() - submitted_futures[completed_future]["time"] > self.max_eval_time_seconds*1.25: - completed_future.cancel() + if self.max_eval_time_seconds is not None: + #check if the future has been running for too long, cancel the future + if time.time() - submitted_futures[completed_future]["time"] > self.max_eval_time_seconds*1.25: + completed_future.cancel() - if self.verbose >= 4: - print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n') + if self.verbose >= 4: + print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n') - scores = [np.nan for _ in range(len(self.objective_names))] - eval_error = "TIMEOUT" - else: - continue #otherwise, continue to next future + scores = [np.nan for _ in range(len(self.objective_names))] + eval_error = "TIMEOUT" + else: + continue #otherwise, continue to next future diff --git a/tpot2/utils/eval_utils.py b/tpot2/utils/eval_utils.py index f37cb823..f8d4bd7f 100644 --- a/tpot2/utils/eval_utils.py +++ b/tpot2/utils/eval_utils.py @@ -218,7 +218,7 @@ def parallel_eval_objective_list2(individual_list, #check if the future has been running for too long, cancel the future - if time.time() - submitted_futures[completed_future]["time"] > max_eval_time_seconds*1.25: + if max_eval_time_seconds is not None and time.time() - submitted_futures[completed_future]["time"] > max_eval_time_seconds*1.25: completed_future.cancel() if verbose >= 4: From dc1fb8aa03c705c0110af292b4b8d473fef9559b Mon Sep 17 00:00:00 2001 From: Jay Moran Date: Wed, 10 Jul 2024 15:37:39 -0700 Subject: [PATCH 11/12] Print out package versions and make tests verbose --- tox.ini | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 4e250aef..f215d6d0 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,8 @@ setenv = deps = -r{toxinidir}/requirements_dev.txt commands = - pytest --basetemp={envtmpdir} + pip freeze + pytest --basetemp={envtmpdir} -v [testenv:flake8] basepython = python3.10 @@ -27,4 +28,4 @@ commands = flake8 tpot2 basepython = python3.10 deps = -r{toxinidir}/requirements_dev.txt -commands = mypy tpot2 \ No newline at end of file +commands = mypy tpot2 From 76d3989e8cd6bc93144ef79a49fe6e6837be425e Mon Sep 17 00:00:00 2001 From: Jay Moran Date: Wed, 10 Jul 2024 16:59:13 -0700 Subject: [PATCH 12/12] Pin numpy version --- setup.py | 2 +- tox.ini | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 27b4a474..0a404280 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def calculate_version(): ''', zip_safe=True, - install_requires=['numpy>=1.26.4', + install_requires=['numpy==1.26.4', 'scipy>=1.3.1', 'scikit-learn>=1.3.0', 'update_checker>=0.16', diff --git a/tox.ini b/tox.ini index f215d6d0..7177d0a7 100644 --- a/tox.ini +++ b/tox.ini @@ -16,8 +16,7 @@ setenv = deps = -r{toxinidir}/requirements_dev.txt commands = - pip freeze - pytest --basetemp={envtmpdir} -v + pytest --basetemp={envtmpdir} [testenv:flake8] basepython = python3.10