From 1495af6240f35cf4104a1a3a4a7a8f75f25e4287 Mon Sep 17 00:00:00 2001 From: perib Date: Thu, 12 Oct 2023 14:36:13 -0700 Subject: [PATCH 01/43] hyperparameter mutation changes --- tpot2/config/hyperparametersuggestor.py | 261 +++++++++++++----- .../graph_pipeline_individual/individual.py | 64 ++++- .../graph_pipeline_individual/templates.py | 7 + tpot2/tpot_estimator/estimator.py | 10 + 4 files changed, 259 insertions(+), 83 deletions(-) diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index 01fa188d..f0f1318c 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -4,86 +4,203 @@ +class Trial(): -#Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html -#copy-pasted some code -def suggest_categorical(name, choices): - return random.choice(choices) - -def suggest_float( - name: str, - low: float, - high: float, - *, - step = None, - log = False, - ): - - if log and step is not None: - raise ValueError("The parameter `step` is not supported when `log` is true.") - - if low > high: - raise ValueError( - "The `low` value must be smaller than or equal to the `high` value " - "(low={}, high={}).".format(low, high) - ) - - if log and low <= 0.0: - raise ValueError( - "The `low` value must be larger than 0 for a log distribution " - "(low={}, high={}).".format(low, high) - ) - - if step is not None and step <= 0: - raise ValueError( - "The `step` value must be non-zero positive value, " "but step={}.".format(step) - ) - - #TODO check this produces correct output - if log: - value = np.random.uniform(np.log(low),np.log(high)) - return np.e**value - - else: - if step is not None: - return np.random.choice(np.arange(low,high,step)) + def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): + self._params = dict() + + self.old_params = old_params + self.alpha = alpha + self.hyperparameter_probability = hyperparameter_probability + + if old_params is not None: + self.params_to_update = set(random.sample(list(old_params.keys()), max(int(len(old_params.keys())*self.hyperparameter_probability),1))) else: - return np.random.uniform(low,high) + self.params_to_update = None + + + #Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html + #copy-pasted some code + def suggest_categorical(self, name, choices): + if self.params_to_update == None or name in self.params_to_update: #If this parameter is selected to be changed + choice = self.suggest_categorical_(name, choices) + else: #if this parameter is not selected to be changed + if name not in self.old_params: #if this parameter is not in the old params, then we need to choose a value for it + choice = self.suggest_categorical_(name, choices) + else: #if this parameter is in the old params, then we can just use the old value + choice = self.old_params[name] + if choice not in choices: #if the old value is not in the choices, then we need to choose a value for it + choice = self.suggest_categorical_(name, choices) + + self._params[name] = choice + return choice + + def suggest_float(self, + name: str, + low: float, + high: float, + *, + step = None, + log = False, + ): + if self.params_to_update == None or name in self.params_to_update: #If this parameter is selected to be changed + choice = self.suggest_float_(name, low=low, high=high, step=step, log=log) + if self.old_params is not None and name in self.old_params: + 
choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] + else: #if this parameter is not selected to be changed + + if name not in self.old_params: + choice = self.suggest_float_(name, low=low, high=high, step=step, log=log) + else: + choice = self.old_params[name] + + self._params[name] = choice + return choice + + + + def suggest_discrete_uniform(self, name, low, high, q): + if self.params_to_update == None or name in self.params_to_update: + choice = self.suggest_discrete_uniform_(name, low=low, high=high, q=q) + if self.old_params is not None and name in self.old_params: + choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] + else: + if name not in self.old_params: + choice = self.suggest_discrete_uniform_(name, low=low, high=high, q=q) + else: + choice = self.old_params[name] + self._params[name] = choice + return choice -def suggest_discrete_uniform(name, low, high, q): - return suggest_float(name, low, high, step=q) -def suggest_int(name, low, high, step=1, log=False): - if low == high: #TODO check that this matches optuna's behaviour - return low - - if log and step >1: - raise ValueError("The parameter `step`>1 is not supported when `log` is true.") + def suggest_int(self, name, low, high, step=1, log=False): + if self.params_to_update == None or name in self.params_to_update: + choice = self.suggest_int_(name, low=low, high=high, step=step, log=log) + if self.old_params is not None and name in self.old_params: + choice = int(self.alpha*choice + (1-self.alpha)*self.old_params[name]) + else: + if name not in self.old_params: + choice = self.suggest_int_(name, low=low, high=high, step=step, log=log) + else: + choice = self.old_params[name] - if low > high: - raise ValueError( - "The `low` value must be smaller than or equal to the `high` value " - "(low={}, high={}).".format(low, high) - ) + self._params[name] = choice + return choice - if log and low <= 0.0: - raise ValueError( - "The `low` value must be larger than 0 for a log distribution " - "(low={}, high={}).".format(low, high) - ) - if step is not None and step <= 0: - raise ValueError( - "The `step` value must be non-zero positive value, " "but step={}.".format(step) - ) + def suggest_uniform(self, name, low, high): + if self.params_to_update == None or name in self.params_to_update: + choice = self.suggest_uniform_(name, low=low, high=high) + if self.old_params is not None and name in self.old_params: + choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] + else: + if name not in self.old_params: + choice = self.suggest_uniform_(name, low=low, high=high) + else: + choice = self.old_params[name] + + self._params[name] = choice + return choice + + + +#################################### + #Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html + #copy-pasted some code + def suggest_categorical_(self, name, choices): + + choice = random.choice(choices) + self._params[name] = choice + return choice + + def suggest_float_(self, + name: str, + low: float, + high: float, + *, + step = None, + log = False, + ): + + if log and step is not None: + raise ValueError("The parameter `step` is not supported when `log` is true.") + + if low > high: + raise ValueError( + "The `low` value must be smaller than or equal to the `high` value " + "(low={}, high={}).".format(low, high) + ) + + if log and low <= 0.0: + raise ValueError( + "The `low` value must be larger than 0 for a log distribution " + "(low={}, high={}).".format(low, high) + ) 
+ + if step is not None and step <= 0: + raise ValueError( + "The `step` value must be non-zero positive value, " "but step={}.".format(step) + ) + + #TODO check this produces correct output + if log: + value = np.random.uniform(np.log(low),np.log(high)) + choice = np.e**value + self._params[name] = choice + return choice - if log: - value = np.random.uniform(np.log(low),np.log(high)) - return int(np.e**value) - else: - return np.random.choice(list(range(low,high,step))) + else: + if step is not None: + choice = np.random.choice(np.arange(low,high,step)) + self._params[name] = choice + return choice + else: + choice = np.random.uniform(low,high) + self._params[name] = choice + return choice + + + def suggest_discrete_uniform_(self, name, low, high, q): + choice = self.suggest_float(name, low, high, step=q) + self._params[name] = choice + return choice + + + def suggest_int_(self, name, low, high, step=1, log=False): + if low == high: #TODO check that this matches optuna's behaviour + return low + + if log and step >1: + raise ValueError("The parameter `step`>1 is not supported when `log` is true.") + + if low > high: + raise ValueError( + "The `low` value must be smaller than or equal to the `high` value " + "(low={}, high={}).".format(low, high) + ) + + if log and low <= 0.0: + raise ValueError( + "The `low` value must be larger than 0 for a log distribution " + "(low={}, high={}).".format(low, high) + ) + + if step is not None and step <= 0: + raise ValueError( + "The `step` value must be non-zero positive value, " "but step={}.".format(step) + ) + + if log: + value = np.random.uniform(np.log(low),np.log(high)) + choice = int(np.e**value) + self._params[name] = choice + return choice + else: + choice = np.random.choice(list(range(low,high,step))) + self._params[name] = choice + return choice -def suggest_uniform(name, low, high): - return suggest_float(name, low, high) \ No newline at end of file + def suggest_uniform_(self, name, low, high): + return self.suggest_float(name, low, high) \ No newline at end of file diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 89abf09b..d4bed314 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -29,6 +29,9 @@ def __init__(self, *, self.method_class = method_class #transformer or baseestimator self.hyperparameters = hyperparameters self.label = label + self._params = None + + from functools import partial #@https://stackoverflow.com/questions/20530455/isomorphic-comparison-of-networkx-graph-objects-instead-of-the-default-address @@ -114,6 +117,9 @@ def __init__( crossover_same_depth = False, crossover_same_recursive_depth = True, + hyperparameter_probability = 1, + hyper_node_probability = 0, + hyperparameter_alpha = 1, unique_subset_values = None, initial_subset_values = None, @@ -135,6 +141,10 @@ def __init__( self.unique_subset_values = unique_subset_values self.initial_subset_values = initial_subset_values + self.hyperparameter_probability = hyperparameter_probability + self.hyper_node_probability = hyper_node_probability + self.hyperparameter_alpha = hyperparameter_alpha + if self.unique_subset_values is not None: self.row_subset_selector = tpot2.representations.SubsetSelector(values=unique_subset_values, initial_set=initial_subset_values,k=20) @@ -237,7 +247,8 @@ def initialize_all_nodes(self,): if node.method_class 
is None: node.method_class = random.choice(list(self.select_config_dict(node).keys())) if node.hyperparameters is None: - node.hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) + get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + def fix_noncompliant_leafs(self): leafs = [node for node in self.graph.nodes if len(list(self.graph.successors(node)))==0] @@ -254,6 +265,7 @@ def fix_noncompliant_leafs(self): first_leaf = NodeLabel(config_dict=self.leaf_config_dict) first_leaf.method_class = random.choice(list(first_leaf.config_dict.keys())) #TODO: check when there is no new method first_leaf.hyperparameters = first_leaf.config_dict[first_leaf.method_class](config.hyperparametersuggestor) + get_hyperparameter(self.select_config_dict(first_leaf)[first_leaf.method_class], nodelabel=first_leaf, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) compliant_leafs.append(first_leaf) #connect bad leaves to good leaves (making them internal nodes) @@ -547,15 +559,24 @@ def _mutate_hyperparameters(self): ''' sorted_nodes_list = list(self.graph.nodes) random.shuffle(sorted_nodes_list) + completed_one = False for node in sorted_nodes_list: if isinstance(node,GraphIndividual): continue if isinstance(self.select_config_dict(node)[node.method_class], dict): continue - node.hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) - + + if not completed_one: + + get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + else: + if self.hyper_node_probability < random.random(): + get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) return True return False + + + def _mutate_replace_node(self): ''' @@ -570,9 +591,11 @@ def _mutate_replace_node(self): node.method_class = random.choice(list(self.select_config_dict(node).keys())) if isinstance(self.select_config_dict(node)[node.method_class], dict): hyperparameters = self.select_config_dict(node)[node.method_class] + node.hyperparameters = hyperparameters else: - hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) - node.hyperparameters = hyperparameters + #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) + get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + return True return False @@ -1024,6 +1047,7 @@ def _optimize_optuna_single_method_full_pipeline(self, objective_function, steps def objective(trial): params = self.select_config_dict(node)[node.method_class](trial) node.hyperparameters = params + trial.set_user_attr('params', params) try: return objective_function(self) @@ -1115,14 +1139,14 @@ def create_node(config_dict): if method_class == 'Recursive': node = GraphIndividual(**config_dict[method_class]) else: - if isinstance(config_dict[method_class], dict): - hyperparameters = config_dict[method_class] - else: - hyperparameters = config_dict[method_class](config.hyperparametersuggestor) + hyperparameters, params = 
get_hyperparameter(config_dict[method_class], nodelabel=None) node = NodeLabel( method_class=method_class, - hyperparameters=hyperparameters) + hyperparameters=hyperparameters + ) + node._params = params + return node @@ -1137,4 +1161,22 @@ def random_weighted_sort(l,weights): indeces.pop(next_item) sorted_l.append(l[next_item]) - return sorted_l \ No newline at end of file + return sorted_l + + + +def get_hyperparameter(config_func, nodelabel=None, alpha=1, hyperparameter_probability=1): + if isinstance(config_func, dict): + return config_func, None + + if nodelabel is not None: + trial = config.hyperparametersuggestor.Trial(old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + new_params = config_func(trial) + + nodelabel._params = trial._params + nodelabel.hyperparameters = new_params + else: + trial = config.hyperparametersuggestor.Trial(old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + new_params = config_func(trial) + + return new_params, trial._params, \ No newline at end of file diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index 6991c042..decc4570 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -14,6 +14,10 @@ def estimator_graph_individual_generator( leaf_config_dict=None, max_size = np.inf, linear_pipeline = False, + + hyperparameter_probability = 1, + hyper_node_probability = 0, + hyperparameter_alpha = 1, **kwargs, ) : @@ -37,6 +41,9 @@ def estimator_graph_individual_generator( max_size = max_size, linear_pipeline = linear_pipeline, + hyperparameter_probability = hyperparameter_probability, + hyper_node_probability = hyper_node_probability, + hyperparameter_alpha = hyperparameter_alpha, **kwargs, ) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 351aa12b..8b9af74d 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -37,6 +37,10 @@ def __init__(self, scorers, other_objective_functions_weights = [], objective_function_names = None, bigger_is_better = True, + + hyperparameter_probability = 1, + hyper_node_probability = 0, + hyperparameter_alpha = 1, max_size = np.inf, linear_pipeline = False, root_config_dict= 'Auto', @@ -427,6 +431,9 @@ def __init__(self, scorers, self.other_objective_functions_weights = other_objective_functions_weights self.objective_function_names = objective_function_names self.bigger_is_better = bigger_is_better + self.hyperparameter_probability = hyperparameter_probability + self.hyper_node_probability = hyper_node_probability + self.hyperparameter_alpha = hyperparameter_alpha self.max_size = max_size self.linear_pipeline = linear_pipeline self.root_config_dict= root_config_dict @@ -685,6 +692,9 @@ def objective_function(pipeline_individual, leaf_config_dict=leaf_config_dict, max_size = self.max_size, linear_pipeline=self.linear_pipeline, + hyperparameter_probability=self.hyperparameter_probability, + hyper_node_probability=self.hyper_node_probability, + hyperparameter_alpha=self.hyperparameter_alpha, ) if self.threshold_evaluation_early_stop is not None or self.selection_evaluation_early_stop is not None: From 98358196f762b9abc59fb993c1e423a764e7bd0f Mon Sep 17 00:00:00 2001 From: perib Date: Thu, 12 Oct 2023 16:54:14 -0700 Subject: [PATCH 02/43] replace node creates 
new node hyper fix --- .../graph_pipeline_individual/individual.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index d4bed314..257e2e20 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -594,8 +594,13 @@ def _mutate_replace_node(self): node.hyperparameters = hyperparameters else: #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) - get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - + #get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=None, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + new_node = create_node(self.select_config_dict(node)[node.method_class]) + #TODO cleanup + node.hyperparameters = new_node.hyperparameters + node.method_class = new_node.method_class + node.label = new_node.label + return True return False From 10e1c75de044d64a7403644dfdf65e48834b4091 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 16 Oct 2023 11:22:32 -0700 Subject: [PATCH 03/43] fix --- .../graph_pipeline_individual/individual.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 257e2e20..d3b5f63b 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -569,6 +569,7 @@ def _mutate_hyperparameters(self): if not completed_one: get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + completed_one = True else: if self.hyper_node_probability < random.random(): get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) From 8e53a91c794b09daeb3497de81d6c2238d378f62 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 16 Oct 2023 14:13:48 -0700 Subject: [PATCH 04/43] fix --- .../graph_pipeline_individual/individual.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index d3b5f63b..6fc10503 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -568,13 +568,12 @@ def _mutate_hyperparameters(self): if not completed_one: - get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - completed_one = True + _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) else: if self.hyper_node_probability < random.random(): 
get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - return True - return False + + return completed_one @@ -1145,7 +1144,7 @@ def create_node(config_dict): if method_class == 'Recursive': node = GraphIndividual(**config_dict[method_class]) else: - hyperparameters, params = get_hyperparameter(config_dict[method_class], nodelabel=None) + hyperparameters, params, _ = get_hyperparameter(config_dict[method_class], nodelabel=None) node = NodeLabel( method_class=method_class, @@ -1172,17 +1171,18 @@ def random_weighted_sort(l,weights): def get_hyperparameter(config_func, nodelabel=None, alpha=1, hyperparameter_probability=1): + changed = False if isinstance(config_func, dict): - return config_func, None + return config_func, None, changed if nodelabel is not None: trial = config.hyperparametersuggestor.Trial(old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) - + changed = trial._params != nodelabel._params nodelabel._params = trial._params nodelabel.hyperparameters = new_params else: trial = config.hyperparametersuggestor.Trial(old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) - return new_params, trial._params, \ No newline at end of file + return new_params, trial._params, changed \ No newline at end of file From 82676cc1b2631bd686e05c9b0452417e1db904a3 Mon Sep 17 00:00:00 2001 From: perib Date: Mon, 16 Oct 2023 14:18:11 -0700 Subject: [PATCH 05/43] fix --- .../graph_pipeline_individual/individual.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 6fc10503..f994e270 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -567,10 +567,9 @@ def _mutate_hyperparameters(self): continue if not completed_one: - _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) else: - if self.hyper_node_probability < random.random(): + if self.hyper_node_probability > random.random(): get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) return completed_one From 9aaf019e1ff0acdf30bb32c6243f142ed984f845 Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 17 Oct 2023 12:19:14 -0700 Subject: [PATCH 06/43] fix --- .../graph_pipeline_individual/individual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index f994e270..a611755c 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -594,7 +594,7 @@ def _mutate_replace_node(self): else: #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) #get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=None, 
alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - new_node = create_node(self.select_config_dict(node)[node.method_class]) + new_node = create_node(self.select_config_dict(node)) #TODO cleanup node.hyperparameters = new_node.hyperparameters node.method_class = new_node.method_class From 3ea17297c557a295cca08cd517c2e2c5e890f5c8 Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 18 Oct 2023 11:30:08 -0700 Subject: [PATCH 07/43] fix --- tpot2/config/hyperparametersuggestor.py | 54 +++++++++---------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index f0f1318c..73c9c678 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -2,6 +2,13 @@ from scipy.stats import loguniform, logser #TODO: remove this dependency? import numpy as np #TODO: remove this dependency and use scipy instead? +#function that selects selects items from a list with each having independent probability p of being selected +def select(items, p): + selected = [item for item in items if random.random() < p] + #if selected is empty, select one item at random + if not selected: + return [random.choice(items)] + return selected class Trial(): @@ -13,8 +20,8 @@ def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): self.alpha = alpha self.hyperparameter_probability = hyperparameter_probability - if old_params is not None: - self.params_to_update = set(random.sample(list(old_params.keys()), max(int(len(old_params.keys())*self.hyperparameter_probability),1))) + if old_params is not None and len(old_params) > 0: + self.params_to_update = select(list(old_params.keys()), self.hyperparameter_probability) else: self.params_to_update = None @@ -22,15 +29,12 @@ def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): #Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html #copy-pasted some code def suggest_categorical(self, name, choices): - if self.params_to_update == None or name in self.params_to_update: #If this parameter is selected to be changed + if self.params_to_update == None or name in self.params_to_update or name not in self.old_params: #If this parameter is selected to be changed choice = self.suggest_categorical_(name, choices) else: #if this parameter is not selected to be changed - if name not in self.old_params: #if this parameter is not in the old params, then we need to choose a value for it + choice = self.old_params[name] + if choice not in choices: #if the old value is not in the choices, then we need to choose a value for it choice = self.suggest_categorical_(name, choices) - else: #if this parameter is in the old params, then we can just use the old value - choice = self.old_params[name] - if choice not in choices: #if the old value is not in the choices, then we need to choose a value for it - choice = self.suggest_categorical_(name, choices) self._params[name] = choice return choice @@ -43,15 +47,11 @@ def suggest_float(self, step = None, log = False, ): - if self.params_to_update == None or name in self.params_to_update: #If this parameter is selected to be changed + if self.params_to_update == None or name in self.params_to_update or name not in self.old_params: #If this parameter is selected to be changed choice = self.suggest_float_(name, low=low, high=high, step=step, log=log) if self.old_params is not None and 
name in self.old_params: choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] else: #if this parameter is not selected to be changed - - if name not in self.old_params: - choice = self.suggest_float_(name, low=low, high=high, step=step, log=log) - else: choice = self.old_params[name] self._params[name] = choice @@ -60,15 +60,12 @@ def suggest_float(self, def suggest_discrete_uniform(self, name, low, high, q): - if self.params_to_update == None or name in self.params_to_update: + if self.params_to_update == None or name in self.params_to_update or name not in self.old_params: choice = self.suggest_discrete_uniform_(name, low=low, high=high, q=q) if self.old_params is not None and name in self.old_params: choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] else: - if name not in self.old_params: - choice = self.suggest_discrete_uniform_(name, low=low, high=high, q=q) - else: - choice = self.old_params[name] + choice = self.old_params[name] self._params[name] = choice return choice @@ -76,30 +73,24 @@ def suggest_discrete_uniform(self, name, low, high, q): def suggest_int(self, name, low, high, step=1, log=False): - if self.params_to_update == None or name in self.params_to_update: + if self.params_to_update == None or name in self.params_to_update or name not in self.old_params: choice = self.suggest_int_(name, low=low, high=high, step=step, log=log) if self.old_params is not None and name in self.old_params: choice = int(self.alpha*choice + (1-self.alpha)*self.old_params[name]) else: - if name not in self.old_params: - choice = self.suggest_int_(name, low=low, high=high, step=step, log=log) - else: - choice = self.old_params[name] + choice = self.old_params[name] self._params[name] = choice return choice def suggest_uniform(self, name, low, high): - if self.params_to_update == None or name in self.params_to_update: + if self.params_to_update == None or name in self.params_to_update or name not in self.old_params: choice = self.suggest_uniform_(name, low=low, high=high) if self.old_params is not None and name in self.old_params: choice = self.alpha*choice + (1-self.alpha)*self.old_params[name] else: - if name not in self.old_params: - choice = self.suggest_uniform_(name, low=low, high=high) - else: - choice = self.old_params[name] + choice = self.old_params[name] self._params[name] = choice return choice @@ -112,7 +103,6 @@ def suggest_uniform(self, name, low, high): def suggest_categorical_(self, name, choices): choice = random.choice(choices) - self._params[name] = choice return choice def suggest_float_(self, @@ -148,23 +138,19 @@ def suggest_float_(self, if log: value = np.random.uniform(np.log(low),np.log(high)) choice = np.e**value - self._params[name] = choice return choice else: if step is not None: choice = np.random.choice(np.arange(low,high,step)) - self._params[name] = choice return choice else: choice = np.random.uniform(low,high) - self._params[name] = choice return choice def suggest_discrete_uniform_(self, name, low, high, q): choice = self.suggest_float(name, low, high, step=q) - self._params[name] = choice return choice @@ -195,11 +181,9 @@ def suggest_int_(self, name, low, high, step=1, log=False): if log: value = np.random.uniform(np.log(low),np.log(high)) choice = int(np.e**value) - self._params[name] = choice return choice else: choice = np.random.choice(list(range(low,high,step))) - self._params[name] = choice return choice def suggest_uniform_(self, name, low, high): From 1cf45fa8c39672db8ed00f6c80ebbf1ce8a301cd Mon Sep 17 00:00:00 2001 
From: Jose Date: Mon, 23 Oct 2023 14:18:04 -0700 Subject: [PATCH 08/43] removed trailing white space --- tpot2/tpot_estimator/estimator.py | 380 +++++++++++++++--------------- 1 file changed, 189 insertions(+), 191 deletions(-) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 8b9af74d..2ed5f488 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -5,7 +5,7 @@ import tpot2.config from sklearn.utils.validation import check_is_fitted from tpot2.selectors import survival_select_NSGA2, tournament_selection_dominated -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder import pandas as pd from sklearn.model_selection import train_test_split @@ -29,7 +29,7 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? class TPOTEstimator(BaseEstimator): - def __init__(self, scorers, + def __init__(self, scorers, scorers_weights, classification, cv = 5, @@ -41,11 +41,11 @@ def __init__(self, scorers, hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, root_config_dict= 'Auto', inner_config_dict=["selectors", "transformers"], - leaf_config_dict= None, + leaf_config_dict= None, cross_val_predict_cv = 0, categorical_features = None, subsets = None, @@ -53,25 +53,25 @@ def __init__(self, scorers, preprocessing = False, population_size = 50, initial_population_size = None, - population_scaling = .5, - generations_until_end_population = 1, + population_scaling = .5, + generations_until_end_population = 1, generations = None, - max_time_seconds=3600, - max_eval_time_seconds=60*10, + max_time_seconds=3600, + max_eval_time_seconds=60*10, validation_strategy = "none", validation_fraction = .2, disable_label_encoder = False, - - #early stopping parameters + + #early stopping parameters early_stop = None, scorers_early_stop_tol = 0.001, other_objectives_early_stop_tol =None, - threshold_evaluation_early_stop = None, + threshold_evaluation_early_stop = None, threshold_evaluation_scaling = .5, - selection_evaluation_early_stop = None, - selection_evaluation_scaling = .5, + selection_evaluation_early_stop = None, + selection_evaluation_scaling = .5, min_history_threshold = 20, - + #evolver parameters survival_percentage = 1, crossover_probability=.2, @@ -80,77 +80,77 @@ def __init__(self, scorers, crossover_then_mutate_probability=.05, survival_selector = survival_select_NSGA2, parent_selector = tournament_selection_dominated, - + #budget parameters budget_range = None, budget_scaling = .5, - generations_until_end_budget = 1, + generations_until_end_budget = 1, stepwise_steps = 5, - + optuna_optimize_pareto_front = False, optuna_optimize_pareto_front_trials = 100, optuna_optimize_pareto_front_timeout = 60*10, optuna_storage = "sqlite:///optuna.db", - + #dask parameters n_jobs=1, memory_limit = "4GB", client = None, processes = True, - + #debugging and logging parameters warm_start = False, subset_column = None, - periodic_checkpoint_folder = None, + periodic_checkpoint_folder = None, callback = None, - + verbose = 0, scatter = True, ): - + ''' An sklearn baseestimator that uses genetic programming to optimize a pipeline. - + Parameters ---------- - + scorers : (list, scorer) - A scorer or list of scorers to be used in the cross-validation process. + A scorer or list of scorers to be used in the cross-validation process. 
see https://scikit-learn.org/stable/modules/model_evaluation.html - + scorers_weights : list A list of weights to be applied to the scorers during the optimization process. - + classification : bool If True, the problem is treated as a classification problem. If False, the problem is treated as a regression problem. Used to determine the CV strategy. - + cv : int, cross-validator - (int): Number of folds to use in the cross-validation process. By uses the sklearn.model_selection.KFold cross-validator for regression and StratifiedKFold for classification. In both cases, shuffled is set to True. - (sklearn.model_selection.BaseCrossValidator): A cross-validator to use in the cross-validation process. - max_depth (int): The maximum depth from any node to the root of the pipelines to be generated. - + other_objective_functions : list, default=[] A list of other objective functions to apply to the pipeline. The function takes a single parameter for the graphpipeline estimator and returns either a single score or a list of scores. - + other_objective_functions_weights : list, default=[] A list of weights to be applied to the other objective functions. - + objective_function_names : list, default=None A list of names to be applied to the objective functions. If None, will use the names of the objective functions. - + bigger_is_better : bool, default=True If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction. - + max_size : int, default=np.inf The maximum number of nodes of the pipelines to be generated. - + linear_pipeline : bool, default=False If True, the pipelines generated will be linear. If False, the pipelines generated will be directed acyclic graphs. - + root_config_dict : dict, default='auto' The configuration dictionary to use for the root node of the model. If 'auto', will use "classifiers" if classification=True, else "regressors". @@ -168,7 +168,7 @@ def __init__(self, scorers, - 'genetic encoders' : Includes Genetic Encoder methods as used in AutoQTL. - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - + inner_config_dict : dict, default=["selectors", "transformers"] The configuration dictionary to use for the inner nodes of the model generation. Default ["selectors", "transformers"] @@ -187,10 +187,10 @@ def __init__(self, scorers, - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None and max_depth>1, the root_config_dict will be used for the inner nodes as well. - - leaf_config_dict : dict, default=None + + leaf_config_dict : dict, default=None The configuration dictionary to use for the leaf node of the model. If set, leaf nodes must be from this dictionary. - Otherwise leaf nodes will be generated from the root_config_dict. + Otherwise leaf nodes will be generated from the root_config_dict. Default None - 'selectors' : A selection of sklearn Selector methods. - 'classifiers' : A selection of sklearn Classifier methods. @@ -207,14 +207,14 @@ def __init__(self, scorers, - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. 
- list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None, a leaf will not be required (i.e. the pipeline can be a single root node). Leaf nodes will be generated from the inner_config_dict. - + cross_val_predict_cv : int, default=0 Number of folds to use for the cross_val_predict function for inner classifiers and regressors. Estimators will still be fit on the full dataset, but the following node will get the outputs from cross_val_predict. - + - 0-1 : When set to 0 or 1, the cross_val_predict function will not be used. The next layer will get the outputs from fitting and transforming the full dataset. - - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. + - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. However, the output to the next node will come from cross_val_predict with the specified number of folds. - + categorical_features: list or None Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. - None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. @@ -222,7 +222,7 @@ def __init__(self, scorers, subsets : str or list, default=None Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. - - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - None : If None, each column will be treated as a subset. One column will be selected per subset. @@ -245,178 +245,178 @@ def __init__(self, scorers, - None: TPOT does not use memory caching. - preprocessing : bool or BaseEstimator/Pipeline, + preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL A pipeline that will be used to preprocess the data before CV. - bool : If True, will use a default preprocessing pipeline. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. - + population_size : int, default=50 Size of the population - + initial_population_size : int, default=None Size of the initial population. If None, population_size will be used. - + population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - - generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size - + + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size + generations : int, default=50 Number of generations to run - + max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. - + max_eval_time_seconds : float, default=60*5 Maximum time to evaluate a single individual. If none or inf, there will be no time limit per evaluation. - + validation_strategy : str, default='none' EXPERIMENTAL The validation strategy to use for selecting the final pipeline from the population. 
TPOT2 may overfit the cross validation score. A second validation set can be used to select the final pipeline. - 'auto' : Automatically determine the validation strategy based on the dataset shape. - - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. - - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. + - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. + - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. - 'none' : Do not use a separate validation set for final validation. Select based on the original cross-validation score. This is the default for large datasets. validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. - + disable_label_encoder : bool, default=False If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. If False, no additional label encoders will be used. early_stop : int, default=None Number of generations without improvement before early stopping. All objectives must have converged within the tolerance for this to be triggered. - - scorers_early_stop_tol : + + scorers_early_stop_tol : -list of floats list of tolerances for each scorer. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - - other_objectives_early_stop_tol : + + other_objectives_early_stop_tol : -list of floats list of tolerances for each of the other objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - + threshold_evaluation_early_stop : list [start, end], default=None starting and ending percentile to use as a threshold for the evaluation early stopping. Values between 0 and 100. - + threshold_evaluation_scaling : float [0,inf), default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - - selection_evaluation_scaling : float, default=0.5 + + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - Must be greater than zero. Higher numbers will move the threshold to the end faster. 
- + Must be greater than zero. Higher numbers will move the threshold to the end faster. + min_history_threshold : int, default=0 The minimum number of previous scores needed before using threshold early stopping. - + survival_percentage : float, default=1 - Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. + Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. For example, if the population size is 100 and the survival percentage is .5, 50 individuals will be selected with NSGA2 from the existing population. These will be used for mutation and crossover to generate the next 100 individuals for the next generation. The remainder are discarded from the live population. In the next generation, there will now be the 50 parents + the 100 individuals for a total of 150. Surivival percentage is based of the population size parameter and not the existing population size (current population size when using successive halving). Therefore, in the next generation we will still select 50 individuals from the currently existing 150. - + crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. - + mutate_probability : float, default=.7 Probability of generating a new individual by crossover between one individuals. - + mutate_then_crossover_probability : float, default=.05 Probability of generating a new individual by mutating two individuals followed by crossover. - + crossover_then_mutate_probability : float, default=.05 Probability of generating a new individual by crossover between two individuals followed by a mutation of the resulting individual. - + survival_selector : function, default=survival_select_NSGA2 Function to use to select individuals for survival. Must take a matrix of scores and return selected indexes. Used to selected population_size * survival_percentage individuals at the start of each generation to use for mutation and crossover. - + parent_selector : function, default=parent_select_NSGA2 Function to use to select pairs parents for crossover and individuals for mutation. Must take a matrix of scores and return selected indexes. - + budget_range : list [start, end], default=None A starting and ending budget to use for the budget scaling. - + budget_scaling float : [0,1], default=0.5 A scaling factor to use when determining how fast we move the budget from the start to end budget. - + generations_until_end_budget : int, default=1 The number of generations to run before reaching the max budget. - + stepwise_steps : int, default=1 The number of staircase steps to take when scaling the budget and population size. - - + + n_jobs : int, default=1 Number of processes to run in parallel. - + memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. - + client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. 
If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. - + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + processes : bool, default=True If True, will use multiprocessing to parallelize the optimization process. If False, will use threading. True seems to perform better. However, False is required for interactive debugging. - - + + warm_start : bool, default=False If True, will use the continue the evolutionary algorithm from the last generation of the previous run. - + subset_column : str or int, default=None EXPERIMENTAL The column to use for the subset selection. Must also pass in unique_subset_values to GraphIndividual to function. - + periodic_checkpoint_folder : str, default=None Folder to save the population to periodically. If None, no periodic saving will be done. If provided, training will resume from this checkpoint. - + callback : tpot2.CallBackInterface, default=None Callback object. Not implemented - - verbose : int, default=1 + + verbose : int, default=1 How much information to print during the optimization process. Higher values include the information from lower values. 0. nothing 1. progress bar - + 3. best individual 4. warnings >=5. full warnings trace 6. evaluations progress bar. (Temporary: This used to be 2. Currently, using evaluation progress bar may prevent some instances were we terminate a generation early due to it reaching max_time_seconds in the middle of a generation OR a pipeline failed to be terminated normally and we need to manually terminate it.) - - + + Attributes ---------- fitted_pipeline_ : GraphPipeline A fitted instance of the GraphPipeline that inherits from sklearn BaseEstimator. This is fitted on the full X, y passed to fit. - evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. - Columns: + evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. + Columns: - *objective functions : The first few columns correspond to the passed in scorers and objective functions - Parents : A tuple containing the indexes of the pipelines used to generate the pipeline of that row. If NaN, this pipeline was generated randomly in the initial population. - Variation_Function : Which variation function was used to mutate or crossover the parents. If NaN, this pipeline was generated randomly in the initial population. - Individual : The internal representation of the individual that is used during the evolutionary algorithm. This is not an sklearn BaseEstimator. - - Generation : The generation the pipeline first appeared. - - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. - To save on computational time, the best frontier is updated iteratively each generation. + - Generation : The generation the pipeline first appeared. + - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. + To save on computational time, the best frontier is updated iteratively each generation. The pipelines with the 0th pareto front do represent the exact best frontier. However, the pipelines with pareto front >= 1 are only in reference to the other pipelines in the final population. - All other pipelines are set to NaN. 
- - Instance : The unfitted GraphPipeline BaseEstimator. + All other pipelines are set to NaN. + - Instance : The unfitted GraphPipeline BaseEstimator. - *validation objective functions : Objective function scores evaluated on the validation set. - Validation_Pareto_Front : The full pareto front calculated on the validation set. This is calculated for all pipelines with Pareto_Front equal to 0. Unlike the Pareto_Front which only calculates the frontier and the final population, the Validation Pareto Front is calculated for all pipelines tested on the validation set. - + pareto_front : The same pandas dataframe as evaluated individuals, but containing only the frontier pareto front pipelines. ''' @@ -455,7 +455,7 @@ def __init__(self, scorers, self.early_stop = early_stop self.scorers_early_stop_tol = scorers_early_stop_tol self.other_objectives_early_stop_tol = other_objectives_early_stop_tol - self.max_time_seconds = max_time_seconds + self.max_time_seconds = max_time_seconds self.max_eval_time_seconds = max_eval_time_seconds self.n_jobs= n_jobs self.memory_limit = memory_limit @@ -506,22 +506,22 @@ def __init__(self, scorers, self._scorers = [self.scorers] else: self._scorers = self.scorers - + self._scorers = [sklearn.metrics.get_scorer(scoring) for scoring in self._scorers] self._scorers_early_stop_tol = self.scorers_early_stop_tol - + self._evolver = tpot2.evolvers.BaseEvolver - + self.objective_function_weights = [*scorers_weights, *other_objective_functions_weights] - + if self.objective_function_names is None: obj_names = [f.__name__ for f in other_objective_functions] else: obj_names = self.objective_function_names self.objective_names = [f._score_func.__name__ if hasattr(f,"_score_func") else f.__name__ for f in self._scorers] + obj_names - - + + if not isinstance(self.other_objectives_early_stop_tol, list): self._other_objectives_early_stop_tol = [self.other_objectives_early_stop_tol for _ in range(len(self.other_objective_functions))] else: @@ -533,7 +533,7 @@ def __init__(self, scorers, self._scorers_early_stop_tol = self._scorers_early_stop_tol self.early_stop_tol = [*self._scorers_early_stop_tol, *self._other_objectives_early_stop_tol] - + self._evolver_instance = None self.evaluated_individuals = None @@ -564,8 +564,8 @@ def fit(self, X, y): if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. 
When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") - self.label_encoder_ = LabelEncoder() - y = self.label_encoder_.fit_transform(y) + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) self.evaluated_individuals = None #determine validation strategy @@ -598,7 +598,7 @@ def fit(self, X, y): if self.classification: X, y = remove_underrepresented_classes(X, y, n_folds) - + if self.preprocessing: #X = pd.DataFrame(X) @@ -616,7 +616,7 @@ def fit(self, X, y): tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns else: #numpy array and no categorical columns specified, just do imputation - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) + self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) else: @@ -660,33 +660,33 @@ def fit(self, X, y): else: self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) - - def objective_function(pipeline_individual, - X, + + def objective_function(pipeline_individual, + X, y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, + scorers= self._scorers, + cv=self.cv_gen, other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, - **kwargs): + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, + **kwargs): return objective_function_generator( pipeline_individual, - X, - y, + X, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, ) - self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( + self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( inner_config_dict=inner_config_dict, root_config_dict=root_config_dict, leaf_config_dict=leaf_config_dict, @@ -711,7 +711,7 @@ def objective_function(pipeline_individual, #If warm start and we have an evolver instance, use the existing one if not(self.warm_start and self._evolver_instance is not None): - self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, + self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, objective_functions= [objective_function], objective_function_weights = self.objective_function_weights, objective_names=self.objective_names, @@ -735,7 +735,7 @@ def objective_function(pipeline_individual, early_stop_tol = self.early_stop_tol, early_stop= self.early_stop, - + budget_range = self.budget_range, budget_scaling = self.budget_scaling, generations_until_end_budget = self.generations_until_end_budget, @@ -752,10 +752,10 @@ def objective_function(pipeline_individual, mutate_probability = self.mutate_probability, mutate_then_crossover_probability= 
self.mutate_then_crossover_probability, crossover_then_mutate_probability= self.crossover_then_mutate_probability, - + ) - + self._evolver_instance.optimize() #self._evolver_instance.population.update_pareto_fronts(self.objective_names, self.objective_function_weights) self.make_evaluated_individuals() @@ -765,22 +765,22 @@ def objective_function(pipeline_individual, pareto_front_inds = self.pareto_front['Individual'].values all_graphs, all_scores = tpot2.individual_representations.graph_pipeline_individual.simple_parallel_optuna(pareto_front_inds, objective_function, self.objective_function_weights, _client, storage=self.optuna_storage, steps=self.optuna_optimize_pareto_front_trials, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, max_time_seconds=self.optuna_optimize_pareto_front_timeout, **{"X": X, "y": y}) all_scores = tpot2.utils.eval_utils.process_scores(all_scores, len(self.objective_function_weights)) - + if len(all_graphs) > 0: df = pd.DataFrame(np.column_stack((all_graphs, all_scores,np.repeat("Optuna",len(all_graphs)))), columns=["Individual"] + self.objective_names +["Parents"]) for obj in self.objective_names: df[obj] = df[obj].apply(convert_to_float) - + self.evaluated_individuals = pd.concat([self.evaluated_individuals, df], ignore_index=True) else: print("WARNING NO OPTUNA TRIALS COMPLETED") - + tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) if validation_strategy == 'reshuffled': best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - + #reshuffle rows X, y = sklearn.utils.shuffle(X, y, random_state=1) @@ -791,30 +791,30 @@ def objective_function(pipeline_individual, X_future = X y_future = y - val_objective_function_list = [lambda ind, - X, - y, + val_objective_function_list = [lambda ind, + X, + y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + scorers= self._scorers, + cv=self.cv_gen, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: objective_function_generator( ind, X, - y, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + objective_kwargs = {"X": X_future, "y": y_future} val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, @@ -829,7 +829,7 @@ def objective_function(pipeline_individual, elif validation_strategy == 'split': - if self.scatter: + if self.scatter: X_future = _client.scatter(X) y_future = _client.scatter(y) X_val_future = _client.scatter(X_val) @@ -841,33 +841,33 @@ def objective_function(pipeline_individual, y_val_future = y_val objective_kwargs = {"X": X_future, "y": y_future, "X_val" : X_val_future, "y_val":y_val_future } - + best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - val_objective_function_list = 
[lambda ind, - X, - y, - X_val, - y_val, - scorers= self._scorers, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + val_objective_function_list = [lambda ind, + X, + y, + X_val, + y_val, + scorers= self._scorers, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: val_objective_function_generator( ind, X, y, - X_val, - y_val, - scorers= scorers, + X_val, + y_val, + scorers= scorers, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, val_objective_function_list, n_jobs=self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds,n_expected_columns=len(self.objective_names),client=_client, **objective_kwargs) @@ -879,25 +879,25 @@ def objective_function(pipeline_individual, else: self.objective_names_for_selection = self.objective_names - val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) + val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) weighted_scores = val_scores*self.objective_function_weights - + if self.bigger_is_better: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmax() else: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmin() - + best_individual = self.evaluated_individuals.loc[best_idx]['Individual'] self.selected_best_score = self.evaluated_individuals.loc[best_idx] - + best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, subset_column=self.subset_column) if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) else: - self.fitted_pipeline_ = best_individual_pipeline - + self.fitted_pipeline_ = best_individual_pipeline + self.fitted_pipeline_.fit(X_original,y_original) #TODO use y_original as well? @@ -907,7 +907,7 @@ def objective_function(pipeline_individual, cluster.close() return self - + def _estimator_has(attr): '''Check if we can delegate a method to the underlying estimator. 
First, we check the first fitted final estimator if available, otherwise we @@ -919,7 +919,7 @@ def _estimator_has(attr): - + @available_if(_estimator_has('predict')) @@ -932,19 +932,19 @@ def predict(self, X, **predict_params): preds = self.label_encoder_.inverse_transform(preds) return preds - + @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.predict_proba(X,**predict_params) - + @available_if(_estimator_has('decision_function')) def decision_function(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.decision_function(X,**predict_params) - + @available_if(_estimator_has('transform')) def transform(self, X, **predict_params): check_is_fitted(self) @@ -958,7 +958,7 @@ def classes_(self): return self.label_encoder_.classes_ else: return self.fitted_pipeline_.classes_ - + @property def _estimator_type(self): @@ -977,7 +977,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) return self.evaluated_individuals - + @property def pareto_front(self): #check if _evolver_instance exists @@ -988,5 +988,3 @@ def pareto_front(self): return self.evaluated_individuals else: return self.evaluated_individuals[self.evaluated_individuals["Pareto_Front"]==1] - - From 09b49d1e3990309ef5e85286da89c6a9ec621c3d Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 23 Oct 2023 14:50:15 -0700 Subject: [PATCH 09/43] removed trailing white space --- tpot2/tpot_estimator/estimator_utils.py | 32 ++++++++++--------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index 08d25f1b..ef156e26 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -13,7 +13,7 @@ def convert_parents_tuples_to_integers(row, object_to_int): return np.nan def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): - try: + try: if preprocessing_pipeline is None: return graphindividual.export_pipeline() else: @@ -27,7 +27,7 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, if isinstance(options, dict): return recursive_with_defaults(options, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names) - + if not isinstance(options, list): options = [options] @@ -52,7 +52,7 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, elif option == "transformers": config_dict.update(tpot2.config.make_transformer_config_dictionary(n_features=n_features)) - + elif option == "arithmetic_transformer": config_dict.update(tpot2.config.make_arithmetic_transformer_config_dictionary()) @@ -61,10 +61,10 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, elif option == "skrebate": config_dict.update(tpot2.config.make_skrebate_config_dictionary(n_features=n_features)) - + elif option == "MDR": config_dict.update(tpot2.config.make_MDR_config_dictionary()) - + elif option == "continuousMDR": config_dict.update(tpot2.config.make_ContinuousMDR_config_dictionary()) @@ -76,7 +76,7 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, elif option == "passthrough": config_dict.update(tpot2.config.make_passthrough_config_dictionary()) - + else: 
config_dict.update(recursive_with_defaults(option, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names)) @@ -87,7 +87,7 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, return config_dict def recursive_with_defaults(config_dict, n_samples, n_features, classification, subsets=None, feature_names=None): - + for key in 'leaf_config_dict', 'root_config_dict', 'inner_config_dict', 'Recursive': if key in config_dict: value = config_dict[key] @@ -95,7 +95,7 @@ def recursive_with_defaults(config_dict, n_samples, n_features, classification, config_dict[key] = recursive_with_defaults(value,n_samples, n_features, classification, subsets=None, feature_names=None) else: config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, subsets, feature_names) - + return config_dict @@ -117,14 +117,14 @@ def objective_function_generator(pipeline, x,y, scorers, cv, other_objective_fun cv_obj_scores = cross_val_score_objective(sklearn.base.clone(pipeline),x,y,scorers=scorers, cv=cv , fold=step) else: cv_obj_scores = [] - + if other_objective_functions is not None and len(other_objective_functions) >0: other_scores = [obj(sklearn.base.clone(pipeline)) for obj in other_objective_functions] #flatten other_scores = np.array(other_scores).flatten().tolist() else: other_scores = [] - + return np.concatenate([cv_obj_scores,other_scores]) def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, scorers, other_objective_functions, memory, cross_val_predict_cv, subset_column): @@ -134,12 +134,12 @@ def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, fitted_pipeline.fit(X_train, y_train) if len(scorers) > 0: - scores =[sklearn.metrics.get_scorer(scorer)(fitted_pipeline, X_test, y_test) for scorer in scorers] + scores =[sklearn.metrics.get_scorer(scorer)(fitted_pipeline, X_test, y_test) for scorer in scorers] other_scores = [] if other_objective_functions is not None and len(other_objective_functions) >0: other_scores = [obj(sklearn.base.clone(pipeline)) for obj in other_objective_functions] - + return np.concatenate([scores,other_scores]) @@ -170,7 +170,7 @@ def convert_to_float(x): return float(x) except ValueError: return x - + @@ -180,9 +180,3 @@ def check_if_y_is_encoded(y): ''' y = sorted(set(y)) return all(i == j for i, j in enumerate(y)) - - - - - - From 9ee89c2eb09a811594649ce2fd7f36ee362eca55 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 23 Oct 2023 14:52:54 -0700 Subject: [PATCH 10/43] removed trailing white space --- tpot2/config/selectors.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tpot2/config/selectors.py b/tpot2/config/selectors.py index 12c3e3c1..2a46ce89 100644 --- a/tpot2/config/selectors.py +++ b/tpot2/config/selectors.py @@ -1,4 +1,4 @@ -#TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters? +#TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters? import numpy as np from sklearn.feature_selection import SelectFwe from sklearn.feature_selection import SelectPercentile @@ -29,7 +29,7 @@ def params_sklearn_feature_selection_VarianceThreshold(trial, name=None): return { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, .2, log=True) } - + #TODO add more estimator options? How will that interact with optuna? 
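As a rough illustration of how these params_* functions are consumed, the sketch below pairs a parameter function with its estimator class and samples one configuration. The SimpleTrial stand-in and the instantiate helper are hypothetical simplifications of the Trial suggester introduced earlier in this series, not code from the patch; params_variance_threshold simply mirrors the VarianceThreshold entry above.

import numpy as np
from sklearn.feature_selection import VarianceThreshold

class SimpleTrial:
    # Hypothetical stand-in for the Trial suggester: draws values from a seeded rng.
    def __init__(self, seed=0):
        self.rng = np.random.default_rng(seed)

    def suggest_float(self, name, low, high, step=None, log=False):
        if log:
            return float(np.exp(self.rng.uniform(np.log(low), np.log(high))))
        return float(self.rng.uniform(low, high))

def params_variance_threshold(trial, name=None):
    # Mirrors params_sklearn_feature_selection_VarianceThreshold above.
    return {'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, .2, log=True)}

def instantiate(method, params, trial, name=None):
    # A config-dictionary entry maps an estimator class to either a dict of fixed
    # hyperparameters or a callable that draws them from the trial.
    hyperparameters = params(trial, name=name) if callable(params) else dict(params)
    return method(**hyperparameters)

selector = instantiate(VarianceThreshold, params_variance_threshold, SimpleTrial(seed=0), name="vt_0")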
def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): @@ -37,7 +37,7 @@ def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"RFE_{name}")) else: estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"RFE_{name}")) - + params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), 'estimator' : estimator, @@ -51,7 +51,7 @@ def params_sklearn_feature_selection_SelectFromModel(trial, name=None, classifie estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"SFM_{name}")) else: estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"SFM_{name}")) - + params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), 'estimator' : estimator, @@ -65,12 +65,12 @@ def params_sklearn_feature_selection_RFE_wrapped(trial, name=None, classifier=Tr params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), } - + if classifier: estimator_params = params_ExtraTreesClassifier(trial, name=f"RFE_{name}") else: estimator_params = params_ExtraTreesRegressor(trial, name=f"RFE_{name}") - + params.update(estimator_params) return params @@ -80,12 +80,12 @@ def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, name=None, c params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), } - + if classifier: estimator_params = params_ExtraTreesClassifier(trial, name=f"SFM_{name}") else: estimator_params = params_ExtraTreesRegressor(trial, name=f"SFM_{name}") - + params.update(estimator_params) return params @@ -101,7 +101,7 @@ def make_selector_config_dictionary(classifier=True): params = {RFE_ExtraTreesRegressor : partial(params_sklearn_feature_selection_RFE_wrapped, classifier=classifier), SelectFromModel_ExtraTreesRegressor : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, classifier=classifier), } - + params.update({ SelectFwe: params_sklearn_feature_selection_SelectFwe, SelectPercentile: params_sklearn_feature_selection_SelectPercentile, VarianceThreshold: params_sklearn_feature_selection_VarianceThreshold,}) From 4038413fbd5985901e1b8a9e46007ea992059a36 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 23 Oct 2023 14:53:52 -0700 Subject: [PATCH 11/43] removed trailing white space --- tpot2/config/hyperparametersuggestor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index 73c9c678..08b7a228 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -35,11 +35,11 @@ def suggest_categorical(self, name, choices): choice = self.old_params[name] if choice not in choices: #if the old value is not in the choices, then we need to choose a value for it choice = self.suggest_categorical_(name, choices) - + self._params[name] = choice return choice - def suggest_float(self, + def suggest_float(self, name: str, low: float, high: float, @@ -94,18 +94,18 @@ def suggest_uniform(self, name, low, high): self._params[name] = choice return choice - + #################################### #Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html #copy-pasted some code def suggest_categorical_(self, name, choices): - + choice = random.choice(choices) return choice - def suggest_float_(self, + def suggest_float_(self, name: 
str, low: float, high: float, @@ -113,7 +113,7 @@ def suggest_float_(self, step = None, log = False, ): - + if log and step is not None: raise ValueError("The parameter `step` is not supported when `log` is true.") @@ -157,7 +157,7 @@ def suggest_discrete_uniform_(self, name, low, high, q): def suggest_int_(self, name, low, high, step=1, log=False): if low == high: #TODO check that this matches optuna's behaviour return low - + if log and step >1: raise ValueError("The parameter `step`>1 is not supported when `log` is true.") From c1d139fb6e7b4e18feb93a1a61232511fd404dc6 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 23 Oct 2023 15:06:10 -0700 Subject: [PATCH 12/43] added numpy generator code taht will be passed through Trial class --- tpot2/config/hyperparametersuggestor.py | 32 ++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index 08b7a228..55786369 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -1,19 +1,23 @@ -import random -from scipy.stats import loguniform, logser #TODO: remove this dependency? -import numpy as np #TODO: remove this dependency and use scipy instead? +# import random +# from scipy.stats import loguniform, logser #TODO: remove this dependency? +import numpy as np #function that selects selects items from a list with each having independent probability p of being selected -def select(items, p): - selected = [item for item in items if random.random() < p] +def select(items, p, rng_): + rng = np.random.default_rng(rng_) + + selected = [item for item in items if rng.random() < p] #if selected is empty, select one item at random if not selected: - return [random.choice(items)] + return [rng.choice(items)] return selected class Trial(): - def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): + def __init__(self, rng_, old_params=None, alpha=1, hyperparameter_probability=1): + self.rng = np.random.default_rng(rng_) + self._params = dict() self.old_params = old_params @@ -21,7 +25,7 @@ def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): self.hyperparameter_probability = hyperparameter_probability if old_params is not None and len(old_params) > 0: - self.params_to_update = select(list(old_params.keys()), self.hyperparameter_probability) + self.params_to_update = select(list(old_params.keys()), self.hyperparameter_probability, rng_=self.rng) else: self.params_to_update = None @@ -102,7 +106,7 @@ def suggest_uniform(self, name, low, high): #copy-pasted some code def suggest_categorical_(self, name, choices): - choice = random.choice(choices) + choice = self.rng.choice(choices) return choice def suggest_float_(self, @@ -136,16 +140,16 @@ def suggest_float_(self, #TODO check this produces correct output if log: - value = np.random.uniform(np.log(low),np.log(high)) + value = self.rng.uniform(np.log(low),np.log(high)) choice = np.e**value return choice else: if step is not None: - choice = np.random.choice(np.arange(low,high,step)) + choice = self.rng.choice(np.arange(low,high,step)) return choice else: - choice = np.random.uniform(low,high) + choice = self.rng.uniform(low,high) return choice @@ -179,11 +183,11 @@ def suggest_int_(self, name, low, high, step=1, log=False): ) if log: - value = np.random.uniform(np.log(low),np.log(high)) + value = self.rng.uniform(np.log(low),np.log(high)) choice = int(np.e**value) return choice else: - choice = 
np.random.choice(list(range(low,high,step))) + choice = self.rng.choice(list(range(low,high,step))) return choice def suggest_uniform_(self, name, low, high): From 47ac0d8e4a262d47fb1d346eeae6ad9b4e6eb703 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 23 Oct 2023 15:20:00 -0700 Subject: [PATCH 13/43] removed trailing white space --- tpot2/config/classifiers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index d11dc396..a82f5f82 100644 --- a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -88,7 +88,7 @@ def params_LinearSVC(trial, name=None): loss = 'squared_hinge' else: loss = trial.suggest_categorical(name=f'loss_{name}', choices=['hinge', 'squared_hinge']) - + if loss == 'hinge' and penalty == 'l2': dual = True else: @@ -97,7 +97,7 @@ def params_LinearSVC(trial, name=None): return { 'penalty': penalty, 'loss': loss, - 'dual': dual, + 'dual': dual, 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), } @@ -120,7 +120,7 @@ def params_GradientBoostingClassifier(trial,n_classes=None, name=None): loss = 'log_loss' else: loss = trial.suggest_categorical(name=f'loss_{name}', choices=['log_loss', 'exponential']) - + params = { 'n_estimators': 100, 'loss': loss, @@ -215,7 +215,7 @@ def params_MLPClassifier_large(trial, name=None): 'max_iter' : 10000 } - return params + return params def params_BernoulliNB(trial, name=None): params = { @@ -253,4 +253,3 @@ def make_classifier_config_dictionary(n_samples=10, n_classes=None): #: params_LGBMClassifier, # logistic regression and SVM/SVC are just special cases of this one? remove? MLPClassifier: params_MLPClassifier_tpot, } - From e741cb302ff33f771d01e051db2687328ab41f4f Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 25 Oct 2023 10:56:01 -0700 Subject: [PATCH 14/43] selection fix when zero mutation/cx selected --- tpot2/evolvers/base_evolver.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py index 93655d66..7a94cf10 100644 --- a/tpot2/evolvers/base_evolver.py +++ b/tpot2/evolvers/base_evolver.py @@ -516,19 +516,15 @@ def step(self,): self.generation += 1 def generate_offspring(self, ): #your EA Algorithm goes here - n_mutations = np.random.binomial(self.cur_population_size, self.mutate_probability) - n_crossover = self.cur_population_size - n_mutations + parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=self.cur_population_size, n_parents=2) + p = np.array([self.crossover_probability, self.mutate_then_crossover_probability, self.crossover_then_mutate_probability, self.mutate_probability]) + p = p / p.sum() + var_op_list = np.random.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=self.cur_population_size, p=p) + + for i, op in enumerate(var_op_list): + if op == "mutate": + parents[i] = parents[i][0] #mutations take a single individual - cx_parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=n_crossover, n_parents=2) - m_parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=n_mutations, n_parents=1) - - p = np.array([self.crossover_probability, self.mutate_then_crossover_probability, 
self.crossover_then_mutate_probability]) - p = p/np.sum(p) - var_op_list = np.random.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate"], size=n_crossover, p=p) - var_op_list = np.concatenate([var_op_list, ["mutate"]*n_mutations]) - - parents = list(cx_parents) + list(m_parents) - offspring = self.population.create_offspring2(parents, var_op_list, self.mutation_functions, self.mutation_function_weights, self.crossover_functions, self.crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True) self.population.update_column(offspring, column_names="Generation", data=self.generation, ) From 4aefcd5c8473de1320119079d55cb087401e74bd Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:28:37 -0700 Subject: [PATCH 15/43] added random state where needed, along with other params needed to be passed to models --- tpot2/config/autoqtl_builtins.py | 2 +- tpot2/config/classifiers.py | 69 ++++++++++------- tpot2/config/classifiers_sklearnex.py | 29 +++++--- tpot2/config/mdr_configs.py | 5 +- tpot2/config/regressors.py | 103 +++++++++++++++----------- tpot2/config/regressors_sklearnex.py | 34 ++++++--- tpot2/config/selectors.py | 38 +++++----- tpot2/config/special_configs.py | 14 ++-- tpot2/config/transformers.py | 49 ++++++------ 9 files changed, 196 insertions(+), 147 deletions(-) diff --git a/tpot2/config/autoqtl_builtins.py b/tpot2/config/autoqtl_builtins.py index d4638894..d3cc8dfc 100644 --- a/tpot2/config/autoqtl_builtins.py +++ b/tpot2/config/autoqtl_builtins.py @@ -1,6 +1,7 @@ from tpot2.builtin_modules import genetic_encoders from tpot2.builtin_modules import feature_encoding_frequency_selector import sklearn +import numpy as np def params_FeatureEncodingFrequencySelector(trial, name=None): return { @@ -21,4 +22,3 @@ def make_genetic_encoders_config_dictionary(): genetic_encoders.UnderDominanceEncoder : {}, genetic_encoders.OverDominanceEncoder : {}, } - diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index a82f5f82..88a320b9 100644 --- a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -14,9 +14,11 @@ from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB +import numpy as np -def params_LogisticRegression(trial, name=None): + +def params_LogisticRegression(trial, random_state, name=None): params = {} params['solver'] = trial.suggest_categorical(name=f'solver_{name}', choices=[f'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) @@ -40,22 +42,22 @@ def params_LogisticRegression(trial, name=None): 'C': params['C'], 'n_jobs': 1, 'max_iter': 1000, + 'random_state': random_state } return param_grid def params_KNeighborsClassifier(trial, name=None, n_samples=10): return { - #'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, 20 ), #TODO: set as a function of the number of samples - 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), #TODO: set as a function of the number of samples + 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), 'p': trial.suggest_int('p', 1, 3), - 'metric': trial.suggest_categorical(f'metric_{name}', ['euclidean', 'minkowski']), + 'metric': str(trial.suggest_categorical(f'metric_{name}', ['euclidean', 'minkowski'])), 'n_jobs': 1, } -def params_DecisionTreeClassifier(trial, name=None): +def params_DecisionTreeClassifier(trial, random_state, name=None): return { 'criterion': 
trial.suggest_categorical(f'criterion_{name}', ['gini', 'entropy']), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), @@ -65,10 +67,11 @@ def params_DecisionTreeClassifier(trial, name=None): 'min_weight_fraction_leaf': 0.0, 'max_features': trial.suggest_categorical(f'max_features_{name}', [ 'sqrt', 'log2']), 'max_leaf_nodes': None, + 'random_state': random_state } -def params_SVC(trial, name=None): +def params_SVC(trial, random_state, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -79,10 +82,12 @@ def params_SVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state } -def params_LinearSVC(trial, name=None): +def params_LinearSVC(trial, random_state, name=None): + penalty = trial.suggest_categorical(name=f'penalty_{name}', choices=['l1', 'l2']) if penalty == 'l1': loss = 'squared_hinge' @@ -99,10 +104,11 @@ def params_LinearSVC(trial, name=None): 'loss': loss, 'dual': dual, 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), + 'random_state': random_state } -def params_RandomForestClassifier(trial, name=None): +def params_RandomForestClassifier(trial, random_state, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=['gini', 'entropy']), @@ -111,11 +117,13 @@ def params_RandomForestClassifier(trial, name=None): 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 20), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20), 'n_jobs': 1, + 'random_state': random_state, } return params -def params_GradientBoostingClassifier(trial,n_classes=None, name=None): +def params_GradientBoostingClassifier(trial, random_state, n_classes=None, name=None): + if n_classes is not None and n_classes > 2: loss = 'log_loss' else: @@ -131,11 +139,12 @@ def params_GradientBoostingClassifier(trial,n_classes=None, name=None): 'max_features': trial.suggest_float(f'max_features_{name}', 0.1, 1.0), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 10), 'tol': 1e-4, + 'random_state': random_state, } return params -def params_XGBClassifier(trial, name=None): +def params_XGBClassifier(trial, random_state, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.1, 1.0), @@ -145,10 +154,11 @@ def params_XGBClassifier(trial, name=None): 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), 'n_jobs': 1, #'use_label_encoder' : True, + 'random_state': random_state, } -def params_LGBMClassifier(trial, name=None): +def params_LGBMClassifier(trial, random_state, name=None): params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -159,13 +169,15 @@ def params_LGBMClassifier(trial, name=None): 'deterministic': True, 'force_row_wise': True, 'n_jobs': 1, + 'random_state': random_state + } if 2 ** params['max_depth'] > params['num_leaves']: params['num_leaves'] = 2 ** params['max_depth'] return params -def params_ExtraTreesClassifier(trial, name=None): +def params_ExtraTreesClassifier(trial, random_state, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=["gini", "entropy"]), @@ -174,10 +186,11 @@ def params_ExtraTreesClassifier(trial, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21, step=1), 'bootstrap': 
trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'n_jobs': 1, + 'random_state': random_state, } return params -def params_SGDClassifier(trial, name=None): +def params_SGDClassifier(trial, random_state, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['log_loss', 'modified_huber',]), 'penalty': 'elasticnet', @@ -188,20 +201,21 @@ def params_SGDClassifier(trial, name=None): 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), 'n_jobs': 1, + 'random_state': random_state } return params -def params_MLPClassifier_tpot(trial, name=None): +def params_MLPClassifier_tpot(trial, random_state, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), - 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True) + 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), + 'random_state': random_state, } return params def params_MLPClassifier_large(trial, name=None): - n_layers = trial.suggest_int(f'n_layers_{name}', 2, 3) layers = [] for i in range(n_layers): @@ -217,6 +231,7 @@ def params_MLPClassifier_large(trial, name=None): return params + def params_BernoulliNB(trial, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-3, 100, log=True), @@ -233,23 +248,23 @@ def params_MultinomialNB(trial, name=None): return params -def make_classifier_config_dictionary(n_samples=10, n_classes=None): +def make_classifier_config_dictionary(random_state, n_samples=10, n_classes=None): n_samples = min(n_samples,100) #TODO optimize this return { - LogisticRegression: params_LogisticRegression, - DecisionTreeClassifier: params_DecisionTreeClassifier, + LogisticRegression: partial(params_LogisticRegression, random_state=random_state), + DecisionTreeClassifier: partial(params_DecisionTreeClassifier, random_state=random_state), KNeighborsClassifier: partial(params_KNeighborsClassifier,n_samples=n_samples), - GradientBoostingClassifier: partial(params_GradientBoostingClassifier, n_classes=n_classes), - ExtraTreesClassifier:params_ExtraTreesClassifier, - RandomForestClassifier: params_RandomForestClassifier, - SGDClassifier:params_SGDClassifier, + GradientBoostingClassifier: partial(params_GradientBoostingClassifier, random_state=random_state, n_classes=n_classes), + ExtraTreesClassifier: partial(params_ExtraTreesClassifier, random_state=random_state), + RandomForestClassifier: partial(params_RandomForestClassifier, random_state=random_state), + SGDClassifier: partial(params_SGDClassifier, random_state=random_state), GaussianNB: {}, BernoulliNB: params_BernoulliNB, MultinomialNB: params_MultinomialNB, - XGBClassifier: params_XGBClassifier, - #LinearSVC: params_LinearSVC, - SVC: params_SVC, + XGBClassifier: partial(params_XGBClassifier, random_state=random_state), + #LinearSVC: partial(params_LinearSVC, random_state=random_state), + SVC: partial(params_SVC, random_state=random_state), #: params_LGBMClassifier, # logistic regression and SVM/SVC are just special cases of this one? remove? 
- MLPClassifier: params_MLPClassifier_tpot, + MLPClassifier: partial(params_MLPClassifier_tpot, random_state=random_state), } diff --git a/tpot2/config/classifiers_sklearnex.py b/tpot2/config/classifiers_sklearnex.py index 7d4129d0..fe7f213b 100644 --- a/tpot2/config/classifiers_sklearnex.py +++ b/tpot2/config/classifiers_sklearnex.py @@ -4,14 +4,18 @@ from sklearnex.svm import NuSVC from sklearnex.linear_model import LogisticRegression +import numpy as np -def params_RandomForestClassifier(trial, name=None): +from functools import partial + +def params_RandomForestClassifier(trial, random_state, name=None): return { 'n_estimators': 100, 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 20), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20), 'n_jobs': 1, + 'random_state': random_state } def params_KNeighborsClassifier(trial, name=None, n_samples=10): @@ -21,7 +25,7 @@ def params_KNeighborsClassifier(trial, name=None, n_samples=10): 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), } -def params_LogisticRegression(trial, name=None): +def params_LogisticRegression(trial, random_state, name=None): params = {} params['dual'] = False params['penalty'] = 'l2' @@ -38,9 +42,10 @@ def params_LogisticRegression(trial, name=None): 'dual': params['dual'], 'C': trial.suggest_float(f'C_{name}', 1e-4, 1e4, log=True), 'max_iter': 1000, + 'random_state': random_state, } -def params_SVC(trial, name=None): +def params_SVC(trial, random_state, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -49,9 +54,10 @@ def params_SVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state, } -def params_NuSVC(trial, name=None): +def params_NuSVC(trial, random_state, name=None): return { 'nu': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), @@ -61,13 +67,14 @@ def params_NuSVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state, } -def make_sklearnex_classifier_config_dictionary(n_samples=10, n_classes=None): +def make_sklearnex_classifier_config_dictionary(random_state, n_samples=10, n_classes=None): return { - RandomForestClassifier: params_RandomForestClassifier, - KNeighborsClassifier: params_KNeighborsClassifier, - LogisticRegression: params_LogisticRegression, - SVC: params_SVC, - NuSVC: params_NuSVC, - } + RandomForestClassifier: partial(params_RandomForestClassifier, random_state=random_state), + KNeighborsClassifier: partial(params_KNeighborsClassifier, n_samples=n_samples), + LogisticRegression: partial(params_LogisticRegression, random_state=random_state), + SVC: partial(params_SVC, random_state=random_state), + NuSVC: partial(params_NuSVC, random_state=random_state), + } \ No newline at end of file diff --git a/tpot2/config/mdr_configs.py b/tpot2/config/mdr_configs.py index aff4ee87..9634b534 100644 --- a/tpot2/config/mdr_configs.py +++ b/tpot2/config/mdr_configs.py @@ -2,6 +2,8 @@ from skrebate import ReliefF, SURF, SURFstar, MultiSURF from functools import partial +import numpy as np + #MDR def params_MDR(trial, name=None): return { @@ -57,5 +59,4 @@ def make_MDR_config_dictionary(): def 
make_ContinuousMDR_config_dictionary(): return { ContinuousMDR : params_ContinuousMDR - } - + } \ No newline at end of file diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 930e0e7e..7c08a28f 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -18,7 +18,7 @@ from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import ElasticNetCV - +import numpy as np from xgboost import XGBRegressor from functools import partial @@ -29,18 +29,19 @@ #TODO: fill in remaining #TODO check for places were we could use log scaling -def params_RandomForestRegressor(trial, name=None): +def params_RandomForestRegressor(trial, random_state, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), + 'random_state': random_state, } # SGDRegressor parameters -def params_SGDRegressor(trial, name=None): +def params_SGDRegressor(trial, random_state, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['huber', 'squared_error', 'epsilon_insensitive', 'squared_epsilon_insensitive']), 'penalty': 'elasticnet', @@ -49,13 +50,14 @@ def params_SGDRegressor(trial, name=None): 'fit_intercept':True, 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), - 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True) + 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), + 'random_state': random_state, } return params # Ridge parameters -def params_Ridge(trial, name=None): +def params_Ridge(trial, random_state, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -64,12 +66,13 @@ def params_Ridge(trial, name=None): #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'solver': trial.suggest_categorical(f'solver_{name}', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']), + 'random_state': random_state, } return params # Lasso parameters -def params_Lasso(trial, name=None): +def params_Lasso(trial, random_state, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -81,30 +84,33 @@ def params_Lasso(trial, name=None): 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), + 'random_state': random_state, } return params # ElasticNet parameters -def params_ElasticNet(trial, name=None): +def params_ElasticNet(trial, random_state, name=None): params = { 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), + 'random_state': random_state, } return params # Lars parameters -def params_Lars(trial, name=None): +def params_Lars(trial, random_state, name=None): params = { 'fit_intercept': True, 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), - + # 'precompute': trial.suggest_categorical(f'precompute_{name}', ['auto_{name}', True, False]), 'n_nonzero_coefs': 
trial.suggest_int(f'n_nonzero_coefs_{name}', 1, 100), 'eps': trial.suggest_float(f'eps_{name}', 1e-5, 1e-1, log=True), 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'fit_path': trial.suggest_categorical(f'fit_path_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), + 'random_state': random_state, } return params @@ -136,7 +142,7 @@ def params_BayesianRidge(trial, name=None): return params # LassoLars parameters -def params_LassoLars(trial, name=None): +def params_LassoLars(trial, random_state, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), # 'fit_intercept': True, @@ -146,24 +152,27 @@ def params_LassoLars(trial, name=None): 'eps': trial.suggest_float(f'eps_{name}', 1e-5, 1e-1, log=True), # 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), + 'random_state': random_state, } return params # LassoLars parameters -def params_LassoLarsCV(trial, name=None): +def params_LassoLarsCV(trial, cv, name=None): params = { 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), + 'cv': cv, } return params # BaggingRegressor parameters -def params_BaggingRegressor(trial, name=None): +def params_BaggingRegressor(trial, random_state, name=None): params = { 'n_estimators': trial.suggest_int(f'n_estimators_{name}', 10, 100), 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.00), 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'bootstrap_features': trial.suggest_categorical(f'bootstrap_features_{name}', [True, False]), + 'random_state': random_state, } return params @@ -187,13 +196,14 @@ def params_ARDRegression(trial, name=None): # TheilSenRegressor parameters -def params_TheilSenRegressor(trial, name=None): +def params_TheilSenRegressor(trial, random_state, name=None): params = { 'n_subsamples': trial.suggest_int(f'n_subsamples_{name}', 10, 100), 'max_subpopulation': trial.suggest_int(f'max_subpopulation_{name}', 100, 1000), 'fit_intercept': True, 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), + 'random_state': random_state, } return params @@ -208,9 +218,9 @@ def params_SVR(trial, name=None): 'tol': 0.005, } return params - + # Perceptron parameters -def params_Perceptron(trial, name=None): +def params_Perceptron(trial, random_state, name=None): params = { 'penalty': trial.suggest_categorical(f'penalty_{name}', [None, 'l2', 'l1', 'elasticnet']), 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 1e-1, log=True), @@ -228,20 +238,22 @@ def params_Perceptron(trial, name=None): 'class_weight': trial.suggest_categorical(f'class_weight_{name}', [None, 'balanced']), 'warm_start': trial.suggest_categorical(f'warm_start_{name}', [True, False]), 'average': trial.suggest_categorical(f'average_{name}', [True, False]), + 'random_state': random_state, } return params -def params_MLPRegressor(trial, name=None): +def params_MLPRegressor(trial, random_state, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), - 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True) + 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), + 'random_state': random_state, } return params 
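The recurring pattern in this commit is worth calling out: rather than sampling random_state inside each params_* function, the seed is accepted as an argument and later frozen in with functools.partial when the config dictionary is built, so every estimator produced in a given TPOT run shares one seed. The sketch below is an assumed, simplified illustration of that pattern; params_rf_sketch and _FixedTrial are placeholders, not code from the repository.

from functools import partial
from sklearn.ensemble import RandomForestRegressor

def params_rf_sketch(trial, random_state, name=None):
    # Same shape as the patched params_RandomForestRegressor: the seed is not
    # sampled, it is threaded straight through into the returned dict.
    return {
        'n_estimators': 100,
        'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0),
        'random_state': random_state,
    }

class _FixedTrial:
    # Degenerate suggester that always returns the midpoint; enough to show the pattern.
    def suggest_float(self, name, low, high, step=None, log=False):
        return (low + high) / 2

# The config dictionary stores the params function with random_state already bound,
# so every call yields a reproducibly seeded estimator.
config_entry = partial(params_rf_sketch, random_state=42)
reg = RandomForestRegressor(**config_entry(_FixedTrial(), name="rf_0"))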
#GradientBoostingRegressor parameters -def params_GradientBoostingRegressor(trial, name=None): +def params_GradientBoostingRegressor(trial, random_state, name=None): loss = trial.suggest_categorical(f'loss_{name}', ['ls', 'lad', 'huber', 'quantile']) params = { @@ -254,6 +266,7 @@ def params_GradientBoostingRegressor(trial, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), 'subsample': 1-trial.suggest_float(f'subsample_{name}', 0.05, 1.00, log=True), 'max_features': 1-trial.suggest_float(f'max_features_{name}', 0.05, 1.00, log=True), + 'random_state': random_state, } @@ -265,7 +278,7 @@ def params_GradientBoostingRegressor(trial, name=None): -def params_DecisionTreeRegressor(trial, name=None): +def params_DecisionTreeRegressor(trial, random_state, name=None): params = { 'max_depth': trial.suggest_int(f'max_depth_{name}', 1,11), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), @@ -274,13 +287,14 @@ def params_DecisionTreeRegressor(trial, name=None): # 'splitter': trial.suggest_categorical(f'splitter_{name}', ['best', 'random']), #'max_features': trial.suggest_categorical(f'max_features_{name}', [None, 'auto', 'sqrt', 'log2']), #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-1, 10.0), - + 'random_state': random_state, + } return params def params_KNeighborsRegressor(trial, name=None, n_samples=100): params = { - 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, 100), + 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), 'p': trial.suggest_int(f'p_{name}', 1, 3), 'metric': trial.suggest_categorical(f'metric_{name}', ['minkowski', 'euclidean', 'manhattan']), @@ -288,19 +302,20 @@ def params_KNeighborsRegressor(trial, name=None, n_samples=100): } return params -def params_LinearSVR(trial, name=None): +def params_LinearSVR(trial, random_state, name=None): params = { 'epsilon': trial.suggest_float(f'epsilon_{name}', 1e-4, 1.0, log=True), 'C': trial.suggest_float(f'C_{name}', 1e-4,25.0, log=True), 'dual': trial.suggest_categorical(f'dual_{name}', [True,False]), 'loss': trial.suggest_categorical(f'loss_{name}', ['epsilon_insensitive', 'squared_epsilon_insensitive']), + 'random_state': random_state, } return params # XGBRegressor parameters -def params_XGBRegressor(trial, name=None): +def params_XGBRegressor(trial, random_state, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), @@ -311,20 +326,22 @@ def params_XGBRegressor(trial, name=None): 'nthread': 1, 'verbosity': 0, 'objective': 'reg:squarederror', + 'random_state': random_state, } -def params_AdaBoostRegressor(trial, name=None): +def params_AdaBoostRegressor(trial, random_state, name=None): params = { 'n_estimators': 100, 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1.0, log=True), 'loss': trial.suggest_categorical(f'loss_{name}', ['linear', 'square', 'exponential']), + 'random_state': random_state, } return params # ExtraTreesRegressor parameters -def params_ExtraTreesRegressor(trial, name=None): +def params_ExtraTreesRegressor(trial, random_state, name=None): params = { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), @@ -333,7 +350,7 @@ def params_ExtraTreesRegressor(trial, name=None): 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 
#'criterion': trial.suggest_categorical(f'criterion_{name}', ['squared_error', 'poisson', 'absolute_error', 'friedman_mse']), - + #'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 10), #'min_weight_fraction_leaf': trial.suggest_float(f'min_weight_fraction_leaf_{name}', 0.0, 0.5), @@ -341,41 +358,41 @@ def params_ExtraTreesRegressor(trial, name=None): #'max_leaf_nodes': trial.suggest_int(f'max_leaf_nodes_{name}', 2, 100), #'min_impurity_decrease': trial.suggest_float(f'min_impurity_decrease_{name}', 1e-5, 1e-1, log=True), # 'min_impurity_split': trial.suggest_float(f'min_impurity_split_{name}', 1e-5, 1e-1, log=True), - + #if bootstrap is True #'oob_score': trial.suggest_categorical(f'oob_score_{name}', [True, False]), - + #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-5, 1e-1, log=True), # 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), + + 'random_state': random_state, } return params -def make_regressor_config_dictionary(n_samples=10): +def make_regressor_config_dictionary(random_state, cv, n_samples=10): n_samples = min(n_samples,100) #TODO optimize this - + regressor_config_dictionary = { #ElasticNet: params_ElasticNet, ElasticNetCV: { 'l1_ratio': [.1, .5, .7, .9, .95, .99, 1], - 'cv': 5, + 'cv': cv, }, - ExtraTreesRegressor: params_ExtraTreesRegressor, - GradientBoostingRegressor: params_GradientBoostingRegressor, - AdaBoostRegressor: params_AdaBoostRegressor, - DecisionTreeRegressor: params_DecisionTreeRegressor, + ExtraTreesRegressor: partial(params_ExtraTreesRegressor, random_state=random_state), + GradientBoostingRegressor: partial(params_GradientBoostingRegressor, random_state=random_state), + AdaBoostRegressor: partial(params_AdaBoostRegressor, random_state=random_state), + DecisionTreeRegressor: partial(params_DecisionTreeRegressor, random_state=random_state), KNeighborsRegressor: partial(params_KNeighborsRegressor,n_samples=n_samples), - LassoLarsCV: params_LassoLarsCV, + LassoLarsCV: partial(params_LassoLarsCV, cv=cv), SVR: params_SVR, - RandomForestRegressor: params_RandomForestRegressor, - RidgeCV: {}, - XGBRegressor: params_XGBRegressor, - SGDRegressor: params_SGDRegressor, + RandomForestRegressor: partial(params_RandomForestRegressor, random_state=random_state), + RidgeCV: {'cv': cv}, + XGBRegressor: partial(params_XGBRegressor, random_state=random_state), + SGDRegressor: partial(params_SGDRegressor, random_state= random_state), } - - return regressor_config_dictionary - + return regressor_config_dictionary \ No newline at end of file diff --git a/tpot2/config/regressors_sklearnex.py b/tpot2/config/regressors_sklearnex.py index 4eb10f1c..fe102525 100644 --- a/tpot2/config/regressors_sklearnex.py +++ b/tpot2/config/regressors_sklearnex.py @@ -9,14 +9,19 @@ from sklearnex.ensemble import RandomForestRegressor from sklearnex.neighbors import KNeighborsRegressor +import numpy as np -def params_RandomForestRegressor(trial, name=None): +from functools import partial + + +def params_RandomForestRegressor(trial, random_state, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), + 'random_state': random_state, } def params_KNeighborsRegressor(trial, name=None, n_samples=100): @@ -29,14 +34,15 @@ def params_KNeighborsRegressor(trial, name=None, 
n_samples=100): def params_LinearRegression(trial, name=None): return {} -def params_Ridge(trial, name=None): +def params_Ridge(trial, random_state, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), + 'random_state': random_state, } -def params_Lasso(trial, name=None): +def params_Lasso(trial, random_state, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -44,22 +50,26 @@ def params_Lasso(trial, name=None): 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), + 'random_state': random_state, } -def params_ElasticNet(trial, name=None): - return { - 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0), +def params_ElasticNet(trial, random_state, name=None): + params = { + 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), + 'random_state': random_state, } + return params def params_SVR(trial, name=None): - return { + params = { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), 'degree': trial.suggest_int(f'degree_{name}', 1, 4), 'max_iter': 3000, 'tol': 0.005, } + return params def params_NuSVR(trial, name=None): return { @@ -71,14 +81,14 @@ def params_NuSVR(trial, name=None): 'tol': 0.005, } -def make_sklearnex_regressor_config_dictionary(n_samples=10): +def make_sklearnex_regressor_config_dictionary(random_state, n_samples=10): return { - RandomForestRegressor: params_RandomForestRegressor, + RandomForestRegressor: partial(params_RandomForestRegressor, random_state=random_state), KNeighborsRegressor: params_KNeighborsRegressor, LinearRegression: params_LinearRegression, - Ridge: params_Ridge, - Lasso: params_Lasso, - ElasticNet: params_ElasticNet, + Ridge: partial(params_Ridge, random_state=random_state), + Lasso: partial(params_Lasso, random_state=random_state), + ElasticNet: partial(params_ElasticNet, random_state=random_state), SVR: params_SVR, NuSVR: params_NuSVR, } diff --git a/tpot2/config/selectors.py b/tpot2/config/selectors.py index 2a46ce89..0e4b28ff 100644 --- a/tpot2/config/selectors.py +++ b/tpot2/config/selectors.py @@ -32,11 +32,12 @@ def params_sklearn_feature_selection_VarianceThreshold(trial, name=None): #TODO add more estimator options? How will that interact with optuna? 
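The hunks below apply the same seeding to the wrapped selectors: RFE and SelectFromModel now build their internal ExtraTrees estimator with the shared random_state. As a hedged sketch of what such a configuration ultimately constructs (the hyperparameter values here are placeholders that would normally be drawn through the Trial suggester):

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

random_state = 0  # assumed seed passed down from the TPOT estimator
inner_estimator = ExtraTreesClassifier(n_estimators=100,
                                       max_features=0.5,
                                       random_state=random_state)
selector = RFE(estimator=inner_estimator, step=0.25)
# Fitting this selector is reproducible run to run because the nested trees and
# the wrapper share the same fixed seed.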
-def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): +def params_sklearn_feature_selection_RFE(trial, random_state, name=None, classifier=True): + if classifier: - estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"RFE_{name}")) + estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"RFE_{name}")) else: - estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"RFE_{name}")) + estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, random_state=random_state, name=f"RFE_{name}")) params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), @@ -46,11 +47,12 @@ def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): return params -def params_sklearn_feature_selection_SelectFromModel(trial, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel(trial, random_state, name=None, classifier=True): + if classifier: - estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"SFM_{name}")) + estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"SFM_{name}")) else: - estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"SFM_{name}")) + estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, random_state=random_state, name=f"SFM_{name}")) params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), @@ -61,30 +63,32 @@ def params_sklearn_feature_selection_SelectFromModel(trial, name=None, classifie -def params_sklearn_feature_selection_RFE_wrapped(trial, name=None, classifier=True): +def params_sklearn_feature_selection_RFE_wrapped(trial, random_state, name=None, classifier=True): + params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), } if classifier: - estimator_params = params_ExtraTreesClassifier(trial, name=f"RFE_{name}") + estimator_params = params_ExtraTreesClassifier(trial, random_state=random_state, name=f"RFE_{name}") else: - estimator_params = params_ExtraTreesRegressor(trial, name=f"RFE_{name}") + estimator_params = params_ExtraTreesRegressor(trial, random_state=random_state, name=f"RFE_{name}") params.update(estimator_params) return params -def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, random_state, name=None, classifier=True): + params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), } if classifier: - estimator_params = params_ExtraTreesClassifier(trial, name=f"SFM_{name}") + estimator_params = params_ExtraTreesClassifier(trial, random_state=random_state, name=f"SFM_{name}") else: - estimator_params = params_ExtraTreesRegressor(trial, name=f"SFM_{name}") + estimator_params = params_ExtraTreesRegressor(trial, random_state=random_state, name=f"SFM_{name}") params.update(estimator_params) @@ -92,14 +96,14 @@ def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, name=None, c -def make_selector_config_dictionary(classifier=True): +def make_selector_config_dictionary(random_state, classifier=True): if classifier: - params = {RFE_ExtraTreesClassifier : partial(params_sklearn_feature_selection_RFE_wrapped, classifier=classifier), - SelectFromModel_ExtraTreesClassifier : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, classifier=classifier), + params = 
{RFE_ExtraTreesClassifier : partial(params_sklearn_feature_selection_RFE_wrapped, random_state=random_state, classifier=classifier), + SelectFromModel_ExtraTreesClassifier : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, random_state=random_state, classifier=classifier), } else: - params = {RFE_ExtraTreesRegressor : partial(params_sklearn_feature_selection_RFE_wrapped, classifier=classifier), - SelectFromModel_ExtraTreesRegressor : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, classifier=classifier), + params = {RFE_ExtraTreesRegressor : partial(params_sklearn_feature_selection_RFE_wrapped, random_state=random_state, classifier=classifier), + SelectFromModel_ExtraTreesRegressor : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, random_state=random_state, classifier=classifier), } params.update({ SelectFwe: params_sklearn_feature_selection_SelectFwe, diff --git a/tpot2/config/special_configs.py b/tpot2/config/special_configs.py index 0f1e1e85..a6745b6f 100644 --- a/tpot2/config/special_configs.py +++ b/tpot2/config/special_configs.py @@ -11,9 +11,9 @@ def params_arthmetic_operator(trial, name=None): } def make_arithmetic_transformer_config_dictionary(): - return { + return { AddTransformer: {}, - mul_neg_1_Transformer: {}, + mul_neg_1_Transformer: {}, MulTransformer: {}, SafeReciprocalTransformer: {}, EQTransformer: {}, @@ -22,7 +22,7 @@ def make_arithmetic_transformer_config_dictionary(): GTTransformer: {}, LETransformer: {}, LTTransformer: {}, - MinTransformer: {}, + MinTransformer: {}, MaxTransformer: {}, } @@ -65,10 +65,10 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None Parameters ---------- subsets: Sets the subsets to select from. - - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - + n_features: int the number of features in the dataset. If subsets is None, each column will be treated as a subset. One column will be selected per subset. 
""" @@ -76,10 +76,10 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None #require at least of of the parameters if subsets is None and n_features is None: raise ValueError('At least one of the parameters must be provided') - + if isinstance(subsets, str): df = pd.read_csv(subsets,header=None,index_col=0) - df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) + df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) subset_dict = {} for row in df.index: subset_dict[row] = df.loc[row]['features'] diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 6608c6eb..3f2fbb8a 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -1,4 +1,6 @@ from functools import partial +import numpy as np + from tpot2.builtin_modules import ZeroCount, OneHotEncoder from sklearn.preprocessing import Binarizer from sklearn.decomposition import FastICA @@ -19,14 +21,16 @@ def params_sklearn_preprocessing_Binarizer(trial, name=None): 'threshold': trial.suggest_float(f'threshold_{name}', 0.0, 1.0), } -def params_sklearn_decomposition_FastICA(trial, name=None, n_features=100): +def params_sklearn_decomposition_FastICA(trial, random_state, name=None, n_features=100): return { + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), # number of components wrt number of features 'algorithm': trial.suggest_categorical(f'algorithm_{name}', ['parallel', 'deflation']), 'whiten':'unit-variance', + 'random_state': random_state, } def params_sklearn_cluster_FeatureAgglomeration(trial, name=None, n_features=100): - + linkage = trial.suggest_categorical(f'linkage_{name}', ['ward', 'complete', 'average']) if linkage == 'ward': metric = 'euclidean' @@ -35,74 +39,65 @@ def params_sklearn_cluster_FeatureAgglomeration(trial, name=None, n_features=100 return { 'linkage': linkage, 'metric': metric, - 'n_clusters': trial.suggest_int(f'n_clusters_{name}', 2, 4), #TODO perhaps a percentage of n_features + 'n_clusters': trial.suggest_int(f'n_clusters_{name}', 2, n_features-1), #TODO perhaps a percentage of n_features } - - def params_sklearn_preprocessing_Normalizer(trial, name=None): return { 'norm': trial.suggest_categorical(f'norm_{name}', ['l1', 'l2', 'max']), } -def params_sklearn_kernel_approximation_Nystroem(trial, name=None, n_features=100): +def params_sklearn_kernel_approximation_Nystroem(trial, random_state, name=None, n_features=100): return { 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), 'kernel': trial.suggest_categorical(f'kernel_{name}', ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid']), - 'n_components': trial.suggest_int(f'n_components_{name}', 1, 11), #TODO perhaps a percentage of n_features + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), + 'random_state': random_state, } -def params_sklearn_decomposition_PCA(trial, name=None, n_features=100): +def params_sklearn_decomposition_PCA(trial, random_state, name=None, n_features=100): # keep the number of components required to explain 'variance_explained' of the variance - variance_explained = 1 - trial.suggest_float(f'n_components_{name}', 0.001, 0.5, log=True) #values closer to 1 are more likely - + variance_explained = 1.0 - trial.suggest_float(f'n_components_{name}', 0.001, 0.5, log=True) #values closer to 1 are more likely + return { 'n_components': variance_explained, + 'random_state': random_state, } - -def 
params_sklearn_kernel_approximation_RBFSampler(trial, name=None, n_features=100): +def params_sklearn_kernel_approximation_RBFSampler(trial, random_state, name=None, n_features=100): return { + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), + 'random_state': random_state, } - - - def params_tpot_builtins_ZeroCount(trial, name=None): return {} - def params_tpot_builtins_OneHotEncoder(trial, name=None): return {} - - - - -def make_transformer_config_dictionary(n_features=10): +def make_transformer_config_dictionary(random_state, n_features=10): #n_features = min(n_features,100) #TODO optimize this return { Binarizer: params_sklearn_preprocessing_Binarizer, - FastICA: partial(params_sklearn_decomposition_FastICA,n_features=n_features), + FastICA: partial(params_sklearn_decomposition_FastICA, random_state=random_state, n_features=n_features), FeatureAgglomeration: partial(params_sklearn_cluster_FeatureAgglomeration,n_features=n_features), MaxAbsScaler: {}, MinMaxScaler: {}, Normalizer: params_sklearn_preprocessing_Normalizer, - Nystroem: partial(params_sklearn_kernel_approximation_Nystroem,n_features=n_features), - PCA: partial(params_sklearn_decomposition_PCA,n_features=n_features), + Nystroem: partial(params_sklearn_kernel_approximation_Nystroem, random_state=random_state, n_features=n_features), + PCA: partial(params_sklearn_decomposition_PCA, random_state=random_state, n_features=n_features), PolynomialFeatures: { 'degree': 2, 'include_bias': False, 'interaction_only': False, }, - RBFSampler: partial(params_sklearn_kernel_approximation_RBFSampler,n_features=n_features), + RBFSampler: partial(params_sklearn_kernel_approximation_RBFSampler, random_state=random_state, n_features=n_features), RobustScaler: {}, StandardScaler: {}, ZeroCount: params_tpot_builtins_ZeroCount, OneHotEncoder: params_tpot_builtins_OneHotEncoder, } - - From 7e4ae9cbd06d17c4993e447ffbd69fa081364eeb Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:43:00 -0700 Subject: [PATCH 16/43] passed rng to other classes that need it: population, mutation stuff, etc. 
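The recurring idiom in this patch is to accept an rng_ argument and normalize it
with np.random.default_rng, which accepts None, an int seed, or an existing
Generator and returns a passed-in Generator unchanged, so a single stream can be
threaded from the evolver down into mutation and crossover. A minimal sketch of
the idiom (toy_mutate is a hypothetical helper for illustration, not part of TPOT2):

    import numpy as np

    def toy_mutate(values, rng_=None):
        # default_rng(None) -> fresh entropy, default_rng(int) -> seeded,
        # default_rng(Generator) -> the same Generator, passed through unchanged.
        rng = np.random.default_rng(rng_)
        return values + rng.normal(size=len(values))

    a = toy_mutate(np.zeros(3), rng_=np.random.default_rng(0))
    b = toy_mutate(np.zeros(3), rng_=np.random.default_rng(0))
    assert np.allclose(a, b)  # identical seeds give identical mutations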
--- tpot2/evolvers/base_evolver.py | 183 +++++++++++++++++---------------- 1 file changed, 97 insertions(+), 86 deletions(-) diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py index 7a94cf10..9959f9ab 100644 --- a/tpot2/evolvers/base_evolver.py +++ b/tpot2/evolvers/base_evolver.py @@ -20,16 +20,18 @@ import math from tpot2.utils.utils import get_thresholds, beta_interpolation, remove_items, equalize_list -def ind_mutate(ind): - return ind.mutate() +def ind_mutate(ind, rng_): + rng = np.random.default_rng(rng_) + return ind.mutate(rng_=rng) -def ind_crossover(ind1, ind2): - return ind1.crossover(ind2) +def ind_crossover(ind1, ind2, rng_): + rng = np.random.default_rng(rng_) + return ind1.crossover(ind2, rng_=rng) class BaseEvolver(): - def __init__( self, + def __init__( self, individual_generator , - + objective_functions, objective_function_weights, objective_names = None, @@ -38,53 +40,55 @@ def __init__( self, population_size = 50, initial_population_size = None, - population_scaling = .5, - generations_until_end_population = 1, - generations = 50, + population_scaling = .5, + generations_until_end_population = 1, + generations = 50, early_stop = None, early_stop_tol = 0.001, - - max_time_seconds=float("inf"), + + max_time_seconds=float("inf"), max_eval_time_seconds=60*5, n_jobs=1, memory_limit="4GB", client=None, - + survival_percentage = 1, crossover_probability=.2, mutate_probability=.7, mutate_then_crossover_probability=.05, crossover_then_mutate_probability=.05, - + mutation_functions = [ind_mutate], crossover_functions = [ind_crossover], mutation_function_weights = None, crossover_function_weights = None, - + n_parents=2, survival_selector = survival_select_NSGA2, parent_selector = tournament_selection_dominated, - - budget_range = None, - budget_scaling = .5, - generations_until_end_budget = 1, + + budget_range = None, + budget_scaling = .5, + generations_until_end_budget = 1, stepwise_steps = 5, - - threshold_evaluation_early_stop = None, + + threshold_evaluation_early_stop = None, threshold_evaluation_scaling = .5, min_history_threshold = 20, selection_evaluation_early_stop = None, selection_evaluation_scaling = .5, - evaluation_early_stop_steps = None, + evaluation_early_stop_steps = None, final_score_strategy = "mean", - verbose = 0, + verbose = 0, periodic_checkpoint_folder = None, callback = None, + rng_=None, + ) -> None: """ Uses mutation, crossover, and optimization functions to evolve a population of individuals towards the given objective functions. @@ -95,7 +99,7 @@ def __init__( self, Generator that yields new base individuals. Used to generate initial population. objective_functions : list of callables list of functions that get applied to the individual and return a float or list of floats - If an objective function returns multiple values, they are all concatenated in order + If an objective function returns multiple values, they are all concatenated in order with respect to objective_function_weights and early_stop_tol. objective_function_weights : list of floats list of weights for each objective function. Sign flips whether bigger is better or not @@ -111,8 +115,8 @@ def __init__( self, Size of the initial population. If None, population_size will be used. population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. 
- generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size generations : int, default=50 Number of generations to run early_stop : int, default=None @@ -121,7 +125,7 @@ def __init__( self, -list of floats list of tolerances for each objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. @@ -132,9 +136,9 @@ def __init__( self, memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. survival_percentage : float, default=1 - Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. + Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. For example, if the population size is 100 and the survival percentage is .5, 50 individuals will be selected with NSGA2 from the existing population. These will be used for mutation and crossover to generate the next 100 individuals for the next generation. The remainder are discarded from the live population. In the next generation, there will now be the 50 parents + the 100 individuals for a total of 150. Surivival percentage is based of the population size parameter and not the existing population size (current population size when using successive halving). Therefore, in the next generation we will still select 50 individuals from the currently existing 150. crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. @@ -170,12 +174,12 @@ def __init__( self, selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - selection_evaluation_scaling : float, default=0.5 + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. 
evaluation_early_stop_steps : int, default=1 The number of steps that will be taken from the objective function. (e.g., the number of CV folds to evaluate) - final_score_strategy : str, default="mean" + final_score_strategy : str, default="mean" The strategy to use when determining the final score for an individual. "mean": The mean of all objective scores "last": The score returned by the last call. Currently each objective is evaluated with a clone of the individual. @@ -192,16 +196,24 @@ def __init__( self, If provided, training will resume from this checkpoint. callback : tpot2.CallBackInterface, default=None Callback object. Not implemented + rng_ : Numpy.Random.Generator, None, default=None + An object for reproducability of experiments. This value will be passed to numpy.random.default_rng() to create an instnce of the genrator to pass to other classes + + - Numpy.Random.Generator + Will be used to create and lock in Generator instance with 'numpy.random.default_rng()'. Note this will be the same Generator passed in. + - None + Will be used to create Generator for 'numpy.random.default_rng()' where a fresh, unpredictable entropy will be pulled from the OS """ + self.rng = np.random.default_rng(rng_) if threshold_evaluation_early_stop is not None or selection_evaluation_early_stop is not None: if evaluation_early_stop_steps is None: raise ValueError("evaluation_early_stop_steps must be set when using threshold_evaluation_early_stop or selection_evaluation_early_stop") - self.individual_generator = individual_generator - self.population_size = population_size - self.objective_functions = objective_functions + self.individual_generator = individual_generator + self.population_size = population_size + self.objective_functions = objective_functions self.objective_function_weights = np.array(objective_function_weights) self.bigger_is_better = bigger_is_better if not bigger_is_better: @@ -220,32 +232,32 @@ def __init__( self, self.periodic_checkpoint_folder = periodic_checkpoint_folder - self.verbose = verbose - self.callback = callback - self.generations = generations + self.verbose = verbose + self.callback = callback + self.generations = generations self.n_jobs = n_jobs - + if max_time_seconds is None: self.max_time_seconds = float("inf") else: - self.max_time_seconds = max_time_seconds - + self.max_time_seconds = max_time_seconds + #functools requires none for infinite time, doesn't support inf if max_eval_time_seconds is not None and math.isinf(max_eval_time_seconds ): self.max_eval_time_seconds = None else: self.max_eval_time_seconds = max_eval_time_seconds - - + + self.generation = 0 self.threshold_evaluation_early_stop =threshold_evaluation_early_stop - self.threshold_evaluation_scaling = max(0.00001,threshold_evaluation_scaling ) + self.threshold_evaluation_scaling = max(0.00001,threshold_evaluation_scaling ) self.min_history_threshold = min_history_threshold self.selection_evaluation_early_stop = selection_evaluation_early_stop @@ -266,7 +278,7 @@ def __init__( self, self.survival_selector=survival_selector self.parent_selector=parent_selector self.survival_percentage = survival_percentage - + total_var_p = crossover_probability + mutate_probability + mutate_then_crossover_probability + crossover_then_mutate_probability self.crossover_probability = crossover_probability / total_var_p self.mutate_probability = mutate_probability / total_var_p @@ -324,7 +336,7 @@ def __init__( self, self.budget = self.budget_list[self.generation] else: self.budget = None - + self.early_stop_tol = 
early_stop_tol self.early_stop = early_stop @@ -343,7 +355,7 @@ def __init__( self, if os.path.exists(self.population_file): self.population = pickle.load(open(self.population_file, "rb")) - if len(self.population.evaluated_individuals)>0 and "Generation" in self.population.evaluated_individuals.columns: + if len(self.population.evaluated_individuals)>0 and "Generation" in self.population.evaluated_individuals.columns: self.generation = self.population.evaluated_individuals['Generation'].max() + 1 #TODO check if this is empty? init_names = self.objective_names @@ -352,7 +364,7 @@ def __init__( self, if self.population is None: self.population = tpot2.Population(column_names=init_names) initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] - self.population.add_to_population(initial_population) + self.population.add_to_population(initial_population, self.rng) self.population.update_column(self.population.population, column_names="Generation", data=self.generation) @@ -374,14 +386,14 @@ def optimize(self, generations=None): processes=True, memory_limit=self.memory_limit) self._client = Client(self._cluster) - + if generations is None: generations = self.generations - start_time = time.time() - + start_time = time.time() + generations_without_improvement = np.array([0 for _ in range(len(self.objective_function_weights))]) best_scores = [-np.inf for _ in range(len(self.objective_function_weights))] @@ -389,7 +401,7 @@ def optimize(self, generations=None): self.scheduled_timeout_time = time.time() + self.max_time_seconds - try: + try: #for gen in tnrange(generations,desc="Generation", disable=self.verbose<1): done = False gen = 0 @@ -407,7 +419,7 @@ def optimize(self, generations=None): self.evaluate_population() if self.population_file is not None: pickle.dump(self.population, open(self.population_file, "wb")) - + attempts = 2 while len(self.population.population) == 0 and attempts > 0: new_initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] @@ -424,8 +436,8 @@ def optimize(self, generations=None): if time.time() - start_time > self.max_time_seconds: break self.step() - - if self.verbose >= 3: + + if self.verbose >= 3: sign = np.sign(self.objective_function_weights) valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign cur_best_scores = valid_df.max(axis=0)*sign @@ -444,7 +456,7 @@ def optimize(self, generations=None): cur_best_scores = valid_df.max(axis=0) cur_best_scores = cur_best_scores.to_numpy() #cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best - + improved = ( np.array(cur_best_scores) - np.array(best_scores) >= np.array(self.early_stop_tol) ) not_improved = np.logical_not(improved) generations_without_improvement = generations_without_improvement * not_improved + not_improved #set to zero if not improved, else increment @@ -471,12 +483,12 @@ def optimize(self, generations=None): except KeyboardInterrupt: if self.verbose >= 3: print("KeyboardInterrupt") - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") - + if self.population_file is not None: pickle.dump(self.population, 
open(self.population_file, "wb")) @@ -504,43 +516,44 @@ def step(self,): if self.survival_selector is not None: n_survivors = max(1,int(self.cur_population_size*self.survival_percentage)) #always keep at least one individual - self.population.survival_select( selector=self.survival_selector, - weights=self.objective_function_weights, - columns_names=self.objective_names, - n_survivors=n_survivors, - inplace=True) - + self.population.survival_select( selector=self.survival_selector, + weights=self.objective_function_weights, + columns_names=self.objective_names, + n_survivors=n_survivors, + inplace=True, + rng_=self.rng,) + self.generate_offspring() self.evaluate_population() self.generation += 1 - + def generate_offspring(self, ): #your EA Algorithm goes here - parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=self.cur_population_size, n_parents=2) + parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=self.cur_population_size, n_parents=2, rng_=self.rng) p = np.array([self.crossover_probability, self.mutate_then_crossover_probability, self.crossover_then_mutate_probability, self.mutate_probability]) p = p / p.sum() - var_op_list = np.random.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=self.cur_population_size, p=p) + var_op_list = self.rng.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=self.cur_population_size, p=p) for i, op in enumerate(var_op_list): if op == "mutate": parents[i] = parents[i][0] #mutations take a single individual - - offspring = self.population.create_offspring2(parents, var_op_list, self.mutation_functions, self.mutation_function_weights, self.crossover_functions, self.crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True) - + + offspring = self.population.create_offspring2(parents, var_op_list, self.mutation_functions, self.mutation_function_weights, self.crossover_functions, self.crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True, rng_=self.rng) + self.population.update_column(offspring, column_names="Generation", data=self.generation, ) - - + + # Gets a list of unevaluated individuals in the livepopulation, evaluates them, and removes failed attempts # TODO This could probably be an independent function? 
def evaluate_population(self,): - - #Update the sliding scales and thresholds + + #Update the sliding scales and thresholds # Save population, TODO remove some of these if self.population_file is not None: # and time.time() - last_save_time > 60*10: pickle.dump(self.population, open(self.population_file, "wb")) @@ -554,8 +567,8 @@ def evaluate_population(self,): old_data = old_data[old_data[self.objective_names].notnull().all(axis=1)] if len(old_data) >= self.min_history_threshold: self.thresholds = np.array([get_thresholds(old_data[obj_name], - start=self.threshold_evaluation_early_stop[0], - end=self.threshold_evaluation_early_stop[1], + start=self.threshold_evaluation_early_stop[0], + end=self.threshold_evaluation_early_stop[1], scale=self.threshold_evaluation_scaling, n=self.evaluation_early_stop_steps) for obj_name in self.objective_names]).T @@ -565,7 +578,7 @@ def evaluate_population(self,): lower = self.cur_population_size*self.selection_evaluation_early_stop[0] upper = self.cur_population_size*self.selection_evaluation_early_stop[1] #survival_counts = self.cur_population_size*(scipy.special.betainc(1,self.selection_evaluation_scaling,np.linspace(0,1,self.evaluation_early_stop_steps))*(upper-lower)+lower) - + survival_counts = np.array(beta_interpolation(start=lower, end=upper, scale=self.selection_evaluation_scaling, n=self.evaluation_early_stop_steps, n_steps=self.evaluation_early_stop_steps)) self.survival_counts = survival_counts.astype(int) else: @@ -591,7 +604,7 @@ def evaluate_population(self,): def evaluate_population_full(self, budget=None): individuals_to_evaluate = self.get_unevaluated_individuals(self.objective_names, budget=budget,) - + #print("evaluating this many individuals: ", len(individuals_to_evaluate)) if len(individuals_to_evaluate) == 0: @@ -608,7 +621,7 @@ def evaluate_population_full(self, budget=None): parallel_timeout = min(theoretical_timeout, scheduled_timeout_time_left) if parallel_timeout < 0: parallel_timeout = 10 - + #scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs) scores, start_times, end_times = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs) @@ -638,7 +651,7 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list unevaluated_filter = lambda i: any(offspring_scores.loc[offspring_scores.index[i]][column_names].isna()) unevaluated_individuals_this_step = [i for i in range(len(cur_pop)) if unevaluated_filter(i)] return cur_pop[unevaluated_individuals_this_step] - + else: #if column names are not in the evaluated_individuals, then we have not evaluated any individuals yet for name_step in column_names: self.population.evaluated_individuals[name_step] = np.nan @@ -655,23 +668,23 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No cur_individuals = self.population.population.copy() - + all_step_names = [] for step in range(self.evaluation_early_stop_steps): if budget is None: this_step_names = [f"{n}_step_{step}" for n in self.objective_names] else: this_step_names = 
[f"{n}_budget_{budget}_step_{step}" for n in self.objective_names] - + all_step_names.append(this_step_names) - + unevaluated_individuals_this_step = self.get_unevaluated_individuals(this_step_names, budget=None, individual_list=cur_individuals) if len(unevaluated_individuals_this_step) == 0: if self.verbose > 3: print("No new individuals to evaluate") continue - + if self.max_eval_time_seconds is not None: theoretical_timeout = self.max_eval_time_seconds * math.ceil(len(unevaluated_individuals_this_step) / self.n_jobs) theoretical_timeout = theoretical_timeout*2 @@ -753,9 +766,9 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No invalids.append(i) if len(invalids) > 0: - + max_to_remove = min(len(cur_individuals) - self.n_jobs, len(invalids)) - + if max_to_remove < len(invalids): invalids = np.random.choice(invalids, max_to_remove, replace=False) @@ -771,5 +784,3 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No new_population_index = survival_selector(weighted_scores, k=k) cur_individuals = np.array(cur_individuals)[new_population_index] - - From 4af29479feb92371b545da1e12606c6a54d7d0c3 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:46:54 -0700 Subject: [PATCH 17/43] added rng as parameter to functions that rely on stochasticity --- tpot2/population.py | 175 +++++++++++++++++++++++--------------------- 1 file changed, 93 insertions(+), 82 deletions(-) diff --git a/tpot2/population.py b/tpot2/population.py index f32ad5c4..509a249d 100644 --- a/tpot2/population.py +++ b/tpot2/population.py @@ -12,43 +12,47 @@ import pickle import dask -def mutate(individual): +def mutate(individual, rng_): + rng = np.random.default_rng(rng_) if isinstance(individual, collections.abc.Iterable): for ind in individual: - ind.mutate() + ind.mutate(rng_=rng) else: - individual.mutate() + individual.mutate(rng_=rng) return individual -def crossover(parents): - parents[0].crossover(parents[1]) +def crossover(parents, rng_): + rng = np.random.default_rng(rng_) + parents[0].crossover(parents[1], rng_=rng) return parents[0] -def mutate_and_crossover(parents): - parents[0].crossover(parents[1]) - parents[0].mutate() - parents[1].mutate() +def mutate_and_crossover(parents, rng_): + rng = np.random.default_rng(rng_) + parents[0].crossover(parents[1], rng_=rng) + parents[0].mutate(rng_=rng) + parents[1].mutate(rng_=rng) return parents -def crossover_and_mutate(parents): +def crossover_and_mutate(parents, rng_): + rng = np.random.default_rng(rng_) for p in parents: - p.mutate() - parents[0].crossover(parents[1]) + p.mutate(rng_=rng) + parents[0].crossover(parents[1], rng_=rng) return parents[0] -built_in_var_ops_dict = {"mutate":mutate, - "crossover":crossover, - "mutate_then_crossover":mutate_and_crossover, +built_in_var_ops_dict = {"mutate":mutate, + "crossover":crossover, + "mutate_then_crossover":mutate_and_crossover, "crossover_then_mutate":crossover_and_mutate} - + class Population(): ''' Primary usage is to keep track of evaluated individuals - + Parameters ---------- initial_population : {list of BaseIndividuals}, default=None @@ -59,7 +63,7 @@ class Population(): callback : {function}, default=None NOT YET IMPLEMENTED A function to call after each generation. The function should take a Population object as its only argument. 
- + Attributes ---------- population : {list of BaseIndividuals} @@ -75,7 +79,7 @@ def __init__( self, ) -> None: if column_names is not None: - + column_names = column_names+["Parents", "Variation_Function"] else: column_names = ["Parents", "Variation_Function"] @@ -86,21 +90,22 @@ def __init__( self, self.callback=callback self.population = [] - def survival_select(self, selector, weights, columns_names, n_survivors, inplace=True): + def survival_select(self, selector, weights, columns_names, n_survivors, rng_, inplace=True): + rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights - new_population_index = np.ravel(selector(weighted_scores, k=n_survivors)) #TODO make it clear that we are concatenating scores... + new_population_index = np.ravel(selector(weighted_scores, k=n_survivors, rng_=rng)) #TODO make it clear that we are concatenating scores... new_population = np.array(self.population)[new_population_index] if inplace: - self.set_population(new_population) + self.set_population(new_population, rng_=rng) return new_population - def parent_select(self, selector, weights, columns_names, k, n_parents): - + def parent_select(self, selector, weights, columns_names, k, n_parents, rng_): + rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights - parents_index = selector(weighted_scores, k=k, n_parents=n_parents) + parents_index = selector(weighted_scores, k=k, n_parents=n_parents, rng_=rng) parents = np.array(self.population)[parents_index] return parents - + #remove individuals that either do not have a column_name value or a nan in that value #TODO take into account when the value is not a list/tuple? @@ -108,12 +113,12 @@ def parent_select(self, selector, weights, columns_names, k, n_parents): def remove_invalid_from_population(self, column_names, invalid_value = "INVALID"): ''' Remove individuals from the live population if either do not have a value in the column_name column or if the value contains np.nan. - + Parameters ---------- column_name : {str} The name of the column to check for np.nan values. - + Returns ------- None @@ -124,17 +129,17 @@ def remove_invalid_from_population(self, column_names, invalid_value = "INVALID" is_valid = lambda ind: ind.unique_id() not in self.evaluated_individuals.index or invalid_value not in self.evaluated_individuals.loc[ind.unique_id(),column_names].to_list() self.population = [ind for ind in self.population if is_valid(ind)] - - # takes the list of individuals and adds it to the live population list. + + # takes the list of individuals and adds it to the live population list. # if keep_repeats is False, repeated individuals are not added to the population - # returns a list of individuals added to the live population + # returns a list of individuals added to the live population #TODO make keep repeats allow for previously evaluated individuals, #but make sure that the live population only includes one of each, no repeats - def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repeats=False, mutate_until_unique=True): + def add_to_population(self, individuals: typing.List[BaseIndividual], rng_, keep_repeats=False, mutate_until_unique=True): ''' Add individuals to the live population. Add individuals to the evaluated_individuals if they are not already there. 
- + Parameters: ----------- individuals : {list of BaseIndividuals} @@ -143,6 +148,9 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea If True, allow the population to have repeated individuals. If False, only add individuals that have not yet been added to geneology. ''' + + rng = np.random.default_rng(rng_) + if not isinstance(individuals, collections.abc.Iterable): individuals = [individuals] @@ -164,7 +172,7 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea elif mutate_until_unique: #If its old and we don't want repeats, we can optionally mutate it until it is unique for _ in range(20): individual = copy.deepcopy(individual) - individual.mutate() + individual.mutate(rng_=rng) key = individual.unique_id() if key not in self.evaluated_individuals.index: self.evaluated_individuals.loc[key] = np.nan @@ -172,7 +180,7 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea self.population.append(individual) new_individuals.append(individual) break - + return new_individuals @@ -195,7 +203,7 @@ def update_column(self, individual, column_names, data): self.evaluated_individuals.loc[key,column_names] = data - + def get_column(self, individual, column_names=None, to_numpy=True): ''' Update the column_name column in the evaluated_individuals with the data. @@ -229,13 +237,13 @@ def get_column(self, individual, column_names=None, to_numpy=True): def get_unevaluated_individuals(self, column_names, individual_list=None): if individual_list is None: individual_list = self.population - + if self.use_unique_id: unevaluated_filter = lambda individual: individual.unique_id() not in self.evaluated_individuals.index or any(self.evaluated_individuals.loc[individual.unique_id(), column_names].isna()) else: unevaluated_filter = lambda individual: individual not in self.evaluated_individuals.index or any(self.evaluated_individuals.loc[individual.unique_id(), column_names].isna()) - - return [individual for individual in individual_list if unevaluated_filter(individual)] + + return [individual for individual in individual_list if unevaluated_filter(individual)] # def get_valid_evaluated_individuals_df(self, column_names_to_check, invalid_values=["TIMEOUT","INVALID"]): # ''' @@ -244,18 +252,19 @@ def get_unevaluated_individuals(self, column_names, individual_list=None): # return self.evaluated_individuals[~self.evaluated_individuals[column_names_to_check].isin(invalid_values).any(axis=1)] #the live population empied and is set to new_population - def set_population(self, new_population, keep_repeats=True): + def set_population(self, new_population, rng_, keep_repeats=True): ''' sets population to new population for selection? ''' + rng = np.random.default_rng(rng_) self.population = [] - self.add_to_population(new_population, keep_repeats=keep_repeats) + self.add_to_population(new_population, rng_=rng, keep_repeats=keep_repeats) - #TODO should we just generate one offspring per crossover? + #TODO should we just generate one offspring per crossover? def create_offspring(self, parents_list, var_op_list, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): ''' - parents_list: a list of lists of parents. + parents_list: a list of lists of parents. var_op_list: a list of var_ops to apply to each list of parents. Should be the same length as parents_list. 
for example: @@ -265,7 +274,7 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke This will apply crossover to parent1 and parent2 and mutate to parent3. Creates offspring from parents using the var_op_list. - If string, will use a built in method + If string, will use a built in method - "crossover" : crossover - "mutate" : mutate - "mutate_and_crossover" : mutate_and_crossover @@ -275,14 +284,14 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke all_offspring = parallel_create_offspring(parents_list, var_op_list, n_jobs=n_jobs) for parents, offspring, var_op in zip(parents_list, all_offspring, var_op_list): - + # if var_op in built_in_var_ops_dict: # var_op = built_in_var_ops_dict[var_op] # offspring = copy.deepcopy(parents) # offspring = var_op(offspring) # if isinstance(offspring, collections.abc.Iterable): - # offspring = offspring[0] + # offspring = offspring[0] if add_to_population: added = self.add_to_population(offspring, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) @@ -292,26 +301,27 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke if not pd.api.types.is_object_dtype(self.evaluated_individuals["Parents"]): #TODO Is there a cleaner way of doing this? Not required for some python environments? self.evaluated_individuals["Parents"] = self.evaluated_individuals["Parents"].astype('object') self.evaluated_individuals.at[new_child.unique_id(),"Parents"] = tuple(parent_keys) - + #if var_op is a function if hasattr(var_op, '__call__'): self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op.__name__ else: self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op - - + + new_offspring.append(new_child) else: new_offspring.append(offspring) - - + + return new_offspring - #TODO should we just generate one offspring per crossover? - def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True): + #TODO should we just generate one offspring per crossover? 
+ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, rng_, add_to_population=True, keep_repeats=False, mutate_until_unique=True): + rng = np.random.default_rng(rng_) new_offspring = [] all_offspring = [] @@ -320,62 +330,62 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op == "mutation": - mutation_op = np.random.choice(mutation_functions, p=mutation_function_weights) - all_offspring.append(copy_and_mutate(parents, mutation_op)) + mutation_op = rng.choice(mutation_functions, p=mutation_function_weights) + all_offspring.append(copy_and_mutate(parents, mutation_op, rng_=rng)) chosen_ops.append(mutation_op.__name__) - - + + elif var_op == "crossover": - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - all_offspring.append(copy_and_crossover(parents, crossover_op)) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + all_offspring.append(copy_and_crossover(parents, crossover_op, rng_=rng)) chosen_ops.append(crossover_op.__name__) elif var_op == "mutate_then_crossover": - mutation_op1 = np.random.choice(mutation_functions, p=mutation_function_weights) - mutation_op2 = np.random.choice(mutation_functions, p=mutation_function_weights) - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - p1 = copy_and_mutate(parents[0], mutation_op1) - p2 = copy_and_mutate(parents[1], mutation_op2) - crossover_op(p1,p2) + mutation_op1 = rng.choice(mutation_functions, p=mutation_function_weights) + mutation_op2 = rng.choice(mutation_functions, p=mutation_function_weights) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + p1 = copy_and_mutate(parents[0], mutation_op1, rng_=rng) + p2 = copy_and_mutate(parents[1], mutation_op2, rng_=rng) + crossover_op(p1,p2,rng_=rng) all_offspring.append(p1) chosen_ops.append(f"{mutation_op1.__name__} , {mutation_op2.__name__} , {crossover_op.__name__}") elif var_op == "crossover_then_mutate": - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - child = copy_and_crossover(parents, crossover_op) - mutation_op = np.random.choice(mutation_functions, p=mutation_function_weights) - mutation_op(child) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + child = copy_and_crossover(parents, crossover_op, rng_=rng) + mutation_op = rng.choice(mutation_functions, p=mutation_function_weights) + mutation_op(child, rng_=rng) all_offspring.append(child) chosen_ops.append(f"{crossover_op.__name__} , {mutation_op.__name__}") for parents, offspring, var_op in zip(parents_list, all_offspring, chosen_ops): - + # if var_op in built_in_var_ops_dict: # var_op = built_in_var_ops_dict[var_op] # offspring = copy.deepcopy(parents) # offspring = var_op(offspring) # if isinstance(offspring, collections.abc.Iterable): - # offspring = offspring[0] + # offspring = offspring[0] if add_to_population: - added = self.add_to_population(offspring, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) + added = self.add_to_population(offspring, rng_=rng, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) if len(added) > 0: for new_child in added: parent_keys = [parent.unique_id() for parent in parents] if not 
pd.api.types.is_object_dtype(self.evaluated_individuals["Parents"]): #TODO Is there a cleaner way of doing this? Not required for some python environments? self.evaluated_individuals["Parents"] = self.evaluated_individuals["Parents"].astype('object') self.evaluated_individuals.at[new_child.unique_id(),"Parents"] = tuple(parent_keys) - + self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op - - + + new_offspring.append(new_child) else: new_offspring.append(offspring) - - + + return new_offspring @@ -419,19 +429,20 @@ def copy_and_change(parents, var_op): offspring = offspring[0] return offspring -def copy_and_mutate(parents, var_op): +def copy_and_mutate(parents, var_op, rng_): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - var_op(offspring) + var_op(offspring, rng_=rng) if isinstance(offspring, collections.abc.Iterable): offspring = offspring[0] return offspring -def copy_and_crossover(parents, var_op): +def copy_and_crossover(parents, var_op, rng_): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - var_op(offspring[0],offspring[1]) + var_op(offspring[0],offspring[1], rng_=rng) return offspring[0] def parallel_get_id(n_jobs, individual_list): id_list = Parallel(n_jobs=n_jobs)(delayed(get_id)(ind) for ind in individual_list) return id_list - From b1c33989d782b98264ab9d2fba1d5a8144c4dfd5 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:47:20 -0700 Subject: [PATCH 18/43] added rng to control random events --- tpot2/selectors/lexicase_selection.py | 12 +++++------ .../max_weighted_average_selector.py | 3 +-- tpot2/selectors/nsgaii.py | 17 +++++++-------- tpot2/selectors/random_selector.py | 6 +++--- tpot2/selectors/tournament_selection.py | 9 ++++---- .../tournament_selection_dominated.py | 21 +++++++------------ 6 files changed, 31 insertions(+), 37 deletions(-) diff --git a/tpot2/selectors/lexicase_selection.py b/tpot2/selectors/lexicase_selection.py index 54683a44..54a8f7de 100644 --- a/tpot2/selectors/lexicase_selection.py +++ b/tpot2/selectors/lexicase_selection.py @@ -1,24 +1,24 @@ import numpy as np -import random -def lexicase_selection(scores, k, n_parents=1,): - """Select the best individual according to Lexicase Selection, *k* times. +def lexicase_selection(scores, k, rng_, n_parents=1,): + """Select the best individual according to Lexicase Selection, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. :returns: A list of indices of selected individuals. This function uses the :func:`~random.choice` function from the python base :mod:`random` module. 
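For callers, the only signature change is the extra rng_ argument; the same seed
now reproduces the same selection. A usage sketch under the assumption that the
module layout matches the files touched below, using random scores rather than
real pipeline results:

    import numpy as np
    from tpot2.selectors.tournament_selection_dominated import (
        tournament_selection_dominated,
    )

    # rows = individuals, columns = objective scores (bigger is better)
    scores = np.random.default_rng(0).random((8, 2))
    picks_a = tournament_selection_dominated(scores, k=4, rng_=1, n_parents=2)
    picks_b = tournament_selection_dominated(scores, k=4, rng_=1, n_parents=2)
    assert (picks_a == picks_b).all()  # same rng_ seed, same parent indices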
""" + rng = np.random.default_rng(rng_) chosen =[] for i in range(k*n_parents): candidates = list(range(len(scores))) cases = list(range(len(scores[0]))) - random.shuffle(cases) - + rng.shuffle(cases) + while len(cases) > 0 and len(candidates) > 1: best_val_for_case = max(scores[candidates,cases[0]]) candidates = [x for x in candidates if scores[x, cases[0]] == best_val_for_case] cases.pop(0) - chosen.append(random.choice(candidates)) + chosen.append(rng.choice(candidates)) return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/max_weighted_average_selector.py b/tpot2/selectors/max_weighted_average_selector.py index edf3b06e..b8379c10 100644 --- a/tpot2/selectors/max_weighted_average_selector.py +++ b/tpot2/selectors/max_weighted_average_selector.py @@ -1,7 +1,6 @@ import numpy as np -import random -def max_weighted_average_selector(scores,k, n_parents=1,): +def max_weighted_average_selector(scores,k, rng_, n_parents=1,): ave_scores = [np.nanmean(s ) for s in scores ] #TODO make this more efficient chosen = np.argsort(ave_scores)[::-1][0:k] #TODO check this behavior with nans return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/nsgaii.py b/tpot2/selectors/nsgaii.py index 0090407b..670d86a4 100644 --- a/tpot2/selectors/nsgaii.py +++ b/tpot2/selectors/nsgaii.py @@ -1,12 +1,11 @@ import numpy as np -import random # Deb, Pratab, Agarwal, and Meyarivan, “A fast elitist non-dominated sorting genetic algorithm for multi-objective optimization: NSGA-II”, 2002. # chatgpt def nondominated_sorting(matrix): """ - Returns the indexes of the matrix + Returns the indexes of the matrix bigger is better """ # Initialize the front list and the rank list @@ -20,7 +19,7 @@ def nondominated_sorting(matrix): # Initialize the list of points that dominate the current point dominating = [0 for _ in range(len(matrix))] #ni the number of solutions that denominate solution i - + # Iterate over all points for p, p_scores in enumerate(matrix): # Iterate over all other points @@ -31,7 +30,7 @@ def nondominated_sorting(matrix): # If the current point is dominated by the other point, add it to the list of dominated points elif dominates(q_scores, p_scores): dominating[p] += 1 - + if dominating[p] == 0: fronts[0].add(p) @@ -65,16 +64,16 @@ def crowding_distance(matrix): matrix = np.array(matrix) # Initialize the crowding distance for each point to zero crowding_distances = [0 for _ in range(len(matrix))] - + # Iterate over each objective for objective_i in range(matrix.shape[1]): # Sort the points according to the current objective sorted_i = matrix[:, objective_i].argsort() - + # Set the crowding distance of the first and last points to infinity crowding_distances[sorted_i[0]] = float("inf") crowding_distances[sorted_i[-1]] = float("inf") - + if matrix[sorted_i[0]][objective_i] == matrix[sorted_i[-1]][objective_i]: # https://github.com/DEAP/deap/blob/f2a570567fa3dce156d7cfb0c50bc72f133258a1/deap/tools/emo.py#L135 continue @@ -88,7 +87,7 @@ def crowding_distance(matrix): -def survival_select_NSGA2(scores, k,): +def survival_select_NSGA2(scores, k, rng_): pareto_fronts = nondominated_sorting(scores) @@ -109,5 +108,5 @@ def survival_select_NSGA2(scores, k,): chosen.extend(sorted_indeces[0:(k-len(chosen))]) current_front_number += 1 - + return chosen \ No newline at end of file diff --git a/tpot2/selectors/random_selector.py b/tpot2/selectors/random_selector.py index 2a384c62..3eff5c41 100644 --- a/tpot2/selectors/random_selector.py +++ 
b/tpot2/selectors/random_selector.py @@ -1,6 +1,6 @@ import numpy as np -import random -def random_selector(scores, k, n_parents=1,): - chosen = random.choices(list(range(0,len(scores))), k=k*n_parents) +def random_selector(scores, k, rng_, n_parents=1, ): + rng = np.random.default_rng(rng_) + chosen = rng.choice(list(range(0,len(scores))), size=k*n_parents) return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/tournament_selection.py b/tpot2/selectors/tournament_selection.py index a2bbf950..fc8ea598 100644 --- a/tpot2/selectors/tournament_selection.py +++ b/tpot2/selectors/tournament_selection.py @@ -1,7 +1,6 @@ import numpy as np -import random -def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index=0): +def tournament_selection(scores, k, rng_, n_parents=1, tournament_size=2, score_index=0): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. @@ -13,6 +12,8 @@ def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index= :mod:`random` module. """ + rng = np.random.default_rng(rng_) + if isinstance(score_index,int): key=lambda x:x[1][score_index] elif score_index == "average": @@ -20,8 +21,8 @@ def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index= chosen = [] for i in range(k*n_parents): - aspirants_idx =[random.randrange(len(scores)) for i in range(tournament_size)] + aspirants_idx =[rng.choice(len(scores)) for i in range(tournament_size)] aspirants = list(zip(aspirants_idx, scores[aspirants_idx])) # Zip indices and elements together chosen.append(max(aspirants, key=key)[0]) # Retrun the index of the maximum element - + return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/tournament_selection_dominated.py b/tpot2/selectors/tournament_selection_dominated.py index be485dc5..ea8bc7e9 100644 --- a/tpot2/selectors/tournament_selection_dominated.py +++ b/tpot2/selectors/tournament_selection_dominated.py @@ -1,10 +1,9 @@ import numpy as np -import random from.nsgaii import nondominated_sorting, crowding_distance, dominates #based on deap -def tournament_selection_dominated(scores, k, n_parents=2): +def tournament_selection_dominated(scores, k, rng_, n_parents=2): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. @@ -15,6 +14,8 @@ def tournament_selection_dominated(scores, k, n_parents=2): This function uses the :func:`~random.choice` function from the python base :mod:`random` module. 
""" + + rng = np.random.default_rng(rng_) pareto_fronts = nondominated_sorting(scores) # chosen = list(itertools.chain.from_iterable(fronts)) @@ -37,26 +38,20 @@ def tournament_selection_dominated(scores, k, n_parents=2): chosen = [] for i in range(k*n_parents): - asp1 = random.randrange(len(scores)) - asp2 = random.randrange(len(scores)) + asp1 = rng.choice(len(scores)) + asp2 = rng.choice(len(scores)) if dominates(scores[asp1], scores[asp2]): chosen.append(asp1) elif dominates(scores[asp2], scores[asp1]): chosen.append(asp2) - + elif crowding_dict[asp1] > crowding_dict[asp2]: chosen.append(asp1) elif crowding_dict[asp1] < crowding_dict[asp2]: chosen.append(asp2) else: - chosen.append(random.choice([asp1,asp2])) - - return np.reshape(chosen, (k, n_parents)) - - - - - + chosen.append(rng.choice([asp1,asp2])) + return np.reshape(chosen, (k, n_parents)) From b96e63b83a15c34be2cb49e06c3f12be67c52b2b Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:48:18 -0700 Subject: [PATCH 19/43] extra params to pass to model instantiations --- tpot2/tpot_estimator/estimator_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index ef156e26..0dd69ed7 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -21,12 +21,12 @@ def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): except: return None -def get_configuration_dictionary(options, n_samples, n_features, classification, subsets=None, feature_names=None, n_classes=None): +def get_configuration_dictionary(options, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None): if options is None: return options if isinstance(options, dict): - return recursive_with_defaults(options, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names) + return recursive_with_defaults(options, n_samples, n_features, classification, random_state, cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes) if not isinstance(options, list): options = [options] @@ -36,22 +36,22 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, for option in options: if option == "selectors": - config_dict.update(tpot2.config.make_selector_config_dictionary(classification)) + config_dict.update(tpot2.config.make_selector_config_dictionary(random_state=random_state, classifier=classification)) elif option == "classifiers": - config_dict.update(tpot2.config.make_classifier_config_dictionary(n_samples=n_samples, n_classes=n_classes)) + config_dict.update(tpot2.config.make_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) elif option == "classifiers_sklearnex": - config_dict.update(tpot2.config.make_sklearnex_classifier_config_dictionary(n_samples=n_samples, n_classes=n_classes)) + config_dict.update(tpot2.config.make_sklearnex_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) elif option == "regressors": - config_dict.update(tpot2.config.make_regressor_config_dictionary(n_samples=n_samples)) + config_dict.update(tpot2.config.make_regressor_config_dictionary(random_state=random_state, cv=cv, n_samples=n_samples)) elif option == "regressors_sklearnex": - config_dict.update(tpot2.config.make_sklearnex_regressor_config_dictionary(n_samples=n_samples)) + 
config_dict.update(tpot2.config.make_sklearnex_regressor_config_dictionary(random_state=random_state, n_samples=n_samples)) elif option == "transformers": - config_dict.update(tpot2.config.make_transformer_config_dictionary(n_features=n_features)) + config_dict.update(tpot2.config.make_transformer_config_dictionary(random_state=random_state, n_features=n_features)) elif option == "arithmetic_transformer": config_dict.update(tpot2.config.make_arithmetic_transformer_config_dictionary()) @@ -79,22 +79,22 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, else: - config_dict.update(recursive_with_defaults(option, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names)) + config_dict.update(recursive_with_defaults(options, n_samples, n_features, classification, random_state, cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes)) if len(config_dict) == 0: raise ValueError("No valid configuration options were provided. Please check the options you provided and try again.") return config_dict -def recursive_with_defaults(config_dict, n_samples, n_features, classification, subsets=None, feature_names=None): +def recursive_with_defaults(config_dict, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None): for key in 'leaf_config_dict', 'root_config_dict', 'inner_config_dict', 'Recursive': if key in config_dict: value = config_dict[key] if key=="Resursive": - config_dict[key] = recursive_with_defaults(value,n_samples, n_features, classification, subsets=None, feature_names=None) + config_dict[key] = recursive_with_defaults(value, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None) else: - config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, subsets, feature_names) + config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, random_state, cv, subsets, feature_names, n_classes) return config_dict From 64f22b5da83656622c56733c3060ca0e45d8154a Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:49:19 -0700 Subject: [PATCH 20/43] added random_state into cv splitter to change, passed rng to required classes --- tpot2/tpot_estimator/estimator.py | 57 ++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 2ed5f488..c534a7c1 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -108,6 +108,9 @@ def __init__(self, scorers, verbose = 0, scatter = True, + # random seed for random number generator (rng) + random_state = None, + ): ''' @@ -395,6 +398,13 @@ def __init__(self, scorers, >=5. full warnings trace 6. evaluations progress bar. (Temporary: This used to be 2. Currently, using evaluation progress bar may prevent some instances were we terminate a generation early due to it reaching max_time_seconds in the middle of a generation OR a pipeline failed to be terminated normally and we need to manually terminate it.) + random_state : int, None, default=None + A seed for reproducability of experiments. 
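The estimator now builds its CV splitter from the seed as well: an integer cv
becomes a shuffled, seeded (Stratified)KFold, and anything else is passed through
sklearn's check_cv. A standalone sketch mirroring the logic added to fit()
(build_cv is a hypothetical helper, not part of the estimator API):

    import sklearn.model_selection

    def build_cv(cv, y, classification, random_state):
        if isinstance(cv, (int, float)):
            splitter = (sklearn.model_selection.StratifiedKFold if classification
                        else sklearn.model_selection.KFold)
            return splitter(n_splits=int(cv), shuffle=True,
                            random_state=random_state)
        # pre-built splitters and iterables fall through unchanged
        return sklearn.model_selection.check_cv(cv, y, classifier=classification)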
This value will be passed to numpy.random.default_rng() to create an instnce of the genrator to pass to other classes + + - int + Will be used to create and lock in Generator instance with 'numpy.random.default_rng()' + - None + Will be used to create Generator for 'numpy.random.default_rng()' where a fresh, unpredictable entropy will be pulled from the OS Attributes ---------- @@ -491,6 +501,13 @@ def __init__(self, scorers, self.optuna_optimize_pareto_front_timeout = optuna_optimize_pareto_front_timeout self.optuna_storage = optuna_storage + # create random number generator based on rng_seed + self.rng = np.random.default_rng(random_state) + # save random state passed to us for other functions that use random_state + self.random_state = random_state + # set the numpy seed so anything using it will be consistent as well + np.random.seed(random_state) + #Initialize other used params @@ -584,9 +601,9 @@ def fit(self, X, y): if validation_strategy == 'split': if self.classification: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=self.random_state) else: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=self.random_state) X_original = X @@ -626,6 +643,16 @@ def fit(self, X, y): #Set up the configuation dictionaries and the search spaces + #check if self.cv is a number + if isinstance(self.cv, int) or isinstance(self.cv, float): + if self.classification: + self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + else: + self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + + else: + self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) + n_samples= int(math.floor(X.shape[0]/n_folds)) @@ -639,28 +666,16 @@ def fit(self, X, y): if self.root_config_dict == 'Auto': if self.classification: n_classes = len(np.unique(y)) - root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) + root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) else: - root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification,subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) else: - root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, subsets=self.subsets,feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets,feature_names=self.feature_names) - inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification,subsets=self.subsets, feature_names=self.feature_names) - leaf_config_dict 
= get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names) + inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) + leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - - - #check if self.cv is a number - if isinstance(self.cv, int) or isinstance(self.cv, float): - if self.classification: - self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=42) - else: - self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=42) - - else: - self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) - def objective_function(pipeline_individual, X, y, @@ -695,6 +710,7 @@ def objective_function(pipeline_individual, hyperparameter_probability=self.hyperparameter_probability, hyper_node_probability=self.hyper_node_probability, hyperparameter_alpha=self.hyperparameter_alpha, + rng_=self.rng, ) if self.threshold_evaluation_early_stop is not None or self.selection_evaluation_early_stop is not None: @@ -753,6 +769,7 @@ def objective_function(pipeline_individual, mutate_then_crossover_probability= self.mutate_then_crossover_probability, crossover_then_mutate_probability= self.crossover_then_mutate_probability, + rng_=self.rng, ) @@ -782,7 +799,7 @@ def objective_function(pipeline_individual, best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) #reshuffle rows - X, y = sklearn.utils.shuffle(X, y, random_state=1) + X, y = sklearn.utils.shuffle(X, y, random_state=self.random_state) if self.scatter: X_future = _client.scatter(X) From 04dfbbd1877aa52b2a92dcc7187d8c86f489c6d8 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:50:29 -0700 Subject: [PATCH 21/43] made variation operator functions have rng by defualt --- .../individual_representations/individual.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tpot2/individual_representations/individual.py b/tpot2/individual_representations/individual.py index 2dfdfc14..47169cb0 100644 --- a/tpot2/individual_representations/individual.py +++ b/tpot2/individual_representations/individual.py @@ -1,7 +1,6 @@ from abc import abstractmethod import types import numpy as np -import random import copy import copy import typing @@ -11,32 +10,32 @@ class BaseIndividual: def __init__(self) -> None: - - - self.mutation_list = [] + self.mutation_list = [] self.crossover_list = [] - - def mutate(self,): + def mutate(self, rng_): + rng = np.random.default_rng(rng_) mutation_list_copy = self.mutation_list.copy() - random.shuffle(mutation_list_copy) + rng.shuffle(mutation_list_copy) for func in mutation_list_copy: if func(): return True return False - def crossover(self, ind2): + def crossover(self, ind2, rng_): + rng = np.random.default_rng(rng_) crossover_list_copy = self.crossover_list.copy() - random.shuffle(crossover_list_copy) + rng.shuffle(crossover_list_copy) for func in crossover_list_copy: if func(ind2): return True return False # a guided change of an individual when given an objective function - def optimize(self, objective_function, steps=5): + def optimize(self, rng_, objective_function, 
steps=5): + rng = np.random.default_rng(rng_) for _ in range(steps): - self.mutate() + self.mutate(rng_=rng) #Return a hashable unique to this individual setup #For use when evaluating whether or not an individual is 'the same' and another individual From 4115f61a40d32a5e7c341936e922c94f15c8354c Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:51:19 -0700 Subject: [PATCH 22/43] added rng to control random individual generated --- .../graph_pipeline_individual/templates.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index decc4570..e0599601 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -12,58 +12,59 @@ def estimator_graph_individual_generator( root_config_dict, inner_config_dict=None, leaf_config_dict=None, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, - hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, + rng_=None, **kwargs, ) : - + rng = np.random.default_rng(rng_) n_nodes = 0 while True: if n_nodes < max_size: n_nodes += 1 - + for k in root_config_dict.keys(): - + graph = nx.DiGraph() - root = create_node(config_dict={k:root_config_dict[k]}) + root = create_node(config_dict={k:root_config_dict[k]}, rng_=rng) graph.add_node(root) - - ind = GraphIndividual( inner_config_dict=inner_config_dict, - leaf_config_dict=leaf_config_dict, - root_config_dict=root_config_dict, - initial_graph = graph, - - max_size = max_size, - linear_pipeline = linear_pipeline, - hyperparameter_probability = hyperparameter_probability, - hyper_node_probability = hyper_node_probability, - hyperparameter_alpha = hyperparameter_alpha, - - **kwargs, - ) - + + ind = GraphIndividual( rng_=rng, + inner_config_dict=inner_config_dict, + leaf_config_dict=leaf_config_dict, + root_config_dict=root_config_dict, + initial_graph = graph, + + max_size = max_size, + linear_pipeline = linear_pipeline, + hyperparameter_probability = hyperparameter_probability, + hyper_node_probability = hyper_node_probability, + hyperparameter_alpha = hyperparameter_alpha, + + **kwargs, + ) + starting_ops = [] if inner_config_dict is not None: starting_ops.append(ind._mutate_insert_inner_node) if leaf_config_dict is not None: starting_ops.append(ind._mutate_insert_leaf) - + if len(starting_ops) > 0: if n_nodes > 0: - for _ in range(np.random.randint(0,min(n_nodes,3))): - func = np.random.choice(starting_ops) - func() + for _ in range(rng.integers(0,min(n_nodes,3))): + func = rng.choice(starting_ops) + func(rng_=rng) + - yield ind - + class BaggingCompositeGraphSklearn(): def __init__(self) -> None: @@ -72,4 +73,3 @@ def __init__(self) -> None: class BoostingCompositeGraphSklearn(): def __init__(self) -> None: pass - From e2a39d7083311c04b290d2a59c48a8a04de96f0d Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:52:39 -0700 Subject: [PATCH 23/43] passed rng to mutation/crossover functions and things that use random --- .../graph_pipeline_individual/individual.py | 409 ++++++++++-------- 1 file changed, 224 insertions(+), 185 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index a611755c..724034ea 100644 --- 
a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -1,5 +1,4 @@ import numpy as np -import random from tpot2 import config import networkx as nx from abc import abstractmethod @@ -31,7 +30,7 @@ def __init__(self, *, self.label = label self._params = None - + from functools import partial #@https://stackoverflow.com/questions/20530455/isomorphic-comparison-of-networkx-graph-objects-instead-of-the-default-address @@ -73,22 +72,22 @@ def node_match(n1,n2, matched_labels): class GraphIndividual(BaseIndividual): ''' - An individual that contains a template for a graph sklearn pipeline. + An individual that contains a template for a graph sklearn pipeline. Parameters ---------- - root_config_dict : {dict with format {method class: param_function}} + root_config_dict : {dict with format {method class: param_function}} A dictionary of methods and functions that return a dictionary of hyperparameters. Used to create the root node of the graph. - inner_config_dict : {dict with format {method class: param_function}} - A dictionary of methods and functions that return a dictionary of hyperparameters. + inner_config_dict : {dict with format {method class: param_function}} + A dictionary of methods and functions that return a dictionary of hyperparameters. Used to create the inner nodes of the graph. If None, uses root_config_dict. - leaf_config_dict : {dict with format {method class: param_function}} + leaf_config_dict : {dict with format {method class: param_function}} A dictionary of methods and functions that return a dictionary of hyperparameters. - Used to create the leaf nodes of the graph. If not None, then all leafs must be created from this dictionary. + Used to create the leaf nodes of the graph. If not None, then all leafs must be created from this dictionary. Otherwise leaves will be created from inner_config_dict. initial_graph : (nx.DiGraph or list): - A graph to initialize the individual with. + A graph to initialize the individual with. If a list, it will initialize a linear graph with the methods in the list in the sequence provided. If the items in the list are dictionaries, nodes will be itialized with those dictionaries. Strings in the list correspond to the default configuration files. They can be 'Selector', 'Regressor', 'Transformer', 'Classifier'. 
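The hunks that follow thread a new `rng_` argument through `GraphIndividual.__init__` and every mutation, crossover, and node-creation helper. The pattern works because `numpy.random.default_rng` accepts `None`, an integer seed, or an existing `Generator` (which it returns unaltered), so a caller can hand down either a seed or a shared generator and each helper can safely call `default_rng` again. A minimal sketch of that idiom, with illustrative names that are not part of the patch:

import numpy as np

def make_node(rng_=None):
    # default_rng passes an existing Generator straight through,
    # so nested helpers reuse the caller's random stream
    rng = np.random.default_rng(rng_)
    return rng.choice(["KNeighborsClassifier", "SVC", "RandomForestClassifier"])

shared = np.random.default_rng(42)
print(make_node(shared), make_node(shared))   # consecutive draws from one seeded stream
print(make_node(42), make_node(42))           # same draw twice: each call reseeds a fresh stream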
@@ -107,16 +106,17 @@ class GraphIndividual(BaseIndividual): ''' def __init__( self, + rng_, root_config_dict, - inner_config_dict=None, + inner_config_dict=None, leaf_config_dict=None, initial_graph = None, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, name=None, crossover_same_depth = False, crossover_same_recursive_depth = True, - + hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, @@ -127,6 +127,8 @@ def __init__( self.__debug = False + rng = np.random.default_rng(rng_) + self.root_config_dict = root_config_dict self.inner_config_dict = inner_config_dict self.leaf_config_dict = leaf_config_dict @@ -146,35 +148,35 @@ def __init__( self.hyperparameter_alpha = hyperparameter_alpha if self.unique_subset_values is not None: - self.row_subset_selector = tpot2.representations.SubsetSelector(values=unique_subset_values, initial_set=initial_subset_values,k=20) + self.row_subset_selector = tpot2.representations.SubsetSelector(rng_=rng, values=unique_subset_values, initial_set=initial_subset_values,k=20) if isinstance(initial_graph, nx.DiGraph): self.graph = initial_graph self.root = list(nx.topological_sort(self.graph))[0] if self.leaf_config_dict is not None and len(self.graph.nodes) == 1: - first_leaf = create_node(self.leaf_config_dict) + first_leaf = create_node(self.leaf_config_dict, rng_=rng) self.graph.add_edge(self.root,first_leaf) elif isinstance(initial_graph, list): node_list = [] for item in initial_graph: if isinstance(item, dict): - node_list.append(create_node(item)) + node_list.append(create_node(item, rng_=rng)) elif isinstance(item, str): if item == 'Selector': from tpot2.config import selector_config_dictionary - node_list.append(create_node(selector_config_dictionary)) + node_list.append(create_node(selector_config_dictionary, rng_=rng)) elif item == 'Regressor': from tpot2.config import regressor_config_dictionary - node_list.append(create_node(regressor_config_dictionary)) + node_list.append(create_node(regressor_config_dictionary, rng_=rng)) elif item == 'Transformer': from tpot2.config import transformer_config_dictionary - node_list.append(create_node(transformer_config_dictionary)) - elif item == 'Classifier': + node_list.append(create_node(transformer_config_dictionary, rng_=rng)) + elif item == 'Classifier': from tpot2.config import classifier_config_dictionary - node_list.append(create_node(classifier_config_dictionary)) - + node_list.append(create_node(classifier_config_dictionary, rng_=rng)) + self.graph = nx.DiGraph() for child, parent in zip(node_list, node_list[1:]): self.graph.add_edge(parent, child) @@ -183,26 +185,26 @@ def __init__( else: self.graph = nx.DiGraph() - - self.root = create_node(self.root_config_dict) + + self.root = create_node(self.root_config_dict, rng_=rng) self.graph.add_node(self.root) if self.leaf_config_dict is not None: - first_leaf = create_node(self.leaf_config_dict) + first_leaf = create_node(self.leaf_config_dict, rng_=rng) self.graph.add_edge(self.root,first_leaf) - - self.initialize_all_nodes() + + self.initialize_all_nodes(rng_=rng) #self.root =list(nx.topological_sort(self.graph))[0] self.mutate_methods_list = [self._mutate_hyperparameters, - self._mutate_replace_node, + self._mutate_replace_node, self._mutate_remove_node, ] - + self.crossover_methods_list = [ self._crossover_swap_branch, ] @@ -240,17 +242,19 @@ def select_config_dict(self, node): return self.inner_config_dict - def initialize_all_nodes(self,): + def initialize_all_nodes(self, rng_): + rng = 
np.random.default_rng(rng_) for node in self.graph: if isinstance(node,GraphIndividual): continue if node.method_class is None: - node.method_class = random.choice(list(self.select_config_dict(node).keys())) + node.method_class = rng.choice(list(self.select_config_dict(node).keys())) if node.hyperparameters is None: get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - - def fix_noncompliant_leafs(self): + + def fix_noncompliant_leafs(self, rng_): + rng = np.random.default_rng(rng_) leafs = [node for node in self.graph.nodes if len(list(self.graph.successors(node)))==0] compliant_leafs = [] noncompliant_leafs = [] @@ -259,11 +263,11 @@ def fix_noncompliant_leafs(self): compliant_leafs.append(leaf) else: noncompliant_leafs.append(leaf) - + #find all good leafs. If no good leaves exist, create a new one if len(compliant_leafs) == 0: first_leaf = NodeLabel(config_dict=self.leaf_config_dict) - first_leaf.method_class = random.choice(list(first_leaf.config_dict.keys())) #TODO: check when there is no new method + first_leaf.method_class = rng.choice(list(first_leaf.config_dict.keys())) #TODO: check when there is no new method first_leaf.hyperparameters = first_leaf.config_dict[first_leaf.method_class](config.hyperparametersuggestor) get_hyperparameter(self.select_config_dict(first_leaf)[first_leaf.method_class], nodelabel=first_leaf, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) compliant_leafs.append(first_leaf) @@ -271,12 +275,12 @@ def fix_noncompliant_leafs(self): #connect bad leaves to good leaves (making them internal nodes) if len(noncompliant_leafs) > 0: for node in noncompliant_leafs: - self.graph.add_edge(node, random.choice(compliant_leafs)) + self.graph.add_edge(node, rng.choice(compliant_leafs)) - def _merge_duplicated_nodes(self): + def _merge_duplicated_nodes(self): graph_changed = False merged = False @@ -292,7 +296,7 @@ def _merge_duplicated_nodes(self): node_children = set(self.graph.successors(node)) other_node_children = set(self.graph.successors(other_node)) #if nodes have identical children, they can be merged - if node_children == other_node_children: + if node_children == other_node_children: for other_node_parent in list(self.graph.predecessors(other_node)): if other_node_parent not in self.graph.predecessors(node): self.graph.add_edge(other_node_parent,node) @@ -318,7 +322,7 @@ def flatten_pipeline(self,depth=0): n1_p = flattened_full_graph.predecessors(node) remove_list.append(node) - + flattened_full_graph = nx.compose(flattened_full_graph, flattened) @@ -327,7 +331,7 @@ def flatten_pipeline(self,depth=0): flattened_full_graph.add_edges_from([ (n, n2) for n in n1_p for n2 in roots]) else: flattened_full_graph.nodes[node]['recursive depth'] = depth - + for node in remove_list: flattened_full_graph.remove_node(node) @@ -341,7 +345,7 @@ def flatten_pipeline(self,depth=0): flattened_full_graph.nodes[node]["subset_values"] = list(set(flattened_full_graph.nodes[node]["subset_values"]) & set(self.row_subset_selector.subsets)) return flattened_full_graph - + def get_num_nodes(self,): num_nodes = 0 @@ -355,7 +359,7 @@ def get_num_nodes(self,): def export_nested_pipeline(self, **graph_pipeline_args): - + flattened_full_graph = self.graph.copy() remove_list = [] for node in list(flattened_full_graph.nodes): @@ -366,7 +370,7 @@ def export_nested_pipeline(self, **graph_pipeline_args): n1_p = 
flattened_full_graph.predecessors(node) remove_list.append(node) - + flattened_full_graph.add_node(gp) @@ -376,14 +380,14 @@ def export_nested_pipeline(self, **graph_pipeline_args): for node in remove_list: flattened_full_graph.remove_node(node) - + estimator_graph = flattened_full_graph - + #mapping = {node:node.method_class(**node.hyperparameters) for node in estimator_graph} label_remapping = {} label_to_instance = {} - - for node in estimator_graph: + + for node in estimator_graph: found_unique_label = False i=1 while not found_unique_label: @@ -397,14 +401,14 @@ def export_nested_pipeline(self, **graph_pipeline_args): else: i+=1 - + if type(node) is tpot2.GraphPipeline: label_remapping[node] = label label_to_instance[label] = node else: label_remapping[node] = label label_to_instance[label] = node.method_class(**node.hyperparameters) - + estimator_graph = nx.relabel_nodes(estimator_graph, label_remapping) for label, instance in label_to_instance.items(): @@ -414,12 +418,12 @@ def export_nested_pipeline(self, **graph_pipeline_args): def export_pipeline(self, **graph_pipeline_args): estimator_graph = self.flatten_pipeline() - + #mapping = {node:node.method_class(**node.hyperparameters) for node in estimator_graph} label_remapping = {} label_to_instance = {} - - for node in estimator_graph: + + for node in estimator_graph: found_unique_label = False i=1 while not found_unique_label: @@ -431,7 +435,7 @@ def export_pipeline(self, **graph_pipeline_args): label_remapping[node] = label label_to_instance[label] = node.method_class(**node.hyperparameters) - + estimator_graph = nx.relabel_nodes(estimator_graph, label_remapping) for label, instance in label_to_instance.items(): @@ -460,7 +464,6 @@ def export_baikal(self,): if i == len(toposorted)-1: #last method doesn't need transformed. return baikal.Model(inputs=X, outputs=this_output, targets=y) - def plot(self): @@ -476,10 +479,10 @@ def plot(self): node_color = [plt.cm.Set1(G.nodes[n]['recursive depth']) for n in G] fig, ax = plt.subplots() - + nx.draw(G, pos, nodelist=nodelist, node_color=node_color, ax=ax, **options) - + '''edgelist = [] for n in n1.node_set: for child in n.children: @@ -514,15 +517,17 @@ def plot(self): ############# #TODO currently does not correctly return false when adding a leaf causes a duplicate node that is later merged - def mutate(self,): + def mutate(self, rng_): + rng = np.random.default_rng(rng_) self.key = None - graph = self.select_graphindividual() - return graph._mutate() + graph = self.select_graphindividual(rng_=rng) + return graph._mutate(rng_=rng) - def _mutate(self,): - random.shuffle(self.mutate_methods_list) + def _mutate(self, rng_): + rng = np.random.default_rng(rng_) + rng.shuffle(self.mutate_methods_list) for mutate_method in self.mutate_methods_list: - if mutate_method(): + if mutate_method(rng_=rng): self._merge_duplicated_nodes() if self.__debug: @@ -541,24 +546,26 @@ def _mutate(self,): try: nx.find_cycle(self.graph) print('something went wrong with ', mutate_method) - except: + except: pass - + return True - + return False - def _mutate_row_subsets(self,): + def _mutate_row_subsets(self, rng_): + rng = np.random.default_rng(rng_) if self.unique_subset_values is not None: - self.row_subset_selector.mutate() + self.row_subset_selector.mutate(rng_=rng) - def _mutate_hyperparameters(self): + def _mutate_hyperparameters(self, rng_): ''' Mutates the hyperparameters for a randomly chosen node in the graph. 
''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) completed_one = False for node in sorted_nodes_list: if isinstance(node,GraphIndividual): @@ -567,55 +574,57 @@ def _mutate_hyperparameters(self): continue if not completed_one: - _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], rng_=rng, nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) else: - if self.hyper_node_probability > random.random(): - get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + if self.hyper_node_probability > rng.random(): + get_hyperparameter(self.select_config_dict(node)[node.method_class], rng_=rng, nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) return completed_one - - - def _mutate_replace_node(self): + + + def _mutate_replace_node(self, rng_): ''' Replaces the method in a randomly chosen node by a method from the available methods for that node. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for node in sorted_nodes_list: if isinstance(node,GraphIndividual): continue - node.method_class = random.choice(list(self.select_config_dict(node).keys())) + node.method_class = rng.choice(list(self.select_config_dict(node).keys())) if isinstance(self.select_config_dict(node)[node.method_class], dict): hyperparameters = self.select_config_dict(node)[node.method_class] node.hyperparameters = hyperparameters - else: + else: #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) #get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=None, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - new_node = create_node(self.select_config_dict(node)) + new_node = create_node(self.select_config_dict(node), rng_=rng) #TODO cleanup node.hyperparameters = new_node.hyperparameters node.method_class = new_node.method_class node.label = new_node.label return True - + return False - def _mutate_remove_node(self): + def _mutate_remove_node(self, rng_): ''' Removes a randomly chosen node and connects its parents to its children. If the node is the only leaf for an inner node and 'leaf_config_dict' is not none, we do not remove it. 
''' + rng = np.random.default_rng(rng_) nodes_list = list(self.graph.nodes) nodes_list.remove(self.root) leaves = graph_utils.get_leaves(self.graph) while len(nodes_list) > 0: - node = random.choices(nodes_list,)[0] + node = rng.choice(nodes_list) nodes_list.remove(node) if self.leaf_config_dict is not None and len(list(nx.descendants(self.graph,node))) == 0 : #if the node is a leaf @@ -635,55 +644,58 @@ def _mutate_remove_node(self): graph_utils.remove_and_stitch(self.graph, node) graph_utils.remove_nodes_disconnected_from_node(self.graph, self.root) return True - + return False - def _mutate_remove_edge(self): + def _mutate_remove_edge(self, rng_): ''' Deletes an edge as long as deleting that edge does not make the graph disconnected. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for child_node in sorted_nodes_list: parents = list(self.graph.predecessors(child_node)) if len(parents) > 1: # if it has more than one parent, you can remove an edge (if this is the only child of a node, it will become a leaf) for parent_node in parents: # if removing the egde will make the parent_node a leaf node, skip - if self.leaf_config_dict is not None and len(list(self.graph.successors(parent_node))) < 2: + if self.leaf_config_dict is not None and len(list(self.graph.successors(parent_node))) < 2: continue self.graph.remove_edge(parent_node, child_node) return True return False - def _mutate_add_edge(self): + def _mutate_add_edge(self, rng_): ''' Randomly add an edge from a node to another node that is not an ancestor of the first node. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for child_node in sorted_nodes_list: for parent_node in sorted_nodes_list: if self.leaf_config_dict is not None: if len(list(self.graph.successors(parent_node))) == 0: continue - + # skip if # - parent and child are the same node # - edge already exists # - child is an ancestor of parent - if (child_node is not parent_node) and not self.graph.has_edge(parent_node,child_node) and (child_node not in nx.ancestors(self.graph, parent_node)): + if (child_node is not parent_node) and not self.graph.has_edge(parent_node,child_node) and (child_node not in nx.ancestors(self.graph, parent_node)): self.graph.add_edge(parent_node,child_node) return True return False - def _mutate_insert_leaf(self): + def _mutate_insert_leaf(self, rng_): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another for node in sorted_nodes_list: #if leafs are protected, check if node is a leaf #if node is a leaf, skip because we don't want to add node on top of node @@ -691,15 +703,14 @@ def _mutate_insert_leaf(self): and len(list(self.graph.successors(node))) == 0 #if node is leaf and len(list(self.graph.predecessors(node))) > 0 #except if node is root, in which case we want to add a leaf even if it happens to be a leaf too ): - - + continue - + #If node *is* the root or is not a leaf, add leaf node. 
(dont want to add leaf on top of leaf) if self.leaf_config_dict is not None: - new_node = create_node(self.leaf_config_dict) + new_node = create_node(self.leaf_config_dict, rng_=rng) else: - new_node = create_node(self.inner_config_dict) + new_node = create_node(self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edge(node, new_node) @@ -707,13 +718,14 @@ def _mutate_insert_leaf(self): return False - def _mutate_insert_bypass_node(self): + def _mutate_insert_bypass_node(self, rng_): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) sorted_nodes_list2 = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another - random.shuffle(sorted_nodes_list2) - for node in sorted_nodes_list: + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(sorted_nodes_list2) + for node in sorted_nodes_list: for child_node in sorted_nodes_list2: if child_node is not node and child_node not in nx.ancestors(self.graph, node): if self.leaf_config_dict is not None: @@ -721,7 +733,7 @@ def _mutate_insert_bypass_node(self): if len(list(nx.descendants(self.graph,node))) ==0 : continue - new_node = create_node(config_dict = self.inner_config_dict) + new_node = create_node(config_dict = self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) @@ -730,23 +742,24 @@ def _mutate_insert_bypass_node(self): return False - def _mutate_insert_inner_node(self): + def _mutate_insert_inner_node(self, rng_): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) sorted_nodes_list2 = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another - random.shuffle(sorted_nodes_list2) + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? 
bias model one way or another + rng.shuffle(sorted_nodes_list2) for node in sorted_nodes_list: #loop through children of node for child_node in list(self.graph.successors(node)): - + if child_node is not node and child_node not in nx.ancestors(self.graph, node): if self.leaf_config_dict is not None: #If if we are protecting leafs, dont add connection into a leaf if len(list(nx.descendants(self.graph,node))) ==0 : continue - - new_node = create_node(config_dict = self.inner_config_dict) + + new_node = create_node(config_dict = self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) @@ -769,7 +782,7 @@ def get_graphs(self): return graphs - + def _get_graphs(self, depth=1): graphs = [self] self.graph.graph['recursive depth'] = depth @@ -781,19 +794,30 @@ def _get_graphs(self, depth=1): return graphs - def select_graphindividual(self,): + def select_graphindividual(self, rng_): + rng = np.random.default_rng(rng_) graphs = self.get_graphs() weights = [g.graph.number_of_nodes() for g in graphs] - return random.choices(graphs, weights=weights)[0] + w_sum = sum(weights) + weights = [w / w_sum for w in weights] # generate probabilities based on sum of weights + return rng.choice(graphs, p=weights) + + + def select_graph_same_recursive_depth(self,ind1,ind2,rng_): + rng = np.random.default_rng(rng_) - def select_graph_same_recursive_depth(self,ind1,ind2): graphs1 = ind1.get_graphs() weights1 = [g.graph.number_of_nodes() for g in graphs1] + w1_sum = sum(weights1) + weights1 = [w / w1_sum for w in weights1] + graphs2 = ind2.get_graphs() weights2 = [g.graph.number_of_nodes() for g in graphs2] - - g1_sorted_graphs = random_weighted_sort(graphs1, weights1) - g2_sorted_graphs = random_weighted_sort(graphs2, weights2) + w2_sum = sum(weights2) + weights2 = [w / w2_sum for w in weights2] + + g1_sorted_graphs = random_weighted_sort(graphs1, weights1, rng) + g2_sorted_graphs = random_weighted_sort(graphs2, weights2, rng) for g1, g2 in zip(g1_sorted_graphs, g2_sorted_graphs): if g1.graph.graph['depth'] == g2.graph.graph['depth'] and g1.graph.graph['recursive depth'] == g2.graph.graph['recursive depth']: @@ -801,7 +825,7 @@ def select_graph_same_recursive_depth(self,ind1,ind2): return ind1,ind2 - def crossover(self, ind2): + def crossover(self, ind2, rng_): ''' self is the first individual, ind2 is the second individual If crossover_same_depth, it will select graphindividuals at the same recursive depth. @@ -809,25 +833,28 @@ def crossover(self, ind2): This does not impact graphs without subgraphs. And it does not impacts nodes that are not graphindividuals. 
Cros ''' - + + rng = np.random.default_rng(rng_) + self.key = None ind2.key = None if self.crossover_same_recursive_depth: # selects graphs from the same recursive depth and same depth from the root - g1, g2 = self.select_graph_same_recursive_depth(self, ind2) - - + g1, g2 = self.select_graph_same_recursive_depth(self, ind2, rng_=rng) + + else: - g1 = self.select_graphindividual() - g2 = ind2.select_graphindividual() - - return g1._crossover(g2) - - def _crossover(self, Graph): - - random.shuffle(self.crossover_methods_list) + g1 = self.select_graphindividual(rng_=rng) + g2 = ind2.select_graphindividual(rng_=rng) + + return g1._crossover(g2, rng_=rng) + + def _crossover(self, Graph, rng_): + rng = np.random.default_rng(rng_) + + rng.shuffle(self.crossover_methods_list) for crossover_method in self.crossover_methods_list: - if crossover_method(Graph): + if crossover_method(Graph, rng_=rng): self._merge_duplicated_nodes() return True @@ -835,35 +862,38 @@ def _crossover(self, Graph): try: nx.find_cycle(self.graph) print('something went wrong with ', crossover_method) - except: + except: pass return False - def _crossover_row_subsets(self, G2): + def _crossover_row_subsets(self, G2, rng_): + rng = np.random.default_rng(rng_) if self.unique_subset_values is not None and G2.unique_subset_values is not None: - self.row_subset_selector.crossover(G2.row_subset_selector) - + self.row_subset_selector.crossover(G2.row_subset_selector, rng_=rng) - def _crossover_swap_node(self, G2): + + def _crossover_swap_node(self, G2, rng_): ''' Swaps randomly chosen node from Parent1 with a randomly chosen node from Parent2. ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: if not (node1 is self.root or node2 is G2.root): #TODO: allow root - + n1_s = self.graph.successors(node1) n1_p = self.graph.predecessors(node1) n2_s = G2.graph.successors(node2) n2_p = G2.graph.predecessors(node2) - + self.graph.remove_node(node1) G2.graph.remove_node(node2) @@ -874,28 +904,30 @@ def _crossover_swap_node(self, G2): self.graph.add_edges_from([ (n, node2) for n in n1_p]) G2.graph.add_edges_from([ (n, node1) for n in n2_p]) - + return True return False - def _crossover_swap_branch(self, G2): + def _crossover_swap_branch(self, G2, rng_): ''' swaps a branch from parent1 with a branch from parent2. does not modify parent2 ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: #TODO: if root is in inner_config_dict, then do use it? 
if node1 is self.root or node2 is G2.root: #dont want to add root as inner node continue - - #check if node1 is a leaf and leafs are protected, don't add an input to the leave - if self.leaf_config_dict is not None: #if we are protecting leaves, + + #check if node1 is a leaf and leafs are protected, don't add an input to the leave + if self.leaf_config_dict is not None: #if we are protecting leaves, node1_is_leaf = len(list(self.graph.successors(node1))) == 0 node2_is_leaf = len(list(G2.graph.successors(node2))) == 0 #if not ((node1_is_leaf and node1_is_leaf) or (not node1_is_leaf and not node2_is_leaf)): #if node1 is a leaf @@ -929,14 +961,16 @@ def _crossover_swap_branch(self, G2): return False #TODO: Currently returns true even if hyperparameters are blank - def _crossover_hyperparameters(self, G2): + def _crossover_hyperparameters(self, G2, rng_): ''' Swaps the hyperparamters of one randomly chosen node in Parent1 with the hyperparameters of randnomly chosen node in Parent2. ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: if isinstance(node1,GraphIndividual) or isinstance(node2,GraphIndividual): @@ -952,15 +986,17 @@ def _crossover_hyperparameters(self, G2): #not including the nodes, just their children #Finds leaves attached to nodes and swaps them - def _crossover_swap_leaf_at_node(self, G2): + def _crossover_swap_leaf_at_node(self, G2, rng_): + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) success = False for node1, node2 in pair_gen: - # if leaves are protected node1 and node2 must both be leaves or both be inner nodes + # if leaves are protected node1 and node2 must both be leaves or both be inner nodes if self.leaf_config_dict is not None and not (len(list(self.graph.successors(node1)))==0 ^ len(list(G2.graph.successors(node2)))==0): continue #self_leafs = [c for c in nx.descendants(self.graph,node1) if len(list(self.graph.successors(c)))==0 and c is not node1] @@ -975,7 +1011,7 @@ def _crossover_swap_leaf_at_node(self, G2): if len(node_leafs) >0: for c in node_leafs: - if random.choice([True,False]): + if rng.choice([True,False]): G2.graph.remove_node(c) self.graph.add_edge(node1, c) success = True @@ -983,22 +1019,24 @@ def _crossover_swap_leaf_at_node(self, G2): return success - def _crossover_take_branch(self, G2): + def _crossover_take_branch(self, G2, rng_): ''' Takes a subgraph from Parent2 and add it to a randomly chosen node in Parent1. 
''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: #TODO: if root is in inner_config_dict, then do use it? if node2 is G2.root: #dont want to add root as inner node continue - - #check if node1 is a leaf and leafs are protected, don't add an input to the leave + + #check if node1 is a leaf and leafs are protected, don't add an input to the leave if self.leaf_config_dict is not None and len(list(self.graph.successors(node1))) == 0: continue @@ -1027,21 +1065,23 @@ def _crossover_take_branch(self, G2): return False #TODO: swap all leaf nodes - def _crossover_swap_all_leafs(self, G2): + def _crossover_swap_all_leafs(self, G2, rng_): pass #TODO: currently ignores ensembles, make it include nodes inside of ensembles - def optimize(self, objective_function, steps=5): - random.shuffle(self.optimize_methods_list) #select an optimization method + def optimize(self, rng_, objective_function, steps=5): + rng = np.random.default_rng(rng_) + rng.shuffle(self.optimize_methods_list) #select an optimization method for optimize_method in self.optimize_methods_list: - if optimize_method(objective_function, steps=steps): + if optimize_method(rng, objective_function, steps=steps): return True #optimize the hyperparameters of one method to improve the entire pipeline - def _optimize_optuna_single_method_full_pipeline(self, objective_function, steps=5): + def _optimize_optuna_single_method_full_pipeline(self, rng_, objective_function, steps=5): + rng = np.random.default_rng(rng_) nodes_list = list(self.graph.nodes) - random.shuffle(nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another for node in nodes_list: if not isinstance(node, NodeLabel) or isinstance(self.select_config_dict(node)[node.method_class],dict): continue @@ -1051,7 +1091,7 @@ def _optimize_optuna_single_method_full_pipeline(self, objective_function, steps def objective(trial): params = self.select_config_dict(node)[node.method_class](trial) node.hyperparameters = params - + trial.set_user_attr('params', params) try: return objective_function(self) @@ -1064,7 +1104,7 @@ def objective(trial): #optimize the hyperparameters of all methods simultaneously to improve the entire pipeline - def _optimize_optuna_all_methods_full_pipeline(self, objective_function, steps=5): + def _optimize_optuna_all_methods_full_pipeline(self, rng_, objective_function, steps=5): nodes_list = list(self.graph.nodes) study = optuna.create_study() nodes_to_optimize = [] @@ -1080,9 +1120,9 @@ def objective(trial): params = self.select_config_dict(node)[node.method_class](trial, name=f'node_{i}') node.hyperparameters = params param_list.append(params) - + trial.set_user_attr('params', param_list) - + try: return objective_function(self) except: @@ -1095,8 +1135,8 @@ def objective(trial): node.hyperparameters = params return True - - + + def _cached_transform(cache_nunber=0): #use a cache for models at each CV fold? #cache just transformations at each fold? 
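One detail that recurs in the hunks above: the stdlib's `random.choices` accepts raw weights, while `numpy.random.Generator.choice` expects a probability vector `p` that sums to 1, which is why `select_graphindividual` and `select_graph_same_recursive_depth` now divide the node-count weights by their sum before calling `rng.choice`. A small self-contained comparison, using made-up weights rather than values from the patch:

import random
import numpy as np

items = ["g1", "g2", "g3"]
weights = [3, 1, 1]          # e.g. number of nodes per subgraph

picked = random.choices(items, weights=weights, k=1)[0]   # raw weights are fine here

rng = np.random.default_rng(0)
p = np.asarray(weights) / np.sum(weights)                 # must sum to 1 for numpy
picked_np = rng.choice(items, p=p)

`Generator.choice` raises a ValueError when `p` does not sum to 1, so any code that removes entries from a weight list between draws has to renormalize before the next call.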
@@ -1114,8 +1154,8 @@ def unique_id(self) -> GraphKey: g.nodes[n]['label'] = {n.method_class: n.hyperparameters, "subset_values":g.nodes[n]["subset_values"]} else: g.nodes[n]['label'] = {n.method_class: n.hyperparameters} - - g.nodes[n]['method_class'] = n.method_class #TODO making this transformation doesn't feel very clean? + + g.nodes[n]['method_class'] = n.method_class #TODO making this transformation doesn't feel very clean? g.nodes[n]['hyperparameters'] = n.hyperparameters g = nx.convert_node_labels_to_integers(g) @@ -1130,20 +1170,21 @@ def full_node_list(self): node_list.pop(node_list.index(node)) node_list.extend(node.graph.nodes) return node_list - -def create_node(config_dict): + +def create_node(config_dict, rng_): ''' Takes a config_dict and returns a node with a random method_class and hyperparameters ''' - method_class = random.choice(list(config_dict.keys())) + rng = np.random.default_rng(rng_) + method_class = rng.choice(list(config_dict.keys())) #if method_class == GraphIndividual or method_class == 'Recursive': if method_class == 'Recursive': node = GraphIndividual(**config_dict[method_class]) else: - hyperparameters, params, _ = get_hyperparameter(config_dict[method_class], nodelabel=None) + hyperparameters, params, _ = get_hyperparameter(config_dict[method_class], rng_=rng, nodelabel=None) node = NodeLabel( method_class=method_class, @@ -1154,34 +1195,32 @@ def create_node(config_dict): return node - - -import random -def random_weighted_sort(l,weights): +def random_weighted_sort(l,weights, rng_): + rng = np.random.default_rng(rng_) sorted_l = [] indeces = {i: weights[i] for i in range(len(l))} while len(indeces) > 0: - next_item = random.choices(list(indeces.keys()), weights=list(indeces.values()))[0] + next_item = rng.choice(list(indeces.keys()), p=list(indeces.values())) indeces.pop(next_item) sorted_l.append(l[next_item]) - - return sorted_l + return sorted_l -def get_hyperparameter(config_func, nodelabel=None, alpha=1, hyperparameter_probability=1): +def get_hyperparameter(config_func, rng_, nodelabel=None, alpha=1, hyperparameter_probability=1): + rng = np.random.default_rng(rng_) changed = False if isinstance(config_func, dict): return config_func, None, changed if nodelabel is not None: - trial = config.hyperparametersuggestor.Trial(old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + trial = config.hyperparametersuggestor.Trial(rng_=rng, old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) changed = trial._params != nodelabel._params nodelabel._params = trial._params nodelabel.hyperparameters = new_params else: - trial = config.hyperparametersuggestor.Trial(old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + trial = config.hyperparametersuggestor.Trial(rng_=rng, old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) return new_params, trial._params, changed \ No newline at end of file From d3c8cdb751332355b5cb57aca787c1c28c763044 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:53:35 -0700 Subject: [PATCH 24/43] passed rng to functions and things that use random to get nodes randomly --- .../graph_utils/graph_utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py 
b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py index bf96a8dc..87d8a739 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py +++ b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py @@ -1,5 +1,5 @@ import networkx as nx -import random +import numpy as np def remove_and_stitch(graph, node): @@ -53,9 +53,11 @@ def invert_dictionary(d): for k, v in d.items(): inv_map.setdefault(v, set()).add(k) - return inv_map + return inv_map + +def select_nodes_same_depth(g1, node1, g2, node2, rng_): + rng = np.random.default_rng(rng_) -def select_nodes_same_depth(g1, node1, g2, node2): g1_nodes = nx.shortest_path_length(g1, source=node1) g2_nodes = nx.shortest_path_length(g2, source=node2) @@ -79,17 +81,19 @@ def select_nodes_same_depth(g1, node1, g2, node2): for n2 in g2_nodes[i]: possible_pairs.append( (n1,n2) ) - random.shuffle(possible_pairs) + rng.shuffle(possible_pairs) for p in possible_pairs: yield p[0], p[1] -def select_nodes_randomly(g1, g2,): +def select_nodes_randomly(g1, g2, rng_): + rng = np.random.default_rng(rng_) + sorted_self_nodes_list = list(g1.nodes) - random.shuffle(sorted_self_nodes_list) + rng.shuffle(sorted_self_nodes_list) sorted_other_nodes_list = list(g2.nodes) - random.shuffle(sorted_other_nodes_list) + rng.shuffle(sorted_other_nodes_list) for node1 in sorted_self_nodes_list: for node2 in sorted_other_nodes_list: yield node1, node2 \ No newline at end of file From e75694ec0fd61a78c7c595cf8e679ebc0436149f Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 10:54:32 -0700 Subject: [PATCH 25/43] passed rng to control mutations/crossover and initial set generation --- .../subset_selector/subsetselector.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tpot2/individual_representations/subset_selector/subsetselector.py b/tpot2/individual_representations/subset_selector/subsetselector.py index f3beccf4..7cddbccf 100644 --- a/tpot2/individual_representations/subset_selector/subsetselector.py +++ b/tpot2/individual_representations/subset_selector/subsetselector.py @@ -1,15 +1,18 @@ from numpy import iterable import tpot2 -import random +import numpy as np from .. 
import BaseIndividual class SubsetSelector(BaseIndividual): def __init__( self, values, + rng_, initial_set = None, k=1, #step size for shuffling ): + rng = np.random.default_rng(rng_) + if isinstance(values, int): self.values = set(range(0,values)) else: @@ -17,7 +20,7 @@ def __init__( self, if initial_set is None: - self.subsets = set(random.choices(values, k=k)) + self.subsets = set(rng.choices(values, k=k)) else: self.subsets = set(initial_set) @@ -25,20 +28,23 @@ def __init__( self, self.mutation_list = [self._mutate_add, self._mutate_remove] self.crossover_list = [self._crossover_swap] - - def _mutate_add(self,): + + def _mutate_add(self, rng_): + rng = np.random.default_rng(rng_) not_included = list(self.values.difference(self.subsets)) if len(not_included) > 1: - self.subsets.update(random.sample(not_included, k=min(self.k, len(not_included)))) + self.subsets.update(rng.choice(not_included, k=min(self.k, len(not_included)))) return True else: return False - def _mutate_remove(self,): + def _mutate_remove(self, rng_): + rng = np.random.default_rng(rng_) if len(self.subsets) > 1: - self.subsets = self.subsets - set(random.sample(list(self.subsets), k=min(self.k, len(self.subsets)-1) )) + self.subsets = self.subsets - set(rng.choice(list(self.subsets), k=min(self.k, len(self.subsets)-1) )) - def _crossover_swap(self, ss2): + def _crossover_swap(self, ss2, rng_): + rng = np.random.default_rng(rng_) diffs = self.subsets.symmetric_difference(ss2.subsets) if len(diffs) == 0: @@ -46,6 +52,6 @@ def _crossover_swap(self, ss2): for v in diffs: self.subsets.discard(v) ss2.subsets.discard(v) - random.choice([self.subsets, ss2.subsets]).add(v) - + rng.choice([self.subsets, ss2.subsets]).add(v) + return True From f26488a4e94addadfe800a0e761e928b8f5dd7eb Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 13:56:24 -0700 Subject: [PATCH 26/43] added rng stuff to control stochasticity --- tpot2/evolvers/steady_state_evolver.py | 130 +++++++++++++------------ 1 file changed, 66 insertions(+), 64 deletions(-) diff --git a/tpot2/evolvers/steady_state_evolver.py b/tpot2/evolvers/steady_state_evolver.py index 23abdfe9..a45e4059 100644 --- a/tpot2/evolvers/steady_state_evolver.py +++ b/tpot2/evolvers/steady_state_evolver.py @@ -23,9 +23,9 @@ import warnings class SteadyStateEvolver(): - def __init__( self, + def __init__( self, individual_generator , - + objective_functions, objective_function_weights, objective_names = None, @@ -34,19 +34,19 @@ def __init__( self, initial_population_size = 50, population_size = 50, - max_evaluated_individuals = None, + max_evaluated_individuals = None, early_stop = None, early_stop_seconds = None, early_stop_tol = 0.001, - - max_time_seconds=float("inf"), + + max_time_seconds=float("inf"), max_eval_time_seconds=60*5, n_jobs=1, memory_limit="4GB", client=None, - + crossover_probability=.2, mutate_probability=.7, @@ -56,24 +56,28 @@ def __init__( self, survival_selector = survival_select_NSGA2, parent_selector = tournament_selection_dominated, - - budget_range = None, - budget_scaling = .5, - individuals_until_end_budget = 1, + + budget_range = None, + budget_scaling = .5, + individuals_until_end_budget = 1, stepwise_steps = 5, - - verbose = 0, + + verbose = 0, periodic_checkpoint_folder = None, callback = None, + + rng_=None ) -> None: + self.rng = np.random.default_rng(rng_) + self.max_evaluated_individuals = max_evaluated_individuals self.individuals_until_end_budget = individuals_until_end_budget - self.individual_generator = individual_generator - 
self.population_size = population_size - self.objective_functions = objective_functions + self.individual_generator = individual_generator + self.population_size = population_size + self.objective_functions = objective_functions self.objective_function_weights = np.array(objective_function_weights) self.bigger_is_better = bigger_is_better if not bigger_is_better: @@ -83,15 +87,15 @@ def __init__( self, self.periodic_checkpoint_folder = periodic_checkpoint_folder - self.verbose = verbose - self.callback = callback + self.verbose = verbose + self.callback = callback self.n_jobs = n_jobs - + if max_time_seconds is None: self.max_time_seconds = float("inf") else: - self.max_time_seconds = max_time_seconds - + self.max_time_seconds = max_time_seconds + #functools requires none for infinite time, doesn't support inf if max_eval_time_seconds is not None and math.isinf(max_eval_time_seconds ): self.max_eval_time_seconds = None @@ -111,7 +115,7 @@ def __init__( self, self.survival_selector=survival_selector self.parent_selector=parent_selector - + total_var_p = crossover_probability + mutate_probability + mutate_then_crossover_probability + crossover_then_mutate_probability self.crossover_probability = crossover_probability / total_var_p self.mutate_probability = mutate_probability / total_var_p @@ -145,7 +149,7 @@ def __init__( self, self.budget = self.budget_list[self.generation] else: self.budget = None - + self.early_stop_tol = early_stop_tol self.early_stop_seconds = early_stop_seconds @@ -172,7 +176,7 @@ def __init__( self, if self.population is None: self.population = tpot2.Population(column_names=init_names) initial_population = [next(self.individual_generator) for _ in range(self.initial_population_size)] - self.population.add_to_population(initial_population) + self.population.add_to_population(initial_population, rng_=self.rng) def optimize(self): @@ -194,7 +198,7 @@ def optimize(self): processes=False, memory_limit=self.memory_limit) self._client = Client(self._cluster) - + self.max_queue_size = len(self._client.cluster.workers) @@ -209,11 +213,11 @@ def optimize(self): submitted_futures = {} submitted_inds = set() - start_time = time.time() - - try: - - + start_time = time.time() + + try: + + if self.verbose >= 1: if self.max_evaluated_individuals is not None: pbar = tqdm.tqdm(total=self.max_evaluated_individuals, miniters=1) @@ -228,7 +232,7 @@ def optimize(self): if len(submitted_futures) >= self.max_queue_size: break future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} @@ -238,7 +242,7 @@ def optimize(self): done = False start_time = time.time() while not done: - + ############################### # Step 1: Check for finished futures ############################### @@ -253,8 +257,8 @@ def optimize(self): #Loop through all futures, collect completed and timeout futures. 
for completed_future in list(submitted_futures.keys()): - - #get scores and update + + #get scores and update if completed_future.done(): #if future is done #If the future is done but threw and error, record the error if completed_future.exception() or completed_future.status == "error": #if the future is done and threw an error @@ -277,18 +281,18 @@ def optimize(self): print("cancelld ", completed_future.cancelled()) scores = ["INVALID" for _ in range(len(self.objective_names))] else: #if future is not done - + #check if the future has been running for too long, cancel the future if time.time() - submitted_futures[completed_future]["time"] > self.max_eval_time_seconds*1.25: completed_future.cancel() - + if self.verbose >= 4: print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n') - + scores = ["TIMEOUT" for _ in range(len(self.objective_names))] else: continue #otherwise, continue to next future - + #update population @@ -310,15 +314,15 @@ def optimize(self): #now we have a list of completed futures - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") - - + + ############################### # Step 2: Early Stopping ############################### - if self.verbose >= 3: + if self.verbose >= 3: sign = np.sign(self.objective_function_weights) valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign cur_best_scores = valid_df.max(axis=0)*sign @@ -335,13 +339,13 @@ def optimize(self): cur_best_scores = valid_df.max(axis=0) cur_best_scores = cur_best_scores.to_numpy() #cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best - + improved = ( np.array(cur_best_scores) - np.array(best_scores) >= np.array(self.early_stop_tol) ) not_improved = np.logical_not(improved) generations_without_improvement = generations_without_improvement * not_improved + not_improved #set to zero if not improved, else increment - + timestamp_of_last_improvement = timestamp_of_last_improvement * not_improved + time.time()*improved #set to current time if improved - + pass #update best score best_scores = [max(best_scores[i], cur_best_scores[i]) for i in range(len(self.objective_names))] @@ -351,7 +355,7 @@ def optimize(self): if self.verbose >= 3: print(f"Early stop ({self.early_stop} individuals evaluated without improvement)") break - + if self.early_stop_seconds: if any(time.time() - timestamp_of_last_improvement > self.early_stop_seconds): if self.verbose >= 3: @@ -364,7 +368,7 @@ def optimize(self): print("Time limit reached") done = True break - + if self.max_evaluated_individuals is not None and len(self.population.evaluated_individuals.dropna(subset=self.objective_names)) >= self.max_evaluated_individuals: print("Evaluated enough individuals") done = True @@ -378,7 +382,7 @@ def optimize(self): for individual in individuals_to_evaluate: if self.max_queue_size > len(submitted_futures): future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} 
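The early-stopping bookkeeping in the hunk above leans on boolean arrays as multipliers: multiplying the per-objective counters by `not_improved` zeroes them wherever an objective just improved, and adding `not_improved` increments the rest; the improvement timestamps are refreshed the same way. A short sketch of that update rule with made-up numbers:

import numpy as np

counters = np.array([4, 7, 0])             # evaluations since the last improvement, per objective
improved = np.array([True, False, False])  # which objectives improved on this step
not_improved = np.logical_not(improved)

counters = counters * not_improved + not_improved
print(counters)                            # [0 8 1] -> reset where improved, +1 elsewhere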
@@ -400,13 +404,13 @@ def optimize(self): if len(cur_evaluated_population) > self.population_size: scores = evaluated[self.objective_names].to_numpy() weighted_scores = scores * self.objective_function_weights - new_population_index = np.ravel(self.survival_selector(weighted_scores, k=self.population_size)) #TODO make it clear that we are concatenating scores... - + new_population_index = np.ravel(self.survival_selector(weighted_scores, k=self.population_size, rng_=self.rng)) #TODO make it clear that we are concatenating scores... + #set new population try: cur_evaluated_population = np.array(cur_evaluated_population)[new_population_index] cur_evaluated_population = np.concatenate([cur_evaluated_population, unevaluated["Individual"].to_numpy()]) - self.population.set_population(cur_evaluated_population) + self.population.set_population(cur_evaluated_population, rng_=self.rng) except Exception as e: print("Exception in survival selection") print(e) @@ -419,7 +423,7 @@ def optimize(self): print("self.objective_function_weights", self.objective_function_weights) print("self.population_size", self.population_size) print("parents_df", parents_df) - + ############################### # Step 5: Parent Selection and Variation ############################### @@ -438,21 +442,21 @@ def optimize(self): if len(parents_df) < 2: var_ops = ["mutate" for _ in range(n_individuals_to_submit)] else: - var_ops = [np.random.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)] + var_ops = [self.rng.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)] parents = [] for op in var_ops: if op == "mutate": - parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, )]) + parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, rng_=self.rng)]) else: - parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, )]) + parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, rng_=self.rng)]) + + _offspring = self.population.create_offspring(parents, var_ops, rng_=self.rng, n_jobs=1, add_to_population=True) - _offspring = self.population.create_offspring(parents, var_ops, n_jobs=1, add_to_population=True) - # If we don't have enough evaluated individuals to use as parents for variation, we create new individuals randomly # This can happen if the individuals in the initial population are invalid if len(cur_evaluated_population) == 0 and len(submitted_futures) < self.max_queue_size: - + initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3] invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)] if len(invalid_initial_population) >= self.initial_population_size*3: #if all individuals in the 3*initial population are invalid @@ -462,7 +466,7 @@ def optimize(self): initial_population = [next(self.individual_generator) for _ in range(n_individuals_to_create)] self.population.add_to_population(initial_population) - + 
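Editor's note: the variation step above now draws operators from self.rng (a numpy Generator) and passes rng_ down into the survival/parent selectors and create_offspring, which is what makes the run reproducible for a fixed seed. A small sketch of that pattern follows; the function and variable names are illustrative (not TPOT2 API), and the operator probabilities are the defaults documented later in this patch.

import numpy as np

def pick_variation_ops(n, rng_=None,
                       crossover=0.2, mutate_then_crossover=0.05,
                       crossover_then_mutate=0.05, mutate=0.7):
    # default_rng accepts None, an int seed, or an existing Generator, so callers can
    # thread one Generator through every helper -- the rng_ pattern used in this patch.
    rng = np.random.default_rng(rng_)
    ops = ["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"]
    probs = [crossover, mutate_then_crossover, crossover_then_mutate, mutate]
    return [rng.choice(ops, p=probs) for _ in range(n)]

# Reusing one seeded Generator keeps the whole run reproducible...
ops_a = pick_variation_ops(5, rng_=np.random.default_rng(42))
# ...and a fresh Generator with the same seed reproduces the same draws.
ops_b = pick_variation_ops(5, rng_=np.random.default_rng(42))
assert ops_a == ops_b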
############################### @@ -473,7 +477,7 @@ def optimize(self): for individual in individuals_to_evaluate: if self.max_queue_size > len(submitted_futures): future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} @@ -494,7 +498,7 @@ def optimize(self): ############################### # Step 7: Cleanup ############################### - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") @@ -513,7 +517,7 @@ def optimize(self): tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) - + def get_unevaluated_individuals(self, column_names, budget=None, individual_list=None): if individual_list is not None: @@ -531,10 +535,8 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list unevaluated_filter = lambda i: any(offspring_scores.loc[offspring_scores.index[i]][column_names].isna()) unevaluated_individuals_this_step = [i for i in range(len(cur_pop)) if unevaluated_filter(i)] return cur_pop[unevaluated_individuals_this_step] - + else: #if column names are not in the evaluated_individuals, then we have not evaluated any individuals yet for name_step in column_names: self.population.evaluated_individuals[name_step] = np.nan return cur_pop - - \ No newline at end of file From ece23919fbaa3e68abd4cb0147c0365c620a89ae Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 13:56:42 -0700 Subject: [PATCH 27/43] added rng control for functions that vary offspring --- tpot2/population.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tpot2/population.py b/tpot2/population.py index 509a249d..a80a399c 100644 --- a/tpot2/population.py +++ b/tpot2/population.py @@ -262,7 +262,7 @@ def set_population(self, new_population, rng_, keep_repeats=True): self.add_to_population(new_population, rng_=rng, keep_repeats=keep_repeats) #TODO should we just generate one offspring per crossover? - def create_offspring(self, parents_list, var_op_list, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): + def create_offspring(self, parents_list, var_op_list, rng_, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): ''' parents_list: a list of lists of parents. var_op_list: a list of var_ops to apply to each list of parents. Should be the same length as parents_list. 
@@ -280,8 +280,9 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke - "mutate_and_crossover" : mutate_and_crossover - "cross_and_mutate" : cross_and_mutate ''' + rng = np.random.default_rng(rng_) new_offspring = [] - all_offspring = parallel_create_offspring(parents_list, var_op_list, n_jobs=n_jobs) + all_offspring = parallel_create_offspring(parents_list, var_op_list, rng_=rng, n_jobs=n_jobs) for parents, offspring, var_op in zip(parents_list, all_offspring, var_op_list): @@ -294,7 +295,7 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke # offspring = offspring[0] if add_to_population: - added = self.add_to_population(offspring, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) + added = self.add_to_population(offspring, rng_=rng, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) if len(added) > 0: for new_child in added: parent_keys = [parent.unique_id() for parent in parents] @@ -394,37 +395,40 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati def get_id(individual): return individual.unique_id() -def parallel_create_offspring(parents_list, var_op_list, n_jobs=1): +def parallel_create_offspring(parents_list, var_op_list, rng_, n_jobs=1): + rng = np.random.default_rng(rng_) if n_jobs == 1: - return nonparallel_create_offpring(parents_list, var_op_list) + return nonparallel_create_offpring(parents_list, var_op_list, rng_=rng) else: delayed_offspring = [] for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op in built_in_var_ops_dict: var_op = built_in_var_ops_dict[var_op] - delayed_offspring.append(dask.delayed(copy_and_change)(parents, var_op)) + delayed_offspring.append(dask.delayed(copy_and_change)(parents, var_op, rng_=rng)) offspring = dask.compute(*delayed_offspring, num_workers=n_jobs, threads_per_worker=1) return offspring -def nonparallel_create_offpring(parents_list, var_op_list, n_jobs=1): +def nonparallel_create_offpring(parents_list, var_op_list, rng_, n_jobs=1): + rng = np.random.default_rng(rng_) offspring = [] for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op in built_in_var_ops_dict: var_op = built_in_var_ops_dict[var_op] - offspring.append(copy_and_change(parents, var_op)) + offspring.append(copy_and_change(parents, var_op, rng_=rng)) return offspring -def copy_and_change(parents, var_op): +def copy_and_change(parents, var_op, rng_): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - offspring = var_op(offspring) + offspring = var_op(offspring, rng_=rng) if isinstance(offspring, collections.abc.Iterable): offspring = offspring[0] return offspring From 8599e9c143bdd8237a6e584186dbf0973010d158 Mon Sep 17 00:00:00 2001 From: Jose Date: Mon, 30 Oct 2023 13:57:02 -0700 Subject: [PATCH 28/43] added rng control and random state for cv gen --- .../tpot_estimator/steady_state_estimator.py | 419 +++++++++--------- 1 file changed, 217 insertions(+), 202 deletions(-) diff --git a/tpot2/tpot_estimator/steady_state_estimator.py b/tpot2/tpot_estimator/steady_state_estimator.py index 72f7f595..0f48c827 100644 --- a/tpot2/tpot_estimator/steady_state_estimator.py +++ b/tpot2/tpot_estimator/steady_state_estimator.py @@ -5,8 +5,8 @@ import tpot2.config from sklearn.utils.validation import check_is_fitted from tpot2.selectors import survival_select_NSGA2, tournament_selection_dominated -from sklearn.preprocessing import LabelEncoder 
-from sklearn.utils.multiclass import unique_labels +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.multiclass import unique_labels import pandas as pd from sklearn.model_selection import train_test_split import tpot2 @@ -27,7 +27,7 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? class TPOTEstimatorSteadyState(BaseEstimator): - def __init__(self, scorers= [], + def __init__(self, scorers= [], scorers_weights = [], classification = False, cv = 5, @@ -35,16 +35,16 @@ def __init__(self, scorers= [], other_objective_functions_weights = [], objective_function_names = None, bigger_is_better = True, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, root_config_dict= 'Auto', inner_config_dict=["selectors", "transformers"], - leaf_config_dict= None, + leaf_config_dict= None, cross_val_predict_cv = 0, categorical_features = None, subsets = None, memory = None, - preprocessing = False, + preprocessing = False, validation_strategy = "none", validation_fraction = .2, disable_label_encoder = False, @@ -53,14 +53,14 @@ def __init__(self, scorers= [], population_size = 50, max_evaluated_individuals = None, - + early_stop = None, early_stop_seconds = None, scorers_early_stop_tol = 0.001, other_objectives_early_stop_tol = None, - max_time_seconds=None, - max_eval_time_seconds=60*10, + max_time_seconds=None, + max_eval_time_seconds=60*10, n_jobs=1, memory_limit = "4GB", client = None, @@ -73,65 +73,68 @@ def __init__(self, scorers= [], parent_selector = tournament_selection_dominated, budget_range = None, budget_scaling = .5, - individuals_until_end_budget = 1, + individuals_until_end_budget = 1, stepwise_steps = 5, warm_start = False, subset_column = None, verbose = 0, - periodic_checkpoint_folder = None, + periodic_checkpoint_folder = None, callback = None, processes = True, scatter = True, + # random seed for random number generator (rng) + random_state = None, + optuna_optimize_pareto_front = False, optuna_optimize_pareto_front_trials = 100, optuna_optimize_pareto_front_timeout = 60*10, optuna_storage = "sqlite:///optuna.db", ): - + ''' An sklearn baseestimator that uses genetic programming to optimize a pipeline. - + Parameters ---------- - + scorers : (list, scorer) - A scorer or list of scorers to be used in the cross-validation process. + A scorer or list of scorers to be used in the cross-validation process. see https://scikit-learn.org/stable/modules/model_evaluation.html - + scorers_weights : list A list of weights to be applied to the scorers during the optimization process. - + classification : bool If True, the problem is treated as a classification problem. If False, the problem is treated as a regression problem. Used to determine the CV strategy. - + cv : int, cross-validator - (int): Number of folds to use in the cross-validation process. By uses the sklearn.model_selection.KFold cross-validator for regression and StratifiedKFold for classification. In both cases, shuffled is set to True. - (sklearn.model_selection.BaseCrossValidator): A cross-validator to use in the cross-validation process. - + other_objective_functions : list, default=[] A list of other objective functions to apply to the pipeline. The function takes a single parameter for the graphpipeline estimator and returns either a single score or a list of scores. - + other_objective_functions_weights : list, default=[] A list of weights to be applied to the other objective functions. 
- + objective_function_names : list, default=None A list of names to be applied to the objective functions. If None, will use the names of the objective functions. - + bigger_is_better : bool, default=True If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction. - + max_size : int, default=np.inf The maximum number of nodes of the pipelines to be generated. - + linear_pipeline : bool, default=False If True, the pipelines generated will be linear. If False, the pipelines generated will be directed acyclic graphs. - + root_config_dict : dict, default='auto' The configuration dictionary to use for the root node of the model. If 'auto', will use "classifiers" if classification=True, else "regressors". @@ -149,7 +152,7 @@ def __init__(self, scorers= [], - 'genetic encoders' : Includes Genetic Encoder methods as used in AutoQTL. - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - + inner_config_dict : dict, default=["selectors", "transformers"] The configuration dictionary to use for the inner nodes of the model generation. Default ["selectors", "transformers"] @@ -168,10 +171,10 @@ def __init__(self, scorers= [], - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None and max_depth>1, the root_config_dict will be used for the inner nodes as well. - - leaf_config_dict : dict, default=None + + leaf_config_dict : dict, default=None The configuration dictionary to use for the leaf node of the model. If set, leaf nodes must be from this dictionary. - Otherwise leaf nodes will be generated from the root_config_dict. + Otherwise leaf nodes will be generated from the root_config_dict. Default None - 'selectors' : A selection of sklearn Selector methods. - 'classifiers' : A selection of sklearn Classifier methods. @@ -188,14 +191,14 @@ def __init__(self, scorers= [], - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None, a leaf will not be required (i.e. the pipeline can be a single root node). Leaf nodes will be generated from the inner_config_dict. - + cross_val_predict_cv : int, default=0 Number of folds to use for the cross_val_predict function for inner classifiers and regressors. Estimators will still be fit on the full dataset, but the following node will get the outputs from cross_val_predict. - + - 0-1 : When set to 0 or 1, the cross_val_predict function will not be used. The next layer will get the outputs from fitting and transforming the full dataset. - - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. + - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. However, the output to the next node will come from cross_val_predict with the specified number of folds. - + categorical_features: list or None Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. 
- None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. @@ -203,7 +206,7 @@ def __init__(self, scorers= [], subsets : str or list, default=None Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. - - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - None : If None, each column will be treated as a subset. One column will be selected per subset. @@ -226,186 +229,193 @@ def __init__(self, scorers= [], - None: TPOT does not use memory caching. - preprocessing : bool or BaseEstimator/Pipeline, + preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL A pipeline that will be used to preprocess the data before CV. - bool : If True, will use a default preprocessing pipeline. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. - + validation_strategy : str, default='none' EXPERIMENTAL The validation strategy to use for selecting the final pipeline from the population. TPOT2 may overfit the cross validation score. A second validation set can be used to select the final pipeline. - 'auto' : Automatically determine the validation strategy based on the dataset shape. - - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. - - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. + - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. + - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. - 'none' : Do not use a separate validation set for final validation. Select based on the original cross-validation score. This is the default for large datasets. validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. - + disable_label_encoder : bool, default=False If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. If False, no additional label encoders will be used. population_size : int, default=50 Size of the population - + initial_population_size : int, default=None Size of the initial population. If None, population_size will be used. - + population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. 
- - generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size - + + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size + generations : int, default=50 Number of generations to run - + early_stop : int, default=None Number of evaluated individuals without improvement before early stopping. Counted across all objectives independently. Triggered when all objectives have not improved by the given number of individuals. - + early_stop_seconds : float, default=None Number of seconds without improvement before early stopping. All objectives must not have improved for the given number of seconds for this to be triggered. - scorers_early_stop_tol : + scorers_early_stop_tol : -list of floats list of tolerances for each scorer. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - - other_objectives_early_stop_tol : + + other_objectives_early_stop_tol : -list of floats list of tolerances for each of the other objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - + max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. - + max_eval_time_seconds : float, default=60*5 Maximum time to evaluate a single individual. If none or inf, there will be no time limit per evaluation. - + n_jobs : int, default=1 Number of processes to run in parallel. - + memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. - + client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. - + mutate_probability : float, default=.7 Probability of generating a new individual by crossover between one individuals. - + mutate_then_crossover_probability : float, default=.05 Probability of generating a new individual by mutating two individuals followed by crossover. - + crossover_then_mutate_probability : float, default=.05 Probability of generating a new individual by crossover between two individuals followed by a mutation of the resulting individual. - + survival_selector : function, default=survival_select_NSGA2 Function to use to select individuals for survival. Must take a matrix of scores and return selected indexes. Used to selected population_size individuals at the start of each generation to use for mutation and crossover. 
- + parent_selector : function, default=parent_select_NSGA2 Function to use to select pairs parents for crossover and individuals for mutation. Must take a matrix of scores and return selected indexes. - + budget_range : list [start, end], default=None A starting and ending budget to use for the budget scaling. - + budget_scaling float : [0,1], default=0.5 A scaling factor to use when determining how fast we move the budget from the start to end budget. - + individuals_until_end_budget : int, default=1 The number of generations to run before reaching the max budget. - + stepwise_steps : int, default=1 The number of staircase steps to take when scaling the budget and population size. - + threshold_evaluation_early_stop : list [start, end], default=None starting and ending percentile to use as a threshold for the evaluation early stopping. Values between 0 and 100. - + threshold_evaluation_scaling : float [0,inf), default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + min_history_threshold : int, default=0 The minimum number of previous scores needed before using threshold early stopping. - + selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - - selection_evaluation_scaling : float, default=0.5 + + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + n_initial_optimizations : int, default=0 Number of individuals to optimize before starting the evolution. - - optimization_cv : int + + optimization_cv : int Number of folds to use for the optuna optimization's internal cross-validation. - + max_optimize_time_seconds : float, default=60*5 Maximum time to run an optimization - + optimization_steps : int, default=10 Number of steps per optimization - + warm_start : bool, default=False If True, will use the continue the evolutionary algorithm from the last generation of the previous run. - + subset_column : str or int, default=None EXPERIMENTAL The column to use for the subset selection. Must also pass in unique_subset_values to GraphIndividual to function. - - verbose : int, default=1 + + verbose : int, default=1 How much information to print during the optimization process. Higher values include the information from lower values. 0. nothing 1. progress bar - + 3. best individual 4. warnings >=5. full warnings trace - - + + random_state : int, None, default=None + A seed for reproducability of experiments. This value will be passed to numpy.random.default_rng() to create an instnce of the genrator to pass to other classes + - int + Will be used to create and lock in Generator instance with 'numpy.random.default_rng()' + - None + Will be used to create Generator for 'numpy.random.default_rng()' where a fresh, unpredictable entropy will be pulled from the OS + + periodic_checkpoint_folder : str, default=None Folder to save the population to periodically. If None, no periodic saving will be done. If provided, training will resume from this checkpoint. - + callback : tpot2.CallBackInterface, default=None Callback object. Not implemented processes : bool, default=True If True, will use multiprocessing to parallelize the optimization process. 
If False, will use threading. True seems to perform better. However, False is required for interactive debugging. - + Attributes ---------- fitted_pipeline_ : GraphPipeline A fitted instance of the GraphPipeline that inherits from sklearn BaseEstimator. This is fitted on the full X, y passed to fit. - evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. - Columns: + evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. + Columns: - *objective functions : The first few columns correspond to the passed in scorers and objective functions - Parents : A tuple containing the indexes of the pipelines used to generate the pipeline of that row. If NaN, this pipeline was generated randomly in the initial population. - Variation_Function : Which variation function was used to mutate or crossover the parents. If NaN, this pipeline was generated randomly in the initial population. - Individual : The internal representation of the individual that is used during the evolutionary algorithm. This is not an sklearn BaseEstimator. - - Generation : The generation the pipeline first appeared. - - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. - To save on computational time, the best frontier is updated iteratively each generation. + - Generation : The generation the pipeline first appeared. + - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. + To save on computational time, the best frontier is updated iteratively each generation. The pipelines with the 0th pareto front do represent the exact best frontier. However, the pipelines with pareto front >= 1 are only in reference to the other pipelines in the final population. - All other pipelines are set to NaN. - - Instance : The unfitted GraphPipeline BaseEstimator. + All other pipelines are set to NaN. + - Instance : The unfitted GraphPipeline BaseEstimator. - *validation objective functions : Objective function scores evaluated on the validation set. - Validation_Pareto_Front : The full pareto front calculated on the validation set. This is calculated for all pipelines with Pareto_Front equal to 0. Unlike the Pareto_Front which only calculates the frontier and the final population, the Validation Pareto Front is calculated for all pipelines tested on the validation set. - + pareto_front : The same pandas dataframe as evaluated individuals, but containing only the frontier pareto front pipelines. 
''' @@ -440,7 +450,7 @@ def __init__(self, scorers= [], self.early_stop_seconds = early_stop_seconds self.scorers_early_stop_tol = scorers_early_stop_tol self.other_objectives_early_stop_tol = other_objectives_early_stop_tol - self.max_time_seconds = max_time_seconds + self.max_time_seconds = max_time_seconds self.max_eval_time_seconds = max_eval_time_seconds self.n_jobs= n_jobs self.memory_limit = memory_limit @@ -473,6 +483,13 @@ def __init__(self, scorers= [], self.optuna_optimize_pareto_front_timeout = optuna_optimize_pareto_front_timeout self.optuna_storage = optuna_storage + # create random number generator based on rng_seed + self.rng = np.random.default_rng(random_state) + # save random state passed to us for other functions that use random_state + self.random_state = random_state + # set the numpy seed so anything using it will be consistent as well + np.random.seed(random_state) + self.max_evaluated_individuals = max_evaluated_individuals @@ -491,24 +508,24 @@ def __init__(self, scorers= [], self._scorers = [self.scorers] else: self._scorers = self.scorers - + self._scorers = [sklearn.metrics.get_scorer(scoring) for scoring in self._scorers] self._scorers_early_stop_tol = self.scorers_early_stop_tol - + self._evolver = tpot2.evolvers.SteadyStateEvolver - - + + self.objective_function_weights = [*scorers_weights, *other_objective_functions_weights] - + if self.objective_function_names is None: obj_names = [f.__name__ for f in other_objective_functions] else: obj_names = self.objective_function_names self.objective_names = [f._score_func.__name__ if hasattr(f,"_score_func") else f.__name__ for f in self._scorers] + obj_names - - + + if not isinstance(self.other_objectives_early_stop_tol, list): self._other_objectives_early_stop_tol = [self.other_objectives_early_stop_tol for _ in range(len(self.other_objective_functions))] else: @@ -520,7 +537,7 @@ def __init__(self, scorers= [], self._scorers_early_stop_tol = self._scorers_early_stop_tol self.early_stop_tol = [*self._scorers_early_stop_tol, *self._other_objectives_early_stop_tol] - + self._evolver_instance = None self.evaluated_individuals = None @@ -550,8 +567,8 @@ def fit(self, X, y): if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. 
When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") - self.label_encoder_ = LabelEncoder() - y = self.label_encoder_.fit_transform(y) + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) self.evaluated_individuals = None #determine validation strategy @@ -570,9 +587,9 @@ def fit(self, X, y): if validation_strategy == 'split': if self.classification: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=self.random_state) else: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=self.random_state) X_original = X @@ -584,7 +601,7 @@ def fit(self, X, y): if self.classification: X, y = remove_underrepresented_classes(X, y, n_folds) - + if self.preprocessing: #X = pd.DataFrame(X) @@ -602,7 +619,7 @@ def fit(self, X, y): tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns else: #numpy array and no categorical columns specified, just do imputation - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) + self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) else: @@ -612,6 +629,15 @@ def fit(self, X, y): #Set up the configuation dictionaries and the search spaces + #check if self.cv is a number + if isinstance(self.cv, int) or isinstance(self.cv, float): + if self.classification: + self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + else: + self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + + else: + self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) n_samples= int(math.floor(X.shape[0]/n_folds)) @@ -625,54 +651,43 @@ def fit(self, X, y): if self.root_config_dict == 'Auto': if self.classification: n_classes = len(np.unique(y)) - root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) + root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) else: - root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification,subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) else: - root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, subsets=self.subsets,feature_names=self.feature_names) - - inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification,subsets=self.subsets, 
feature_names=self.feature_names) - leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets,feature_names=self.feature_names) + inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) + leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - #check if self.cv is a number - if isinstance(self.cv, int) or isinstance(self.cv, float): - if self.classification: - self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=42) - else: - self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=42) - - else: - self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) - - def objective_function(pipeline_individual, - X, + def objective_function(pipeline_individual, + X, y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, + scorers= self._scorers, + cv=self.cv_gen, other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, - **kwargs): + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, + **kwargs): return objective_function_generator( pipeline_individual, - X, - y, + X, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, ) - self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( + self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( inner_config_dict=inner_config_dict, root_config_dict=root_config_dict, leaf_config_dict=leaf_config_dict, @@ -691,7 +706,7 @@ def objective_function(pipeline_individual, #If warm start and we have an evolver instance, use the existing one if not(self.warm_start and self._evolver_instance is not None): - self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, + self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, objective_functions= [objective_function], objective_function_weights = self.objective_function_weights, objective_names=self.objective_names, @@ -703,8 +718,8 @@ def objective_function(pipeline_individual, verbose = self.verbose, max_time_seconds = self.max_time_seconds , max_eval_time_seconds = self.max_eval_time_seconds, - - + + periodic_checkpoint_folder = self.periodic_checkpoint_folder, @@ -712,7 +727,7 @@ def objective_function(pipeline_individual, early_stop_tol = self.early_stop_tol, early_stop= self.early_stop, early_stop_seconds = self.early_stop_seconds, - + budget_range = 
self.budget_range, budget_scaling = self.budget_scaling, individuals_until_end_budget = self.individuals_until_end_budget, @@ -728,12 +743,14 @@ def objective_function(pipeline_individual, mutate_probability = self.mutate_probability, mutate_then_crossover_probability= self.mutate_then_crossover_probability, crossover_then_mutate_probability= self.crossover_then_mutate_probability, - - max_evaluated_individuals = self.max_evaluated_individuals + + max_evaluated_individuals = self.max_evaluated_individuals, + + rng_=self.rng, ) - + self._evolver_instance.optimize() #self._evolver_instance.population.update_pareto_fronts(self.objective_names, self.objective_function_weights) self.make_evaluated_individuals() @@ -743,24 +760,24 @@ def objective_function(pipeline_individual, pareto_front_inds = self.pareto_front['Individual'].values all_graphs, all_scores = tpot2.individual_representations.graph_pipeline_individual.simple_parallel_optuna(pareto_front_inds, objective_function, self.objective_function_weights, _client, storage=self.optuna_storage, steps=self.optuna_optimize_pareto_front_trials, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, max_time_seconds=self.optuna_optimize_pareto_front_timeout, **{"X": X, "y": y}) all_scores = tpot2.utils.eval_utils.process_scores(all_scores, len(self.objective_function_weights)) - + if len(all_graphs) > 0: df = pd.DataFrame(np.column_stack((all_graphs, all_scores,np.repeat("Optuna",len(all_graphs)))), columns=["Individual"] + self.objective_names +["Parents"]) for obj in self.objective_names: df[obj] = df[obj].apply(convert_to_float) - + self.evaluated_individuals = pd.concat([self.evaluated_individuals, df], ignore_index=True) else: print("WARNING NO OPTUNA TRIALS COMPLETED") - + tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) if validation_strategy == 'reshuffled': best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - + #reshuffle rows - X, y = sklearn.utils.shuffle(X, y, random_state=1) + X, y = sklearn.utils.shuffle(X, y, random_state=self.random_state) if self.scatter: X_future = _client.scatter(X) @@ -769,30 +786,30 @@ def objective_function(pipeline_individual, X_future = X y_future = y - val_objective_function_list = [lambda ind, - X, - y, + val_objective_function_list = [lambda ind, + X, + y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + scorers= self._scorers, + cv=self.cv_gen, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: objective_function_generator( ind, X, - y, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + objective_kwargs = {"X": X_future, "y": y_future} val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, @@ -807,7 +824,7 @@ def objective_function(pipeline_individual, 
elif validation_strategy == 'split': - if self.scatter: + if self.scatter: X_future = _client.scatter(X) y_future = _client.scatter(y) X_val_future = _client.scatter(X_val) @@ -819,33 +836,33 @@ def objective_function(pipeline_individual, y_val_future = y_val objective_kwargs = {"X": X_future, "y": y_future, "X_val" : X_val_future, "y_val":y_val_future } - + best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - val_objective_function_list = [lambda ind, - X, - y, - X_val, - y_val, - scorers= self._scorers, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + val_objective_function_list = [lambda ind, + X, + y, + X_val, + y_val, + scorers= self._scorers, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: val_objective_function_generator( ind, X, y, - X_val, - y_val, - scorers= scorers, + X_val, + y_val, + scorers= scorers, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, val_objective_function_list, n_jobs=self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds,n_expected_columns=len(self.objective_names),client=_client, **objective_kwargs) @@ -857,25 +874,25 @@ def objective_function(pipeline_individual, else: self.objective_names_for_selection = self.objective_names - val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) + val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) weighted_scores = val_scores*self.objective_function_weights - + if self.bigger_is_better: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmax() else: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmin() - + best_individual = self.evaluated_individuals.loc[best_idx]['Individual'] self.selected_best_score = self.evaluated_individuals.loc[best_idx] - + best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, subset_column=self.subset_column) if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) else: - self.fitted_pipeline_ = best_individual_pipeline - + self.fitted_pipeline_ = best_individual_pipeline + self.fitted_pipeline_.fit(X_original,y_original) #TODO use y_original as well? @@ -885,7 +902,7 @@ def objective_function(pipeline_individual, cluster.close() return self - + def _estimator_has(attr): '''Check if we can delegate a method to the underlying estimator. 
First, we check the first fitted final estimator if available, otherwise we @@ -897,7 +914,7 @@ def _estimator_has(attr): - + @available_if(_estimator_has('predict')) @@ -907,21 +924,21 @@ def predict(self, X, **predict_params): preds = self.fitted_pipeline_.predict(X,**predict_params) if self.classification and self.label_encoder_: preds = self.label_encoder_.inverse_transform(preds) - + return preds - + @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.predict_proba(X,**predict_params) - + @available_if(_estimator_has('decision_function')) def decision_function(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.decision_function(X,**predict_params) - + @available_if(_estimator_has('transform')) def transform(self, X, **predict_params): check_is_fitted(self) @@ -931,7 +948,7 @@ def transform(self, X, **predict_params): @property def classes_(self): """The classes labels. Only exist if the last step is a classifier.""" - + if self.label_encoder_: return self.label_encoder_.classes_ else: @@ -953,7 +970,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) return self.evaluated_individuals - + @property def pareto_front(self): #check if _evolver_instance exists @@ -964,5 +981,3 @@ def pareto_front(self): return self.evaluated_individuals else: return self.evaluated_individuals[self.evaluated_individuals["Pareto_Front"]==1] - - From f7b29296820c88cfe37d2c4ccb7423389786dddf Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 14:40:13 -0700 Subject: [PATCH 29/43] cleaned up and defaulted random_state to None --- tpot2/config/classifiers.py | 34 ++++++------ tpot2/config/classifiers_sklearnex.py | 16 +++--- tpot2/config/hyperparametersuggestor.py | 4 +- tpot2/config/mdr_configs.py | 2 - tpot2/config/regressors.py | 72 ++++++++++++------------- tpot2/config/regressors_sklearnex.py | 18 +++---- tpot2/config/selectors.py | 10 ++-- tpot2/config/transformers.py | 8 +-- 8 files changed, 80 insertions(+), 84 deletions(-) diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index 88a320b9..06ed2507 100644 --- a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -18,7 +18,7 @@ -def params_LogisticRegression(trial, random_state, name=None): +def params_LogisticRegression(trial, random_state=None, name=None): params = {} params['solver'] = trial.suggest_categorical(name=f'solver_{name}', choices=[f'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) @@ -57,7 +57,7 @@ def params_KNeighborsClassifier(trial, name=None, n_samples=10): } -def params_DecisionTreeClassifier(trial, random_state, name=None): +def params_DecisionTreeClassifier(trial, random_state=None, name=None): return { 'criterion': trial.suggest_categorical(f'criterion_{name}', ['gini', 'entropy']), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), @@ -71,7 +71,7 @@ def params_DecisionTreeClassifier(trial, random_state, name=None): } -def params_SVC(trial, random_state, name=None): +def params_SVC(trial, random_state=None, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -86,7 +86,7 @@ def params_SVC(trial, random_state, name=None): } 
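Editor's note: after this cleanup the params_* config functions all share the same contract: they take a trial plus an optional random_state (now defaulting to None) and return a keyword dict for the estimator, passing random_state through unchanged. The self-contained sketch below illustrates that contract with a bare-bones stand-in for the Trial suggestor (the real class lives in tpot2/config/hyperparametersuggestor.py) and a made-up parameter space; none of the names below are TPOT2 API.

import numpy as np

class _StubTrial:
    """Minimal stand-in for the patch's Trial suggestor, just enough to drive a params_* function."""
    def __init__(self, rng_=None):
        self.rng = np.random.default_rng(rng_)
        self._params = {}

    def suggest_float(self, name, low, high, *, step=None, log=False):
        value = float(self.rng.uniform(low, high))  # log/step handling omitted in this sketch
        self._params[name] = value
        return value

    def suggest_int(self, name, low, high, step=1, log=False):
        value = int(self.rng.integers(low, high + 1))
        self._params[name] = value
        return value

    def suggest_categorical(self, name, choices):
        value = self.rng.choice(choices)
        self._params[name] = value
        return value

def params_example_estimator(trial, random_state=None, name=None):
    # Same shape as the params_* functions above: draw hyperparameters from the trial
    # and pass random_state straight through (illustrative parameter space only).
    return {
        'criterion': trial.suggest_categorical(f'criterion_{name}', ['gini', 'entropy']),
        'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11),
        'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0),
        'random_state': random_state,
    }

# The same trial seed reproduces the same suggested hyperparameters.
a = params_example_estimator(_StubTrial(rng_=0), random_state=0, name="dt")
b = params_example_estimator(_StubTrial(rng_=0), random_state=0, name="dt")
assert a == b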
-def params_LinearSVC(trial, random_state, name=None): +def params_LinearSVC(trial, random_state=None, name=None): penalty = trial.suggest_categorical(name=f'penalty_{name}', choices=['l1', 'l2']) if penalty == 'l1': @@ -108,7 +108,7 @@ def params_LinearSVC(trial, random_state, name=None): } -def params_RandomForestClassifier(trial, random_state, name=None): +def params_RandomForestClassifier(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=['gini', 'entropy']), @@ -117,12 +117,12 @@ def params_RandomForestClassifier(trial, random_state, name=None): 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 20), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20), 'n_jobs': 1, - 'random_state': random_state, + 'random_state': random_state } return params -def params_GradientBoostingClassifier(trial, random_state, n_classes=None, name=None): +def params_GradientBoostingClassifier(trial, random_state=None, n_classes=None, name=None): if n_classes is not None and n_classes > 2: loss = 'log_loss' @@ -139,12 +139,12 @@ def params_GradientBoostingClassifier(trial, random_state, n_classes=None, name= 'max_features': trial.suggest_float(f'max_features_{name}', 0.1, 1.0), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 10), 'tol': 1e-4, - 'random_state': random_state, + 'random_state': random_state } return params -def params_XGBClassifier(trial, random_state, name=None): +def params_XGBClassifier(trial, random_state=None, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.1, 1.0), @@ -154,11 +154,11 @@ def params_XGBClassifier(trial, random_state, name=None): 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), 'n_jobs': 1, #'use_label_encoder' : True, - 'random_state': random_state, + 'random_state': random_state } -def params_LGBMClassifier(trial, random_state, name=None): +def params_LGBMClassifier(trial, random_state=None, name=None): params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -177,7 +177,7 @@ def params_LGBMClassifier(trial, random_state, name=None): return params -def params_ExtraTreesClassifier(trial, random_state, name=None): +def params_ExtraTreesClassifier(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=["gini", "entropy"]), @@ -186,11 +186,11 @@ def params_ExtraTreesClassifier(trial, random_state, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21, step=1), 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'n_jobs': 1, - 'random_state': random_state, + 'random_state': random_state } return params -def params_SGDClassifier(trial, random_state, name=None): +def params_SGDClassifier(trial, random_state=None, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['log_loss', 'modified_huber',]), 'penalty': 'elasticnet', @@ -206,11 +206,11 @@ def params_SGDClassifier(trial, random_state, name=None): return params -def params_MLPClassifier_tpot(trial, random_state, name=None): +def params_MLPClassifier_tpot(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), - 'random_state': random_state, + 
'random_state': random_state } return params @@ -248,7 +248,7 @@ def params_MultinomialNB(trial, name=None): return params -def make_classifier_config_dictionary(random_state, n_samples=10, n_classes=None): +def make_classifier_config_dictionary(random_state=None, n_samples=10, n_classes=None): n_samples = min(n_samples,100) #TODO optimize this return { diff --git a/tpot2/config/classifiers_sklearnex.py b/tpot2/config/classifiers_sklearnex.py index fe7f213b..16983332 100644 --- a/tpot2/config/classifiers_sklearnex.py +++ b/tpot2/config/classifiers_sklearnex.py @@ -8,7 +8,7 @@ from functools import partial -def params_RandomForestClassifier(trial, random_state, name=None): +def params_RandomForestClassifier(trial, random_state=None, name=None): return { 'n_estimators': 100, 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), @@ -25,7 +25,7 @@ def params_KNeighborsClassifier(trial, name=None, n_samples=10): 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), } -def params_LogisticRegression(trial, random_state, name=None): +def params_LogisticRegression(trial, random_state=None, name=None): params = {} params['dual'] = False params['penalty'] = 'l2' @@ -42,10 +42,10 @@ def params_LogisticRegression(trial, random_state, name=None): 'dual': params['dual'], 'C': trial.suggest_float(f'C_{name}', 1e-4, 1e4, log=True), 'max_iter': 1000, - 'random_state': random_state, + 'random_state': random_state } -def params_SVC(trial, random_state, name=None): +def params_SVC(trial, random_state=None, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -54,10 +54,10 @@ def params_SVC(trial, random_state, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, - 'random_state': random_state, + 'random_state': random_state } -def params_NuSVC(trial, random_state, name=None): +def params_NuSVC(trial, random_state=None, name=None): return { 'nu': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), @@ -67,10 +67,10 @@ def params_NuSVC(trial, random_state, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, - 'random_state': random_state, + 'random_state': random_state } -def make_sklearnex_classifier_config_dictionary(random_state, n_samples=10, n_classes=None): +def make_sklearnex_classifier_config_dictionary(random_state=None, n_samples=10, n_classes=None): return { RandomForestClassifier: partial(params_RandomForestClassifier, random_state=random_state), KNeighborsClassifier: partial(params_KNeighborsClassifier, n_samples=n_samples), diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index 55786369..1d3ad1f0 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -3,7 +3,7 @@ import numpy as np #function that selects selects items from a list with each having independent probability p of being selected -def select(items, p, rng_): +def select(items, p, rng_=None): rng = np.random.default_rng(rng_) selected = [item for item in items if rng.random() < p] @@ -15,7 +15,7 @@ def select(items, p, rng_): class Trial(): - def __init__(self, rng_, old_params=None, alpha=1, hyperparameter_probability=1): + def __init__(self, rng_=None, old_params=None, alpha=1, hyperparameter_probability=1): self.rng = 
np.random.default_rng(rng_) self._params = dict() diff --git a/tpot2/config/mdr_configs.py b/tpot2/config/mdr_configs.py index 9634b534..1fe7cc7a 100644 --- a/tpot2/config/mdr_configs.py +++ b/tpot2/config/mdr_configs.py @@ -2,8 +2,6 @@ from skrebate import ReliefF, SURF, SURFstar, MultiSURF from functools import partial -import numpy as np - #MDR def params_MDR(trial, name=None): return { diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 7c08a28f..4cc031ae 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -18,8 +18,6 @@ from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import ElasticNetCV -import numpy as np - from xgboost import XGBRegressor from functools import partial @@ -29,19 +27,19 @@ #TODO: fill in remaining #TODO check for places were we could use log scaling -def params_RandomForestRegressor(trial, random_state, name=None): +def params_RandomForestRegressor(trial, random_state=None, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), - 'random_state': random_state, + 'random_state': random_state } # SGDRegressor parameters -def params_SGDRegressor(trial, random_state, name=None): +def params_SGDRegressor(trial, random_state=None, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['huber', 'squared_error', 'epsilon_insensitive', 'squared_epsilon_insensitive']), 'penalty': 'elasticnet', @@ -51,13 +49,13 @@ def params_SGDRegressor(trial, random_state, name=None): 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), - 'random_state': random_state, + 'random_state': random_state } return params # Ridge parameters -def params_Ridge(trial, random_state, name=None): +def params_Ridge(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -66,13 +64,13 @@ def params_Ridge(trial, random_state, name=None): #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'solver': trial.suggest_categorical(f'solver_{name}', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']), - 'random_state': random_state, + 'random_state': random_state } return params # Lasso parameters -def params_Lasso(trial, random_state, name=None): +def params_Lasso(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -84,21 +82,21 @@ def params_Lasso(trial, random_state, name=None): 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), - 'random_state': random_state, + 'random_state': random_state } return params # ElasticNet parameters -def params_ElasticNet(trial, random_state, name=None): +def params_ElasticNet(trial, random_state=None, name=None): params = { 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), - 'random_state': random_state, + 'random_state': random_state 
} return params # Lars parameters -def params_Lars(trial, random_state, name=None): +def params_Lars(trial, random_state=None, name=None): params = { 'fit_intercept': True, 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), @@ -110,7 +108,7 @@ def params_Lars(trial, random_state, name=None): 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'fit_path': trial.suggest_categorical(f'fit_path_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), - 'random_state': random_state, + 'random_state': random_state } return params @@ -142,7 +140,7 @@ def params_BayesianRidge(trial, name=None): return params # LassoLars parameters -def params_LassoLars(trial, random_state, name=None): +def params_LassoLars(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), # 'fit_intercept': True, @@ -152,7 +150,7 @@ def params_LassoLars(trial, random_state, name=None): 'eps': trial.suggest_float(f'eps_{name}', 1e-5, 1e-1, log=True), # 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), - 'random_state': random_state, + 'random_state': random_state } return params @@ -165,14 +163,14 @@ def params_LassoLarsCV(trial, cv, name=None): return params # BaggingRegressor parameters -def params_BaggingRegressor(trial, random_state, name=None): +def params_BaggingRegressor(trial, random_state=None, name=None): params = { 'n_estimators': trial.suggest_int(f'n_estimators_{name}', 10, 100), 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.00), 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'bootstrap_features': trial.suggest_categorical(f'bootstrap_features_{name}', [True, False]), - 'random_state': random_state, + 'random_state': random_state } return params @@ -196,14 +194,14 @@ def params_ARDRegression(trial, name=None): # TheilSenRegressor parameters -def params_TheilSenRegressor(trial, random_state, name=None): +def params_TheilSenRegressor(trial, random_state=None, name=None): params = { 'n_subsamples': trial.suggest_int(f'n_subsamples_{name}', 10, 100), 'max_subpopulation': trial.suggest_int(f'max_subpopulation_{name}', 100, 1000), 'fit_intercept': True, 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), - 'random_state': random_state, + 'random_state': random_state } return params @@ -220,7 +218,7 @@ def params_SVR(trial, name=None): return params # Perceptron parameters -def params_Perceptron(trial, random_state, name=None): +def params_Perceptron(trial, random_state=None, name=None): params = { 'penalty': trial.suggest_categorical(f'penalty_{name}', [None, 'l2', 'l1', 'elasticnet']), 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 1e-1, log=True), @@ -238,22 +236,22 @@ def params_Perceptron(trial, random_state, name=None): 'class_weight': trial.suggest_categorical(f'class_weight_{name}', [None, 'balanced']), 'warm_start': trial.suggest_categorical(f'warm_start_{name}', [True, False]), 'average': trial.suggest_categorical(f'average_{name}', [True, False]), - 'random_state': random_state, + 'random_state': random_state } return params -def params_MLPRegressor(trial, random_state, name=None): +def params_MLPRegressor(trial, random_state=None, name=None): params = { 'alpha': 
trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), - 'random_state': random_state, + 'random_state': random_state } return params #GradientBoostingRegressor parameters -def params_GradientBoostingRegressor(trial, random_state, name=None): +def params_GradientBoostingRegressor(trial, random_state=None, name=None): loss = trial.suggest_categorical(f'loss_{name}', ['ls', 'lad', 'huber', 'quantile']) params = { @@ -266,7 +264,7 @@ def params_GradientBoostingRegressor(trial, random_state, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), 'subsample': 1-trial.suggest_float(f'subsample_{name}', 0.05, 1.00, log=True), 'max_features': 1-trial.suggest_float(f'max_features_{name}', 0.05, 1.00, log=True), - 'random_state': random_state, + 'random_state': random_state } @@ -278,7 +276,7 @@ def params_GradientBoostingRegressor(trial, random_state, name=None): -def params_DecisionTreeRegressor(trial, random_state, name=None): +def params_DecisionTreeRegressor(trial, random_state=None, name=None): params = { 'max_depth': trial.suggest_int(f'max_depth_{name}', 1,11), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), @@ -287,7 +285,7 @@ def params_DecisionTreeRegressor(trial, random_state, name=None): # 'splitter': trial.suggest_categorical(f'splitter_{name}', ['best', 'random']), #'max_features': trial.suggest_categorical(f'max_features_{name}', [None, 'auto', 'sqrt', 'log2']), #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-1, 10.0), - 'random_state': random_state, + 'random_state': random_state } return params @@ -302,20 +300,20 @@ def params_KNeighborsRegressor(trial, name=None, n_samples=100): } return params -def params_LinearSVR(trial, random_state, name=None): +def params_LinearSVR(trial, random_state=None, name=None): params = { 'epsilon': trial.suggest_float(f'epsilon_{name}', 1e-4, 1.0, log=True), 'C': trial.suggest_float(f'C_{name}', 1e-4,25.0, log=True), 'dual': trial.suggest_categorical(f'dual_{name}', [True,False]), 'loss': trial.suggest_categorical(f'loss_{name}', ['epsilon_insensitive', 'squared_epsilon_insensitive']), - 'random_state': random_state, + 'random_state': random_state } return params # XGBRegressor parameters -def params_XGBRegressor(trial, random_state, name=None): +def params_XGBRegressor(trial, random_state=None, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), @@ -326,22 +324,22 @@ def params_XGBRegressor(trial, random_state, name=None): 'nthread': 1, 'verbosity': 0, 'objective': 'reg:squarederror', - 'random_state': random_state, + 'random_state': random_state } -def params_AdaBoostRegressor(trial, random_state, name=None): +def params_AdaBoostRegressor(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1.0, log=True), 'loss': trial.suggest_categorical(f'loss_{name}', ['linear', 'square', 'exponential']), - 'random_state': random_state, + 'random_state': random_state } return params # ExtraTreesRegressor parameters -def params_ExtraTreesRegressor(trial, random_state, name=None): +def params_ExtraTreesRegressor(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), @@ -365,13 +363,13 @@ def 
params_ExtraTreesRegressor(trial, random_state, name=None): #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-5, 1e-1, log=True), # 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), - 'random_state': random_state, + 'random_state': random_state } return params -def make_regressor_config_dictionary(random_state, cv, n_samples=10): +def make_regressor_config_dictionary(random_state=None, cv, n_samples=10): n_samples = min(n_samples,100) #TODO optimize this diff --git a/tpot2/config/regressors_sklearnex.py b/tpot2/config/regressors_sklearnex.py index fe102525..279d2dba 100644 --- a/tpot2/config/regressors_sklearnex.py +++ b/tpot2/config/regressors_sklearnex.py @@ -14,14 +14,14 @@ from functools import partial -def params_RandomForestRegressor(trial, random_state, name=None): +def params_RandomForestRegressor(trial, random_state=None, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), - 'random_state': random_state, + 'random_state': random_state } def params_KNeighborsRegressor(trial, name=None, n_samples=100): @@ -34,15 +34,15 @@ def params_KNeighborsRegressor(trial, name=None, n_samples=100): def params_LinearRegression(trial, name=None): return {} -def params_Ridge(trial, random_state, name=None): +def params_Ridge(trial, random_state=None, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), - 'random_state': random_state, + 'random_state': random_state } -def params_Lasso(trial, random_state, name=None): +def params_Lasso(trial, random_state=None, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -50,14 +50,14 @@ def params_Lasso(trial, random_state, name=None): 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), - 'random_state': random_state, + 'random_state': random_state } -def params_ElasticNet(trial, random_state, name=None): +def params_ElasticNet(trial, random_state=None, name=None): params = { 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), - 'random_state': random_state, + 'random_state': random_state } return params @@ -81,7 +81,7 @@ def params_NuSVR(trial, name=None): 'tol': 0.005, } -def make_sklearnex_regressor_config_dictionary(random_state, n_samples=10): +def make_sklearnex_regressor_config_dictionary(random_state=None, n_samples=10): return { RandomForestRegressor: partial(params_RandomForestRegressor, random_state=random_state), KNeighborsRegressor: params_KNeighborsRegressor, diff --git a/tpot2/config/selectors.py b/tpot2/config/selectors.py index 0e4b28ff..42589d83 100644 --- a/tpot2/config/selectors.py +++ b/tpot2/config/selectors.py @@ -32,7 +32,7 @@ def params_sklearn_feature_selection_VarianceThreshold(trial, name=None): #TODO add more estimator options? How will that interact with optuna? 
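# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): a minimal sketch of how one of these
# params_* functions is consumed now that random_state defaults to None. A config
# dictionary maps an estimator class to its params_* function, a Trial object (from
# tpot2/config/hyperparametersuggestor.py) answers the suggest_* calls, and the
# returned dict is unpacked into the estimator. The import paths and the bare
# Trial() construction are assumptions based on this patch series, not a verbatim
# TPOT2 API reference.
from sklearn.ensemble import ExtraTreesClassifier
from tpot2.config import hyperparametersuggestor as hps
from tpot2.config.classifiers import params_ExtraTreesClassifier

trial = hps.Trial()                                       # rng_ and old_params now default to None
params = params_ExtraTreesClassifier(trial, name="et_0")  # random_state defaults to None
est = ExtraTreesClassifier(**params)                      # sampled hyperparameters, no fixed seed
# ---------------------------------------------------------------------------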
-def params_sklearn_feature_selection_RFE(trial, random_state, name=None, classifier=True): +def params_sklearn_feature_selection_RFE(trial, random_state=None, name=None, classifier=True): if classifier: estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"RFE_{name}")) @@ -47,7 +47,7 @@ def params_sklearn_feature_selection_RFE(trial, random_state, name=None, classif return params -def params_sklearn_feature_selection_SelectFromModel(trial, random_state, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel(trial, random_state=None, name=None, classifier=True): if classifier: estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"SFM_{name}")) @@ -63,7 +63,7 @@ def params_sklearn_feature_selection_SelectFromModel(trial, random_state, name=N -def params_sklearn_feature_selection_RFE_wrapped(trial, random_state, name=None, classifier=True): +def params_sklearn_feature_selection_RFE_wrapped(trial, random_state=None, name=None, classifier=True): params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), @@ -79,7 +79,7 @@ def params_sklearn_feature_selection_RFE_wrapped(trial, random_state, name=None, return params -def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, random_state, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, random_state=None, name=None, classifier=True): params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), @@ -96,7 +96,7 @@ def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, random_state -def make_selector_config_dictionary(random_state, classifier=True): +def make_selector_config_dictionary(random_state=None, classifier=True): if classifier: params = {RFE_ExtraTreesClassifier : partial(params_sklearn_feature_selection_RFE_wrapped, random_state=random_state, classifier=classifier), SelectFromModel_ExtraTreesClassifier : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, random_state=random_state, classifier=classifier), diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 3f2fbb8a..94b154be 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -26,7 +26,7 @@ def params_sklearn_decomposition_FastICA(trial, random_state, name=None, n_featu 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), # number of components wrt number of features 'algorithm': trial.suggest_categorical(f'algorithm_{name}', ['parallel', 'deflation']), 'whiten':'unit-variance', - 'random_state': random_state, + 'random_state': random_state } def params_sklearn_cluster_FeatureAgglomeration(trial, name=None, n_features=100): @@ -52,7 +52,7 @@ def params_sklearn_kernel_approximation_Nystroem(trial, random_state, name=None, 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), 'kernel': trial.suggest_categorical(f'kernel_{name}', ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid']), 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), - 'random_state': random_state, + 'random_state': random_state } def params_sklearn_decomposition_PCA(trial, random_state, name=None, n_features=100): @@ -61,14 +61,14 @@ def params_sklearn_decomposition_PCA(trial, random_state, name=None, n_features= return { 'n_components': variance_explained, - 'random_state': random_state, + 'random_state': random_state } def 
params_sklearn_kernel_approximation_RBFSampler(trial, random_state, name=None, n_features=100): return { 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), - 'random_state': random_state, + 'random_state': random_state } def params_tpot_builtins_ZeroCount(trial, name=None): From fe4601cc70c62d5fd6dde4492a8523f2894030ee Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 14:41:31 -0700 Subject: [PATCH 30/43] cleaned up and defaulted random_state to None --- tpot2/config/transformers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 94b154be..ab17b3c8 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -21,7 +21,7 @@ def params_sklearn_preprocessing_Binarizer(trial, name=None): 'threshold': trial.suggest_float(f'threshold_{name}', 0.0, 1.0), } -def params_sklearn_decomposition_FastICA(trial, random_state, name=None, n_features=100): +def params_sklearn_decomposition_FastICA(trial, random_state=None, name=None, n_features=100): return { 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), # number of components wrt number of features 'algorithm': trial.suggest_categorical(f'algorithm_{name}', ['parallel', 'deflation']), @@ -47,7 +47,7 @@ def params_sklearn_preprocessing_Normalizer(trial, name=None): 'norm': trial.suggest_categorical(f'norm_{name}', ['l1', 'l2', 'max']), } -def params_sklearn_kernel_approximation_Nystroem(trial, random_state, name=None, n_features=100): +def params_sklearn_kernel_approximation_Nystroem(trial, random_state=None, name=None, n_features=100): return { 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), 'kernel': trial.suggest_categorical(f'kernel_{name}', ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid']), @@ -55,7 +55,7 @@ def params_sklearn_kernel_approximation_Nystroem(trial, random_state, name=None, 'random_state': random_state } -def params_sklearn_decomposition_PCA(trial, random_state, name=None, n_features=100): +def params_sklearn_decomposition_PCA(trial, random_state=None, name=None, n_features=100): # keep the number of components required to explain 'variance_explained' of the variance variance_explained = 1.0 - trial.suggest_float(f'n_components_{name}', 0.001, 0.5, log=True) #values closer to 1 are more likely @@ -64,7 +64,7 @@ def params_sklearn_decomposition_PCA(trial, random_state, name=None, n_features= 'random_state': random_state } -def params_sklearn_kernel_approximation_RBFSampler(trial, random_state, name=None, n_features=100): +def params_sklearn_kernel_approximation_RBFSampler(trial, random_state=None, name=None, n_features=100): return { 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), @@ -79,7 +79,7 @@ def params_tpot_builtins_OneHotEncoder(trial, name=None): return {} -def make_transformer_config_dictionary(random_state, n_features=10): +def make_transformer_config_dictionary(random_state=None, n_features=10): #n_features = min(n_features,100) #TODO optimize this return { Binarizer: params_sklearn_preprocessing_Binarizer, From 0061d0d691b8383f73445946ad3bc866dc2a5f60 Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:22:11 -0700 Subject: [PATCH 31/43] typo fix --- tpot2/config/regressors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 4cc031ae..ad7aa182 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -369,7 +369,7 @@ def params_ExtraTreesRegressor(trial, random_state=None, name=None): -def make_regressor_config_dictionary(random_state=None, cv, n_samples=10): +def make_regressor_config_dictionary(random_state=None, cv=None, n_samples=10): n_samples = min(n_samples,100) #TODO optimize this From a5413271dbaf5d7af7f80c9cb6686b8b5773d716 Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:22:25 -0700 Subject: [PATCH 32/43] cleaned up and defaulted rng_ to None --- tpot2/selectors/lexicase_selection.py | 2 +- tpot2/selectors/max_weighted_average_selector.py | 2 +- tpot2/selectors/nsgaii.py | 2 +- tpot2/selectors/random_selector.py | 2 +- tpot2/selectors/tournament_selection.py | 2 +- tpot2/selectors/tournament_selection_dominated.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tpot2/selectors/lexicase_selection.py b/tpot2/selectors/lexicase_selection.py index 54a8f7de..0afe1f34 100644 --- a/tpot2/selectors/lexicase_selection.py +++ b/tpot2/selectors/lexicase_selection.py @@ -1,6 +1,6 @@ import numpy as np -def lexicase_selection(scores, k, rng_, n_parents=1,): +def lexicase_selection(scores, k, rng_=None, n_parents=1,): """Select the best individual according to Lexicase Selection, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. diff --git a/tpot2/selectors/max_weighted_average_selector.py b/tpot2/selectors/max_weighted_average_selector.py index b8379c10..d142bafd 100644 --- a/tpot2/selectors/max_weighted_average_selector.py +++ b/tpot2/selectors/max_weighted_average_selector.py @@ -1,6 +1,6 @@ import numpy as np -def max_weighted_average_selector(scores,k, rng_, n_parents=1,): +def max_weighted_average_selector(scores,k, rng_=None, n_parents=1,): ave_scores = [np.nanmean(s ) for s in scores ] #TODO make this more efficient chosen = np.argsort(ave_scores)[::-1][0:k] #TODO check this behavior with nans return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/nsgaii.py b/tpot2/selectors/nsgaii.py index 670d86a4..bb7bf76d 100644 --- a/tpot2/selectors/nsgaii.py +++ b/tpot2/selectors/nsgaii.py @@ -87,7 +87,7 @@ def crowding_distance(matrix): -def survival_select_NSGA2(scores, k, rng_): +def survival_select_NSGA2(scores, k, rng_=None): pareto_fronts = nondominated_sorting(scores) diff --git a/tpot2/selectors/random_selector.py b/tpot2/selectors/random_selector.py index 3eff5c41..54b37978 100644 --- a/tpot2/selectors/random_selector.py +++ b/tpot2/selectors/random_selector.py @@ -1,6 +1,6 @@ import numpy as np -def random_selector(scores, k, rng_, n_parents=1, ): +def random_selector(scores, k, rng_=None, n_parents=1, ): rng = np.random.default_rng(rng_) chosen = rng.choice(list(range(0,len(scores))), size=k*n_parents) return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/tournament_selection.py b/tpot2/selectors/tournament_selection.py index fc8ea598..a715a9dd 100644 --- a/tpot2/selectors/tournament_selection.py +++ b/tpot2/selectors/tournament_selection.py @@ -1,6 +1,6 @@ import numpy as np -def tournament_selection(scores, k, rng_, n_parents=1, tournament_size=2, score_index=0): +def tournament_selection(scores, k, rng_=None, n_parents=1, tournament_size=2, 
score_index=0): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. diff --git a/tpot2/selectors/tournament_selection_dominated.py b/tpot2/selectors/tournament_selection_dominated.py index ea8bc7e9..74556894 100644 --- a/tpot2/selectors/tournament_selection_dominated.py +++ b/tpot2/selectors/tournament_selection_dominated.py @@ -3,7 +3,7 @@ from.nsgaii import nondominated_sorting, crowding_distance, dominates #based on deap -def tournament_selection_dominated(scores, k, rng_, n_parents=2): +def tournament_selection_dominated(scores, k, rng_=None, n_parents=2): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. From 9a75b3006830de571afb87bedd0da2c6953ba13d Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:22:52 -0700 Subject: [PATCH 33/43] cleaned up and defaulted random_state,cv to None --- tpot2/tpot_estimator/estimator_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index 0dd69ed7..07fe65ac 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -21,12 +21,12 @@ def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): except: return None -def get_configuration_dictionary(options, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None): +def get_configuration_dictionary(options, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): if options is None: return options if isinstance(options, dict): - return recursive_with_defaults(options, n_samples, n_features, classification, random_state, cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes) + return recursive_with_defaults(options, n_samples, n_features, classification, random_state=None, cv=None, subsets=subsets, feature_names=feature_names, n_classes=n_classes) if not isinstance(options, list): options = [options] @@ -86,7 +86,7 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, return config_dict -def recursive_with_defaults(config_dict, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None): +def recursive_with_defaults(config_dict, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): for key in 'leaf_config_dict', 'root_config_dict', 'inner_config_dict', 'Recursive': if key in config_dict: From 1153ddbc495ff6a687dff7dbd7de11913713b880 Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:23:19 -0700 Subject: [PATCH 34/43] cleaned up and defaulted rng_ to None --- tpot2/population.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tpot2/population.py b/tpot2/population.py index a80a399c..e8bf96dc 100644 --- a/tpot2/population.py +++ b/tpot2/population.py @@ -12,7 +12,7 @@ import pickle import dask -def mutate(individual, rng_): +def mutate(individual, 
rng_=None): rng = np.random.default_rng(rng_) if isinstance(individual, collections.abc.Iterable): for ind in individual: @@ -21,19 +21,19 @@ def mutate(individual, rng_): individual.mutate(rng_=rng) return individual -def crossover(parents, rng_): +def crossover(parents, rng_=None): rng = np.random.default_rng(rng_) parents[0].crossover(parents[1], rng_=rng) return parents[0] -def mutate_and_crossover(parents, rng_): +def mutate_and_crossover(parents, rng_=None): rng = np.random.default_rng(rng_) parents[0].crossover(parents[1], rng_=rng) parents[0].mutate(rng_=rng) parents[1].mutate(rng_=rng) return parents -def crossover_and_mutate(parents, rng_): +def crossover_and_mutate(parents, rng_=None): rng = np.random.default_rng(rng_) for p in parents: p.mutate(rng_=rng) @@ -90,7 +90,7 @@ def __init__( self, self.callback=callback self.population = [] - def survival_select(self, selector, weights, columns_names, n_survivors, rng_, inplace=True): + def survival_select(self, selector, weights, columns_names, n_survivors, rng_=None, inplace=True): rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights new_population_index = np.ravel(selector(weighted_scores, k=n_survivors, rng_=rng)) #TODO make it clear that we are concatenating scores... @@ -99,7 +99,7 @@ def survival_select(self, selector, weights, columns_names, n_survivors, rng_, i self.set_population(new_population, rng_=rng) return new_population - def parent_select(self, selector, weights, columns_names, k, n_parents, rng_): + def parent_select(self, selector, weights, columns_names, k, n_parents, rng_=None): rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights parents_index = selector(weighted_scores, k=k, n_parents=n_parents, rng_=rng) @@ -136,7 +136,7 @@ def remove_invalid_from_population(self, column_names, invalid_value = "INVALID" # returns a list of individuals added to the live population #TODO make keep repeats allow for previously evaluated individuals, #but make sure that the live population only includes one of each, no repeats - def add_to_population(self, individuals: typing.List[BaseIndividual], rng_, keep_repeats=False, mutate_until_unique=True): + def add_to_population(self, individuals: typing.List[BaseIndividual], rng_=None, keep_repeats=False, mutate_until_unique=True): ''' Add individuals to the live population. Add individuals to the evaluated_individuals if they are not already there. @@ -252,7 +252,7 @@ def get_unevaluated_individuals(self, column_names, individual_list=None): # return self.evaluated_individuals[~self.evaluated_individuals[column_names_to_check].isin(invalid_values).any(axis=1)] #the live population empied and is set to new_population - def set_population(self, new_population, rng_, keep_repeats=True): + def set_population(self, new_population, rng_=None, keep_repeats=True): ''' sets population to new population for selection? @@ -262,7 +262,7 @@ def set_population(self, new_population, rng_, keep_repeats=True): self.add_to_population(new_population, rng_=rng, keep_repeats=keep_repeats) #TODO should we just generate one offspring per crossover? 
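# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): the rng_=None defaults added to the
# selectors and to population.py lean on numpy's Generator construction rules.
# np.random.default_rng accepts None (a fresh, unseeded generator), an int seed
# (reproducible), or an existing Generator (returned unaltered), so callers may
# omit rng_ entirely or thread a single generator through every call.
import numpy as np

rng_unseeded = np.random.default_rng()           # the rng_=None case
rng_seeded = np.random.default_rng(42)           # reproducible runs
rng_threaded = np.random.default_rng(rng_seeded)
assert rng_threaded is rng_seeded                # an existing Generator is passed through unchanged
# ---------------------------------------------------------------------------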
- def create_offspring(self, parents_list, var_op_list, rng_, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): + def create_offspring(self, parents_list, var_op_list, rng_=None, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): ''' parents_list: a list of lists of parents. var_op_list: a list of var_ops to apply to each list of parents. Should be the same length as parents_list. @@ -320,7 +320,7 @@ def create_offspring(self, parents_list, var_op_list, rng_, add_to_population=Tr #TODO should we just generate one offspring per crossover? - def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, rng_, add_to_population=True, keep_repeats=False, mutate_until_unique=True): + def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, rng_=None, add_to_population=True, keep_repeats=False, mutate_until_unique=True): rng = np.random.default_rng(rng_) new_offspring = [] @@ -395,7 +395,7 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati def get_id(individual): return individual.unique_id() -def parallel_create_offspring(parents_list, var_op_list, rng_, n_jobs=1): +def parallel_create_offspring(parents_list, var_op_list, rng_=None, n_jobs=1): rng = np.random.default_rng(rng_) if n_jobs == 1: return nonparallel_create_offpring(parents_list, var_op_list, rng_=rng) @@ -411,7 +411,7 @@ def parallel_create_offspring(parents_list, var_op_list, rng_, n_jobs=1): num_workers=n_jobs, threads_per_worker=1) return offspring -def nonparallel_create_offpring(parents_list, var_op_list, rng_, n_jobs=1): +def nonparallel_create_offpring(parents_list, var_op_list, rng_=None, n_jobs=1): rng = np.random.default_rng(rng_) offspring = [] for parents, var_op in zip(parents_list,var_op_list): @@ -425,7 +425,7 @@ def nonparallel_create_offpring(parents_list, var_op_list, rng_, n_jobs=1): -def copy_and_change(parents, var_op, rng_): +def copy_and_change(parents, var_op, rng_=None): rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) offspring = var_op(offspring, rng_=rng) @@ -433,7 +433,7 @@ def copy_and_change(parents, var_op, rng_): offspring = offspring[0] return offspring -def copy_and_mutate(parents, var_op, rng_): +def copy_and_mutate(parents, var_op, rng_=None): rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) var_op(offspring, rng_=rng) @@ -441,7 +441,7 @@ def copy_and_mutate(parents, var_op, rng_): offspring = offspring[0] return offspring -def copy_and_crossover(parents, var_op, rng_): +def copy_and_crossover(parents, var_op, rng_=None): rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) var_op(offspring[0],offspring[1], rng_=rng) From 4f3a454c734aeb590a75862299719fcbe63c6f22 Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:24:30 -0700 Subject: [PATCH 35/43] cleaned up and defaulted rng_ to None --- .../graph_pipeline_individual/graph_utils/graph_utils.py | 4 ++-- tpot2/individual_representations/individual.py | 6 +++--- .../subset_selector/subsetselector.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py index 87d8a739..1956d49d 100644 --- 
a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py +++ b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py @@ -55,7 +55,7 @@ def invert_dictionary(d): return inv_map -def select_nodes_same_depth(g1, node1, g2, node2, rng_): +def select_nodes_same_depth(g1, node1, g2, node2, rng_=None): rng = np.random.default_rng(rng_) g1_nodes = nx.shortest_path_length(g1, source=node1) @@ -86,7 +86,7 @@ def select_nodes_same_depth(g1, node1, g2, node2, rng_): for p in possible_pairs: yield p[0], p[1] -def select_nodes_randomly(g1, g2, rng_): +def select_nodes_randomly(g1, g2, rng_=None): rng = np.random.default_rng(rng_) sorted_self_nodes_list = list(g1.nodes) diff --git a/tpot2/individual_representations/individual.py b/tpot2/individual_representations/individual.py index 47169cb0..be61fdcb 100644 --- a/tpot2/individual_representations/individual.py +++ b/tpot2/individual_representations/individual.py @@ -13,7 +13,7 @@ def __init__(self) -> None: self.mutation_list = [] self.crossover_list = [] - def mutate(self, rng_): + def mutate(self, rng_=None): rng = np.random.default_rng(rng_) mutation_list_copy = self.mutation_list.copy() rng.shuffle(mutation_list_copy) @@ -22,7 +22,7 @@ def mutate(self, rng_): return True return False - def crossover(self, ind2, rng_): + def crossover(self, ind2, rng_=None): rng = np.random.default_rng(rng_) crossover_list_copy = self.crossover_list.copy() rng.shuffle(crossover_list_copy) @@ -32,7 +32,7 @@ def crossover(self, ind2, rng_): return False # a guided change of an individual when given an objective function - def optimize(self, rng_, objective_function, steps=5): + def optimize(self, objective_function, rng_=None , steps=5): rng = np.random.default_rng(rng_) for _ in range(steps): self.mutate(rng_=rng) diff --git a/tpot2/individual_representations/subset_selector/subsetselector.py b/tpot2/individual_representations/subset_selector/subsetselector.py index 7cddbccf..5dc1d8af 100644 --- a/tpot2/individual_representations/subset_selector/subsetselector.py +++ b/tpot2/individual_representations/subset_selector/subsetselector.py @@ -6,7 +6,7 @@ class SubsetSelector(BaseIndividual): def __init__( self, values, - rng_, + rng_=None, initial_set = None, k=1, #step size for shuffling ): @@ -29,7 +29,7 @@ def __init__( self, self.mutation_list = [self._mutate_add, self._mutate_remove] self.crossover_list = [self._crossover_swap] - def _mutate_add(self, rng_): + def _mutate_add(self, rng_=None): rng = np.random.default_rng(rng_) not_included = list(self.values.difference(self.subsets)) if len(not_included) > 1: @@ -38,12 +38,12 @@ def _mutate_add(self, rng_): else: return False - def _mutate_remove(self, rng_): + def _mutate_remove(self, rng_=None): rng = np.random.default_rng(rng_) if len(self.subsets) > 1: self.subsets = self.subsets - set(rng.choice(list(self.subsets), k=min(self.k, len(self.subsets)-1) )) - def _crossover_swap(self, ss2, rng_): + def _crossover_swap(self, ss2, rng_=None): rng = np.random.default_rng(rng_) diffs = self.subsets.symmetric_difference(ss2.subsets) From a55e318664703b50d97567db6c30071cbd693f0c Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:25:10 -0700 Subject: [PATCH 36/43] cleaned up and defaulted rng_ to None --- .../graph_pipeline_individual/individual.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py 
b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 724034ea..8ea3e0f2 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -106,7 +106,6 @@ class GraphIndividual(BaseIndividual): ''' def __init__( self, - rng_, root_config_dict, inner_config_dict=None, leaf_config_dict=None, @@ -123,6 +122,7 @@ def __init__( unique_subset_values = None, initial_subset_values = None, + rng_=None, ): self.__debug = False @@ -242,7 +242,7 @@ def select_config_dict(self, node): return self.inner_config_dict - def initialize_all_nodes(self, rng_): + def initialize_all_nodes(self, rng_=None): rng = np.random.default_rng(rng_) for node in self.graph: if isinstance(node,GraphIndividual): @@ -253,7 +253,7 @@ def initialize_all_nodes(self, rng_): get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - def fix_noncompliant_leafs(self, rng_): + def fix_noncompliant_leafs(self, rng_=None): rng = np.random.default_rng(rng_) leafs = [node for node in self.graph.nodes if len(list(self.graph.successors(node)))==0] compliant_leafs = [] @@ -517,13 +517,13 @@ def plot(self): ############# #TODO currently does not correctly return false when adding a leaf causes a duplicate node that is later merged - def mutate(self, rng_): + def mutate(self, rng_=None): rng = np.random.default_rng(rng_) self.key = None graph = self.select_graphindividual(rng_=rng) return graph._mutate(rng_=rng) - def _mutate(self, rng_): + def _mutate(self, rng_=None): rng = np.random.default_rng(rng_) rng.shuffle(self.mutate_methods_list) for mutate_method in self.mutate_methods_list: @@ -553,13 +553,13 @@ def _mutate(self, rng_): return False - def _mutate_row_subsets(self, rng_): + def _mutate_row_subsets(self, rng_=None): rng = np.random.default_rng(rng_) if self.unique_subset_values is not None: self.row_subset_selector.mutate(rng_=rng) - def _mutate_hyperparameters(self, rng_): + def _mutate_hyperparameters(self, rng_=None): ''' Mutates the hyperparameters for a randomly chosen node in the graph. ''' @@ -584,7 +584,7 @@ def _mutate_hyperparameters(self, rng_): - def _mutate_replace_node(self, rng_): + def _mutate_replace_node(self, rng_=None): ''' Replaces the method in a randomly chosen node by a method from the available methods for that node. @@ -613,7 +613,7 @@ def _mutate_replace_node(self, rng_): return False - def _mutate_remove_node(self, rng_): + def _mutate_remove_node(self, rng_=None): ''' Removes a randomly chosen node and connects its parents to its children. If the node is the only leaf for an inner node and 'leaf_config_dict' is not none, we do not remove it. @@ -647,7 +647,7 @@ def _mutate_remove_node(self, rng_): return False - def _mutate_remove_edge(self, rng_): + def _mutate_remove_edge(self, rng_=None): ''' Deletes an edge as long as deleting that edge does not make the graph disconnected. ''' @@ -667,7 +667,7 @@ def _mutate_remove_edge(self, rng_): return True return False - def _mutate_add_edge(self, rng_): + def _mutate_add_edge(self, rng_=None): ''' Randomly add an edge from a node to another node that is not an ancestor of the first node. 
''' @@ -691,7 +691,7 @@ def _mutate_add_edge(self, rng_): return False - def _mutate_insert_leaf(self, rng_): + def _mutate_insert_leaf(self, rng_=None): rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) @@ -718,7 +718,7 @@ def _mutate_insert_leaf(self, rng_): return False - def _mutate_insert_bypass_node(self, rng_): + def _mutate_insert_bypass_node(self, rng_=None): rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) @@ -742,7 +742,7 @@ def _mutate_insert_bypass_node(self, rng_): return False - def _mutate_insert_inner_node(self, rng_): + def _mutate_insert_inner_node(self, rng_=None): rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) @@ -794,7 +794,7 @@ def _get_graphs(self, depth=1): return graphs - def select_graphindividual(self, rng_): + def select_graphindividual(self, rng_=None): rng = np.random.default_rng(rng_) graphs = self.get_graphs() weights = [g.graph.number_of_nodes() for g in graphs] @@ -803,7 +803,7 @@ def select_graphindividual(self, rng_): return rng.choice(graphs, p=weights) - def select_graph_same_recursive_depth(self,ind1,ind2,rng_): + def select_graph_same_recursive_depth(self,ind1,ind2,rng_=None): rng = np.random.default_rng(rng_) graphs1 = ind1.get_graphs() @@ -825,7 +825,7 @@ def select_graph_same_recursive_depth(self,ind1,ind2,rng_): return ind1,ind2 - def crossover(self, ind2, rng_): + def crossover(self, ind2, rng_=None): ''' self is the first individual, ind2 is the second individual If crossover_same_depth, it will select graphindividuals at the same recursive depth. @@ -849,7 +849,7 @@ def crossover(self, ind2, rng_): return g1._crossover(g2, rng_=rng) - def _crossover(self, Graph, rng_): + def _crossover(self, Graph, rng_=None): rng = np.random.default_rng(rng_) rng.shuffle(self.crossover_methods_list) @@ -868,13 +868,13 @@ def _crossover(self, Graph, rng_): return False - def _crossover_row_subsets(self, G2, rng_): + def _crossover_row_subsets(self, G2, rng_=None): rng = np.random.default_rng(rng_) if self.unique_subset_values is not None and G2.unique_subset_values is not None: self.row_subset_selector.crossover(G2.row_subset_selector, rng_=rng) - def _crossover_swap_node(self, G2, rng_): + def _crossover_swap_node(self, G2, rng_=None): ''' Swaps randomly chosen node from Parent1 with a randomly chosen node from Parent2. ''' @@ -910,7 +910,7 @@ def _crossover_swap_node(self, G2, rng_): - def _crossover_swap_branch(self, G2, rng_): + def _crossover_swap_branch(self, G2, rng_=None): ''' swaps a branch from parent1 with a branch from parent2. does not modify parent2 ''' @@ -961,7 +961,7 @@ def _crossover_swap_branch(self, G2, rng_): return False #TODO: Currently returns true even if hyperparameters are blank - def _crossover_hyperparameters(self, G2, rng_): + def _crossover_hyperparameters(self, G2, rng_=None): ''' Swaps the hyperparamters of one randomly chosen node in Parent1 with the hyperparameters of randnomly chosen node in Parent2. 
''' @@ -986,7 +986,7 @@ def _crossover_hyperparameters(self, G2, rng_): #not including the nodes, just their children #Finds leaves attached to nodes and swaps them - def _crossover_swap_leaf_at_node(self, G2, rng_): + def _crossover_swap_leaf_at_node(self, G2, rng_=None): rng = np.random.default_rng(rng_) if self.crossover_same_depth: @@ -1019,7 +1019,7 @@ def _crossover_swap_leaf_at_node(self, G2, rng_): return success - def _crossover_take_branch(self, G2, rng_): + def _crossover_take_branch(self, G2, rng_=None): ''' Takes a subgraph from Parent2 and add it to a randomly chosen node in Parent1. ''' @@ -1065,7 +1065,7 @@ def _crossover_take_branch(self, G2, rng_): return False #TODO: swap all leaf nodes - def _crossover_swap_all_leafs(self, G2, rng_): + def _crossover_swap_all_leafs(self, G2, rng_=None): pass @@ -1174,7 +1174,7 @@ def full_node_list(self): -def create_node(config_dict, rng_): +def create_node(config_dict, rng_=None): ''' Takes a config_dict and returns a node with a random method_class and hyperparameters ''' @@ -1195,7 +1195,7 @@ def create_node(config_dict, rng_): return node -def random_weighted_sort(l,weights, rng_): +def random_weighted_sort(l,weights, rng_=None): rng = np.random.default_rng(rng_) sorted_l = [] indeces = {i: weights[i] for i in range(len(l))} From e86a6d86d5f5da93490e73ac76844d61f491e621 Mon Sep 17 00:00:00 2001 From: Jose Date: Tue, 31 Oct 2023 15:25:23 -0700 Subject: [PATCH 37/43] removed unused import --- .../graph_pipeline_individual/templates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index e0599601..c08d047a 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -3,7 +3,6 @@ import tpot2 import networkx as nx from tpot2.individual_representations.graph_pipeline_individual import GraphIndividual -import random from tpot2.individual_representations.graph_pipeline_individual.individual import create_node From 336485b6fcfaa4532730e32980a8dbdf4083e3cc Mon Sep 17 00:00:00 2001 From: Jose Date: Wed, 1 Nov 2023 11:43:42 -0700 Subject: [PATCH 38/43] updated estimator_graph_individual_generator to generate random pipelines with user specified sizes (if possible) --- .../graph_pipeline_individual/templates.py | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index c08d047a..51ca2b2b 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -7,6 +7,7 @@ from tpot2.individual_representations.graph_pipeline_individual.individual import create_node +# will randomly generate individuals (no predefined order) def estimator_graph_individual_generator( root_config_dict, inner_config_dict=None, @@ -22,47 +23,46 @@ def estimator_graph_individual_generator( rng = np.random.default_rng(rng_) - n_nodes = 0 while True: - if n_nodes < max_size: - n_nodes += 1 - - for k in root_config_dict.keys(): - - graph = nx.DiGraph() - root = create_node(config_dict={k:root_config_dict[k]}, rng_=rng) - graph.add_node(root) - - ind = GraphIndividual( rng_=rng, - inner_config_dict=inner_config_dict, - leaf_config_dict=leaf_config_dict, 
- root_config_dict=root_config_dict, - initial_graph = graph, - - max_size = max_size, - linear_pipeline = linear_pipeline, - hyperparameter_probability = hyperparameter_probability, - hyper_node_probability = hyper_node_probability, - hyperparameter_alpha = hyperparameter_alpha, - - **kwargs, - ) - - starting_ops = [] - if inner_config_dict is not None: - starting_ops.append(ind._mutate_insert_inner_node) - if leaf_config_dict is not None: - starting_ops.append(ind._mutate_insert_leaf) - - if len(starting_ops) > 0: - if n_nodes > 0: - for _ in range(rng.integers(0,min(n_nodes,3))): - func = rng.choice(starting_ops) - func(rng_=rng) - - - yield ind + # if user specified limit, grab a random number between that limit + if max_size is not np.inf: + n_nodes = rng.integers(0,max_size) + # else, grab random number between 0,10 (theaksaini) + else: + n_nodes = rng.integers(0,10) + + graph = nx.DiGraph() + root = create_node(config_dict=root_config_dict, rng_=rng) # grab random root model method + graph.add_node(root) + + ind = GraphIndividual( rng_=rng, + inner_config_dict=inner_config_dict, + leaf_config_dict=leaf_config_dict, + root_config_dict=root_config_dict, + initial_graph = graph, + + max_size = max_size, + linear_pipeline = linear_pipeline, + hyperparameter_probability = hyperparameter_probability, + hyper_node_probability = hyper_node_probability, + hyperparameter_alpha = hyperparameter_alpha, + + **kwargs, + ) + + starting_ops = [] + if inner_config_dict is not None: + starting_ops.append(ind._mutate_insert_inner_node) + if leaf_config_dict is not None: + starting_ops.append(ind._mutate_insert_leaf) + + if len(starting_ops) > 0: + for _ in range(n_nodes-1): + func = rng.choice(starting_ops) + func(rng_=rng) + + yield ind class BaggingCompositeGraphSklearn(): From 18ce1563f20518807b5008b4d33353c8cf145da2 Mon Sep 17 00:00:00 2001 From: Jose Date: Wed, 1 Nov 2023 13:38:01 -0700 Subject: [PATCH 39/43] fixed range to get correct number of nodes --- .../graph_pipeline_individual/templates.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index 51ca2b2b..cd8015cc 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -27,10 +27,10 @@ def estimator_graph_individual_generator( # if user specified limit, grab a random number between that limit if max_size is not np.inf: - n_nodes = rng.integers(0,max_size) - # else, grab random number between 0,10 (theaksaini) + n_nodes = rng.integers(1,max_size+1) + # else, grab random number between 1,11 (theaksaini) else: - n_nodes = rng.integers(0,10) + n_nodes = rng.integers(1,11) graph = nx.DiGraph() root = create_node(config_dict=root_config_dict, rng_=rng) # grab random root model method @@ -56,6 +56,7 @@ def estimator_graph_individual_generator( starting_ops.append(ind._mutate_insert_inner_node) if leaf_config_dict is not None: starting_ops.append(ind._mutate_insert_leaf) + n_nodes -= 1 if len(starting_ops) > 0: for _ in range(n_nodes-1): From 51ba13232119e52e3131c40db936f3828b751503 Mon Sep 17 00:00:00 2001 From: Jose Date: Thu, 2 Nov 2023 11:04:47 -0700 Subject: [PATCH 40/43] fix to allow mutations to actually occur to an individual --- tpot2/population.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpot2/population.py b/tpot2/population.py index 
e8bf96dc..a3a0c54c 100644 --- a/tpot2/population.py +++ b/tpot2/population.py @@ -330,9 +330,9 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class - if var_op == "mutation": + if var_op == "mutate": mutation_op = rng.choice(mutation_functions, p=mutation_function_weights) - all_offspring.append(copy_and_mutate(parents, mutation_op, rng_=rng)) + all_offspring.append(copy_and_mutate(parents[0], mutation_op, rng_=rng)) chosen_ops.append(mutation_op.__name__) From 2a8510a77a9f3fc82f41fb6d95160cfdf3846598 Mon Sep 17 00:00:00 2001 From: perib Date: Thu, 2 Nov 2023 17:41:38 -0700 Subject: [PATCH 41/43] bug fixes --- .../graph_pipeline_individual/individual.py | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 8ea3e0f2..bc2a883a 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -209,6 +209,7 @@ def __init__( self._crossover_swap_branch, ] + if self.inner_config_dict is not None: self.mutate_methods_list.append(self._mutate_insert_inner_node) self.crossover_methods_list.append(self._crossover_take_branch) #this is the only crossover method that can create inner nodes @@ -217,7 +218,7 @@ def __init__( self.mutate_methods_list.append(self._mutate_remove_edge) self.mutate_methods_list.append(self._mutate_add_edge) - if not linear_pipeline: + if not linear_pipeline and (self.leaf_config_dict is not None or self.inner_config_dict is not None): self.mutate_methods_list.append(self._mutate_insert_leaf) @@ -595,20 +596,12 @@ def _mutate_replace_node(self, rng_=None): for node in sorted_nodes_list: if isinstance(node,GraphIndividual): continue - node.method_class = rng.choice(list(self.select_config_dict(node).keys())) - if isinstance(self.select_config_dict(node)[node.method_class], dict): - hyperparameters = self.select_config_dict(node)[node.method_class] - node.hyperparameters = hyperparameters - else: - #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) - #get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=None, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - new_node = create_node(self.select_config_dict(node), rng_=rng) - #TODO cleanup - node.hyperparameters = new_node.hyperparameters - node.method_class = new_node.method_class - node.label = new_node.label - - return True + new_node = create_node(self.select_config_dict(node), rng_=rng) + #check if new node and old node are the same + #TODO: add attempts? 
+ if node.method_class != new_node.method_class or node.hyperparameters != new_node.hyperparameters: + nx.relabel_nodes(self.graph, {new_node:node}, copy=False) + return True return False From a0095e52e2d8d20a13459a0d54d1fe9162891ea9 Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 15 Nov 2023 16:28:04 -0800 Subject: [PATCH 42/43] preprocessing fix --- tpot2/tpot_estimator/estimator.py | 44 ++++++++++++------- .../tpot_estimator/steady_state_estimator.py | 44 ++++++++++++------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index c534a7c1..060539c7 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -619,23 +619,35 @@ def fit(self, X, y): if self.preprocessing: #X = pd.DataFrame(X) - #TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline - if isinstance(X, pd.DataFrame): #pandas dataframe - if self.categorical_features is not None: - X[self.categorical_features] = X[self.categorical_features].astype(object) - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns - tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns - tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns - X = self._preprocessing_pipeline.fit_transform(X) - else: - if self.categorical_features is not None: #numpy array and categorical columns specified - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns - tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns - tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns - else: #numpy array and no categorical columns specified, just do imputation - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) - + if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator): + self._preprocessing_pipeline = self.preprocessing + #TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. 
+            else: #if self.preprocessing is True or not a sklearn estimator
+
+                pipeline_steps = []
+
+                if self.categorical_features is not None: #if categorical features are specified, use those
+                    pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
+                    pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
+                    pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
+
+                else:
+                    if isinstance(X, pd.DataFrame):
+                        categorical_columns = X.select_dtypes(include=['object']).columns
+                        if len(categorical_columns) > 0:
+                            pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
+                            pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
+                            pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
+                        else:
+                            pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
+                    else:
+                        pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
+
+                self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)
+
+            X = self._preprocessing_pipeline.fit_transform(X, y)
+
         else:
             self._preprocessing_pipeline = None
 
diff --git a/tpot2/tpot_estimator/steady_state_estimator.py b/tpot2/tpot_estimator/steady_state_estimator.py
index 0f48c827..240b3a86 100644
--- a/tpot2/tpot_estimator/steady_state_estimator.py
+++ b/tpot2/tpot_estimator/steady_state_estimator.py
@@ -605,23 +605,35 @@ def fit(self, X, y):
 
         if self.preprocessing:
             #X = pd.DataFrame(X)
 
-            #TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
-            if isinstance(X, pd.DataFrame): #pandas dataframe
-                if self.categorical_features is not None:
-                    X[self.categorical_features] = X[self.categorical_features].astype(object)
-                self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns
-                                                                              tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns
-                                                                              tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns
-                X = self._preprocessing_pipeline.fit_transform(X)
-            else:
-                if self.categorical_features is not None: #numpy array and categorical columns specified
-                    self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns
-                                                                                  tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns
-                                                                                  tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns
-                else: #numpy array and no categorical columns specified, just do imputation
-                    self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))
-
+            if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator):
+                self._preprocessing_pipeline = self.preprocessing
+            #TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
+            else: #if self.preprocessing is True or not a sklearn estimator
+
+                pipeline_steps = []
+
+                if self.categorical_features is not None: #if categorical features are specified, use those
+                    pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
+                    pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
+                    pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
+
+                else:
+                    if isinstance(X, pd.DataFrame):
+                        categorical_columns = X.select_dtypes(include=['object']).columns
+                        if len(categorical_columns) > 0:
+                            pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
+                            pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
+                            pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
+                        else:
+                            pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
+                    else:
+                        pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
+
+                self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)
+
+            X = self._preprocessing_pipeline.fit_transform(X, y)
+
         else:
             self._preprocessing_pipeline = None
 
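Aside (editorial, not part of the patch): with this change, fit() accepts a user-supplied scikit-learn transformer or pipeline through the preprocessing parameter and otherwise assembles a default imputation plus one-hot-encoding pipeline from named steps. Two details are worth flagging, as observations rather than anything verified against the TPOT2 builtins: scikit-learn rejects duplicate step names when a Pipeline is validated, so reusing "impute_categorical" for both the categorical imputer and the ColumnOneHotEncoder would be refused, and the encoder in the code being removed took min_frequency=0.0001 rather than strategy='most_frequent'. A sketch of the default branch with those two points adjusted follows; the step name "onehot_categorical" and the variable names are illustrative choices, and the transformer arguments are copied from the code this patch removes.

# Sketch only -- not a verbatim copy of the added lines above.
import sklearn.pipeline
import tpot2

pipeline_steps = [
    ("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')),
    ("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')),
    # Unique step name, and the min_frequency argument used by the replaced code.
    ("onehot_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)),
]
preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)
# preprocessing_pipeline.fit_transform(X, y) would then be called as in the hunks above.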
From 8eec35226399a9e4042d5c5f0d30295b0a79f3bc Mon Sep 17 00:00:00 2001
From: nickotto
Date: Thu, 16 Nov 2023 14:57:00 -0800
Subject: [PATCH 43/43] Update _version.py

---
 tpot2/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpot2/_version.py b/tpot2/_version.py
index dccccc2b..9ab09b40 100644
--- a/tpot2/_version.py
+++ b/tpot2/_version.py
@@ -1 +1 @@
-__version__ = '0.1.2-alpha'
+__version__ = '0.1.4-alpha'