diff --git a/pysr/__init__.py b/pysr/__init__.py index f9cad605c..e303becb2 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -6,8 +6,8 @@ best_tex, best_callable, best_row, - install, ) +from .julia_helpers import install from .feynman_problems import Problem, FeynmanProblem from .export_jax import sympy2jax from .export_torch import sympy2torch diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py new file mode 100644 index 000000000..5e2035935 --- /dev/null +++ b/pysr/export_numpy.py @@ -0,0 +1,40 @@ +"""Code for exporting discovered expressions to numpy""" +import numpy as np +import pandas as pd +from sympy import lambdify +import warnings + + +class CallableEquation: + """Simple wrapper for numpy lambda functions built with sympy""" + + def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None): + self._sympy = eqn + self._sympy_symbols = sympy_symbols + self._selection = selection + self._variable_names = variable_names + + def __repr__(self): + return f"PySRFunction(X=>{self._sympy})" + + def __call__(self, X): + expected_shape = (X.shape[0],) + if isinstance(X, pd.DataFrame): + # Lambda function takes as argument: + return self._lambda( + **{k: X[k].values for k in self._variable_names} + ) * np.ones(expected_shape) + if self._selection is not None: + if X.shape[1] != len(self._selection): + warnings.warn( + "`X` should be of shape (n_samples, len(self._selection)). " + "Automatically filtering `X` to selection. " + "Note: Filtered `X` column order may not match column order in fit " + "this may lead to incorrect predictions and other errors." + ) + X = X[:, self._selection] + return self._lambda(*X.T) * np.ones(expected_shape) + + @property + def _lambda(self): + return lambdify(self._sympy_symbols, self._sympy) diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py new file mode 100644 index 000000000..c651700bc --- /dev/null +++ b/pysr/julia_helpers.py @@ -0,0 +1,126 @@ +"""Functions for initializing the Julia environment and installing deps.""" +import warnings +from pathlib import Path +import os + +from .version import __version__, __symbolic_regression_jl_version__ + + +def install(julia_project=None, quiet=False): # pragma: no cover + """ + Install PyCall.jl and all required dependencies for SymbolicRegression.jl. + + Also updates the local Julia registry. + """ + import julia + + julia.install(quiet=quiet) + + julia_project, is_shared = _get_julia_project(julia_project) + + Main = init_julia() + Main.eval("using Pkg") + + io = "devnull" if quiet else "stderr" + io_arg = f"io={io}" if is_julia_version_greater_eq(Main, "1.6") else "" + + # Can't pass IO to Julia call as it evaluates to PyObject, so just directly + # use Main.eval: + Main.eval( + f'Pkg.activate("{_escape_filename(julia_project)}", shared = Bool({int(is_shared)}), {io_arg})' + ) + if is_shared: + # Install SymbolicRegression.jl: + _add_sr_to_julia_project(Main, io_arg) + + Main.eval(f"Pkg.instantiate({io_arg})") + Main.eval(f"Pkg.precompile({io_arg})") + if not quiet: + warnings.warn( + "It is recommended to restart Python after installing PySR's dependencies," + " so that the Julia environment is properly initialized." + ) + + +def import_error_string(julia_project=None): + s = """ + Required dependencies are not installed or built. Run the following code in the Python REPL: + + >>> import pysr + >>> pysr.install() + """ + + if julia_project is not None: + s += f""" + Tried to activate project {julia_project} but failed.""" + + return s + + +def _get_julia_project(julia_project): + if julia_project is None: + is_shared = True + julia_project = f"pysr-{__version__}" + else: + is_shared = False + julia_project = Path(julia_project) + return julia_project, is_shared + + +def is_julia_version_greater_eq(Main, version="1.6"): + """Check if Julia version is greater than specified version.""" + return Main.eval(f'VERSION >= v"{version}"') + + +def init_julia(): + """Initialize julia binary, turning off compiled modules if needed.""" + from julia.core import JuliaInfo, UnsupportedPythonError + + try: + info = JuliaInfo.load(julia="julia") + except FileNotFoundError: + env_path = os.environ["PATH"] + raise FileNotFoundError( + f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", + ) + + if not info.is_pycall_built(): + raise ImportError(import_error_string()) + + Main = None + try: + from julia import Main as _Main + + Main = _Main + except UnsupportedPythonError: + # Static python binary, so we turn off pre-compiled modules. + from julia.core import Julia + + jl = Julia(compiled_modules=False) + from julia import Main as _Main + + Main = _Main + + return Main + + +def _add_sr_to_julia_project(Main, io_arg): + Main.sr_spec = Main.PackageSpec( + name="SymbolicRegression", + url="https://github.com/MilesCranmer/SymbolicRegression.jl", + rev="v" + __symbolic_regression_jl_version__, + ) + Main.eval(f"Pkg.add(sr_spec, {io_arg})") + Main.clustermanagers_spec = Main.PackageSpec( + name="ClusterManagers", + url="https://github.com/JuliaParallel/ClusterManagers.jl", + rev="14e7302f068794099344d5d93f71979aaf4fbeb3", + ) + Main.eval(f"Pkg.add(clustermanagers_spec, {io_arg})") + + +def _escape_filename(filename): + """Turns a file into a string representation with correctly escaped backslashes""" + str_repr = str(filename) + str_repr = str_repr.replace("\\", "\\\\") + return str_repr diff --git a/pysr/sr.py b/pysr/sr.py index 24a618f6d..3d20e484c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd import sympy -from sympy import sympify, lambdify +from sympy import sympify import re import tempfile import shutil @@ -11,63 +11,25 @@ from datetime import datetime import warnings from multiprocessing import cpu_count -from sklearn.base import BaseEstimator, RegressorMixin -from collections import OrderedDict -from hashlib import sha256 - -from .version import __version__, __symbolic_regression_jl_version__ +from sklearn.base import BaseEstimator, RegressorMixin, MultiOutputMixin +from sklearn.utils import check_array, check_consistent_length, check_random_state +from sklearn.utils.validation import ( + _check_feature_names_in, + check_is_fitted, +) + +from .julia_helpers import ( + init_julia, + _get_julia_project, + is_julia_version_greater_eq, + _escape_filename, + _add_sr_to_julia_project, + import_error_string, +) +from .export_numpy import CallableEquation from .deprecated import make_deprecated_kwargs_for_pysr_regressor -def install(julia_project=None, quiet=False): # pragma: no cover - """Install PyCall.jl and all required dependencies for SymbolicRegression.jl. - - Also updates the local Julia registry.""" - import julia - - julia.install(quiet=quiet) - - julia_project, is_shared = _get_julia_project(julia_project) - - Main = init_julia() - Main.eval("using Pkg") - - io = "devnull" if quiet else "stderr" - io_arg = f"io={io}" if is_julia_version_greater_eq(Main, "1.6") else "" - - # Can't pass IO to Julia call as it evaluates to PyObject, so just directly - # use Main.eval: - Main.eval( - f'Pkg.activate("{_escape_filename(julia_project)}", shared = Bool({int(is_shared)}), {io_arg})' - ) - if is_shared: - # Install SymbolicRegression.jl: - _add_sr_to_julia_project(Main, io_arg) - - Main.eval(f"Pkg.instantiate({io_arg})") - Main.eval(f"Pkg.precompile({io_arg})") - if not quiet: - warnings.warn( - "It is recommended to restart Python after installing PySR's dependencies," - " so that the Julia environment is properly initialized." - ) - - -def import_error_string(julia_project=None): - s = f""" - Required dependencies are not installed or built. Run the following code in the Python REPL: - - >>> import pysr - >>> pysr.install() - """ - - if julia_project is not None: - s += f""" - Tried to activate project {julia_project} but failed.""" - - return s - - Main = None # TODO: Rename to more descriptive name like "julia_runtime" already_ran = False @@ -114,15 +76,17 @@ def import_error_string(julia_project=None): def pysr(X, y, weights=None, **kwargs): # pragma: no cover warnings.warn( - "Calling `pysr` is deprecated. Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.", - DeprecationWarning, + "Calling `pysr` is deprecated. " + "Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.", + FutureWarning, ) model = PySRRegressor(**kwargs) model.fit(X, y, weights=weights) return model.equations -def _handle_constraints(binary_operators, unary_operators, constraints): +def _process_constraints(binary_operators, unary_operators, constraints): + constraints = constraints.copy() for op in unary_operators: if op not in constraints: constraints[op] = -1 @@ -132,7 +96,8 @@ def _handle_constraints(binary_operators, unary_operators, constraints): if op in ["plus", "sub", "+", "-"]: if constraints[op][0] != constraints[op][1]: raise NotImplementedError( - "You need equal constraints on both sides for - and +, due to simplification strategies." + "You need equal constraints on both sides for - and +, " + "due to simplification strategies." ) elif op in ["mult", "*"]: # Make sure the complex expression is in the left side. @@ -143,10 +108,13 @@ def _handle_constraints(binary_operators, unary_operators, constraints): constraints[op][1], constraints[op][0], ) + return constraints -def _create_inline_operators(binary_operators, unary_operators): +def _maybe_create_inline_operators(binary_operators, unary_operators): global Main + binary_operators = binary_operators.copy() + unary_operators = unary_operators.copy() for op_list in [binary_operators, unary_operators]: for i, op in enumerate(op_list): is_user_defined_operator = "(" in op @@ -162,33 +130,21 @@ def _create_inline_operators(binary_operators, unary_operators): if not re.match(r"^[a-zA-Z0-9_]+$", function_name): raise ValueError( f"Invalid function name {function_name}. " - "Only alphanumeric characters, numbers, and underscores are allowed." + "Only alphanumeric characters, numbers, " + "and underscores are allowed." ) op_list[i] = function_name - - -def _handle_feature_selection(X, select_k_features, y, variable_names): - if select_k_features is not None: - selection = run_feature_selection(X, y, select_k_features) - print(f"Using features {[variable_names[i] for i in selection]}") - X = X[:, selection] - - else: - selection = None - return X, selection + return binary_operators, unary_operators def _check_assertions( X, - binary_operators, - unary_operators, use_custom_variable_names, variable_names, weights, y, ): # Check for potential errors before they happen - assert len(unary_operators) + len(binary_operators) > 0 assert len(X.shape) == 2 assert len(y.shape) in [1, 2] assert X.shape[0] == y.shape[0] @@ -199,626 +155,664 @@ def _check_assertions( assert len(variable_names) == X.shape[1] -def run_feature_selection(X, y, select_k_features): - """Use a gradient boosting tree regressor as a proxy for finding - the k most important features in X, returning indices for those - features as output.""" - - from sklearn.ensemble import RandomForestRegressor - from sklearn.feature_selection import SelectFromModel - - clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0) - clf.fit(X, y) - selector = SelectFromModel( - clf, threshold=-np.inf, max_features=select_k_features, prefit=True - ) - return selector.get_support(indices=True) - - -def _escape_filename(filename): - """Turns a file into a string representation with correctly escaped backslashes""" - str_repr = str(filename) - str_repr = str_repr.replace("\\", "\\\\") - return str_repr - - def best(*args, **kwargs): # pragma: no cover raise NotImplementedError( - "`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation." + "`best` has been deprecated. Please use the `PySRRegressor` interface. " + "After fitting, you can return `.sympy()` to get the sympy representation " + "of the best equation." ) def best_row(*args, **kwargs): # pragma: no cover raise NotImplementedError( - "`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation, or `model.get_best()` to return the best equation's row in `model.equations`." + "`best_row` has been deprecated. Please use the `PySRRegressor` interface. " + "After fitting, you can run `print(model)` to view the best equation, or " + "`model.get_best()` to return the best equation's row in `model.equations`." ) def best_tex(*args, **kwargs): # pragma: no cover raise NotImplementedError( - "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation." + "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. " + "After fitting, you can return `.latex()` to get the sympy representation " + "of the best equation." ) def best_callable(*args, **kwargs): # pragma: no cover raise NotImplementedError( - "`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable." + "`best_callable` has been deprecated. Please use the `PySRRegressor` " + "interface. After fitting, you can use `.predict(X)` to use the best callable." ) -def _denoise(X, y, Xresampled=None): - """Denoise the dataset using a Gaussian process""" - from sklearn.gaussian_process import GaussianProcessRegressor - from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel - - gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() - gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50) - gpr.fit(X, y) - if Xresampled is not None: - return Xresampled, gpr.predict(Xresampled) +# Class validation constants +VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"] - return X, gpr.predict(X) +class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): + """ + High-performance symbolic regression. + + This is the scikit-learn interface for SymbolicRegression.jl. + This model will automatically search for equations which fit + a given dataset subject to a particular loss and set of + constraints. + + Parameters + ---------- + model_selection : str, default="best" + Model selection criterion. Can be 'accuracy' or 'best'. + `"accuracy"` selects the candidate model with the lowest loss + (highest accuracy). `"best"` selects the candidate model with + the lowest sum of normalized loss and complexity. + + binary_operators : list[str], default=["+", "-", "*", "/"] + List of strings giving the binary operators in Julia's Base. + + unary_operators : list[str], default=None + Same as :param`binary_operators` but for operators taking a + single scalar. + + niterations : int, default=40 + Number of iterations of the algorithm to run. The best + equations are printed and migrate between populations at the + end of each iteration. + + populations : int, default=15 + Number of populations running. + + population_size : int, default=33 + Number of individuals in each population. + + max_evals : int, default=None + Limits the total number of evaluations of expressions to + this number. + + maxsize : int, default=20 + Max complexity of an equation. + + maxdepth : int, default=None + Max depth of an equation. You can use both :param`maxsize` and + :param`maxdepth`. :param`maxdepth` is by default not used. + + warmup_maxsize_by : float, default=0.0 + Whether to slowly increase max size from a small number up to + the maxsize (if greater than 0). If greater than 0, says the + fraction of training time at which the current maxsize will + reach the user-passed maxsize. + + timeout_in_seconds : float, default=None + Make the search return early once this many seconds have passed. + + constraints : dict[str, int | tuple[int,int]], default=None + Dictionary of int (unary) or 2-tuples (binary), this enforces + maxsize constraints on the individual arguments of operators. + E.g., `'pow': (-1, 1)` says that power laws can have any + complexity left argument, but only 1 complexity in the right + argument. Use this to force more interpretable solutions. + + nested_constraints : dict[str, dict], default=None + Specifies how many times a combination of operators can be + nested. For example, `{"sin": {"cos": 0}}, "cos": {"cos": 2}}` + specifies that `cos` may never appear within a `sin`, but `sin` + can be nested with itself an unlimited number of times. The + second term specifies that `cos` can be nested up to 2 times + within a `cos`, so that `cos(cos(cos(x)))` is allowed + (as well as any combination of `+` or `-` within it), but + `cos(cos(cos(cos(x))))` is not allowed. When an operator is not + specified, it is assumed that it can be nested an unlimited + number of times. This requires that there is no operator which + is used both in the unary operators and the binary operators + (e.g., `-` could be both subtract, and negation). For binary + operators, you only need to provide a single number: both + arguments are treated the same way, and the max of each + argument is constrained. + + loss : str, default="L2DistLoss()" + String of Julia code specifying the loss function. Can either + be a loss from LossFunctions.jl, or your own loss written as a + function. Examples of custom written losses include: + `myloss(x, y) = abs(x-y)` for non-weighted, or + `myloss(x, y, w) = w*abs(x-y)` for weighted. + + Among the included losses, these are as follows. + Regression: `LPDistLoss{P}()`, `L1DistLoss()`, + `L2DistLoss()` (mean square), `LogitDistLoss()`, + `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, + `PeriodicLoss(c)`, `QuantileLoss(τ)`. + Classification: `ZeroOneLoss()`, `PerceptronLoss()`, + `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, + `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, + `SigmoidLoss()`, `DWDMarginLoss(q)`. + + complexity_of_operators : dict[str, float], default=None + If you would like to use a complexity other than 1 for an + operator, specify the complexity here. For example, + `{"sin": 2, "+": 1}` would give a complexity of 2 for each use + of the `sin` operator, and a complexity of 1 for each use of + the `+` operator (which is the default). You may specify real + numbers for a complexity, and the total complexity of a tree + will be rounded to the nearest integer after computing. + + complexity_of_constants : float, default=1 + Complexity of constants. + + complexity_of_variables : float, default=1 + Complexity of variables. + + parsimony : float, default=0.0032 + Multiplicative factor for how much to punish complexity. + + use_frequency : bool, default=True + Whether to measure the frequency of complexities, and use that + instead of parsimony to explore equation space. Will naturally + find equations of all complexities. + + use_frequency_in_tournament : bool, default=True + Whether to use the frequency mentioned above in the tournament, + rather than just the simulated annealing. + + alpha : float, default=0.1 + Initial temperature for simulated annealing + (requires :param`annealing` to be `True`). + + annealing : bool, default=True + Whether to use annealing. You should (and it is default). + + early_stop_condition : { float | str }, default=None + Stop the search early if this loss is reached. You may also + pass a string containing a Julia function which + takes a loss and complexity as input, for example: + `"f(loss, complexity) = (loss < 0.1) && (complexity < 10)"`. + + ncyclesperiteration : int, default=550 + Number of total mutations to run, per 10 samples of the + population, per iteration. + + fraction_replaced : float, default=0.000364 + How much of population to replace with migrating equations from + other populations. + + fraction_replaced_hof : float, default=0.035 + How much of population to replace with migrating equations from + hall of fame. + + weight_add_node : float, default=0.79 + Relative likelihood for mutation to add a node. + + weight_insert_node : float, default=5.1 + Relative likelihood for mutation to insert a node. + + weight_delete_node : float, default=1.7 + Relative likelihood for mutation to delete a node. + + weight_do_nothing : float, default=0.21 + Relative likelihood for mutation to leave the individual. + + weight_mutate_constant : float, default=0.048 + Relative likelihood for mutation to change the constant slightly + in a random direction. + + weight_mutate_operator : float, default=0.47 + Relative likelihood for mutation to swap an operator. + + weight_randomize : float, default=0.00023 + Relative likelihood for mutation to completely delete and then + randomly generate the equation + + weight_simplify : float, default=0.0020 + Relative likelihood for mutation to simplify constant parts by evaluation + + crossover_probability : float, default=0.066 + Absolute probability of crossover-type genetic operation, instead of a mutation. + + skip_mutation_failures : bool, default=True + Whether to skip mutation and crossover failures, rather than + simply re-sampling the current member. + + migration : bool, default=True + Whether to migrate. -class CallableEquation: - """Simple wrapper for numpy lambda functions built with sympy""" + hof_migration : bool, default=True + Whether to have the hall of fame migrate. - def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None): - self._sympy = eqn - self._sympy_symbols = sympy_symbols - self._selection = selection - self._variable_names = variable_names - self._lambda = lambdify(sympy_symbols, eqn) + topn : int, default=12 + How many top individuals migrate from each population. - def __repr__(self): - return f"PySRFunction(X=>{self._sympy})" + should_optimize_constants : bool, default=True + Whether to numerically optimize constants (Nelder-Mead/Newton) + at the end of each iteration. - def __call__(self, X): - expected_shape = (X.shape[0],) - if isinstance(X, pd.DataFrame): - # Lambda function takes as argument: - return self._lambda(**{k: X[k].values for k in X.columns}) * np.ones( - expected_shape - ) - elif self._selection is not None: - return self._lambda(*X[:, self._selection].T) * np.ones(expected_shape) - return self._lambda(*X.T) * np.ones(expected_shape) + optimizer_algorithm : str, default="BFGS" + Optimization scheme to use for optimizing constants. Can currently + be `NelderMead` or `BFGS`. + optimizer_nrestarts : int, default=2 + Number of time to restart the constants optimization process with + different initial conditions. -def _get_julia_project(julia_project): - if julia_project is None: - is_shared = True - julia_project = f"pysr-{__version__}" - else: - is_shared = False - julia_project = Path(julia_project) - return julia_project, is_shared + optimize_probability : float, default=0.14 + Probability of optimizing the constants during a single iteration of + the evolutionary algorithm. + optimizer_iterations : int, default=8 + Number of iterations that the constants optimizer can take. -def is_julia_version_greater_eq(Main, version="1.6"): - """Check if Julia version is greater than specified version.""" - return Main.eval(f'VERSION >= v"{version}"') + perturbation_factor : float, default=0.076 + Constants are perturbed by a max factor of + (perturbation_factor*T + 1). Either multiplied by this or + divided by this. + tournament_selection_n : int, default=10 + Number of expressions to consider in each tournament. -def init_julia(): - """Initialize julia binary, turning off compiled modules if needed.""" - from julia.core import JuliaInfo, UnsupportedPythonError + tournament_selection_p : float, default=0.86 + Probability of selecting the best expression in each + tournament. The probability will decay as p*(1-p)^n for other + expressions, sorted by loss. - try: - info = JuliaInfo.load(julia="julia") - except FileNotFoundError: - env_path = os.environ["PATH"] - raise FileNotFoundError( - f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", - ) + procs : int, default=multiprocessing.cpu_count() + Number of processes (=number of populations running). - if not info.is_pycall_built(): - raise ImportError(import_error_string()) + multithreading : bool, default=True + Use multithreading instead of distributed backend. + Using procs=0 will turn off both. - Main = None - try: - from julia import Main as _Main + cluster_manager : str, default=None + For distributed computing, this sets the job queue system. Set + to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or + "htc". If set to one of these, PySR will run in distributed + mode, and use `procs` to figure out how many processes to launch. - Main = _Main - except UnsupportedPythonError: - # Static python binary, so we turn off pre-compiled modules. - from julia.core import Julia + batching : bool, default=False + Whether to compare population members on small batches during + evolution. Still uses full dataset for comparing against hall + of fame. - jl = Julia(compiled_modules=False) - from julia import Main as _Main + batch_size : int, default=50 + The amount of data to use if doing batching. - Main = _Main + fast_cycle : bool, default=False (experimental) + Batch over population subsamples. This is a slightly different + algorithm than regularized evolution, but does cycles 15% + faster. May be algorithmically less efficient. - return Main + precision : int, default=32 + What precision to use for the data. By default this is 32 + (float32), but you can select 64 or 16 as well. + + random_state : int, Numpy RandomState instance or None, default=None + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + deterministic : bool, default=False + Make a PySR search give the same result every run. + To use this, you must turn off parallelism + (with :param`procs`=0, :param`multithreading`=False), + and set :param`random_state` to a fixed seed. + + warm_start : bool, default=False + Tells fit to continue from where the last call to fit finished. + If false, each call to fit will be fresh, overwriting previous results. -def _add_sr_to_julia_project(Main, io_arg): - Main.sr_spec = Main.PackageSpec( - name="SymbolicRegression", - url="https://github.com/MilesCranmer/SymbolicRegression.jl", - rev="v" + __symbolic_regression_jl_version__, - ) - Main.eval(f"Pkg.add(sr_spec, {io_arg})") - Main.clustermanagers_spec = Main.PackageSpec( - name="ClusterManagers", - url="https://github.com/JuliaParallel/ClusterManagers.jl", - rev="14e7302f068794099344d5d93f71979aaf4fbeb3", - ) - Main.eval(f"Pkg.add(clustermanagers_spec, {io_arg})") + verbosity : int, default=1e9 + What verbosity level to use. 0 means minimal print statements. + update_verbosity : int, default=None + What verbosity level to use for package updates. + Will take value of :param`verbosity` if not given. + + progress : bool, default=True + Whether to use a progress bar instead of printing to stdout. + + equation_file : str, default=None + Where to save the files (.csv separated by |). + + temp_equation_file : bool, default=False + Whether to put the hall of fame file in the temp directory. + Deletion is then controlled with the :param`delete_tempfiles` + parameter. + + tempdir : str, default=None + directory for the temporary files. + + delete_tempfiles : bool, default=True + Whether to delete the temporary files after finishing. + + julia_project : str, default=None + A Julia environment location containing a Project.toml + (and potentially the source code for SymbolicRegression.jl). + Default gives the Python package directory, where a + Project.toml file should be present from the install. + + update: bool, default=True + Whether to automatically update Julia packages. + + output_jax_format : bool, default=False + Whether to create a 'jax_format' column in the output, + containing jax-callable functions and the default parameters in + a jax array. + + output_torch_format : bool, default=False + Whether to create a 'torch_format' column in the output, + containing a torch module with trainable parameters. + + extra_sympy_mappings : dict[str, Callable], default=None + Provides mappings between custom :param`binary_operators` or + :param`unary_operators` defined in julia strings, to those same + operators defined in sympy. + E.G if `unary_operators=["inv(x)=1/x"]`, then for the fitted + model to be export to sympy, :param`extra_sympy_mappings` + would be `{"inv": lambda x: 1/x}`. + + extra_jax_mappings : dict[Callable, str], default=None + Similar to :param`extra_sympy_mappings` but for model export + to jax. The dictionary maps sympy functions to jax functions. + For example: `extra_jax_mappings={sympy.sin: "jnp.sin"}` maps + the `sympy.sin` function to the equivalent jax expression `jnp.sin`. + + extra_torch_mappings : dict[Callable, Callable], default=None + The same as :param`extra_jax_mappings` but for model export + to pytorch. Note that the dictionary keys should be callable + pytorch expressions. + For example: `extra_torch_mappings={sympy.sin: torch.sin}` + + denoise : bool, default=False + Whether to use a Gaussian Process to denoise the data before + inputting to PySR. Can help PySR fit noisy data. + + select_k_features : int, default=None + whether to run feature selection in Python using random forests, + before passing to the symbolic regression code. None means no + feature selection; an int means select that many features. + + kwargs : dict, default=None + Supports deprecated keyword arguments. Other arguments will + result in an error. + + Attributes + ---------- + equations_ : pandas.DataFrame + DataFrame containing the results of model fitting. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + nout_ : int + Number of output dimensions. + + selection_mask_ : list[int] of length `select_k_features` + List of indices for input features that are selected when + :param`select_k_features` is set. + + tempdir_ : Path + Path to the temporary equations directory. + + equation_file_ : str + Output equation file name produced by the julia backend. + + raw_julia_state_ : tuple[list[PyCall.jlwrap], PyCall.jlwrap] + The state for the julia SymbolicRegression.jl backend post fitting. + + Notes + ----- + Most default parameters have been tuned over several example equations, + but you should adjust `niterations`, `binary_operators`, `unary_operators` + to your requirements. You can view more detailed explanations of the options + on the [options page](https://astroautomata.com/PySR/#/options) of the + documentation. + + Examples + -------- + >>> import numpy as np + >>> from pysr import PySRRegressor + >>> randstate = np.random.RandomState(0) + >>> X = 2 * randstate.randn(100, 5) + >>> # y = 2.5382 * cos(x_3) + x_0 - 0.5 + >>> y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5 + >>> model = PySRRegressor( + ... niterations=40, + ... binary_operators=["+", "*"], + ... unary_operators=[ + ... "cos", + ... "exp", + ... "sin", + ... "inv(x) = 1/x", # Custom operator (julia syntax) + ... ], + ... model_selection="best", + ... loss="loss(x, y) = (x - y)^2", # Custom loss function (julia syntax) + ... ) + >>> model.fit(X, y) + >>> model + PySRRegressor.equations = [ + 0 0.000000 3.8552167 3.360272e+01 1 + 1 1.189847 (x0 * x0) 3.110905e+00 3 + 2 0.010626 ((x0 * x0) + -0.25573406) 3.045491e+00 5 + 3 0.896632 (cos(x3) + (x0 * x0)) 1.242382e+00 6 + 4 0.811362 ((x0 * x0) + (cos(x3) * 2.4384754)) 2.451971e-01 8 + 5 >>>> 13.733371 (((cos(x3) * 2.5382) + (x0 * x0)) + -0.5) 2.889755e-13 10 + 6 0.194695 ((x0 * x0) + (((cos(x3) + -0.063180044) * 2.53... 1.957723e-13 12 + 7 0.006988 ((x0 * x0) + (((cos(x3) + -0.32505524) * 1.538... 1.944089e-13 13 + 8 0.000955 (((((x0 * x0) + cos(x3)) + -0.8251649) + (cos(... 1.940381e-13 15 + ] + >>> model.score(X, y) + 1.0 + >>> model.predict(np.array([1,2,3,4,5])) + array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818]) + """ -class PySRRegressor(BaseEstimator, RegressorMixin): def __init__( self, model_selection="best", *, - weights=None, binary_operators=None, unary_operators=None, - procs=cpu_count(), - loss="L2DistLoss()", - complexity_of_operators=None, - complexity_of_constants=None, - complexity_of_variables=None, - populations=15, niterations=40, - ncyclesperiteration=550, + populations=15, + population_size=33, + max_evals=None, + maxsize=20, + maxdepth=None, + warmup_maxsize_by=0.0, timeout_in_seconds=None, + constraints=None, + nested_constraints=None, + loss="L2DistLoss()", + complexity_of_operators=None, + complexity_of_constants=1, + complexity_of_variables=1, + parsimony=0.0032, + use_frequency=True, + use_frequency_in_tournament=True, alpha=0.1, - annealing=False, + annealing=True, + early_stop_condition=None, + ncyclesperiteration=550, fraction_replaced=0.000364, fraction_replaced_hof=0.035, - population_size=33, - parsimony=0.0032, - migration=True, - hof_migration=True, - should_optimize_constants=True, - topn=12, weight_add_node=0.79, + weight_insert_node=5.1, weight_delete_node=1.7, weight_do_nothing=0.21, - weight_insert_node=5.1, weight_mutate_constant=0.048, weight_mutate_operator=0.47, weight_randomize=0.00023, weight_simplify=0.0020, crossover_probability=0.066, + skip_mutation_failures=True, + migration=True, + hof_migration=True, + topn=12, + should_optimize_constants=True, + optimizer_algorithm="BFGS", + optimizer_nrestarts=2, + optimize_probability=0.14, + optimizer_iterations=8, perturbation_factor=0.076, - extra_sympy_mappings=None, - extra_torch_mappings=None, - extra_jax_mappings=None, - equation_file=None, - verbosity=1e9, - update_verbosity=None, - progress=None, - maxsize=20, - fast_cycle=False, - maxdepth=None, - variable_names=None, + tournament_selection_n=10, + tournament_selection_p=0.86, + procs=cpu_count(), + multithreading=None, + cluster_manager=None, batching=False, batch_size=50, - select_k_features=None, - warmup_maxsize_by=0.0, - constraints=None, - nested_constraints=None, - use_frequency=True, - use_frequency_in_tournament=True, + fast_cycle=False, + precision=32, + random_state=None, + deterministic=False, + warm_start=False, + verbosity=1e9, + update_verbosity=None, + progress=True, + equation_file=None, + temp_equation_file=False, tempdir=None, delete_tempfiles=True, julia_project=None, update=True, - temp_equation_file=False, output_jax_format=False, output_torch_format=False, - optimizer_algorithm="BFGS", - optimizer_nrestarts=2, - optimize_probability=0.14, - optimizer_iterations=8, - tournament_selection_n=10, - tournament_selection_p=0.86, + extra_sympy_mappings=None, + extra_torch_mappings=None, + extra_jax_mappings=None, denoise=False, - Xresampled=None, - precision=32, - multithreading=None, - cluster_manager=None, - skip_mutation_failures=True, - max_evals=None, - early_stop_condition=None, - # To support deprecated kwargs: + select_k_features=None, **kwargs, ): - """Initialize settings for an equation search in PySR. - - Note: most default parameters have been tuned over several example - equations, but you should adjust `niterations`, - `binary_operators`, `unary_operators` to your requirements. - You can view more detailed explanations of the options on the - [options page](https://astroautomata.com/PySR/#/options) of the documentation. - - :param model_selection: How to select a model. Can be 'accuracy' or 'best'. The default, 'best', will optimize a combination of complexity and accuracy. - :type model_selection: str - :param binary_operators: List of strings giving the binary operators in Julia's Base. Default is ["+", "-", "*", "/",]. - :type binary_operators: list - :param unary_operators: Same but for operators taking a single scalar. Default is []. - :type unary_operators: list - :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each. - :type niterations: int - :param populations: Number of populations running. - :type populations: int - :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`. - :type loss: str - :param complexity_of_operators: If you would like to use a complexity other than 1 for - an operator, specify the complexity here. For example, `{"sin": 2, "+": 1}` would give - a complexity of 2 for each use of the `sin` operator, and a complexity of 1 - for each use of the `+` operator (which is the default). You may specify - real numbers for a complexity, and the total complexity of a tree will be rounded - to the nearest integer after computing. - :type complexity_of_operators: dict - :param complexity_of_constants: Complexity of constants. Default is 1. - :type complexity_of_constants: int/float - :param complexity_of_variables: Complexity of variables. Default is 1. - :type complexity_of_variables: int/float - :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data. - :type denoise: bool - :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features. - :type select_k_features: None/int - :param procs: Number of processes (=number of populations running). - :type procs: int - :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both. - :type multithreading: bool - :param cluster_manager: For distributed computing, this sets the job queue - system. Set to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or "htc". - If set to one of these, PySR will run in distributed mode, and use `procs` to figure - out how many processes to launch. - :type cluster_manager: str - :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame. - :type batching: bool - :param batch_size: the amount of data to use if doing batching. - :type batch_size: int - :param maxsize: Max size of an equation. - :type maxsize: int - :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration. - :type ncyclesperiteration: int - :param timeout_in_seconds: Make the search return early once this many seconds have passed. - :type timeout_in_seconds: float/int - :param alpha: Initial temperature. - :type alpha: float - :param annealing: Whether to use annealing. You should (and it is default). - :type annealing: bool - :param fraction_replaced: How much of population to replace with migrating equations from other populations. - :type fraction_replaced: float - :param fraction_replaced_hof: How much of population to replace with migrating equations from hall of fame. - :type fraction_replaced_hof: float - :param population_size: Number of individuals in each population - :type population_size: int - :param parsimony: Multiplicative factor for how much to punish complexity. - :type parsimony: float - :param migration: Whether to migrate. - :type migration: bool - :param hof_migration: Whether to have the hall of fame migrate. - :type hof_migration: bool - :param should_optimize_constants: Whether to numerically optimize constants (Nelder-Mead/Newton) at the end of each iteration. - :type should_optimize_constants: bool - :param topn: How many top individuals migrate from each population. - :type topn: int - :param perturbation_factor: Constants are perturbed by a max factor of (perturbation_factor*T + 1). Either multiplied by this or divided by this. - :type perturbation_factor: float - :param weight_add_node: Relative likelihood for mutation to add a node - :type weight_add_node: float - :param weight_insert_node: Relative likelihood for mutation to insert a node - :type weight_insert_node: float - :param weight_delete_node: Relative likelihood for mutation to delete a node - :type weight_delete_node: float - :param weight_do_nothing: Relative likelihood for mutation to leave the individual - :type weight_do_nothing: float - :param weight_mutate_constant: Relative likelihood for mutation to change the constant slightly in a random direction. - :type weight_mutate_constant: float - :param weight_mutate_operator: Relative likelihood for mutation to swap an operator. - :type weight_mutate_operator: float - :param weight_randomize: Relative likelihood for mutation to completely delete and then randomly generate the equation - :type weight_randomize: float - :param weight_simplify: Relative likelihood for mutation to simplify constant parts by evaluation - :type weight_simplify: float - :param crossover_probability: Absolute probability of crossover-type genetic operation, instead of a mutation. - :type crossover_probability: float - :param equation_file: Where to save the files (.csv separated by |) - :type equation_file: str - :param verbosity: What verbosity level to use. 0 means minimal print statements. - :type verbosity: int - :param update_verbosity: What verbosity level to use for package updates. Will take value of `verbosity` if not given. - :type update_verbosity: int - :param progress: Whether to use a progress bar instead of printing to stdout. - :type progress: bool - :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant. - :type maxdepth: int - :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient. - :type fast_cycle: bool - :param variable_names: a list of names for the variables, other than "x0", "x1", etc. - :type variable_names: list - :param warmup_maxsize_by: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize. - :type warmup_maxsize_by: float - :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions. - :type constraints: dict - :param nested_constraints: Specifies how many times a combination of operators can be nested. For example, - `{"sin": {"cos": 0}}, "cos": {"cos": 2}}` specifies that `cos` may never appear within a `sin`, - but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos` - can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination - of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified, - it is assumed that it can be nested an unlimited number of times. This requires that there is no operator - which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation). - For binary operators, you only need to provide a single number: both arguments are treated the same way, - and the max of each argument is constrained. - :type nested_constraints: dict - :param use_frequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities. - :type use_frequency: bool - :param use_frequency_in_tournament: whether to use the frequency mentioned above in the tournament, rather than just the simulated annealing. - :type use_frequency_in_tournament: bool - :param tempdir: directory for the temporary files - :type tempdir: str/None - :param delete_tempfiles: whether to delete the temporary files after finishing - :type delete_tempfiles: bool - :param julia_project: a Julia environment location containing a Project.toml (and potentially the source code for SymbolicRegression.jl). Default gives the Python package directory, where a Project.toml file should be present from the install. - :type julia_project: str/None - :param update: Whether to automatically update Julia packages. - :type update: bool - :param temp_equation_file: Whether to put the hall of fame file in the temp directory. Deletion is then controlled with the delete_tempfiles argument. - :type temp_equation_file: bool - :param output_jax_format: Whether to create a 'jax_format' column in the output, containing jax-callable functions and the default parameters in a jax array. - :type output_jax_format: bool - :param output_torch_format: Whether to create a 'torch_format' column in the output, containing a torch module with trainable parameters. - :type output_torch_format: bool - :param tournament_selection_n: Number of expressions to consider in each tournament. - :type tournament_selection_n: int - :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. - :type tournament_selection_p: float - :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well. - :type precision: int - :param skip_mutation_failures: Whether to skip mutation and crossover failures, rather than simply re-sampling the current member. - :type skip_mutation_failures: bool - :param max_evals: Limits the total number of evaluations of expressions to this number. - :type max_evals: int - :param early_stop_condition: Stop the search early if this loss is reached. - :type early_stop_condition: float - :param kwargs: Supports deprecated keyword arguments. Other arguments will result - in an error - :type kwargs: dict - :returns: Initialized model. Call `.fit(X, y)` to fit your data! - :type: PySRRegressor - """ - super().__init__() - # First, check for deprecated kwargs: - if len(kwargs) > 0: # pragma: no cover - deprecated_kwargs = make_deprecated_kwargs_for_pysr_regressor() - for k, v in kwargs.items(): - if k == "fractionReplaced": - fraction_replaced = v - elif k == "fractionReplacedHof": - fraction_replaced_hof = v - elif k == "npop": - population_size = v - elif k == "hofMigration": - hof_migration = v - elif k == "shouldOptimizeConstants": - should_optimize_constants = v - elif k == "weightAddNode": - weight_add_node = v - elif k == "weightDeleteNode": - weight_delete_node = v - elif k == "weightDoNothing": - weight_do_nothing = v - elif k == "weightInsertNode": - weight_insert_node = v - elif k == "weightMutateConstant": - weight_mutate_constant = v - elif k == "weightMutateOperator": - weight_mutate_operator = v - elif k == "weightRandomize": - weight_randomize = v - elif k == "weightSimplify": - weight_simplify = v - elif k == "crossoverProbability": - crossover_probability = v - elif k == "perturbationFactor": - perturbation_factor = v - elif k == "batchSize": - batch_size = v - elif k == "warmupMaxsizeBy": - warmup_maxsize_by = v - elif k == "useFrequency": - use_frequency = v - elif k == "useFrequencyInTournament": - use_frequency_in_tournament = v - else: - raise TypeError( - f"{k} is not a valid keyword argument for PySRRegressor" - ) - updated_name = deprecated_kwargs[k] - warnings.warn( - f"{k} has been renamed to {updated_name} in PySRRegressor." - f" Please use that instead.", - ) + # Hyperparameters + # - Model search parameters self.model_selection = model_selection - - if binary_operators is None: - binary_operators = "+ * - /".split(" ") - if unary_operators is None: - unary_operators = [] - if extra_sympy_mappings is None: - extra_sympy_mappings = {} - if variable_names is None: - variable_names = [] - if constraints is None: - constraints = {} - if multithreading is None: - # Default is multithreading=True, unless explicitly set, - # or procs is set to 0 (serial mode). - multithreading = procs != 0 and cluster_manager is None - if update_verbosity is None: - update_verbosity = verbosity - - buffer_available = "buffer" in sys.stdout.__dir__() - - if progress is not None: - if progress and not buffer_available: - warnings.warn( - "Note: it looks like you are running in Jupyter. The progress bar will be turned off." - ) - progress = False - else: - progress = buffer_available - - assert optimizer_algorithm in ["NelderMead", "BFGS"] - assert tournament_selection_n < population_size - - if extra_jax_mappings is not None: - for value in extra_jax_mappings.values(): - if not isinstance(value, str): - raise NotImplementedError( - "extra_jax_mappings must have keys that are strings! e.g., {sympy.sqrt: 'jnp.sqrt'}." - ) - else: - extra_jax_mappings = {} - - if extra_torch_mappings is not None: - for value in extra_jax_mappings.values(): - if not callable(value): - raise NotImplementedError( - "extra_torch_mappings must be callable functions! e.g., {sympy.sqrt: torch.sqrt}." - ) - else: - extra_torch_mappings = {} - - if maxsize > 40: - warnings.warn( - "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory." - ) - elif maxsize < 7: - raise NotImplementedError("PySR requires a maxsize of at least 7") - - if maxdepth is None: - maxdepth = maxsize - - if isinstance(binary_operators, str): - binary_operators = [binary_operators] - if isinstance(unary_operators, str): - unary_operators = [unary_operators] - - self.params = { - **dict( - weights=weights, - binary_operators=binary_operators, - unary_operators=unary_operators, - procs=procs, - loss=loss, - complexity_of_operators=complexity_of_operators, - complexity_of_constants=complexity_of_constants, - complexity_of_variables=complexity_of_variables, - populations=populations, - niterations=niterations, - ncyclesperiteration=ncyclesperiteration, - timeout_in_seconds=timeout_in_seconds, - alpha=alpha, - annealing=annealing, - fraction_replaced=fraction_replaced, - fraction_replaced_hof=fraction_replaced_hof, - population_size=population_size, - parsimony=float(parsimony), - migration=migration, - hof_migration=hof_migration, - should_optimize_constants=should_optimize_constants, - topn=topn, - weight_add_node=weight_add_node, - weight_insert_node=weight_insert_node, - weight_delete_node=weight_delete_node, - weight_do_nothing=weight_do_nothing, - weight_mutate_constant=weight_mutate_constant, - weight_mutate_operator=weight_mutate_operator, - weight_randomize=weight_randomize, - weight_simplify=weight_simplify, - crossover_probability=crossover_probability, - perturbation_factor=perturbation_factor, - verbosity=verbosity, - update_verbosity=update_verbosity, - progress=progress, - maxsize=maxsize, - fast_cycle=fast_cycle, - maxdepth=maxdepth, - batching=batching, - batch_size=batch_size, - select_k_features=select_k_features, - warmup_maxsize_by=warmup_maxsize_by, - constraints=constraints, - nested_constraints=nested_constraints, - use_frequency=use_frequency, - use_frequency_in_tournament=use_frequency_in_tournament, - tempdir=tempdir, - delete_tempfiles=delete_tempfiles, - update=update, - temp_equation_file=temp_equation_file, - optimizer_algorithm=optimizer_algorithm, - optimizer_nrestarts=optimizer_nrestarts, - optimize_probability=optimize_probability, - optimizer_iterations=optimizer_iterations, - tournament_selection_n=tournament_selection_n, - tournament_selection_p=tournament_selection_p, - denoise=denoise, - Xresampled=Xresampled, - precision=precision, - multithreading=multithreading, - cluster_manager=cluster_manager, - skip_mutation_failures=skip_mutation_failures, - max_evals=max_evals, - early_stop_condition=early_stop_condition, - ), - } - - # Stored equations: - self.equations = None - self.params_hash = None - self.raw_julia_state = None - - self.multioutput = None + self.binary_operators = binary_operators + self.unary_operators = unary_operators + self.niterations = niterations + self.populations = populations + self.population_size = population_size + self.ncyclesperiteration = ncyclesperiteration + # - Equation Constraints + self.maxsize = maxsize + self.maxdepth = maxdepth + self.constraints = constraints + self.nested_constraints = nested_constraints + self.warmup_maxsize_by = warmup_maxsize_by + # - Early exit conditions: + self.max_evals = max_evals + self.timeout_in_seconds = timeout_in_seconds + self.early_stop_condition = early_stop_condition + # - Loss parameters + self.loss = loss + self.complexity_of_operators = complexity_of_operators + self.complexity_of_constants = complexity_of_constants + self.complexity_of_variables = complexity_of_variables + self.parsimony = parsimony + self.use_frequency = use_frequency + self.use_frequency_in_tournament = use_frequency_in_tournament + self.alpha = alpha + self.annealing = annealing + # - Evolutionary search parameters + # -- Mutation parameters + self.weight_add_node = weight_add_node + self.weight_insert_node = weight_insert_node + self.weight_delete_node = weight_delete_node + self.weight_do_nothing = weight_do_nothing + self.weight_mutate_constant = weight_mutate_constant + self.weight_mutate_operator = weight_mutate_operator + self.weight_randomize = weight_randomize + self.weight_simplify = weight_simplify + self.crossover_probability = crossover_probability + self.skip_mutation_failures = skip_mutation_failures + # -- Migration parameters + self.migration = migration + self.hof_migration = hof_migration + self.fraction_replaced = fraction_replaced + self.fraction_replaced_hof = fraction_replaced_hof + self.topn = topn + # -- Constants parameters + self.should_optimize_constants = should_optimize_constants + self.optimizer_algorithm = optimizer_algorithm + self.optimizer_nrestarts = optimizer_nrestarts + self.optimize_probability = optimize_probability + self.optimizer_iterations = optimizer_iterations + self.perturbation_factor = perturbation_factor + # -- Selection parameters + self.tournament_selection_n = tournament_selection_n + self.tournament_selection_p = tournament_selection_p + # Solver parameters + self.procs = procs + self.multithreading = multithreading + self.cluster_manager = cluster_manager + self.batching = batching + self.batch_size = batch_size + self.fast_cycle = fast_cycle + self.precision = precision + self.random_state = random_state + self.deterministic = deterministic + self.warm_start = warm_start + # Additional runtime parameters + # - Runtime user interface + self.verbosity = verbosity + self.update_verbosity = update_verbosity + self.progress = progress + # - Project management self.equation_file = equation_file - self.n_features = None - self.extra_sympy_mappings = extra_sympy_mappings - self.extra_torch_mappings = extra_torch_mappings - self.extra_jax_mappings = extra_jax_mappings + self.temp_equation_file = temp_equation_file + self.tempdir = tempdir + self.delete_tempfiles = delete_tempfiles + self.julia_project = julia_project + self.update = update self.output_jax_format = output_jax_format self.output_torch_format = output_torch_format - self.nout = 1 - self.selection = None - self.variable_names = variable_names - self.julia_project = julia_project + self.extra_sympy_mappings = extra_sympy_mappings + self.extra_jax_mappings = extra_jax_mappings + self.extra_torch_mappings = extra_torch_mappings + # Pre-modelling transformation + self.denoise = denoise + self.select_k_features = select_k_features - self.surface_parameters = [ - "model_selection", - "multioutput", - "equation_file", - "n_features", - "extra_sympy_mappings", - "extra_torch_mappings", - "extra_jax_mappings", - "output_jax_format", - "output_torch_format", - "nout", - "selection", - "variable_names", - "julia_project", - ] + # Once all valid parameters have been assigned handle the + # deprecated kwargs + if len(kwargs) > 0: # pragma: no cover + deprecated_kwargs = make_deprecated_kwargs_for_pysr_regressor() + for k, v in kwargs.items(): + # Handle renamed kwargs + if k in deprecated_kwargs: + updated_kwarg_name = deprecated_kwargs[k] + setattr(self, updated_kwarg_name, v) + warnings.warn( + f"{k} has been renamed to {updated_kwarg_name} in PySRRegressor. " + "Please use that instead.", + FutureWarning, + ) + # Handle kwargs that have been moved to the fit method + elif k in ["weights", "variable_names", "Xresampled"]: + warnings.warn( + f"{k} is a data dependant parameter so should be passed when fit is called. " + f"Ignoring parameter; please pass {k} during the call to fit instead.", + FutureWarning, + ) + else: + raise TypeError( + f"{k} is not a valid keyword argument for PySRRegressor." + ) def __repr__(self): - """Prints all current equations fitted by the model. + """ + Prints all current equations fitted by the model. The string `>>>>` denotes which equation is selected by the `model_selection`. """ - if not hasattr(self, "equations") or self.equations is None: - return "PySRRegressor.equations = None" + if not hasattr(self, "equations_") or self.equations_ is None: + return "PySRRegressor.equations_ = None" - output = "PySRRegressor.equations = [\n" + output = "PySRRegressor.equations_ = [\n" - equations = self.equations + equations = self.equations_ if not isinstance(equations, list): all_equations = [equations] else: @@ -858,382 +852,452 @@ def __repr__(self): output += "]" return output - def set_params(self, **params): - """Set parameters for equation search.""" - for key, value in params.items(): - if key in self.surface_parameters: - self.__setattr__(key, value) - elif key in self.params: - self.params[key] = value - else: - raise ValueError(f"Parameter {key} is not in the list of parameters.") - - return self - - def get_params(self, deep=True): - """Get parameters for equation search.""" - del deep - return { - **self.params, - **{key: self.__getattribute__(key) for key in self.surface_parameters}, + def __getstate__(self): + """ + Handles pickle serialization for PySRRegressor. + + The Scikit-learn standard requires estimators to be serializable via + `pickle.dumps()`. However, `PyCall.jlwrap` does not support pickle + serialization. + + Thus, for `PySRRegressor` to support pickle serialization, the + `raw_julia_state_` attribute must be hidden from pickle. This will + prevent the `warm_start` of any model that is loaded via `pickle.loads()`, + but does allow all other attributes of a fitted `PySRRegressor` estimator + to be serialized. Note: Jax and Torch format equations are also removed + from the pickled instance. + """ + state = self.__dict__ + if "raw_julia_state_" in state: + warnings.warn( + "raw_julia_state_ cannot be pickled and will be removed from the " + "serialized instance. This will prevent a `warm_start` fit of any " + "model that is deserialized via `pickle.load()`." + ) + pickled_state = { + key: None if key == "raw_julia_state_" else value + for key, value in state.items() } + if "equations_" in pickled_state: + pickled_state["output_torch_format"] = False + pickled_state["output_jax_format"] = False + pickled_columns = ~pickled_state["equations_"].columns.isin( + ["jax_format", "torch_format"] + ) + pickled_state["equations_"] = ( + pickled_state["equations_"].loc[:, pickled_columns].copy() + ) + return pickled_state - def get_best(self, index=None): - """Get best equation using `model_selection`. + @property + def equations(self): # pragma: no cover + warnings.warn( + "PySRRegressor.equations is now deprecated. " + "Please use PySRRegressor.equations_ instead.", + FutureWarning, + ) + return self.equations_ - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the row number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: Dictionary representing the best expression found. - :type: pd.Series + def get_best(self, index=None): """ - if self.equations is None: + Get best equation using `model_selection`. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from `self.equations_`, + give the row number here. This overrides the :param`model_selection` + parameter. + + Returns + ------- + best_equation : pandas.Series + Dictionary representing the best expression found. + + Raises + ------ + NotImplementedError + Raised when an invalid model selection strategy is provided. + """ + check_is_fitted(self, attributes=["equations_"]) + if self.equations_ is None: raise ValueError("No equations have been generated yet.") if index is not None: - if isinstance(self.equations, list): + if isinstance(self.equations_, list): assert isinstance(index, list) - return [eq.iloc[i] for eq, i in zip(self.equations, index)] - return self.equations.iloc[index] + return [eq.iloc[i] for eq, i in zip(self.equations_, index)] + return self.equations_.iloc[index] if self.model_selection == "accuracy": - if isinstance(self.equations, list): - return [eq.iloc[-1] for eq in self.equations] - return self.equations.iloc[-1] + if isinstance(self.equations_, list): + return [eq.iloc[-1] for eq in self.equations_] + return self.equations_.iloc[-1] elif self.model_selection == "best": - if isinstance(self.equations, list): - return [eq.iloc[eq["score"].idxmax()] for eq in self.equations] - return self.equations.iloc[self.equations["score"].idxmax()] + if isinstance(self.equations_, list): + return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_] + return self.equations_.iloc[self.equations_["score"].idxmax()] else: raise NotImplementedError( f"{self.model_selection} is not a valid model selection strategy." ) - def fit(self, X, y, weights=None, variable_names=None): - """Search for equations to fit the dataset and store them in `self.equations`. - - :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces). - :type X: np.ndarray/pandas.DataFrame - :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y. - :type y: np.ndarray - :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y. - :type weights: np.ndarray - :param variable_names: a list of names for the variables, other than "x0", "x1", etc. - You can also pass a pandas DataFrame for X. - :type variable_names: list + def _setup_equation_file(self): """ - if variable_names is None: - variable_names = self.variable_names - - self._run( - X=X, - y=y, - weights=weights, - variable_names=variable_names, - ) + Sets the full pathname of the equation file, using :param`tempdir` and + :param`equation_file`. + """ + # Cast tempdir string as a Path object + self.tempdir_ = Path(tempfile.mkdtemp(dir=self.tempdir)) + if self.temp_equation_file: + self.equation_file_ = self.tempdir_ / "hall_of_fame.csv" + elif self.equation_file is None: + if self.warm_start and self.equation_file_: + pass + else: + date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3] + self.equation_file_ = "hall_of_fame_" + date_time + ".csv" + else: + self.equation_file_ = self.equation_file - return self + def _validate_and_set_init_params(self): + """ + Ensures parameters passed at initialization are valid. - def refresh(self): - # Updates self.equations with any new options passed, - # such as extra_sympy_mappings. - self.equations = self.get_hof() + Also returns a dictionary of parameters to update from their + values given at initialization. - def predict(self, X, index=None): - """Predict y from input X using the equation chosen by `model_selection`. - - You may see what equation is used by printing this object. X should have the same - columns as the training data. - - :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces). - :type X: np.ndarray/pandas.DataFrame - :param index: Optional. If you want to compute the output of - an expression using a particular row of - `self.equations`, you may specify the index here. - :type index: int - :returns: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). - :type: np.ndarray + Returns + ------- + packed_modified_params : dict + Dictionary of parameters to modify from their initialized + values. For example, default parameters are set here + when a parameter is left set to `None`. """ - self.refresh() - best = self.get_best(index=index) - try: - if self.multioutput: - return np.stack([eq["lambda_format"](X) for eq in best], axis=1) - return best["lambda_format"](X) - except Exception as error: - # Add extra information to the error, to say that the user - # should try to adjust extra_sympy_params. + # Immutable parameter validation + # Ensure instance parameters are allowable values: + if self.tournament_selection_n > self.population_size: raise ValueError( - "Failed to evaluate the expression. " - "If you are using a custom operator, make sure to define it in extra_sympy_mappings, " - "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`." - ) from error + "tournament_selection_n parameter must be smaller than population_size." + ) - def sympy(self, index=None): - """Return sympy representation of the equation(s) chosen by `model_selection`. + if self.maxsize > 40: + warnings.warn( + "Note: Using a large maxsize for the equation search will be " + "exponentially slower and use significant memory. You should consider " + "turning `use_frequency` to False, and perhaps use `warmup_maxsize_by`." + ) + elif self.maxsize < 7: + raise ValueError("PySR requires a maxsize of at least 7") + + if self.deterministic and not ( + self.multithreading in [False, None] + and self.procs == 0 + and self.random_state is not None + ): + raise ValueError( + "To ensure deterministic searches, you must set `random_state` to a seed, " + "`procs` to `0`, and `multithreading` to `False` or `None`." + ) - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: SymPy representation of the best expression. - """ - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["sympy_format"] for eq in best] - return best["sympy_format"] + if self.random_state is not None and ( + not self.deterministic or self.procs != 0 + ): + warnings.warn( + "Note: Setting `random_state` without also setting `deterministic` " + "to True and `procs` to 0 will result in non-deterministic searches. " + ) - def latex(self, index=None): - """Return latex representation of the equation(s) chosen by `model_selection`. + # NotImplementedError - Values that could be supported at a later time + if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS: + raise NotImplementedError( + f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}" + ) - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: LaTeX expression as a string - :type: str - """ - self.refresh() - sympy_representation = self.sympy(index=index) - if self.multioutput: - return [sympy.latex(s) for s in sympy_representation] - return sympy.latex(sympy_representation) + # 'Mutable' parameter validation + buffer_available = "buffer" in sys.stdout.__dir__() + # Params and their default values, if None is given: + default_param_mapping = { + "binary_operators": "+ * - /".split(" "), + "unary_operators": [], + "maxdepth": self.maxsize, + "constraints": {}, + "multithreading": self.procs != 0 and self.cluster_manager is None, + "batch_size": 1, + "update_verbosity": self.verbosity, + "progress": buffer_available, + } + packed_modified_params = {} + for parameter, default_value in default_param_mapping.items(): + parameter_value = getattr(self, parameter) + if parameter_value is None: + parameter_value = default_value + else: + # Special cases such as when binary_operators is a string + if parameter in ["binary_operators", "unary_operators"] and isinstance( + parameter_value, str + ): + parameter_value = [parameter_value] + elif parameter == "batch_size" and parameter_value < 1: + warnings.warn( + "Given :param`batch_size` must be greater than or equal to one. " + ":param`batch_size` has been increased to equal one." + ) + parameter_value = 1 + elif parameter == "progress" and not buffer_available: + warnings.warn( + "Note: it looks like you are running in Jupyter. " + "The progress bar will be turned off." + ) + parameter_value = False + packed_modified_params[parameter] = parameter_value - def jax(self, index=None): - """Return jax representation of the equation(s) chosen by `model_selection`. + assert ( + len(packed_modified_params["binary_operators"]) + + len(packed_modified_params["unary_operators"]) + > 0 + ) + return packed_modified_params - Each equation (multiple given if there are multiple outputs) is a dictionary - containing {"callable": func, "parameters": params}. To call `func`, pass - func(X, params). This function is differentiable using `jax.grad`. - - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: Dictionary of callable jax function in "callable" key, - and jax array of parameters as "parameters" key. - :type: dict + def _validate_and_set_fit_params(self, X, y, Xresampled, weights, variable_names): """ - if self.using_pandas: - warnings.warn( - "PySR's JAX modules are not set up to work with a " - "model that was trained on pandas dataframes. " - "Train on an array instead to ensure everything works as planned." - ) - self.set_params(output_jax_format=True) - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["jax_format"] for eq in best] - return best["jax_format"] + Validates the parameters passed to the :term`fit` method. - def pytorch(self, index=None): - """Return pytorch representation of the equation(s) chosen by `model_selection`. + This method also sets the `nout_` attribute. - Each equation (multiple given if there are multiple outputs) is a PyTorch module - containing the parameters as trainable attributes. You can use the module like - any other PyTorch module: `module(X)`, where `X` is a tensor with the same - column ordering as trained with. + Parameters + ---------- + X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features) + Training data. + y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the row number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: PyTorch module representing the expression. - :type: torch.nn.Module - """ - if self.using_pandas: - warnings.warn( - "PySR's PyTorch modules are not set up to work with a " - "model that was trained on pandas dataframes. " - "Train on an array instead to ensure everything works as planned." - ) - self.set_params(output_torch_format=True) - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["torch_format"] for eq in best] - return best["torch_format"] - - def reset(self): - """Reset the search state.""" - self.equations = None - self.params_hash = None - self.raw_julia_state = None - self.variable_names = None - self.selection = None - - def _run(self, X, y, weights, variable_names): - global already_ran - global Main - - for key in self.surface_parameters: - if key in self.params: - raise ValueError( - f"{key} is a surface parameter, and cannot be in self.params" - ) - - multithreading = self.params["multithreading"] - cluster_manager = self.params["cluster_manager"] - procs = self.params["procs"] - binary_operators = self.params["binary_operators"] - unary_operators = self.params["unary_operators"] - batching = self.params["batching"] - maxsize = self.params["maxsize"] - select_k_features = self.params["select_k_features"] - Xresampled = self.params["Xresampled"] - denoise = self.params["denoise"] - constraints = self.params["constraints"] - update = self.params["update"] - loss = self.params["loss"] - weight_mutate_constant = self.params["weight_mutate_constant"] - weight_mutate_operator = self.params["weight_mutate_operator"] - weight_add_node = self.params["weight_add_node"] - weight_insert_node = self.params["weight_insert_node"] - weight_delete_node = self.params["weight_delete_node"] - weight_simplify = self.params["weight_simplify"] - weight_randomize = self.params["weight_randomize"] - weight_do_nothing = self.params["weight_do_nothing"] - - if Main is None: - if multithreading: - os.environ["JULIA_NUM_THREADS"] = str(procs) + Xresampled : {ndarray | pandas.DataFrame} of shape + (n_resampled, n_features), default=None + Resampled training data used for denoising. - Main = init_julia() + weights : {ndarray | pandas.DataFrame} of the same shape as y + Each element is how to weight the mean-square-error loss + for that particular element of y. - if cluster_manager is not None: - Main.eval(f"import ClusterManagers: addprocs_{cluster_manager}") - cluster_manager = Main.eval(f"addprocs_{cluster_manager}") + variable_names : list[str] of length n_features + Names of each variable in the training dataset, `X`. - if isinstance(X, pd.DataFrame): - if variable_names is not None: - warnings.warn("Resetting variable_names from X.columns") - - variable_names = list(X.columns) - X = np.array(X) - self.using_pandas = True - else: - self.using_pandas = False + Returns + ------- + X_validated : ndarray of shape (n_samples, n_features) + Validated training data. - if len(X.shape) == 1: - X = X[:, None] + y_validated : ndarray of shape (n_samples,) or (n_samples, n_targets) + Validated target data. - if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series): - y = np.array(y) + Xresampled : ndarray of shape (n_resampled, n_features) + Validated resampled training data used for denoising. - if variable_names is None or len(variable_names) == 0: - variable_names = [f"x{i}" for i in range(X.shape[1])] + variable_names_validated : list[str] of length n_features + Validated list of variable names for each feature in `X`. - use_custom_variable_names = len(variable_names) != 0 - # TODO: this is always true. - - _check_assertions( - X, - binary_operators, - unary_operators, - use_custom_variable_names, - variable_names, - weights, - y, - ) - - self.n_features = X.shape[1] - - if len(X) > 10000 and not batching: - warnings.warn( - "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed." - ) + """ + if isinstance(X, pd.DataFrame): + if variable_names: + variable_names = None + warnings.warn( + ":param`variable_names` has been reset to `None` as `X` is a DataFrame. " + "Using DataFrame column names instead." + ) - if self.n_features >= 10 and not select_k_features: + if X.columns.is_object() and X.columns.str.contains(" ").any(): + X.columns = X.columns.str.replace(" ", "_") + warnings.warn( + "Spaces in DataFrame column names are not supported. " + "Spaces have been replaced with underscores. \n" + "Please rename the columns to valid names." + ) + elif variable_names and [" " in name for name in variable_names].any(): + variable_names = [name.replace(" ", "_") for name in variable_names] warnings.warn( - "Note: you are running with 10 features or more. " - "Genetic algorithms like used in PySR scale poorly with large numbers of features. " - "Consider using feature selection techniques to select the most important features " - "(you can do this automatically with the `select_k_features` parameter), " - "or, alternatively, doing a dimensionality reduction beforehand. " - "For example, `X = PCA(n_components=6).fit_transform(X)`, " - "using scikit-learn's `PCA` class, " - "will reduce the number of features to 6 in an interpretable way, " - "as each resultant feature " - "will be a linear combination of the original features. " + "Spaces in `variable_names` are not supported. " + "Spaces have been replaced with underscores. \n" + "Please use valid names instead." ) - X, selection = _handle_feature_selection( - X, select_k_features, y, variable_names - ) + # Data validation and feature name fetching via sklearn + # This method sets the n_features_in_ attribute + if Xresampled is not None: + Xresampled = check_array(Xresampled) + if weights is not None: + weights = check_array(weights) + check_consistent_length(weights, y) + X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True) + self.feature_names_in_ = _check_feature_names_in(self, variable_names) + variable_names = self.feature_names_in_ + # Handle multioutput data if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1): - self.multioutput = False - self.nout = 1 y = y.reshape(-1) elif len(y.shape) == 2: - self.multioutput = True - self.nout = y.shape[1] + self.nout_ = y.shape[1] else: raise NotImplementedError("y shape not supported!") - if denoise: - if weights is not None: - raise NotImplementedError( - "No weights for denoising - the weights are learned." - ) + return X, y, Xresampled, weights, variable_names + + def _pre_transform_training_data( + self, X, y, Xresampled, variable_names, random_state + ): + """ + Transforms the training data before fitting the symbolic regressor. + + This method also updates/sets the `selection_mask_` attribute. + + Parameters + ---------- + X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features) + Training data. + + y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + Xresampled : {ndarray | pandas.DataFrame} of shape + (n_resampled, n_features), default=None + Resampled training data used for denoising. + + variable_names : list[str] of length n_features + Names of each variable in the training dataset, `X`. + + random_state : int, Numpy RandomState instance or None, default=None + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X_transformed : ndarray of shape (n_samples, n_features) + Transformed training data. n_samples will be equal to + :param`Xresampled.shape[0]` if :param`self.denoise` is `True`, + and :param`Xresampled is not None`, otherwise it will be + equal to :param`X.shape[0]`. n_features will be equal to + :param`self.select_k_features` if `self.select_k_features is not None`, + otherwise it will be equal to :param`X.shape[1]` + + y_transformed : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Transformed target data. n_samples will be equal to + :param`Xresampled.shape[0]` if :param`self.denoise` is `True`, + and :param`Xresampled is not None`, otherwise it will be + equal to :param`X.shape[0]`. + + variable_names_transformed : list[str] of length n_features + Names of each variable in the transformed dataset, + `X_transformed`. + """ + # Feature selection transformation + if self.select_k_features: + self.selection_mask_ = run_feature_selection( + X, y, self.select_k_features, random_state=random_state + ) + X = X[:, self.selection_mask_] + if Xresampled is not None: - # Select among only the selected features: - if isinstance(Xresampled, pd.DataFrame): - # Handle Xresampled is pandas dataframe - if selection is not None: - Xresampled = Xresampled[[variable_names[i] for i in selection]] - else: - Xresampled = Xresampled[variable_names] - Xresampled = np.array(Xresampled) - else: - if selection is not None: - Xresampled = Xresampled[:, selection] - if self.multioutput: + Xresampled = Xresampled[:, self.selection_mask_] + + # Reduce variable_names to selection + variable_names = [variable_names[i] for i in self.selection_mask_] + + # Re-perform data validation and feature name updating + X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True) + # Update feature names with selected variable names + self.feature_names_in_ = _check_feature_names_in(self, variable_names) + print(f"Using features {self.feature_names_in_}") + + # Denoising transformation + if self.denoise: + if self.nout_ > 1: y = np.stack( [ - _denoise(X, y[:, i], Xresampled=Xresampled)[1] - for i in range(self.nout) + _denoise( + X, y[:, i], Xresampled=Xresampled, random_state=random_state + )[1] + for i in range(self.nout_) ], axis=1, ) if Xresampled is not None: X = Xresampled else: - X, y = _denoise(X, y, Xresampled=Xresampled) + X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state) - self.julia_project, is_shared = _get_julia_project(self.julia_project) + return X, y, variable_names - tmpdir = Path(tempfile.mkdtemp(dir=self.params["tempdir"])) + def _run(self, X, y, mutated_params, weights, seed): + """ + Run the symbolic regression fitting process on the julia backend. - if self.params["temp_equation_file"]: - self.equation_file = tmpdir / "hall_of_fame.csv" - elif self.equation_file is None: - date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3] - self.equation_file = "hall_of_fame_" + date_time + ".csv" + Parameters + ---------- + X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features) + Training data. - _create_inline_operators( - binary_operators=binary_operators, unary_operators=unary_operators - ) - _handle_constraints( - binary_operators=binary_operators, - unary_operators=unary_operators, - constraints=constraints, - ) + y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. - una_constraints = [constraints[op] for op in unary_operators] - bin_constraints = [constraints[op] for op in binary_operators] + mutated_params : dict[str, Any] + Dictionary of mutated versions of some parameters passed in __init__. + + weights : {ndarray | pandas.DataFrame} of the same shape as y + Each element is how to weight the mean-square-error loss + for that particular element of y. + + seed : int + Random seed for julia backend process. + + Returns + ------- + self : object + Reference to `self` with fitted attributes. + + Raises + ------ + ImportError + Raised when the julia backend fails to import a package. + """ + # Need to be global as we don't want to recreate/reinstate julia for + # every new instance of PySRRegressor + global already_ran + global Main + + # These are the parameters which may be modified from the ones + # specified in init, so we define them here locally: + binary_operators = mutated_params["binary_operators"] + unary_operators = mutated_params["unary_operators"] + maxdepth = mutated_params["maxdepth"] + constraints = mutated_params["constraints"] + nested_constraints = self.nested_constraints + complexity_of_operators = self.complexity_of_operators + multithreading = mutated_params["multithreading"] + cluster_manager = self.cluster_manager + batch_size = mutated_params["batch_size"] + update_verbosity = mutated_params["update_verbosity"] + progress = mutated_params["progress"] + + # Start julia backend processes + if Main is None: + if multithreading: + os.environ["JULIA_NUM_THREADS"] = str(self.procs) + + Main = init_julia() + + if cluster_manager is not None: + Main.eval(f"import ClusterManagers: addprocs_{cluster_manager}") + cluster_manager = Main.eval(f"addprocs_{cluster_manager}") if not already_ran: + julia_project, is_shared = _get_julia_project(self.julia_project) Main.eval("using Pkg") - io = "devnull" if self.params["update_verbosity"] == 0 else "stderr" + io = "devnull" if update_verbosity == 0 else "stderr" io_arg = f"io={io}" if is_julia_version_greater_eq(Main, "1.6") else "" Main.eval( - f'Pkg.activate("{_escape_filename(self.julia_project)}", shared = Bool({int(is_shared)}), {io_arg})' + f'Pkg.activate("{_escape_filename(julia_project)}", shared = Bool({int(is_shared)}), {io_arg})' ) from julia.api import JuliaError @@ -1242,13 +1306,13 @@ def _run(self, X, y, weights, variable_names): _add_sr_to_julia_project(Main, io_arg) try: - if update: + if self.update: Main.eval(f"Pkg.resolve({io_arg})") Main.eval(f"Pkg.instantiate({io_arg})") else: Main.eval(f"Pkg.instantiate({io_arg})") except (JuliaError, RuntimeError) as e: - raise ImportError(import_error_string(self.julia_project)) from e + raise ImportError(import_error_string(julia_project)) from e Main.eval("using SymbolicRegression") Main.plus = Main.eval("(+)") @@ -1257,7 +1321,19 @@ def _run(self, X, y, weights, variable_names): Main.pow = Main.eval("(^)") Main.div = Main.eval("(/)") - nested_constraints = self.params["nested_constraints"] + # TODO(mcranmer): These functions should be part of this class. + binary_operators, unary_operators = _maybe_create_inline_operators( + binary_operators=binary_operators, unary_operators=unary_operators + ) + constraints = _process_constraints( + binary_operators=binary_operators, + unary_operators=unary_operators, + constraints=constraints, + ) + + una_constraints = [constraints[op] for op in unary_operators] + bin_constraints = [constraints[op] for op in binary_operators] + # Parse dict into Julia Dict for nested constraints:: if nested_constraints is not None: nested_constraints_str = "Dict(" @@ -1270,7 +1346,6 @@ def _run(self, X, y, weights, variable_names): nested_constraints = Main.eval(nested_constraints_str) # Parse dict into Julia Dict for complexities: - complexity_of_operators = self.params["complexity_of_operators"] if complexity_of_operators is not None: complexity_of_operators_str = "Dict(" for k, v in complexity_of_operators.items(): @@ -1278,109 +1353,83 @@ def _run(self, X, y, weights, variable_names): complexity_of_operators_str += ")" complexity_of_operators = Main.eval(complexity_of_operators_str) - Main.custom_loss = Main.eval(loss) - - mutationWeights = [ - float(weight_mutate_constant), - float(weight_mutate_operator), - float(weight_add_node), - float(weight_insert_node), - float(weight_delete_node), - float(weight_simplify), - float(weight_randomize), - float(weight_do_nothing), - ] - - params_to_hash = { - **{k: self.__getattribute__(k) for k in self.surface_parameters}, - **self.params, - } - params_excluded_from_hash = [ - "niterations", - ] - # Delete these^ from params_to_hash: - params_to_hash = { - k: v - for k, v in params_to_hash.items() - if k not in params_excluded_from_hash - } - - # Sort params_to_hash by key: - params_to_hash = OrderedDict(sorted(params_to_hash.items())) - # Hash all parameters: - cur_hash = sha256(str(params_to_hash).encode()).hexdigest() - - if self.params_hash is not None: - if cur_hash != self.params_hash: - warnings.warn( - "Warning: PySR options have changed since the last run. " - "This is experimental and may not work. " - "For example, if the operators change, or even their order," - " the saved equations will be in the wrong format." - "\n\n" - "To reset the search state, run `.reset()`. " - ) + custom_loss = Main.eval(self.loss) + early_stop_condition = Main.eval( + str(self.early_stop_condition) if self.early_stop_condition else None + ) - self.params_hash = cur_hash + mutation_weights = np.array( + [ + self.weight_mutate_constant, + self.weight_mutate_operator, + self.weight_add_node, + self.weight_insert_node, + self.weight_delete_node, + self.weight_simplify, + self.weight_randomize, + self.weight_do_nothing, + ], + dtype=float, + ) # Call to Julia backend. - # See https://github.com/search?q=%22function+Options%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3AOptions.jl+language%3AJulia&type=Code + # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl options = Main.Options( binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")), unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")), bin_constraints=bin_constraints, una_constraints=una_constraints, complexity_of_operators=complexity_of_operators, - complexity_of_constants=self.params["complexity_of_constants"], - complexity_of_variables=self.params["complexity_of_variables"], + complexity_of_constants=self.complexity_of_constants, + complexity_of_variables=self.complexity_of_variables, nested_constraints=nested_constraints, - loss=Main.custom_loss, - maxsize=int(maxsize), - hofFile=_escape_filename(self.equation_file), - npopulations=int(self.params["populations"]), - batching=batching, - batchSize=int( - min([self.params["batch_size"], len(X)]) if batching else len(X) - ), - mutationWeights=mutationWeights, - probPickFirst=self.params["tournament_selection_p"], - ns=self.params["tournament_selection_n"], + loss=custom_loss, + maxsize=int(self.maxsize), + hofFile=_escape_filename(self.equation_file_), + npopulations=int(self.populations), + batching=self.batching, + batchSize=int(min([batch_size, len(X)]) if self.batching else len(X)), + mutationWeights=mutation_weights, + probPickFirst=self.tournament_selection_p, + ns=self.tournament_selection_n, # These have the same name: - parsimony=self.params["parsimony"], - alpha=self.params["alpha"], - maxdepth=self.params["maxdepth"], - fast_cycle=self.params["fast_cycle"], - migration=self.params["migration"], - hofMigration=self.params["hof_migration"], - fractionReplacedHof=self.params["fraction_replaced_hof"], - shouldOptimizeConstants=self.params["should_optimize_constants"], - warmupMaxsizeBy=self.params["warmup_maxsize_by"], - useFrequency=self.params["use_frequency"], - useFrequencyInTournament=self.params["use_frequency_in_tournament"], - npop=self.params["population_size"], - ncyclesperiteration=self.params["ncyclesperiteration"], - fractionReplaced=self.params["fraction_replaced"], - topn=self.params["topn"], - verbosity=self.params["verbosity"], - optimizer_algorithm=self.params["optimizer_algorithm"], - optimizer_nrestarts=self.params["optimizer_nrestarts"], - optimize_probability=self.params["optimize_probability"], - optimizer_iterations=self.params["optimizer_iterations"], - perturbationFactor=self.params["perturbation_factor"], - annealing=self.params["annealing"], + parsimony=self.parsimony, + alpha=self.alpha, + maxdepth=maxdepth, + fast_cycle=self.fast_cycle, + migration=self.migration, + hofMigration=self.hof_migration, + fractionReplacedHof=self.fraction_replaced_hof, + shouldOptimizeConstants=self.should_optimize_constants, + warmupMaxsizeBy=self.warmup_maxsize_by, + useFrequency=self.use_frequency, + useFrequencyInTournament=self.use_frequency_in_tournament, + npop=self.population_size, + ncyclesperiteration=self.ncyclesperiteration, + fractionReplaced=self.fraction_replaced, + topn=self.topn, + verbosity=self.verbosity, + optimizer_algorithm=self.optimizer_algorithm, + optimizer_nrestarts=self.optimizer_nrestarts, + optimize_probability=self.optimize_probability, + optimizer_iterations=self.optimizer_iterations, + perturbationFactor=self.perturbation_factor, + annealing=self.annealing, stateReturn=True, # Required for state saving. - progress=self.params["progress"], - timeout_in_seconds=self.params["timeout_in_seconds"], - crossoverProbability=self.params["crossover_probability"], - skip_mutation_failures=self.params["skip_mutation_failures"], - max_evals=self.params["max_evals"], - earlyStopCondition=self.params["early_stop_condition"], + progress=progress, + timeout_in_seconds=self.timeout_in_seconds, + crossoverProbability=self.crossover_probability, + skip_mutation_failures=self.skip_mutation_failures, + max_evals=self.max_evals, + earlyStopCondition=early_stop_condition, + seed=seed, + deterministic=self.deterministic, ) - np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[ - self.params["precision"] - ] + # Convert data to desired precision + np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self.precision] + # This converts the data into a Julia array: Main.X = np.array(X, dtype=np_dtype).T if len(y.shape) == 1: Main.y = np.array(y, dtype=np_dtype) @@ -1394,50 +1443,352 @@ def _run(self, X, y, weights, variable_names): else: Main.weights = None - cprocs = 0 if multithreading else procs + cprocs = 0 if multithreading else self.procs # Call to Julia backend. - # See https://github.com/search?q=%22function+EquationSearch%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3ASymbolicRegression.jl+language%3AJulia&type=Code - self.raw_julia_state = Main.EquationSearch( + # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl + self.raw_julia_state_ = Main.EquationSearch( Main.X, Main.y, weights=Main.weights, - niterations=int(self.params["niterations"]), - varMap=( - variable_names - if selection is None - else [variable_names[i] for i in selection] - ), + niterations=int(self.niterations), + varMap=self.feature_names_in_.tolist(), options=options, numprocs=int(cprocs), multithreading=bool(multithreading), - saved_state=self.raw_julia_state, + saved_state=self.raw_julia_state_, addprocs_function=cluster_manager, ) - self.variable_names = variable_names - self.selection = selection + # Set attributes + self.equations_ = self.get_hof() - # Not in params: - # selection, variable_names, multioutput + if self.delete_tempfiles: + shutil.rmtree(self.tempdir_) - self.equations = self.get_hof() + already_ran = True - if self.params["delete_tempfiles"]: - shutil.rmtree(tmpdir) + return self - already_ran = True + def fit( + self, + X, + y, + Xresampled=None, + weights=None, + variable_names=None, + ): + """ + Search for equations to fit the dataset and store them in `self.equations_`. + + Parameters + ---------- + X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features) + Training data. + + y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + Xresampled : {ndarray | pandas.DataFrame} of shape + (n_resampled, n_features), default=None + Resampled training data to generate a denoised data on. This + will be used as the training data, rather than `X`. + + weights : {ndarray | pandas.DataFrame} of the same shape as y, default=None + Each element is how to weight the mean-square-error loss + for that particular element of `y`. Alternatively, + if a custom `loss` was set, it will can be used + in arbitrary ways. + + variable_names : list[str], default=None + A list of names for the variables, rather than "x0", "x1", etc. + If :param`X` is a pandas dataframe, the column names will be used + instead of `variable_names`. Cannot contain spaces or special + characters. Avoid variable names which are also + function names in `sympy`, such as "N". + + Returns + ------- + self : object + Fitted estimator. + """ + # Init attributes that are not specified in BaseEstimator + if self.warm_start and hasattr(self, "raw_julia_state_"): + pass + else: + if hasattr(self, "raw_julia_state_"): + warnings.warn( + "The discovered expressions are being reset. " + "Please set `warm_start=True` if you wish to continue " + "to start a search where you left off.", + ) + + self.equations_ = None + self.nout_ = 1 + self.selection_mask_ = None + self.raw_julia_state_ = None + + random_state = check_random_state(self.random_state) # For np random + seed = random_state.get_state()[1][0] # For julia random + + self._setup_equation_file() + + mutated_params = self._validate_and_set_init_params() + + X, y, Xresampled, weights, variable_names = self._validate_and_set_fit_params( + X, y, Xresampled, weights, variable_names + ) + + if X.shape[0] > 10000 and not self.batching: + warnings.warn( + "Note: you are running with more than 10,000 datapoints. " + "You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). " + "You should also reconsider if you need that many datapoints. " + "Unless you have a large amount of noise (in which case you " + "should smooth your dataset first), generally < 10,000 datapoints " + "is enough to find a functional form with symbolic regression. " + "More datapoints will lower the search speed." + ) + + # Pre transformations (feature selection and denoising) + X, y, variable_names = self._pre_transform_training_data( + X, y, Xresampled, variable_names, random_state + ) + + # Warn about large feature counts (still warn if feature count is large + # after running feature selection) + if self.n_features_in_ >= 10: + warnings.warn( + "Note: you are running with 10 features or more. " + "Genetic algorithms like used in PySR scale poorly with large numbers of features. " + "Consider using feature selection techniques to select the most important features " + "(you can do this automatically with the `select_k_features` parameter), " + "or, alternatively, doing a dimensionality reduction beforehand. " + "For example, `X = PCA(n_components=6).fit_transform(X)`, " + "using scikit-learn's `PCA` class, " + "will reduce the number of features to 6 in an interpretable way, " + "as each resultant feature " + "will be a linear combination of the original features. " + ) + + # Assertion checks + use_custom_variable_names = variable_names is not None + # TODO: this is always true. + + _check_assertions( + X, + use_custom_variable_names, + variable_names, + weights, + y, + ) + + # Fitting procedure + return self._run(X, y, mutated_params, weights=weights, seed=seed) + + def refresh(self, checkpoint_file=None): + """ + Updates self.equations_ with any new options passed, such as + :param`extra_sympy_mappings`. + + Parameters + ---------- + checkpoint_file : str, default=None + Path to checkpoint hall of fame file to be loaded. + """ + check_is_fitted(self, attributes=["equation_file_"]) + if checkpoint_file: + self.equation_file_ = checkpoint_file + self.equations_ = self.get_hof() + + def predict(self, X, index=None): + """ + Predict y from input X using the equation chosen by `model_selection`. + + You may see what equation is used by printing this object. X should + have the same columns as the training data. + + Parameters + ---------- + X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features) + Training data. + + index : int, default=None + If you want to compute the output of an expression using a + particular row of `self.equations_`, you may specify the index here. + + Returns + ------- + y_predicted : ndarray of shape (n_samples, nout_) + Values predicted by substituting `X` into the fitted symbolic + regression model. + + Raises + ------ + ValueError + Raises if the `best_equation` cannot be evaluated. + """ + check_is_fitted( + self, attributes=["selection_mask_", "feature_names_in_", "nout_"] + ) + best_equation = self.get_best(index=index) + + # When X is an numpy array or a pandas dataframe with a RangeIndex, + # the self.feature_names_in_ generated during fit, for the same X, + # will cause a warning to be thrown during _validate_data. + # To avoid this, convert X to a dataframe, apply the selection mask, + # and then set the column/feature_names of X to be equal to those + # generated during fit. + if not isinstance(X, pd.DataFrame): + X = check_array(X) + X = pd.DataFrame(X) + if isinstance(X.columns, pd.RangeIndex): + if self.selection_mask_ is not None: + # RangeIndex enforces column order allowing columns to + # be correctly filtered with self.selection_mask_ + X = X.iloc[:, self.selection_mask_] + X.columns = self.feature_names_in_ + # Without feature information, CallableEquation/lambda_format equations + # require that the column order of X matches that of the X used during + # the fitting process. _validate_data removes this feature information + # when it converts the dataframe to an np array. Thus, to ensure feature + # order is preserved after conversion, the dataframe columns must be + # reordered/reindexed to match those of the transformed (denoised and + # feature selected) X in fit. + X = X.reindex(columns=self.feature_names_in_) + X = self._validate_data(X, reset=False) + + try: + if self.nout_ > 1: + return np.stack( + [eq["lambda_format"](X) for eq in best_equation], axis=1 + ) + return best_equation["lambda_format"](X) + except Exception as error: + raise ValueError( + "Failed to evaluate the expression. " + "If you are using a custom operator, make sure to define it in :param`extra_sympy_mappings`, " + "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`." + ) from error + + def sympy(self, index=None): + """ + Return sympy representation of the equation(s) chosen by `model_selection`. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from + `self.equations_`, give the index number here. This overrides + the `model_selection` parameter. + + Returns + ------- + best_equation : str, list[str] of length nout_ + SymPy representation of the best equation. + """ + self.refresh() + best_equation = self.get_best(index=index) + if self.nout_ > 1: + return [eq["sympy_format"] for eq in best_equation] + return best_equation["sympy_format"] + + def latex(self, index=None): + """ + Return latex representation of the equation(s) chosen by `model_selection`. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from + `self.equations_`, give the index number here. This overrides + the `model_selection` parameter. + + Returns + ------- + best_equation : str or list[str] of length nout_ + LaTeX expression of the best equation. + """ + self.refresh() + sympy_representation = self.sympy(index=index) + if self.nout_ > 1: + return [sympy.latex(s) for s in sympy_representation] + return sympy.latex(sympy_representation) + + def jax(self, index=None): + """ + Return jax representation of the equation(s) chosen by `model_selection`. + + Each equation (multiple given if there are multiple outputs) is a dictionary + containing {"callable": func, "parameters": params}. To call `func`, pass + func(X, params). This function is differentiable using `jax.grad`. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from + `self.equations_`, give the row number here. This overrides + the `model_selection` parameter. + + Returns + ------- + best_equation : dict[str, Any] + Dictionary of callable jax function in "callable" key, + and jax array of parameters as "parameters" key. + """ + self.set_params(output_jax_format=True) + self.refresh() + best_equation = self.get_best(index=index) + if self.nout_ > 1: + return [eq["jax_format"] for eq in best_equation] + return best_equation["jax_format"] + + def pytorch(self, index=None): + """ + Return pytorch representation of the equation(s) chosen by `model_selection`. + + Each equation (multiple given if there are multiple outputs) is a PyTorch module + containing the parameters as trainable attributes. You can use the module like + any other PyTorch module: `module(X)`, where `X` is a tensor with the same + column ordering as trained with. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from + `self.equations_`, give the row number here. This overrides + the `model_selection` parameter. + + Returns + ------- + best_equation : torch.nn.Module + PyTorch module representing the expression. + """ + self.set_params(output_torch_format=True) + self.refresh() + best_equation = self.get_best(index=index) + if self.nout_ > 1: + return [eq["torch_format"] for eq in best_equation] + return best_equation["torch_format"] def get_hof(self): """Get the equations from a hall of fame file. If no arguments entered, the ones used previously from a call to PySR will be used.""" - + check_is_fitted( + self, + attributes=[ + "nout_", + "equation_file_", + "selection_mask_", + "feature_names_in_", + ], + ) try: - if self.multioutput: + if self.nout_ > 1: all_outputs = [] - for i in range(1, self.nout + 1): + for i in range(1, self.nout_ + 1): df = pd.read_csv( - str(self.equation_file) + f".out{i}" + ".bkup", + str(self.equation_file_) + f".out{i}" + ".bkup", sep="|", ) # Rename Complexity column to complexity: @@ -1452,7 +1803,7 @@ def get_hof(self): all_outputs.append(df) else: - all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")] + all_outputs = [pd.read_csv(str(self.equation_file_) + ".bkup", sep="|")] all_outputs[-1].rename( columns={ "Complexity": "complexity", @@ -1463,9 +1814,33 @@ def get_hof(self): ) except FileNotFoundError: raise RuntimeError( - "Couldn't find equation file! The equation search likely exited before a single iteration completed." + "Couldn't find equation file! The equation search likely exited " + "before a single iteration completed." ) + # It is expected extra_jax/torch_mappings will be updated after fit. + # Thus, validation is performed here instead of in _validate_init_params + extra_jax_mappings = self.extra_jax_mappings + extra_torch_mappings = self.extra_torch_mappings + if extra_jax_mappings is not None: + for value in extra_jax_mappings.values(): + if not isinstance(value, str): + raise ValueError( + "extra_jax_mappings must have keys that are strings! " + "e.g., {sympy.sqrt: 'jnp.sqrt'}." + ) + else: + extra_jax_mappings = {} + if extra_torch_mappings is not None: + for value in extra_jax_mappings.values(): + if not callable(value): + raise ValueError( + "extra_torch_mappings must be callable functions! " + "e.g., {sympy.sqrt: torch.sqrt}." + ) + else: + extra_torch_mappings = {} + ret_outputs = [] for output in all_outputs: @@ -1479,20 +1854,14 @@ def get_hof(self): jax_format = [] if self.output_torch_format: torch_format = [] - use_custom_variable_names = len(self.variable_names) != 0 local_sympy_mappings = { - **self.extra_sympy_mappings, + **(self.extra_sympy_mappings if self.extra_sympy_mappings else {}), **sympy_mappings, } - if use_custom_variable_names: - sympy_symbols = [ - sympy.Symbol(self.variable_names[i]) for i in range(self.n_features) - ] - else: - sympy_symbols = [ - sympy.Symbol("x%d" % i) for i in range(self.n_features) - ] + sympy_symbols = [ + sympy.Symbol(variable) for variable in self.feature_names_in_ + ] for _, eqn_row in output.iterrows(): eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings) @@ -1501,7 +1870,7 @@ def get_hof(self): # Numpy: lambda_format.append( CallableEquation( - sympy_symbols, eqn, self.selection, self.variable_names + sympy_symbols, eqn, self.selection_mask_, self.feature_names_in_ ) ) @@ -1512,8 +1881,10 @@ def get_hof(self): func, params = sympy2jax( eqn, sympy_symbols, - selection=self.selection, - extra_jax_mappings=self.extra_jax_mappings, + selection=self.selection_mask_, + extra_jax_mappings=( + self.extra_jax_mappings if self.extra_jax_mappings else {} + ), ) jax_format.append({"callable": func, "parameters": params}) @@ -1524,8 +1895,12 @@ def get_hof(self): module = sympy2torch( eqn, sympy_symbols, - selection=self.selection, - extra_torch_mappings=self.extra_torch_mappings, + selection=self.selection_mask_, + extra_torch_mappings=( + self.extra_torch_mappings + if self.extra_torch_mappings + else {} + ), ) torch_format.append(module) @@ -1536,6 +1911,7 @@ def get_hof(self): cur_score = 0.0 else: if curMSE > 0.0: + # TODO Move this to more obvious function/file. cur_score = -np.log(curMSE / lastMSE) / ( curComplexity - lastComplexity ) @@ -1566,11 +1942,53 @@ def get_hof(self): ret_outputs.append(output[output_cols]) - if self.multioutput: + if self.nout_ > 1: return ret_outputs return ret_outputs[0] - def score(self, X, y): - del X - del y - raise NotImplementedError + +def _denoise(X, y, Xresampled=None, random_state=None): + """Denoise the dataset using a Gaussian process""" + from sklearn.gaussian_process import GaussianProcessRegressor + from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel + + gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() + gpr = GaussianProcessRegressor( + kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state + ) + gpr.fit(X, y) + if Xresampled is not None: + return Xresampled, gpr.predict(Xresampled) + + return X, gpr.predict(X) + + +# Function has not been removed only due to usage in module tests +def _handle_feature_selection(X, select_k_features, y, variable_names): + if select_k_features is not None: + selection = run_feature_selection(X, y, select_k_features) + print(f"Using features {[variable_names[i] for i in selection]}") + X = X[:, selection] + + else: + selection = None + return X, selection + + +def run_feature_selection(X, y, select_k_features, random_state=None): + """ + Use a gradient boosting tree regressor as a proxy for finding + the k most important features in X, returning indices for those + features as output. + """ + from sklearn.ensemble import RandomForestRegressor + from sklearn.feature_selection import SelectFromModel + + clf = RandomForestRegressor( + n_estimators=100, max_depth=3, random_state=random_state + ) + clf.fit(X, y) + selector = SelectFromModel( + clf, threshold=-np.inf, max_features=select_k_features, prefit=True + ) + return selector.get_support(indices=True) diff --git a/pysr/version.py b/pysr/version.py index 07e11953f..bc4255156 100644 --- a/pysr/version.py +++ b/pysr/version.py @@ -1,2 +1,2 @@ -__version__ = "0.8.7" -__symbolic_regression_jl_version__ = "0.9.4" +__version__ = "0.9.0" +__symbolic_regression_jl_version__ = "0.9.6" diff --git a/test/test.py b/test/test.py index d63e8b12f..b969f24dc 100644 --- a/test/test.py +++ b/test/test.py @@ -1,11 +1,11 @@ import inspect import unittest -from unittest.mock import patch import numpy as np +from sklearn import model_selection from pysr import PySRRegressor from pysr.sr import run_feature_selection, _handle_feature_selection +from sklearn.utils.estimator_checks import check_estimator import sympy -from sympy import lambdify import pandas as pd import warnings @@ -21,6 +21,7 @@ def setUp(self): inspect.signature(PySRRegressor.__init__).parameters["populations"].default ) self.default_test_kwargs = dict( + progress=False, model_selection="accuracy", niterations=default_niterations * 2, populations=default_populations * 2, @@ -30,17 +31,25 @@ def setUp(self): def test_linear_relation(self): y = self.X[:, 0] - model = PySRRegressor(**self.default_test_kwargs) + model = PySRRegressor( + **self.default_test_kwargs, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1", + ) model.fit(self.X, y) - print(model.equations) + print(model.equations_) self.assertLessEqual(model.get_best()["loss"], 1e-4) def test_multiprocessing(self): y = self.X[:, 0] - model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False) + model = PySRRegressor( + **self.default_test_kwargs, + procs=2, + multithreading=False, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1", + ) model.fit(self.X, y) - print(model.equations) - self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4) + print(model.equations_) + self.assertLessEqual(model.equations_.iloc[-1]["loss"], 1e-4) def test_multioutput_custom_operator_quiet_custom_complexity(self): y = self.X[:, [0, 1]] ** 2 @@ -55,11 +64,12 @@ def test_multioutput_custom_operator_quiet_custom_complexity(self): # Test custom operators with constraints: nested_constraints={"square_op": {"square_op": 3}}, constraints={"square_op": 10}, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3", ) model.fit(self.X, y) - equations = model.equations + equations = model.equations_ print(equations) - self.assertIn("square_op", model.equations[0].iloc[-1]["equation"]) + self.assertIn("square_op", model.equations_[0].iloc[-1]["equation"]) self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4) self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4) @@ -95,6 +105,7 @@ def test_multioutput_weighted_with_callable_temp_equation(self): procs=0, temp_equation_file=True, delete_tempfiles=False, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 2", ) model.fit(X.copy(), y, weights=w) @@ -117,27 +128,29 @@ def test_multioutput_weighted_with_callable_temp_equation(self): print("Model equations: ", model.sympy()[1]) print("True equation: x1^2") - def test_empty_operators_single_input_multirun(self): + def test_empty_operators_single_input_warm_start(self): X = self.rstate.randn(100, 1) y = X[:, 0] + 3.0 regressor = PySRRegressor( unary_operators=[], binary_operators=["plus"], **self.default_test_kwargs, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3", ) self.assertTrue("None" in regressor.__repr__()) regressor.fit(X, y) self.assertTrue("None" not in regressor.__repr__()) self.assertTrue(">>>>" in regressor.__repr__()) - self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4) + self.assertLessEqual(regressor.equations_.iloc[-1]["loss"], 1e-4) np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1) # Test if repeated fit works: - regressor.set_params(niterations=0) + regressor.set_params(niterations=0, warm_start=True, early_stop_condition=None) + # This should exit immediately, and use the old equations regressor.fit(X, y) - self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4) + self.assertLessEqual(regressor.equations_.iloc[-1]["loss"], 1e-4) np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1) # Tweak model selection: @@ -157,7 +170,11 @@ def test_noisy(self): **self.default_test_kwargs, procs=0, denoise=True, + early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2", ) + # We expect in this case that the "best" + # equation should be the right one: + model.set_params(model_selection="best") model.fit(self.X, y) self.assertLessEqual(model.get_best()[1]["loss"], 1e-2) self.assertLessEqual(model.get_best()[1]["loss"], 1e-2) @@ -188,11 +205,11 @@ def test_pandas_resample_with_nested_constraints(self): unary_operators=[], binary_operators=["+", "*", "/", "-"], **self.default_test_kwargs, - Xresampled=Xresampled, denoise=True, nested_constraints={"/": {"+": 1, "-": 1}, "+": {"*": 4}}, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-3 && complexity == 7", ) - model.fit(X, y) + model.fit(X, y, Xresampled=Xresampled) self.assertNotIn("unused_feature", model.latex()) self.assertIn("T", model.latex()) self.assertIn("x", model.latex()) @@ -217,18 +234,31 @@ def test_high_dim_selection_early_stop(self): unary_operators=["cos"], select_k_features=3, early_stop_condition=1e-4, # Stop once most accurate equation is <1e-4 MSE - Xresampled=Xresampled, maxsize=12, **self.default_test_kwargs, ) - model.fit(X, y) model.set_params(model_selection="accuracy") - model.predict(X) + model.fit(X, y, Xresampled=Xresampled) self.assertLess(np.average((model.predict(X) - y) ** 2), 1e-4) + # Again, but with numpy arrays: + model.fit(X.values, y.values, Xresampled=Xresampled.values) + self.assertLess(np.average((model.predict(X.values) - y.values) ** 2), 1e-4) class TestBest(unittest.TestCase): def setUp(self): + self.rstate = np.random.RandomState(0) + self.X = self.rstate.randn(10, 2) + self.y = np.cos(self.X[:, 0]) ** 2 + self.model = PySRRegressor( + progress=False, + niterations=1, + extra_sympy_mappings={}, + output_jax_format=False, + model_selection="accuracy", + equation_file="equation_file.csv", + ) + self.model.fit(self.X, self.y) equations = pd.DataFrame( { "equation": ["1.0", "cos(x0)", "square(cos(x0))"], @@ -241,17 +271,8 @@ def setUp(self): "equation_file.csv.bkup", sep="|" ) - self.model = PySRRegressor( - equation_file="equation_file.csv", - variable_names="x0 x1".split(" "), - extra_sympy_mappings={}, - output_jax_format=False, - model_selection="accuracy", - ) - self.model.n_features = 2 self.model.refresh() - self.equations = self.model.equations - self.rstate = np.random.RandomState(0) + self.equations_ = self.model.equations_ def test_best(self): self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2) @@ -266,9 +287,9 @@ def test_best_tex(self): self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}") def test_best_lambda(self): - X = self.rstate.randn(10, 2) - y = np.cos(X[:, 0]) ** 2 - for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]: + X = self.X + y = self.y + for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]: np.testing.assert_almost_equal(f(X), y, decimal=4) @@ -308,12 +329,12 @@ def test_deprecation(self): This should give a warning, and sets the correct value. """ - with self.assertWarns(UserWarning): + with self.assertWarns(FutureWarning): model = PySRRegressor(fractionReplaced=0.2) # This is a deprecated parameter, so we should get a warning. # The correct value should be set: - self.assertEqual(model.params["fraction_replaced"], 0.2) + self.assertEqual(model.fraction_replaced, 0.2) def test_size_warning(self): """Ensure that a warning is given for a large input size.""" @@ -336,3 +357,59 @@ def test_feature_warning(self): with self.assertRaises(Exception) as context: model.fit(X, y) self.assertIn("with 10 features or more", str(context.exception)) + + def test_deterministic_warnings(self): + """Ensure that warnings are given for determinism""" + model = PySRRegressor(random_state=0) + X = np.random.randn(100, 2) + y = np.random.randn(100) + with warnings.catch_warnings(): + warnings.simplefilter("error") + with self.assertRaises(Exception) as context: + model.fit(X, y) + self.assertIn("`deterministic`", str(context.exception)) + + def test_deterministic_errors(self): + """Setting deterministic without random_state should error""" + model = PySRRegressor(deterministic=True) + X = np.random.randn(100, 2) + y = np.random.randn(100) + with self.assertRaises(ValueError): + model.fit(X, y) + + def test_scikit_learn_compatibility(self): + """Test PySRRegressor compatibility with scikit-learn.""" + model = PySRRegressor( + max_evals=1000, + verbosity=0, + progress=False, + random_state=0, + deterministic=True, + procs=0, + multithreading=False, + warm_start=False, + ) # Return early. + + check_generator = check_estimator(model, generate_only=True) + exception_messages = [] + for (_, check) in check_generator: + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # To ensure an equation file is written for each output in + # nout, set stop condition to niterations=1 + if check.func.__name__ == "check_regressor_multioutput": + model.set_params(niterations=1, max_evals=None) + else: + model.set_params(max_evals=10000) + check(model) + print("Passed", check.func.__name__) + except Exception as e: + error_message = str(e) + exception_messages.append(f"{check.func.__name__}: {error_message}\n") + print("Failed", check.func.__name__, "with:") + # Add a leading tab to error message, which + # might be multi-line: + print("\n".join([(" " * 4) + row for row in error_message.split("\n")])) + # If any checks failed don't let the test pass. + self.assertEqual([], exception_messages) diff --git a/test/test_jax.py b/test/test_jax.py index 84c069b59..3fb8693bf 100644 --- a/test/test_jax.py +++ b/test/test_jax.py @@ -4,8 +4,8 @@ import pandas as pd from jax import numpy as jnp from jax import random -from jax import grad import sympy +from functools import partial class TestJAX(unittest.TestCase): @@ -21,8 +21,16 @@ def test_sympy2jax(self): f, params = sympy2jax(cosx, [x, y, z]) self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item()) - def test_pipeline(self): - X = np.random.randn(100, 10) + def test_pipeline_pandas(self): + X = pd.DataFrame(np.random.randn(100, 10)) + y = np.ones(X.shape[0]) + model = PySRRegressor( + progress=False, + max_evals=10000, + output_jax_format=True, + ) + model.fit(X, y) + equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], @@ -35,16 +43,34 @@ def test_pipeline(self): "equation_file.csv.bkup", sep="|" ) - model = PySRRegressor( - equation_file="equation_file.csv", - output_jax_format=True, - variable_names="x1 x2 x3".split(" "), + model.refresh(checkpoint_file="equation_file.csv") + jformat = model.jax() + + np.testing.assert_almost_equal( + np.array(jformat["callable"](jnp.array(X), jformat["parameters"])), + np.square(np.cos(X.values[:, 1])), # Select feature 1 + decimal=4, ) - model.selection = [1, 2, 3] - model.n_features = 3 - model.using_pandas = False - model.refresh() + def test_pipeline(self): + X = np.random.randn(100, 10) + y = np.ones(X.shape[0]) + model = PySRRegressor(progress=False, max_evals=10000, output_jax_format=True) + model.fit(X, y) + + equations = pd.DataFrame( + { + "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], + "MSE": [1.0, 0.1, 1e-5], + "Complexity": [1, 2, 3], + } + ) + + equations["Complexity MSE Equation".split(" ")].to_csv( + "equation_file.csv.bkup", sep="|" + ) + + model.refresh(checkpoint_file="equation_file.csv") jformat = model.jax() np.testing.assert_almost_equal( @@ -52,3 +78,24 @@ def test_pipeline(self): np.square(np.cos(X[:, 1])), # Select feature 1 decimal=4, ) + + def test_feature_selection(self): + X = pd.DataFrame({f"k{i}": np.random.randn(1000) for i in range(10, 21)}) + y = X["k15"] ** 2 + np.cos(X["k20"]) + + model = PySRRegressor( + progress=False, + unary_operators=["cos"], + select_k_features=3, + early_stop_condition=1e-5, + ) + model.fit(X.values, y.values) + f, parameters = model.jax().values() + + np_prediction = model.predict + jax_prediction = partial(f, parameters=parameters) + + np_output = np_prediction(X.values) + jax_output = jax_prediction(X.values) + + np.testing.assert_almost_equal(np_output, jax_output, decimal=4) diff --git a/test/test_torch.py b/test/test_torch.py index 74802f714..01c8d602d 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2,7 +2,20 @@ import numpy as np import pandas as pd from pysr import sympy2torch, PySRRegressor -import torch + +# Need to initialize Julia before importing torch... +import platform + +if platform.system() == "Darwin": + # Import PyJulia, then Torch + from pysr.julia_helpers import init_julia + + Main = init_julia() + import torch +else: + # Import Torch, then PyJulia + # https://github.com/pytorch/pytorch/issues/78829 + import torch import sympy @@ -13,6 +26,7 @@ def setUp(self): def test_sympy2torch(self): x, y, z = sympy.symbols("x y z") cosx = 1.0 * sympy.cos(x) + y + X = torch.tensor(np.random.randn(1000, 3)) true = 1.0 * torch.cos(X[:, 0]) + X[:, 1] torch_module = sympy2torch(cosx, [x, y, z]) @@ -20,8 +34,18 @@ def test_sympy2torch(self): np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy())) ) - def test_pipeline(self): - X = np.random.randn(100, 10) + def test_pipeline_pandas(self): + X = pd.DataFrame(np.random.randn(100, 10)) + y = np.ones(X.shape[0]) + model = PySRRegressor( + progress=False, + max_evals=10000, + model_selection="accuracy", + extra_sympy_mappings={}, + output_torch_format=True, + ) + model.fit(X, y) + equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], @@ -34,23 +58,47 @@ def test_pipeline(self): "equation_file.csv.bkup", sep="|" ) + model.refresh(checkpoint_file="equation_file.csv") + tformat = model.pytorch() + self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") + + np.testing.assert_almost_equal( + tformat(torch.tensor(X.values)).detach().numpy(), + np.square(np.cos(X.values[:, 1])), # Selection 1st feature + decimal=4, + ) + + def test_pipeline(self): + X = np.random.randn(100, 10) + y = np.ones(X.shape[0]) model = PySRRegressor( + progress=False, + max_evals=10000, model_selection="accuracy", - equation_file="equation_file.csv", - variable_names="x1 x2 x3".split(" "), - extra_sympy_mappings={}, output_torch_format=True, ) - model.selection = [1, 2, 3] - model.n_features = 2 # TODO: Why is this 2 and not 3? - model.using_pandas = False - model.refresh() + model.fit(X, y) + + equations = pd.DataFrame( + { + "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], + "MSE": [1.0, 0.1, 1e-5], + "Complexity": [1, 2, 3], + } + ) + + equations["Complexity MSE Equation".split(" ")].to_csv( + "equation_file.csv.bkup", sep="|" + ) + + model.refresh(checkpoint_file="equation_file.csv") tformat = model.pytorch() self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") + np.testing.assert_almost_equal( tformat(torch.tensor(X)).detach().numpy(), - np.square(np.cos(X[:, 1])), # Selection 1st feature + np.square(np.cos(X[:, 1])), # 2nd feature decimal=4, ) @@ -73,6 +121,14 @@ def test_mod_mapping(self): def test_custom_operator(self): X = np.random.randn(100, 3) + y = np.ones(X.shape[0]) + model = PySRRegressor( + progress=False, + max_evals=10000, + model_selection="accuracy", + output_torch_format=True, + ) + model.fit(X, y) equations = pd.DataFrame( { @@ -86,18 +142,12 @@ def test_custom_operator(self): "equation_file_custom_operator.csv.bkup", sep="|" ) - model = PySRRegressor( - model_selection="accuracy", + model.set_params( equation_file="equation_file_custom_operator.csv", - variable_names="x1 x2 x3".split(" "), extra_sympy_mappings={"mycustomoperator": sympy.sin}, extra_torch_mappings={"mycustomoperator": torch.sin}, - output_torch_format=True, ) - model.selection = [0, 1, 2] - model.n_features = 3 - model.using_pandas = False - model.refresh() + model.refresh(checkpoint_file="equation_file_custom_operator.csv") self.assertEqual(str(model.sympy()), "sin(x1)") # Will automatically use the set global state from get_hof. @@ -105,6 +155,25 @@ def test_custom_operator(self): self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))") np.testing.assert_almost_equal( tformat(torch.tensor(X)).detach().numpy(), - np.sin(X[:, 0]), # Selection 1st feature + np.sin(X[:, 1]), decimal=4, ) + + def test_feature_selection(self): + X = pd.DataFrame({f"k{i}": np.random.randn(1000) for i in range(10, 21)}) + y = X["k15"] ** 2 + np.cos(X["k20"]) + + model = PySRRegressor( + progress=False, + unary_operators=["cos"], + select_k_features=3, + early_stop_condition=1e-5, + ) + model.fit(X.values, y.values) + torch_module = model.pytorch() + + np_output = model.predict(X.values) + + torch_output = torch_module(torch.tensor(X.values)).detach().numpy() + + np.testing.assert_almost_equal(np_output, torch_output, decimal=4)