diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index c6247e86d..6c67627f0 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -90,7 +90,7 @@ jobs:
       - name: "Coveralls"
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}
+          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}-${{ matrix.test-id }}
           COVERALLS_PARALLEL: true
         run: coveralls --service=github
diff --git a/pyproject.toml b/pyproject.toml
index 52277bb16..1fa799e36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pysr"
-version = "0.18.4"
+version = "0.18.5"
 authors = [
     {name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
 ]
@@ -41,4 +41,5 @@ dev-dependencies = [
     "pandas-stubs>=2.2.1.240316",
     "types-pytz>=2024.1.0.20240417",
     "types-openpyxl>=3.1.0.20240428",
+    "coverage>=7.5.3",
 ]
diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index db8de4ec1..045d79e30 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "=0.24.4"
+            "version": "=0.24.5"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
diff --git a/pysr/sr.py b/pysr/sr.py
index ed29f53ce..8ccfb79cc 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -1,8 +1,6 @@
 """Define the PySRRegressor scikit-learn interface."""
 
 import copy
-import difflib
-import inspect
 import os
 import pickle as pkl
 import re
@@ -57,6 +55,7 @@
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
+    _suggest_keywords,
 )
 
 ALREADY_RAN = False
@@ -122,7 +121,7 @@ def _maybe_create_inline_operators(
                 "and underscores are allowed."
             )
         if (extra_sympy_mappings is None) or (
-            not function_name in extra_sympy_mappings
+            function_name not in extra_sympy_mappings
         ):
             raise ValueError(
                 f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
@@ -139,6 +138,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
    weights,
    y,
    X_units,
@@ -163,6 +163,13 @@ def _check_assertions(
                 "and underscores are allowed."
             )
             assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -333,19 +340,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `idx` argument to the function, which is `nothing`
         for non-batched, and a 1D array of indices for batched.
         Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
         If you would like to use a complexity other than 1 for an
         operator, specify the complexity here. For example,
         `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
         of the `sin` operator, and a complexity of 1 for each use of
         the `+` operator (which is the default). You may specify real
         numbers for a complexity, and the total complexity of a tree
         will be rounded to the nearest integer after computing.
         Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
         Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with the keyword `complexity_of_variables`. You cannot use both
+        options at once. Default is `1`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
@@ -691,6 +701,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -722,7 +733,7 @@ def __init__(
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1344,13 +1355,22 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
         return param_container
 
     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1375,6 +1395,8 @@ def _validate_and_set_fit_params(
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1422,6 +1444,22 @@
                     "Please use valid names instead."
                 )
 
+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set a global default, OR use `fit` to set it "
+                "for each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
@@ -1452,10 +1490,20 @@
         else:
             raise NotImplementedError("y shape not supported!")
 
+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)
 
-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )
 
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1471,6 +1519,7 @@ def _pre_transform_training_data(
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1493,6 +1542,8 @@
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1543,6 +1594,14 @@
             ],
         )
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
         if X_units is not None:
             X_units = cast(
                 ArrayLike[str],
@@ -1567,7 +1626,7 @@
         else:
             X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units
 
     def _run(
         self,
@@ -1624,6 +1683,7 @@
 
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
         cluster_manager = self.cluster_manager
 
         # Start julia backend processes
@@ -1668,6 +1728,9 @@
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         # TODO: Refactor this into helper function
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
@@ -1726,7 +1789,7 @@
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1871,6 +1934,9 @@ def fit(
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1931,6 +1997,7 @@
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None
 
@@ -1944,10 +2011,18 @@
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1965,8 +2040,17 @@
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1993,6 +2077,7 @@
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
@@ -2465,16 +2550,6 @@
     return with_preamble(table_string)
 
 
-def _suggest_keywords(cls, k: str) -> List[str]:
-    valid_keywords = [
-        param
-        for param in inspect.signature(cls.__init__).parameters
-        if param not in ["self", "kwargs"]
-    ]
-    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
-    return suggestions
-
-
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
diff --git a/pysr/test/params.py b/pysr/test/params.py
index 9850c9cdf..54da4ac7d 100644
--- a/pysr/test/params.py
+++ b/pysr/test/params.py
@@ -1,6 +1,6 @@
 import inspect
 
-from .. import PySRRegressor
+from pysr import PySRRegressor
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
diff --git a/pysr/test/test.py b/pysr/test/test.py
index 586c62e90..5d80b7a24 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -11,17 +11,18 @@
 import sympy
 from sklearn.utils.estimator_checks import check_estimator
 
-from .. import PySRRegressor, install, jl
-from ..export_latex import sympy2latex
-from ..feature_selection import _handle_feature_selection, run_feature_selection
-from ..julia_helpers import init_julia
-from ..sr import (
+from pysr import PySRRegressor, install, jl
+from pysr.export_latex import sympy2latex
+from pysr.feature_selection import _handle_feature_selection, run_feature_selection
+from pysr.julia_helpers import init_julia
+from pysr.sr import (
     _check_assertions,
     _process_constraints,
     _suggest_keywords,
     idx_model_selection,
 )
-from ..utils import _csv_filename_to_pkl_filename
+from pysr.utils import _csv_filename_to_pkl_filename
+
 from .params import (
     DEFAULT_NCYCLES,
     DEFAULT_NITERATIONS,
@@ -29,6 +30,11 @@
     DEFAULT_POPULATIONS,
 )
 
+# Disables local saving:
+os.environ["SYMBOLIC_REGRESSION_IS_TESTING"] = os.environ.get(
+    "SYMBOLIC_REGRESSION_IS_TESTING", "true"
+)
+
 
 class TestPipeline(unittest.TestCase):
     def setUp(self):
@@ -176,6 +182,63 @@ def test_multioutput_custom_operator_quiet_custom_complexity(self):
         self.assertLessEqual(mse1, 1e-4)
         self.assertLessEqual(mse2, 1e-4)
 
+    def test_custom_variable_complexity(self):
+        for outer in (True, False):
+            for case in (1, 2):
+                y = self.X[:, [0, 1]]
+                if case == 1:
+                    kwargs = dict(complexity_of_variables=[2, 3])
+                elif case == 2:
+                    kwargs = dict(complexity_of_variables=2)
+
+                if outer:
+                    outer_kwargs = kwargs
+                    inner_kwargs = dict()
+                else:
+                    outer_kwargs = dict()
+                    inner_kwargs = kwargs
+
+                model = PySRRegressor(
+                    binary_operators=["+"],
+                    verbosity=0,
+                    **self.default_test_kwargs,
+                    early_stop_condition=(
+                        f"stop_if_{case}(l, c) = l < 1e-8 && c <= {3 if case == 1 else 2}"
+                    ),
+                    **outer_kwargs,
+                )
+                model.fit(self.X[:, [0, 1]], y, **inner_kwargs)
+                self.assertLessEqual(model.get_best()[0]["loss"], 1e-8)
+                self.assertLessEqual(model.get_best()[1]["loss"], 1e-8)
+
+                self.assertEqual(model.get_best()[0]["complexity"], 2)
+                self.assertEqual(
+                    model.get_best()[1]["complexity"], 3 if case == 1 else 2
+                )
+
+    def test_error_message_custom_variable_complexity(self):
+        X = np.ones((10, 2))
+        y = np.ones((10,))
+        model = PySRRegressor()
+        with self.assertRaises(ValueError) as cm:
+            model.fit(X, y, complexity_of_variables=[1, 2, 3])
+
+        self.assertIn(
+            "number of elements in `complexity_of_variables`", str(cm.exception)
+        )
+
+    def test_error_message_both_variable_complexity(self):
+        X = np.ones((10, 2))
+        y = np.ones((10,))
+        model = PySRRegressor(complexity_of_variables=[1, 2])
+        with self.assertRaises(ValueError) as cm:
+            model.fit(X, y, complexity_of_variables=[1, 2, 3])
+
+        self.assertIn(
+            "You cannot set `complexity_of_variables` at both `fit` and `__init__`.",
+            str(cm.exception),
+        )
+
     def test_multioutput_weighted_with_callable_temp_equation(self):
         X = self.X.copy()
         y = X[:, [0, 1]] ** 2
@@ -313,7 +376,10 @@ def test_pandas_resample_with_nested_constraints(self):
                 "unused_feature": self.rstate.randn(500),
             }
         )
-        true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
+
+        def true_fn(x):
+            return np.array(x["T"] + x["x"] ** 2 + 1.323837)
+
         y = true_fn(X)
         noise = self.rstate.randn(500) * 0.01
         y = y + noise
@@ -372,13 +438,12 @@ def test_high_dim_selection_early_stop(self):
 
     def test_load_model(self):
         """See if we can load a ran model from the equation file."""
-        csv_file_data = """
-        Complexity,Loss,Equation
+        csv_file_data = """Complexity,Loss,Equation
         1,0.19951081,"1.9762075"
         3,0.12717344,"(f0 + 1.4724599)"
         4,0.104823045,"pow_abs(2.2683423, cos(f3))\""""
         # Strip the indents:
-        csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")])
+        csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")])
 
         for from_backup in [False, True]:
             rand_dir = Path(tempfile.mkdtemp())
@@ -430,7 +495,7 @@ def test_load_model_simple(self):
             if os.path.exists(file_to_delete):
                 os.remove(file_to_delete)
 
-            pickle_file = rand_dir / "equations.pkl"
+            # pickle_file = rand_dir / "equations.pkl"
             model3 = PySRRegressor.from_file(
                 model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
             )
@@ -1081,8 +1146,14 @@ def test_unit_checks(self):
         """This just checks the number of units passed"""
         use_custom_variable_names = False
         variable_names = None
+        complexity_of_variables = 1
         weights = None
-        args = (use_custom_variable_names, variable_names, weights)
+        args = (
+            use_custom_variable_names,
+            variable_names,
+            complexity_of_variables,
+            weights,
+        )
         valid_units = [
             (np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
             (np.ones((10, 1)), np.ones(10), ["m/s"], None),
diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py
index 5e4e5ef18..0a3712019 100644
--- a/pysr/test/test_jax.py
+++ b/pysr/test/test_jax.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import sympy
 
-from .. import PySRRegressor, sympy2jax
+from pysr import PySRRegressor, sympy2jax
 
 
 class TestJAX(unittest.TestCase):
@@ -89,7 +89,10 @@ def test_pipeline(self):
     def test_feature_selection_custom_operators(self):
         rstate = np.random.RandomState(0)
         X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
-        cos_approx = lambda x: 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
+
+        def cos_approx(x):
+            return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
+
         y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
 
         model = PySRRegressor(
diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py
index 8a93ad3a3..8bc78a6c1 100644
--- a/pysr/test/test_startup.py
+++ b/pysr/test/test_startup.py
@@ -9,8 +9,9 @@
 
 import numpy as np
 
-from .. import PySRRegressor
-from ..julia_import import jl_version
+from pysr import PySRRegressor
+from pysr.julia_import import jl_version
+
 from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS
 
 
diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py
index 35055c6a0..f9318fa37 100644
--- a/pysr/test/test_torch.py
+++ b/pysr/test/test_torch.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import sympy
 
-from .. import PySRRegressor, sympy2torch
+from pysr import PySRRegressor, sympy2torch
 
 
 class TestTorch(unittest.TestCase):
diff --git a/pysr/utils.py b/pysr/utils.py
index 2cceb5078..de7faf16e 100644
--- a/pysr/utils.py
+++ b/pysr/utils.py
@@ -1,3 +1,5 @@
+import difflib
+import inspect
 import os
 import re
 from pathlib import Path
@@ -61,3 +63,13 @@ def _subscriptify(i: int) -> str:
     For example, 123 -> "₁₂₃".
     """
     return "".join([chr(0x2080 + int(c)) for c in str(i)])
+
+
+def _suggest_keywords(cls, k: str) -> List[str]:
+    valid_keywords = [
+        param
+        for param in inspect.signature(cls.__init__).parameters
+        if param not in ["self", "kwargs"]
+    ]
+    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
+    return suggestions
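
For quick reference, here is a minimal usage sketch of the behavior this patch introduces, mirroring the new `test_custom_variable_complexity` test; the operator set, toy data, and comments below are illustrative choices, not prescribed by the patch:

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(100, 2)
y = X[:, 0] + X[:, 1]

# Option 1: a single global complexity for all variables, set at __init__
# (previously the only supported form):
model = PySRRegressor(binary_operators=["+"], complexity_of_variables=2)
model.fit(X, y)

# Option 2 (new in this patch): one complexity per feature, passed to
# fit() as a list with one entry per column of X. Here x0 costs 2
# complexity units and x1 costs 3:
model = PySRRegressor(binary_operators=["+"])
model.fit(X, y, complexity_of_variables=[2, 3])

# Setting it in both places raises a ValueError, as does passing a list
# whose length differs from the number of features in X.
```

The two entry points are kept mutually exclusive so that a stale global default can never silently override a per-variable list. After fitting, the new attribute `complexity_of_variables_` holds whichever value was actually used, subset by the feature-selection mask when `select_k_features` is enabled.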