diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8e8af8a3b..d79c89179 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,6 +14,9 @@ on: paths: - '**' +permissions: + contents: write + jobs: test: runs-on: ${{ matrix.os }} @@ -30,8 +33,8 @@ jobs: os: [ubuntu-latest] test-id: [main] include: - - julia-version: '1.6' - python-version: '3.8' + - julia-version: '1.10' + python-version: '3.10' os: ubuntu-latest test-id: include - julia-version: '1' @@ -58,26 +61,18 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Assert Julia version" if: ${{ matrix.julia-version != '1'}} run: python3 -c "from pysr import jl; assert jl.VERSION.major == jl.seval('v\"${{ matrix.julia-version }}\"').major; assert jl.VERSION.minor == jl.seval('v\"${{ matrix.julia-version }}\"').minor" - - name: "Install test dependencies" - run: pip install coverage coveralls pytest nbval - name: "Set up coverage for subprocesses" run: echo 'import coverage; coverage.process_startup()' > "${{ github.workspace }}/sitecustomize.py" - name: "Run tests" run: coverage run -m pysr test main,cli,startup - - name: "Install JAX" - run: pip install jax jaxlib # (optional import) - if: ${{ matrix.test-id == 'main' }} - name: "Run JAX tests" run: coverage run --append -m pysr test jax if: ${{ matrix.test-id == 'main' }} - - name: "Install Torch" - run: pip install torch # (optional import) - if: ${{ matrix.test-id == 'main' }} - name: "Run Torch tests" run: coverage run --append -m pysr test torch if: ${{ matrix.test-id == 'main' }} @@ -103,15 +98,15 @@ jobs: julia-version: ['1'] include: - os: ubuntu-latest - python-version: '3.8' - julia-version: '1.6' + python-version: '3.10' + julia-version: '1.10' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install . + pip install '.[dev]' - name: "Run development test" run: PYSR_TEST_JULIA_VERSION=${{ matrix.julia-version }} PYSR_TEST_PYTHON_VERSION=${{ matrix.python-version }} python -m pysr test dev @@ -137,7 +132,7 @@ jobs: - name: "Set up Conda" uses: conda-incubator/setup-miniconda@v3 with: - miniforge-variant: Mambaforge + miniforge-variant: Miniforge3 miniforge-version: latest auto-activate-base: true python-version: ${{ matrix.python-version }} @@ -182,7 +177,7 @@ jobs: matrix: python-version: - '3.12' - - '3.8' + - '3.10' os: ['ubuntu-latest'] steps: @@ -195,14 +190,37 @@ jobs: - name: "Install PySR and all dependencies" run: | python -m pip install --upgrade pip - pip install . - pip install mypy - - name: "Install additional dependencies" - run: python -m pip install jax jaxlib torch paddlepaddle - if: ${{ matrix.python-version != '3.8' }} + pip install '.[dev]' - name: "Run mypy" run: python -m mypy --install-types --non-interactive pysr - if: ${{ matrix.python-version != '3.8' }} + if: ${{ matrix.python-version != '3.10' }} - name: "Run compatible mypy" run: python -m mypy --ignore-missing-imports pysr - if: ${{ matrix.python-version == '3.8' }} + if: ${{ matrix.python-version == '3.10' }} + + beartype: + name: Test with beartype + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + env: + PYSR_USE_BEARTYPE: "1" + strategy: + matrix: + python-version: ['3.12'] + os: ['ubuntu-latest'] + + steps: + - uses: actions/checkout@v4 + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: "Install PySR and all dependencies" + run: | + python -m pip install --upgrade pip + pip install '.[dev]' + - name: "Run tests" + run: python -m pysr test main,jax,torch diff --git a/.github/workflows/CI_Windows.yml b/.github/workflows/CI_Windows.yml index 81fe72730..bfd8df5b4 100644 --- a/.github/workflows/CI_Windows.yml +++ b/.github/workflows/CI_Windows.yml @@ -46,8 +46,7 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval "numpy<2.0.0" - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Run tests" run: | diff --git a/.github/workflows/CI_apptainer.yml b/.github/workflows/CI_apptainer.yml index e128de233..76583b656 100644 --- a/.github/workflows/CI_apptainer.yml +++ b/.github/workflows/CI_apptainer.yml @@ -31,10 +31,10 @@ jobs: with: apptainer-version: 1.3.0 - name: Build apptainer - run: apptainer build --notest pysr.sif Apptainer.def + run: sudo apptainer build --notest pysr.sif Apptainer.def - name: Test apptainer run: | TMPDIR=$(mktemp -d) cp pysr.sif $TMPDIR cd $TMPDIR - apptainer test ./pysr.sif + sudo apptainer test ./pysr.sif diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml index 98d064b93..52407c58f 100644 --- a/.github/workflows/CI_conda_forge.yml +++ b/.github/workflows/CI_conda_forge.yml @@ -20,20 +20,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.10', '3.11', '3.12'] os: ['ubuntu-latest'] use-mamba: [true, false] include: - - python-version: 3.9 + - python-version: '3.10' os: 'windows-latest' use-mamba: true - - python-version: 3.12 + - python-version: '3.12' os: 'windows-latest' use-mamba: true - - python-version: 3.9 + - python-version: '3.10' os: 'macos-latest' use-mamba: true - - python-version: 3.12 + - python-version: '3.12' os: 'macos-latest' use-mamba: true @@ -41,7 +41,7 @@ jobs: - name: "Set up Conda" uses: conda-incubator/setup-miniconda@v3 with: - miniforge-variant: Mambaforge + miniforge-variant: Miniforge3 miniforge-version: latest auto-activate-base: true python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml index 185383ccf..4e6dbf8ba 100644 --- a/.github/workflows/CI_docker_large_nightly.yml +++ b/.github/workflows/CI_docker_large_nightly.yml @@ -18,8 +18,8 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1.6', '1'] - python-version: ['3.8', '3.12'] + julia-version: ['1.10', '1'] + python-version: ['3.10', '3.12'] os: [ubuntu-latest] arch: ['linux/amd64', 'linux/arm64'] diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index cbd9a7ef3..4598468bf 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -23,8 +23,8 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1.6', '1.8', '1.10'] - python-version: ['3.8', '3.10', '3.12'] + julia-version: ['1.10', '1'] + python-version: ['3.10', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: @@ -40,8 +40,7 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Assert Julia version" if: ${{ matrix.julia-version != '1'}} diff --git a/.github/workflows/CI_mac.yml b/.github/workflows/CI_mac.yml index 4cae23845..4433ab3d0 100644 --- a/.github/workflows/CI_mac.yml +++ b/.github/workflows/CI_mac.yml @@ -46,18 +46,13 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Run tests" run: | python -m pysr test main,cli,startup - - name: "Install JAX" - run: pip install jax jaxlib # (optional import) - name: "Run JAX tests" run: python -m pysr test jax - - name: "Install Torch" - run: pip install torch # (optional import) - name: "Run Torch tests" run: python -m pysr test torch - name: "Install Paddle" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index af36db783..c277815c3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,7 +25,7 @@ jobs: - name: "Set up Python" uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.12 cache: pip - name: "Install packages for docs building" run: pip install -r docs/requirements.txt @@ -33,5 +33,18 @@ jobs: run: pip install . && python -c 'import pysr' - name: "Build API docs" run: cd docs && ./gen_docs.sh - - name: "Deploy documentation" + - name: "Deploy documentation to primary repository" run: mkdocs gh-deploy --force + - name: "Deploy documentation to secondary repository" + env: + DEPLOY_KEY: ${{ secrets.DAMTP_DEPLOY_KEY }} + run: | + # Set up SSH key for authentication + mkdir -p ~/.ssh + echo "$DEPLOY_KEY" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan github.com >> ~/.ssh/known_hosts + + git checkout gh-pages + git remote add secondary git@github.com:ai-damtp-cam-ac-uk/pysr.git + git push secondary gh-pages --force diff --git a/.gitignore b/.gitignore index c40841695..c864e9903 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,5 @@ site venv requirements-dev.lock requirements.lock +outputs +.mypy_cache diff --git a/Apptainer.def b/Apptainer.def index 962f81687..baad8a4c8 100644 --- a/Apptainer.def +++ b/Apptainer.def @@ -1,10 +1,10 @@ # Build an Apptainer SIF file containing a working copy of PySR and its prereqs Bootstrap: docker -From: julia:1.10.4-bullseye +From: julia:1.11.1-bullseye Stage: jl Bootstrap: docker -From: python:3.12-bullseye +From: python:3.12.6-bullseye Stage: runtime %environment @@ -19,7 +19,6 @@ Stage: runtime /usr/local/julia /usr/local/julia %files - ./requirements.txt /pysr/requirements.txt ./pyproject.toml /pysr/pyproject.toml ./setup.py /pysr/setup.py ./pysr /pysr/pysr diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 409cb7f1c..b7e245b59 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -6,7 +6,7 @@ In this guide you will get an overview of the contribution workflow from opening ## New contributor guide -To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://astroautomata.com/PySR/) give additional information. +To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://ai.damtp.cam.ac.uk/pysr/) give additional information. Here are some resources to help you get started with open source contributions in general: - [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github) @@ -39,7 +39,7 @@ Scan through our [existing issues](https://github.com/MilesCranmer/PySR/issues) 2. Create a working branch and start with your changes! 3. (Optional) If you would like to make changes to PySR itself, skip to step 4. However, if you are interested in making changes to the _symbolic regression code_ itself, -check out the [guide](https://astroautomata.com/PySR/backend/) on modifying a custom SymbolicRegression.jl library. +check out the [guide](https://ai.damtp.cam.ac.uk/pysr/backend/) on modifying a custom SymbolicRegression.jl library. In this case, you might instead be interested in making suggestions to the [SymbolicRegression.jl](http://github.com/MilesCranmer/SymbolicRegression.jl) library. 4. You can install your local version of PySR with `python setup.py install`, and run tests with `python -m pysr test main`. diff --git a/Dockerfile b/Dockerfile index ed0b5f891..a446f3278 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ # This builds a dockerfile containing a working copy of PySR # with all pre-requisites installed. -ARG JLVERSION=1.10.4 -ARG PYVERSION=3.12.2 +ARG JLVERSION=1.11.1 +ARG PYVERSION=3.12.6 ARG BASE_IMAGE=bullseye FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl @@ -17,10 +17,6 @@ RUN pip install --no-cache-dir ipython matplotlib WORKDIR /pysr -# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) -ADD ./requirements.txt /pysr/requirements.txt -RUN pip3 install --no-cache-dir -r /pysr/requirements.txt - # Install PySR: # We do a minimal copy so it doesn't need to rerun at every file change: ADD ./pyproject.toml /pysr/pyproject.toml diff --git a/README.md b/README.md index aa1801933..299f019a9 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ https://github.com/MilesCranmer/PySR/assets/7593028/c8511a49-b408-488f-8f18-b174 | **Docs** | **Forums** | **Paper** | **colab demo** | |:---:|:---:|:---:|:---:| -|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://astroautomata.com/PySR/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)| +|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://ai.damtp.cam.ac.uk/pysr/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)| | **pip** | **conda** | **Stats** | | :---: | :---: | :---: | @@ -20,14 +20,14 @@ https://github.com/MilesCranmer/PySR/assets/7593028/c8511a49-b408-488f-8f18-b174 If you find PySR useful, please cite the paper [arXiv:2305.01582](https://arxiv.org/abs/2305.01582). -If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://astroautomata.com/PySR/papers)! +If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://ai.damtp.cam.ac.uk/pysr/papers)! **Contents**: - [Why PySR?](#why-pysr) - [Installation](#installation) - [Quickstart](#quickstart) -- [→ Documentation](https://astroautomata.com/PySR) +- [→ Documentation](https://ai.damtp.cam.ac.uk/PySR) - [Contributors](#contributors-)
@@ -178,6 +178,7 @@ PySR's main interface is in the style of scikit-learn: from pysr import PySRRegressor model = PySRRegressor( + maxsize=20, niterations=40, # < Increase me for better results binary_operators=["+", "*"], unary_operators=[ @@ -254,22 +255,21 @@ model = PySRRegressor.from_file("hall_of_fame.2022-08-10_100832.281.pkl") There are several other useful features such as denoising (e.g., `denoise=True`), feature selection (e.g., `select_k_features=3`). -For examples of these and other features, see the [examples page](https://astroautomata.com/PySR/examples). -For a detailed look at more options, see the [options page](https://astroautomata.com/PySR/options). -You can also see the full API at [this page](https://astroautomata.com/PySR/api). -There are also tips for tuning PySR on [this page](https://astroautomata.com/PySR/tuning). +For examples of these and other features, see the [examples page](https://ai.damtp.cam.ac.uk/pysr/examples). +For a detailed look at more options, see the [options page](https://ai.damtp.cam.ac.uk/pysr/options). +You can also see the full API at [this page](https://ai.damtp.cam.ac.uk/pysr/api). +There are also tips for tuning PySR on [this page](https://ai.damtp.cam.ac.uk/pysr/tuning). ### Detailed Example The following code makes use of as many PySR features as possible. Note that is just a demonstration of features and you should not use this example as-is. -For details on what each parameter does, check out the [API page](https://astroautomata.com/PySR/api/). +For details on what each parameter does, check out the [API page](https://ai.damtp.cam.ac.uk/pysr/api/). ```python model = PySRRegressor( - procs=4, populations=8, - # ^ 2 populations per core, so one is always running. + # ^ Assuming we have 4 cores, this means 2 populations per core, so one is always running. population_size=50, # ^ Slightly larger populations, for greater diversity. ncycles_per_iteration=500, diff --git a/docs/_api.md b/docs/_api.md index bbc6c1f2f..28cfa3155 100644 --- a/docs/_api.md +++ b/docs/_api.md @@ -1,4 +1,4 @@ -# PySRRegressor Reference +# API Reference `PySRRegressor` has many options for controlling a symbolic regression search. Let's look at them below. @@ -60,3 +60,43 @@ PARAMSKEY show_root_heading: true heading_level: 3 show_root_full_path: false + +## Expression Specifications + +::: pysr.ExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.TemplateExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.ParametricExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.AbstractExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +## Logger Specifications + +::: pysr.TensorBoardLoggerSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.AbstractLoggerSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false diff --git a/docs/backend.md b/docs/backend.md index b7575d143..092251fbe 100644 --- a/docs/backend.md +++ b/docs/backend.md @@ -33,9 +33,9 @@ The main search code can be found in `src/SymbolicRegression.jl`. Here are some tips: -- The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/). +- The documentation for the backend is given [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/). - Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time. -- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node). +- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node). - For reference, the main loop itself is found in the `equation_search` function inside [`src/SymbolicRegression.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl). - Parts of the code which are typically edited by users include: - [`src/CheckConstraints.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/CheckConstraints.jl), particularly the function `check_constraints`. This function checks whether a given expression satisfies constraints, such as having a complexity lower than `maxsize`, and whether it contains any forbidden nestings of functions. @@ -70,6 +70,6 @@ For more information on `juliapkg.json`, see [`pyjuliapkg`](https://github.com/J ## Additional notes -If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://astroautomata.com/SymbolicRegression.jl/dev/). +If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://ai.damtp.cam.ac.uk/symbolicregression/dev/). If you make a change that you think could be useful to other users, don't hesitate to open a pull request on either the PySR or SymbolicRegression.jl repositories! Contributions are very appreciated. diff --git a/docs/examples.md b/docs/examples.md index 754875e7c..6d841b0e0 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -345,7 +345,7 @@ a real number from the loss function). But, you don't need to worry about this, make sure to return a scalar number of type `L`. The `tree` argument is the current expression being evaluated. You can read -about the `tree` fields [here](https://astroautomata.com/SymbolicRegression.jl/stable/types/). +about the `tree` fields [here](https://ai.damtp.cam.ac.uk/symbolicregression/stable/types/). For example, let's fix a symbolic form of an expression, as a rational function. i.e., $P(X)/Q(X)$ for polynomials $P$ and $Q$. @@ -523,7 +523,188 @@ Note that this expression has a large dynamic range so may be difficult to find. Note that you can also search for exclusively dimensionless constants by settings `dimensionless_constants_only` to `true`. -## 11. Additional features +## 11. Expression Specifications + +PySR 1.0 introduces powerful expression specifications that allow you to define structured equations. Here are two examples: + +### Template Expressions + +`TemplateExpressionSpec` allows you to define a specific structure for the equation. +For example, let's say we want to learn an equation of the form: + +$$ y = \sin(f(x_1, x_2)) + g(x_3) $$ + +We can do this as follows: + +```python +import numpy as np +from pysr import PySRRegressor, TemplateExpressionSpec + +# Create data +X = np.random.randn(1000, 3) +y = np.sin(X[:, 0] + X[:, 1]) + X[:, 2]**2 + +# Define template: we want sin(f(x1, x2)) + g(x3) +template = TemplateExpressionSpec( + function_symbols=["f", "g"], + combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)", +) + +model = PySRRegressor( + expression_spec=template, + binary_operators=["+", "*", "-", "/"], + unary_operators=["sin"], + maxsize=10, +) +model.fit(X, y) +``` + +You can also use no argument-functions for learning constants, like: + +```python +template = TemplateExpressionSpec( + function_symbols=["a", "f"], + combine="((; a, f), (x, y)) -> a() * sin(f(x, y))", +) +``` + +### Parametric Expressions + +When your data has categories with shared equation structure but different parameters, +you can use a `ParametricExpressionSpec`. Let's say we would like to learn the expression: + +$$ y = \alpha \sin(x_1) + \beta $$ + +for three different values of $\alpha$ and $\beta$. + +```python +import numpy as np +from pysr import PySRRegressor, ParametricExpressionSpec + +# Create data with 3 categories +X = np.random.uniform(-3, 3, (1000, 2)) +category = np.random.randint(0, 3, 1000) + +# Parameters for each category +offsets = [0.1, 1.5, -0.5] +scales = [1.0, 2.0, 0.5] + +# y = scale[category] * sin(x1) + offset[category] +y = np.array([ + scales[c] * np.sin(x1) + offsets[c] + for x1, c in zip(X[:, 0], category) +]) + +model = PySRRegressor( + expression_spec=ParametricExpressionSpec(max_parameters=2), + binary_operators=["+", "*", "-", "/"], + unary_operators=["sin"], + maxsize=10, +) +model.fit(X, y, category=category) + +# Predicting on new data: +# model.predict(X_test, category=category_test) +``` + +See [Expression Specifications](/api/#expression-specifications) for more details. + +## 12. Using TensorBoard for Logging + +You can use TensorBoard to visualize the search progress, as well as +record hyperparameters and final metrics (like `min_loss` and `pareto_volume` - the latter of which +is a performance measure of the entire Pareto front). + +```python +import numpy as np +from pysr import PySRRegressor, TensorBoardLoggerSpec + +rstate = np.random.RandomState(42) + +# Uniform dist between -3 and 3: +X = rstate.uniform(-3, 3, (1000, 2)) +y = np.exp(X[:, 0]) + X[:, 1] + +# Create a logger that writes to "logs/run*": +logger_spec = TensorBoardLoggerSpec( + log_dir="logs/run", + log_interval=10, # Log every 10 iterations +) + +model = PySRRegressor( + binary_operators=["+", "*", "-", "/"], + logger_spec=logger_spec, +) +model.fit(X, y) +``` + +You can then view the logs with: + +```bash +tensorboard --logdir logs/ +``` + +## 13. Using differential operators + +As part of the `TemplateExpressionSpec` described above, +you can also use differential operators within the template. +The operator for this is `D` which takes an expression as the first argument, +and the argument _index_ we are differentiating as the second argument. +This lets you compute integrals via evolution. + +For example, let's say we wish to find the integral of $\frac{1}{x^2 \sqrt{x^2 - 1}}$ +in the range $x > 1$. +We can compute the derivative of a function $f(x)$, and compare that +to numerical samples of $\frac{1}{x^2\sqrt{x^2-1}}$. Then, by extension, +$f(x)$ represents the indefinite integral of it with some constant offset! + +```python +import numpy as np +from pysr import PySRRegressor, TemplateExpressionSpec + +x = np.random.uniform(1, 10, (1000,)) # Integrand sampling points +y = 1 / (x**2 * np.sqrt(x**2 - 1)) # Evaluation of the integrand + +expression_spec = TemplateExpressionSpec( + ["f"], + """ + function diff_f_x((; f), (x,)) + df = D(f, 1) # Symbolic derivative of f with respect to its first arg + return df(x) + end + """ +) + +model = PySRRegressor( + binary_operators=["+", "-", "*", "/"], + unary_operators=["sqrt"], + expression_spec=expression_spec, + maxsize=20, +) +model.fit(x[:, np.newaxis], y) +``` + +If everything works, you should find something that simplifies to $\frac{\sqrt{x^2 - 1}}{x}$. + +Here, we write out a full function in Julia. +But we can also do an anonymous function, like `((; f), (x,)) -> D(f, 1)(x)`. We can also avoid the fancy unpacking syntax and write: +`(nt, xs) -> D(nt.f, 1)(xs[1])` which is completely equivalent. Note that in Julia, +the following two syntaxes are equivalent: + +```julia +nt = (; f=1, g=2) # Create a "named tuple" +(; f, g) = nt +``` + +and + +```julia +f = nt.f +g = nt.g +``` + + +## 14. Additional features For the many other features available in PySR, please read the [Options section](options.md). diff --git a/docs/interactive-docs.md b/docs/interactive-docs.md deleted file mode 100644 index 23be57076..000000000 --- a/docs/interactive-docs.md +++ /dev/null @@ -1,11 +0,0 @@ -# Interactive Reference - - - -The following docs are interactive, and, based on your selections, -will create a snippet of Python code at the bottom which you can execute locally. -Clicking on each parameter's name will display a description. -Note that this is an incomplete list of options; for the full list, -see the [API Reference](api.md). - - diff --git a/docs/operators.md b/docs/operators.md index db538d8bc..b67c7b1b4 100644 --- a/docs/operators.md +++ b/docs/operators.md @@ -7,56 +7,32 @@ takes one or two scalars as input, and returns on scalar as output, is likely to be a valid operator[^1]. A selection of these and other valid operators are stated below. -**Binary** - -- `+` -- `-` -- `*` -- `/` -- `^` -- `max` -- `min` -- `mod` -- `cond` - - Equal to `(x, y) -> x > 0 ? y : 0` -- `greater` - - Equal to `(x, y) -> x > y ? 1 : 0` -- `logical_or` - - Equal to `(x, y) -> (x > 0 || y > 0) ? 1 : 0` -- `logical_and` - - Equal to `(x, y) -> (x > 0 && y > 0) ? 1 : 0` - -**Unary** - -- `neg` -- `square` -- `cube` -- `exp` -- `abs` -- `log` -- `log10` -- `log2` -- `log1p` -- `sqrt` -- `sin` -- `cos` -- `tan` -- `sinh` -- `cosh` -- `tanh` -- `atan` -- `asinh` -- `acosh` -- `atanh_clip` - - Equal to `atanh(mod(x + 1, 2) - 1)` -- `erf` -- `erfc` -- `gamma` -- `relu` -- `round` -- `floor` -- `ceil` -- `sign` +Also, note that it's a good idea to not use too many operators, since +it can exponentially increase the search space. + +**Binary Operators** + +| Arithmetic | Comparison | Logic | +|--------------|------------|----------| +| `+` | `max` | `logical_or`[^2] | +| `-` | `min` | `logical_and`[^3]| +| `*` | `greater`[^4] | | +| `/` | `cond`[^5] | | +| `^` | `mod` | | + +**Unary Operators** + +| Basic | Exp/Log | Trig | Hyperbolic | Special | Rounding | +|------------|------------|-----------|------------|-----------|------------| +| `neg` | `exp` | `sin` | `sinh` | `erf` | `round` | +| `square` | `log` | `cos` | `cosh` | `erfc` | `floor` | +| `cube` | `log10` | `tan` | `tanh` | `gamma` | `ceil` | +| `cbrt` | `log2` | `asin` | `asinh` | `relu` | | +| `sqrt` | `log1p` | `acos` | `acosh` | `sinc` | | +| `abs` | | `atan` | `atanh` | | | +| `sign` | | | | | | +| `inv` | | | | | | + ## Custom @@ -96,3 +72,7 @@ any invalid values over the training dataset. [^1]: However, you will need to define a sympy equivalent in `extra_sympy_mapping` if you want to use a function not in the above list. +[^2]: `logical_or` is equivalent to `(x, y) -> (x > 0 || y > 0) ? 1 : 0` +[^3]: `logical_and` is equivalent to `(x, y) -> (x > 0 && y > 0) ? 1 : 0` +[^4]: `greater` is equivalent to `(x, y) -> x > y ? 1 : 0` +[^5]: `cond` is equivalent to `(x, y) -> x > 0 ? y : 0` diff --git a/docs/options.md b/docs/options.md index 0ccacbcab..7a8f5c11e 100644 --- a/docs/options.md +++ b/docs/options.md @@ -276,7 +276,7 @@ model = PySRRegressor(..., weights=weights, elementwise_loss="myloss(x, y, w) = model.fit(..., weights=weights) ``` -Built-in loss (faster) (see [losses](https://astroautomata.com/SymbolicRegression.jl/dev/losses/)). +Built-in loss (faster) (see [losses](https://ai.damtp.cam.ac.uk/symbolicregression/dev/losses/)). This one computes the L3 norm: ```python diff --git a/docs/tuning.md b/docs/tuning.md index 455c21da4..56ef1e5da 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -8,10 +8,12 @@ When running PySR, I usually do the following: I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node of a slurm cluster. Passing `cluster_manager="slurm"` will make PySR set up a run over the entire allocation. I set `procs` equal to the total number of cores over my entire allocation. +I use the [tensorboard feature](https://ai.damtp.cam.ac.uk/pysr/examples/#12-using-tensorboard-for-logging) for experiment tracking. + [^1]: Jupyter Notebooks are supported by PySR, but miss out on some useful features available in IPython and Python: the progress bar, and early stopping with "q". In Jupyter you cannot interrupt a search once it has started; you have to restart the kernel. See [this issue](https://github.com/MilesCranmer/PySR/issues/260) for updates. -1. Use the default parameters. -2. Use only the operators I think it needs and no more. +1. I start by using the default parameters. +2. I use only the operators I think it needs and no more. 3. Increase `populations` to `3*num_cores`. 4. If my dataset is more than 1000 points, I either subsample it (low-dimensional and not much noise) or set `batching=True` (high-dimensional or very noisy, so it needs to evaluate on all the data). 5. While on a laptop or single node machine, you might leave the default `ncycles_per_iteration`, on a cluster with ~100 cores I like to set `ncycles_per_iteration` to maybe `5000` or so, until the head node occupation is under `10%`. (A larger value means the workers talk less frequently to eachother, which is useful when you have many workers!) @@ -20,7 +22,7 @@ I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node o 8. I typically don't use `maxdepth`, but if I do, I set it strictly, while also leaving a bit of room for exploration. e.g., if you want a final equation limited to a depth of `5`, you might set this to `6` or `7`, so that it has a bit of room to explore. 9. Set `parsimony` equal to about the minimum loss you would expect, divided by 5-10. e.g., if you expect the final equation to have a loss of `0.001`, you might set `parsimony=0.0001`. 10. Set `weight_optimize` to some larger value, maybe `0.001`. This is very important if `ncycles_per_iteration` is large, so that optimization happens more frequently. -11. Set `bumper` to `True`. This turns on bump allocation but is experimental. It should give you a nice 20% speedup. +11. Set `turbo` to `True`. This turns on advanced loop vectorization, but is still quite experimental. It should give you a nice 20% or more speedup. 12. For final runs, after I have tuned everything, I typically set `niterations` to some very large value, and just let it run for a week until my job finishes (genetic algorithms tend not to converge, they can look like they settle down, but then find a new family of expression, and explore a new space). If I am satisfied with the current equations (which are visible either in the terminal or in the saved csv file), I quit the job early. Since I am running in IPython, I can just hit `q` and then `` to stop the job, tweak the hyperparameters, and then start the search again. diff --git a/environment.yml b/environment.yml index c7d6ceebf..994390c98 100644 --- a/environment.yml +++ b/environment.yml @@ -2,10 +2,10 @@ name: test channels: - conda-forge dependencies: - - python>=3.8 + - python>=3.10 - sympy>=1.0.0,<2.0.0 - pandas>=0.21.0,<3.0.0 - numpy>=1.13.0,<2.0.0 - scikit-learn>=1.0.0,<2.0.0 - - pyjuliacall>=0.9.21,<0.9.22 + - pyjuliacall>=0.9.22,<0.9.23 - click>=7.0.0,<9.0.0 diff --git a/examples/pysr_demo.ipynb b/examples/pysr_demo.ipynb index 8822effec..fabf07971 100644 --- a/examples/pysr_demo.ipynb +++ b/examples/pysr_demo.ipynb @@ -321,7 +321,7 @@ "id": "qvgVbOoSFtQY" }, "source": [ - "A full list of operators is given here: https://astroautomata.com/PySR/operators,\n", + "A full list of operators is given here: https://ai.damtp.cam.ac.uk/pysr/operators,\n", "but we can also use any binary or unary operator in `julia`, or define our own as arbitrary functions.\n", "\n", "Say that we want a command to do quartic powers:\n", @@ -1498,7 +1498,7 @@ "id": "S5dO61g1bDhk" }, "source": [ - "The full list of PySR parameters can be found here: https://astroautomata.com/PySR/api" + "The full list of PySR parameters can be found here: https://ai.damtp.cam.ac.uk/pysr/api" ] } ], diff --git a/mkdocs.yml b/mkdocs.yml index cff11241c..f66046d00 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -33,10 +33,9 @@ nav: - papers.md - api-advanced.md - backend.md - - interactive-docs.md extra: - homepage: https://astroautomata.com/PySR + homepage: https://ai.damtp.cam.ac.uk/pysr extra_css: - stylesheets/extra.css diff --git a/pyproject.toml b/pyproject.toml index aa867188b..e379cdf78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,18 +4,49 @@ build-backend = "setuptools.build_meta" [project] name = "pysr" -version = "0.19.4" -authors = [{ name = "Miles Cranmer", email = "miles.cranmer@gmail.com" }] +version = "1.3.1" +authors = [ + {name = "Miles Cranmer", email = "miles.cranmer@gmail.com"}, +] description = "Simple and efficient symbolic regression" -readme = { file = "README.md", content-type = "text/markdown" } -license = { file = "LICENSE" } -requires-python = ">=3.8" +readme = {file = "README.md", content-type = "text/markdown"} +license = {file = "LICENSE"} +requires-python = ">=3.10" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", "License :: OSI Approved :: Apache Software License", ] -dynamic = ["dependencies"] +dependencies = [ + "sympy>=1.0.0,<2.0.0", + "pandas>=0.21.0,<3.0.0", + "numpy>=1.13.0,<3.0.0", + "scikit_learn>=1.0.0,<2.0.0", + "juliacall==0.9.23", + "click>=7.0.0,<9.0.0", + "setuptools>=50.0.0", +] + +[project.optional-dependencies] +dev = [ + "beartype>=0.19,<0.20", + "coverage>=7,<8", + "coveralls>=4,<5", + "ipykernel>=6,<7", + "ipython>=8,<9", + "jax[cpu]>=0.4,<0.5", + "jupyter>=1,<2", + "mypy>=1,<2", + "nbval>=0.11,<0.12", + "pandas-stubs", + "pre-commit>=3.7,<5", + "pytest-cov>=5,<7", + "pytest>=8,<9", + "tensorboard>=2,<3", + "torch>=2,<3", + "types-openpyxl", + "types-pytz", +] [tool.setuptools] packages = ["pysr", "pysr._cli", "pysr.test"] @@ -27,18 +58,3 @@ dependencies = { file = "requirements.txt" } [tool.isort] profile = "black" - -[tool.rye] -dev-dependencies = [ - "pre-commit>=3.7.0", - "ipython>=8.23.0", - "ipykernel>=6.29.4", - "mypy>=1.10.0", - "jax[cpu]>=0.4.26", - "torch>=2.3.0", - "paddlepaddle>=2.6.1", - "pandas-stubs>=2.2.1.240316", - "types-pytz>=2024.1.0.20240417", - "types-openpyxl>=3.1.0.20240428", - "coverage>=7.5.3", -] diff --git a/pysr/__init__.py b/pysr/__init__.py index 2b8ac5de3..9626c5ee8 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -1,3 +1,10 @@ +import os + +if os.environ.get("PYSR_USE_BEARTYPE", "0") == "1": + from beartype.claw import beartype_this_package + + beartype_this_package() + # This must be imported as early as possible to prevent # library linking issues caused by numpy/pytorch/etc. importing # old libraries: @@ -8,7 +15,14 @@ from .export_jax import sympy2jax from .export_paddle import sympy2paddle from .export_torch import sympy2torch +from .expression_specs import ( + AbstractExpressionSpec, + ExpressionSpec, + ParametricExpressionSpec, + TemplateExpressionSpec, +) from .julia_extensions import load_all_packages +from .logger_specs import AbstractLoggerSpec, TensorBoardLoggerSpec from .sr import PySRRegressor # This file is created by setuptools_scm during the build process: @@ -24,6 +38,12 @@ "install", "load_all_packages", "PySRRegressor", + "AbstractExpressionSpec", + "ExpressionSpec", + "TemplateExpressionSpec", + "ParametricExpressionSpec", + "AbstractLoggerSpec", + "TensorBoardLoggerSpec", "best", "best_callable", "best_row", diff --git a/pysr/denoising.py b/pysr/denoising.py index 5ab6a168e..92a640ee8 100644 --- a/pysr/denoising.py +++ b/pysr/denoising.py @@ -1,6 +1,6 @@ """Functions for denoising data during preprocessing.""" -from typing import Optional, Tuple, cast +from typing import cast import numpy as np from numpy import ndarray @@ -9,9 +9,9 @@ def denoise( X: ndarray, y: ndarray, - Xresampled: Optional[ndarray] = None, - random_state: Optional[np.random.RandomState] = None, -) -> Tuple[ndarray, ndarray]: + Xresampled: ndarray | None = None, + random_state: np.random.RandomState | None = None, +) -> tuple[ndarray, ndarray]: """Denoise the dataset using a Gaussian process.""" from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel @@ -31,8 +31,8 @@ def denoise( def multi_denoise( X: ndarray, y: ndarray, - Xresampled: Optional[ndarray] = None, - random_state: Optional[np.random.RandomState] = None, + Xresampled: ndarray | None = None, + random_state: np.random.RandomState | None = None, ): """Perform `denoise` along each column of `y` independently.""" y = np.stack( diff --git a/pysr/export.py b/pysr/export.py new file mode 100644 index 000000000..c1b589b84 --- /dev/null +++ b/pysr/export.py @@ -0,0 +1,88 @@ +import copy +from collections.abc import Callable + +import numpy as np +import pandas as pd +from numpy.typing import NDArray + +from .export_jax import sympy2jax +from .export_numpy import sympy2numpy +from .export_sympy import create_sympy_symbols, pysr2sympy +from .export_torch import sympy2torch +from .utils import ArrayLike + + +def add_export_formats( + output: pd.DataFrame, + *, + feature_names_in: ArrayLike[str], + selection_mask: NDArray[np.bool_] | None = None, + extra_sympy_mappings: dict[str, Callable] | None = None, + extra_torch_mappings: dict[Callable, Callable] | None = None, + output_torch_format: bool = False, + extra_jax_mappings: dict[Callable, str] | None = None, + output_jax_format: bool = False, +) -> pd.DataFrame: + """Create export formats for an equations dataframe. + + Returns a new dataframe containing only the exported formats. + """ + output = copy.deepcopy(output) + + sympy_format = [] + lambda_format = [] + jax_format = [] + torch_format = [] + + for _, eqn_row in output.iterrows(): + eqn = pysr2sympy( + eqn_row["equation"], + feature_names_in=feature_names_in, + extra_sympy_mappings=extra_sympy_mappings, + ) + sympy_format.append(eqn) + + # NumPy: + sympy_symbols = create_sympy_symbols(feature_names_in) + lambda_format.append( + sympy2numpy( + eqn, + sympy_symbols, + selection=selection_mask, + ) + ) + + # JAX: + if output_jax_format: + func, params = sympy2jax( + eqn, + sympy_symbols, + selection=selection_mask, + extra_jax_mappings=extra_jax_mappings, + ) + jax_format.append({"callable": func, "parameters": params}) + + # Torch: + if output_torch_format: + module = sympy2torch( + eqn, + sympy_symbols, + selection=selection_mask, + extra_torch_mappings=extra_torch_mappings, + ) + torch_format.append(module) + + exports = pd.DataFrame( + { + "sympy_format": sympy_format, + "lambda_format": lambda_format, + }, + index=output.index, + ) + + if output_jax_format: + exports["jax_format"] = jax_format + if output_torch_format: + exports["torch_format"] = torch_format + + return exports diff --git a/pysr/export_jax.py b/pysr/export_jax.py index ba36bfc66..4f8ead200 100644 --- a/pysr/export_jax.py +++ b/pysr/export_jax.py @@ -1,5 +1,6 @@ import numpy as np # noqa: F401 import sympy # type: ignore +from sympy.codegen.cfunctions import log2, log10 # type: ignore # Special since need to reduce arguments. MUL = 0 @@ -15,6 +16,8 @@ sympy.ceiling: "jnp.ceil", sympy.floor: "jnp.floor", sympy.log: "jnp.log", + log2: "jnp.log2", + log10: "jnp.log10", sympy.exp: "jnp.exp", sympy.sqrt: "jnp.sqrt", sympy.cos: "jnp.cos", diff --git a/pysr/export_latex.py b/pysr/export_latex.py index b7815d07c..55d24de0b 100644 --- a/pysr/export_latex.py +++ b/pysr/export_latex.py @@ -1,7 +1,5 @@ """Functions to help export PySR equations to LaTeX.""" -from typing import List, Optional, Tuple - import pandas as pd import sympy # type: ignore from sympy.printing.latex import LatexPrinter # type: ignore @@ -28,8 +26,8 @@ def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str: def generate_table_environment( - columns: List[str] = ["equation", "complexity", "loss"] -) -> Tuple[str, str]: + columns: list[str] = ["equation", "complexity", "loss"] +) -> tuple[str, str]: margins = "c" * len(columns) column_map = { "complexity": "Complexity", @@ -61,9 +59,9 @@ def generate_table_environment( def sympy2latextable( equations: pd.DataFrame, - indices: Optional[List[int]] = None, + indices: list[int] | None = None, precision: int = 3, - columns: List[str] = ["equation", "complexity", "loss", "score"], + columns: list[str] = ["equation", "complexity", "loss", "score"], max_equation_length: int = 50, output_variable_name: str = "y", ) -> str: @@ -128,11 +126,11 @@ def sympy2latextable( def sympy2multilatextable( - equations: List[pd.DataFrame], - indices: Optional[List[List[int]]] = None, + equations: list[pd.DataFrame], + indices: list[list[int]] | None = None, precision: int = 3, - columns: List[str] = ["equation", "complexity", "loss", "score"], - output_variable_names: Optional[List[str]] = None, + columns: list[str] = ["equation", "complexity", "loss", "score"], + output_variable_names: list[str] | None = None, ) -> str: """Generate multiple latex tables for a list of equation sets.""" # TODO: Let user specify custom output variable diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py index 4c1d12e73..163726d6f 100644 --- a/pysr/export_numpy.py +++ b/pysr/export_numpy.py @@ -1,7 +1,6 @@ """Code for exporting discovered expressions to numpy""" import warnings -from typing import List, Union import numpy as np import pandas as pd @@ -17,8 +16,8 @@ class CallableEquation: """Simple wrapper for numpy lambda functions built with sympy""" _sympy: Expr - _sympy_symbols: List[Symbol] - _selection: Union[NDArray[np.bool_], None] + _sympy_symbols: list[Symbol] + _selection: NDArray[np.bool_] | None def __init__(self, eqn, sympy_symbols, selection=None): self._sympy = eqn diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py index f38593413..925c284b6 100644 --- a/pysr/export_sympy.py +++ b/pysr/export_sympy.py @@ -1,9 +1,10 @@ """Define utilities to export to sympy""" -from typing import Callable, Dict, List, Optional +from collections.abc import Callable import sympy # type: ignore from sympy import sympify +from sympy.codegen.cfunctions import log2, log10 # type: ignore from .utils import ArrayLike @@ -39,8 +40,8 @@ "erf": sympy.erf, "erfc": sympy.erfc, "log": lambda x: sympy.log(x), - "log10": lambda x: sympy.log(x, 10), - "log2": lambda x: sympy.log(x, 2), + "log10": lambda x: log10(x), + "log2": lambda x: log2(x), "log1p": lambda x: sympy.log(x + 1), "log_abs": lambda x: sympy.log(abs(x)), "log10_abs": lambda x: sympy.log(abs(x), 10), @@ -63,21 +64,21 @@ def create_sympy_symbols_map( feature_names_in: ArrayLike[str], -) -> Dict[str, sympy.Symbol]: +) -> dict[str, sympy.Symbol]: return {variable: sympy.Symbol(variable) for variable in feature_names_in} def create_sympy_symbols( feature_names_in: ArrayLike[str], -) -> List[sympy.Symbol]: +) -> list[sympy.Symbol]: return [sympy.Symbol(variable) for variable in feature_names_in] def pysr2sympy( - equation: str, + equation: str | float | int, *, - feature_names_in: Optional[ArrayLike[str]] = None, - extra_sympy_mappings: Optional[Dict[str, Callable]] = None, + feature_names_in: ArrayLike[str] | None = None, + extra_sympy_mappings: dict[str, Callable] | None = None, ): if feature_names_in is None: feature_names_in = [] diff --git a/pysr/export_torch.py b/pysr/export_torch.py index be3d6a163..eb3ccd8aa 100644 --- a/pysr/export_torch.py +++ b/pysr/export_torch.py @@ -5,6 +5,7 @@ import numpy as np # noqa: F401 import sympy # type: ignore +from sympy.codegen.cfunctions import log2, log10 # type: ignore def _reduce(fn): @@ -41,6 +42,8 @@ def _initialize_torch(): sympy.ceiling: torch.ceil, sympy.floor: torch.floor, sympy.log: torch.log, + log2: torch.log2, + log10: torch.log10, sympy.exp: torch.exp, sympy.sqrt: torch.sqrt, sympy.cos: torch.cos, diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py new file mode 100644 index 000000000..f9a6eee7c --- /dev/null +++ b/pysr/expression_specs.py @@ -0,0 +1,293 @@ +import copy +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, NewType, TypeAlias + +import numpy as np +import pandas as pd + +from .export import add_export_formats +from .julia_helpers import jl_array +from .julia_import import AnyValue, SymbolicRegression, jl + +# For type checking purposes +if TYPE_CHECKING: + from .sr import PySRRegressor # pragma: no cover + + PySRRegressor: TypeAlias = PySRRegressor # pragma: no cover +else: + PySRRegressor = NewType("PySRRegressor", Any) + + +class AbstractExpressionSpec(ABC): + """Abstract base class describing expression types. + + This basically just holds the options for the expression type, + as well as explains how to parse and evaluate them. + + All expression types must implement: + + 1. julia_expression_type(): The actual expression type, returned as a Julia object. + This will get stored as `expression_type` in `SymbolicRegression.Options`. + 2. julia_expression_options(): Method to create the expression options, returned as a Julia object. + These will get stored as `expression_options` in `SymbolicRegression.Options`. + 3. create_exports(), which will be used to create the exports of the equations, such as + the executable format, the SymPy format, etc. + + It may also optionally implement: + + - supports_sympy, supports_torch, supports_jax, supports_latex: Whether this expression type supports the corresponding export format. + """ + + @abstractmethod + def julia_expression_type(self) -> AnyValue: + """The expression type""" + pass # pragma: no cover + + @abstractmethod + def julia_expression_options(self) -> AnyValue: + """The expression options""" + pass # pragma: no cover + + @abstractmethod + def create_exports( + self, + model: PySRRegressor, + equations: pd.DataFrame, + search_output, + ) -> pd.DataFrame: + """Create additional columns in the equations dataframe.""" + pass # pragma: no cover + + @property + def evaluates_in_julia(self) -> bool: + return False + + @property + def supports_sympy(self) -> bool: + return False + + @property + def supports_torch(self) -> bool: + return False + + @property + def supports_jax(self) -> bool: + return False + + @property + def supports_latex(self) -> bool: + return False + + +class ExpressionSpec(AbstractExpressionSpec): + """The default expression specification, with no special behavior.""" + + def julia_expression_type(self): + return SymbolicRegression.Expression + + def julia_expression_options(self): + return jl.NamedTuple() + + def create_exports( + self, + model: PySRRegressor, + equations: pd.DataFrame, + search_output, + ): + return add_export_formats( + equations, + feature_names_in=model.feature_names_in_, + selection_mask=model.selection_mask_, + extra_sympy_mappings=model.extra_sympy_mappings, + extra_torch_mappings=model.extra_torch_mappings, + output_jax_format=model.output_jax_format, + extra_jax_mappings=model.extra_jax_mappings, + output_torch_format=model.output_torch_format, + ) + + @property + def supports_sympy(self): + return True + + @property + def supports_torch(self): + return True + + @property + def supports_jax(self): + return True + + @property + def supports_latex(self): + return True + + +class TemplateExpressionSpec(AbstractExpressionSpec): + """Spec for templated expressions. + + This class allows you to specify how multiple sub-expressions should be combined + in a structured way, with constraints on which variables each sub-expression can use. + Pass this to PySRRegressor with the `expression_spec` argument when you are using + the `TemplateExpression` expression type. + + Parameters + ---------- + function_symbols : list[str] + List of symbols representing the inner expressions (e.g., ["f", "g"]). + These will be used as keys in the template structure. + combine : str + Julia function string that defines how the sub-expressions are combined. + Takes a NamedTuple of expressions and a tuple of data vectors. + For example: "((; f, g), (x1, x2, x3)) -> f(x1, x2) + g(x3)^2" + would constrain f to use x1,x2 and g to use x3. + num_features : dict[str, int] + Dictionary mapping function symbols to the number of features each can use. + For example: {"f": 2, "g": 1} means f takes 2 inputs and g takes 1. + If not provided, will be inferred from the combine function. + + Examples + -------- + ```python + # Create template that combines f(x1, x2) and g(x3): + expression_spec = TemplateExpressionSpec( + function_symbols=["f", "g"], + combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2", + ) + + # Use in PySRRegressor: + model = PySRRegressor( + expression_spec=expression_spec + ) + ``` + """ + + def __init__( + self, + function_symbols: list[str], + combine: str, + num_features: dict[str, int] | None = None, + ): + self.function_symbols = function_symbols + self.combine = combine + self.num_features = num_features + + def julia_expression_type(self): + return SymbolicRegression.TemplateExpression + + def julia_expression_options(self): + f_combine = jl.seval(self.combine) + creator = jl.seval( + """ + function _pysr_create_template_structure( + @nospecialize(function_symbols::AbstractVector), + @nospecialize(combine::Function), + @nospecialize(num_features::Union{Nothing,AbstractDict}) + ) + tuple_symbol = (map(Symbol, function_symbols)..., ) + num_features = if num_features === nothing + nothing + else + (; num_features...) + end + structure = SymbolicRegression.TemplateStructure{tuple_symbol}(combine, num_features) + return (; structure) + end + """ + ) + return creator(self.function_symbols, f_combine, self.num_features) + + @property + def evaluates_in_julia(self): + return True + + def create_exports( + self, + model: PySRRegressor, + equations: pd.DataFrame, + search_output, + ) -> pd.DataFrame: + # We try to load the raw julia state from a saved binary stream + # if not provided. + search_output = search_output or model.julia_state_ + return _search_output_to_callable_expressions(equations, search_output) + + +class ParametricExpressionSpec(AbstractExpressionSpec): + """Spec for parametric expressions that vary by category. + + This class allows you to specify expressions with parameters that vary across different + categories in your dataset. The expression structure remains the same, but parameters + are optimized separately for each category. + + Parameters + ---------- + max_parameters : int + Maximum number of parameters that can appear in the expression. Each parameter + will take on different values for each category in the data. + + Examples + -------- + For example, if we want to allow for a model with up to 2 parameters (each category + can have a different value for these parameters), we can use: + + ```python + model = PySRRegressor( + expression_spec=ParametricExpressionSpec(max_parameters=2), + binary_operators=["+", "*"], + unary_operators=["sin"] + ) + model.fit(X, y, category=category) + ``` + """ + + def __init__(self, max_parameters: int): + self.max_parameters = max_parameters + + def julia_expression_type(self): + return SymbolicRegression.ParametricExpression + + def julia_expression_options(self): + return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,)) + + @property + def evaluates_in_julia(self): + return True + + def create_exports( + self, + model: PySRRegressor, + equations: pd.DataFrame, + search_output, + ): + search_output = search_output or model.julia_state_ + return _search_output_to_callable_expressions(equations, search_output) + + +class CallableJuliaExpression: + def __init__(self, expression): + self.expression = expression + + def __call__(self, X: np.ndarray, *args): + raw_output = self.expression(jl_array(X.T), *args) + return np.array(raw_output).T + + +def _search_output_to_callable_expressions( + equations: pd.DataFrame, search_output +) -> pd.DataFrame: + equations = copy.deepcopy(equations) + (_, out_hof) = search_output + expressions = [] + callables = [] + + for _, row in equations.iterrows(): + curComplexity = row["complexity"] + expression = out_hof.members[curComplexity - 1].tree + expressions.append(expression) + callables.append(CallableJuliaExpression(expression)) + + df = pd.DataFrame( + {"julia_expression": expressions, "lambda_format": callables}, + index=equations.index, + ) + return df diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py index 7702e255a..8c8358fdd 100644 --- a/pysr/feature_selection.py +++ b/pysr/feature_selection.py @@ -1,6 +1,6 @@ """Functions for doing feature selection during preprocessing.""" -from typing import Optional, cast +from typing import cast import numpy as np from numpy import ndarray @@ -13,7 +13,7 @@ def run_feature_selection( X: ndarray, y: ndarray, select_k_features: int, - random_state: Optional[np.random.RandomState] = None, + random_state: np.random.RandomState | None = None, ) -> NDArray[np.bool_]: """ Find most important features. @@ -38,7 +38,7 @@ def run_feature_selection( # Function has not been removed only due to usage in module tests def _handle_feature_selection( X: ndarray, - select_k_features: Optional[int], + select_k_features: int | None, y: ndarray, variable_names: ArrayLike[str], ): diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py index ac4714d48..950c292ea 100644 --- a/pysr/julia_extensions.py +++ b/pysr/julia_extensions.py @@ -1,31 +1,40 @@ """This file installs and loads extensions for SymbolicRegression.""" -from typing import Optional +from typing import Literal from .julia_import import Pkg, jl +from .julia_registry_helpers import try_with_registry_fallback +from .logger_specs import AbstractLoggerSpec, TensorBoardLoggerSpec def load_required_packages( *, turbo: bool = False, bumper: bool = False, - enable_autodiff: bool = False, - cluster_manager: Optional[str] = None, + autodiff_backend: Literal["Zygote"] | None = None, + cluster_manager: str | None = None, + logger_spec: AbstractLoggerSpec | None = None, ): if turbo: load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890") if bumper: load_package("Bumper", "8ce10254-0962-460f-a3d8-1f77fea1446e") - if enable_autodiff: + if autodiff_backend is not None: load_package("Zygote", "e88e6eb3-aa80-5325-afca-941959d7151f") if cluster_manager is not None: load_package("ClusterManagers", "34f1f09b-3a8b-5176-ab39-66d58a4d544e") + if isinstance(logger_spec, TensorBoardLoggerSpec): + load_package("TensorBoardLogger", "899adc3e-224a-11e9-021f-63837185c80f") def load_all_packages(): """Install and load all Julia extensions available to PySR.""" load_required_packages( - turbo=True, bumper=True, enable_autodiff=True, cluster_manager="slurm" + turbo=True, + bumper=True, + autodiff_backend="Zygote", + cluster_manager="slurm", + logger_spec=TensorBoardLoggerSpec(log_dir="logs"), ) @@ -39,8 +48,12 @@ def isinstalled(uuid_s: str): def load_package(package_name: str, uuid_s: str) -> None: if not isinstalled(uuid_s): - Pkg.add(name=package_name, uuid=uuid_s) - Pkg.resolve() + + def _add_package(): + Pkg.add(name=package_name, uuid=uuid_s) + Pkg.resolve() + + try_with_registry_fallback(_add_package) # TODO: Protect against loading the same symbol from two packages, # maybe with a @gensym here. diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index 18d4a6cf3..ef82be902 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,13 +1,13 @@ """Functions for initializing the Julia environment and installing deps.""" -from typing import Any, Callable, Union, cast +from typing import Any, Callable, cast, overload import numpy as np from juliacall import convert as jl_convert # type: ignore from numpy.typing import NDArray from .deprecated import init_julia, install -from .julia_import import jl +from .julia_import import AnyValue, jl jl_convert = cast(Callable[[Any, Any], Any], jl_convert) @@ -22,6 +22,8 @@ def _escape_filename(filename): """Turn a path into a string with correctly escaped backslashes.""" + if filename is None: + return None str_repr = str(filename) str_repr = str_repr.replace("\\", "\\\\") return str_repr @@ -41,6 +43,10 @@ def jl_array(x, dtype=None): return jl_convert(jl.Array[dtype], x) +def jl_dict(x): + return jl_convert(jl.Dict, x) + + def jl_is_function(f) -> bool: return cast(bool, jl.seval("op -> op isa Function")(f)) @@ -51,7 +57,11 @@ def jl_serialize(obj: Any) -> NDArray[np.uint8]: return np.array(jl.take_b(buf)) -def jl_deserialize(s: Union[NDArray[np.uint8], None]): +@overload +def jl_deserialize(s: NDArray[np.uint8]) -> AnyValue: ... +@overload +def jl_deserialize(s: None) -> None: ... +def jl_deserialize(s): if s is None: return s buf = jl.IOBuffer() diff --git a/pysr/julia_import.py b/pysr/julia_import.py index 0e032bee1..26288c754 100644 --- a/pysr/julia_import.py +++ b/pysr/julia_import.py @@ -4,6 +4,8 @@ from types import ModuleType from typing import cast +from .julia_registry_helpers import try_with_registry_fallback + # Check if JuliaCall is already loaded, and if so, warn the user # about the relevant environment variables. If not loaded, # set up sensible defaults. @@ -42,6 +44,16 @@ # Deprecated; so just pass to juliacall os.environ["PYTHON_JULIACALL_AUTOLOAD_IPYTHON_EXTENSION"] = autoload_extensions + +def _import_juliacall(): + import juliacall # type: ignore + + +try_with_registry_fallback(_import_juliacall) + + +from juliacall import AnyValue # type: ignore +from juliacall import VectorValue # type: ignore from juliacall import Main as jl # type: ignore jl = cast(ModuleType, jl) @@ -52,5 +64,8 @@ jl.seval("using SymbolicRegression") SymbolicRegression = jl.SymbolicRegression +# Expose `D` operator: +jl.seval("using SymbolicRegression: D") + jl.seval("using Pkg: Pkg") Pkg = jl.Pkg diff --git a/pysr/julia_registry_helpers.py b/pysr/julia_registry_helpers.py new file mode 100644 index 000000000..2c2162e75 --- /dev/null +++ b/pysr/julia_registry_helpers.py @@ -0,0 +1,44 @@ +"""Utilities for managing Julia registry preferences during package operations.""" + +import os +import warnings +from collections.abc import Callable +from typing import TypeVar + +T = TypeVar("T") + +PREFERENCE_KEY = "JULIA_PKG_SERVER_REGISTRY_PREFERENCE" + + +def try_with_registry_fallback(f: Callable[..., T], *args, **kwargs) -> T: + """Execute function with modified Julia registry preference. + + First tries with existing registry preference. If that fails with a Julia registry error, + temporarily modifies the registry preference to 'eager'. Restores original preference after + execution. + """ + try: + return f(*args, **kwargs) + except Exception as initial_error: + # Check if this is a Julia registry error by looking at the error message + if "JuliaError" not in str( + type(initial_error) + ) or "Unsatisfiable requirements detected" not in str(initial_error): + raise initial_error + + old_value = os.environ.get(PREFERENCE_KEY, None) + if old_value == "eager": + raise initial_error + + warnings.warn( + "Initial Julia registry operation failed. Attempting to use the `eager` registry flavor of the Julia " + + f"General registry from the Julia Pkg server (via the `{PREFERENCE_KEY}` environment variable)." + ) + os.environ[PREFERENCE_KEY] = "eager" + try: + return f(*args, **kwargs) + finally: + if old_value is not None: + os.environ[PREFERENCE_KEY] = old_value + else: + del os.environ[PREFERENCE_KEY] diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json index 6b6e8aceb..f6d709c21 100644 --- a/pysr/juliapkg.json +++ b/pysr/juliapkg.json @@ -1,9 +1,9 @@ { - "julia": "~1.6.7, ~1.7, ~1.8, ~1.9, =1.10.0, ~1.10.3", + "julia": "=1.10.0, 1.10.3", "packages": { "SymbolicRegression": { "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb", - "version": "=0.24.5" + "version": "=1.5.1" }, "Serialization": { "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b", diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py new file mode 100644 index 000000000..f4c0a3dff --- /dev/null +++ b/pysr/logger_specs.py @@ -0,0 +1,85 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + +from .julia_helpers import jl_array, jl_dict +from .julia_import import AnyValue, jl + + +class AbstractLoggerSpec(ABC): + """Abstract base class for logger specifications.""" + + @abstractmethod + def create_logger(self) -> AnyValue: + """Create a logger instance.""" + pass # pragma: no cover + + @abstractmethod + def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None: + """Write hyperparameters to the logger.""" + pass # pragma: no cover + + @abstractmethod + def close(self, logger: AnyValue) -> None: + """Close the logger instance.""" + pass # pragma: no cover + + +@dataclass +class TensorBoardLoggerSpec(AbstractLoggerSpec): + """Specification for TensorBoard logger. + + Parameters + ---------- + log_dir : str + Directory where TensorBoard logs will be saved. If `overwrite` is `False`, + new logs will be saved to `{log_dir}_1`, and so on. Default is `"logs/run"`. + log_interval : int, optional + Interval (in steps) at which logs are written. Default is 10. + overwrite : bool, optional + Whether to overwrite existing logs in the directory. Default is False. + """ + + log_dir: str = "logs/run" + log_interval: int = 1 + overwrite: bool = False + + def create_logger(self) -> AnyValue: + # We assume that TensorBoardLogger is already imported via `julia_extensions.py` + make_logger = jl.seval( + """ + function make_logger(log_dir::AbstractString, overwrite::Bool, log_interval::Int) + base_logger = TensorBoardLogger.TBLogger( + log_dir, + (overwrite ? (TensorBoardLogger.tb_overwrite,) : ())... + ) + return SRLogger(; logger=base_logger, log_interval) + end + """ + ) + log_dir = str(self.log_dir) + return make_logger(log_dir, self.overwrite, self.log_interval) + + def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None: + base_logger = jl.SymbolicRegression.get_logger(logger) + writer = jl.seval("TensorBoardLogger.write_hparams!") + jl_clean_hparams = jl_dict( + { + k: (v if isinstance(v, (bool, int, float)) else str(v)) + for k, v in hparams.items() + } + ) + writer( + base_logger, + jl_clean_hparams, + jl_array( + [ + "search/data/summaries/pareto_volume", + "search/data/summaries/min_loss", + ], + ), + ) + + def close(self, logger: AnyValue) -> None: + base_logger = jl.SymbolicRegression.get_logger(logger) + jl.close(base_logger) diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index 0ff9d63da..769e66477 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -2,6 +2,7 @@ - Creating the Search Space: - binary_operators - unary_operators + - expression_spec - maxsize - maxdepth - Setting the Search Size: @@ -22,6 +23,7 @@ - complexity_of_operators - complexity_of_constants - complexity_of_variables + - complexity_mapping - warmup_maxsize_by - use_frequency - use_frequency_in_tournament @@ -35,6 +37,7 @@ - weight_mutate_constant - weight_mutate_operator - weight_swap_operands + - weight_rotate_tree - weight_randomize - weight_simplify - weight_optimize @@ -42,6 +45,7 @@ - annealing - alpha - perturbation_factor + - probability_negate_constant - skip_mutation_failures - Tournament Selection: - tournament_selection_n @@ -49,6 +53,7 @@ - Constant Optimization: - optimizer_algorithm - optimizer_nrestarts + - optimizer_f_calls_limit - optimize_probability - optimizer_iterations - should_optimize_constants @@ -66,8 +71,8 @@ - timeout_in_seconds - early_stop_condition - Performance and Parallelization: + - parallelism - procs - - multithreading - cluster_manager - heap_size_hint_in_bytes - batching @@ -76,7 +81,7 @@ - fast_cycle - turbo - bumper - - enable_autodiff + - autodiff_backend - Determinism: - random_state - deterministic @@ -86,13 +91,16 @@ - update_verbosity - print_precision - progress + - logger_spec + - input_stream - Environment: - temp_equation_file - tempdir - delete_tempfiles - update - Exporting the Results: - - equation_file + - output_directory + - run_id - output_jax_format - output_torch_format - extra_sympy_mappings diff --git a/pysr/sr.py b/pysr/sr.py index 30448661a..566428128 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -4,16 +4,15 @@ import os import pickle as pkl import re -import shutil import sys import tempfile import warnings +from collections.abc import Callable from dataclasses import dataclass, fields -from datetime import datetime from io import StringIO from multiprocessing import cpu_count from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast +from typing import Any, Literal, cast import numpy as np import pandas as pd @@ -26,21 +25,22 @@ from .denoising import denoise, multi_denoise from .deprecated import DEPRECATED_KWARGS -from .export_jax import sympy2jax from .export_latex import ( sympy2latex, sympy2latextable, sympy2multilatextable, with_preamble, ) -from .export_numpy import sympy2numpy from .export_paddle import sympy2paddle -from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy -from .export_torch import sympy2torch +from .export_sympy import assert_valid_sympy_symbol +from .expression_specs import ( + AbstractExpressionSpec, + ExpressionSpec, + ParametricExpressionSpec, +) from .feature_selection import run_feature_selection from .julia_extensions import load_required_packages from .julia_helpers import ( - PythonCall, _escape_filename, _load_cluster_manager, jl_array, @@ -48,21 +48,32 @@ jl_is_function, jl_serialize, ) -from .julia_import import SymbolicRegression, jl +from .julia_import import AnyValue, SymbolicRegression, VectorValue, jl +from .logger_specs import AbstractLoggerSpec from .utils import ( ArrayLike, PathLike, - _csv_filename_to_pkl_filename, _preprocess_julia_floats, _safe_check_feature_names_in, _subscriptify, _suggest_keywords, ) +try: + from sklearn.utils.validation import validate_data + + OLD_SKLEARN = False +except ImportError: + OLD_SKLEARN = True + ALREADY_RAN = False -def _process_constraints(binary_operators, unary_operators, constraints): +def _process_constraints( + binary_operators: list[str], + unary_operators: list, + constraints: dict[str, int | tuple[int, int]], +) -> dict[str, int | tuple[int, int]]: constraints = constraints.copy() for op in unary_operators: if op not in constraints: @@ -77,30 +88,32 @@ def _process_constraints(binary_operators, unary_operators, constraints): "One typical constraint is to use `constraints={..., '^': (-1, 1)}`, which " "will allow arbitrary-complexity base (-1) but only powers such as " "a constant or variable (1). " - "For more tips, please see https://astroautomata.com/PySR/tuning/" + "For more tips, please see https://ai.damtp.cam.ac.uk/pysr/tuning/" ) constraints[op] = (-1, -1) + + constraint_tuple = cast(tuple[int, int], constraints[op]) if op in ["plus", "sub", "+", "-"]: - if constraints[op][0] != constraints[op][1]: + if constraint_tuple[0] != constraint_tuple[1]: raise NotImplementedError( "You need equal constraints on both sides for - and +, " "due to simplification strategies." ) elif op in ["mult", "*"]: # Make sure the complex expression is in the left side. - if constraints[op][0] == -1: + if constraint_tuple[0] == -1: continue - if constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]: - constraints[op][0], constraints[op][1] = ( - constraints[op][1], - constraints[op][0], - ) + if constraint_tuple[1] == -1 or constraint_tuple[0] < constraint_tuple[1]: + constraints[op] = (constraint_tuple[1], constraint_tuple[0]) return constraints def _maybe_create_inline_operators( - binary_operators, unary_operators, extra_sympy_mappings -): + binary_operators: list[str], + unary_operators: list[str], + extra_sympy_mappings: dict[str, Callable] | None, + expression_spec: AbstractExpressionSpec, +) -> tuple[list[str], list[str]]: binary_operators = binary_operators.copy() unary_operators = unary_operators.copy() for op_list in [binary_operators, unary_operators]: @@ -121,9 +134,11 @@ def _maybe_create_inline_operators( "Only alphanumeric characters, numbers, " "and underscores are allowed." ) - if (extra_sympy_mappings is None) or ( - function_name not in extra_sympy_mappings - ): + missing_sympy_mapping = ( + extra_sympy_mappings is None + or function_name not in extra_sympy_mappings + ) + if missing_sympy_mapping and expression_spec.supports_sympy: raise ValueError( f"Custom function {function_name} is not defined in `extra_sympy_mappings`. " "You can define it with, " @@ -191,6 +206,25 @@ def _check_assertions( ) +def _validate_export_mappings(extra_jax_mappings, extra_torch_mappings): + # It is expected extra_jax/torch_mappings will be updated after fit. + # Thus, validation is performed here instead of in _validate_init_params + if extra_jax_mappings is not None: + for value in extra_jax_mappings.values(): + if not isinstance(value, str): + raise ValueError( + "extra_jax_mappings must have keys that are strings! " + "e.g., {sympy.sqrt: 'jnp.sqrt'}." + ) + if extra_torch_mappings is not None: + for value in extra_torch_mappings.values(): + if not callable(value): + raise ValueError( + "extra_torch_mappings must be callable functions! " + "e.g., {sympy.sqrt: torch.sqrt}." + ) + + # Class validation constants VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"] @@ -199,11 +233,10 @@ def _check_assertions( class _DynamicallySetParams: """Defines some parameters that are set at runtime.""" - binary_operators: List[str] - unary_operators: List[str] + binary_operators: list[str] + unary_operators: list[str] maxdepth: int - constraints: Dict[str, str] - multithreading: bool + constraints: dict[str, int | tuple[int, int]] batch_size: int update_verbosity: int progress: bool @@ -222,7 +255,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Most default parameters have been tuned over several example equations, but you should adjust `niterations`, `binary_operators`, `unary_operators` to your requirements. You can view more detailed explanations of the options - on the [options page](https://astroautomata.com/PySR/options) of the + on the [options page](https://ai.damtp.cam.ac.uk/pysr/options) of the documentation. Parameters @@ -242,29 +275,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): most accurate model. binary_operators : list[str] List of strings for binary operators used in the search. - See the [operators page](https://astroautomata.com/PySR/operators/) + See the [operators page](https://ai.damtp.cam.ac.uk/pysr/operators/) for more details. Default is `["+", "-", "*", "/"]`. unary_operators : list[str] Operators which only take a single scalar as input. For example, `"cos"` or `"exp"`. Default is `None`. + expression_spec : AbstractExpressionSpec + The type of expression to search for. By default, + this is just `ExpressionSpec()`. You can also use + `TemplateExpressionSpec(...)` which allows you to specify + a custom template for the expressions. + Default is `ExpressionSpec()`. niterations : int Number of iterations of the algorithm to run. The best equations are printed and migrate between populations at the end of each iteration. - Default is `40`. + Default is `100`. populations : int Number of populations running. - Default is `15`. + Default is `31`. population_size : int Number of individuals in each population. - Default is `33`. + Default is `27`. max_evals : int Limits the total number of evaluations of expressions to this number. Default is `None`. maxsize : int - Max complexity of an equation. Default is `20`. + Max complexity of an equation. Default is `30`. maxdepth : int Max depth of an equation. You can use both `maxsize` and `maxdepth`. `maxdepth` is by default not used. @@ -341,7 +380,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): `idx` argument to the function, which is `nothing` for non-batched, and a 1D array of indices for batched. Default is `None`. - complexity_of_operators : dict[str, Union[int, float]] + complexity_of_operators : dict[str, int | float] If you would like to use a complexity other than 1 for an operator, specify the complexity here. For example, `{"sin": 2, "+": 1}` would give a complexity of 2 for each use @@ -352,14 +391,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Default is `None`. complexity_of_constants : int | float Complexity of constants. Default is `1`. - complexity_of_variables : int | float + complexity_of_variables : int | float | list[int | float] Global complexity of variables. To set different complexities for different variables, pass a list of complexities to the `fit` method with keyword `complexity_of_variables`. You cannot use both. Default is `1`. + complexity_mapping : str + Alternatively, you can pass a function (a string of Julia code) that + takes the expression as input and returns the complexity. Make sure that + this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`), + and returns an integer. + Default is `None`. parsimony : float Multiplicative factor for how much to punish complexity. - Default is `0.0032`. + Default is `0.0`. dimensional_constraint_penalty : float Additive penalty for if dimensional analysis of an expression fails. By default, this is `1000.0`. @@ -381,11 +426,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): weight the contribution. If you find that the search is only optimizing the most complex expressions while the simpler expressions remain stagnant, you should increase this value. - Default is `20.0`. + Default is `1040.0`. alpha : float Initial temperature for simulated annealing (requires `annealing` to be `True`). - Default is `0.1`. + Default is `3.17`. annealing : bool Whether to use annealing. Default is `False`. early_stop_condition : float | str @@ -397,43 +442,46 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): ncycles_per_iteration : int Number of total mutations to run, per 10 samples of the population, per iteration. - Default is `550`. + Default is `380`. fraction_replaced : float How much of population to replace with migrating equations from other populations. - Default is `0.000364`. + Default is `0.00036`. fraction_replaced_hof : float How much of population to replace with migrating equations from - hall of fame. Default is `0.035`. + hall of fame. Default is `0.0614`. weight_add_node : float Relative likelihood for mutation to add a node. - Default is `0.79`. + Default is `2.47`. weight_insert_node : float Relative likelihood for mutation to insert a node. - Default is `5.1`. + Default is `0.0112`. weight_delete_node : float Relative likelihood for mutation to delete a node. - Default is `1.7`. + Default is `0.870`. weight_do_nothing : float Relative likelihood for mutation to leave the individual. - Default is `0.21`. + Default is `0.273`. weight_mutate_constant : float Relative likelihood for mutation to change the constant slightly in a random direction. - Default is `0.048`. + Default is `0.0346`. weight_mutate_operator : float Relative likelihood for mutation to swap an operator. - Default is `0.47`. + Default is `0.293`. weight_swap_operands : float Relative likehood for swapping operands in binary operators. - Default is `0.1`. + Default is `0.198`. + weight_rotate_tree : float + How often to perform a tree rotation at a random node. + Default is `4.26`. weight_randomize : float Relative likelihood for mutation to completely delete and then randomly generate the equation - Default is `0.00023`. + Default is `0.000502`. weight_simplify : float Relative likelihood for mutation to simplify constant parts by evaluation - Default is `0.0020`. + Default is `0.00209`. weight_optimize: float Constant optimization can also be performed as a mutation, in addition to the normal strategy controlled by `optimize_probability` which happens @@ -442,7 +490,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Default is `0.0`. crossover_probability : float Absolute probability of crossover-type genetic operation, instead of a mutation. - Default is `0.066`. + Default is `0.0259`. skip_mutation_failures : bool Whether to skip mutation and crossover failures, rather than simply re-sampling the current member. @@ -468,6 +516,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Number of time to restart the constants optimization process with different initial conditions. Default is `2`. + optimizer_f_calls_limit : int + How many function calls to allow during optimization. + Default is `10_000`. optimize_probability : float Probability of optimizing the constants during a single iteration of the evolutionary algorithm. @@ -479,21 +530,24 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Constants are perturbed by a max factor of (perturbation_factor*T + 1). Either multiplied by this or divided by this. - Default is `0.076`. + Default is `0.129`. + probability_negate_constant : float + Probability of negating a constant in the equation when mutating it. + Default is `0.00743`. tournament_selection_n : int Number of expressions to consider in each tournament. - Default is `10`. + Default is `15`. tournament_selection_p : float Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. - Default is `0.86`. - procs : int - Number of processes (=number of populations running). - Default is `cpu_count()`. - multithreading : bool - Use multithreading instead of distributed backend. - Using procs=0 will turn off both. Default is `True`. + Default is `0.982`. + parallelism: Literal["serial", "multithreading", "multiprocessing"] | None + Parallelism to use for the search. Can be `"serial"`, `"multithreading"`, or `"multiprocessing"`. + Default is `"multithreading"`. + procs: int | None + Number of processes to use for parallelism. If `None`, defaults to `cpu_count()`. + Default is `None`. cluster_manager : str For distributed computing, this sets the job queue system. Set to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or @@ -533,11 +587,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): If you pass complex data, the corresponding complex precision will be used (i.e., `64` for complex128, `32` for complex64). Default is `32`. - enable_autodiff : bool - Whether to create derivative versions of operators for automatic - differentiation. This is only necessary if you wish to compute - the gradients of an expression within a custom loss function. - Default is `False`. + autodiff_backend : Literal["Zygote"] | None + Which backend to use for automatic differentiation during constant + optimization. Currently only `"Zygote"` is supported. The default, + `None`, uses forward-mode or finite difference. + Default is `None`. random_state : int, Numpy RandomState instance or None Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. @@ -545,7 +599,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): deterministic : bool Make a PySR search give the same result every run. To use this, you must turn off parallelism - (with `procs`=0, `multithreading`=False), + (with `parallelism="serial"`), and set `random_state` to a fixed seed. Default is `False`. warm_start : bool @@ -564,8 +618,24 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): progress : bool Whether to use a progress bar instead of printing to stdout. Default is `True`. - equation_file : str - Where to save the files (.csv extension). + logger_spec: AbstractLoggerSpec | None + Logger specification for the Julia backend. See, for example, + `TensorBoardLoggerSpec`. + Default is `None`. + input_stream : str + The stream to read user input from. By default, this is `"stdin"`. + If you encounter issues with reading from `stdin`, like a hang, + you can simply pass `"devnull"` to this argument. You can also + reference an arbitrary Julia object in the `Main` namespace. + Default is `"stdin"`. + run_id : str + A unique identifier for the run. Will be generated using the + current date and time if not provided. + Default is `None`. + output_directory : str + The base directory to save output files to. Files + will be saved in a subdirectory according to the run ID. + Will be set to `outputs/` if not provided. Default is `None`. temp_equation_file : bool Whether to put the hall of fame file in the temp directory. @@ -653,15 +723,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Number of output dimensions. selection_mask_ : ndarray of shape (`n_features_in_`,) Mask of which features of `X` to use when `select_k_features` is set. - tempdir_ : Path + tempdir_ : Path | None Path to the temporary equations directory. - equation_file_ : Union[str, Path] - Output equation file name produced by the julia backend. julia_state_stream_ : ndarray The serialized state for the julia SymbolicRegression.jl backend (after fitting), stored as an array of uint8, produced by Julia's Serialization.serialize function. julia_options_stream_ : ndarray The serialized julia options, stored as an array of uint8, + logger_ : AnyValue | None + The logger instance used for this fit, if any. + expression_spec_ : AbstractExpressionSpec + The expression specification used for this fit. This is equal to + `self.expression_spec` if provided, or `ExpressionSpec()` otherwise. equation_file_contents_ : list[pandas.DataFrame] Contents of the equation file output by the Julia backend. show_pickle_warnings_ : bool @@ -708,113 +781,122 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): ``` """ - equations_: Union[pd.DataFrame, List[pd.DataFrame], None] + equations_: pd.DataFrame | list[pd.DataFrame] | None n_features_in_: int feature_names_in_: ArrayLike[str] display_feature_names_in_: ArrayLike[str] - complexity_of_variables_: Union[int, float, List[Union[int, float]], None] - X_units_: Union[ArrayLike[str], None] - y_units_: Union[str, ArrayLike[str], None] + complexity_of_variables_: int | float | list[int | float] | None + X_units_: ArrayLike[str] | None + y_units_: str | ArrayLike[str] | None nout_: int - selection_mask_: Union[NDArray[np.bool_], None] - tempdir_: Path - equation_file_: PathLike - julia_state_stream_: Union[NDArray[np.uint8], None] - julia_options_stream_: Union[NDArray[np.uint8], None] - equation_file_contents_: Union[List[pd.DataFrame], None] + selection_mask_: NDArray[np.bool_] | None + run_id_: str + output_directory_: str + julia_state_stream_: NDArray[np.uint8] | None + julia_options_stream_: NDArray[np.uint8] | None + logger_: AnyValue | None + equation_file_contents_: list[pd.DataFrame] | None show_pickle_warnings_: bool def __init__( self, model_selection: Literal["best", "accuracy", "score"] = "best", *, - binary_operators: Optional[List[str]] = None, - unary_operators: Optional[List[str]] = None, - niterations: int = 40, - populations: int = 15, - population_size: int = 33, - max_evals: Optional[int] = None, - maxsize: int = 20, - maxdepth: Optional[int] = None, - warmup_maxsize_by: Optional[float] = None, - timeout_in_seconds: Optional[float] = None, - constraints: Optional[Dict[str, Union[int, Tuple[int, int]]]] = None, - nested_constraints: Optional[Dict[str, Dict[str, int]]] = None, - elementwise_loss: Optional[str] = None, - loss_function: Optional[str] = None, - complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None, - complexity_of_constants: Union[int, float] = 1, - complexity_of_variables: Optional[Union[int, float]] = None, - parsimony: float = 0.0032, - dimensional_constraint_penalty: Optional[float] = None, + binary_operators: list[str] | None = None, + unary_operators: list[str] | None = None, + expression_spec: AbstractExpressionSpec | None = None, + niterations: int = 100, + populations: int = 31, + population_size: int = 27, + max_evals: int | None = None, + maxsize: int = 30, + maxdepth: int | None = None, + warmup_maxsize_by: float | None = None, + timeout_in_seconds: float | None = None, + constraints: dict[str, int | tuple[int, int]] | None = None, + nested_constraints: dict[str, dict[str, int]] | None = None, + elementwise_loss: str | None = None, + loss_function: str | None = None, + complexity_of_operators: dict[str, int | float] | None = None, + complexity_of_constants: int | float | None = None, + complexity_of_variables: int | float | list[int | float] | None = None, + complexity_mapping: str | None = None, + parsimony: float = 0.0, + dimensional_constraint_penalty: float | None = None, dimensionless_constants_only: bool = False, use_frequency: bool = True, use_frequency_in_tournament: bool = True, - adaptive_parsimony_scaling: float = 20.0, - alpha: float = 0.1, + adaptive_parsimony_scaling: float = 1040.0, + alpha: float = 3.17, annealing: bool = False, - early_stop_condition: Optional[Union[float, str]] = None, - ncycles_per_iteration: int = 550, - fraction_replaced: float = 0.000364, - fraction_replaced_hof: float = 0.035, - weight_add_node: float = 0.79, - weight_insert_node: float = 5.1, - weight_delete_node: float = 1.7, - weight_do_nothing: float = 0.21, - weight_mutate_constant: float = 0.048, - weight_mutate_operator: float = 0.47, - weight_swap_operands: float = 0.1, - weight_randomize: float = 0.00023, - weight_simplify: float = 0.0020, + early_stop_condition: float | str | None = None, + ncycles_per_iteration: int = 380, + fraction_replaced: float = 0.00036, + fraction_replaced_hof: float = 0.0614, + weight_add_node: float = 2.47, + weight_insert_node: float = 0.0112, + weight_delete_node: float = 0.870, + weight_do_nothing: float = 0.273, + weight_mutate_constant: float = 0.0346, + weight_mutate_operator: float = 0.293, + weight_swap_operands: float = 0.198, + weight_rotate_tree: float = 4.26, + weight_randomize: float = 0.000502, + weight_simplify: float = 0.00209, weight_optimize: float = 0.0, - crossover_probability: float = 0.066, + crossover_probability: float = 0.0259, skip_mutation_failures: bool = True, migration: bool = True, hof_migration: bool = True, topn: int = 12, - should_simplify: Optional[bool] = None, + should_simplify: bool = True, should_optimize_constants: bool = True, optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS", optimizer_nrestarts: int = 2, + optimizer_f_calls_limit: int | None = None, optimize_probability: float = 0.14, optimizer_iterations: int = 8, - perturbation_factor: float = 0.076, - tournament_selection_n: int = 10, - tournament_selection_p: float = 0.86, - procs: int = cpu_count(), - multithreading: Optional[bool] = None, - cluster_manager: Optional[ - Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] - ] = None, - heap_size_hint_in_bytes: Optional[int] = None, + perturbation_factor: float = 0.129, + probability_negate_constant: float = 0.00743, + tournament_selection_n: int = 15, + tournament_selection_p: float = 0.982, + parallelism: ( + Literal["serial", "multithreading", "multiprocessing"] | None + ) = None, + procs: int | None = None, + cluster_manager: ( + Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] | None + ) = None, + heap_size_hint_in_bytes: int | None = None, batching: bool = False, batch_size: int = 50, fast_cycle: bool = False, turbo: bool = False, bumper: bool = False, - precision: int = 32, - enable_autodiff: bool = False, - random_state=None, + precision: Literal[16, 32, 64] = 32, + autodiff_backend: Literal["Zygote"] | None = None, + random_state: int | np.random.RandomState | None = None, deterministic: bool = False, warm_start: bool = False, verbosity: int = 1, - update_verbosity: Optional[int] = None, + update_verbosity: int | None = None, print_precision: int = 5, progress: bool = True, - equation_file: Optional[str] = None, + logger_spec: AbstractLoggerSpec | None = None, + input_stream: str = "stdin", + run_id: str | None = None, + output_directory: str | None = None, temp_equation_file: bool = False, - tempdir: Optional[str] = None, + tempdir: str | None = None, delete_tempfiles: bool = True, update: bool = False, output_jax_format: bool = False, output_torch_format: bool = False, - output_paddle_format: bool = False, extra_sympy_mappings: Optional[Dict[str, Callable]] = None, extra_torch_mappings: Optional[Dict[Callable, Callable]] = None, extra_jax_mappings: Optional[Dict[Callable, str]] = None, - extra_paddle_mappings: Optional[Dict[Callable, Callable]] = None, denoise: bool = False, - select_k_features: Optional[int] = None, + select_k_features: int | None = None, **kwargs, ): # Hyperparameters @@ -822,6 +904,7 @@ def __init__( self.model_selection = model_selection self.binary_operators = binary_operators self.unary_operators = unary_operators + self.expression_spec = expression_spec self.niterations = niterations self.populations = populations self.population_size = population_size @@ -843,6 +926,7 @@ def __init__( self.complexity_of_operators = complexity_of_operators self.complexity_of_constants = complexity_of_constants self.complexity_of_variables = complexity_of_variables + self.complexity_mapping = complexity_mapping self.parsimony = parsimony self.dimensional_constraint_penalty = dimensional_constraint_penalty self.dimensionless_constants_only = dimensionless_constants_only @@ -860,6 +944,7 @@ def __init__( self.weight_mutate_constant = weight_mutate_constant self.weight_mutate_operator = weight_mutate_operator self.weight_swap_operands = weight_swap_operands + self.weight_rotate_tree = weight_rotate_tree self.weight_randomize = weight_randomize self.weight_simplify = weight_simplify self.weight_optimize = weight_optimize @@ -875,15 +960,17 @@ def __init__( self.should_optimize_constants = should_optimize_constants self.optimizer_algorithm = optimizer_algorithm self.optimizer_nrestarts = optimizer_nrestarts + self.optimizer_f_calls_limit = optimizer_f_calls_limit self.optimize_probability = optimize_probability self.optimizer_iterations = optimizer_iterations self.perturbation_factor = perturbation_factor + self.probability_negate_constant = probability_negate_constant # -- Selection parameters self.tournament_selection_n = tournament_selection_n self.tournament_selection_p = tournament_selection_p # -- Performance parameters + self.parallelism = parallelism self.procs = procs - self.multithreading = multithreading self.cluster_manager = cluster_manager self.heap_size_hint_in_bytes = heap_size_hint_in_bytes self.batching = batching @@ -892,7 +979,7 @@ def __init__( self.turbo = turbo self.bumper = bumper self.precision = precision - self.enable_autodiff = enable_autodiff + self.autodiff_backend = autodiff_backend self.random_state = random_state self.deterministic = deterministic self.warm_start = warm_start @@ -902,8 +989,11 @@ def __init__( self.update_verbosity = update_verbosity self.print_precision = print_precision self.progress = progress + self.logger_spec = logger_spec + self.input_stream = input_stream # - Project management - self.equation_file = equation_file + self.run_id = run_id + self.output_directory = output_directory self.temp_equation_file = temp_equation_file self.tempdir = tempdir self.delete_tempfiles = delete_tempfiles @@ -932,6 +1022,9 @@ def __init__( "Please use that instead.", FutureWarning, ) + elif k == "multithreading": + # Specific advice given in `_map_parallelism_params` + self.multithreading: bool | None = v # Handle kwargs that have been moved to the fit method elif k in ["weights", "variable_names", "Xresampled"]: warnings.warn( @@ -942,7 +1035,7 @@ def __init__( elif k == "julia_project": warnings.warn( "The `julia_project` parameter has been deprecated. To use a custom " - "julia project, please see `https://astroautomata.com/PySR/backend`.", + "julia project, please see `https://ai.damtp.cam.ac.uk/pysr/backend`.", FutureWarning, ) elif k == "julia_kwargs": @@ -964,24 +1057,26 @@ def __init__( @classmethod def from_file( cls, - equation_file: PathLike, + equation_file: None = None, # Deprecated *, - binary_operators: Optional[List[str]] = None, - unary_operators: Optional[List[str]] = None, - n_features_in: Optional[int] = None, - feature_names_in: Optional[ArrayLike[str]] = None, - selection_mask: Optional[NDArray[np.bool_]] = None, + run_directory: PathLike, + binary_operators: list[str] | None = None, + unary_operators: list[str] | None = None, + n_features_in: int | None = None, + feature_names_in: ArrayLike[str] | None = None, + selection_mask: NDArray[np.bool_] | None = None, nout: int = 1, **pysr_kwargs, - ): + ) -> "PySRRegressor": """ Create a model from a saved model checkpoint or equation file. Parameters ---------- - equation_file : str or Path - Path to a pickle file containing a saved model, or a csv file - containing equations. + run_directory : str + The directory containing outputs from a previous run. + This is of the form `[output_directory]/[run_id]`. + Default is `None`. binary_operators : list[str] The same binary operators used when creating the model. Not needed if loading from a pickle file. @@ -1011,70 +1106,74 @@ def from_file( model : PySRRegressor The model with fitted equations. """ + if equation_file is not None: + raise ValueError( + "Passing `equation_file` is deprecated and no longer compatible with " + "the most recent versions of PySR's backend. Please pass `run_directory` " + "instead, which contains all checkpoint files." + ) - pkl_filename = _csv_filename_to_pkl_filename(equation_file) - - # Try to load model from .pkl - print(f"Checking if {pkl_filename} exists...") - if os.path.exists(pkl_filename): - print(f"Loading model from {pkl_filename}") + pkl_filename = Path(run_directory) / "checkpoint.pkl" + if pkl_filename.exists(): + print(f"Attempting to load model from {pkl_filename}...") assert binary_operators is None assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model = pkl.load(f) - # Change equation_file_ to be in the same dir as the pickle file - base_dir = os.path.dirname(pkl_filename) - base_equation_file = os.path.basename(model.equation_file_) - model.equation_file_ = os.path.join(base_dir, base_equation_file) + model = cast("PySRRegressor", pkl.load(f)) # Update any parameters if necessary, such as # extra_sympy_mappings: model.set_params(**pysr_kwargs) + if "equations_" not in model.__dict__ or model.equations_ is None: model.refresh() return model - - # Else, we re-create it. - print( - f"{pkl_filename} does not exist, " - "so we must create the model from scratch." - ) - assert binary_operators is not None or unary_operators is not None - assert n_features_in is not None - - # TODO: copy .bkup file if exists. - model = cls( - equation_file=str(equation_file), - binary_operators=binary_operators, - unary_operators=unary_operators, - **pysr_kwargs, - ) - - model.nout_ = nout - model.n_features_in_ = n_features_in - - if feature_names_in is None: - model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)]) - model.display_feature_names_in_ = np.array( - [f"x{_subscriptify(i)}" for i in range(n_features_in)] - ) else: - assert len(feature_names_in) == n_features_in - model.feature_names_in_ = feature_names_in - model.display_feature_names_in_ = feature_names_in + print( + f"Checkpoint file {pkl_filename} does not exist. " + "Attempting to recreate model from scratch..." + ) + csv_filename = Path(run_directory) / "hall_of_fame.csv" + csv_filename_bak = Path(run_directory) / "hall_of_fame.csv.bak" + if not csv_filename.exists() and not csv_filename_bak.exists(): + raise FileNotFoundError( + f"Hall of fame file `{csv_filename}` or `{csv_filename_bak}` does not exist. " + "Please pass a `run_directory` containing a valid checkpoint file." + ) + assert binary_operators is not None or unary_operators is not None + assert n_features_in is not None + model = cls( + binary_operators=binary_operators, + unary_operators=unary_operators, + **pysr_kwargs, + ) + model.nout_ = nout + model.n_features_in_ = n_features_in - if selection_mask is None: - model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_) - else: - model.selection_mask_ = selection_mask + if feature_names_in is None: + model.feature_names_in_ = np.array( + [f"x{i}" for i in range(n_features_in)] + ) + model.display_feature_names_in_ = np.array( + [f"x{_subscriptify(i)}" for i in range(n_features_in)] + ) + else: + assert len(feature_names_in) == n_features_in + model.feature_names_in_ = feature_names_in + model.display_feature_names_in_ = feature_names_in - model.refresh(checkpoint_file=equation_file) + if selection_mask is None: + model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_) + else: + model.selection_mask_ = selection_mask + + model.refresh(run_directory=run_directory) - return model + return model - def __repr__(self): + def __repr__(self) -> str: """ Print all current equations fitted by the model. @@ -1121,7 +1220,7 @@ def __repr__(self): output += "]" return output - def __getstate__(self): + def __getstate__(self) -> dict[str, Any]: """ Handle pickle serialization for PySRRegressor. @@ -1146,6 +1245,7 @@ def __getstate__(self): f"`{state_key}` at runtime." ) state_keys_to_clear = state_keys_containing_lambdas + state_keys_to_clear.append("logger_") pickled_state = { key: (None if key in state_keys_to_clear else value) for key, value in state.items() @@ -1185,10 +1285,18 @@ def _checkpoint(self): """ # Save model state: self.show_pickle_warnings_ = False - with open(_csv_filename_to_pkl_filename(self.equation_file_), "wb") as f: - pkl.dump(self, f) + with open(self.get_pkl_filename(), "wb") as f: + try: + pkl.dump(self, f) + except Exception as e: + print(f"Error checkpointing model: {e}") self.show_pickle_warnings_ = True + def get_pkl_filename(self) -> Path: + path = Path(self.output_directory_) / self.run_id_ / "checkpoint.pkl" + path.parent.mkdir(parents=True, exist_ok=True) + return path + @property def equations(self): # pragma: no cover warnings.warn( @@ -1206,7 +1314,10 @@ def julia_options_(self): @property def julia_state_(self): """The deserialized state.""" - return jl_deserialize(self.julia_state_stream_) + return cast( + tuple[VectorValue, AnyValue] | None, + jl_deserialize(self.julia_state_stream_), + ) @property def raw_julia_state_(self): @@ -1218,7 +1329,13 @@ def raw_julia_state_(self): ) return self.julia_state_ - def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]: + @property + def expression_spec_(self): + return self.expression_spec or ExpressionSpec() + + def get_best( + self, index: int | list[int] | None = None + ) -> pd.Series | list[pd.Series]: """ Get best equation using `model_selection`. @@ -1264,27 +1381,44 @@ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]: equations_.loc[idx_model_selection(equations_, self.model_selection)], ) - def _setup_equation_file(self): - """ - Set the full pathname of the equation file. + @property + def equation_file_(self): + raise NotImplementedError( + "PySRRegressor.equation_file_ is now deprecated. " + "Please use PySRRegressor.output_directory_ and PySRRegressor.run_id_ " + "instead. For loading, you should pass `run_directory`." + ) - This is performed using `tempdir` and - `equation_file`. - """ - # Cast tempdir string as a Path object - self.tempdir_ = Path(tempfile.mkdtemp(dir=self.tempdir)) - if self.temp_equation_file: - self.equation_file_ = self.tempdir_ / "hall_of_fame.csv" - elif self.equation_file is None: - if self.warm_start and ( - hasattr(self, "equation_file_") and self.equation_file_ - ): - pass - else: - date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3] - self.equation_file_ = "hall_of_fame_" + date_time + ".csv" + def _setup_equation_file(self): + """Set the pathname of the output directory.""" + if self.warm_start and ( + hasattr(self, "run_id_") or hasattr(self, "output_directory_") + ): + assert hasattr(self, "output_directory_") + assert hasattr(self, "run_id_") + if self.run_id is not None: + assert self.run_id_ == self.run_id + if self.output_directory is not None: + assert self.output_directory_ == self.output_directory else: - self.equation_file_ = self.equation_file + self.output_directory_ = ( + tempfile.mkdtemp() + if self.temp_equation_file + else ( + "outputs" + if self.output_directory is None + else self.output_directory + ) + ) + self.run_id_ = ( + cast(str, SymbolicRegression.SearchUtilsModule.generate_run_id()) + if self.run_id is None + else self.run_id + ) + if self.temp_equation_file: + assert self.output_directory is None + + def _clear_equation_file_contents(self): self.equation_file_contents_ = None def _validate_and_modify_params(self) -> _DynamicallySetParams: @@ -1316,24 +1450,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams: elif self.maxsize < 7: raise ValueError("PySR requires a maxsize of at least 7") - if self.deterministic and not ( - self.multithreading in [False, None] - and self.procs == 0 - and self.random_state is not None - ): - raise ValueError( - "To ensure deterministic searches, you must set `random_state` to a seed, " - "`procs` to `0`, and `multithreading` to `False` or `None`." - ) - - if self.random_state is not None and ( - not self.deterministic or self.procs != 0 - ): - warnings.warn( - "Note: Setting `random_state` without also setting `deterministic` " - "to True and `procs` to 0 will result in non-deterministic searches. " - ) - if self.elementwise_loss is not None and self.loss_function is not None: raise ValueError( "You cannot set both `elementwise_loss` and `loss_function`." @@ -1350,7 +1466,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams: unary_operators=[], maxdepth=self.maxsize, constraints={}, - multithreading=self.procs != 0 and self.cluster_manager is None, batch_size=1, update_verbosity=int(self.verbosity), progress=self.progress, @@ -1386,15 +1501,15 @@ def _validate_and_set_fit_params( complexity_of_variables, X_units, y_units, - ) -> Tuple[ + ) -> tuple[ ndarray, ndarray, - Optional[ndarray], - Optional[ndarray], + ndarray | None, + ndarray | None, ArrayLike[str], - Union[int, float, List[Union[int, float]]], - Optional[ArrayLike[str]], - Optional[Union[str, ArrayLike[str]]], + int | float | list[int | float] | None, + ArrayLike[str] | None, + str | ArrayLike[str] | None, ]: """ Validate the parameters passed to the :term`fit` method. @@ -1480,7 +1595,7 @@ def _validate_and_set_fit_params( elif self.complexity_of_variables is not None: complexity_of_variables = self.complexity_of_variables else: - complexity_of_variables = 1 + complexity_of_variables = None # Data validation and feature name fetching via sklearn # This method sets the n_features_in_ attribute @@ -1527,23 +1642,37 @@ def _validate_and_set_fit_params( y_units, ) - def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]: - raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore - return cast(Tuple[ndarray, ndarray], raw_out) + def _validate_data_X_y(self, X: Any, y: Any) -> tuple[ndarray, ndarray]: + if OLD_SKLEARN: + raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore + else: + raw_out = validate_data(self, X=X, y=y, reset=True, multi_output=True) # type: ignore + return cast(tuple[ndarray, ndarray], raw_out) + + def _validate_data_X(self, X: Any) -> ndarray: + if OLD_SKLEARN: + raw_out = self._validate_data(X=X, reset=False) # type: ignore + else: + raw_out = validate_data(self, X=X, reset=False) # type: ignore + return cast(ndarray, raw_out) - def _validate_data_X(self, X) -> Tuple[ndarray]: - raw_out = self._validate_data(X=X, reset=False) # type: ignore - return cast(Tuple[ndarray], raw_out) + def _get_precision_mapped_dtype(self, X: np.ndarray) -> type: + is_complex = np.issubdtype(X.dtype, np.complexfloating) + is_real = not is_complex + if is_real: + return {16: np.float16, 32: np.float32, 64: np.float64}[self.precision] + else: + return {32: np.complex64, 64: np.complex128}[self.precision] def _pre_transform_training_data( self, X: ndarray, y: ndarray, - Xresampled: Union[ndarray, None], + Xresampled: ndarray | None, variable_names: ArrayLike[str], - complexity_of_variables: Union[int, float, List[Union[int, float]]], - X_units: Union[ArrayLike[str], None], - y_units: Union[ArrayLike[str], str, None], + complexity_of_variables: int | float | list[int | float] | None, + X_units: ArrayLike[str] | None, + y_units: ArrayLike[str] | str | None, random_state: np.random.RandomState, ): """ @@ -1564,7 +1693,7 @@ def _pre_transform_training_data( variable_names : list[str] Names of each variable in the training dataset, `X`. Of length `n_features`. - complexity_of_variables : int | float | list[int | float] + complexity_of_variables : int | float | list[int | float] | None Complexity of each variable in the training dataset, `X`. X_units : list[str] Units of each variable in the training dataset, `X`. @@ -1655,7 +1784,8 @@ def _run( X: ndarray, y: ndarray, runtime_params: _DynamicallySetParams, - weights: Optional[ndarray], + weights: ndarray | None, + category: ndarray | None, seed: int, ): """ @@ -1674,6 +1804,10 @@ def _run( Weight array of the same shape as `y`. Each element is how to weight the mean-square-error loss for that particular element of y. + category : ndarray | None + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample in `X`. seed : int Random seed for julia backend process. @@ -1695,13 +1829,7 @@ def _run( # specified in init, so we define them here locally: binary_operators = runtime_params.binary_operators unary_operators = runtime_params.unary_operators - maxdepth = runtime_params.maxdepth constraints = runtime_params.constraints - multithreading = runtime_params.multithreading - batch_size = runtime_params.batch_size - update_verbosity = runtime_params.update_verbosity - progress = runtime_params.progress - warmup_maxsize_by = runtime_params.warmup_maxsize_by nested_constraints = self.nested_constraints complexity_of_operators = self.complexity_of_operators @@ -1709,10 +1837,31 @@ def _run( cluster_manager = self.cluster_manager # Start julia backend processes - if not ALREADY_RAN and update_verbosity != 0: + if not ALREADY_RAN and runtime_params.update_verbosity != 0: print("Compiling Julia backend...") + parallelism, numprocs = _map_parallelism_params( + self.parallelism, self.procs, getattr(self, "multithreading", None) + ) + + if self.deterministic and parallelism != "serial": + raise ValueError( + "To ensure deterministic searches, you must set `parallelism='serial'`. " + "Additionally, make sure to set `random_state` to a seed." + ) + if self.random_state is not None and ( + parallelism != "serial" or not self.deterministic + ): + warnings.warn( + "Note: Setting `random_state` without also setting `deterministic=True` " + "and `parallelism='serial'` will result in non-deterministic searches." + ) + if cluster_manager is not None: + if parallelism != "multiprocessing": + raise ValueError( + "To use cluster managers, you must set `parallelism='multiprocessing'`." + ) cluster_manager = _load_cluster_manager(cluster_manager) # TODO(mcranmer): These functions should be part of this class. @@ -1720,15 +1869,19 @@ def _run( binary_operators=binary_operators, unary_operators=unary_operators, extra_sympy_mappings=self.extra_sympy_mappings, + expression_spec=self.expression_spec_, ) - constraints = _process_constraints( - binary_operators=binary_operators, - unary_operators=unary_operators, - constraints=constraints, - ) - - una_constraints = [constraints[op] for op in unary_operators] - bin_constraints = [constraints[op] for op in binary_operators] + if constraints is not None: + _constraints = _process_constraints( + binary_operators=binary_operators, + unary_operators=unary_operators, + constraints=constraints, + ) + una_constraints = [_constraints[op] for op in unary_operators] + bin_constraints = [_constraints[op] for op in binary_operators] + else: + una_constraints = None + bin_constraints = None # Parse dict into Julia Dict for nested constraints:: if nested_constraints is not None: @@ -1768,17 +1921,26 @@ def _run( else "nothing" ) + input_stream = jl.seval(self.input_stream) + load_required_packages( turbo=self.turbo, bumper=self.bumper, - enable_autodiff=self.enable_autodiff, + autodiff_backend=self.autodiff_backend, cluster_manager=cluster_manager, + logger_spec=self.logger_spec, ) + if self.autodiff_backend is not None: + autodiff_backend = jl.Symbol(self.autodiff_backend) + else: + autodiff_backend = None + mutation_weights = SymbolicRegression.MutationWeights( mutate_constant=self.weight_mutate_constant, mutate_operator=self.weight_mutate_operator, swap_operands=self.weight_swap_operands, + rotate_tree=self.weight_rotate_tree, add_node=self.weight_add_node, insert_node=self.weight_insert_node, delete_node=self.weight_delete_node, @@ -1788,8 +1950,8 @@ def _run( optimize=self.weight_optimize, ) - jl_binary_operators: List[Any] = [] - jl_unary_operators: List[Any] = [] + jl_binary_operators: list[Any] = [] + jl_unary_operators: list[Any] = [] for input_list, output_list, name in [ (binary_operators, jl_binary_operators, "binary"), (unary_operators, jl_unary_operators, "unary"), @@ -1802,6 +1964,17 @@ def _run( ) output_list.append(jl_op) + complexity_mapping = ( + jl.seval(self.complexity_mapping) if self.complexity_mapping else None + ) + + if hasattr(self, "logger_") and self.logger_ is not None and self.warm_start: + logger = self.logger_ + else: + logger = self.logger_spec.create_logger() if self.logger_spec else None + + self.logger_ = logger + # Call to Julia backend. # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl options = SymbolicRegression.Options( @@ -1812,14 +1985,19 @@ def _run( complexity_of_operators=complexity_of_operators, complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, + complexity_mapping=complexity_mapping, + expression_type=self.expression_spec_.julia_expression_type(), + expression_options=self.expression_spec_.julia_expression_options(), nested_constraints=nested_constraints, elementwise_loss=custom_loss, loss_function=custom_full_objective, maxsize=int(self.maxsize), - output_file=_escape_filename(self.equation_file_), + output_directory=_escape_filename(self.output_directory_), npopulations=int(self.populations), batching=self.batching, - batch_size=int(min([batch_size, len(X)]) if self.batching else len(X)), + batch_size=int( + min([runtime_params.batch_size, len(X)]) if self.batching else len(X) + ), mutation_weights=mutation_weights, tournament_selection_p=self.tournament_selection_p, tournament_selection_n=self.tournament_selection_n, @@ -1828,17 +2006,17 @@ def _run( dimensional_constraint_penalty=self.dimensional_constraint_penalty, dimensionless_constants_only=self.dimensionless_constants_only, alpha=self.alpha, - maxdepth=maxdepth, + maxdepth=runtime_params.maxdepth, fast_cycle=self.fast_cycle, turbo=self.turbo, bumper=self.bumper, - enable_autodiff=self.enable_autodiff, + autodiff_backend=autodiff_backend, migration=self.migration, hof_migration=self.hof_migration, fraction_replaced_hof=self.fraction_replaced_hof, should_simplify=self.should_simplify, should_optimize_constants=self.should_optimize_constants, - warmup_maxsize_by=warmup_maxsize_by, + warmup_maxsize_by=runtime_params.warmup_maxsize_by, use_frequency=self.use_frequency, use_frequency_in_tournament=self.use_frequency_in_tournament, adaptive_parsimony_scaling=self.adaptive_parsimony_scaling, @@ -1849,14 +2027,17 @@ def _run( print_precision=self.print_precision, optimizer_algorithm=self.optimizer_algorithm, optimizer_nrestarts=self.optimizer_nrestarts, + optimizer_f_calls_limit=self.optimizer_f_calls_limit, optimizer_probability=self.optimize_probability, optimizer_iterations=self.optimizer_iterations, perturbation_factor=self.perturbation_factor, + probability_negate_constant=self.probability_negate_constant, annealing=self.annealing, timeout_in_seconds=self.timeout_in_seconds, crossover_probability=self.crossover_probability, skip_mutation_failures=self.skip_mutation_failures, max_evals=self.max_evals, + input_stream=input_stream, early_stop_condition=early_stop_condition, seed=seed, deterministic=self.deterministic, @@ -1867,12 +2048,7 @@ def _run( # Convert data to desired precision test_X = np.array(X) - is_complex = np.issubdtype(test_X.dtype, np.complexfloating) - is_real = not is_complex - if is_real: - np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self.precision] - else: - np_dtype = {32: np.complex64, 64: np.complex128}[self.precision] + np_dtype = self._get_precision_mapped_dtype(test_X) # This converts the data into a Julia array: jl_X = jl_array(np.array(X, dtype=np_dtype).T) @@ -1888,16 +2064,14 @@ def _run( else: jl_weights = None - if self.procs == 0 and not multithreading: - parallelism = "serial" - elif multithreading: - parallelism = "multithreading" + if category is not None: + offset_for_julia_indexing = 1 + jl_category = jl_array( + (category + offset_for_julia_indexing).astype(np.int64) + ) + jl_extra = jl.seval("NamedTuple{(:class,)}")((jl_category,)) else: - parallelism = "multiprocessing" - - cprocs = ( - None if parallelism in ["serial", "multithreading"] else int(self.procs) - ) + jl_extra = jl.NamedTuple() if len(y.shape) > 1: # We set these manually so that they respect Python's 0 indexing @@ -1908,12 +2082,11 @@ def _run( else: jl_y_variable_names = None - PythonCall.GC.disable() - out = SymbolicRegression.equation_search( jl_X, jl_y, weights=jl_weights, + extra=jl_extra, niterations=int(self.niterations), variable_names=jl_array([str(v) for v in self.feature_names_in_]), display_variable_names=jl_array( @@ -1927,24 +2100,28 @@ def _run( else self.y_units_ ), options=options, - numprocs=cprocs, + numprocs=numprocs, parallelism=parallelism, saved_state=self.julia_state_, return_state=True, + run_id=self.run_id_, addprocs_function=cluster_manager, heap_size_hint_in_bytes=self.heap_size_hint_in_bytes, - progress=progress and self.verbosity > 0 and len(y.shape) == 1, + progress=runtime_params.progress + and self.verbosity > 0 + and len(y.shape) == 1, verbosity=int(self.verbosity), + logger=logger, ) - PythonCall.GC.enable() + if self.logger_spec is not None: + self.logger_spec.write_hparams(logger, self.get_params()) + if not self.warm_start: + self.logger_spec.close(logger) self.julia_state_stream_ = jl_serialize(out) # Set attributes - self.equations_ = self.get_hof() - - if self.delete_tempfiles: - shutil.rmtree(self.tempdir_) + self.equations_ = self.get_hof(out) ALREADY_RAN = True @@ -1954,14 +2131,14 @@ def fit( self, X, y, + *, Xresampled=None, weights=None, - variable_names: Optional[ArrayLike[str]] = None, - complexity_of_variables: Optional[ - Union[int, float, List[Union[int, float]]] - ] = None, - X_units: Optional[ArrayLike[str]] = None, - y_units: Optional[Union[str, ArrayLike[str]]] = None, + variable_names: ArrayLike[str] | None = None, + complexity_of_variables: int | float | list[int | float] | None = None, + X_units: ArrayLike[str] | None = None, + y_units: str | ArrayLike[str] | None = None, + category: ndarray | None = None, ) -> "PySRRegressor": """ Search for equations to fit the dataset and store them in `self.equations_`. @@ -1998,6 +2175,10 @@ def fit( Similar to `X_units`, but as a unit for the target variable, `y`. If `y` is a matrix, a list of units should be passed. If `X_units` is given but `y_units` is not, then `y_units` will be arbitrary. + category : list[int] + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample. Returns ------- @@ -2025,9 +2206,17 @@ def fit( self.y_units_ = None self._setup_equation_file() + self._clear_equation_file_contents() runtime_params = self._validate_and_modify_params() + if category is not None: + assert Xresampled is None + + if isinstance(self.expression_spec, ParametricExpressionSpec): + assert category is not None + + # TODO: Put `category` here ( X, y, @@ -2051,7 +2240,7 @@ def fit( if X.shape[0] > 10000 and not self.batching: warnings.warn( "Note: you are running with more than 10,000 datapoints. " - "You should consider turning on batching (https://astroautomata.com/PySR/options/#batching). " + "You should consider turning on batching (https://ai.damtp.cam.ac.uk/pysr/options/#batching). " "You should also reconsider if you need that many datapoints. " "Unless you have a large amount of noise (in which case you " "should smooth your dataset first), generally < 10,000 datapoints " @@ -2107,7 +2296,7 @@ def fit( self._checkpoint() # Perform the search: - self._run(X, y, runtime_params, weights=weights, seed=seed) + self._run(X, y, runtime_params, weights=weights, seed=seed, category=category) # Then, after fit, we save again, so the pickle file contains # the equations: @@ -2116,7 +2305,7 @@ def fit( return self - def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None: + def refresh(self, run_directory: PathLike | None = None) -> None: """ Update self.equations_ with any new options passed. @@ -2129,13 +2318,20 @@ def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None: Path to checkpoint hall of fame file to be loaded. The default will use the set `equation_file_`. """ - if checkpoint_file is not None: - self.equation_file_ = checkpoint_file - self.equation_file_contents_ = None - check_is_fitted(self, attributes=["equation_file_"]) + if run_directory is not None: + self.output_directory_ = str(Path(run_directory).parent) + self.run_id_ = Path(run_directory).name + self._clear_equation_file_contents() + check_is_fitted(self, attributes=["run_id_", "output_directory_"]) self.equations_ = self.get_hof() - def predict(self, X, index=None): + def predict( + self, + X, + index: int | list[int] | None = None, + *, + category: ndarray | None = None, + ) -> ndarray: """ Predict y from input X using the equation chosen by `model_selection`. @@ -2151,6 +2347,10 @@ def predict(self, X, index=None): particular row of `self.equations_`, you may specify the index here. For multiple output equations, you must pass a list of indices in the same order. + category : ndarray | None + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample in `X`. Returns ------- @@ -2192,15 +2392,30 @@ def predict(self, X, index=None): # feature selected) X in fit. X = X.reindex(columns=self.feature_names_in_) X = self._validate_data_X(X) + if self.expression_spec_.evaluates_in_julia: + # Julia wants the right dtype + X = X.astype(self._get_precision_mapped_dtype(X)) + + if category is not None: + offset_for_julia_indexing = 1 + args: tuple = ( + jl_array((category + offset_for_julia_indexing).astype(np.int64)), + ) + else: + args = () try: if isinstance(best_equation, list): assert self.nout_ > 1 return np.stack( - [eq["lambda_format"](X) for eq in best_equation], axis=1 + [ + cast(ndarray, eq["lambda_format"](X, *args)) + for eq in best_equation + ], + axis=1, ) else: - return best_equation["lambda_format"](X) + return cast(ndarray, best_equation["lambda_format"](X, *args)) except Exception as error: raise ValueError( "Failed to evaluate the expression. " @@ -2210,7 +2425,7 @@ def predict(self, X, index=None): "You can then run `model.refresh()` to re-load the expressions." ) from error - def sympy(self, index=None): + def sympy(self, index: int | list[int] | None = None): """ Return sympy representation of the equation(s) chosen by `model_selection`. @@ -2228,6 +2443,10 @@ def sympy(self, index=None): best_equation : str, list[str] of length nout_ SymPy representation of the best equation. """ + if not self.expression_spec_.supports_sympy: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support sympy export." + ) self.refresh() best_equation = self.get_best(index=index) if isinstance(best_equation, list): @@ -2236,7 +2455,9 @@ def sympy(self, index=None): else: return best_equation["sympy_format"] - def latex(self, index=None, precision=3): + def latex( + self, index: int | list[int] | None = None, precision: int = 3 + ) -> str | list[str]: """ Return latex representation of the equation(s) chosen by `model_selection`. @@ -2258,6 +2479,10 @@ def latex(self, index=None, precision=3): best_equation : str or list[str] of length nout_ LaTeX expression of the best equation. """ + if not self.expression_spec_.supports_latex: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support latex export." + ) self.refresh() sympy_representation = self.sympy(index=index) if self.nout_ > 1: @@ -2291,6 +2516,10 @@ def jax(self, index=None): Dictionary of callable jax function in "callable" key, and jax array of parameters as "parameters" key. """ + if not self.expression_spec_.supports_jax: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support jax export." + ) self.set_params(output_jax_format=True) self.refresh() best_equation = self.get_best(index=index) @@ -2323,6 +2552,10 @@ def pytorch(self, index=None): best_equation : torch.nn.Module PyTorch module representing the expression. """ + if not self.expression_spec_.supports_torch: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support torch export." + ) self.set_params(output_torch_format=True) self.refresh() best_equation = self.get_best(index=index) @@ -2362,27 +2595,35 @@ def paddle(self, index=None): else: return best_equation["paddle_format"] - def _read_equation_file(self): + def get_equation_file(self, i: int | None = None) -> Path: + if i is not None: + return ( + Path(self.output_directory_) + / self.run_id_ + / f"hall_of_fame_output{i}.csv" + ) + else: + return Path(self.output_directory_) / self.run_id_ / "hall_of_fame.csv" + + def _read_equation_file(self) -> list[pd.DataFrame]: """Read the hall of fame file created by `SymbolicRegression.jl`.""" try: if self.nout_ > 1: all_outputs = [] for i in range(1, self.nout_ + 1): - cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup" + cur_filename = str(self.get_equation_file(i)) + ".bak" if not os.path.exists(cur_filename): - cur_filename = str(self.equation_file_) + f".out{i}" + cur_filename = str(self.get_equation_file(i)) with open(cur_filename, "r", encoding="utf-8") as f: buf = f.read() buf = _preprocess_julia_floats(buf) - df = self._postprocess_dataframe(pd.read_csv(StringIO(buf))) - all_outputs.append(df) else: - filename = str(self.equation_file_) + ".bkup" + filename = str(self.get_equation_file()) + ".bak" if not os.path.exists(filename): - filename = str(self.equation_file_) + filename = str(self.get_equation_file()) with open(filename, "r", encoding="utf-8") as f: buf = f.read() buf = _preprocess_julia_floats(buf) @@ -2406,8 +2647,8 @@ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: return df - def get_hof(self): - """Get the equations from a hall of fame file. + def get_hof(self, search_output=None) -> pd.DataFrame | list[pd.DataFrame]: + """Get the equations from a hall of fame file or search output. If no arguments entered, the ones used previously from a call to PySR will be used. @@ -2416,151 +2657,34 @@ def get_hof(self): self, attributes=[ "nout_", - "equation_file_", + "run_id_", + "output_directory_", "selection_mask_", "feature_names_in_", ], ) - if ( + should_read_from_file = ( not hasattr(self, "equation_file_contents_") - ) or self.equation_file_contents_ is None: + or self.equation_file_contents_ is None + ) + if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() - # It is expected extra_jax/torch_mappings will be updated after fit. - # Thus, validation is performed here instead of in _validate_init_params - extra_jax_mappings = self.extra_jax_mappings - extra_torch_mappings = self.extra_torch_mappings - extra_paddle_mappings = self.extra_paddle_mappings - if extra_jax_mappings is not None: - for value in extra_jax_mappings.values(): - if not isinstance(value, str): - raise ValueError( - "extra_jax_mappings must have keys that are strings! " - "e.g., {sympy.sqrt: 'jnp.sqrt'}." - ) - else: - extra_jax_mappings = {} - if extra_torch_mappings is not None: - for value in extra_torch_mappings.values(): - if not callable(value): - raise ValueError( - "extra_torch_mappings must be callable functions! " - "e.g., {sympy.sqrt: torch.sqrt}." - ) - else: - extra_torch_mappings = {} - if extra_paddle_mappings is not None: - for value in extra_paddle_mappings.values(): - if not callable(value): - raise ValueError( - "extra_paddle_mappings must be callable functions! " - "e.g., {sympy.sqrt: paddle.sqrt}." - ) - else: - extra_paddle_mappings = {} - - ret_outputs = [] - - equation_file_contents = copy.deepcopy(self.equation_file_contents_) - - for output in equation_file_contents: - scores = [] - lastMSE = None - lastComplexity = 0 - sympy_format = [] - lambda_format = [] - jax_format = [] - torch_format = [] - paddle_format = [] - - for _, eqn_row in output.iterrows(): - eqn = pysr2sympy( - eqn_row["equation"], - feature_names_in=self.feature_names_in_, - extra_sympy_mappings=self.extra_sympy_mappings, - ) - sympy_format.append(eqn) - - # NumPy: - sympy_symbols = create_sympy_symbols(self.feature_names_in_) - lambda_format.append( - sympy2numpy( - eqn, - sympy_symbols, - selection=self.selection_mask_, - ) - ) + _validate_export_mappings(self.extra_jax_mappings, self.extra_torch_mappings) - # JAX: - if self.output_jax_format: - func, params = sympy2jax( - eqn, - sympy_symbols, - selection=self.selection_mask_, - extra_jax_mappings=self.extra_jax_mappings, - ) - jax_format.append({"callable": func, "parameters": params}) - - # Torch: - if self.output_torch_format: - module = sympy2torch( - eqn, - sympy_symbols, - selection=self.selection_mask_, - extra_torch_mappings=self.extra_torch_mappings, - ) - torch_format.append(module) - - # Paddle: - if self.output_paddle_format: - module = sympy2paddle( - eqn, - sympy_symbols, - selection=self.selection_mask_, - extra_paddle_mappings=self.extra_paddle_mappings, - ) - paddle_format.append(module) + equation_file_contents = cast(list[pd.DataFrame], self.equation_file_contents_) - curMSE = eqn_row["loss"] - curComplexity = eqn_row["complexity"] - - if lastMSE is None: - cur_score = 0.0 - else: - if curMSE > 0.0: - # TODO Move this to more obvious function/file. - cur_score = -np.log(curMSE / lastMSE) / ( - curComplexity - lastComplexity - ) - else: - cur_score = np.inf - - scores.append(cur_score) - lastMSE = curMSE - lastComplexity = curComplexity - - output["score"] = np.array(scores) - output["sympy_format"] = sympy_format - output["lambda_format"] = lambda_format - output_cols = [ - "complexity", - "loss", - "score", - "equation", - "sympy_format", - "lambda_format", - ] - if self.output_jax_format: - output_cols += ["jax_format"] - output["jax_format"] = jax_format - if self.output_torch_format: - output_cols += ["torch_format"] - output["torch_format"] = torch_format - if self.output_paddle_format: - output_cols += ["paddle_format"] - output["paddle_format"] = paddle_format - - ret_outputs.append(output[output_cols]) + ret_outputs = [ + pd.concat( + [ + output, + calculate_scores(output), + self.expression_spec_.create_exports(self, output, search_output), + ], + axis=1, + ) + for output in equation_file_contents + ] if self.nout_ > 1: return ret_outputs @@ -2568,10 +2692,10 @@ def get_hof(self): def latex_table( self, - indices=None, - precision=3, - columns=["equation", "complexity", "loss", "score"], - ): + indices: list[int] | None = None, + precision: int = 3, + columns: list[str] = ["equation", "complexity", "loss", "score"], + ) -> str: """Create a LaTeX/booktabs table for all, or some, of the equations. Parameters @@ -2594,6 +2718,10 @@ def latex_table( latex_table_str : str A string that will render a table in LaTeX of the equations. """ + if not self.expression_spec_.supports_latex: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support latex export." + ) self.refresh() if isinstance(self.equations_, list): @@ -2639,12 +2767,41 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str): return chosen_idx -def _mutate_parameter(param_name: str, param_value): - if param_name in ["binary_operators", "unary_operators"] and isinstance( - param_value, str - ): - return [param_value] +def calculate_scores(df: pd.DataFrame) -> pd.DataFrame: + """Calculate scores for each equation based on loss and complexity. + + Score is defined as the negated derivative of the log-loss with respect to complexity. + A higher score means the equation achieved a much better loss at a slightly higher complexity. + """ + scores = [] + lastMSE = None + lastComplexity = 0 + + for _, row in df.iterrows(): + curMSE = row["loss"] + curComplexity = row["complexity"] + + if lastMSE is None: + cur_score = 0.0 + else: + if curMSE > 0.0: + cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity) + else: + cur_score = np.inf + + scores.append(cur_score) + lastMSE = curMSE + lastComplexity = curComplexity + + return pd.DataFrame( + { + "score": np.array(scores), + }, + index=df.index, + ) + +def _mutate_parameter(param_name: str, param_value): if param_name == "batch_size" and param_value < 1: warnings.warn( "Given `batch_size` must be greater than or equal to one. " @@ -2664,3 +2821,88 @@ def _mutate_parameter(param_name: str, param_value): return False return param_value + + +def _map_parallelism_params( + parallelism: Literal["serial", "multithreading", "multiprocessing"] | None, + procs: int | None, + multithreading: bool | None, +) -> tuple[Literal["serial", "multithreading", "multiprocessing"], int | None]: + """Map old and new parallelism parameters to the new format. + + Parameters + ---------- + parallelism : str or None + New parallelism parameter. Can be "serial", "multithreading", or "multiprocessing". + procs : int or None + Number of processes parameter. + multithreading : bool or None + Old multithreading parameter. + + Returns + ------- + parallelism : str + Mapped parallelism mode. + procs : int or None + Mapped number of processes. + + Raises + ------ + ValueError + If both old and new parameters are specified, or if invalid combinations are given. + """ + # Check for mixing old and new parameters + using_new = parallelism is not None + using_old = multithreading is not None + + if using_new and using_old: + raise ValueError( + "Cannot mix old and new parallelism parameters. " + "Use either `parallelism` and `numprocs`, or `procs` and `multithreading`." + ) + elif using_old: + warnings.warn( + "The `multithreading: bool` parameter has been deprecated in favor " + "of `parallelism: Literal['multithreading', 'serial', 'multiprocessing']`.\n" + "Previous usage of `multithreading=True` (default) is now `parallelism='multithreading'`; " + "`multithreading=False, procs=0` is now `parallelism='serial'`; and " + "`multithreading=True, procs={int}` is now `parallelism='multiprocessing', procs={int}`." + ) + if multithreading: + _parallelism: Literal["multithreading", "multiprocessing", "serial"] = ( + "multithreading" + ) + _procs = None + elif procs is not None and procs > 0: + _parallelism = "multiprocessing" + _procs = procs + else: + _parallelism = "serial" + _procs = None + elif using_new: + _parallelism = cast( + Literal["serial", "multithreading", "multiprocessing"], parallelism + ) + _procs = procs + else: + _parallelism = "multithreading" + _procs = None + + if _parallelism not in {"serial", "multithreading", "multiprocessing"}: + raise ValueError( + "`parallelism` must be one of 'serial', 'multithreading', or 'multiprocessing'" + ) + elif _parallelism == "serial" and _procs is not None: + warnings.warn( + "`numprocs` is specified but will be ignored since `parallelism='serial'`" + ) + _procs = None + elif parallelism == "multithreading" and _procs is not None: + warnings.warn( + "`numprocs` is specified but will be ignored since `parallelism='multithreading'`" + ) + _procs = None + elif parallelism == "multiprocessing" and _procs is None: + _procs = cpu_count() + + return _parallelism, _procs diff --git a/pysr/test/__init__.py b/pysr/test/__init__.py index 4c2ccfc10..a1016e30e 100644 --- a/pysr/test/__init__.py +++ b/pysr/test/__init__.py @@ -1,7 +1,7 @@ -from .test import runtests from .test_cli import get_runtests as get_runtests_cli from .test_dev import runtests as runtests_dev from .test_jax import runtests as runtests_jax +from .test_main import runtests from .test_paddle import runtests as runtests_paddle from .test_startup import runtests as runtests_startup from .test_torch import runtests as runtests_torch diff --git a/pysr/test/params.py b/pysr/test/params.py index 54da4ac7d..10de46c99 100644 --- a/pysr/test/params.py +++ b/pysr/test/params.py @@ -1,4 +1,6 @@ import inspect +import os +import unittest from pysr import PySRRegressor @@ -6,3 +8,8 @@ DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default DEFAULT_POPULATIONS = DEFAULT_PARAMS["populations"].default DEFAULT_NCYCLES = DEFAULT_PARAMS["ncycles_per_iteration"].default + +skip_if_beartype = unittest.skipIf( + os.environ.get("PYSR_USE_BEARTYPE", "0") == "1", + "Skipping because beartype would fail test", +) diff --git a/pysr/test/test_dev.py b/pysr/test/test_dev.py index b8a2b4645..a0f4f7f5d 100644 --- a/pysr/test/test_dev.py +++ b/pysr/test/test_dev.py @@ -7,8 +7,8 @@ class TestDev(unittest.TestCase): def test_simple_change_to_backend(self): """Test that we can use a development version of SymbolicRegression.jl""" - PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.6") - PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.9") + PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.11") + PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.12") build_result = subprocess.run( [ "docker", diff --git a/pysr/test/test_dev_pysr.dockerfile b/pysr/test/test_dev_pysr.dockerfile index 2978e82b7..fcde426b9 100644 --- a/pysr/test/test_dev_pysr.dockerfile +++ b/pysr/test/test_dev_pysr.dockerfile @@ -2,8 +2,8 @@ # tries to manually edit SymbolicRegression.jl and # use it from PySR. -ARG JLVERSION=1.9.4 -ARG PYVERSION=3.11.6 +ARG JLVERSION=1.11.1 +ARG PYVERSION=3.12.6 ARG BASE_IMAGE=bullseye FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl @@ -15,10 +15,6 @@ ENV PATH="/usr/local/julia/bin:${PATH}" WORKDIR /pysr -# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) -ADD ./requirements.txt /pysr/requirements.txt -RUN pip3 install --no-cache-dir -r /pysr/requirements.txt - # Install PySR: # We do a minimal copy so it doesn't need to rerun at every file change: ADD ./pyproject.toml /pysr/pyproject.toml diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py index e0237c829..d261afcf5 100644 --- a/pysr/test/test_jax.py +++ b/pysr/test/test_jax.py @@ -1,5 +1,6 @@ import unittest from functools import partial +from pathlib import Path import numpy as np import pandas as pd @@ -46,11 +47,12 @@ def test_pipeline_pandas(self): } ) - equations["Complexity Loss Equation".split(" ")].to_csv( - "equation_file.csv.bkup" - ) + for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]: + equations["Complexity Loss Equation".split(" ")].to_csv( + Path(model.output_directory_) / model.run_id_ / fname + ) - model.refresh(checkpoint_file="equation_file.csv") + model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) jformat = model.jax() np.testing.assert_almost_equal( @@ -73,11 +75,12 @@ def test_pipeline(self): } ) - equations["Complexity Loss Equation".split(" ")].to_csv( - "equation_file.csv.bkup" - ) + for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]: + equations["Complexity Loss Equation".split(" ")].to_csv( + Path(model.output_directory_) / model.run_id_ / fname + ) - model.refresh(checkpoint_file="equation_file.csv") + model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) jformat = model.jax() np.testing.assert_almost_equal( @@ -121,6 +124,8 @@ def test_feature_selection_custom_operators(self): def cos_approx(x): return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720 + sp_cos_approx = sympy.Function("cos_approx") + y = X["k15"] ** 2 + 2 * cos_approx(X["k20"]) model = PySRRegressor( @@ -129,26 +134,19 @@ def cos_approx(x): select_k_features=3, maxsize=10, early_stop_condition=1e-5, - extra_sympy_mappings={"cos_approx": cos_approx}, + extra_sympy_mappings={"cos_approx": sp_cos_approx}, extra_jax_mappings={ - "cos_approx": "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)" + sp_cos_approx: "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)" }, random_state=0, deterministic=True, - procs=0, - multithreading=False, + parallelism="serial", ) np.random.seed(0) model.fit(X.values, y.values) f, parameters = model.jax().values() - - np_prediction = model.predict jax_prediction = partial(f, parameters=parameters) - - np_output = np_prediction(X.values) jax_output = jax_prediction(X.values) - - np.testing.assert_almost_equal(y.values, np_output, decimal=3) np.testing.assert_almost_equal(y.values, jax_output, decimal=3) diff --git a/pysr/test/test.py b/pysr/test/test_main.py similarity index 82% rename from pysr/test/test.py rename to pysr/test/test_main.py index c641e9f66..2f5f440a8 100644 --- a/pysr/test/test.py +++ b/pysr/test/test_main.py @@ -1,6 +1,7 @@ import importlib import os import pickle as pkl +import platform import tempfile import traceback import unittest @@ -12,7 +13,15 @@ import sympy # type: ignore from sklearn.utils.estimator_checks import check_estimator -from pysr import PySRRegressor, install, jl, load_all_packages +from pysr import ( + ParametricExpressionSpec, + PySRRegressor, + TemplateExpressionSpec, + TensorBoardLoggerSpec, + install, + jl, + load_all_packages, +) from pysr.export_latex import sympy2latex from pysr.feature_selection import _handle_feature_selection, run_feature_selection from pysr.julia_helpers import init_julia @@ -22,13 +31,13 @@ _suggest_keywords, idx_model_selection, ) -from pysr.utils import _csv_filename_to_pkl_filename from .params import ( DEFAULT_NCYCLES, DEFAULT_NITERATIONS, DEFAULT_PARAMS, DEFAULT_POPULATIONS, + skip_if_beartype, ) # Disables local saving: @@ -94,7 +103,7 @@ def test_multiprocessing_turbo_custom_objective(self): # Turbo needs to work with unsafe operators: unary_operators=["sqrt"], procs=2, - multithreading=False, + parallelism="multiprocessing", turbo=True, early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1", loss_function=""" @@ -348,9 +357,8 @@ def test_warm_start_set_at_init(self): def test_noisy_builtin_variable_names(self): y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05 model = PySRRegressor( - # Test that passing a single operator works: - unary_operators="sq(x) = x^2", - binary_operators="plus", + unary_operators=["sq(x) = x^2"], + binary_operators=["plus"], extra_sympy_mappings={"sq": lambda x: x**2}, **self.default_test_kwargs, procs=0, @@ -447,16 +455,17 @@ def test_load_model(self): csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")]) for from_backup in [False, True]: - rand_dir = Path(tempfile.mkdtemp()) - equation_filename = str(rand_dir / "equation.csv") - with open(equation_filename + (".bkup" if from_backup else ""), "w") as f: + output_directory = Path(tempfile.mkdtemp()) + equation_filename = str(output_directory / "hall_of_fame.csv") + with open(equation_filename + (".bak" if from_backup else ""), "w") as f: f.write(csv_file_data) model = PySRRegressor.from_file( - equation_filename, + run_directory=output_directory, n_features_in=5, feature_names_in=["f0", "f1", "f2", "f3", "f4"], binary_operators=["+", "*", "/", "-", "^"], unary_operators=["cos"], + precision=64, ) X = self.rstate.rand(100, 5) y_truth = 2.2683423 ** np.cos(X[:, 3]) @@ -468,9 +477,8 @@ def test_load_model_simple(self): # Test that we can simply load a model from its equation file. y = self.X[:, [0, 1]] ** 2 model = PySRRegressor( - # Test that passing a single operator works: - unary_operators="sq(x) = x^2", - binary_operators="plus", + unary_operators=["sq(x) = x^2"], + binary_operators=["plus"], extra_sympy_mappings={"sq": lambda x: x**2}, **self.default_test_kwargs, procs=0, @@ -478,27 +486,28 @@ def test_load_model_simple(self): early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2", ) rand_dir = Path(tempfile.mkdtemp()) - equation_file = rand_dir / "equations.csv" + equation_file = rand_dir / "1" / "hall_of_fame.csv" model.set_params(temp_equation_file=False) - model.set_params(equation_file=equation_file) + model.set_params(output_directory=rand_dir) + model.set_params(run_id="1") model.fit(self.X, y) # lambda functions are removed from the pickling, so we need # to pass it during the loading: model2 = PySRRegressor.from_file( - model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2} ) np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X)) # Try again, but using only the pickle file: - for file_to_delete in [str(equation_file), str(equation_file) + ".bkup"]: + for file_to_delete in [str(equation_file), str(equation_file) + ".bak"]: if os.path.exists(file_to_delete): os.remove(file_to_delete) # pickle_file = rand_dir / "equations.pkl" model3 = PySRRegressor.from_file( - model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2} ) np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X)) @@ -512,38 +521,219 @@ def test_jl_function_error(self): str(cm.exception), ) + def test_template_expressions_and_custom_complexity(self): + # Create random data between -1 and 1 + X = self.rstate.uniform(-1, 1, (100, 2)) + + # Ground truth: sin(x + y) + y = np.sin(X[:, 0] + X[:, 1]) + + # Create model with template that includes the missing sin operator + model = PySRRegressor( + expression_spec=TemplateExpressionSpec( + ["f"], "sin_of_f((; f), (x, y)) = sin(f(x, y))" + ), + binary_operators=["+", "-", "*", "/"], + unary_operators=[], # No sin operator! + maxsize=10, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 6", + # Custom complexity *function*: + complexity_mapping="my_complexity(ex) = sum(t -> 2, get_tree(ex))", + **self.default_test_kwargs, + ) + + model.fit(X, y) + + # Test on out of domain data - this should still work due to sin template! + X_test = self.rstate.uniform(2, 10, (25, 2)) + y_test = np.sin(X_test[:, 0] + X_test[:, 1]) + y_pred = model.predict(X_test) + + test_mse = np.mean((y_test - y_pred) ** 2) + self.assertLess(test_mse, 1e-5) + + # Check there is a row with complexity 6 and MSE < 1e-10 + df = model.equations_ + good_rows = df[(df.complexity == 6) & (df.loss < 1e-10)] + self.assertGreater(len(good_rows), 0) + + # Check there are NO rows with lower complexity and MSE < 1e-10 + simpler_good_rows = df[(df.complexity < 6) & (df.loss < 1e-10)] + self.assertEqual(len(simpler_good_rows), 0) + + # Make sure that a nice error is raised if we try to get the sympy expression: + # f"`expression_spec={self.expression_spec_}` does not support sympy export." + with self.assertRaises(ValueError) as cm: + model.sympy() + self.assertRegex( + str(cm.exception), + r"`expression_spec=.*TemplateExpressionSpec.*` does not support sympy export.", + ) + with self.assertRaises(ValueError): + model.latex() + with self.assertRaises(ValueError): + model.jax() + with self.assertRaises(ValueError): + model.pytorch() + with self.assertRaises(ValueError): + model.latex_table() + + def test_parametric_expression(self): + # Create data with two classes + n_points = 100 + X = self.rstate.uniform(-3, 3, (n_points, 2)) # x1, x2 + category = self.rstate.randint(0, 3, n_points) # class (0 or 1) + + # True parameters for each class + P1 = [0.1, 1.5, -5.2] # phase shift for each class + P2 = [3.2, 0.5, 1.2] # offset for each class + + # Ground truth: 2*cos(x2 + P1[class]) + x1^2 - P2[class] + y = np.array( + [ + 2 * np.cos(x2 + P1[c]) + x1**2 - P2[c] + for x1, x2, c in zip(X[:, 0], X[:, 1], category) + ] + ) + + model = PySRRegressor( + expression_spec=ParametricExpressionSpec(max_parameters=2), + binary_operators=["+", "*", "/", "-"], + unary_operators=["cos", "exp"], + maxsize=20, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity <= 14", + **self.default_test_kwargs, + ) + + model.fit(X, y, category=category) + + # Test on new data points + X_test = self.rstate.uniform(-6, 6, (10, 2)) + category_test = self.rstate.randint(0, 3, 10) + + y_test = np.array( + [ + 2 * np.cos(x2 + P1[c]) + x1**2 - P2[c] + for x1, x2, c in zip(X_test[:, 0], X_test[:, 1], category_test) + ] + ) + + y_test_pred = model.predict(X_test, category=category_test) + test_mse = np.mean((y_test - y_test_pred) ** 2) + self.assertLess(test_mse, 1e-3) + + with self.assertRaises(ValueError): + model.sympy() + with self.assertRaises(ValueError): + model.latex() + with self.assertRaises(ValueError): + model.jax() + with self.assertRaises(ValueError): + model.pytorch() + with self.assertRaises(ValueError): + model.latex_table() + + def test_tensorboard_logger(self): + + if platform.system() == "Windows": + self.skipTest("Skipping test on Windows") + + """Test TensorBoard logger functionality.""" + try: + from tensorboard.backend.event_processing.event_accumulator import ( # type: ignore + EventAccumulator, + ) + except ImportError: + self.skipTest("TensorBoard not installed. Skipping test.") + + y = self.X[:, 0] + for warm_start in [False, True]: + with tempfile.TemporaryDirectory() as tmpdir: + logger_spec = TensorBoardLoggerSpec( + log_dir=tmpdir, log_interval=2, overwrite=True + ) + model = PySRRegressor( + **self.default_test_kwargs, + logger_spec=logger_spec, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1", + warm_start=warm_start, + ) + model.fit(self.X, y) + logger = model.logger_ + # Should restart from same logger if warm_start is True + model.fit(self.X, y) + logger2 = model.logger_ + + if warm_start: + self.assertEqual(logger, logger2) + else: + self.assertNotEqual(logger, logger2) + + # Verify log directory exists and contains TensorBoard files + log_dir = Path(tmpdir) + assert log_dir.exists() + files = list(log_dir.glob("events.out.tfevents.*")) + assert len(files) == 1 if warm_start else 2 + + # Load and verify TensorBoard events + event_acc = EventAccumulator(str(log_dir)) + event_acc.Reload() + + # Check that we have the expected scalar summaries + scalars = event_acc.Tags()["scalars"] + self.assertIn("search/data/summaries/pareto_volume", scalars) + self.assertIn("search/data/summaries/min_loss", scalars) + + # Check that we have multiple events for each summary + pareto_events = event_acc.Scalars("search/data/summaries/pareto_volume") + min_loss_events = event_acc.Scalars("search/data/summaries/min_loss") + + self.assertGreater(len(pareto_events), 0) + self.assertGreater(len(min_loss_events), 0) + + # Verify model still works as expected + self.assertLessEqual(model.get_best()["loss"], 1e-4) + def manually_create_model(equations, feature_names=None): if feature_names is None: feature_names = ["x0", "x1"] + output_directory = tempfile.mkdtemp() + run_id = "test" model = PySRRegressor( progress=False, niterations=1, extra_sympy_mappings={}, output_jax_format=False, model_selection="accuracy", - equation_file="equation_file.csv", + output_directory=output_directory, + run_id=run_id, ) + model.output_directory_ = output_directory + model.run_id_ = run_id + os.makedirs(Path(output_directory) / run_id, exist_ok=True) # Set up internal parameters as if it had been fitted: if isinstance(equations, list): # Multi-output. - model.equation_file_ = "equation_file.csv" model.nout_ = len(equations) model.selection_mask_ = None model.feature_names_in_ = np.array(feature_names, dtype=object) for i in range(model.nout_): equations[i]["complexity loss equation".split(" ")].to_csv( - f"equation_file.csv.out{i+1}.bkup" + str( + Path(output_directory) + / run_id + / f"hall_of_fame_output{i+1}.csv.bak" + ) ) else: - model.equation_file_ = "equation_file.csv" model.nout_ = 1 model.selection_mask_ = None model.feature_names_in_ = np.array(feature_names, dtype=object) equations["complexity loss equation".split(" ")].to_csv( - "equation_file.csv.bkup" + str(Path(output_directory) / run_id / "hall_of_fame.csv.bak") ) model.refresh() @@ -630,27 +820,12 @@ def test_feature_selection_handler(self): class TestMiscellaneous(unittest.TestCase): """Test miscellaneous functions.""" - def test_csv_to_pkl_conversion(self): - """Test that csv filename to pkl filename works as expected.""" - tmpdir = Path(tempfile.mkdtemp()) - equation_file = tmpdir / "equations.389479384.28378374.csv" - expected_pkl_file = tmpdir / "equations.389479384.28378374.pkl" - - # First, test inputting the paths: - test_pkl_file = _csv_filename_to_pkl_filename(equation_file) - self.assertEqual(test_pkl_file, str(expected_pkl_file)) - - # Next, test inputting the strings. - test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file)) - self.assertEqual(test_pkl_file, str(expected_pkl_file)) - def test_pickle_with_temp_equation_file(self): """If we have a temporary equation file, unpickle the estimator.""" model = PySRRegressor( populations=int(1 + DEFAULT_POPULATIONS / 5), temp_equation_file=True, - procs=0, - multithreading=False, + parallelism="serial", ) nout = 3 X = np.random.randn(100, 2) @@ -660,9 +835,15 @@ def test_pickle_with_temp_equation_file(self): y_predictions = model.predict(X) - equation_file_base = model.equation_file_ + equation_file_base = Path("outputs") / model.run_id_ / "hall_of_fame" for i in range(1, nout + 1): - assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup") + assert not os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak") + + equation_file_base = ( + Path(model.output_directory_) / model.run_id_ / "hall_of_fame" + ) + for i in range(1, nout + 1): + assert os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak") with tempfile.NamedTemporaryFile() as pickle_file: pkl.dump(model, pickle_file) @@ -687,8 +868,7 @@ def test_scikit_learn_compatibility(self): progress=False, random_state=0, deterministic=True, # Deterministic as tests require this. - procs=0, - multithreading=False, + parallelism="serial", warm_start=False, temp_equation_file=True, ) # Return early. @@ -696,8 +876,14 @@ def test_scikit_learn_compatibility(self): check_generator = check_estimator(model, generate_only=True) exception_messages = [] for _, check in check_generator: - if check.func.__name__ == "check_complex_data": - # We can use complex data, so avoid this check. + if check.func.__name__ in { + # We can use complex data, so avoid this check + "check_complex_data", + # We handle kwargs manually, so skip this check + "check_do_not_raise_errors_in_init_or_set_params", + # TODO: + "check_n_features_in_after_fitting", + }: continue try: with warnings.catch_warnings(): @@ -748,6 +934,7 @@ def test_load_all_packages(self): class TestHelpMessages(unittest.TestCase): """Test user help messages.""" + @skip_if_beartype def test_deprecation(self): """Ensure that deprecation works as expected. @@ -760,6 +947,14 @@ def test_deprecation(self): # The correct value should be set: self.assertEqual(model.fraction_replaced, 0.2) + with self.assertRaises(NotImplementedError): + model.equation_file_ + + with self.assertRaises(ValueError) as cm: + PySRRegressor.from_file(equation_file="", run_directory="") + + self.assertIn("Passing `equation_file` is deprecated", str(cm.exception)) + def test_deprecated_functions(self): with self.assertWarns(FutureWarning): install() @@ -807,7 +1002,7 @@ def test_deterministic_warnings(self): warnings.simplefilter("error") with self.assertRaises(Exception) as context: model.fit(X, y) - self.assertIn("`deterministic`", str(context.exception)) + self.assertIn("`deterministic=True`", str(context.exception)) def test_deterministic_errors(self): """Setting deterministic without random_state should error""" @@ -846,6 +1041,7 @@ def test_bad_variable_names_fail(self): model.fit(X, y, variable_names=["f{c}"]) self.assertIn("Invalid variable name", str(cm.exception)) + @skip_if_beartype def test_bad_kwargs(self): bad_kwargs = [ dict( @@ -1207,8 +1403,8 @@ def test_unit_propagation(self): """ X = np.ones((100, 3)) y = np.ones((100, 1)) - temp_dir = Path(tempfile.mkdtemp()) - equation_file = str(temp_dir / "equation_file.csv") + output_dir = tempfile.mkdtemp() + run_id = "test" model = PySRRegressor( binary_operators=["+", "*"], early_stop_condition="(l, c) -> l < 1e-6 && c == 3", @@ -1219,11 +1415,11 @@ def test_unit_propagation(self): complexity_of_constants=10, weight_mutate_constant=0.0, should_optimize_constants=False, - multithreading=False, + parallelism="serial", deterministic=True, - procs=0, random_state=0, - equation_file=equation_file, + output_directory=output_dir, + run_id=run_id, warm_start=True, ) model.fit( @@ -1243,16 +1439,18 @@ def test_unit_propagation(self): ) # With pkl file: - pkl_file = str(temp_dir / "equation_file.pkl") - model2 = PySRRegressor.from_file(pkl_file) + run_directory = str(Path(output_dir) / run_id) + model2 = PySRRegressor.from_file(run_directory=run_directory) best2 = model2.get_best() self.assertIn("x0", best2["equation"]) # From csv file alone (we need to delete pkl file:) # First, we delete the pkl file: - os.remove(pkl_file) + os.remove(Path(run_directory) / "checkpoint.pkl") model3 = PySRRegressor.from_file( - equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1] + run_directory=run_directory, + binary_operators=["+", "*"], + n_features_in=X.shape[1], ) best3 = model3.get_best() self.assertIn("x0", best3["equation"]) diff --git a/pysr/test/test_nb.ipynb b/pysr/test/test_nb.ipynb index 1cd394fff..ea0084ad7 100644 --- a/pysr/test/test_nb.ipynb +++ b/pysr/test/test_nb.ipynb @@ -122,7 +122,7 @@ "X = np.random.randn(10, 2)\n", "y = np.random.randn(10)\n", "\n", - "model = PySRRegressor(deterministic=True, multithreading=False, procs=0, random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n", + "model = PySRRegressor(deterministic=True, parallelism=\"serial\", random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n", "str(model)" ] }, diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py index 6ad64b624..4b2a450b5 100644 --- a/pysr/test/test_startup.py +++ b/pysr/test/test_startup.py @@ -9,8 +9,9 @@ import numpy as np -from pysr import PySRRegressor +from pysr import PySRRegressor, jl from pysr.julia_import import jl_version +from pysr.julia_registry_helpers import PREFERENCE_KEY, try_with_registry_fallback from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS @@ -43,7 +44,8 @@ def test_warm_start_from_file(self): ) model.warm_start = True model.temp_equation_file = False - model.equation_file = Path(tmpdirname) / "equations.csv" + model.output_directory = tmpdirname + model.run_id = "test" model.deterministic = True model.multithreading = False model.random_state = 0 @@ -76,7 +78,9 @@ def test_warm_start_from_file(self): y = np.load("{y_file}") print("Loading model from file") - model = PySRRegressor.from_file("{model.equation_file}") + model = PySRRegressor.from_file( + run_directory="{str(Path(tmpdirname) / model.run_id_)}" + ) assert model.julia_state_ is not None @@ -130,8 +134,6 @@ def test_bad_startup_options(self): self.assertIn(warning_test["msg"], result.stderr.decode()) def test_notebook(self): - if jl_version < (1, 9, 0): - self.skipTest("Julia version too old") if platform.system() == "Windows": self.skipTest("Notebook test incompatible with Windows") if not os.access(Path(__file__).parent, os.W_OK): @@ -158,8 +160,73 @@ def test_notebook(self): self.assertEqual(result.returncode, 0) +class TestRegistryHelper(unittest.TestCase): + """Test the custom Julia registry preference handling.""" + + def setUp(self): + self.old_value = os.environ.get(PREFERENCE_KEY, None) + self.recorded_env_vars = [] + self.hits = 0 + + def failing_operation(): + self.recorded_env_vars.append(os.environ[PREFERENCE_KEY]) + self.hits += 1 + # Just add some package I know will not exist and also not be in the dependency chain: + jl.Pkg.add(name="AirspeedVelocity", version="100.0.0") + + self.failing_operation = failing_operation + + def tearDown(self): + if self.old_value is not None: + os.environ[PREFERENCE_KEY] = self.old_value + else: + os.environ.pop(PREFERENCE_KEY, None) + + def test_successful_operation(self): + self.assertEqual(try_with_registry_fallback(lambda s: s, "success"), "success") + + def test_non_julia_errors_reraised(self): + with self.assertRaises(SyntaxError) as context: + try_with_registry_fallback(lambda: exec("invalid syntax !@#$")) + self.assertNotIn("JuliaError", str(context.exception)) + + def test_julia_error_triggers_fallback(self): + os.environ[PREFERENCE_KEY] = "conservative" + + with self.assertWarns(Warning) as warn_context: + with self.assertRaises(Exception) as error_context: + try_with_registry_fallback(self.failing_operation) + + self.assertIn( + "Unsatisfiable requirements detected", str(error_context.exception) + ) + self.assertIn( + "Initial Julia registry operation failed. Attempting to use the `eager` registry flavor of the Julia", + str(warn_context.warning), + ) + + # Verify both modes are tried in order + self.assertEqual(self.recorded_env_vars, ["conservative", "eager"]) + self.assertEqual(self.hits, 2) + + # Verify environment is restored + self.assertEqual(os.environ[PREFERENCE_KEY], "conservative") + + def test_eager_mode_fails_directly(self): + os.environ[PREFERENCE_KEY] = "eager" + + with self.assertRaises(Exception) as context: + try_with_registry_fallback(self.failing_operation) + + self.assertIn("Unsatisfiable requirements detected", str(context.exception)) + self.assertEqual( + self.recorded_env_vars, ["eager"] + ) # Should only try eager mode + self.assertEqual(self.hits, 1) + + def runtests(just_tests=False): - tests = [TestStartup] + tests = [TestStartup, TestRegistryHelper] if just_tests: return tests suite = unittest.TestSuite() diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py index 8b26f5ca6..256d21f86 100644 --- a/pysr/test/test_torch.py +++ b/pysr/test/test_torch.py @@ -1,4 +1,5 @@ import unittest +from pathlib import Path import numpy as np import pandas as pd @@ -48,11 +49,12 @@ def test_pipeline_pandas(self): } ) - equations["Complexity Loss Equation".split(" ")].to_csv( - "equation_file.csv.bkup" - ) + for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]: + equations["Complexity Loss Equation".split(" ")].to_csv( + Path(model.output_directory_) / model.run_id_ / fname + ) - model.refresh(checkpoint_file="equation_file.csv") + model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) tformat = model.pytorch() self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") @@ -81,11 +83,12 @@ def test_pipeline(self): } ) - equations["Complexity Loss Equation".split(" ")].to_csv( - "equation_file.csv.bkup" - ) + for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]: + equations["Complexity Loss Equation".split(" ")].to_csv( + Path(model.output_directory_) / model.run_id_ / fname + ) - model.refresh(checkpoint_file="equation_file.csv") + model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) tformat = model.pytorch() self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") @@ -133,21 +136,26 @@ def test_custom_operator(self): } ) - equations["Complexity Loss Equation".split(" ")].to_csv( - "equation_file_custom_operator.csv.bkup" - ) + for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]: + equations["Complexity Loss Equation".split(" ")].to_csv( + Path(model.output_directory_) / model.run_id_ / fname + ) + + MyCustomOperator = sympy.Function("mycustomoperator") model.set_params( - equation_file="equation_file_custom_operator.csv", - extra_sympy_mappings={"mycustomoperator": sympy.sin}, - extra_torch_mappings={"mycustomoperator": self.torch.sin}, + extra_sympy_mappings={"mycustomoperator": MyCustomOperator}, + extra_torch_mappings={MyCustomOperator: self.torch.sin}, ) - model.refresh(checkpoint_file="equation_file_custom_operator.csv") - self.assertEqual(str(model.sympy()), "sin(x1)") + # TODO: We shouldn't need to specify the run directory here. + model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) + # self.assertEqual(str(model.sympy()), "sin(x1)") # Will automatically use the set global state from get_hof. tformat = model.pytorch() - self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))") + self.assertEqual( + str(tformat), "_SingleSymPyModule(expression=mycustomoperator(x1))" + ) np.testing.assert_almost_equal( tformat(self.torch.tensor(X)).detach().numpy(), np.sin(X[:, 1]), @@ -200,11 +208,9 @@ def cos_approx(x): maxsize=10, early_stop_condition=1e-5, extra_sympy_mappings={"cos_approx": cos_approx}, - extra_torch_mappings={"cos_approx": cos_approx}, random_state=0, deterministic=True, - procs=0, - multithreading=False, + parallelism="serial", ) np.random.seed(0) model.fit(X.values, y.values) diff --git a/pysr/utils.py b/pysr/utils.py index de7faf16e..e18a5b4c3 100644 --- a/pysr/utils.py +++ b/pysr/utils.py @@ -1,42 +1,35 @@ +from __future__ import annotations + import difflib import inspect -import os import re from pathlib import Path -from typing import Any, List, TypeVar, Union +from typing import Any, TypeVar from numpy import ndarray from sklearn.utils.validation import _check_feature_names_in # type: ignore T = TypeVar("T", bound=Any) -ArrayLike = Union[ndarray, List[T]] -PathLike = Union[str, Path] - +ArrayLike = ndarray | list[T] +PathLike = str | Path -def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike: - if os.path.splitext(csv_filename)[1] == ".pkl": - return csv_filename - # Assume that the csv filename is of the form "foo.csv" - assert str(csv_filename).endswith(".csv") +_regexp_im = re.compile(r"\b(\d+\.\d+)im\b") +_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b") +_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b") - dirname = str(os.path.dirname(csv_filename)) - basename = str(os.path.basename(csv_filename)) - base = str(os.path.splitext(basename)[0]) - pkl_basename = base + ".pkl" +def _apply_regexp_im(x: str): + return _regexp_im.sub(r"\1j", x) - return os.path.join(dirname, pkl_basename) +def _apply_regexp_im_sci(x: str): + return _regexp_im_sci.sub(r"\1e\2j", x) -_regexp_im = re.compile(r"\b(\d+\.\d+)im\b") -_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b") -_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b") -_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x) -_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x) -_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x) +def _apply_regexp_sci(x: str): + return _regexp_sci.sub(r"\1e\2", x) def _preprocess_julia_floats(s: str) -> str: @@ -65,7 +58,7 @@ def _subscriptify(i: int) -> str: return "".join([chr(0x2080 + int(c)) for c in str(i)]) -def _suggest_keywords(cls, k: str) -> List[str]: +def _suggest_keywords(cls, k: str) -> list[str]: valid_keywords = [ param for param in inspect.signature(cls.__init__).parameters diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index aa92aaf13..000000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -sympy>=1.0.0,<2.0.0 -pandas>=0.21.0,<3.0.0 -numpy>=1.13.0,<3.0.0 -scikit_learn>=1.0.0,<2.0.0 -juliacall==0.9.23 -click>=7.0.0,<9.0.0 -setuptools>=50.0.0