From 31c3b1734d6dc13107b3ed804290bd41223ff407 Mon Sep 17 00:00:00 2001 From: julianlheureux Date: Fri, 29 Mar 2024 21:48:59 -0300 Subject: [PATCH 1/9] Update of the examples section of README.md file and homepage from index.qmd --- .gitignore | 1 + README.md | 49 ++++++++++++++++++++++++++++---------- docs/index.qmd | 64 ++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 7744b9e87..63b0ce404 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ docs/_build .vscode/ pytest.ini /.quarto/ +.Rproj.user diff --git a/README.md b/README.md index 90d07424f..878408fe8 100644 --- a/README.md +++ b/README.md @@ -32,45 +32,68 @@ Bambi requires working versions of ArviZ, formulae, NumPy, pandas and PyMC. Depe In the following two examples we assume the following basic setup ```python +import arviz as az import bambi as bmb import numpy as np import pandas as pd - -data = pd.DataFrame({ - "y": np.random.normal(size=50), - "g": np.random.choice(["Yes", "No"], size=50), - "x1": np.random.normal(size=50), - "x2": np.random.normal(size=50) -}) ``` ### Linear regression +A simple fixed effect model is shown in the example below. + ```python -model = bmb.Model("y ~ x1 + x2", data) -fitted = model.fit() +#### Read in a database from the package content +data = bmb.load_data("sleepstudy") + +# Initialize the fixed effect only model +model = bmb.Model('Reaction ~ Days', data) +print(model) # Get model description + +# Fit the model using 1000 on each chain +results = model.fit(draws=1000) + +# Key summary and diagnostic info on the model parameters +az.summary(results) + +# Use ArviZ to plot the results +az.plot_trace(results) ``` -In the first line we create and build a Bambi `Model`. The second line tells the sampler to start +First, we create and build a Bambi `Model`. 
Then, the function `model.fit` tells the sampler to start running and it returns an `InferenceData` object, which can be passed to several ArviZ functions such as `az.summary()` to get a summary of the parameters distribution and sample diagnostics or - `az.plot_trace()` to visualize them. - +`az.plot_trace()` to visualize them. ### Logistic regression +In this example we will use a simulated dataset created as shown below. + +```python +data = pd.DataFrame({ + "g": np.random.choice(["Yes", "No"], size=50), + "x1": np.random.normal(size=50), + "x2": np.random.normal(size=50) +}) +``` + Here we just add the `family` argument set to `"bernoulli"` to tell Bambi we are modelling a binary response. By default, it uses a logit link. We can also use some syntax sugar to specify which event we want to model. We just say `g['Yes']` and Bambi will understand we want to model the probability of a `"Yes"` response. But this notation is not mandatory. If we use `"g ~ x1 + x2"`, Bambi will pick one of the events to model and will inform us which one it picked. - ```python model = bmb.Model("g['Yes'] ~ x1 + x2", data, family="bernoulli") fitted = model.fit() ``` +After this, we can evaluate the model as before. + +### More + +There are many additional examples in our [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. + ## Documentation The Bambi documentation can be found in the [official docs](https://bambinos.github.io/bambi/index.html) diff --git a/docs/index.qmd b/docs/index.qmd index 1499cbba8..9eb76054c 100644 --- a/docs/index.qmd +++ b/docs/index.qmd @@ -58,31 +58,75 @@ If you use Conda, you can also install the latest release of Bambi with the foll conda install -c conda-forge bambi ``` -## Usage +## Examples of usage -A simple fixed effects model is shown in the example below. 
+## Example + +In the following two examples we assume the following basic setup ```python import arviz as az import bambi as bmb +import numpy as np import pandas as pd +``` + +### Linear regression -# Read in a tab-delimited file containing our data -data = pd.read_table('my_data.txt', sep='\t') +A simple fixed effect model is shown in the example below. -# Initialize the fixed effects only model -model = bmb.Model('DV ~ IV1 + IV2', data) +```python +#### Read in a database from the package content +data = bmb.load_data("sleepstudy") -# Fit the model using 1000 on each of 4 chains -results = model.fit(draws=1000, chains=4) +# Initialize the fixed effect only model +model = bmb.Model('Reaction ~ Days', data) +print(model) # Get model description -# Use ArviZ to plot the results -az.plot_trace(results) +# Fit the model using 1000 on each chain +results = model.fit(draws=1000) # Key summary and diagnostic info on the model parameters az.summary(results) + +# Use ArviZ to plot the results +az.plot_trace(results) +``` + +First, we create and build a Bambi `Model`. Then, the function `model.fit` tells the sampler to start +running and it returns an `InferenceData` object, which can be passed to several ArviZ functions +such as `az.summary()` to get a summary of the parameters distribution and sample diagnostics or +`az.plot_trace()` to visualize them. + +### Logistic regression + +In this example we will use a simulated dataset created as shown below. + +```python +data = pd.DataFrame({ + "g": np.random.choice(["Yes", "No"], size=50), + "x1": np.random.normal(size=50), + "x2": np.random.normal(size=50) +}) ``` +Here we just add the `family` argument set to `"bernoulli"` to tell Bambi we are modelling a binary +response. By default, it uses a logit link. We can also use some syntax sugar to specify which event +we want to model. We just say `g['Yes']` and Bambi will understand we want to model the probability +of a `"Yes"` response. But this notation is not mandatory. 
If we use `"g ~ x1 + x2"`, Bambi will +pick one of the events to model and will inform us which one it picked. + +```python +model = bmb.Model("g['Yes'] ~ x1 + x2", data, family="bernoulli") +fitted = model.fit() +``` + +After this, we can evaluate the model as before. + +### More + +There are many additional examples in our [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. + For a more in-depth introduction to Bambi see our [Quickstart](https://github.com/bambinos/bambi#quickstart) or our set of example notebooks. From c132acd8bb59fe0adddef98765bab733a3d435ef Mon Sep 17 00:00:00 2001 From: julianlheureux <163574230+julianlheureux@users.noreply.github.com> Date: Fri, 29 Mar 2024 21:57:09 -0300 Subject: [PATCH 2/9] Update index.qmd --- docs/index.qmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/index.qmd b/docs/index.qmd index 9eb76054c..780523c57 100644 --- a/docs/index.qmd +++ b/docs/index.qmd @@ -60,8 +60,6 @@ conda install -c conda-forge bambi ## Examples of usage -## Example - In the following two examples we assume the following basic setup ```python From f7e4a8fb76fe7d9acac1930b1d95f3bf3c39aa7c Mon Sep 17 00:00:00 2001 From: julianlheureux Date: Fri, 29 Mar 2024 22:46:33 -0300 Subject: [PATCH 3/9] Update index.qmd --- docs/index.qmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/index.qmd b/docs/index.qmd index 9eb76054c..780523c57 100644 --- a/docs/index.qmd +++ b/docs/index.qmd @@ -60,8 +60,6 @@ conda install -c conda-forge bambi ## Examples of usage -## Example - In the following two examples we assume the following basic setup ```python From 8087a736124f0a890577eda56ca536bf8cc95480 Mon Sep 17 00:00:00 2001 From: julianlheureux Date: Mon, 1 Apr 2024 13:03:52 -0300 Subject: [PATCH 4/9] Update README.md and index.qmd --- README.md | 51 ++++++++++++++++++++++++++++++++++++++--------- docs/index.qmd | 54 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 84 insertions(+), 21 deletions(-) diff --git 
a/README.md b/README.md index 878408fe8..772262c00 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Bambi is a high-level Bayesian model-building interface written in Python. It's ## Installation -Bambi requires a working Python interpreter (3.9+). We recommend installing Python and key numerical libraries using the [Anaconda Distribution](https://www.anaconda.com/products/individual#Downloads), which has one-click installers available on all major platforms. +Bambi requires a working Python interpreter (3.10+). We recommend installing Python and key numerical libraries using the [Anaconda Distribution](https://www.anaconda.com/products/individual#Downloads), which has one-click installers available on all major platforms. Assuming a standard Python environment is installed on your machine (including pip), Bambi itself can be installed in one line using pip: @@ -27,7 +27,7 @@ Alternatively, if you want the bleeding edge version of the package you can inst Bambi requires working versions of ArviZ, formulae, NumPy, pandas and PyMC. Dependencies are listed in `pyproject.toml` and should all be installed by the Bambi installer; no further action should be required. -## Example +## Examples In the following two examples we assume the following basic setup @@ -40,15 +40,20 @@ import pandas as pd ### Linear regression -A simple fixed effect model is shown in the example below. +A simple fixed effects model is shown in the example below. 
```python -#### Read in a database from the package content +# Read in a dataset from the package content data = bmb.load_data("sleepstudy") -# Initialize the fixed effect only model +# See first rows +data.head() + +# Initialize the fixed effects only model model = bmb.Model('Reaction ~ Days', data) -print(model) # Get model description + +# Get model description +print(model) # Fit the model using 1000 on each chain results = model.fit(draws=1000) @@ -59,8 +64,36 @@ az.summary(results) # Use ArviZ to plot the results az.plot_trace(results) ``` +``` + Reaction Days Subject +0 249.5600 0 308 +1 258.7047 1 308 +2 250.8006 2 308 +3 321.4398 3 308 +4 356.8519 4 308 +``` +``` + Formula: Reaction ~ Days + Family: gaussian + Link: mu = identity + Observations: 180 + Priors: + target = mu + Common-level effects + Intercept ~ Normal(mu: 298.5079, sigma: 261.0092) + Days ~ Normal(mu: 0.0, sigma: 48.8915) + + Auxiliary parameters + sigma ~ HalfStudentT(nu: 4.0, sigma: 56.1721) +``` +``` + mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat +Intercept 251.552 6.658 238.513 263.417 0.083 0.059 6491.0 2933.0 1.0 +Days 10.437 1.243 8.179 12.793 0.015 0.011 6674.0 3242.0 1.0 +Reaction_sigma 47.949 2.550 43.363 52.704 0.035 0.025 5614.0 2974.0 1.0 +``` -First, we create and build a Bambi `Model`. Then, the function `model.fit` tells the sampler to start +First, we create and build a Bambi `Model`. Then, the method `model.fit()` tells the sampler to start running and it returns an `InferenceData` object, which can be passed to several ArviZ functions such as `az.summary()` to get a summary of the parameters distribution and sample diagnostics or `az.plot_trace()` to visualize them. @@ -92,7 +125,7 @@ After this, we can evaluate the model as before. ### More -There are many additional examples in our [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. 
+For a more in-depth introduction to Bambi see our [Quickstart](https://github.com/bambinos/bambi#quickstart) and check the notebooks in the [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. ## Documentation @@ -102,7 +135,7 @@ The Bambi documentation can be found in the [official docs](https://bambinos.git If you use Bambi and want to cite it please use -``` +```bibtex @article{Capretto2022, title={Bambi: A Simple Interface for Fitting Bayesian Linear Models in Python}, volume={103}, diff --git a/docs/index.qmd b/docs/index.qmd index 780523c57..b0188beae 100644 --- a/docs/index.qmd +++ b/docs/index.qmd @@ -25,7 +25,7 @@ social sciences and other disciplines. ## Dependencies -Bambi is tested on Python 3.9+ and depends on ArviZ, formulae, NumPy, pandas and PyMC +Bambi is tested on Python 3.10+ and depends on ArviZ, formulae, NumPy, pandas and PyMC (see [pyproject.toml](https://github.com/bambinos/bambi/blob/main/pyproject.toml) for version information). @@ -58,7 +58,7 @@ If you use Conda, you can also install the latest release of Bambi with the foll conda install -c conda-forge bambi ``` -## Examples of usage +## Examples In the following two examples we assume the following basic setup @@ -71,15 +71,20 @@ import pandas as pd ### Linear regression -A simple fixed effect model is shown in the example below. +A simple fixed effects model is shown in the example below. 
```python -#### Read in a database from the package content +# Read in a dataset from the package content data = bmb.load_data("sleepstudy") -# Initialize the fixed effect only model +# See first rows +data.head() + +# Initialize the fixed effects only model model = bmb.Model('Reaction ~ Days', data) -print(model) # Get model description + +# Get model description +print(model) # Fit the model using 1000 on each chain results = model.fit(draws=1000) @@ -90,8 +95,36 @@ az.summary(results) # Use ArviZ to plot the results az.plot_trace(results) ``` +``` + Reaction Days Subject +0 249.5600 0 308 +1 258.7047 1 308 +2 250.8006 2 308 +3 321.4398 3 308 +4 356.8519 4 308 +``` +``` + Formula: Reaction ~ Days + Family: gaussian + Link: mu = identity + Observations: 180 + Priors: + target = mu + Common-level effects + Intercept ~ Normal(mu: 298.5079, sigma: 261.0092) + Days ~ Normal(mu: 0.0, sigma: 48.8915) + + Auxiliary parameters + sigma ~ HalfStudentT(nu: 4.0, sigma: 56.1721) +``` +``` + mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat +Intercept 251.552 6.658 238.513 263.417 0.083 0.059 6491.0 2933.0 1.0 +Days 10.437 1.243 8.179 12.793 0.015 0.011 6674.0 3242.0 1.0 +Reaction_sigma 47.949 2.550 43.363 52.704 0.035 0.025 5614.0 2974.0 1.0 +``` -First, we create and build a Bambi `Model`. Then, the function `model.fit` tells the sampler to start +First, we create and build a Bambi `Model`. Then, the method `model.fit()` tells the sampler to start running and it returns an `InferenceData` object, which can be passed to several ArviZ functions such as `az.summary()` to get a summary of the parameters distribution and sample diagnostics or `az.plot_trace()` to visualize them. @@ -123,16 +156,13 @@ After this, we can evaluate the model as before. ### More -There are many additional examples in our [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. 
- -For a more in-depth introduction to Bambi see our -[Quickstart](https://github.com/bambinos/bambi#quickstart) or our set of example notebooks. +For a more in-depth introduction to Bambi see our [Quickstart](https://github.com/bambinos/bambi#quickstart) and check the notebooks in the [Examples](https://bambinos.github.io/bambi/notebooks/) webpage. ## Citation If you use Bambi and want to cite it please use -```bib +```bibtex @article{ Capretto2022, title={Bambi: A Simple Interface for Fitting Bayesian Linear Models in Python}, From 06cbddcc164b91fc9ceaeeb97faf5ec3e75c8eb9 Mon Sep 17 00:00:00 2001 From: julianlheureux <“lheureuxjulian@gmail.com”> Date: Thu, 22 Aug 2024 21:03:34 -0300 Subject: [PATCH 5/9] Adding horseshoe prior and example --- bambi/backend/utils.py | 9 +- docs/notebooks/horseshoe_prior.ipynb | 648 +++++++++++++++++++++++++++ 2 files changed, 656 insertions(+), 1 deletion(-) create mode 100644 docs/notebooks/horseshoe_prior.ipynb diff --git a/bambi/backend/utils.py b/bambi/backend/utils.py index 6827099d9..e7d78535f 100644 --- a/bambi/backend/utils.py +++ b/bambi/backend/utils.py @@ -5,7 +5,14 @@ import pytensor.tensor as pt import pymc as pm -MAPPING = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical} +def Horseshoe(name, tau_nu = 3, lam_nu = 1, dims=None): + tau = pm.HalfStudentT(f"{name}_tau", nu=tau_nu) + lam = pm.HalfStudentT(f"{name}_lam", nu=lam_nu, dims=dims) + beta_raw = pm.Normal(f"{name}_raw", 0, 1, dims=dims) + beta = pm.Deterministic(name, beta_raw * tau ** 2 * lam ** 2, dims=dims) + return beta + +MAPPING = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical, "Horseshoe": Horseshoe} def get_distribution(dist): diff --git a/docs/notebooks/horseshoe_prior.ipynb b/docs/notebooks/horseshoe_prior.ipynb new file mode 100644 index 000000000..a5869e974 --- /dev/null +++ b/docs/notebooks/horseshoe_prior.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Horseshoe 
Prior\n", + "\n", + "In this example, we will use the Horseshoe Prior (Carvalho et al., 2009) to model a large number of variables, with only a few slopes being significantly different from zero. " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import arviz as az\n", + "import bambi as bmb\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pymc as pm\n", + "import pytensor.tensor as pt\n", + "\n", + "from matplotlib import pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are simulating 100 observations with the 50 explanatory variables, also simulated. " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "D = 50\n", + "D0 = 5\n", + "\n", + "SEED = 123456789 # for reproducibility\n", + "\n", + "rng = np.random.default_rng(SEED)\n", + "\n", + "INTERCEPT = rng.uniform(-3, 3) # simulate an intercept\n", + "\n", + "COEF = np.zeros(D)\n", + "# Simulate the slopes for significant variables\n", + "COEF[:D0] = rng.choice([-1, 1], size=D0) * rng.normal(5, 1, size=D0)\n", + "\n", + "N = 100\n", + "X = rng.normal(size=(N, D))\n", + "SIGMA = 1.\n", + "# Simulate the data\n", + "y = INTERCEPT + X.dot(COEF) + rng.normal(0, SIGMA, size=N)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we create the dataframe and the term name for the set of variables, to define the formula. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(X)\n", + "df.columns = [f\"x{i}\" for i in range(X.shape[1])]\n", + "df[\"y\"] = y" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'y ~ c(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49)'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "term_name = \"c(\" + \", \".join([f\"x{i}\" for i in range(X.shape[1])]) + \")\"\n", + "formula = f\"y ~ {term_name}\"\n", + "formula" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we call the Horseshoe prior and create the model" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "clusterpredictors_dim (50)\n", + "\n", + "predictors_dim (50)\n", + "\n", + "\n", + "cluster__obs__ (100)\n", + "\n", + "__obs__ (100)\n", + "\n", + "\n", + "\n", + "predictors_tau\n", + "\n", + "predictors_tau\n", + "~\n", + "HalfStudentT\n", + "\n", + "\n", + "\n", + "predictors\n", + "\n", + "predictors\n", + "~\n", + "Deterministic\n", + "\n", + "\n", + "\n", + "predictors_tau->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Intercept\n", + "\n", + "Intercept\n", + "~\n", + "Normal\n", + "\n", + "\n", + "\n", + "mu\n", + "\n", + "mu\n", + "~\n", + "Deterministic\n", + "\n", + "\n", + "\n", + "Intercept->mu\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "sigma\n", + "\n", + "sigma\n", + "~\n", + "HalfStudentT\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "~\n", + "Normal\n", + "\n", + "\n", 
+ "\n", + "sigma->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors->mu\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors_lam\n", + "\n", + "predictors_lam\n", + "~\n", + "HalfStudentT\n", + "\n", + "\n", + "\n", + "predictors_lam->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors_raw\n", + "\n", + "predictors_raw\n", + "~\n", + "Normal\n", + "\n", + "\n", + "\n", + "predictors_raw->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mu->y\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "priors = {\n", + " term_name: bmb.Prior(\"Horseshoe\"),\n", + "}\n", + "model = bmb.Model(formula, df, priors=priors)\n", + "model.set_alias({term_name: \"predictors\"})\n", + "\n", + "model.build()\n", + "model.graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Auto-assigning NUTS sampler...\n", + "Initializing NUTS using jitter+adapt_diag...\n", + "Multiprocess sampling (2 chains in 2 jobs)\n", + "NUTS: [sigma, Intercept, predictors_tau, predictors_lam, predictors_raw]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1939127c06ab47b69b9e5051e3500949", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 444 seconds.\n", + "There were 64 divergences after tuning. Increase `target_accept` or reparameterize.\n", + "Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.\n", + "Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.\n", + "We recommend running at least 4 chains for robust computation of convergence diagnostics\n" + ] + } + ], + "source": [ + "idata = model.fit(target_accept = 0.95, chains=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "clusterpredictors_dim (50)\n", + "\n", + "predictors_dim (50)\n", + "\n", + "\n", + "cluster__obs__ (100)\n", + "\n", + "__obs__ (100)\n", + "\n", + "\n", + "\n", + "predictors_tau\n", + "\n", + "predictors_tau\n", + "~\n", + "HalfStudentT\n", + "\n", + "\n", + "\n", + "predictors\n", + "\n", + "predictors\n", + "~\n", + "Deterministic\n", + "\n", + "\n", + "\n", + "predictors_tau->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Intercept\n", + "\n", + "Intercept\n", + "~\n", + "Normal\n", + "\n", + "\n", + "\n", + "mu\n", + "\n", + "mu\n", + "~\n", + "Deterministic\n", + "\n", + "\n", + "\n", + "Intercept->mu\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "sigma\n", + "\n", + "sigma\n", + "~\n", + "HalfStudentT\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "~\n", + "Normal\n", + "\n", + "\n", + "\n", + "sigma->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors->mu\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors_lam\n", + "\n", + "predictors_lam\n", + "~\n", + 
"HalfStudentT\n", + "\n", + "\n", + "\n", + "predictors_lam->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predictors_raw\n", + "\n", + "predictors_raw\n", + "~\n", + "Normal\n", + "\n", + "\n", + "\n", + "predictors_raw->predictors\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mu->y\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "priors = {\n", + " term_name: bmb.Prior(\"Horseshoe\", tau_nu = 3, lam_nu = 3),\n", + "}\n", + "model = bmb.Model(formula, df, priors=priors)\n", + "model.set_alias({term_name: \"predictors\"})\n", + "\n", + "model.build()\n", + "model.graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Auto-assigning NUTS sampler...\n", + "Initializing NUTS using jitter+adapt_diag...\n", + "Multiprocess sampling (2 chains in 2 jobs)\n", + "NUTS: [sigma, Intercept, predictors_tau, predictors_lam, predictors_raw]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b0b4f67884bd4607a15224df17c185f8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 293 seconds.\n", + "There were 29 divergences after tuning. Increase `target_accept` or reparameterize.\n", + "We recommend running at least 4 chains for robust computation of convergence diagnostics\n" + ] + } + ], + "source": [ + "idata = model.fit(target_accept = 0.97, chains=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax, = az.plot_forest(\n", + " idata, \n", + " var_names=[\"predictors\"], \n", + " coords={\"predictors_dim\": range(D0)},\n", + " kind='ridgeplot',\n", + " ridgeplot_truncate=False, \n", + " ridgeplot_alpha=0.5,\n", + " hdi_prob=0.95, \n", + " combined=True,\n", + " figsize=(8, 6)\n", + ")\n", + "ax.scatter(COEF[:D0][::-1], ax.get_yticks(), c='C1', label=\"Actual value\");\n", + "ax.set_xlabel(r\"$\\beta_i$\");\n", + "ax.set_ylim(bottom=None, top=1.55 * ax.get_yticks().max())\n", + "ax.set_yticklabels(range(D0)[::-1]);\n", + "ax.set_ylabel(r\"$i$\");\n", + "ax.legend(loc='upper center');\n", + "ax.set_title(\"Posterior distribution of nonzero coefficients\");" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last updated: Thu Aug 22 2024\n", + "\n", + "Python implementation: CPython\n", + "Python version : 3.11.9\n", + "IPython version : 8.24.0\n", + "\n", + "pandas : 2.2.2\n", + "numpy : 1.26.4\n", + "bambi : 0.14.1.dev12+g64e57423.d20240730\n", + "arviz : 0.18.0\n", + "matplotlib: 3.8.4\n", + "pymc : 5.16.1\n", + "pytensor : 2.23.0\n", + "\n", + "Watermark: 2.4.3\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -n -u -v -iv -w" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bambi-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b5c8d4f46550f0fb234047a6c40835948e887a45 Mon Sep 17 00:00:00 2001 From: julianlheureux <“lheureuxjulian@gmail.com”> Date: Thu, 22 Aug 2024 21:32:18 -0300 Subject: [PATCH 6/9] Adding 
horseshoe prior and example2 --- bambi/backend/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bambi/backend/utils.py b/bambi/backend/utils.py index e7d78535f..94915e067 100644 --- a/bambi/backend/utils.py +++ b/bambi/backend/utils.py @@ -5,14 +5,16 @@ import pytensor.tensor as pt import pymc as pm -def Horseshoe(name, tau_nu = 3, lam_nu = 1, dims=None): + +def horseshoe(name, tau_nu=3, lam_nu=1, dims=None): tau = pm.HalfStudentT(f"{name}_tau", nu=tau_nu) - lam = pm.HalfStudentT(f"{name}_lam", nu=lam_nu, dims=dims) + lam = pm.HalfStudentT(f"{name}_lam", nu=lam_nu, dims=dims) beta_raw = pm.Normal(f"{name}_raw", 0, 1, dims=dims) - beta = pm.Deterministic(name, beta_raw * tau ** 2 * lam ** 2, dims=dims) + beta = pm.Deterministic(name, beta_raw * tau * lam, dims=dims) return beta -MAPPING = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical, "Horseshoe": Horseshoe} + +MAPPING = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical, "Horseshoe": horseshoe} def get_distribution(dist): From 25678d4600a7420a05f5580c9f8d9b988c2fa822 Mon Sep 17 00:00:00 2001 From: julianlheureux <“lheureuxjulian@gmail.com”> Date: Fri, 23 Aug 2024 00:16:13 -0300 Subject: [PATCH 7/9] Update horseshoe notebook --- docs/notebooks/horseshoe_prior.ipynb | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/notebooks/horseshoe_prior.ipynb b/docs/notebooks/horseshoe_prior.ipynb index a5869e974..3d5e39d3a 100644 --- a/docs/notebooks/horseshoe_prior.ipynb +++ b/docs/notebooks/horseshoe_prior.ipynb @@ -29,7 +29,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We are simulating 100 observations with the 50 explanatory variables, also simulated. 
" + "Here is what we did:\n", + "\n", + "* We defined an intercept.\n", + "* We defined a vector of 50 betas, 5 of which were drawn from a normal(5,1) distribution, and then assigned a random sign.\n", + "* We created the design matrix with normal(0,1) entries and set $\\sigma$ to 1.\n", + "* We calculated the deterministic means $\\mu$ using the intercept and the design matrix multiplied by the betas.\n", + "* We simulated 100 response variables (observations) from a normal distribution with mean $\\mu$ and standard deviation $\\sigma$.\n", + "\n", + "Next, we proceeded with the Bayesian estimation of the model. We proposed the horseshoe prior, for which the following parameters were calculated:\n", + "\n", + "$$\\mu_i = \\alpha + \\beta_1 x_{1i} + \\beta_2 x_{2i} + ... + \\beta_p x_{pi}$$\n", + "\n", + "$$y_i \\sim N(\\mu_i, \\sigma^2)$$\n", + "\n", + "$$\\alpha \\sim N(0,1)$$\n", + "\n", + "$$\\beta_j \\sim N(0,\\lambda_j^2 \\tau^2)$$\n", + "\n", + "$$\\lambda_j \\sim C^+(0,1)$$\n", + "\n", + "$$\\tau \\sim T^+(df=3)$$\n", + "\n", + "$$\\sigma^2 \\sim N^+(0,1)$$" ] }, { @@ -62,7 +84,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then we create the dataframe and the term name for the set of variables, to define the formula. " + "Here we create the dataframe and the term name for the set of variables, to define the formula. " ] }, { From 025ae0ee907b4fdf6da6502766e328046fb1fcaa Mon Sep 17 00:00:00 2001 From: julianlheureux <“lheureuxjulian@gmail.com”> Date: Wed, 28 Aug 2024 15:00:47 -0300 Subject: [PATCH 8/9] Add docstring --- bambi/backend/utils.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/bambi/backend/utils.py b/bambi/backend/utils.py index 94915e067..625afad3f 100644 --- a/bambi/backend/utils.py +++ b/bambi/backend/utils.py @@ -7,6 +7,26 @@ def horseshoe(name, tau_nu=3, lam_nu=1, dims=None): + """Simulate a beta coefficient value with a horseshoe prior. 
+ This is an internal function which is not supposed to be used by users. + This will be used only when a horseshoe prior is called for beta coefficients. + + Parameters + ---------- + name: str + Name of the parameter as registered in the PyMC model + tau_nu: int, float + Degrees of freedom of tau. Default: 3 + lam_nu: int, float + Degrees of freedom of lam. Default: 1 (equivalent to a HalfCauchy) + dims: str + Dimensions passed to PyMC. Default: None + + Returns + ------- + pytensor.tensor.TensorVariable + Deterministic PyMC variable holding the beta coefficients. + """ tau = pm.HalfStudentT(f"{name}_tau", nu=tau_nu) lam = pm.HalfStudentT(f"{name}_lam", nu=lam_nu, dims=dims) beta_raw = pm.Normal(f"{name}_raw", 0, 1, dims=dims) From 9d547dedb29fc276de64085ec7d63c86d64d272b Mon Sep 17 00:00:00 2001 From: julianlheureux <“lheureuxjulian@gmail.com”> Date: Wed, 28 Aug 2024 15:05:16 -0300 Subject: [PATCH 9/9] Add docstring2 --- bambi/backend/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bambi/backend/utils.py b/bambi/backend/utils.py index 625afad3f..50f772a00 100644 --- a/bambi/backend/utils.py +++ b/bambi/backend/utils.py @@ -7,7 +7,7 @@ def horseshoe(name, tau_nu=3, lam_nu=1, dims=None): - """Simulate a beta coefficient value with a horseshoe prior. + """Simulate a beta coefficient value with a horseshoe prior. This is an internal function which is not supposed to be used by users. This will be used only when a horseshoe prior is called for beta coefficients. @@ -25,7 +25,7 @@ def horseshoe(name, tau_nu=3, lam_nu=1, dims=None): Returns ------- pytensor.tensor.TensorVariable - Deterministic PyMC variable holding the beta coefficients. + Deterministic PyMC variable holding the beta coefficients. """ tau = pm.HalfStudentT(f"{name}_tau", nu=tau_nu) lam = pm.HalfStudentT(f"{name}_lam", nu=lam_nu, dims=dims)