From 068aaf2effce775437735046ea9ea2ebd840f910 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Fri, 28 Jun 2024 19:43:10 +0800 Subject: [PATCH 01/12] Added Inverse Gamma distribution --- skpro/distributions/inversegamma.py | 79 +++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 skpro/distributions/inversegamma.py diff --git a/skpro/distributions/inversegamma.py b/skpro/distributions/inversegamma.py new file mode 100644 index 000000000..daa3f9461 --- /dev/null +++ b/skpro/distributions/inversegamma.py @@ -0,0 +1,79 @@ +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +"""Inverse Gamma probability distribution.""" + +__author__ = ["meraldoantonio"] + +import pandas as pd +from scipy.stats import invgamma, rv_continuous + +from skpro.distributions.adapters.scipy import _ScipyAdapter + + +class InverseGamma(_ScipyAdapter): + r"""Inverse Gamma Distribution. + + Most methods wrap ``scipy.stats.invgamma``. + + The Inverse Gamma Distribution is parameterized by shape :math:`\alpha` and + scale :math:`\beta`, such that the pdf is + + .. math:: f(x) = \frac{\beta^{\alpha} x^{-\alpha-1} \exp\left(-\frac{\beta}{x}\right)}{\tau(\alpha)} + + where :math:`\tau(\alpha)` is the Gamma function. + For all positive integers, :math:`\tau(\alpha) = (\alpha-1)!`. + + Parameters + ---------- + alpha : float or array of float (1D or 2D) + It represents the shape parameter. + beta : float or array of float (1D or 2D) + It represents the scale parameter. 
+ index : pd.Index, optional, default = RangeIndex + columns : pd.Index, optional, default = RangeIndex + + Example + ------- + >>> from skpro.distributions.invgamma import InverseGamma + + >>> d = InverseGamma(beta=[[1, 1], [2, 3], [4, 5]], alpha=2) + """ # noqa: E501 + + _tags = { + "capabilities:approx": ["energy", "pdfnorm"], + "capabilities:exact": ["mean", "var", "pdf", "log_pdf", "cdf", "ppf"], + "distr:measuretype": "continuous", + "distr:paramtype": "parametric", + "broadcast_init": "on", + } + + def __init__(self, alpha, beta, index=None, columns=None): + self.alpha = alpha + self.beta = beta + + super().__init__(index=index, columns=columns) + + def _get_scipy_object(self) -> rv_continuous: + return invgamma + + def _get_scipy_param(self): + alpha = self._bc_params["alpha"] + beta = self._bc_params["beta"] + scale = beta + + return [], {"a": alpha, "scale": scale} + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + # array case examples + params1 = {"alpha": [6, 2.5], "beta": [[1, 1], [2, 3], [4, 5]]} + params2 = { + "alpha": 2, + "beta": 3, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a", "b"]), + } + # scalar case examples + params3 = {"alpha": 1.5, "beta": 2.1} + + return [params1, params2, params3] From bf9e49bb47fa72f1294b3346c2fd22cc15a69ae3 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Fri, 28 Jun 2024 20:32:40 +0800 Subject: [PATCH 02/12] Added binomial distributions --- skpro/distributions/binomial.py | 73 +++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 skpro/distributions/binomial.py diff --git a/skpro/distributions/binomial.py b/skpro/distributions/binomial.py new file mode 100644 index 000000000..f3d57f0a1 --- /dev/null +++ b/skpro/distributions/binomial.py @@ -0,0 +1,73 @@ +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +"""Binomial probability distribution.""" + +__author__ = 
["meraldoantonio"] + +import pandas as pd +from scipy.stats import binom, rv_discrete + +from skpro.distributions.adapters.scipy import _ScipyAdapter + + +class Binomial(_ScipyAdapter): + r"""Binomial distribution. + + Most methods wrap ``scipy.stats.binom``. + The Binomial distribution is parameterized by the number of trials :math:`n` + and the probability of success :math:`p`, + such that the probability mass function (PMF) is given by: + + .. math:: P(X = k) = \binom{n}{k} p^k (1-p)^{n-k} + + Parameters + ---------- + n : int or array of int (1D or 2D), must be non-negative + p : float or array of float (1D or 2D), must be in [0, 1] + index : pd.Index, optional, default = RangeIndex + columns : pd.Index, optional, default = RangeIndex + + Example + ------- + >>> from skpro.distributions.binomial import Binomial + + >>> d = Binomial(n=[[10, 10], [20, 30], [40, 50]], p=0.5) + """ + + _tags = { + "capabilities:approx": ["pmf"], + "capabilities:exact": ["mean", "var", "pmf", "log_pmf", "cdf", "ppf"], + "distr:measuretype": "discrete", + "distr:paramtype": "parametric", + "broadcast_init": "on", + } + + def __init__(self, n, p, index=None, columns=None): + self.n = n + self.p = p + + super().__init__(index=index, columns=columns) + + def _get_scipy_object(self) -> rv_discrete: + return binom + + def _get_scipy_param(self): + n = self._bc_params["n"] + p = self._bc_params["p"] + + return [], {"n": n, "p": p} + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + # array case examples + params1 = {"n": [[10, 10], [20, 30], [40, 50]], "p": 0.5} + params2 = { + "n": 10, + "p": 0.5, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a", "b"]), + } + # scalar case examples + params3 = {"n": 15, "p": 0.7} + + return [params1, params2, params3] From 5a53e984ba1640eb8b2f6f6bb9b82249324a8093 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Fri, 28 Jun 2024 20:43:40 +0800 Subject: [PATCH 
03/12] Added the new distributions in __init__.py --- skpro/distributions/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index 2f5e321ed..e82cdbb7b 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -6,6 +6,7 @@ __all__ = [ "Alpha", "Beta", + "Binomial", "ChiSquared", "Delta", "Empirical", @@ -14,6 +15,7 @@ "Gamma", "HalfNormal", "IID", + "InverseGamma", "Laplace", "Logistic", "LogNormal", @@ -32,6 +34,7 @@ from skpro.distributions.alpha import Alpha from skpro.distributions.beta import Beta +from skpro.distributions.binomial import Binomial from skpro.distributions.chi_squared import ChiSquared from skpro.distributions.compose import IID from skpro.distributions.delta import Delta @@ -40,6 +43,7 @@ from skpro.distributions.fisk import Fisk from skpro.distributions.gamma import Gamma from skpro.distributions.halfnormal import HalfNormal +from skpro.distributions.inversegamma import InverseGamma from skpro.distributions.laplace import Laplace from skpro.distributions.logistic import Logistic from skpro.distributions.lognormal import LogNormal From cebe82fee8bffec91c71595e2bb22b851e6d2c81 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Sat, 29 Jun 2024 12:01:23 +0800 Subject: [PATCH 04/12] Added Binomial and InverseGamma --- docs/source/api_reference/distributions.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/api_reference/distributions.rst b/docs/source/api_reference/distributions.rst index 2068916d4..563ebcbfa 100644 --- a/docs/source/api_reference/distributions.rst +++ b/docs/source/api_reference/distributions.rst @@ -59,6 +59,7 @@ Continuous support - non-negative reals HalfCauchy HalfLogistic HalfNormal + InverseGamma LogLaplace Pareto Weibull @@ -73,6 +74,7 @@ Integer support :toctree: auto_generated/ :template: class.rst + Binomial Poisson From 7a9c78563cbe64ae8de3413c84ef5ae208fe79a9 Mon Sep 17 00:00:00 2001 From: 
Meraldo Antonio Date: Sun, 30 Jun 2024 13:17:51 +0800 Subject: [PATCH 05/12] Typo in docstring --- skpro/distributions/inversegamma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/inversegamma.py b/skpro/distributions/inversegamma.py index daa3f9461..b5a6458c9 100644 --- a/skpro/distributions/inversegamma.py +++ b/skpro/distributions/inversegamma.py @@ -33,7 +33,7 @@ class InverseGamma(_ScipyAdapter): Example ------- - >>> from skpro.distributions.invgamma import InverseGamma + >>> from skpro.distributions.inversegamma import InverseGamma >>> d = InverseGamma(beta=[[1, 1], [2, 3], [4, 5]], alpha=2) """ # noqa: E501 From e223885a45b16311d988e95be671e488cc8f08e4 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Mon, 1 Jul 2024 23:31:37 +0800 Subject: [PATCH 06/12] Added BayesianProportionEstimator class --- skpro/regression/bayesian_proportion.py | 181 ++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 skpro/regression/bayesian_proportion.py diff --git a/skpro/regression/bayesian_proportion.py b/skpro/regression/bayesian_proportion.py new file mode 100644 index 000000000..a69ce9b6a --- /dev/null +++ b/skpro/regression/bayesian_proportion.py @@ -0,0 +1,181 @@ +"""Bayesian proportion estimator for probabilistic regression.""" +# todo: add an appropriate copyright notice for your estimator +# estimators contributed to skpro should have the copyright notice at the top +# estimators of your own do not need to have permissive or BSD-3 copyright + + +__author__ = ["meraldoantonio"] + +from skpro.distributions import Beta +from skpro.regression.base import BaseProbaRegressor + + +class BayesianProportionEstimator(BaseProbaRegressor): + """Bayesian probabilistic estimator for proportions. + + This estimator uses a Beta prior and Beta posterior with a Binomial likelihood + for Bayesian inference of proportions. 
It provides methods for updating the + posterior, making predictions, and various utilities for analysis and visualization. + + Parameters + ---------- + prior_alpha : float, optional (default=1) + Alpha parameter of the Beta prior distribution. + prior_beta : float, optional (default=1) + Beta parameter of the Beta prior distribution. + """ + + _tags = { + # packaging info + # -------------- + "authors": ["meraldoantonio"], + "python_dependencies": ["scipy", "matplotlib"], + "capability:multioutput": False, + "capability:missing": True, + # estimator tags + # -------------- + "capability:multioutput": False, # can the estimator handle multi-output data? + "capability:missing": True, # can the estimator handle missing data? + "X_inner_mtype": "pd_DataFrame_Table", # type seen in internal _fit, _predict + "y_inner_mtype": "pd_Series_Table", # type seen in internal _fit + } + + def __init__(self, prior_alpha=None, prior_beta=None, prior=None): + """Initialize the Bayesian inference class with priors. + + Parameters + ---------- + prior_alpha : float, optional + The alpha parameter for the Beta prior distribution. Default is None. + prior_beta : float, optional + The beta parameter for the Beta prior distribution. Default is None. + prior : Beta, optional + An existing Beta distribution prior. Default is None. + + Raises + ------ + ValueError + If neither (prior_alpha and prior_beta) nor prior are provided. + TypeError + If the provided prior is not an instance of Beta. + """ + if prior is None: + if prior_alpha is None or prior_beta is None: + raise ValueError( + "Must provide either (prior_alpha and prior_beta) or prior." 
+ ) + self.prior_alpha = prior_alpha + self.prior_beta = prior_beta + self.prior = Beta(alpha=prior_alpha, beta=prior_beta) + else: + if not isinstance(prior, Beta): + raise TypeError("Prior must be an instance of Beta.") + self.prior = prior + self.prior_alpha = prior.alpha + self.prior_beta = prior.beta + + super().__init__() + + def _fit(self, X, y): + """Fit regressor to training data. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to; + will be ignored + y : pandas Series, must be same length as X + labels to fit regressor to + + Returns + ------- + self : reference to self + """ + assert y.apply( + lambda x: isinstance(x, bool) or x in [0, 1] + ).all(), "Values in y must be boolean or convertible to boolean (0 or 1)" + self._posterior = self._perform_bayesian_inference(self.prior, X, y) + return self + + def _predict_proba(self, X=None): + """Predict distribution over labels for data from features. + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for; + will be ignored + + Returns + ------- + y_pred : skpro BaseDistribution, same length as `X` + labels predicted for `X` + """ + y_pred = Beta(alpha=self._posterior.alpha, beta=self._posterior.beta) + return y_pred + + def _perform_bayesian_inference(self, prior, X, y): + """Perform Bayesian inference using a conjugate prior (Beta distribution). + + This method calculates the posterior Beta distribution parameters + given observed binary outcomes. + + Parameters + ---------- + prior : Beta + The prior Beta distribution from skpro distributions. + X : pandas DataFrame + Feature data corresponding to the observed outcomes `y`. + y : array-like, must be binary (0 or 1) + Observed binary outcomes. + + Returns + ------- + posterior : Beta + The posterior Beta distribution with updated parameters. 
+ """ + n = len(y) + successes = y.sum() + posterior_alpha = prior.alpha + successes + posterior_beta = prior.beta + n - successes + return Beta(alpha=posterior_alpha, beta=posterior_beta) + + def update(self, X, y): + """Update the posterior with new data. + + Parameters + ---------- + X : pandas DataFrame + New feature instances + y : pandas Series + New labels + + Returns + ------- + self : reference to self + """ + self._posterior = self._perform_bayesian_inference(self._posterior, X, y) + return self + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + """ + params1 = {"prior_alpha": 1, "prior_beta": 1} + params2 = {"prior_alpha": 2, "prior_beta": 2} + + return [params1, params2] From 75031225f9665eba08e810a9d70593e49a03212a Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Mon, 1 Jul 2024 23:40:24 +0800 Subject: [PATCH 07/12] Added example notebook --- examples/04_bayesian_conjugate.ipynb | 532 +++++++++++++++++++++++++++ 1 file changed, 532 insertions(+) create mode 100644 examples/04_bayesian_conjugate.ipynb diff --git a/examples/04_bayesian_conjugate.ipynb b/examples/04_bayesian_conjugate.ipynb new file mode 100644 index 000000000..8574fcebe --- /dev/null +++ b/examples/04_bayesian_conjugate.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bayesian Conjugate" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from skpro.distributions import Beta\n", + "from 
skpro.regression.bayesian_proportion import BayesianProportionEstimator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Conjugate Bayesian analysis is a statistical method where the posterior distribution belongs to the same family as the prior distribution. \n", + "\n", + "In this notebook, we will use the `skpro` estimator `BayesianProportionEstimator` to demonstrate the application of conjugate priors to the estimation of a proportion." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on a small sample of coin tosses (introduced in the next section), we aim to estimate the proportion $p$ of heads. However, due to the small sample size, we might be uncertain about our estimate. This is where Bayesian analysis, and specifically the use of conjugate priors, becomes valuable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Motivation for Estimating Proportions: The Coin Toss Example\n", + "\n", + "Estimating proportions is a fundamental problem in statistics, often arising in situations where we want to understand the likelihood of a specific outcome. A classic example is the coin toss.\n", + "\n", + "Imagine we have a coin, and we want to determine if it is fair. A fair coin has an equal probability (0.5) of landing heads or tails. To estimate this probability, we can perform a series of coin tosses and observe the outcomes.\n", + "\n", + "Suppose we toss the coin 10 times and observe the following results:\n", + "\n", + "- Heads: 8 times\n", + "- Tails: 2 times\n", + "\n", + "This coin toss sample is represented in the pandas Series `y` shown below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 0\n", + "8 0\n", + "9 1\n", + "Name: head, dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the number of ones and zeros\n", + "ones = [1] * 8\n", + "zeros = [0] * 2\n", + "\n", + "# Combine them into a single list\n", + "data = ones + zeros\n", + "\n", + "# Shuffle the list to randomize the order\n", + "np.random.shuffle(data)\n", + "\n", + "# Create a Series from the shuffled list\n", + "y = pd.Series(data, name=\"head\")\n", + "\n", + "y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on this data, we aim to estimate the proportion `p` of heads. However, due to the small sample size, we might be uncertain about our estimate. This is where Bayesian analysis becomes valuable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Prior\n", + "\n", + "A Bayesian analysis begins with a **prior belief** about the parameter of interest, which in this case is the proportion `p`. This prior belief is represented by a **prior _distribution_**, which encapsulates our initial assumptions about `p` before observing any data. The choice of distribution can be based on previous knowledge, expert opinion, or even non-informative (if we have no prior knowledge).\n", + "\n", + "A **conjugate prior** is a specific type of prior distribution that, when combined with the likelihood of the observed data, yields a posterior distribution that belongs to the same family as the prior distribution. 
This property simplifies the process of updating our beliefs with new data, as the mathematical form of the distribution remains consistent.\n", + "\n", + "The **Beta distribution** is often chosen as the prior distribution because it is a conjugate prior to the Binomial likelihood, which describes the probability of observing a given number of heads in a series of coin tosses.\n", + "\n", + "\n", + "The distribution, denoted as $\\text{Beta}(\\alpha, \\beta)$, is parameterized by two positive parameters, $\\alpha$ and $\\beta$. These parameters reflect our prior beliefs about the proportion $p$. For example:\n", + "\n", + "- If $\\alpha = 1$ and $\\beta = 1$, we have a uniform prior, which indicates that all values of $p$ between 0 and 1 are equally likely.\n", + "- If $\\alpha > 1$ and $\\beta > 1$, the prior might be more peaked around a particular value, indicating a stronger prior belief about $p$.\n", + "\n", + "Suppose in our case that we have some conviction that the coin is fair, meaning we believe the proportion $p$ is centered around 0.5. To reflect this belief, we will choose a Beta distribution with parameters $\\alpha = 2$ and $\\beta = 2$. This distribution is symmetric and centered around 0.5, indicating our prior belief that the coin is likely fair but allowing for some uncertainty.\n", + "\n", + "To implement this in our analysis, we will instantiate a new `BayesianProportionEstimator` object, `B`. This object encapsulates our prior belief on `p` and provides the framework for updating this belief based on observed data. 
\n", + "\n", + "Here's how we can set this up in code:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "B = BayesianProportionEstimator(prior=Beta(2, 2))\n", + "# alternative instantiation:\n", + "B = BayesianProportionEstimator(prior_alpha=2, prior_beta=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the plot function to visualize the probability density function (PDF) of our prior distribution. This visualization helps us understand our initial belief about `p`. By plotting the PDF of the $Beta(2, 2)$ distribution, we can see how our belief is symmetrically distributed around the value 0.5, reflecting our prior conviction that the coin is fair with some allowance for variability." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "B.prior.plot(\"pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Likelihood and Posterior\n", + "\n", + "The likelihood function represents the probability of observing the data given the parameter $p$. For coin tosses, this is described by the Binomial distribution. If we observe $k$ heads out of $n$ tosses, the likelihood is given by:\n", + "\n", + "$$\\text{Binomial}(k | n, p) = \\binom{n}{k} p^k (1 - p)^{n - k}$$\n", + "\n", + "Using Bayes' theorem, we combine the prior distribution with the likelihood of the observed data to obtain the posterior distribution, which represents our updated beliefs about $p$ after observing the data. \n", + "\n", + "\n", + "Because the Beta distribution is a conjugate prior to the Binomial likelihood, the posterior distribution is also a Beta distribution with updated parameters. That is, if the prior is $\\text{Beta}(\\alpha, \\beta)$ and we observe $k$ heads in $n$ tosses, the posterior distribution is:\n", + "\n", + "$$\\text{Beta}(\\alpha + k, \\beta + n - k)$$\n", + "\n", + "This means that we simply add the number of observed heads to $\\alpha$ and the number of observed tails to $\\beta $ to get the parameters of the posterior distribution." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the `BayesianProportionEstimator`, the process of updating our prior belief with observed data is straightforward and can be accomplished in a single line of code. We simply use the fit method on our data.\n", + "\n", + "However, to maintain compatibility with the estimator's API, we need to supply an additional argument, `X`, which is expected to be a DataFrame or Series of the same length as `y`. This X parameter will be ignored during the fitting process, so for simplicity, we can simply set `X` to be the same as `y`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BayesianProportionEstimator(prior=Beta(alpha=2, beta=2), prior_alpha=2,\n",
+       "                            prior_beta=2)
Please rerun this cell to show the HTML repr or trust the notebook.
" + ], + "text/plain": [ + "BayesianProportionEstimator(prior=Beta(alpha=2, beta=2), prior_alpha=2,\n", + " prior_beta=2)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = y.copy()\n", + "B.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, after fitting the BayesianProportionEstimator to our observed data, we obtain a posterior distribution represented by the Beta distribution with updated parameters $\\alpha=10$ and $\\beta=4$:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Beta(alpha=10, beta=4)
Please rerun this cell to show the HTML repr or trust the notebook.
" + ], + "text/plain": [ + "Beta(alpha=10, beta=4)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "B._posterior" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A plot of this posterior shows that its mass is concentrated toward the right of the unit interval (strictly speaking, the distribution is left-skewed, with its longer tail toward 0) and that its mean is around 0.71, reflecting our updated belief that the coin is more likely to land on heads than tails. This asymmetry arises from the combination of our prior belief and the observed data, which had a higher number of heads compared to tails." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7142857142857143" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "B._posterior.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "B._posterior.plot(\"pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Updating\n", + "\n", + "A significant advantage of the Bayesian approach is its ability to incorporate new data and continuously update our posterior distribution. Suppose we conduct 100 additional coin tosses and observe that, unexpectedly, 80 of these new tosses result in tails:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + " ..\n", + "95 0\n", + "96 0\n", + "97 1\n", + "98 0\n", + "99 0\n", + "Name: head, Length: 100, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the number of ones and zeros\n", + "ones = [1] * 20\n", + "zeros = [0] * 80\n", + "\n", + "# Combine them into a single list\n", + "data = ones + zeros\n", + "\n", + "# Shuffle the list to randomize the order\n", + "np.random.shuffle(data)\n", + "\n", + "# Create a Series from the shuffled list\n", + "y2 = pd.Series(data, name=\"head\")\n", + "\n", + "y2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This new data can be integrated into our existing model to further refine our belief about the coin's fairness. We do so using the `update` method." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BayesianProportionEstimator(prior=Beta(alpha=2, beta=2), prior_alpha=2,\n",
+       "                            prior_beta=2)
Please rerun this cell to show the HTML repr or trust the notebook.
" + ], + "text/plain": [ + "BayesianProportionEstimator(prior=Beta(alpha=2, beta=2), prior_alpha=2,\n", + " prior_beta=2)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = y2\n", + "B.update(X, y2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that after this update, the posterior becomes $\\text{Beta}(\\alpha=30, \\beta=84)$, whose mass is concentrated toward the left of the unit interval (strictly speaking, a slightly right-skewed distribution, with its longer tail toward 1) and whose mean is 0.263." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Beta(alpha=30, beta=84)
Please rerun this cell to show the HTML repr or trust the notebook.
" + ], + "text/plain": [ + "Beta(alpha=30, beta=84)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "B._posterior" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2631578947368421" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "B._posterior.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "B._posterior.plot(\"pdf\")" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "pymc_env", + "language": "python", + "name": "pymc_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "3e631b8a076cc106144e9b132b7d31cae2f1e2660b47e5f9fcb0397caae5fbd5" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b55ffd3fce3552d90aa72bbb78a5dda300683e4f Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Mon, 1 Jul 2024 23:52:49 +0800 Subject: [PATCH 08/12] Added Meraldo --- .all-contributorsrc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 8d6f60fa8..e05333be5 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -184,6 +184,16 @@ "contributions": [ "doc" ] + }, + { + "login": "meraldoantonio", + "name": "Meraldo Antonio", + "avatar_url": "https://avatars.githubusercontent.com/u/37468543?v=4", + "profile": "https://github.com/meraldoantonio", + "contributions": [ + "code", + "example" + ] } ] } From 921d2c9b9e31d183383bd9d029d016503ff2f889 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Mon, 1 Jul 2024 23:59:32 +0800 Subject: [PATCH 09/12] Added example --- skpro/regression/bayesian_proportion.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/skpro/regression/bayesian_proportion.py 
b/skpro/regression/bayesian_proportion.py index a69ce9b6a..252ff4461 100644 --- a/skpro/regression/bayesian_proportion.py +++ b/skpro/regression/bayesian_proportion.py @@ -1,4 +1,5 @@ """Bayesian proportion estimator for probabilistic regression.""" + # todo: add an appropriate copyright notice for your estimator # estimators contributed to skpro should have the copyright notice at the top # estimators of your own do not need to have permissive or BSD-3 copyright @@ -17,12 +18,6 @@ class BayesianProportionEstimator(BaseProbaRegressor): for Bayesian inference of proportions. It provides methods for updating the posterior, making predictions, and various utilities for analysis and visualization. - Parameters - ---------- - prior_alpha : float, optional (default=1) - Alpha parameter of the Beta prior distribution. - prior_beta : float, optional (default=1) - Beta parameter of the Beta prior distribution. """ _tags = { @@ -58,6 +53,21 @@ def __init__(self, prior_alpha=None, prior_beta=None, prior=None): If neither (prior_alpha and prior_beta) nor prior are provided. TypeError If the provided prior is not an instance of Beta. 
+ + Examples + -------- + >>> from skpro.regression.bayesian_proportion import BayesianProportionEstimator + >>> from skpro.distributions import Beta + >>> import pandas as pd + >>> import numpy as np + + >>> B = BayesianProportionEstimator(prior=Beta(1, 2)) + >>> # Create a DataFrame X with all NaN values + >>> X = pd.DataFrame(np.nan, index=range(5), columns=['to_be_ignored']) + >>> # Create a Series y with random 1s and 0s + >>> y = pd.Series(np.random.randint(0, 2, size=5), name='win') + >>> B.fit(X, y) + """ if prior is None: if prior_alpha is None or prior_beta is None: From f2388a56218978c136814b3b3503e3228d1050ac Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Tue, 2 Jul 2024 10:39:39 +0800 Subject: [PATCH 10/12] Removed kernel --- examples/04_bayesian_conjugate.ipynb | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/04_bayesian_conjugate.ipynb b/examples/04_bayesian_conjugate.ipynb index 8574fcebe..3e21a9574 100644 --- a/examples/04_bayesian_conjugate.ipynb +++ b/examples/04_bayesian_conjugate.ipynb @@ -491,11 +491,6 @@ ], "metadata": { "hide_input": false, - "kernelspec": { - "display_name": "pymc_env", - "language": "python", - "name": "pymc_env" - }, "language_info": { "codemirror_mode": { "name": "ipython", From 0102675b2f97d6ef64f8671843ae51b05326139c Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Tue, 2 Jul 2024 10:49:51 +0800 Subject: [PATCH 11/12] Removed example --- skpro/regression/bayesian_proportion.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/skpro/regression/bayesian_proportion.py b/skpro/regression/bayesian_proportion.py index 252ff4461..ecd969c7e 100644 --- a/skpro/regression/bayesian_proportion.py +++ b/skpro/regression/bayesian_proportion.py @@ -53,21 +53,6 @@ def __init__(self, prior_alpha=None, prior_beta=None, prior=None): If neither (prior_alpha and prior_beta) nor prior are provided. TypeError If the provided prior is not an instance of Beta. 
- - Examples - -------- - >>> from skpro.regression.bayesian_proportion import BayesianProportionEstimator - >>> from skpro.distributions import Beta - >>> import pandas as pd - >>> import numpy as np - - >>> B = BayesianProportionEstimator(prior=Beta(1, 2)) - >>> # Create a DataFrame X with all NaN values - >>> X = pd.DataFrame(np.nan, index=range(5), columns=['to_be_ignored']) - >>> # Create a Series y with random 1s and 0s - >>> y = pd.Series(np.random.randint(0, 2, size=5), name='win') - >>> B.fit(X, y) - """ if prior is None: if prior_alpha is None or prior_beta is None: From 62d499996abbf0326f5f38db1f730069675fe587 Mon Sep 17 00:00:00 2001 From: Meraldo Antonio Date: Tue, 2 Jul 2024 19:42:31 +0800 Subject: [PATCH 12/12] Added more explanation in the _fit method --- skpro/regression/bayesian_proportion.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/skpro/regression/bayesian_proportion.py b/skpro/regression/bayesian_proportion.py index ecd969c7e..e638fa58a 100644 --- a/skpro/regression/bayesian_proportion.py +++ b/skpro/regression/bayesian_proportion.py @@ -1,10 +1,5 @@ """Bayesian proportion estimator for probabilistic regression.""" -# todo: add an appropriate copyright notice for your estimator -# estimators contributed to skpro should have the copyright notice at the top -# estimators of your own do not need to have permissive or BSD-3 copyright - - __author__ = ["meraldoantonio"] from skpro.distributions import Beta @@ -72,7 +67,7 @@ def __init__(self, prior_alpha=None, prior_beta=None, prior=None): super().__init__() def _fit(self, X, y): - """Fit regressor to training data. + """Fit regressor to the observed data. Writes to self: Sets fitted model attributes ending in "_". 
@@ -83,7 +78,8 @@ def _fit(self, X, y): feature instances to fit regressor to; will be ignored y : pandas Series, must be same length as X - labels to fit regressor to + represents a series of binary experiments + whose outcome are either True (1) or False (0) Returns -------