pymc-labs · juanitorduz · Nov 27, 2024 · Oct 26, 2024 · Oct 26, 2024 · Oct 26, 2024
diff --git a/docs/source/notebooks/clv/bg_nbd.ipynb b/docs/source/notebooks/clv/bg_nbd.ipynb
diff --git a/docs/source/notebooks/clv/dev/beta_geo_dev.ipynb b/docs/source/notebooks/clv/dev/beta_geo_dev.ipynb
diff --git a/docs/source/notebooks/clv/dev/utilities.ipynb b/docs/source/notebooks/clv/dev/utilities.ipynb
diff --git a/docs/source/notebooks/clv/dev/utilities_plotting.ipynb b/docs/source/notebooks/clv/dev/utilities_plotting.ipynb
diff --git a/docs/source/notebooks/clv/pareto_nbd.ipynb b/docs/source/notebooks/clv/pareto_nbd.ipynb
diff --git a/docs/source/notebooks/clv/pnbd.nc b/docs/source/notebooks/clv/pnbd.nc
diff --git a/pymc_marketing/clv/__init__.py b/pymc_marketing/clv/__init__.py
@@ -24,6 +24,7 @@
 from pymc_marketing.clv.plotting import (
     plot_customer_exposure,
     plot_expected_purchases,
+    plot_expected_purchases_ppc,
     plot_frequency_recency_matrix,
     plot_probability_alive_matrix,
 )
@@ -46,6 +47,7 @@
     "plot_frequency_recency_matrix",
     "plot_expected_purchases",
     "plot_probability_alive_matrix",
+    "plot_expected_purchases_ppc",
     "rfm_segments",
     "rfm_summary",
     "rfm_train_test_split",

diff --git a/pymc_marketing/clv/plotting.py b/pymc_marketing/clv/plotting.py
@@ -18,16 +18,18 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import pymc as pm
 from matplotlib.lines import Line2D
 
 from pymc_marketing.clv import BetaGeoModel, ParetoNBDModel
 from pymc_marketing.clv.utils import _expected_cumulative_transactions
 
 __all__ = [
     "plot_customer_exposure",
+    "plot_expected_purchases",
     "plot_frequency_recency_matrix",
     "plot_probability_alive_matrix",
-    "plot_expected_purchases",
+    "plot_expected_purchases_ppc",
 ]
 
 
@@ -474,6 +476,105 @@
     return ax
 
 
+def plot_expected_purchases_ppc(
+    model,
+    ppc: str = "posterior",
+    max_purchases: int = 10,
+    samples: int = 1000,
+    random_seed: int = 45,
+    ax: plt.Axes | None = None,
+    **kwargs,
+) -> plt.Axes:
+    """Plot a prior or posterior predictive check for the customer purchase frequency distribution.
+
+    At this time only ParetoNBDModel and BetaGeoBetaBinomModel are supported.
+
+    Adapted from legacy ``lifetimes`` library:
+    https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/plotting.py#L25
+
+    Parameters
+    ----------
+    model : CLV model
+        Prior predictive checks can be performed before or after a model is fit.
+        Posterior predictive checks require a fitted model.
+    ppc : string, optional
+        Type of predictive check to perform. Options are 'prior' or 'posterior'; defaults to 'posterior'.
+    max_purchases : int, optional
+        Number of bars to plot, one per purchase count from 0 to ``max_purchases - 1``. Default is 10.
+    samples : int, optional
+        Number of samples to draw for prior predictive checks. This is not used for posterior predictive checks.
+    random_seed : int, optional
+        Random seed to fix sampling results
+    ax : matplotlib.AxesSubplot, optional
+        A matplotlib axes instance. Creates new axes instance by default.
+    **kwargs
+        Additional arguments to pass into the pandas.DataFrame.plot command.
+
+    Returns
+    -------
+    axes: matplotlib.AxesSubplot
+
+    Raises
+    ------
+    AttributeError
+        If ``model`` is a ``BetaGeoModel``.
+    NameError
+        If ``ppc`` is neither 'prior' nor 'posterior'.
+    """
+    # TODO: BetaGeoModel requires its own dist class in distributions.py for this function.
+    if isinstance(model, BetaGeoModel):
+        raise AttributeError("BetaGeoModel is unsupported for this function.")
+
+    if ax is None:
+        ax = plt.subplot(111)
+
+    match ppc:
+        case "prior":
+            # always (re)build the model so this also works if it has not been fit yet
+            model.build_model()
+            prior_idata = pm.sample_prior_predictive(
+                samples=samples,
+                model=model.model,
+                random_seed=random_seed,
+            )
+
+            # observed data is read from prior_idata so this works if model has not been fit
+            obs_rf = prior_idata.observed_data["recency_frequency"]
+            ppc_rf = prior_idata.prior_predictive["recency_frequency"]
+            title = "Prior Predictive Check for Customer Frequency"
+        case "posterior":
+            obs_rf = model.idata.observed_data["recency_frequency"]
+            # Keep samples at 1 here because (chain * draw * customer) samples are already being drawn
+            ppc_rf = model.distribution_new_customer_recency_frequency(
+                random_seed=random_seed,
+                n_samples=1,
+            )
+            title = "Posterior Predictive Check for Customer Frequency"
+        case _:
+            raise NameError("Specify 'prior' or 'posterior' for 'ppc' parameter.")
+
+    # Index each distribution by the purchase count itself rather than by rank, so the
+    # bars stay aligned when a count never occurs (it is plotted with a proportion of 0).
+    proportions = {
+        label: pd.Series(freq.sel(obs_var="frequency").values.ravel())
+        .value_counts(normalize=True)
+        .reindex(np.arange(max_purchases), fill_value=0.0)
+        for label, freq in (("Estimated", ppc_rf), ("Observed", obs_rf))
+    }
+
+    # PPC histogram plot
+    ax = pd.DataFrame(proportions).plot(
+        kind="bar",
+        ax=ax,
+        title=title,
+        xlabel="Repeat Purchases",
+        ylabel="% of Customer Population",
+        rot=0.0,
+        **kwargs,
+    )
+    return ax
+
+
 def _force_aspect(ax: plt.Axes, aspect=1):
     im = ax.get_images()
     extent = im[0].get_extent()

diff --git a/tests/clv/test_plotting.py b/tests/clv/test_plotting.py
@@ -18,9 +18,10 @@
 import xarray as xr
 from pytensor.tensor import TensorVariable
 
-from pymc_marketing.clv.plotting import (
+from pymc_marketing.clv import (
     plot_customer_exposure,
     plot_expected_purchases,
+    plot_expected_purchases_ppc,
     plot_frequency_recency_matrix,
     plot_probability_alive_matrix,
 )
@@ -29,6 +30,7 @@
 class MockModel:
     def __init__(self, data: pd.DataFrame):
         self.data = data
+        self._model_type = None
 
     def _mock_posterior(self, data: pd.DataFrame) -> xr.DataArray:
         n_customers = len(data)
@@ -178,3 +180,34 @@ def test_plot_expected_purchases(
 
     # clear any existing pyplot figures
     plt.clf()
+
+
+def test_plot_expected_purchases_ppc_exceptions(fitted_bg, fitted_pnbd):
+    """Check the errors raised for an unsupported model and an unknown ``ppc`` value."""
+    unsupported_msg = "BetaGeoModel is unsupported for this function."
+    with pytest.raises(AttributeError, match=unsupported_msg):
+        plot_expected_purchases_ppc(fitted_bg)
+
+    # "ppc" is neither 'prior' nor 'posterior'
+    invalid_ppc_msg = "Specify 'prior' or 'posterior' for 'ppc' parameter."
+    with pytest.raises(NameError, match=invalid_ppc_msg):
+        plot_expected_purchases_ppc(fitted_pnbd, ppc="ppc")
+
+
+@pytest.mark.parametrize(
+    "ppc, max_purchases, samples, use_ax",
+    [("prior", 10, 100, False), ("posterior", 20, 50, True)],
+)
+def test_plot_expected_purchases_ppc(fitted_pnbd, ppc, max_purchases, samples, use_ax):
+    """Smoke-test both predictive checks, with and without a caller-supplied axes."""
+    # create the axes here: in the parametrize list it would be built at collection time
+    subplot = plt.subplot() if use_ax else None
+    ax = plot_expected_purchases_ppc(
+        model=fitted_pnbd,
+        ppc=ppc,
+        max_purchases=max_purchases,
+        samples=samples,
+        ax=subplot,
+    )
+    assert isinstance(ax, plt.Axes)
+    plt.clf()  # clear any existing pyplot figures
diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
@@ -20,7 +20,7 @@
 import xarray
 from pandas.testing import assert_frame_equal
 
-from pymc_marketing.clv import BetaGeoModel, GammaGammaModel, ParetoNBDModel
+from pymc_marketing.clv import GammaGammaModel, ParetoNBDModel
 from pymc_marketing.clv.utils import (
     _expected_cumulative_transactions,
     _find_first_transactions,
@@ -59,57 +59,6 @@ def test_to_xarray():
     np.testing.assert_array_equal(new_y.coords["test_dim"], customer_id)
 
 
-@pytest.fixture(scope="module")
-def fitted_bg(test_summary_data) -> BetaGeoModel:
-    rng = np.random.default_rng(13)
-
-    model_config = {
-        # Narrow Gaussian centered at MLE params from lifetimes BetaGeoFitter
-        "a_prior": Prior("DiracDelta", c=1.85034151),
-        "alpha_prior": Prior("DiracDelta", c=1.86428187),
-        "b_prior": Prior("DiracDelta", c=3.18105431),
-        "r_prior": Prior("DiracDelta", c=0.16385072),
-    }
-    model = BetaGeoModel(
-        data=test_summary_data,
-        model_config=model_config,
-    )
-    model.build_model()
-    fake_fit = pm.sample_prior_predictive(
-        samples=50, model=model.model, random_seed=rng
-    ).prior
-    set_model_fit(model, fake_fit)
-
-    return model
-
-
-@pytest.fixture(scope="module")
-def fitted_pnbd(test_summary_data) -> ParetoNBDModel:
-    rng = np.random.default_rng(45)
-
-    model_config = {
-        # Narrow Gaussian centered at MLE params from lifetimes ParetoNBDFitter
-        "r_prior": Prior("DiracDelta", c=0.560),
-        "alpha_prior": Prior("DiracDelta", c=10.591),
-        "s_prior": Prior("DiracDelta", c=0.550),
-        "beta_prior": Prior("DiracDelta", c=9.756),
-    }
-    pnbd_model = ParetoNBDModel(
-        data=test_summary_data,
-        model_config=model_config,
-    )
-    pnbd_model.build_model()
-
-    # Mock an idata object for tests requiring a fitted model
-    # TODO: This is quite slow. Check similar fixtures in the model tests to speed this up.
-    fake_fit = pm.sample_prior_predictive(
-        samples=50, model=pnbd_model.model, random_seed=rng
-    ).prior
-    set_model_fit(pnbd_model, fake_fit)
-
-    return pnbd_model
-
-
 @pytest.fixture(scope="module")
 def fitted_gg(test_summary_data) -> GammaGammaModel:
     rng = np.random.default_rng(40)

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -21,7 +21,8 @@
 from arviz import InferenceData
 from xarray import DataArray, Dataset
 
-from pymc_marketing.clv.models import CLVModel
+from pymc_marketing.clv.models import BetaGeoModel, CLVModel, ParetoNBDModel
+from pymc_marketing.prior import Prior
 
 
 def pytest_addoption(parser):
@@ -152,3 +153,62 @@ def mock_fit_MAP(self, *args, **kwargs):
     idata = mock_sample(*args, **kwargs, chains=chains, draws=draws, model=self.model)
 
     return idata.sel(chain=[0], draw=[0])
+
+
+# TODO: This fixture is used in the plotting and utils test modules.
+#       Consider creating a MockModel class to replace this and other fitted model fixtures.
+@pytest.fixture(scope="module")
+def fitted_bg(test_summary_data) -> BetaGeoModel:
+    """Build a BetaGeoModel and attach prior predictive draws as a stand-in fit."""
+    # DiracDelta (point-mass) priors fixed at MLE params from lifetimes BetaGeoFitter
+    mle_priors = {
+        "a_prior": Prior("DiracDelta", c=1.85034151),
+        "alpha_prior": Prior("DiracDelta", c=1.86428187),
+        "b_prior": Prior("DiracDelta", c=3.18105431),
+        "r_prior": Prior("DiracDelta", c=0.16385072),
+    }
+    bg_model = BetaGeoModel(data=test_summary_data, model_config=mle_priors)
+    bg_model.build_model()
+
+    # Mock an idata object for tests requiring a fitted model
+    mock_idata = pm.sample_prior_predictive(
+        samples=50,
+        model=bg_model.model,
+        random_seed=np.random.default_rng(13),
+    )
+    # posterior group required to pass an assert check ("L80" in the original note; TODO confirm file)
+    mock_idata.add_groups(posterior=mock_idata.prior)
+    set_model_fit(bg_model, mock_idata)
+
+    return bg_model
+
+
+# TODO: This fixture is used in the plotting and utils test modules.
+#       Consider creating a MockModel class to replace this and other fitted model fixtures.
+@pytest.fixture(scope="module")
+def fitted_pnbd(test_summary_data) -> ParetoNBDModel:
+    """Build a ParetoNBDModel and attach prior predictive draws as a stand-in fit."""
+    # DiracDelta (point-mass) priors fixed at MLE params from lifetimes ParetoNBDFitter
+    mle_priors = {
+        "r_prior": Prior("DiracDelta", c=0.560),
+        "alpha_prior": Prior("DiracDelta", c=10.591),
+        "s_prior": Prior("DiracDelta", c=0.550),
+        "beta_prior": Prior("DiracDelta", c=9.756),
+    }
+
+    pnbd = ParetoNBDModel(data=test_summary_data, model_config=mle_priors)
+    pnbd.build_model()
+
+    # Mock an idata object for tests requiring a fitted model
+    # TODO: This is quite slow. Check similar fixtures in the model tests to speed this up.
+    mock_idata = pm.sample_prior_predictive(
+        samples=50,
+        model=pnbd.model,
+        random_seed=np.random.default_rng(45),
+    )
+    # posterior group required to pass an assert check
+    # ("L80" in the original note; TODO confirm which file it refers to)
+    mock_idata.add_groups(posterior=mock_idata.prior)
+    set_model_fit(pnbd, mock_idata)
+
+    return pnbd