From 951f0db2921a4c8b3cc63edd7599384eb13278ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20H=C3=B6pfl?=
 <66028655+shoepfl@users.noreply.github.com>
Date: Mon, 30 Sep 2024 15:38:54 +0200
Subject: [PATCH] Added HPD calculation to ensemble (#1431)

* Added HPD calculation to ensemble

The ensemble from_sample method has now an additional rel_cutoff argument. rel_cutoff allwos to cut the posterior to the alpha % highest density (here non-normalized posterior probability).

* Update ensemble.py

fixes print statement that was only used for debugging

* Update pypesto/ensemble/ensemble.py

Adds default value for burn_in

Co-authored-by: Paul Jonas Jost <70631928+PaulJonasJost@users.noreply.github.com>

* Update pypesto/ensemble/ensemble.py

Fixes wrong default value for the ci_level

Co-authored-by: Paul Jonas Jost <70631928+PaulJonasJost@users.noreply.github.com>

* added test for the hpd calculation and renamed rel_cutoff

* Forgot name change in test

* Integrated comments

---------

Co-authored-by: Paul Jonas Jost <70631928+PaulJonasJost@users.noreply.github.com>
Co-authored-by: PaulJonasJost <jost.pauljonas@gmail.com>
---
 pypesto/ensemble/ensemble.py | 88 ++++++++++++++++++++++++++++++++++++
 test/base/test_ensemble.py   | 47 +++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/pypesto/ensemble/ensemble.py b/pypesto/ensemble/ensemble.py
index ecd3d7b85..5446e2103 100644
--- a/pypesto/ensemble/ensemble.py
+++ b/pypesto/ensemble/ensemble.py
@@ -555,6 +555,7 @@ def __init__(
     def from_sample(
         result: Result,
         remove_burn_in: bool = True,
+        ci_level: float = None,
         chain_slice: slice = None,
         x_names: Sequence[str] = None,
         lower_bound: np.ndarray = None,
@@ -571,6 +572,10 @@ def from_sample(
         remove_burn_in:
             Exclude parameter vectors from the ensemble if they are in the
             "burn-in".
+        ci_level:
+            A form of relative cutoff. Exclude parameter vectors, for which the
+            (non-normalized) posterior value is not within the `ci_level` best
+            values.
         chain_slice:
             Subset the chain with a slice. Any "burn-in" removal occurs first.
         x_names:
@@ -594,14 +599,23 @@ def from_sample(
             lower_bound = result.problem.lb
         if upper_bound is None:
             upper_bound = result.problem.ub
+        burn_in = 0
         if remove_burn_in:
             if result.sample_result.burn_in is None:
                 geweke_test(result)
             burn_in = result.sample_result.burn_in
             x_vectors = x_vectors[burn_in:]
+
+        # added cutoff
+        if ci_level is not None:
+            x_vectors = calculate_hpd(
+                result=result, burn_in=burn_in, ci_level=ci_level
+            )
+
         if chain_slice is not None:
             x_vectors = x_vectors[chain_slice]
         x_vectors = x_vectors.T
+
         return Ensemble(
             x_vectors=x_vectors,
             x_names=x_names,
@@ -1253,3 +1267,77 @@ def calculate_cutoff(
 
     range = chi2.ppf(q=percentile / 100, df=df)
     return fval_opt + range
+
+
+def calculate_hpd(
+    result: Result,
+    burn_in: int = 0,
+    ci_level: float = 0.95,
+):
+    """
+    Calculate Highest Posterior Density (HPD) samples.
+
+    The HPD is calculated for a user-defined credibility level (`ci_level`). The
+    HPD includes all parameter vectors with a (non-normalized) posterior
+    probability that is higher than the lowest `1-ci_level` %
+    posterior probability values.
+
+    Parameters
+    ----------
+    result:
+        The sampling result from which to create the ensemble.
+    burn_in:
+        Burn in index that is cut off before HPD is calculated.
+    ci_level:
+        Credibility level of the resulting HPD. 0.95 corresponds to the 95% CI.
+        Only values between 0 and 1 are allowed.
+
+    Returns
+    -------
+    The HPD parameter vectors.
+    """
+    if not 0 <= ci_level <= 1:
+        raise ValueError(
+            f"ci_level={ci_level} is not valid. Choose 0<=ci_level<=1."
+        )
+    # get names of chain parameters
+    param_names = result.problem.get_reduced_vector(result.problem.x_names)
+
+    # Get converged parameter samples as numpy arrays
+    chain = np.asarray(result.sample_result.trace_x[0, burn_in:, :])
+    neglogpost = result.sample_result.trace_neglogpost[0, burn_in:]
+    indices = np.arange(
+        burn_in, len(result.sample_result.trace_neglogpost[0, :])
+    )
+
+    # create df first, as we need to match neglogpost to the according parameter values
+    pd_params = pd.DataFrame(chain, columns=param_names)
+    pd_fval = pd.DataFrame(neglogpost, columns=["neglogPosterior"])
+    pd_iter = pd.DataFrame(indices, columns=["iteration"])
+
+    params_df = pd.concat(
+        [pd_params, pd_fval, pd_iter], axis=1, ignore_index=False
+    )
+
+    # get lower neglogpost bound for HPD
+    # sort neglogpost values of MCMC chain without burn in
+    neglogpost_sort = np.sort(neglogpost)
+
+    # Get converged chain length
+    chain_length = len(neglogpost)
+
+    # most negative ci percentage samples of the posterior are kept to get the according HPD
+    neglogpost_lower_bound = neglogpost_sort[int(chain_length * (ci_level))]
+
+    # cut posterior to hpd
+    hpd_params_df = params_df[
+        params_df["neglogPosterior"] <= neglogpost_lower_bound
+    ]
+
+    # convert df to ensemble vector
+    hpd_params_df_vals_only = hpd_params_df.drop(
+        columns=["iteration", "neglogPosterior"]
+    )
+    hpd_ensemble_vector = hpd_params_df_vals_only.to_numpy()
+
+    return hpd_ensemble_vector
diff --git a/test/base/test_ensemble.py b/test/base/test_ensemble.py
index 12089af53..9a2383448 100644
--- a/test/base/test_ensemble.py
+++ b/test/base/test_ensemble.py
@@ -6,6 +6,7 @@
 
 import pypesto
 import pypesto.optimize as optimize
+import pypesto.sample as sample
 from pypesto.C import AMICI_STATUS, AMICI_T, AMICI_Y, MEAN, WEIGHTED_SIGMA
 from pypesto.engine import MultiProcessEngine
 from pypesto.ensemble import (
@@ -224,3 +225,49 @@ def post_processor(amici_outputs, output_type, output_ids):
         progress_bar=False,
     )
     return ensemble_prediction
+
+
+def test_hpd_calculation():
+    """Test the calculation of Highest Posterior Density (HPD)."""
+    problem = create_petab_problem()
+
+    sampler = sample.AdaptiveMetropolisSampler(
+        options={"show_progress": False}
+    )
+
+    result = optimize.minimize(
+        problem=problem,
+        n_starts=3,
+        progress_bar=False,
+    )
+
+    result = sample.sample(
+        problem=problem,
+        sampler=sampler,
+        n_samples=100,
+        result=result,
+    )
+
+    # Manually set up sample (only for testing)
+    burn_in = 1
+    result.sample_result.burn_in = burn_in
+    result.sample_result.trace_neglogpost[0][1:] = np.random.permutation(
+        np.arange(len(result.sample_result.trace_neglogpost[0][1:]))
+    )
+
+    hpd_ensemble = Ensemble.from_sample(
+        result=result, remove_burn_in=True, ci_level=0.95
+    )
+
+    expected_length = (
+        int((result.sample_result.trace_x[0][burn_in:].shape[0]) * 0.95) + 1
+    )
+    # Check that the HPD parameters have the expected shape
+    assert hpd_ensemble.x_vectors.shape == (problem.dim, expected_length)
+    x_indices = np.where(result.sample_result.trace_neglogpost[0][1:] <= 95)[0]
+    assert np.all(
+        [
+            np.any(np.all(x[:, None] == hpd_ensemble.x_vectors, axis=0))
+            for x in result.sample_result.trace_x[0][burn_in:][x_indices]
+        ]
+    )