Changed code to be easily adaptable for future projects. (#33)
* Revamped code and ran black

* Updated tests and notebooks with the new code changes

* Ready for pull request
danich1 authored Dec 9, 2021
1 parent 2ba96fd commit 2dd95f5
Showing 15 changed files with 1,278 additions and 1,021 deletions.
373 changes: 0 additions & 373 deletions Example Code.ipynb

This file was deleted.

425 changes: 425 additions & 0 deletions Example_Code.ipynb

Large diffs are not rendered by default.

213 changes: 0 additions & 213 deletions Multivariate Example.ipynb

This file was deleted.

274 changes: 274 additions & 0 deletions Multivariate_Example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bayesian_changepoint_detection/__init__.py
@@ -1,3 +1,3 @@
 #!/usr/bin/env python2
 
-__version__ = '0.2.dev1'
+__version__ = '0.3'
136 changes: 136 additions & 0 deletions bayesian_changepoint_detection/bayesian_models.py
@@ -0,0 +1,136 @@
import numpy as np

try:
    from sselogsumexp import logsumexp
except ImportError:
    from scipy.special import logsumexp

    print("Use scipy logsumexp().")
else:
    print("Use SSE accelerated logsumexp().")


def offline_changepoint_detection(
    data, prior_function, log_likelihood_class, truncate: int = -40
):
    """
    Compute the likelihood of changepoints on data.

    Parameters:
        data -- the time series data
        prior_function -- function returning the log prior probability of a gap of length t between changepoints
        log_likelihood_class -- model that scores the log-likelihood of a data segment
        truncate -- the cutoff probability 10^truncate below which the computation for a changepoint log-likelihood is stopped

    Outputs:
        P -- the log-likelihood of a data sequence [t, s], given there is no changepoint between t and s
        Q -- the log-likelihood of the data
        Pcp -- the log-likelihood that the i-th changepoint is at time step t. To get the probability of a changepoint at time step t, sum the probabilities over i.
    """

    # Set up the placeholders for each parameter
    n = len(data)
    Q = np.zeros((n,))
    g = np.zeros((n,))
    G = np.zeros((n,))
    P = np.ones((n, n)) * -np.inf

    # save everything in log representation
    for t in range(n):
        g[t] = prior_function(t)
        if t == 0:
            G[t] = g[t]
        else:
            G[t] = np.logaddexp(G[t - 1], g[t])

    P[n - 1, n - 1] = log_likelihood_class.pdf(data, t=n - 1, s=n)
    Q[n - 1] = P[n - 1, n - 1]

    for t in reversed(range(n - 1)):
        P_next_cp = -np.inf  # == log(0)
        for s in range(t, n - 1):
            P[t, s] = log_likelihood_class.pdf(data, t=t, s=s + 1)

            # compute recursion
            summand = P[t, s] + Q[s + 1] + g[s + 1 - t]
            P_next_cp = np.logaddexp(P_next_cp, summand)

            # truncate sum to become approx. linear in time (see
            # Fearnhead, 2006, eq. (3))
            if summand - P_next_cp < truncate:
                break

        P[t, n - 1] = log_likelihood_class.pdf(data, t=t, s=n)

        # (1 - G) is numerically stable until G becomes numerically 1
        if G[n - 1 - t] < -1e-15:  # exp(-1e-15) = .99999...
            antiG = np.log(1 - np.exp(G[n - 1 - t]))
        else:
            # for log-probability G close to 0, 1 - exp(G) is approx. -G,
            # so log(1 - exp(G)) is approx. log(-G)
            antiG = np.log(-G[n - 1 - t])

        Q[t] = np.logaddexp(P_next_cp, P[t, n - 1] + antiG)

    Pcp = np.ones((n - 1, n - 1)) * -np.inf
    for t in range(n - 1):
        Pcp[0, t] = P[0, t] + Q[t + 1] + g[t] - Q[0]
        if np.isnan(Pcp[0, t]):
            Pcp[0, t] = -np.inf
    for j in range(1, n - 1):
        for t in range(j, n - 1):
            tmp_cond = (
                Pcp[j - 1, j - 1 : t]
                + P[j : t + 1, t]
                + Q[t + 1]
                + g[0 : t - j + 1]
                - Q[j : t + 1]
            )
            Pcp[j, t] = logsumexp(tmp_cond.astype(np.float32))
            if np.isnan(Pcp[j, t]):
                Pcp[j, t] = -np.inf

    return Q, P, Pcp


def online_changepoint_detection(data, hazard_function, log_likelihood_class):
    """
    Use online Bayesian changepoint detection
    https://scientya.com/bayesian-online-change-point-detection-an-intuitive-understanding-b2d2b9dc165b

    Parameters:
        data -- the time series data
        hazard_function -- function giving the changepoint probability for each possible run length
        log_likelihood_class -- predictive model for the data, updated online

    Outputs:
        R -- R[s, t] is the probability at time step t that the current run is already s time steps long
        maxes -- the argmax over the column axis of matrix R (the most likely run length) for each time step
    """
    maxes = np.zeros(len(data) + 1)

    R = np.zeros((len(data) + 1, len(data) + 1))
    R[0, 0] = 1

    for t, x in enumerate(data):
        # Evaluate the predictive distribution for the new datum under each of
        # the parameters. This is the standard thing from Bayesian inference.
        predprobs = log_likelihood_class.pdf(x)

        # Evaluate the hazard function for this interval
        H = hazard_function(np.array(range(t + 1)))

        # Evaluate the growth probabilities - shift the probabilities down and to
        # the right, scaled by the hazard function and the predictive
        # probabilities.
        R[1 : t + 2, t + 1] = R[0 : t + 1, t] * predprobs * (1 - H)

        # Evaluate the probability that there *was* a changepoint and we're
        # accumulating the mass back down at r = 0.
        R[0, t + 1] = np.sum(R[0 : t + 1, t] * predprobs * H)

        # Renormalize the run length probabilities for improved numerical
        # stability.
        R[:, t + 1] = R[:, t + 1] / np.sum(R[:, t + 1])

        # Update the parameter sets for each possible run length.
        log_likelihood_class.update_theta(x, t=t)

        maxes[t] = R[:, t].argmax()

    return R, maxes
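For orientation, a minimal sketch of how the offline detector might be driven. It assumes a constant prior helper (const_prior) and a Student-t segment likelihood (StudentT) living in priors.py and offline_likelihoods.py — those files are among the 15 changed in this commit but are not shown above, so treat the exact import paths and signatures as assumptions.

from functools import partial

import numpy as np

import bayesian_changepoint_detection.offline_likelihoods as offline_ll  # assumed module
from bayesian_changepoint_detection.bayesian_models import offline_changepoint_detection
from bayesian_changepoint_detection.generate_data import generate_normal_time_series
from bayesian_changepoint_detection.priors import const_prior  # assumed helper

# Four Gaussian segments; `partition` holds the true segment lengths
partition, data = generate_normal_time_series(4, seed=100)

# Constant log-prior over changepoint gaps; Student-t segment likelihood
prior_function = partial(const_prior, p=1 / (len(data) + 1))
Q, P, Pcp = offline_changepoint_detection(
    data, prior_function, offline_ll.StudentT(), truncate=-40
)

# Pcp[i, t] is the log-probability that the i-th changepoint is at t;
# summing exp(Pcp) over i gives the changepoint probability per time step
change_prob = np.exp(Pcp).sum(0)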
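And a matching sketch for the online detector. constant_hazard comes from hazard_functions.py (shown below); online_likelihoods.StudentT and its parameters are again assumptions about code elsewhere in this PR.

from functools import partial

import numpy as np

import bayesian_changepoint_detection.online_likelihoods as online_ll  # assumed module
from bayesian_changepoint_detection.bayesian_models import online_changepoint_detection
from bayesian_changepoint_detection.hazard_functions import constant_hazard

data = np.random.randn(200)  # stand-in 1-D series

# Constant hazard: a changepoint is expected roughly every 250 steps
hazard_function = partial(constant_hazard, 250)

R, maxes = online_changepoint_detection(
    data,
    hazard_function,
    online_ll.StudentT(alpha=0.1, beta=0.01, kappa=1, mu=0),  # assumed signature
)

# R[s, t]: probability at step t that the current run is s steps long;
# `maxes` tracks the most likely run length over time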
87 changes: 46 additions & 41 deletions bayesian_changepoint_detection/generate_data.py
@@ -1,45 +1,50 @@
-from __future__ import division
 import numpy as np
 
-def generate_normal_time_series(num, minl=50, maxl=1000):
-    data = np.array([], dtype=np.float64)
-    partition = np.random.randint(minl, maxl, num)
-    for p in partition:
-        mean = np.random.randn()*10
-        var = np.random.randn()*1
-        if var < 0:
-            var = var * -1
-        tdata = np.random.normal(mean, var, p)
-        data = np.concatenate((data, tdata))
-    return partition, np.atleast_2d(data).T
-
-def generate_multinormal_time_series(num, dim, minl=50, maxl=1000):
-    data = np.empty((1,dim), dtype=np.float64)
-    partition = np.random.randint(minl, maxl, num)
-    for p in partition:
-        mean = np.random.standard_normal(dim)*10
-        # Generate a random SPD matrix
-        A = np.random.standard_normal((dim,dim))
-        var = np.dot(A,A.T)
-
-        tdata = np.random.multivariate_normal(mean, var, p)
-        data = np.concatenate((data, tdata))
-    return partition, data[1:,:]
-
-def generate_xuan_motivating_example(minl=50, maxl=1000):
-    dim = 2
-    num = 3
-    partition = np.random.randint(minl, maxl, num)
-    mu = np.zeros(dim)
-    Sigma1 = np.asarray([[1.0,0.75],[0.75,1.0]])
-    data = np.random.multivariate_normal(mu, Sigma1, partition[0])
-    Sigma2 = np.asarray([[1.0,0.0],[0.0,1.0]])
-    data = np.concatenate((data,np.random.multivariate_normal(mu, Sigma2, partition[1])))
-    Sigma3 = np.asarray([[1.0,-0.75],[-0.75,1.0]])
-    data = np.concatenate((data,np.random.multivariate_normal(mu, Sigma3, partition[2])))
-    return partition, data
+
+
+def generate_normal_time_series(num, minl: int=50, maxl: int=1000, seed: int=100):
+    np.random.seed(seed)
+    data = np.array([], dtype=np.float64)
+    partition = np.random.randint(minl, maxl, num)
+    for p in partition:
+        mean = np.random.randn() * 10
+        var = np.random.randn() * 1
+        if var < 0:
+            var = var * -1
+        tdata = np.random.normal(mean, var, p)
+        data = np.concatenate((data, tdata))
+    return partition, np.atleast_2d(data).T
+
+
+def generate_multinormal_time_series(num, dim, minl: int=50, maxl: int=1000, seed: int=100):
+    np.random.seed(seed)
+    data = np.empty((1, dim), dtype=np.float64)
+    partition = np.random.randint(minl, maxl, num)
+    for p in partition:
+        mean = np.random.standard_normal(dim) * 10
+        # Generate a random SPD matrix
+        A = np.random.standard_normal((dim, dim))
+        var = np.dot(A, A.T)
+
+        tdata = np.random.multivariate_normal(mean, var, p)
+        data = np.concatenate((data, tdata))
+    return partition, data[1:, :]
+
+
+def generate_xuan_motivating_example(minl: int=50, maxl: int=1000, seed: int=100):
+    np.random.seed(seed)
+    dim = 2
+    num = 3
+    partition = np.random.randint(minl, maxl, num)
+    mu = np.zeros(dim)
+    Sigma1 = np.asarray([[1.0, 0.75], [0.75, 1.0]])
+    data = np.random.multivariate_normal(mu, Sigma1, partition[0])
+    Sigma2 = np.asarray([[1.0, 0.0], [0.0, 1.0]])
+    data = np.concatenate(
+        (data, np.random.multivariate_normal(mu, Sigma2, partition[1]))
+    )
+    Sigma3 = np.asarray([[1.0, -0.75], [-0.75, 1.0]])
+    data = np.concatenate(
+        (data, np.random.multivariate_normal(mu, Sigma3, partition[2]))
+    )
+    return partition, data
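The practical effect of the new seed parameter is reproducibility: two calls with the same seed now return identical series. A quick sketch using only the functions above:

import numpy as np

from bayesian_changepoint_detection.generate_data import generate_normal_time_series

partition, data = generate_normal_time_series(4, minl=50, maxl=200, seed=42)
partition2, data2 = generate_normal_time_series(4, minl=50, maxl=200, seed=42)

assert np.array_equal(data, data2)         # same seed, same series
assert data.shape == (partition.sum(), 1)  # column vector, one row per time step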
11 changes: 11 additions & 0 deletions bayesian_changepoint_detection/hazard_functions.py
@@ -0,0 +1,11 @@
import numpy as np


def constant_hazard(lam, r):
    """
    Hazard function for Bayesian online learning
    Arguments:
        lam - the expected run length between changepoints (the constant hazard rate is 1/lam)
        r - array of candidate run lengths to evaluate the hazard at
    """
    return 1 / lam * np.ones(r.shape)
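In the online detector above, this is typically curried with functools.partial so it can be called with just the run-length array; with lam = 250, every run length gets the same hazard of 1/250 = 0.004. A small sketch:

from functools import partial

import numpy as np

from bayesian_changepoint_detection.hazard_functions import constant_hazard

hazard_function = partial(constant_hazard, 250)
print(hazard_function(np.arange(10)))  # ten entries, each 1/250 = 0.004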