Merge pull request #126 from scikit-learn-contrib/em_sampler_mle_debug
Em sampler mle debug
JulienRoussel77 authored Mar 8, 2024
2 parents 8708ed7 + 07d98a9 commit 53116b9
Showing 24 changed files with 1,010 additions and 486 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.1
current_version = 0.1.2
commit = True
tag = True

12 changes: 12 additions & 0 deletions HISTORY.rst
@@ -2,6 +2,18 @@
History
=======

0.1.3 (2024-03-07)
------------------

* RPCA algorithms now start with a normalizing scaler
* The EM algorithms now include a gradient projection step to be more robust to collinearity
* The EM algorithm based on the Gaussian model is now initialized using a robust estimation of the covariance matrix
* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling bias
* Speed-up of the EM algorithm's likelihood maximization using the conjugate gradient method
* The ImputerRegressor class now handles NaNs by `row` by default
* The metric `frechet` was not correctly called and has been patched
* The EM algorithm with VAR(p) now fills initial holes in order to avoid exponential explosions

0.1.2 (2024-02-28)
------------------

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -27,7 +27,7 @@
author = "Quantmetry"

# The full version, including alpha/beta/rc tags
version = "0.1.1"
version = "0.1.2"
release = version

# -- General configuration ---------------------------------------------------
2 changes: 1 addition & 1 deletion docs/imputers.rst
@@ -41,7 +41,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implementation details.
The class :class:`RPCANoisy` implements a recommended improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by :math:`\eta_k` and :math:`H_k`. Writing :math:`\Vert \mathbf{MH_k} \Vert_p` for either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following

.. math::
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
See the :class:`~qolmat.imputations.imputers.ImputerRpcaNoisy` class for implementation details.
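
For orientation, a minimal usage sketch follows; it assumes the scikit-learn style ``fit_transform`` API and that the imputer forwards ``tau``, ``lam``, ``rank`` and ``norm`` to the solver described above.

.. code-block:: python

    # Sketch (assumed API): impute a univariate series with the noisy RPCA imputer
    import numpy as np
    import pandas as pd
    from qolmat.imputations.imputers import ImputerRpcaNoisy

    values = np.sin(2 * np.pi * np.arange(300) / 20)  # toy periodic signal
    values[100:120] = np.nan                          # hole to impute
    df = pd.DataFrame({"signal": values})

    imputer = ImputerRpcaNoisy(tau=1.0, lam=0.4, rank=2, norm="L2")
    df_imputed = imputer.fit_transform(df)            # NaNs filled by the low-rank model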
191 changes: 163 additions & 28 deletions examples/RPCA.md
@@ -8,12 +8,12 @@ jupyter:
format_version: '1.3'
jupytext_version: 1.14.4
kernelspec:
display_name: Python 3 (ipykernel)
display_name: env_qolmat_dev
language: python
name: python3
name: env_qolmat_dev
---

```python
```python tags=[]
%reload_ext autoreload
%autoreload 2

@@ -26,17 +26,18 @@ import sys

from math import pi

from qolmat.utils import plot, data
from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
from qolmat.utils import utils, plot, data
from qolmat.imputations.rpca.rpca_pcp import RpcaPcp
from qolmat.imputations.rpca.rpca_noisy import RpcaNoisy
from qolmat.imputations.softimpute import SoftImpute
from qolmat.imputations.rpca import rpca_utils
from qolmat.utils.data import generate_artificial_ts
```

**Generate synthetic data**

```python
n_samples = 1000
```python tags=[]
n_samples = 10000
periods = [100, 20]
amp_anomalies = 0.5
ratio_anomalies = 0.05
@@ -47,13 +47,15 @@ X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalie
signal = X_true + A_true + E_true

# Adding missing data
#signal[5:20] = np.nan
mask = np.random.choice(len(signal), round(len(signal) / 20))
signal[mask] = np.nan
signal[120:180] = np.nan
signal[:20] = np.nan
# signal[80:220] = np.nan
# mask = np.random.choice(len(signal), round(len(signal) / 20))
# signal[mask] = np.nan

```

```python
```python tags=[]
fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(4, 1, 1)
ax.title.set_text("Low-rank signal")
@@ -74,40 +74,172 @@ plt.plot(signal)
plt.show()
```

<!-- #region tags=[] -->
# Fit RPCA Noisy
<!-- #endregion -->

```python tags=[]
rpca_noisy = RpcaNoisy(tau=1, lam=.4, rank=1, norm="L2")
```

```python tags=[]
period = 100
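# prepare_data is assumed to fold the 1D signal into a (period x n_columns) matrix;
# Omega flags the observed entries and linear_interpolation fills the NaNs so the
# solver starts from a complete matrix.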
D = utils.prepare_data(signal, period)
Omega = ~np.isnan(D)
D = utils.linear_interpolation(D)
```

```python tags=[]
M, A, L, Q = rpca_noisy.decompose_with_basis(D, Omega)
M2, A2 = rpca_noisy.decompose_on_basis(D, Omega, Q)
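# M is the low-rank part, A the anomalies; L and Q are assumed to factor M as L @ Q.
# decompose_on_basis reuses the learned basis Q to decompose data without refitting it.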
```

```python tags=[]
M_final = utils.get_shape_original(M, signal.shape)
A_final = utils.get_shape_original(A, signal.shape)
D_final = utils.get_shape_original(D, signal.shape)
signal_imputed = M_final + A_final
```

```python tags=[]
fig = plt.figure(figsize=(12, 4))

plt.plot(signal_imputed, label="Imputed signal with anomalies")
plt.plot(M_final, label="Imputed signal without anomalies")
plt.plot(A_final, label="Anomalies")
# plt.plot(D_final, label="D")
plt.plot(signal, color="black", label="Original signal")
plt.xlim(0, 400)
plt.legend()
plt.show()
```

## PCP RPCA

```python tags=[]
rpca_pcp = RpcaPcp(max_iterations=1000, lam=.1)
```

```python tags=[]
period = 100
D = utils.prepare_data(signal, period)
Omega = ~np.isnan(D)
D = utils.linear_interpolation(D)
```

```python tags=[]
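# PCP is assumed to split D into a low-rank part M (nuclear norm) and a sparse
# anomaly part A (l1 norm), with lam weighting the sparsity term and Omega
# restricting the data fit to observed entries.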
M, A = rpca_pcp.decompose(D, Omega)
```

```python tags=[]
M_final = utils.get_shape_original(M, signal.shape)
A_final = utils.get_shape_original(A, signal.shape)
D_final = utils.get_shape_original(D, signal.shape)
# Y_final = utils.get_shape_original(Y, signal.shape)
signal_imputed = M_final + A_final
```

```python tags=[]
fig = plt.figure(figsize=(12, 4))

plt.plot(signal_imputed, label="Imputed signal with anomalies")
plt.plot(M_final, label="Imputed signal without anomalies")
plt.plot(A_final, label="Anomalies")

plt.plot(signal, color="black", label="Original signal")
plt.xlim(0, 400)
# plt.gca().twinx()
# plt.plot(Y_final, label="Y")
plt.legend()
plt.show()
```

## Soft Impute

```python tags=[]
imputer = SoftImpute(max_iterations=1000, tau=.1)
```

```python tags=[]
period = 100
D = utils.prepare_data(signal, period)
Omega = ~np.isnan(D)
D = utils.linear_interpolation(D)
```

```python tags=[]
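# SoftImpute is assumed to fit the low-rank part M by iteratively soft-thresholding
# the singular values (threshold tau) on the observed entries; A holds the residual.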
M, A = imputer.decompose(D, Omega)
```

```python tags=[]
M_final = utils.get_shape_original(M, signal.shape)
A_final = utils.get_shape_original(A, signal.shape)
D_final = utils.get_shape_original(D, signal.shape)
# Y_final = utils.get_shape_original(Y, signal.shape)
signal_imputed = M_final + A_final
```

```python tags=[]
fig = plt.figure(figsize=(12, 4))

plt.plot(signal_imputed, label="Imputed signal with anomalies")
plt.plot(M_final, label="Imputed signal without anomalies")
plt.plot(A_final, label="Anomalies")

plt.plot(signal, color="black", label="Original signal")
plt.xlim(0, 400)
plt.legend()
plt.show()
```

## Temporal RPCA

```python
%%time
rpca_pcp = RPCAPCP(period=100, max_iterations=100, mu=.5, lam=0.1)
X, A = rpca_pcp.decompose_rpca_signal(signal)
imputed = signal - A
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
rpca_noisy = RpcaNoisy(tau=1, lam=0.4, rank=2, norm="L2")
M, A = rpca_noisy.decompose(D, Omega)
# imputed = X
```

```python
```python tags=[]
fig = plt.figure(figsize=(12, 4))
plt.plot(X, color="black")
plt.plot(imputed)

plt.plot(signal_imputed, label="Imputed signal with anomalies")
plt.plot(M_final, label="Imputed signal without anomalies")
plt.plot(A_final, label="Anomalies")

plt.plot(signal, color="black", label="Original signal")
plt.xlim(0, 400)
# plt.gca().twinx()
# plt.plot(Y_final, label="Y")
plt.legend()
plt.show()
```

## Temporal RPCA
# EM VAR(p)

```python
signal.shape
from qolmat.imputations import em_sampler
```

```python
%%time
# rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, list_periods=[10], list_etas=[0.01], norm="L2")
rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, rank=2, norm="L2")
X, A = rpca_noisy.decompose_rpca_signal(signal)
imputed =
p = 1
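# VARpEM (assumed behaviour): EM imputation under a VAR(p) model; method="mle" for
# maximum-likelihood estimation, max_iter_em EM iterations, n_iter_ou
# Ornstein-Uhlenbeck sampling steps of size dt, and p the autoregressive order.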
model = em_sampler.VARpEM(method="mle", max_iter_em=10, n_iter_ou=512, dt=1e-1, p=p)
```

```python
D = signal.reshape(-1, 1)
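# single-column data matrix; fit_transform is assumed to return the series with
# its NaNs imputed by the fitted VAR(p) model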
M_final = model.fit_transform(D)
```

```python
fig = plt.figure(figsize=(12, 4))
plt.plot(signal, color="black")
plt.plot(X_true)
plt.plot(X)
plt.plot(signal_imputed, label="Imputed signal with anomalies")
plt.plot(M_final, label="Imputed signal without anomalies")
plt.xlim(0, 400)
plt.legend()
plt.show()
```

```python