Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Corrected and added the indicators to determine the chemical rank #8

Open
wants to merge 15 commits into
base: 0.4.X
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions Examples/Rank.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,4 @@ Contributors

- Charles H Camp Jr
- Charles Le Losq ([email protected])
- [Shojiro Shibayama](https://github.com/sshojiro)
6 changes: 4 additions & 2 deletions pymcr/condition.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
""" Functions to condition / preprocess data """
import numpy as _np
from copy import deepcopy as _deepcopy


def standardize(X, mean_ctr=True, with_std=True, axis=-1, copy=True):
"""
Expand All @@ -21,12 +23,12 @@ def standardize(X, mean_ctr=True, with_std=True, axis=-1, copy=True):
Axis from which to calculate mean and standard deviation

copy : bool
Copy data (X) if True, overwite if False
Copy data (X) if True, overwrite if False

"""

if copy:
Xsc = 1*X
Xsc = _deepcopy(X)
else:
Xsc = X

Expand Down
126 changes: 90 additions & 36 deletions pymcr/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

from pymcr.condition import standardize as _standardize

__all__ = ['ind', 'rod', 'pca']
__all__ = ['rsd', 'ind', 'rod', 'pca']


def pca(D, n_components=None):
"""
Principle component analysis
Principal component analysis

Parameters
----------
Expand All @@ -25,66 +25,120 @@ def pca(D, n_components=None):
Tuple with 3 items: Scores (T), Loadings (W), eigenvalues (singular values-squared)

"""
Dcenter = D - D.mean(axis=0, keepdims=True)
if n_components is None:
W, s2, Wt = _svd(_np.dot((D - D.mean(axis=0, keepdims=True)).T,
D - D.mean(axis=0, keepdims=True)),
W, s2, Wt = _svd(_np.dot(Dcenter.T, Dcenter),
full_matrices=False)
# Note: s2 contains trivial values.
# Ex) Let D n x d matrix (n >= d),
# s2 is n-length vector,
# though the mathematical rank of the metrics is at most d
else:
W, s2, Wt = _svds(_np.dot((D - D.mean(axis=0, keepdims=True)).T,
D - D.mean(axis=0, keepdims=True)), k=n_components)
if n_components == Dcenter.shape[0]:
n_components -= 1
W, s2, Wt = _svds(_np.dot(Dcenter.T, Dcenter), k=n_components)

# svds does not sort by variance; thus, manually sorting from biggest to
# smallest variance
sort_vec = _np.flipud(_np.argsort(s2))
W = W[:, sort_vec]
Wt = Wt[sort_vec, :]
s2 = s2[sort_vec]
# FIXME: T.var(axis=0) is not equal to s2 values.

assert _np.allclose(W, Wt.T)
# SVD decomposes A into U * S * V^T
# It is thought that U == Wt is false.
T = _np.dot(D, W)
# Note: T.mean(axis=0) is almost zeros
return T, W, s2

return (T, W, s2)

def ind(D_actual, ul_rank=100):
""" Malinowski's indicator function """
def rsd(D_actual):
    """
    Residual standard deviation (RSD) of a PCA model of the data.

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Spectral data matrix

    Returns
    -------
    RSD : ndarray [n_rank-2, ]
        Lack-of-fit measure of the PCA model for each number of retained
        components. With q = min(D_actual.shape), PCA is fit with q - 1
        components (mean-centering removes one rank), and values are
        reported for l = 1 .. q - 2 retained components.

    """
    q_rank = _np.min(D_actual.shape)
    n_obs = D_actual.shape[0]
    scores, _, _ = pca(D_actual, q_rank - 1)
    n_comp = scores.shape[1]
    # Tail sums of the per-component variances: entry l holds the variance
    # left unexplained after keeping the first l components.
    tail_var = _np.cumsum(scores.var(axis=0)[::-1])[::-1]
    # NOTE(review): the denominator uses the constant (n_comp - 2) for every
    # l rather than a per-l term -- confirm against Malinowski's definition.
    out = _np.sqrt(tail_var / (n_obs * (n_comp - 2)))
    return out[1:]

T, W, s2 = pca(D_actual)

# ! PCA only centers the mean, it does not divide by the standard deviation
# PCA forces data matrices to be normalized.
# Therefore, D_actual also needs to be normalized.
# D_scale = _standardize(D_actual)
D_scale = D_actual - D_actual.mean(axis=0, keepdims=True)
# U, S, _ = _svd(D_actual)
# T = U * S
def ind(D_actual):
    """
    Malinowski's indicator function (IND).

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Data

    Returns
    -------
    IND : ndarray [n_rank-2, ]
        RSD scaled by the squared number of discarded components.
        Length is q - 2 with q = min(D_actual.shape): centering before
        PCA and the RSD computation each reduce the usable rank by one.

    """
    q = _np.min(D_actual.shape)
    # Squared counts of discarded components: (q-2)^2, ..., 2^2, 1^2
    discarded_sq = _np.arange(q - 2, 0, -1) ** 2
    return rsd(D_actual) / discarded_sq

# assert s2.size == n_features, 'Number of samples must be larger than number of features'
# k_vec = _np.arange(n_features) + 1

# eigen_values = _np.sqrt(s2)
def rod(D_actual):
    """
    Ratio of Derivatives (ROD).

    The argmax of ROD is taken as an estimate of the chemical rank of
    the data, i.e. the number of pure components underlying a set of
    mixture spectra (e.g. spectra mixed from three pure components are
    expected to have chemical rank three).

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Data

    Returns
    -------
    ROD : ndarray [n_rank-2, ]
        Length q - 2 with q = min(D_actual.shape); the first two entries
        are zero-padding so ROD aligns index-wise with IND.

    """
    IND = ind(D_actual)
    # Ratio of successive first differences of IND.
    first_diff = IND[:-1] - IND[1:]
    ratio = first_diff[:-1] / first_diff[1:]
    # Two leading zeros keep the output the same length as IND.
    return _np.concatenate((_np.zeros(2), ratio))

def rod(D_actual, ul_rank=100):
""" Ratio of derivatives """
IND = ind(D_actual, ul_rank)
ROD = ( IND[0:(len(IND)-2)] - IND[1:(len(IND)-1)] ) \
/ ( IND[1:(len(IND)-1)] - IND[2:len(IND)] )
return ROD

if __name__ == '__main__':
D = _np.vstack((_np.eye(3), -_np.eye(3)))
ind(D)
print(ind(D))
37 changes: 34 additions & 3 deletions pymcr/tests/test_rank.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
""" Test rank metrics """
import numpy as np

from pymcr.rank import pca
from pymcr.rank import pca, rsd, ind, rod


def test_pca_full():
""" Test PCA (full rank)"""
Expand All @@ -17,8 +18,9 @@ def test_pca_full():
assert np.allclose(np.eye(3), np.abs(W))
assert np.allclose(np.abs(D), np.abs(T))


def test_pca_n_components():
""" Test PCA (2 components)"""
""" Test PCA (2 components) """
D = np.vstack((np.eye(3), -np.eye(3)))
# Axis 0 is most variability
# Axis 1 is next
Expand All @@ -36,4 +38,33 @@ def test_pca_n_components():
# Sorting now built into pca
# assert np.allclose(np.eye(3, 2), np.abs(W[:,np.flipud(np.argsort(s2))]))
# assert np.allclose(np.abs(D)[:,:2], np.abs(T[:, np.flipud(np.argsort(s2))]))



def test_rsd_length():
    """
    RSD must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(rsd(X)) == expected


def test_ind_length():
    """
    IND must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(ind(X)) == expected


def test_rod_length():
    """
    ROD must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(rod(X)) == expected