Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Corrected and added the indicators to determine the chemical rank #8

Open
wants to merge 15 commits into
base: 0.4.X
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions Examples/Rank.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,4 @@ Contributors

- Charles H Camp Jr
- Charles Le Losq ([email protected])
- [Shojiro Shibayama](https://github.com/sshojiro)
6 changes: 4 additions & 2 deletions pymcr/condition.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
""" Functions to condition / preprocess data """
import numpy as _np
from copy import deepcopy as _deepcopy


def standardize(X, mean_ctr=True, with_std=True, axis=-1, copy=True):
"""
Expand All @@ -21,12 +23,12 @@ def standardize(X, mean_ctr=True, with_std=True, axis=-1, copy=True):
Axis from which to calculate mean and standard deviation

copy : bool
Copy data (X) if True, overwite if False
Copy data (X) if True, overwrite if False

"""

if copy:
Xsc = 1*X
Xsc = _deepcopy(X)
else:
Xsc = X

Expand Down
126 changes: 90 additions & 36 deletions pymcr/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

from pymcr.condition import standardize as _standardize

__all__ = ['ind', 'rod', 'pca']
__all__ = ['rsd', 'ind', 'rod', 'pca']


def pca(D, n_components=None):
"""
Principle component analysis
Principal component analysis

Parameters
----------
Expand All @@ -25,66 +25,120 @@ def pca(D, n_components=None):
Tuple with 3 items: Scores (T), Loadings (W), eigenvalues (singular values-squared)

"""
Dcenter = D - D.mean(axis=0, keepdims=True)
if n_components is None:
W, s2, Wt = _svd(_np.dot((D - D.mean(axis=0, keepdims=True)).T,
D - D.mean(axis=0, keepdims=True)),
W, s2, Wt = _svd(_np.dot(Dcenter.T, Dcenter),
full_matrices=False)
# Note: s2 contains trivial values.
# Ex) Let D n x d matrix (n >= d),
# s2 is n-length vector,
# though the mathematical rank of the metrics is at most d
else:
W, s2, Wt = _svds(_np.dot((D - D.mean(axis=0, keepdims=True)).T,
D - D.mean(axis=0, keepdims=True)), k=n_components)
if n_components == Dcenter.shape[0]:
n_components -= 1
W, s2, Wt = _svds(_np.dot(Dcenter.T, Dcenter), k=n_components)

# svds does not sort by variance; thus, manually sorting from biggest to
# smallest variance
sort_vec = _np.flipud(_np.argsort(s2))
W = W[:, sort_vec]
Wt = Wt[sort_vec, :]
s2 = s2[sort_vec]
# FIXME: T.var(axis=0) is not equal to s2 values.

assert _np.allclose(W, Wt.T)
# SVD decomposes A into U * S * V^T
# It is thought that U == Wt is false.
T = _np.dot(D, W)
# Note: T.mean(axis=0) is almost zeros
return T, W, s2

return (T, W, s2)

def ind(D_actual, ul_rank=100):
""" Malinowski's indicator function """
def rsd(D_actual):
    """
    Residual standard deviation (RSD) of a PCA model of the data.

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Spectral data matrix

    Returns
    -------
    RSD : ndarray [n_rank-2, ]
        Lack-of-fit measure of the PCA model for each number of retained
        components. With q = min(D_actual.shape), PCA is fit with q - 1
        components (mean-centering removes one rank), and values are
        reported for l = 1 .. q - 2 retained components.

    """
    q_rank = _np.min(D_actual.shape)
    n_obs = D_actual.shape[0]
    scores, _, _ = pca(D_actual, q_rank - 1)
    n_comp = scores.shape[1]
    # Tail sums of the per-component variances: entry l holds the variance
    # left unexplained after keeping the first l components.
    tail_var = _np.cumsum(scores.var(axis=0)[::-1])[::-1]
    # NOTE(review): the denominator uses the constant (n_comp - 2) for every
    # l rather than a per-l term -- confirm against Malinowski's definition.
    out = _np.sqrt(tail_var / (n_obs * (n_comp - 2)))
    return out[1:]

T, W, s2 = pca(D_actual)

# ! PCA only centers the mean, it does not divide by the standard deviation
# PCA forces data matrices to be normalized.
# Therefore, D_actual also needs to be normalized.
# D_scale = _standardize(D_actual)
D_scale = D_actual - D_actual.mean(axis=0, keepdims=True)
# U, S, _ = _svd(D_actual)
# T = U * S
def ind(D_actual):
    """
    Malinowski's indicator function (IND).

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Data

    Returns
    -------
    IND : ndarray [n_rank-2, ]
        RSD scaled by the squared number of discarded components.
        Length is q - 2 with q = min(D_actual.shape): centering before
        PCA and the RSD computation each reduce the usable rank by one.

    """
    q = _np.min(D_actual.shape)
    # Squared counts of discarded components: (q-2)^2, ..., 2^2, 1^2
    discarded_sq = _np.arange(q - 2, 0, -1) ** 2
    return rsd(D_actual) / discarded_sq

# assert s2.size == n_features, 'Number of samples must be larger than number of features'
# k_vec = _np.arange(n_features) + 1

# eigen_values = _np.sqrt(s2)
def rod(D_actual):
    """
    Ratio of Derivatives (ROD).

    The argmax of ROD is taken as an estimate of the chemical rank of
    the data, i.e. the number of pure components underlying a set of
    mixture spectra (e.g. spectra mixed from three pure components are
    expected to have chemical rank three).

    Parameters
    ----------
    D_actual : ndarray [n_sample, n_features]
        Data

    Returns
    -------
    ROD : ndarray [n_rank-2, ]
        Length q - 2 with q = min(D_actual.shape); the first two entries
        are zero-padding so ROD aligns index-wise with IND.

    """
    IND = ind(D_actual)
    # Ratio of successive first differences of IND.
    first_diff = IND[:-1] - IND[1:]
    ratio = first_diff[:-1] / first_diff[1:]
    # Two leading zeros keep the output the same length as IND.
    return _np.concatenate((_np.zeros(2), ratio))

def rod(D_actual, ul_rank=100):
""" Ratio of derivatives """
IND = ind(D_actual, ul_rank)
ROD = ( IND[0:(len(IND)-2)] - IND[1:(len(IND)-1)] ) \
/ ( IND[1:(len(IND)-1)] - IND[2:len(IND)] )
return ROD

if __name__ == '__main__':
D = _np.vstack((_np.eye(3), -_np.eye(3)))
ind(D)
print(ind(D))
37 changes: 34 additions & 3 deletions pymcr/tests/test_rank.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
""" Test rank metrics """
import numpy as np

from pymcr.rank import pca
from pymcr.rank import pca, rsd, ind, rod


def test_pca_full():
""" Test PCA (full rank)"""
Expand All @@ -17,8 +18,9 @@ def test_pca_full():
assert np.allclose(np.eye(3), np.abs(W))
assert np.allclose(np.abs(D), np.abs(T))


def test_pca_n_components():
""" Test PCA (2 components)"""
""" Test PCA (2 components) """
D = np.vstack((np.eye(3), -np.eye(3)))
# Axis 0 is most variability
# Axis 1 is next
Expand All @@ -36,4 +38,33 @@ def test_pca_n_components():
# Sorting now built into pca
# assert np.allclose(np.eye(3, 2), np.abs(W[:,np.flipud(np.argsort(s2))]))
# assert np.allclose(np.abs(D)[:,:2], np.abs(T[:, np.flipud(np.argsort(s2))]))



def test_rsd_length():
    """
    RSD must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(rsd(X)) == expected


def test_ind_length():
    """
    IND must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(ind(X)) == expected


def test_rod_length():
    """
    ROD must have length q - 2 where q = min(X.shape): centering before
    PCA and the RSD computation each reduce the rank by one.
    """
    X = np.random.randn(10, 100)
    expected = min(X.shape) - 2
    assert len(rod(X)) == expected