Skip to content

Commit

Permalink
adding mahalanobis distance
Browse files Browse the repository at this point in the history
  • Loading branch information
nwmoriarty committed Aug 29, 2024
1 parent db0e425 commit 9ca4b76
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,5 @@ mmtbx/suitename/test/
mmtbx/suitename/suitename.stderr.log
mmtbx/suitename/assist/
mmtbx/suitename/*.show.txt

*sublime*
45 changes: 45 additions & 0 deletions libtbx/math_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,48 @@ def percentile_based_spread_with_selection(values, pbs_fraction=0.608,
else:
high = working
return last_value

def mahalanobis_using_sklearn(x=None, data=None, cov=None, verbose=False):
if cov is None:
cov = covariance_using_sklearn(data, verbose=verbose)
mahal_cov = cov.mahalanobis(x)
if verbose: print(mahal_cov)
return mahal_cov

def covariance_using_sklearn(data=None, values=None, verbose=False):
from sklearn.covariance import EmpiricalCovariance, MinCovDet
if values:
assert 0
cov=EmpiricalCovariance()
cov.covariance_=values
return cov
# fit a MCD robust estimator to data
robust_cov = MinCovDet().fit(data)
# fit a MLE estimator to data
emp_cov = EmpiricalCovariance().fit(data)
if verbose:
print(
"Estimated covariance matrix:\nMCD (Robust):\n{}\nMLE:\n{}".format(
robust_cov.covariance_, emp_cov.covariance_
)
)
# choose one
cov = emp_cov
return cov

def mahalanobis_p_values_outlier_indices(x=None, data=None, cov=None):
rc = mahalanobis_p_values(x=x, data=data, cov=cov)
return list(filter(lambda x: rc[x] <0.01, range(len(rc))))

def mahalanobis_p_values(x=None, data=None, cov=None, verbose=False):
from scipy.stats import chi2
df=len(x[0])-1
if verbose:
print(chi2.ppf((1-0.01), df=df))
#> 9.21
mahal_cov = mahalanobis_using_sklearn(x=x, data=data, cov=cov)
if verbose: print(mahal_cov)
return(1 - chi2.cdf(mahal_cov, df))

def mahalanobis(x=None, data=None, mu=None, cov=None):
return mahalanobis_using_sklearn(x=x, data=data, cov=cov)
110 changes: 110 additions & 0 deletions libtbx/tst_mahalanobis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import division
from io import StringIO

diamonds_short = '''"carat","cut","color","clarity","depth","table","price","x","y","z"
0.23,"Ideal","E","SI2",61.5,55,326,3.95,3.98,2.43
0.21,"Premium","E","SI1",59.8,61,326,3.89,3.84,2.31
0.23,"Good","E","VS1",56.9,65,327,4.05,4.07,2.31
0.29,"Premium","I","VS2",62.4,58,334,4.2,4.23,2.63
0.31,"Good","J","SI2",63.3,58,335,4.34,4.35,2.75
0.24,"Very Good","J","VVS2",62.8,57,336,3.94,3.96,2.48
0.24,"Very Good","I","VVS1",62.3,57,336,3.95,3.98,2.47
0.26,"Very Good","H","SI1",61.9,55,337,4.07,4.11,2.53
0.22,"Fair","E","VS2",65.1,61,337,3.87,3.78,2.49
0.23,"Very Good","H","VS1",59.4,61,338,4,4.05,2.39
0.3,"Good","J","SI1",64,55,339,4.25,4.28,2.73
0.23,"Ideal","J","VS1",62.8,56,340,3.93,3.9,2.46
0.22,"Premium","F","SI1",60.4,61,342,3.88,3.84,2.33
0.31,"Ideal","J","SI2",62.2,54,344,4.35,4.37,2.71
0.2,"Premium","E","SI2",60.2,62,345,3.79,3.75,2.27
0.32,"Premium","E","I1",60.9,58,345,4.38,4.42,2.68
0.3,"Ideal","I","SI2",62,54,348,4.31,4.34,2.68
0.3,"Good","J","SI1",63.4,54,351,4.23,4.29,2.7
0.3,"Good","J","SI1",63.8,56,351,4.23,4.26,2.71
'''

def mahalanobis_using_numpy_and_scipy(x=None, data=None, mu=None, cov=None):
"""
Compute the Mahalanobis Distance between each row of x and the data
x : vector or matrix of data with, say, p columns.
data : ndarray of the distribution from which Mahalanobis distance of each
observation of x is to be computed.
cov : covariance matrix (p x p) of the distribution. If None, will be
computed from data.
"""
# error prone
assert 0
import numpy as np
import scipy as sp
if mu:
x_minus_mu = x - mu
else:
print(x)
print(np.mean(data))
x_minus_mu = x - np.mean(data)
print('x_minus_mu',x_minus_mu)
if not cov: cov = np.cov(data)
print(cov)
inv_covmat = sp.linalg.inv(cov)
print(inv_covmat)
left_term = np.dot(x_minus_mu, inv_covmat)
mahal = np.dot(left_term, x_minus_mu.T)
return mahal.diagonal()

def main():
from libtbx import math_utils
import csv
data=[]
f=StringIO(diamonds_short)
spamreader = csv.reader(f, delimiter=',', quotechar='|')
for i, row in enumerate(spamreader):
if not i: continue
# print(', '.join(row))
data.append([float(row[0]), float(row[4]), float(row[6])])

zz=[(0.23, 61.5, 326),
(0.23, 56.9, 329),
(0.23, 56.9, 349),
]

rc=math_utils.mahalanobis(zz, data)
print(rc)
rc=math_utils.mahalanobis_p_values(zz, data, verbose=True)
print(rc)
rc=math_utils.mahalanobis_p_values_outlier_indices(zz, data)
assert rc==[2]
for i, p in enumerate(math_utils.mahalanobis_p_values(zz, data)):
outlier=''
if i in rc:
outlier='OUTLIER'
print(' %s %0.4f %s' % (i,p,outlier))
cov = math_utils.covariance_using_sklearn(data, verbose=True)
rc=math_utils.mahalanobis(zz, cov=cov)
print(rc)
rc=math_utils.mahalanobis_p_values(zz, cov=cov, verbose=True)
print(rc)
rc=math_utils.mahalanobis_p_values_outlier_indices(zz, cov=cov)
assert rc==[2]
rc=math_utils.mahalanobis_p_values_outlier_indices(zz[:1], cov=cov)
print(rc)

# values = cov.covariance_.tolist()
# cov = math_utils.covariance_using_sklearn(values=values, verbose=True)
# assert 0
from libtbx import easy_pickle
easy_pickle.dump('tst_mahal.pickle', cov)
cov=easy_pickle.load('tst_mahal.pickle')
# import pickle
# pf='tst_mahal.pickle'
# f=open(pf, 'w')
# pickle.dump(cov, f)
# del f
# f=open(pf, 'r')
# cov=pickle.load(f)
# del f
rc=math_utils.mahalanobis_p_values_outlier_indices(zz, cov=cov)
print(rc)
assert rc==[2]

if __name__ == '__main__':
main()

0 comments on commit 9ca4b76

Please sign in to comment.