-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtunespca.m
103 lines (93 loc) · 3.07 KB
/
tunespca.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
function [optC c r2] = tunespca(X,varargin)
%TUNESPCA Find optimal sparsity parameters for spca
% optC = TUNESPCA(X) returns the optimal sparsity parameter optC for the
% first sparse principal component of data matrix X. Optimality is
% determined using the missing value imputation scheme proposed in Witten
% et al. (2009): optC is the level of sparsity that has the best
% performance in terms of recovering censored values of X.
%
% [optC, c] = TUNESPCA(X) returns the c values that were tested.
%
% [optC, c, r2] = TUNESPCA(X) returns the mean squared errors for all the
% c values tested
% optC = TUNESPACE(X,c) tests the c values in c (should be a vector in
% ascending order
% optC = TUNESPACE(X,'K',3) sequentially optimises c for the first K
% principal components
%
% INPUTS:
% X - N by p data matrix with observations in rows and
% variables in columns
% c - Vector of c-values to try in ascending order
% NAME/VALUE INPUTS:
% 'K' - How many principal components (default: 1)
% OUTPUTS:
% optC - 1 by K vector, optimal sparsity parameter values
% c - Sparsity parameter values tested (same for all
% components)
% r2 - Mean squared error for all c values tested in each
% component
% EXAMPLE:
% load carbig;
% data = [Displacement Horsepower Weight Acceleration MPG];
% nans = sum(isnan(data),2) > 0;
% optC = tunespca(data(~nans,:),'K',2);
% [coeff,score] = spca(data(~nans,:),optC,'K',2);
% REFERENCES
% Witten, Daniela M., Robert Tibshirani, and Trevor Hastie. "A penalized
% matrix decomposition, with applications to sparse principal components
% and canonical correlation analysis." Biostatistics 10.3 (2009): 515-534.
c = [];
K = 1;
if ~isempty(varargin)
if isa(varargin{1},"double")
c = varargin{1};
varargin(1) = [];
end
if rem(size(varargin, 2), 2) ~= 0
error('Check optional inputs.');
else
for i = 1:2:size(varargin, 2)
switch varargin{1, i}
case 'K'
K = varargin{1, i+1};
otherwise
error(['Could not recognise optional input names.' ...
'\nNo input named "%s"'],...
varargin{1,i});
end
end
end
end
rounds = 5;
ind = crossvalind('KFold',numel(X),rounds);
if isempty(c)
c = linspace(0,1,100)';
else
if ~(sort(c)==c)
warning('c is not sorted, strange things may happen')
end
c = c(:);
end
r2 = zeros(rounds,numel(c),K);
optC = zeros(1,K);
X = X - mean(X,1);
for k=1:K
for r=1:rounds
Xtmp = X;
rm = ind==r;
Xtmp(rm) = 0;
[V,~,~,U] = spca(Xtmp,'c',c);
for ci=1:numel(c)
Xest = (U(:,1,ci)'*Xtmp*V(:,1,ci))*U(:,1,ci)*V(:,1,ci)';
r2(r,ci,k) = sum((X(rm)-Xest(rm)).^2);
end
end
r2k = mean(r2(:,:,k),1);
[~,I] = min(r2k);
optC(k) = c(I);
% deflate data
dk = U(:,1,I)'*X*V(:,1,I);
X = X - dk*U(:,1,I)*V(:,1,I)';
end
r2 = squeeze(mean(r2,1))';