-
Notifications
You must be signed in to change notification settings - Fork 5
/
runexample.m
119 lines (87 loc) · 3.42 KB
/
runexample.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
clear
% Settings shared by both examples below.
% Signals/RIRs are processed at 16 kHz; the STFT frame is 256 samples (16 ms).
fs = 16e3;
ftLen = 2^8;
% All six microphone pairs of a four-microphone array are used, one pair per
% column (first row: first microphone, second row: second microphone):
%   [1 1 1 2 2 3
%    2 3 4 3 4 4]
MP = nchoosek(1:4,2).';
%% Kinovis-MST example
% Builds an RTF template from the NAO HRIRs, runs the online localizer on the
% Kinovis-MST recording and plots ground truth, GMM weights and selected
% peaks. Uses fs and MP defined at the top of the script.

% load HRIR (the script expects this file to provide HRIR_NAO)
load('data/HRIR_NAO_48kHz_-175:5:180_degrees.mat');
freRan = [26:31 34:107]; % frequency bins used for Kinovis-MST example, which are not too noisy

% generate template
rtfTemp = generate_template(HRIR_NAO,freRan,MP);

% read audio and convert it from the file rate rfs to the working rate fs;
% resample treats each column of a matrix as an independent channel, so no
% per-microphone loop is needed
[y,rfs] = audioread('data/Kinovis-MST-example.wav');
micNum = size(y,2);
x = resample(y,fs,rfs);

% localization
[GMMWeight,Peaks] = OnlineSSL_DPRTF_EG(x,rtfTemp,freRan,MP);

% result plot
load('data/Kinovis-MST-speakerPosition.mat')
load('data/Kinovis-MST-speakerVAD.mat')
% Keep the azimuth of active speakers and move inactive frames to 200, which
% lies outside the plotted range [-175 180]. Inactivity is read from the VAD
% itself (not from a zero product) so that an active speaker located at
% exactly 0 degrees is not hidden.
speakerPositionVAD = speakerPosition.*speakerVAD + (speakerVAD==0)*200;
fraNum = size(speakerPosition,1);

% time axis shared by the three panels: one tick every 1250 frames,
% labelled in steps of 10 (seconds, see the xlabel below)
xTickPos = 1250:1250:fraNum;
xTickLab = 10:10:fraNum/125;
% NOTE(review): imagesc draws row 1 at the top, so tick 12 sits near the top
% of the two image panels yet is labelled -120, whereas the ground-truth panel
% has positive azimuths at the top. Confirm against the row order returned by
% OnlineSSL_DPRTF_EG that this labelling is the intended one.
yTickPos = 12:12:60;
yTickLab = -120:60:120;

figure;
subplot(311);
hold on;
plot(speakerPositionVAD,'.')
axis([1 fraNum -175 180])
set(gca,'xtick',xTickPos,'xticklabels',xTickLab,'ytick',-120:60:120,'yticklabels',-120:60:120,'FontSize',12,'box','on')
title('ground truth')

subplot(312);
imagesc(GMMWeight(end:-1:1,:)) % rows are flipped before display
set(gca,'xtick',xTickPos,'xticklabels',xTickLab,'ytick',yTickPos,'yticklabels',yTickLab,'FontSize',12,'box','on')
title('GMM Weight')
ylabel('Azimuth (degrees)')

subplot(313);
imagesc(Peaks(end:-1:1,:)) % rows are flipped before display
set(gca,'xtick',xTickPos,'xticklabels',xTickLab,'ytick',yTickPos,'yticklabels',yTickLab,'FontSize',12,'box','on')
title('Peak selection')
xlabel('Time (s)')
%% LOCATA example
% Builds an RTF template from TDOAs between the microphones and a set of
% candidate positions, runs the online localizer on a LOCATA robot recording
% and plots ground truth, GMM weights and selected peaks. Uses fs and MP
% defined at the top of the script.

% we use four (out of twelve) microphones; one row per microphone, columns x y z
micPosition = [-0.031 0.023 0.042;
-0.036 -0.027 0.038;
0.034 -0.030 0.037;
0.035 0.025 0.039];

% candidate locations: azimuths on a 5-degree grid, all at elevation pi/2,
% at distance Ran
AZI = (-pi+pi/36:pi/36:pi)'; % azimuth -175:5:180
ELE = pi*ones(size(AZI))/2;
Ran = 10;
% NOTE(review): the scalar 0.04 is added to all three coordinates of every
% candidate, not only to one axis - confirm this offset is intended.
canPosition = Ran*[-sin(ELE).*sin(AZI),sin(ELE).*cos(AZI),cos(ELE)]+0.04;

% TDOA
TDOA = compute_TDOA(micPosition,canPosition,MP);

% generate template
% NOTE(review): unlike the HRIR-based call in the Kinovis-MST example, MP is
% not passed here; presumably because MP was already used by compute_TDOA -
% verify against generate_template.
freRan = 2:65;
rtfTemp = generate_template(TDOA,freRan);

% read audio, four channels ([5 8 11 12]) of LOCATA Robot recording, and
% convert it from the file rate rfs to the working rate fs; resample treats
% each column of a matrix as an independent channel, so no per-microphone
% loop is needed
[y,rfs] = audioread('data/LOCATA-dev-task6-rec3.wav');
micNum = size(y,2);
x = resample(y,fs,rfs);

% localization
[GMMWeight,Peaks] = OnlineSSL_DPRTF_EG(x,rtfTemp,freRan,MP);

% result plot
load('data/LOCATA-speakerPosition.mat')
load('data/LOCATA-speakerVAD.mat')
% Keep the azimuth of active speakers and move inactive frames to 200, which
% lies outside the plotted range [-175 180]. Inactivity is read from the VAD
% itself (not from a zero product) so that an active speaker located at
% exactly 0 degrees is not hidden.
speakerPositionVAD = speakerPosition.*speakerVAD + (speakerVAD==0)*200;
fraNum_gt = size(speakerPosition,1); % number of ground-truth frames
fraNum = size(GMMWeight,2);          % number of localization frames

% The ground truth and the localization output are labelled with different
% frame counts per 10 s (1200 vs. 1250), hence two sets of time ticks.
xTickPosGt = 1200:1200:fraNum_gt;
xTickLabGt = 10:10:fraNum_gt/120;
xTickPos = 1250:1250:fraNum;
xTickLab = 10:10:fraNum/125;
% NOTE(review): imagesc draws row 1 at the top, so tick 12 sits near the top
% of the two image panels yet is labelled -120, whereas the ground-truth panel
% has positive azimuths at the top. Confirm against the row order returned by
% OnlineSSL_DPRTF_EG that this labelling is the intended one.
yTickPos = 12:12:60;
yTickLab = -120:60:120;

figure;
subplot(311);
hold on;
plot(speakerPositionVAD,'.')
axis([1 fraNum_gt -175 180])
set(gca,'xtick',xTickPosGt,'xticklabels',xTickLabGt,'ytick',-120:60:120,'yticklabels',-120:60:120,'FontSize',12,'box','on')
title('ground truth')

subplot(312);
imagesc(GMMWeight(end:-1:1,:)) % rows are flipped before display
set(gca,'xtick',xTickPos,'xticklabels',xTickLab,'ytick',yTickPos,'yticklabels',yTickLab,'FontSize',12,'box','on')
title('GMM Weight')
ylabel('Azimuth (degrees)')

subplot(313);
imagesc(Peaks(end:-1:1,:)) % rows are flipped before display
set(gca,'xtick',xTickPos,'xticklabels',xTickLab,'ytick',yTickPos,'yticklabels',yTickLab,'FontSize',12,'box','on')
title('Peak selection')
xlabel('Time (s)')