Skip to content

Commit

Permalink
added wavelets to mfcc and fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
PasoStudio73 committed Jul 26, 2024
1 parent 53e540e commit c58b66f
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 27 deletions.
6 changes: 3 additions & 3 deletions src/Audio911.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ include("structs/mfcc.jl")
export Mfcc, get_mfcc
export Deltas, get_deltas

include("structs/cwt.jl")
export Cwt, get_cwt

include("structs/spectral.jl")
export Spectral, get_spectrals

include("structs/f0.jl")
export F0, get_f0

include("structs/cwt.jl")
export Cwt, get_cwt

include("utils/histogram.jl")
include("utils/speech_detector.jl")
export speech_detector
Expand Down
2 changes: 1 addition & 1 deletion src/structs/cwt_fbank.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ function _get_cwtfb(;
end

cwt_fb.freq = (center_freq ./ scales) / (2π) .* cwt_fb.sr
cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank[:,2:end])))
cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank, 1), x_length - size(cwt_fb.fbank, 2)))

return cwt_fb
end
Expand Down
4 changes: 2 additions & 2 deletions src/structs/f0.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ function Base.display(f0::F0)
end

function get_f0(;
stft::Stft,
source::Stft,
kwargs...
)
_get_f0(x=stft.frames, win_length=stft.win_length, f0=F0(; sr=stft.sr, kwargs...))
_get_f0(x=source.frames, win_length=source.win_length, f0=F0(; sr=source.sr, kwargs...))
end
34 changes: 21 additions & 13 deletions src/structs/mfcc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@ end
# keyword constructor
function Mfcc(;
sr,
ncoeffs = 13,
nbands,
ncoeffs = nothing,
rectification = :log, # :log, :cubic_root
dither = false,
mfcc = nothing,
freq = nothing,
)
if isnothing(ncoeffs)
ncoeffs = round(Int, nbands / 2)
end

mfcc = Mfcc(
sr,
ncoeffs,
Expand Down Expand Up @@ -107,28 +112,25 @@ end
# Mfcc #
# ---------------------------------------------------------------------------- #
function _get_mfcc(;
mel_spec::AbstractArray{Float64},
mel_bands::Int64,
spec::AbstractArray{Float64},
nbands::Int64,
freq::AbstractVector{Float64},
mfcc::Mfcc,
)
# # Rectify
# mel_spec = deepcopy(data.mel_spectrogram')

mfcc.dither ? mel_spec[mel_spec .< 1e-8] .= 1e-8 : mel_spec[mel_spec .== 0] .= floatmin(Float64)
mfcc.dither ? spec[spec .< 1e-8] .= 1e-8 : spec[spec .== 0] .= floatmin(Float64)

# Design DCT matrix
DCTmatrix = create_DCT_matrix(mel_bands)
DCTmatrix = create_DCT_matrix(nbands)

# apply DCT matrix
if (mfcc.rectification == :log)
coeffs = DCTmatrix * log10.(mel_spec)
coeffs = DCTmatrix * log10.(spec)
elseif (mfcc.rectification == :cubic_root)
# apply DCT matrix
coeffs = DCTmatrix * mel_spec .^ (1 / 3)
coeffs = DCTmatrix * spec .^ (1 / 3)
else
@warn("Unknown $mfcc.rectification DCT matrix rectification, defaulting to log.")
coeffs = DCTmatrix * log10.(mel_spec)
coeffs = DCTmatrix * log10.(spec)
end

# reduce to mfcc coefficients
Expand Down Expand Up @@ -159,10 +161,16 @@ function Base.display(mfcc::Mfcc)
end

function get_mfcc(;
melspec::MelSpec,
source = nothing,
kwargs...
)
_get_mfcc(mel_spec=melspec.spec, mel_bands=melspec.nbands, freq=melspec.freq, mfcc=Mfcc(; sr=melspec.sr, kwargs...))
if source isa MelSpec
_get_mfcc(spec=source.spec, nbands=source.nbands, freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=source.nbands, kwargs...))
elseif source isa Cwt
_get_mfcc(spec=source.spec, nbands=length(source.freq), freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=length(source.freq), kwargs...))
else
error("Unsupported type for spec: typeof(source) = $(typeof(source))")
end
end

function _get_deltas(;
Expand Down
5 changes: 2 additions & 3 deletions src/structs/spectral.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,8 @@ end


function get_spectrals(;
stft::Stft,
source::Union{Stft, Cwt},
kwargs...
)
_get_spectrals(s=stft.spec, freq=stft.freq, spect=Spectral(; sr=stft.sr, kwargs...))
_get_spectrals(s=source.spec, freq=source.freq, spect=Spectral(; sr=source.sr, kwargs...))
end

16 changes: 11 additions & 5 deletions test/usage_examples.jl → test/usage_example.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Audio911
using Revise, Audio911

TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test")
TESTFILE = "common_voice_en_23616312.wav"
Expand All @@ -17,6 +17,10 @@ audio = load_audio(
norm=true,
);

# *** bear in mind that you can always display a plot of waht's going on *** #
# simply by typing "display(plot)" after the function call, like this:
# display(audio)

# if needed you can optionally perform vocal speech detection to cut silence and/or noise
audio = speech_detector(audio=audio);

Expand Down Expand Up @@ -57,11 +61,13 @@ melspec = get_melspec(
);

# compute mfcc
# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients.
# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients,
# exactly half of the number of filterbank coefficients.
# if you don't declare ncoeffs, it will be defaulted to half of the number of filterbank coefficients.
# we use a logarithmic rectification.
# the dithered mfcc is suggested, as it's more robust to noise. (taken from AudioFlux Python package)
mfcc = get_mfcc(
melspec=melspec,
source=melspec,
ncoeffs = 13,
rectification = :log, # :log, :cubic_root
dither = true,
Expand All @@ -70,7 +76,7 @@ mfcc = get_mfcc(
# take the fundamental frequency
# we use the NFC method, with a range of 50-400hz.
f0 = get_f0(
stft=stftspec,
source=stftspec,
method = :nfc,
freq_range = (50, 400)
);
Expand All @@ -90,7 +96,7 @@ f0 = get_f0(
# spect.spread
# also in this case, we use broadband frequency range.
spect = get_spectrals(
stft=stftspec,
source=stftspec,
freq_range = (0, round(Int, audio.sr / 2))
);

Expand Down
123 changes: 123 additions & 0 deletions test/wavelet_mfcc_example.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
using Revise, Audio911,

TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test")
TESTFILE = "common_voice_en_23616312.wav"
# TESTFILE = "104_1b1_Al_sc_Litt3200_4.wav"
wavfile = joinpath(TESTPATH, TESTFILE)

# ---------------------------------------------------------------------------- #
# continuous wavelets transform to feed mfcc and spectral features #
# ---------------------------------------------------------------------------- #
# load audiofile
# sample rate suggested for vocal analysis is 8000hz
# always good pratice to normalize the audio beforehand
audio = load_audio(
fname=wavfile,
sr=8000,
norm=true,
);

# *** bear in mind that you can always display a plot of waht's going on *** #
# simply by typing "display(plot)" after the function call, like this:
# display(audio)

# if needed you can optionally perform vocal speech detection to cut silence and/or noise
audio = speech_detector(audio=audio);

# compute the continuous wavelet transform filterbank
# as a starting point we suggest to use the bump wavelet
# the frequency range set starting at 80hz is due to wavelets filterbank algorithm:
# starting from 0hz, the filterbank is too compressed at low frequencies.
# our suggest is to avoid the low frequencies, and start at 80hz, for computational efficiency.
cwtfb = get_cwtfb(
audio=audio,
wavelet = :bump,
vpo = 10,
freq_range = (80, round(Int, audio.sr / 2))
);

# compute the continuous wavelet transform
# for further mel analysis we suggest to use a power normalization.
# we don't suggest to use a logarithmic scale, as it's intended to be used for visualization.
cwt = get_cwt(
audio=audio,
fbank=cwtfb,
norm = :power, # :power, :magnitude, :pow2mag
db_scale = false,
);

# now we are in frequency domain, and we already applied a filterbank, just like the mel spectrogram.
# but bear in mind that we are used to use a number of coefficients that is half of the number of mel bands,
# so we suggest to use the same proportion:
ncoeffs = round(Int, size(cwt.spec, 1) / 2)
# anyway you can skip that because if you don't declare ncoeffs,
# it will be defaulted to half of the number of filterbank coefficients.
mfcc = get_mfcc(
source=cwt,
ncoeffs=ncoeffs,
rectification=:log, # :log, :cubic_root
dither=true,
);

# now we should continue with fundamental frequency and spectral features
# because if we want a feature matrix, we need to have the same length for every feature.
# TODO resolve current hack in f0 algorithm.
cwtf0 = get_stft(
audio=audio,
stft_length=256,
win_type = (:hann, :periodic),
win_length=256,
overlap_length = 255,
norm = :power, # :none, :power, :magnitude, :pow2mag
);

f0 = get_f0(
source=cwtf0,
method = :nfc,
freq_range = (80, 400)
);
#[hack]
m = median(f0.f0)
f0.f0 = vcat(f0.f0, zeros(length(audio.data)-length(f0.f0)))

spect = get_spectrals(
source=cwt,
freq_range = (0, round(Int, audio.sr / 2))
);

# collect the features in a single Matrix
# case 1: we want rows as features and columns as frames
audio_features_rows = vcat(
cwt.spec,
mfcc.mfcc,
f0.f0',
spect.centroid',
spect.crest',
spect.entropy',
spect.flatness',
spect.flux',
spect.kurtosis',
spect.rolloff',
spect.skewness',
spect.decrease',
spect.slope',
spect.spread'
);

# case 2:we want colums as features and rows as frames
audio_features_cols = hcat(
cwt.spec',
mfcc.mfcc',
f0.f0,
spect.centroid,
spect.crest,
spect.entropy,
spect.flatness,
spect.flux,
spect.kurtosis,
spect.rolloff,
spect.skewness,
spect.decrease,
spect.slope,
spect.spread
);

0 comments on commit c58b66f

Please sign in to comment.