From c58b66f45a071cc0507f9576a69bdc914beabfd9 Mon Sep 17 00:00:00 2001 From: paso Date: Sat, 27 Jul 2024 00:48:21 +0200 Subject: [PATCH] added wavelets to mfcc and fixes --- src/Audio911.jl | 6 +- src/structs/cwt_fbank.jl | 2 +- src/structs/f0.jl | 4 +- src/structs/mfcc.jl | 34 +++-- src/structs/spectral.jl | 5 +- test/{usage_examples.jl => usage_example.jl} | 16 ++- test/wavelet_mfcc_example.jl | 123 +++++++++++++++++++ 7 files changed, 163 insertions(+), 27 deletions(-) rename test/{usage_examples.jl => usage_example.jl} (89%) create mode 100644 test/wavelet_mfcc_example.jl diff --git a/src/Audio911.jl b/src/Audio911.jl index 866cb9c..dccf5eb 100644 --- a/src/Audio911.jl +++ b/src/Audio911.jl @@ -48,15 +48,15 @@ include("structs/mfcc.jl") export Mfcc, get_mfcc export Deltas, get_deltas +include("structs/cwt.jl") +export Cwt, get_cwt + include("structs/spectral.jl") export Spectral, get_spectrals include("structs/f0.jl") export F0, get_f0 -include("structs/cwt.jl") -export Cwt, get_cwt - include("utils/histogram.jl") include("utils/speech_detector.jl") export speech_detector diff --git a/src/structs/cwt_fbank.jl b/src/structs/cwt_fbank.jl index c2a7f30..4461bfd 100644 --- a/src/structs/cwt_fbank.jl +++ b/src/structs/cwt_fbank.jl @@ -224,7 +224,7 @@ function _get_cwtfb(; end cwt_fb.freq = (center_freq ./ scales) / (2π) .* cwt_fb.sr - cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank[:,2:end]))) + cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank, 1), x_length - size(cwt_fb.fbank, 2))) return cwt_fb end diff --git a/src/structs/f0.jl b/src/structs/f0.jl index d8df0c7..d839741 100644 --- a/src/structs/f0.jl +++ b/src/structs/f0.jl @@ -134,8 +134,8 @@ function Base.display(f0::F0) end function get_f0(; - stft::Stft, + source::Stft, kwargs... ) - _get_f0(x=stft.frames, win_length=stft.win_length, f0=F0(; sr=stft.sr, kwargs...)) + _get_f0(x=source.frames, win_length=source.win_length, f0=F0(; sr=source.sr, kwargs...)) end \ No newline at end of file diff --git a/src/structs/mfcc.jl b/src/structs/mfcc.jl index d90746f..3dfc9ef 100644 --- a/src/structs/mfcc.jl +++ b/src/structs/mfcc.jl @@ -15,12 +15,17 @@ end # keyword constructor function Mfcc(; sr, - ncoeffs = 13, + nbands, + ncoeffs = nothing, rectification = :log, # :log, :cubic_root dither = false, mfcc = nothing, freq = nothing, ) + if isnothing(ncoeffs) + ncoeffs = round(Int, nbands / 2) + end + mfcc = Mfcc( sr, ncoeffs, @@ -107,28 +112,25 @@ end # Mfcc # # ---------------------------------------------------------------------------- # function _get_mfcc(; - mel_spec::AbstractArray{Float64}, - mel_bands::Int64, + spec::AbstractArray{Float64}, + nbands::Int64, freq::AbstractVector{Float64}, mfcc::Mfcc, ) - # # Rectify - # mel_spec = deepcopy(data.mel_spectrogram') - - mfcc.dither ? mel_spec[mel_spec .< 1e-8] .= 1e-8 : mel_spec[mel_spec .== 0] .= floatmin(Float64) + mfcc.dither ? spec[spec .< 1e-8] .= 1e-8 : spec[spec .== 0] .= floatmin(Float64) # Design DCT matrix - DCTmatrix = create_DCT_matrix(mel_bands) + DCTmatrix = create_DCT_matrix(nbands) # apply DCT matrix if (mfcc.rectification == :log) - coeffs = DCTmatrix * log10.(mel_spec) + coeffs = DCTmatrix * log10.(spec) elseif (mfcc.rectification == :cubic_root) # apply DCT matrix - coeffs = DCTmatrix * mel_spec .^ (1 / 3) + coeffs = DCTmatrix * spec .^ (1 / 3) else @warn("Unknown $mfcc.rectification DCT matrix rectification, defaulting to log.") - coeffs = DCTmatrix * log10.(mel_spec) + coeffs = DCTmatrix * log10.(spec) end # reduce to mfcc coefficients @@ -159,10 +161,16 @@ function Base.display(mfcc::Mfcc) end function get_mfcc(; - melspec::MelSpec, + source = nothing, kwargs... ) - _get_mfcc(mel_spec=melspec.spec, mel_bands=melspec.nbands, freq=melspec.freq, mfcc=Mfcc(; sr=melspec.sr, kwargs...)) + if source isa MelSpec + _get_mfcc(spec=source.spec, nbands=source.nbands, freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=source.nbands, kwargs...)) + elseif source isa Cwt + _get_mfcc(spec=source.spec, nbands=length(source.freq), freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=length(source.freq), kwargs...)) + else + error("Unsupported type for spec: typeof(source) = $(typeof(source))") + end end function _get_deltas(; diff --git a/src/structs/spectral.jl b/src/structs/spectral.jl index 36125cf..df31e27 100644 --- a/src/structs/spectral.jl +++ b/src/structs/spectral.jl @@ -226,9 +226,8 @@ end function get_spectrals(; - stft::Stft, + source::Union{Stft, Cwt}, kwargs... ) - _get_spectrals(s=stft.spec, freq=stft.freq, spect=Spectral(; sr=stft.sr, kwargs...)) + _get_spectrals(s=source.spec, freq=source.freq, spect=Spectral(; sr=source.sr, kwargs...)) end - diff --git a/test/usage_examples.jl b/test/usage_example.jl similarity index 89% rename from test/usage_examples.jl rename to test/usage_example.jl index 0ff2444..6eac3bc 100644 --- a/test/usage_examples.jl +++ b/test/usage_example.jl @@ -1,4 +1,4 @@ -using Audio911 +using Revise, Audio911 TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test") TESTFILE = "common_voice_en_23616312.wav" @@ -17,6 +17,10 @@ audio = load_audio( norm=true, ); +# *** bear in mind that you can always display a plot of waht's going on *** # +# simply by typing "display(plot)" after the function call, like this: +# display(audio) + # if needed you can optionally perform vocal speech detection to cut silence and/or noise audio = speech_detector(audio=audio); @@ -57,11 +61,13 @@ melspec = get_melspec( ); # compute mfcc -# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients. +# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients, +# exactly half of the number of filterbank coefficients. +# if you don't declare ncoeffs, it will be defaulted to half of the number of filterbank coefficients. # we use a logarithmic rectification. # the dithered mfcc is suggested, as it's more robust to noise. (taken from AudioFlux Python package) mfcc = get_mfcc( - melspec=melspec, + source=melspec, ncoeffs = 13, rectification = :log, # :log, :cubic_root dither = true, @@ -70,7 +76,7 @@ mfcc = get_mfcc( # take the fundamental frequency # we use the NFC method, with a range of 50-400hz. f0 = get_f0( - stft=stftspec, + source=stftspec, method = :nfc, freq_range = (50, 400) ); @@ -90,7 +96,7 @@ f0 = get_f0( # spect.spread # also in this case, we use broadband frequency range. spect = get_spectrals( - stft=stftspec, + source=stftspec, freq_range = (0, round(Int, audio.sr / 2)) ); diff --git a/test/wavelet_mfcc_example.jl b/test/wavelet_mfcc_example.jl new file mode 100644 index 0000000..8aa1c34 --- /dev/null +++ b/test/wavelet_mfcc_example.jl @@ -0,0 +1,123 @@ +using Revise, Audio911, + +TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test") +TESTFILE = "common_voice_en_23616312.wav" +# TESTFILE = "104_1b1_Al_sc_Litt3200_4.wav" +wavfile = joinpath(TESTPATH, TESTFILE) + +# ---------------------------------------------------------------------------- # +# continuous wavelets transform to feed mfcc and spectral features # +# ---------------------------------------------------------------------------- # +# load audiofile +# sample rate suggested for vocal analysis is 8000hz +# always good pratice to normalize the audio beforehand +audio = load_audio( + fname=wavfile, + sr=8000, + norm=true, +); + +# *** bear in mind that you can always display a plot of waht's going on *** # +# simply by typing "display(plot)" after the function call, like this: +# display(audio) + +# if needed you can optionally perform vocal speech detection to cut silence and/or noise +audio = speech_detector(audio=audio); + +# compute the continuous wavelet transform filterbank +# as a starting point we suggest to use the bump wavelet +# the frequency range set starting at 80hz is due to wavelets filterbank algorithm: +# starting from 0hz, the filterbank is too compressed at low frequencies. +# our suggest is to avoid the low frequencies, and start at 80hz, for computational efficiency. +cwtfb = get_cwtfb( + audio=audio, + wavelet = :bump, + vpo = 10, + freq_range = (80, round(Int, audio.sr / 2)) +); + +# compute the continuous wavelet transform +# for further mel analysis we suggest to use a power normalization. +# we don't suggest to use a logarithmic scale, as it's intended to be used for visualization. +cwt = get_cwt( + audio=audio, + fbank=cwtfb, + norm = :power, # :power, :magnitude, :pow2mag + db_scale = false, +); + +# now we are in frequency domain, and we already applied a filterbank, just like the mel spectrogram. +# but bear in mind that we are used to use a number of coefficients that is half of the number of mel bands, +# so we suggest to use the same proportion: +ncoeffs = round(Int, size(cwt.spec, 1) / 2) +# anyway you can skip that because if you don't declare ncoeffs, +# it will be defaulted to half of the number of filterbank coefficients. +mfcc = get_mfcc( + source=cwt, + ncoeffs=ncoeffs, + rectification=:log, # :log, :cubic_root + dither=true, +); + +# now we should continue with fundamental frequency and spectral features +# because if we want a feature matrix, we need to have the same length for every feature. +# TODO resolve current hack in f0 algorithm. +cwtf0 = get_stft( + audio=audio, + stft_length=256, + win_type = (:hann, :periodic), + win_length=256, + overlap_length = 255, + norm = :power, # :none, :power, :magnitude, :pow2mag +); + +f0 = get_f0( + source=cwtf0, + method = :nfc, + freq_range = (80, 400) +); +#[hack] +m = median(f0.f0) +f0.f0 = vcat(f0.f0, zeros(length(audio.data)-length(f0.f0))) + +spect = get_spectrals( + source=cwt, + freq_range = (0, round(Int, audio.sr / 2)) +); + +# collect the features in a single Matrix +# case 1: we want rows as features and columns as frames +audio_features_rows = vcat( + cwt.spec, + mfcc.mfcc, + f0.f0', + spect.centroid', + spect.crest', + spect.entropy', + spect.flatness', + spect.flux', + spect.kurtosis', + spect.rolloff', + spect.skewness', + spect.decrease', + spect.slope', + spect.spread' +); + +# case 2:we want colums as features and rows as frames +audio_features_cols = hcat( + cwt.spec', + mfcc.mfcc', + f0.f0, + spect.centroid, + spect.crest, + spect.entropy, + spect.flatness, + spect.flux, + spect.kurtosis, + spect.rolloff, + spect.skewness, + spect.decrease, + spect.slope, + spect.spread +); \ No newline at end of file