added wavelets to mfcc and fixes

aclai-lab · Jul 26, 2024 · c58b66f · c58b66f
1 parent 53e540e
commit c58b66f
Show file tree

Hide file tree

Showing 7 changed files with 163 additions and 27 deletions.
diff --git a/src/Audio911.jl b/src/Audio911.jl
@@ -48,15 +48,15 @@ include("structs/mfcc.jl")
 export Mfcc, get_mfcc
 export Deltas, get_deltas
 
+include("structs/cwt.jl")
+export Cwt, get_cwt
+
 include("structs/spectral.jl")
 export Spectral, get_spectrals
 
 include("structs/f0.jl")
 export F0, get_f0
 
-include("structs/cwt.jl")
-export Cwt, get_cwt
-
 include("utils/histogram.jl")
 include("utils/speech_detector.jl")
 export speech_detector

diff --git a/src/structs/cwt_fbank.jl b/src/structs/cwt_fbank.jl
@@ -224,7 +224,7 @@ function _get_cwtfb(;
     end
 
     cwt_fb.freq = (center_freq ./ scales) / (2π) .* cwt_fb.sr
-    cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank[:,2:end])))
+    cwt_fb.fbank = hcat(cwt_fb.fbank, zeros(size(cwt_fb.fbank, 1), x_length - size(cwt_fb.fbank, 2)))
 
     return cwt_fb
 end

diff --git a/src/structs/f0.jl b/src/structs/f0.jl
@@ -134,8 +134,8 @@ function Base.display(f0::F0)
 end
 
 function get_f0(;
-	stft::Stft,
+	source::Stft,
 	kwargs...
 )
-	_get_f0(x=stft.frames, win_length=stft.win_length, f0=F0(; sr=stft.sr, kwargs...))
+	_get_f0(x=source.frames, win_length=source.win_length, f0=F0(; sr=source.sr, kwargs...))
 end
diff --git a/src/structs/mfcc.jl b/src/structs/mfcc.jl
@@ -15,12 +15,17 @@ end
 # keyword constructor
 function Mfcc(;
 	sr,
-    ncoeffs = 13,
+    nbands,
+    ncoeffs = nothing,
     rectification = :log, # :log, :cubic_root
     dither = false,
 	mfcc = nothing,
 	freq = nothing,
 )
+    if isnothing(ncoeffs)
+        ncoeffs = round(Int, nbands / 2)
+    end
+
 	mfcc = Mfcc(
         sr,
         ncoeffs,
@@ -107,28 +112,25 @@ end
 #                                     Mfcc                                     #
 # ---------------------------------------------------------------------------- #
 function _get_mfcc(;
-        mel_spec::AbstractArray{Float64},
-        mel_bands::Int64,
+        spec::AbstractArray{Float64},
+        nbands::Int64,
         freq::AbstractVector{Float64},
         mfcc::Mfcc,
 )
-    # # Rectify
-    # mel_spec = deepcopy(data.mel_spectrogram')
-
-    mfcc.dither ? mel_spec[mel_spec .< 1e-8] .= 1e-8 : mel_spec[mel_spec .== 0] .= floatmin(Float64)
+    mfcc.dither ? spec[spec .< 1e-8] .= 1e-8 : spec[spec .== 0] .= floatmin(Float64)
 
     # Design DCT matrix
-    DCTmatrix = create_DCT_matrix(mel_bands)
+    DCTmatrix = create_DCT_matrix(nbands)
 
     # apply DCT matrix
     if (mfcc.rectification == :log)
-        coeffs = DCTmatrix * log10.(mel_spec)
+        coeffs = DCTmatrix * log10.(spec)
     elseif (mfcc.rectification == :cubic_root)
         # apply DCT matrix
-        coeffs = DCTmatrix * mel_spec .^ (1 / 3)
+        coeffs = DCTmatrix * spec .^ (1 / 3)
     else
         @warn("Unknown $mfcc.rectification DCT matrix rectification, defaulting to log.")
-        coeffs = DCTmatrix * log10.(mel_spec)
+        coeffs = DCTmatrix * log10.(spec)
     end
 
     # reduce to mfcc coefficients
@@ -159,10 +161,16 @@ function Base.display(mfcc::Mfcc)
 end
 
 function get_mfcc(;
-    melspec::MelSpec,
+    source = nothing,
     kwargs...
 )
-    _get_mfcc(mel_spec=melspec.spec, mel_bands=melspec.nbands, freq=melspec.freq, mfcc=Mfcc(; sr=melspec.sr, kwargs...))
+    if source isa MelSpec
+        _get_mfcc(spec=source.spec, nbands=source.nbands, freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=source.nbands, kwargs...))
+    elseif source isa Cwt
+        _get_mfcc(spec=source.spec, nbands=length(source.freq), freq=source.freq, mfcc=Mfcc(; sr=source.sr, nbands=length(source.freq), kwargs...))
+    else
+        error("Unsupported type for spec: typeof(source) = $(typeof(source))")
+    end
 end
 
 function _get_deltas(;

diff --git a/src/structs/spectral.jl b/src/structs/spectral.jl
@@ -226,9 +226,8 @@ end
 
 
 function get_spectrals(;
-    stft::Stft,
+    source::Union{Stft, Cwt},
     kwargs...
 )
-	_get_spectrals(s=stft.spec, freq=stft.freq, spect=Spectral(; sr=stft.sr, kwargs...))
+	_get_spectrals(s=source.spec, freq=source.freq, spect=Spectral(; sr=source.sr, kwargs...))
 end
-
diff --git a/test/usage_examples.jl → test/usage_example.jl b/test/usage_examples.jl → test/usage_example.jl
@@ -1,4 +1,4 @@
-using Audio911
+using Revise, Audio911
 
 TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test")
 TESTFILE = "common_voice_en_23616312.wav"
@@ -17,6 +17,10 @@ audio = load_audio(
     norm=true,
 );
 
+# *** bear in mind that you can always display a plot of waht's going on *** #
+# simply by typing "display(plot)" after the function call, like this:
+# display(audio)
+
 # if needed you can optionally perform vocal speech detection to cut silence and/or noise
 audio = speech_detector(audio=audio);
 
@@ -57,11 +61,13 @@ melspec =  get_melspec(
 );
 
 # compute mfcc
-# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients.
+# with a mel spectrogram composed of 26 bands, we suggest to use 13 coefficients,
+# exactly half of the number of filterbank coefficients.
+# if you don't declare ncoeffs, it will be defaulted to half of the number of filterbank coefficients.
 # we use a logarithmic rectification.
 # the dithered mfcc is suggested, as it's more robust to noise. (taken from AudioFlux Python package)
 mfcc = get_mfcc(
-    melspec=melspec,
+    source=melspec,
     ncoeffs = 13,
     rectification = :log, # :log, :cubic_root
     dither = true,
@@ -70,7 +76,7 @@ mfcc = get_mfcc(
 # take the fundamental frequency
 # we use the NFC method, with a range of 50-400hz.
 f0 = get_f0(
-    stft=stftspec,
+    source=stftspec,
     method = :nfc,
 	freq_range = (50, 400)
 );
@@ -90,7 +96,7 @@ f0 = get_f0(
 # spect.spread
 # also in this case, we use broadband frequency range.
 spect = get_spectrals(
-    stft=stftspec,
+    source=stftspec,
     freq_range = (0, round(Int, audio.sr / 2))
 );
 

diff --git a/test/wavelet_mfcc_example.jl b/test/wavelet_mfcc_example.jl
@@ -0,0 +1,123 @@
+using Revise, Audio911, 
+
+TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test")
+TESTFILE = "common_voice_en_23616312.wav"
+# TESTFILE = "104_1b1_Al_sc_Litt3200_4.wav"
+wavfile = joinpath(TESTPATH, TESTFILE)
+
+# ---------------------------------------------------------------------------- #
+#      continuous wavelets transform to feed mfcc and spectral features        #
+# ---------------------------------------------------------------------------- #
+# load audiofile
+# sample rate suggested for vocal analysis is 8000hz
+# always good pratice to normalize the audio beforehand
+audio = load_audio(
+    fname=wavfile, 
+    sr=8000, 
+    norm=true,
+);
+
+# *** bear in mind that you can always display a plot of waht's going on *** #
+# simply by typing "display(plot)" after the function call, like this:
+# display(audio)
+
+# if needed you can optionally perform vocal speech detection to cut silence and/or noise
+audio = speech_detector(audio=audio);
+
+# compute the continuous wavelet transform filterbank
+# as a starting point we suggest to use the bump wavelet
+# the frequency range set starting at 80hz is due to wavelets filterbank algorithm:
+# starting from 0hz, the filterbank is too compressed at low frequencies.
+# our suggest is to avoid the low frequencies, and start at 80hz, for computational efficiency.
+cwtfb = get_cwtfb(
+    audio=audio,
+    wavelet = :bump,
+	vpo = 10,
+    freq_range = (80, round(Int, audio.sr / 2))
+);
+
+# compute the continuous wavelet transform 
+# for further mel analysis we suggest to use a power normalization.
+# we don't suggest to use a logarithmic scale, as it's intended to be used for visualization.
+cwt = get_cwt(
+    audio=audio, 
+    fbank=cwtfb,
+    norm = :power, # :power, :magnitude, :pow2mag
+    db_scale = false,
+);
+
+# now we are in frequency domain, and we already applied a filterbank, just like the mel spectrogram.
+# but bear in mind that we are used to use a number of coefficients that is half of the number of mel bands,
+# so we suggest to use the same proportion:
+ncoeffs = round(Int, size(cwt.spec, 1) / 2)
+# anyway you can skip that because if you don't declare ncoeffs,
+# it will be defaulted to half of the number of filterbank coefficients.
+mfcc = get_mfcc(
+    source=cwt,
+    ncoeffs=ncoeffs,
+    rectification=:log, # :log, :cubic_root
+    dither=true,
+);
+
+# now we should continue with fundamental frequency and spectral features
+# because if we want a feature matrix, we need to have the same length for every feature.
+# TODO resolve current hack in f0 algorithm.
+cwtf0 = get_stft(
+    audio=audio, 
+    stft_length=256,
+	win_type = (:hann, :periodic),
+	win_length=256,
+	overlap_length = 255,
+	norm = :power, # :none, :power, :magnitude, :pow2mag
+);
+
+f0 = get_f0(
+    source=cwtf0,
+    method = :nfc,
+	freq_range = (80, 400)
+);
+#[hack]
+m = median(f0.f0)
+f0.f0 = vcat(f0.f0, zeros(length(audio.data)-length(f0.f0)))
+
+spect = get_spectrals(
+    source=cwt,
+    freq_range = (0, round(Int, audio.sr / 2))
+);
+
+# collect the features in a single Matrix
+# case 1: we want rows as features and columns as frames
+audio_features_rows = vcat(
+    cwt.spec,
+    mfcc.mfcc,
+    f0.f0',
+    spect.centroid',
+    spect.crest',
+    spect.entropy',
+    spect.flatness',
+    spect.flux',
+    spect.kurtosis',
+    spect.rolloff',
+    spect.skewness',
+    spect.decrease',
+    spect.slope',
+    spect.spread'
+);
+
+# case 2:we want colums as features and rows as frames
+audio_features_cols = hcat(
+    cwt.spec',
+    mfcc.mfcc',
+    f0.f0,
+    spect.centroid,
+    spect.crest,
+    spect.entropy,
+    spect.flatness,
+    spect.flux,
+    spect.kurtosis,
+    spect.rolloff,
+    spect.skewness,
+    spect.decrease,
+    spect.slope,
+    spect.spread
+);