From 456ffe0da8a824a764acedd4fa6e2d492cbdf2d4 Mon Sep 17 00:00:00 2001 From: paso Date: Mon, 29 Jul 2024 18:01:56 +0200 Subject: [PATCH] fixes --- src/Audio911.jl | 8 ++++---- src/structs/audio.jl | 16 ++++++++-------- src/structs/mfcc.jl | 8 ++++---- src/utils/speech_detector.jl | 2 ++ src/wavelets/wpdec.jl | 6 +++--- test/afe.jl | 26 +++++++++++++------------- test/features_test.jl | 4 ++-- test/usage_example.jl | 2 +- test/wavelet_mfcc_example.jl | 2 +- 9 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/Audio911.jl b/src/Audio911.jl index dccf5eb..1c0012e 100644 --- a/src/Audio911.jl +++ b/src/Audio911.jl @@ -16,12 +16,12 @@ function __init__() import librosa as librosa import soundfile as soundfile - def load_audio(fname, sr): - x, sr_def = librosa.load(fname, sr=sr, mono=True) + def load_audio(file, sr): + x, sr_def = librosa.load(file, sr=sr, mono=True) return x, sr_def - def save_audio(fname, x, sr): - soundfile.write(fname, x, samplerate=sr, subtype='PCM_16') + def save_audio(file, x, sr): + soundfile.write(file, x, samplerate=sr, subtype='PCM_16') """ end diff --git a/src/structs/audio.jl b/src/structs/audio.jl index 852d959..5c180a5 100644 --- a/src/structs/audio.jl +++ b/src/structs/audio.jl @@ -21,23 +21,23 @@ function Base.display(audio::Audio) end function load_audio(; - fname::Union{AbstractString, AbstractVector{Float64}}, + file::Union{AbstractString, AbstractVector{Float64}}, sr::Union{Nothing, Int64} = nothing, norm::Bool = false ) - if fname isa AbstractString + if file isa AbstractString audio = Audio( - py"load_audio"(fname, sr)... + py"load_audio"(file, sr)... ) - elseif fname isa AbstractVector{Float64} && sr isa Int64 - audio = Audio(fname, sr) + elseif file isa AbstractVector{Float64} && sr isa Int64 + audio = Audio(file, sr) else throw(ArgumentError("Invalid arguments")) end # normalize audio if norm && length(audio.data) != 0 - audio.data ./ maximum(abs.(audio.data)) + audio.data = audio.data ./ maximum(abs.(audio.data)) end return audio @@ -45,7 +45,7 @@ end function save_audio(; audio::Audio, - fname::AbstractString + file::AbstractString ) - py"save_audio"(fname, audio.data, audio.sr) + py"save_audio"(file, audio.data, audio.sr) end \ No newline at end of file diff --git a/src/structs/mfcc.jl b/src/structs/mfcc.jl index 3dfc9ef..d7fef8c 100644 --- a/src/structs/mfcc.jl +++ b/src/structs/mfcc.jl @@ -174,12 +174,12 @@ function get_mfcc(; end function _get_deltas(; - mfcc::AbstractArray{Float64}, + source::AbstractArray{Float64}, freq::AbstractVector{Float64}, deltas::Deltas, ) deltas.delta = audioDelta( - mfcc, deltas.d_length, deltas.d_matrix) + source, deltas.d_length, deltas.d_matrix) deltas.ddelta = audioDelta( deltas.delta, deltas.d_length, deltas.d_matrix) @@ -220,8 +220,8 @@ function Base.display(deltas::Deltas) end function get_deltas(; - mfcc::Mfcc, + source::Mfcc, kwargs... ) - _get_deltas(mfcc=mfcc.mfcc, freq=mfcc.freq, deltas=Deltas(; sr=mfcc.sr, kwargs...)) + _get_deltas(source=source.mfcc, freq=source.freq, deltas=Deltas(; sr=source.sr, kwargs...)) end diff --git a/src/utils/speech_detector.jl b/src/utils/speech_detector.jl index a4e7b74..25396ee 100644 --- a/src/utils/speech_detector.jl +++ b/src/utils/speech_detector.jl @@ -168,6 +168,8 @@ function _speech_detector(; norm=:magnitude ); + # stftspec.spec = (stftspec.spec ./ (0.5 * sum(stftspec.win)))./2 + # determine short term energy energy = vec(stftspec.win' .^ 2 * stftspec.frames .^ 2) diff --git a/src/wavelets/wpdec.jl b/src/wavelets/wpdec.jl index 91b43dd..ee4305e 100644 --- a/src/wavelets/wpdec.jl +++ b/src/wavelets/wpdec.jl @@ -98,9 +98,9 @@ end # function orthfilt function wfilters( wname::String ) - fname = match(r"(?:[a-zA-Z]+)", wname).match # estrapola le lettere tramite regex da wname, .match riconverte da regex a string - wcode = wname[length(fname)+1:end] # estrapola la parte numerica - i_fam = winfo[fname] # recupera i dati dal dizionario generale + file = match(r"(?:[a-zA-Z]+)", wname).match # estrapola le lettere tramite regex da wname, .match riconverte da regex a string + wcode = wname[length(file)+1:end] # estrapola la parte numerica + i_fam = winfo[file] # recupera i dati dal dizionario generale F = i_fam.coeff[wcode] lo_D, hi_D, lo_R, hi_R = orthfilt(F) diff --git a/test/afe.jl b/test/afe.jl index 914112c..2d8ee04 100644 --- a/test/afe.jl +++ b/test/afe.jl @@ -22,7 +22,7 @@ function audio911_extractor( stft_norm::Symbol=:power, # :power, :magnitude, :pow2mag # mel filterbank module nbands::Int64=26, - scale::Symbol=:mel_htk, # :mel_htk, :mel_slaney, :erb, :bark + scale::Symbol=:bark, # :mel_htk, :mel_slaney, :erb, :bark melfb_norm::Symbol=:bandwidth, # :bandwidth, :area, :none freq_range::Union{Tuple{Int64, Int64}, Nothing}=nothing, # mel spectrogram module @@ -31,9 +31,9 @@ function audio911_extractor( ncoeffs::Int64=13, rectification::Symbol=:log, # :log, :cubic_root dither::Bool=true, - # deltas module - d_length = 9, - d_matrix = :transposed, # :standard, :transposed + # # deltas module + # d_length = 9, + # d_matrix = :transposed, # :standard, :transposed # f0 module method::Symbol=:nfc, f0_range::Tuple{Int64, Int64}=(50, 400), @@ -42,7 +42,7 @@ function audio911_extractor( ) # audio module audio = load_audio( - fname=wavfile, + file=wavfile, sr=sr, norm=norm, ); @@ -96,12 +96,12 @@ function audio911_extractor( dither=dither, ); - # deltas module - deltas = get_deltas( - source=mfcc, - d_length=d_length, - d_matrix=d_matrix - ); + # # deltas module + # deltas = get_deltas( + # source=mfcc, + # d_length=d_length, + # d_matrix=d_matrix + # ); # f0 module f0 = get_f0( @@ -122,8 +122,8 @@ function audio911_extractor( return hcat( melspec.spec', mfcc.mfcc', - deltas.delta', - deltas.ddelta', + # deltas.delta', + # deltas.ddelta', f0.f0, spect.centroid, spect.crest, diff --git a/test/features_test.jl b/test/features_test.jl index 0d1d856..9a0a3b9 100644 --- a/test/features_test.jl +++ b/test/features_test.jl @@ -6,8 +6,8 @@ TESTFILE = "common_voice_en_23616312.wav" wavfile = joinpath(TESTPATH, TESTFILE) sr = 16000 -audio = load_audio(fname=wavfile); -audio = load_audio(fname=wavfile, sr=sr); +audio = load_audio(file=wavfile); +audio = load_audio(file=wavfile, sr=sr); display(audio) stftspec = get_stft(audio=audio); diff --git a/test/usage_example.jl b/test/usage_example.jl index 24b81b1..b0555bd 100644 --- a/test/usage_example.jl +++ b/test/usage_example.jl @@ -12,7 +12,7 @@ wavfile = joinpath(TESTPATH, TESTFILE) # sample rate suggested for vocal analysis is 8000hz # always good pratice to normalize the audio beforehand audio = load_audio( - fname=wavfile, + file=wavfile, sr=8000, norm=true, ); diff --git a/test/wavelet_mfcc_example.jl b/test/wavelet_mfcc_example.jl index 8aa1c34..c113bbd 100644 --- a/test/wavelet_mfcc_example.jl +++ b/test/wavelet_mfcc_example.jl @@ -12,7 +12,7 @@ wavfile = joinpath(TESTPATH, TESTFILE) # sample rate suggested for vocal analysis is 8000hz # always good pratice to normalize the audio beforehand audio = load_audio( - fname=wavfile, + file=wavfile, sr=8000, norm=true, );