fixes

aclai-lab · Jul 29, 2024 · 456ffe0 · 456ffe0
1 parent f1c91a9
commit 456ffe0
Show file tree

Hide file tree

Showing 9 changed files with 38 additions and 36 deletions.
diff --git a/src/Audio911.jl b/src/Audio911.jl
@@ -16,12 +16,12 @@ function __init__()
     import librosa as librosa
     import soundfile as soundfile
 
-    def load_audio(fname, sr):
-        x, sr_def = librosa.load(fname, sr=sr, mono=True)
+    def load_audio(file, sr):
+        x, sr_def = librosa.load(file, sr=sr, mono=True)
         return x, sr_def
 
-    def save_audio(fname, x, sr):
-        soundfile.write(fname, x, samplerate=sr, subtype='PCM_16')
+    def save_audio(file, x, sr):
+        soundfile.write(file, x, samplerate=sr, subtype='PCM_16')
     """
 end
 

diff --git a/src/structs/audio.jl b/src/structs/audio.jl
@@ -21,31 +21,31 @@ function Base.display(audio::Audio)
 end
 
 function load_audio(;
-    fname::Union{AbstractString, AbstractVector{Float64}},
+    file::Union{AbstractString, AbstractVector{Float64}},
     sr::Union{Nothing, Int64} = nothing,
 	norm::Bool = false
 )
-	if fname isa AbstractString
+	if file isa AbstractString
 		audio = Audio(
-			py"load_audio"(fname, sr)...
+			py"load_audio"(file, sr)...
 		)
-	elseif fname isa AbstractVector{Float64} && sr isa Int64
-		audio = Audio(fname, sr)
+	elseif file isa AbstractVector{Float64} && sr isa Int64
+		audio = Audio(file, sr)
 	else
 		throw(ArgumentError("Invalid arguments"))
 	end
 
 	# normalize audio
 	if norm && length(audio.data) != 0
-		audio.data ./ maximum(abs.(audio.data))
+		audio.data = audio.data ./ maximum(abs.(audio.data))
 	end
 
 	return audio
 end
 
 function save_audio(;
 	audio::Audio,
-    fname::AbstractString
+    file::AbstractString
 )
-    py"save_audio"(fname, audio.data, audio.sr)
+    py"save_audio"(file, audio.data, audio.sr)
 end
diff --git a/src/structs/mfcc.jl b/src/structs/mfcc.jl
@@ -174,12 +174,12 @@ function get_mfcc(;
 end
 
 function _get_deltas(;
-        mfcc::AbstractArray{Float64},
+        source::AbstractArray{Float64},
         freq::AbstractVector{Float64},
         deltas::Deltas,
 )
     deltas.delta = audioDelta(
-        mfcc, deltas.d_length, deltas.d_matrix)
+        source, deltas.d_length, deltas.d_matrix)
     deltas.ddelta = audioDelta(
         deltas.delta, deltas.d_length, deltas.d_matrix)
 
@@ -220,8 +220,8 @@ function Base.display(deltas::Deltas)
 end
 
 function get_deltas(;
-    mfcc::Mfcc,
+    source::Mfcc,
     kwargs...
 )
-    _get_deltas(mfcc=mfcc.mfcc, freq=mfcc.freq, deltas=Deltas(; sr=mfcc.sr, kwargs...))
+    _get_deltas(source=source.mfcc, freq=source.freq, deltas=Deltas(; sr=source.sr, kwargs...))
 end
diff --git a/src/utils/speech_detector.jl b/src/utils/speech_detector.jl
@@ -168,6 +168,8 @@ function _speech_detector(;
         norm=:magnitude
     );
 
+    # stftspec.spec = (stftspec.spec ./ (0.5 * sum(stftspec.win)))./2
+
     # determine short term energy
     energy = vec(stftspec.win' .^ 2 * stftspec.frames .^ 2)
 

diff --git a/src/wavelets/wpdec.jl b/src/wavelets/wpdec.jl
@@ -98,9 +98,9 @@ end # function orthfilt
 function wfilters(
     wname::String
 )
-    fname = match(r"(?:[a-zA-Z]+)", wname).match # estrapola le lettere tramite regex da wname, .match riconverte da regex a string
-    wcode = wname[length(fname)+1:end] # estrapola la parte numerica
-    i_fam = winfo[fname] # recupera i dati dal dizionario generale
+    file = match(r"(?:[a-zA-Z]+)", wname).match # estrapola le lettere tramite regex da wname, .match riconverte da regex a string
+    wcode = wname[length(file)+1:end] # estrapola la parte numerica
+    i_fam = winfo[file] # recupera i dati dal dizionario generale
 
     F = i_fam.coeff[wcode]
     lo_D, hi_D, lo_R, hi_R = orthfilt(F)

diff --git a/test/afe.jl b/test/afe.jl
@@ -22,7 +22,7 @@ function audio911_extractor(
     stft_norm::Symbol=:power,               # :power, :magnitude, :pow2mag
     # mel filterbank module
     nbands::Int64=26,
-    scale::Symbol=:mel_htk,                 # :mel_htk, :mel_slaney, :erb, :bark
+    scale::Symbol=:bark,                 # :mel_htk, :mel_slaney, :erb, :bark
     melfb_norm::Symbol=:bandwidth,          # :bandwidth, :area, :none
     freq_range::Union{Tuple{Int64, Int64}, Nothing}=nothing,
     # mel spectrogram module
@@ -31,9 +31,9 @@ function audio911_extractor(
     ncoeffs::Int64=13,
     rectification::Symbol=:log,             # :log, :cubic_root
     dither::Bool=true,
-    # deltas module
-    d_length = 9,
-    d_matrix = :transposed,                 # :standard, :transposed
+    # # deltas module
+    # d_length = 9,
+    # d_matrix = :transposed,                 # :standard, :transposed
     # f0 module
     method::Symbol=:nfc,
     f0_range::Tuple{Int64, Int64}=(50, 400),
@@ -42,7 +42,7 @@ function audio911_extractor(
 )
     # audio module
     audio = load_audio(
-        fname=wavfile, 
+        file=wavfile, 
         sr=sr, 
         norm=norm,
     );
@@ -96,12 +96,12 @@ function audio911_extractor(
         dither=dither,
     );
 
-    # deltas module
-    deltas = get_deltas(
-        source=mfcc,
-        d_length=d_length,
-        d_matrix=d_matrix
-    );
+    # # deltas module
+    # deltas = get_deltas(
+    #     source=mfcc,
+    #     d_length=d_length,
+    #     d_matrix=d_matrix
+    # );
 
     # f0 module
     f0 = get_f0(
@@ -122,8 +122,8 @@ function audio911_extractor(
     return hcat(
         melspec.spec',
         mfcc.mfcc',
-        deltas.delta',
-        deltas.ddelta',
+        # deltas.delta',
+        # deltas.ddelta',
         f0.f0,
         spect.centroid,
         spect.crest,

diff --git a/test/features_test.jl b/test/features_test.jl
@@ -6,8 +6,8 @@ TESTFILE = "common_voice_en_23616312.wav"
 wavfile = joinpath(TESTPATH, TESTFILE)
 
 sr = 16000
-audio = load_audio(fname=wavfile);
-audio = load_audio(fname=wavfile, sr=sr);
+audio = load_audio(file=wavfile);
+audio = load_audio(file=wavfile, sr=sr);
 display(audio)
 
 stftspec = get_stft(audio=audio);

diff --git a/test/usage_example.jl b/test/usage_example.jl
@@ -12,7 +12,7 @@ wavfile = joinpath(TESTPATH, TESTFILE)
 # sample rate suggested for vocal analysis is 8000 Hz
 # always good practice to normalize the audio beforehand
 audio = load_audio(
-    fname=wavfile, 
+    file=wavfile, 
     sr=8000, 
     norm=true,
 );

diff --git a/test/wavelet_mfcc_example.jl b/test/wavelet_mfcc_example.jl
@@ -12,7 +12,7 @@ wavfile = joinpath(TESTPATH, TESTFILE)
 # sample rate suggested for vocal analysis is 8000 Hz
 # always good practice to normalize the audio beforehand
 audio = load_audio(
-    fname=wavfile, 
+    file=wavfile, 
     sr=8000, 
     norm=true,
 );