From dfa1b6b2d4bce0ecc00e9cf29562a76ba0ed0be5 Mon Sep 17 00:00:00 2001
From: paso <paso.studio73@gmail.com>
Date: Sat, 27 Apr 2024 00:15:15 +0200
Subject: [PATCH] added profile in full_features, fixed speech detector

---
 src/audioFeaturesExtractor.jl |   2 +-
 src/fft/fft.jl                |   2 +-
 src/signalDataStructure.jl    |  82 ++++++++++----------
 src/utils/speech_detector.jl  | 137 +++++++++++++++++++++-------------
 src/windowing/windowing.jl    |  68 ++++++++---------
 test/cwt_features.jl          |  66 ----------------
 6 files changed, 163 insertions(+), 194 deletions(-)
 delete mode 100644 test/cwt_features.jl

diff --git a/src/audioFeaturesExtractor.jl b/src/audioFeaturesExtractor.jl
index d63f98b..2f4c2a9 100644
--- a/src/audioFeaturesExtractor.jl
+++ b/src/audioFeaturesExtractor.jl
@@ -13,7 +13,7 @@ audio.get_mel_spec()
 audio.get_mfcc()
 audio.get_spectrals()
 audio.get_f0()
-audio.get_features()
+audio.get_features(profile)   (profile: currently only :full is supported)
 
 #########################################################################################################
 2-utilizzare l'invocazione get_feature per ottenere le features separatamente
diff --git a/src/fft/fft.jl b/src/fft/fft.jl
index a109d94..221f185 100644
--- a/src/fft/fft.jl
+++ b/src/fft/fft.jl
@@ -28,7 +28,7 @@ function get_fft!(setup::AudioSetup, data::AudioData)
 
     setup.fft_length = setup.window_length # definisce la fft pari alla finestra
     hop_length = setup.window_length - setup.overlap_length
-    data.fft_window, unused = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2])
+    data.fft_window, _ = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2])
 
     # split in windows
     y = buffer(data.x, setup.window_length, hop_length)
diff --git a/src/signalDataStructure.jl b/src/signalDataStructure.jl
index c9304ed..06c7256 100644
--- a/src/signalDataStructure.jl
+++ b/src/signalDataStructure.jl
@@ -187,45 +187,49 @@ mutable struct AudioObj
         return self.data.fft
     end
 
-    function get_features(self::AudioObj)
-        if isempty(self.data.fft)
-            get_fft!(self.setup, self.data)
-        end
-        if isempty(self.data.mel_spectrogram)
-            get_mel_spec!(self.setup, self.data)
-        end
-        if isempty(self.data.mfcc_coeffs)
-            get_mfcc!(self.setup, self.data)
-            get_mfcc_deltas!(self.setup, self.data)
-        end
-        if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram)
-            lin_spectrogram!(self.setup, self.data)
-        end
-        if isempty(self.data.spectral_centroid)
-            get_spectrals!(self.setup, self.data)
-        end
-        if isempty(self.data.f0)
-            get_f0!(self.setup, self.data)
-        end
+    function get_features(self::AudioObj; profile::Symbol = :full) # profile selects the feature set; defaults to :full so zero-argument callers keep working
+        if profile == :full # only the :full profile is implemented
+            if isempty(self.data.fft) # each feature is computed only when its buffer is still empty
+                get_fft!(self.setup, self.data)
+            end
+            if isempty(self.data.mel_spectrogram)
+                get_mel_spec!(self.setup, self.data)
+            end
+            if isempty(self.data.mfcc_coeffs)
+                get_mfcc!(self.setup, self.data)
+                get_mfcc_deltas!(self.setup, self.data)
+            end
+            if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) # linear spectrogram is only computed when the spectral features are configured with :lin
+                lin_spectrogram!(self.setup, self.data)
+            end
+            if isempty(self.data.spectral_centroid)
+                get_spectrals!(self.setup, self.data)
+            end
+            if isempty(self.data.f0)
+                get_f0!(self.setup, self.data)
+            end
 
-        return vcat((
-            self.data.mel_spectrogram',
-            self.data.mfcc_coeffs',
-            self.data.mfcc_delta',
-            self.data.mfcc_deltadelta',
-            self.data.spectral_centroid',
-            self.data.spectral_crest',
-            self.data.spectral_decrease',
-            self.data.spectral_entropy',
-            self.data.spectral_flatness',
-            self.data.spectral_flux',
-            self.data.spectral_kurtosis',
-            self.data.spectral_rolloff',
-            self.data.spectral_skewness',
-            self.data.spectral_slope',
-            self.data.spectral_spread',
-            self.data.f0'
-        )...)
+            return vcat(( # vertically concatenate the transposed feature arrays into a single result
+                self.data.mel_spectrogram',
+                self.data.mfcc_coeffs',
+                self.data.mfcc_delta',
+                self.data.mfcc_deltadelta',
+                self.data.spectral_centroid',
+                self.data.spectral_crest',
+                self.data.spectral_decrease',
+                self.data.spectral_entropy',
+                self.data.spectral_flatness',
+                self.data.spectral_flux',
+                self.data.spectral_kurtosis',
+                self.data.spectral_rolloff',
+                self.data.spectral_skewness',
+                self.data.spectral_slope',
+                self.data.spectral_spread',
+                self.data.f0'
+            )...)
+        else
+            throw(ArgumentError("Unknown $profile profile.")) # fail loudly instead of logging and implicitly returning nothing
+        end
     end
 
     function AudioObj(setup::AudioSetup, data::AudioData)
@@ -237,7 +241,7 @@ mutable struct AudioObj
             () -> get_mfcc(obj),
             () -> get_spectrals(obj),
             () -> get_f0(obj),
-            () -> get_features(obj)
+            (x) -> get_features(obj; profile=x)
         )
         #   return obj
     end
diff --git a/src/utils/speech_detector.jl b/src/utils/speech_detector.jl
index cbd1de7..022de43 100644
--- a/src/utils/speech_detector.jl
+++ b/src/utils/speech_detector.jl
@@ -1,11 +1,7 @@
-# include("../windowing/windows.jl")
-# include("../windowing/windowing.jl")
-# include("../fft/spectral.jl")
-
 function moving_mean(
-    x::Vector{T},
-    w::Int64
-) where {T<:AbstractFloat}
+        x::Vector{Float64},
+        w::Int64
+)
     # w must be odd!
     x_length = size(x, 1)
     m = zeros(x_length)
@@ -19,10 +15,10 @@ function moving_mean(
 end
 
 function binpicker(
-    xmin::Float64,
-    xmax::Float64,
-    nbins::Int64,
-    raw_bins_width::Float64
+        xmin::Float64,
+        xmax::Float64,
+        nbins::Int64,
+        raw_bins_width::Float64
 )
     xscale = max(abs(xmin), abs(xmax))
     xrange = xmax - xmin
@@ -73,7 +69,8 @@ function binpicker(
         end
 
         nbins_actual = nbins
-        right_edge = min(max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64))
+        right_edge = min(
+            max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64))
         # end
 
     else # the data are nearly constant
@@ -101,18 +98,19 @@ function binpicker(
         # if binWidth overflows, don't worry about nice bin edges anymore
         edges = LinRange(left_edge, right_edge, nbins_actual + 1)
     else
-        edges = union(left_edge, left_edge .+ (1:nbins_actual-1) .* bin_width, right_edge)
-        step = round(minimum(diff(edges)), digits=8)
-        edges = range(edges[1], edges[end], step=step)
+        edges = union(
+            left_edge, left_edge .+ (1:(nbins_actual - 1)) .* bin_width, right_edge)
+        step = round(minimum(diff(edges)), digits = 8)
+        edges = range(edges[1], edges[end], step = step)
     end
 
     return edges
 end
 
 function histcounts(
-    feature::Vector{T},
-    hist_bins::Int64
-) where {T<:AbstractFloat}
+        feature::Vector{Float64},
+        hist_bins::Int64
+)
     edgestransposed = false
 
     xmin = minimum(feature)
@@ -122,12 +120,12 @@ function histcounts(
 
     edges = binpicker(xmin, xmax, hist_bins, raw_bins_width)
 
-    n, bin = histcountindices(feature, edges)
+    n, _ = histcountindices(feature, edges)
 
     return n, edges
 end
 
-function f_peaks(n::Vector{T}) where {T<:AbstractFloat}
+function f_peaks(n::Vector{T}) where {T <: AbstractFloat}
     z7 = zeros(Float64, 7)
     z8 = zeros(Float64, 8)
     z3 = zeros(Float64, 3)
@@ -137,7 +135,7 @@ function f_peaks(n::Vector{T}) where {T<:AbstractFloat}
     n[end] = 0
     temp = repeat([z3; n; z3], 6, 1)
 
-    b = all(reshape(nn .< temp, (Int(round(length(nn) / 6)), 6)), dims=2)
+    b = all(reshape(nn .< temp, (Int(round(length(nn) / 6)), 6)), dims = 2)
 
     peaks_idx = []
     for i in eachindex(b)
@@ -151,12 +149,12 @@ function f_peaks(n::Vector{T}) where {T<:AbstractFloat}
 end
 
 function get_threshs_from_feature(
-    feature::Vector{T},
-    bins::Int64,
-    type::Symbol,
-) where {T<:AbstractFloat}
+        feature::Vector{Float64},
+        bins::Int64,
+        type::Symbol
+)
     # get histogram
-    hist_bins = Int(round(length(feature) / bins))
+    hist_bins = round(Int, length(feature) / bins)
     # at leat 10 histogram
     hist_bins = max(10, hist_bins)
 
@@ -200,25 +198,53 @@ function get_threshs_from_feature(
     elseif length(peaks_idx) == 1
         eF0 = vcat(collect(edges_feature), 0)
         M1 = 0.5 * (vcat(0, collect(edges_feature)) .- eF0) + eF0
-        M1 = M1[peaks_idx.+1]
+        M1 = M1[peaks_idx .+ 1]
         M2 = minval
     else
         eF0 = vcat(collect(edges_feature), 0)
         AA = 0.5 * (vcat(0, collect(edges_feature)) .- eF0) + eF0
-        M2 = AA[peaks_idx[1]+1]
-        M1 = AA[peaks_idx[2]+1]
+        M2 = AA[peaks_idx[1] + 1]
+        M1 = AA[peaks_idx[2] + 1]
     end
 
     return M1, M2
 end
 
+function spectral_spread( # spectral spread of x, one value per column of its linear spectrogram
+        x::Vector{Float64}, # input signal samples
+        sr::Int64; # sample rate, forwarded to audio_features_obj
+        fft_length::Int64, # the keyword settings below are forwarded unchanged to audio_features_obj
+        window_length::Int64,
+        overlap_length::Int64,
+        window_norm::Bool = true,
+        spectrum_type::Symbol = :magnitude)
+    X = audio_features_obj( # build the feature object with the requested analysis settings
+        x, sr,
+        fft_length = fft_length,
+        window_length = window_length,
+        overlap_length = overlap_length,
+        window_norm = window_norm,
+        spectrum_type = spectrum_type
+    )
+    X.get_lin_spec() # invoked before X.data.lin_spectrogram is read below; presumably fills it — confirm
+
+    s, freq = X.data.lin_spectrogram', X.setup.lin_frequencies # s is the transposed spectrogram; presumably one column per frame — confirm
+
+    sum_x1 = vec(sum(s, dims = 1)) # column sums of s, used as the normalizer
+    spectral_centroid = vec(sum(s .* freq, dims = 1) ./ sum_x1') # s-weighted mean of freq for each column
+    spectral_centroid = replace!(spectral_centroid, NaN => 0) # NaN entries (presumably from all-zero columns) become 0
+    higher_moment_tmp = freq .- spectral_centroid' # deviation of every frequency from its column's centroid
+
+    spectral_spread = vec(sqrt.(sum((higher_moment_tmp .^ 2) .* s, dims = 1) ./ sum_x1')) # s-weighted standard deviation around the centroid; this local name shadows the function inside the body
+
+    return spectral_spread
+end
+
 function speech_detector(
-    x::AbstractVector{T},
-    sr::Int64
-) where {T<:AbstractFloat}
-    # window, unused = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2])
-    # la window la devo fare, perchè questa funzione è di utilità fatta prima del feature extraction
-    window, unused = gencoswin(:hann, Int(round(0.03 * sr)), :periodic)
+        x_in::AbstractVector{Float64},
+        sr::Int64;        #thresholds
+)
+    window, _ = gencoswin(:hann, Int(round(0.03 * sr)), :periodic)
     frame_length = size(window, 1)
     merge_distance = frame_length * 5
 
@@ -232,7 +258,13 @@ function speech_detector(
     #----------------------------------------------------------------------------------#
     #      step 1: extract short-term spectral spread and energy from whole signal     #
     #----------------------------------------------------------------------------------#
-    sig_max = maximum(abs.(x))
+    sig_max = maximum(abs.(x_in))
+
+    x = deepcopy(x_in)
+    # normalize
+    if sig_max > 0
+        x = x ./ sig_max
+    end
 
     # buffer signal
     frames = buffer(x, frame_length, frame_length)
@@ -240,22 +272,24 @@ function speech_detector(
     # determine short term energy
     energy = vec(window' .^ 2 * frames .^ 2)
     # filter the short term energy twice
-    filtered_energy = moving_mean(moving_mean(energy, smoothing_filter_length), smoothing_filter_length)
+    filtered_energy = moving_mean(
+        moving_mean(energy, smoothing_filter_length), smoothing_filter_length)
     # get spectral spread
     spec_spread = spectral_spread(
         x,
         sr,
-        fft_length=2 * frame_length,
-        window_length=frame_length,
-        overlap_length=0,
-        spectrum_type=:magnitude
+        fft_length = 2 * frame_length,
+        window_length = frame_length,
+        overlap_length = 0,
+        spectrum_type = :magnitude
     )
     # normalize the feature
     spec_spread = spec_spread / (sr / 2)
     # set spectral spread value to 0 for frames with low energy
-    spec_spread[energy.<spectral_spread_threshold] .= 0
+    spec_spread[energy .< spectral_spread_threshold] .= 0
     # filter spectral spread twice
-    filtered_spread = moving_mean(moving_mean(spec_spread, smoothing_filter_length), smoothing_filter_length)
+    filtered_spread = moving_mean(
+        moving_mean(spec_spread, smoothing_filter_length), smoothing_filter_length)
 
     #----------------------------------------------------------------------------------#
     #                        step 2: determine thresholds                              #
@@ -284,9 +318,9 @@ function speech_detector(
     frame_length_new = frame_length
 
     # change frames into data points
-    a = repeat(unbuff_out', outer=[frame_length_new, 1])
+    a = repeat(unbuff_out', outer = [frame_length_new, 1])
     unbuff_out_mask = [a[:]; falses(length(x) - length(a), 1)]
-    difference = diff([unbuff_out_mask; false], dims=1)
+    difference = diff([unbuff_out_mask; false], dims = 1)
 
     # find all changes from speech to silence; return index before change
     idx_m1 = findall(difference .== -1)
@@ -299,12 +333,11 @@ function speech_detector(
     else
         idx_p1 = findall(difference .== 1)
         idx_p1 = vcat(1, getindex.(idx_p1, 1))
-
     end
 
     # find gaps less than merge distance
     if length(idx_p1) > 1
-        testmask = idx_p1[2:end] .- idx_m1[1:length(idx_p1)-1] .<= merge_distance
+        testmask = idx_p1[2:end] .- idx_m1[1:(length(idx_p1) - 1)] .<= merge_distance
     else
         testmask = falses(0, 1)
     end
@@ -315,15 +348,19 @@ function speech_detector(
     else
         # arrange output
         idx_p2 = idx_p1[2:end, :]
-        idx_m2 = idx_m1[1:length(idx_p1)-1, :]
+        idx_m2 = idx_m1[1:(length(idx_p1) - 1), :]
         amask = .!testmask
         outidx = reshape([idx_p1[1]; idx_p2[amask]; idx_m2[amask]; idx_m1[end]], :, 2)
     end
 
     y = []
     for i in eachrow(outidx)
-        y = [y; x[i[1]:i[2]]]
+        y = [y; x_in[i[1]:i[2]]]
     end
 
-    return Float64.(y)
+    return Float64.(y), outidx
+end
+
+function speech_detector(x_in::AbstractVector{<:AbstractFloat}, sr::Int64) # convenience method accepting any AbstractFloat element type
+    speech_detector(Float64.(x_in), sr) # convert the samples to Float64 and call speech_detector again with the converted vector
+end
\ No newline at end of file
diff --git a/src/windowing/windowing.jl b/src/windowing/windowing.jl
index 8c54741..e057a39 100644
--- a/src/windowing/windowing.jl
+++ b/src/windowing/windowing.jl
@@ -1,22 +1,18 @@
 # include("windows.jl")
 
 function buffer(
-    x::Union{AbstractVector{T},AbstractArray{T}},
-    windowLength::Int,
-    hopLength::Int
-) where {T<:Real}
-
-    xLength = size(x, 1)
-    numChan = size(x, 2)
-    numHops = Int(floor((xLength - windowLength) / hopLength) + 1)
-
-    y = zeros(eltype(x), windowLength, numHops * numChan)
-
-    for c = 1:numChan
-        for nH = 1:numHops
-            for w = 1:windowLength
-                y[w, nH+(c-1)*numHops] = x[w+hopLength*(nH-1), c]
-            end
+        x::AbstractVector{Float64},
+        window_length::Int64,
+        hop_length::Int64
+)
+    x_length = size(x, 1)
+    num_hops = floor(Int, (x_length - window_length) / hop_length) + 1
+
+    y = zeros(Float64, window_length, num_hops)
+
+    for j in 1:num_hops
+        for i in 1:window_length
+            y[i, j] = x[i + hop_length * (j - 1)]
         end
     end
 
@@ -24,29 +20,28 @@ function buffer(
 end # function buffer
 
 function logEnergyCoeffs(
-    x::AbstractArray{T}
-) where {T<:Real}
-
+        x::AbstractArray{T} # input array; energy is accumulated along the first dimension
+) where {T <: Real}
     DT = eltype(x)
-    E = sum(x .^ 2, dims=1) # eleva tutti gli elementi ^2 e li somma per colonna
-    E[E.==0] .= floatmin(DT) # se un valore è zero, lo sostituisce col valore più piccolo positivo possibile, in accordo col tipo utilizzato
+    E = sum(x .^ 2, dims = 1) # square every element and sum along dimension 1 (per column)
+    E[E .== 0] .= floatmin(DT) # replace zero energies with the smallest positive normal value of the element type so the log below stays finite; NOTE(review): floatmin presumably needs a floating-point DT — confirm integer input is never passed
     logE = log.(E) # fa il log di tutti gli elementi
 end # logEnergyCoeffs
 
 function windowing(
-    x::Union{AbstractVector{T},AbstractArray{T}},
-    fftLength::Int64=256,
-    winType::Symbol=:hann,
-    winParam::Symbol=:symmetric,
-    logEnergy::Bool=false
-) where {T<:Real}
-
+        x::Union{AbstractVector{T}, AbstractArray{T}},
+        fftLength::Int64 = 256,
+        winType::Symbol = :hann,
+        winParam::Symbol = :symmetric,
+        logEnergy::Bool = false
+) where {T <: Real}
     xLength = size(x, 1) # lunghezza audio
     nChan = size(x, 2) # numero canali (mono, stereo)
     DT = eltype(x) # restituisce il tipo degli elementi
 
     # parto con un if then ma sarebbe bello implementare un Dict
-    if (winType == :hann || winType == :hamming || winType == :blackman || winType == :flattopwin)
+    if (winType == :hann || winType == :hamming || winType == :blackman ||
+        winType == :flattopwin)
         win, hL = gencoswin(winType, fftLength, winParam)
     elseif (winType == :rect)
         win, hL = rectwin(fftLength)
@@ -65,11 +60,10 @@ function windowing(
 end # function windowing
 
 function fade(
-    x::Union{AbstractVector{T},AbstractArray{T}},
-    fftLength::Int64,
-    type::Symbol
-) where {T<:Real}
-
+        x::Union{AbstractVector{T}, AbstractArray{T}},
+        fftLength::Int64,
+        type::Symbol
+) where {T <: Real}
     xLength = size(x, 1) # lunghezza audio
     nChan = size(x, 2) # numero canali (mono, stereo)
     DT = eltype(x) # restituisce il tipo degli elementi
@@ -80,12 +74,12 @@ function fade(
 
     # for c = 1:numChan
     if (type == :in)
-        for w = 1:Int(round(fftLength / 2))
+        for w in 1:Int(round(fftLength / 2))
             x[w] *= wincast[w]
         end
     elseif (type == :out)
-        for w = Int(round(fftLength / 2)):-1:fftLength
-            x[end-fftLength+w] *= wincast[w]
+        for w in Int(round(fftLength / 2)):-1:fftLength
+            x[end - fftLength + w] *= wincast[w]
         end
     end
     # end
diff --git a/test/cwt_features.jl b/test/cwt_features.jl
deleted file mode 100644
index a7c6dd9..0000000
--- a/test/cwt_features.jl
+++ /dev/null
@@ -1,66 +0,0 @@
-using Audio911
-
-using SpecialFunctions
-using Statistics, Roots
-using FFTW
-using Parameters
-using Plots
-include("/home/paso/.julia/dev/Audio911.jl/src/wavelet/cwt.jl")
-
-TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test")
-
-sr_setup = 8000
-x, sr = load_audio("$TESTPATH/common_voice_en_23616312.wav", sr=sr_setup)
-
-window_length = 256
-frequency_range=(80, 3000)
-# mel_bands = 26
-# num_coeffs = 13
-
-setup = AudioSetup(
-    sr=sr,
-    # fft
-    window_type=(:hann, :periodic),
-    window_length=window_length,
-    overlap_length=Int(round(window_length * 0.500)),
-    window_norm=true,
-    # spectrum
-    frequency_range=frequency_range,
-    spectrum_type=:power, # :power, :magnitude
-    # mel
-    mel_style=:htk, # :htk, :slaney
-    mel_bands=mel_bands,
-    filterbank_design_domain=:linear,
-    filterbank_normalization=:bandwidth, # :bandwidth, :area, :none
-    frequency_scale=:mel,
-    # mfcc
-    num_coeffs=num_coeffs,
-    normalization_type=:dithered, # :standard, :dithered
-    rectification=:log,
-    log_energy_source=:standard, # :standard (after windowing), :mfcc
-    log_energy_pos=:none, #:append, :replace, :none
-    delta_window_length=9,
-    delta_matrix=:standard, # :standard, :transposed
-    # spectral
-    spectral_spectrum=:mel # :linear, :linear_focused, :mel
-)
-
-data = AudioData(
-    x=Float64.(x)
-)
-
-get_fft!(data, setup)
-
-cwt_spectrum, _ = cwt(data.x, setup.sr, frequency_range=(80,3000))
-cwt_spectrum = abs.(cwt_spectrum')
-
-data.mel_spectrogram = cwt_windowing(cwt_spectrum, 32)
-setup.mel_bands = setup.num_coeffs = size(data.mel_spectrogram, 2)
-
-# mel_spectrogram(data, setup)
-_mfcc(data, setup)
-# lin_spectrogram(data, setup)
-spectral_features(data, setup)
-f0(data, setup)
-
-# setup.frequency_range = (80, 1000)
\ No newline at end of file