From dfa1b6b2d4bce0ecc00e9cf29562a76ba0ed0be5 Mon Sep 17 00:00:00 2001 From: paso Date: Sat, 27 Apr 2024 00:15:15 +0200 Subject: [PATCH] added profile in full_features, fixed speech detector --- src/audioFeaturesExtractor.jl | 2 +- src/fft/fft.jl | 2 +- src/signalDataStructure.jl | 82 ++++++++++---------- src/utils/speech_detector.jl | 137 +++++++++++++++++++++------------- src/windowing/windowing.jl | 68 ++++++++--------- test/cwt_features.jl | 66 ---------------- 6 files changed, 163 insertions(+), 194 deletions(-) delete mode 100644 test/cwt_features.jl diff --git a/src/audioFeaturesExtractor.jl b/src/audioFeaturesExtractor.jl index d63f98b..2f4c2a9 100644 --- a/src/audioFeaturesExtractor.jl +++ b/src/audioFeaturesExtractor.jl @@ -13,7 +13,7 @@ audio.get_mel_spec() audio.get_mfcc() audio.get_spectrals() audio.get_f0() -audio.get_features() +audio.get_features(profile, per ora solo :full) ######################################################################################################### 2-utilizzare l'invocazione get_feature per ottenere le features separatamente diff --git a/src/fft/fft.jl b/src/fft/fft.jl index a109d94..221f185 100644 --- a/src/fft/fft.jl +++ b/src/fft/fft.jl @@ -28,7 +28,7 @@ function get_fft!(setup::AudioSetup, data::AudioData) setup.fft_length = setup.window_length # definisce la fft pari alla finestra hop_length = setup.window_length - setup.overlap_length - data.fft_window, unused = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2]) + data.fft_window, _ = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2]) # split in windows y = buffer(data.x, setup.window_length, hop_length) diff --git a/src/signalDataStructure.jl b/src/signalDataStructure.jl index c9304ed..06c7256 100644 --- a/src/signalDataStructure.jl +++ b/src/signalDataStructure.jl @@ -187,45 +187,49 @@ mutable struct AudioObj return self.data.fft end - function get_features(self::AudioObj) - if isempty(self.data.fft) - get_fft!(self.setup, self.data) - end - if isempty(self.data.mel_spectrogram) - get_mel_spec!(self.setup, self.data) - end - if isempty(self.data.mfcc_coeffs) - get_mfcc!(self.setup, self.data) - get_mfcc_deltas!(self.setup, self.data) - end - if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) - lin_spectrogram!(self.setup, self.data) - end - if isempty(self.data.spectral_centroid) - get_spectrals!(self.setup, self.data) - end - if isempty(self.data.f0) - get_f0!(self.setup, self.data) - end + function get_features(self::AudioObj; profile::Symbol) + if profile == :full + if isempty(self.data.fft) + get_fft!(self.setup, self.data) + end + if isempty(self.data.mel_spectrogram) + get_mel_spec!(self.setup, self.data) + end + if isempty(self.data.mfcc_coeffs) + get_mfcc!(self.setup, self.data) + get_mfcc_deltas!(self.setup, self.data) + end + if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) + lin_spectrogram!(self.setup, self.data) + end + if isempty(self.data.spectral_centroid) + get_spectrals!(self.setup, self.data) + end + if isempty(self.data.f0) + get_f0!(self.setup, self.data) + end - return vcat(( - self.data.mel_spectrogram', - self.data.mfcc_coeffs', - self.data.mfcc_delta', - self.data.mfcc_deltadelta', - self.data.spectral_centroid', - self.data.spectral_crest', - self.data.spectral_decrease', - self.data.spectral_entropy', - self.data.spectral_flatness', - self.data.spectral_flux', - self.data.spectral_kurtosis', - self.data.spectral_rolloff', - self.data.spectral_skewness', - self.data.spectral_slope', - self.data.spectral_spread', - self.data.f0' - )...) + return vcat(( + self.data.mel_spectrogram', + self.data.mfcc_coeffs', + self.data.mfcc_delta', + self.data.mfcc_deltadelta', + self.data.spectral_centroid', + self.data.spectral_crest', + self.data.spectral_decrease', + self.data.spectral_entropy', + self.data.spectral_flatness', + self.data.spectral_flux', + self.data.spectral_kurtosis', + self.data.spectral_rolloff', + self.data.spectral_skewness', + self.data.spectral_slope', + self.data.spectral_spread', + self.data.f0' + )...) + else + @error("Unknown $profile profile.") + end end function AudioObj(setup::AudioSetup, data::AudioData) @@ -237,7 +241,7 @@ mutable struct AudioObj () -> get_mfcc(obj), () -> get_spectrals(obj), () -> get_f0(obj), - () -> get_features(obj) + (x) -> get_features(obj; profile=x) ) # return obj end diff --git a/src/utils/speech_detector.jl b/src/utils/speech_detector.jl index cbd1de7..022de43 100644 --- a/src/utils/speech_detector.jl +++ b/src/utils/speech_detector.jl @@ -1,11 +1,7 @@ -# include("../windowing/windows.jl") -# include("../windowing/windowing.jl") -# include("../fft/spectral.jl") - function moving_mean( - x::Vector{T}, - w::Int64 -) where {T<:AbstractFloat} + x::Vector{Float64}, + w::Int64 +) # w must be odd! x_length = size(x, 1) m = zeros(x_length) @@ -19,10 +15,10 @@ function moving_mean( end function binpicker( - xmin::Float64, - xmax::Float64, - nbins::Int64, - raw_bins_width::Float64 + xmin::Float64, + xmax::Float64, + nbins::Int64, + raw_bins_width::Float64 ) xscale = max(abs(xmin), abs(xmax)) xrange = xmax - xmin @@ -73,7 +69,8 @@ function binpicker( end nbins_actual = nbins - right_edge = min(max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64)) + right_edge = min( + max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64)) # end else # the data are nearly constant @@ -101,18 +98,19 @@ function binpicker( # if binWidth overflows, don't worry about nice bin edges anymore edges = LinRange(left_edge, right_edge, nbins_actual + 1) else - edges = union(left_edge, left_edge .+ (1:nbins_actual-1) .* bin_width, right_edge) - step = round(minimum(diff(edges)), digits=8) - edges = range(edges[1], edges[end], step=step) + edges = union( + left_edge, left_edge .+ (1:(nbins_actual - 1)) .* bin_width, right_edge) + step = round(minimum(diff(edges)), digits = 8) + edges = range(edges[1], edges[end], step = step) end return edges end function histcounts( - feature::Vector{T}, - hist_bins::Int64 -) where {T<:AbstractFloat} + feature::Vector{Float64}, + hist_bins::Int64 +) edgestransposed = false xmin = minimum(feature) @@ -122,12 +120,12 @@ function histcounts( edges = binpicker(xmin, xmax, hist_bins, raw_bins_width) - n, bin = histcountindices(feature, edges) + n, _ = histcountindices(feature, edges) return n, edges end -function f_peaks(n::Vector{T}) where {T<:AbstractFloat} +function f_peaks(n::Vector{T}) where {T <: AbstractFloat} z7 = zeros(Float64, 7) z8 = zeros(Float64, 8) z3 = zeros(Float64, 3) @@ -137,7 +135,7 @@ function f_peaks(n::Vector{T}) where {T<:AbstractFloat} n[end] = 0 temp = repeat([z3; n; z3], 6, 1) - b = all(reshape(nn .< temp, (Int(round(length(nn) / 6)), 6)), dims=2) + b = all(reshape(nn .< temp, (Int(round(length(nn) / 6)), 6)), dims = 2) peaks_idx = [] for i in eachindex(b) @@ -151,12 +149,12 @@ function f_peaks(n::Vector{T}) where {T<:AbstractFloat} end function get_threshs_from_feature( - feature::Vector{T}, - bins::Int64, - type::Symbol, -) where {T<:AbstractFloat} + feature::Vector{Float64}, + bins::Int64, + type::Symbol +) # get histogram - hist_bins = Int(round(length(feature) / bins)) + hist_bins = round(Int, length(feature) / bins) # at leat 10 histogram hist_bins = max(10, hist_bins) @@ -200,25 +198,53 @@ function get_threshs_from_feature( elseif length(peaks_idx) == 1 eF0 = vcat(collect(edges_feature), 0) M1 = 0.5 * (vcat(0, collect(edges_feature)) .- eF0) + eF0 - M1 = M1[peaks_idx.+1] + M1 = M1[peaks_idx .+ 1] M2 = minval else eF0 = vcat(collect(edges_feature), 0) AA = 0.5 * (vcat(0, collect(edges_feature)) .- eF0) + eF0 - M2 = AA[peaks_idx[1]+1] - M1 = AA[peaks_idx[2]+1] + M2 = AA[peaks_idx[1] + 1] + M1 = AA[peaks_idx[2] + 1] end return M1, M2 end +function spectral_spread( + x::Vector{Float64}, + sr::Int64; + fft_length::Int64, + window_length::Int64, + overlap_length::Int64, + window_norm::Bool = true, + spectrum_type::Symbol = :magnitude) + X = audio_features_obj( + x, sr, + fft_length = fft_length, + window_length = window_length, + overlap_length = overlap_length, + window_norm = window_norm, + spectrum_type = spectrum_type + ) + X.get_lin_spec() + + s, freq = X.data.lin_spectrogram', X.setup.lin_frequencies + + sum_x1 = vec(sum(s, dims = 1)) + spectral_centroid = vec(sum(s .* freq, dims = 1) ./ sum_x1') + spectral_centroid = replace!(spectral_centroid, NaN => 0) + higher_moment_tmp = freq .- spectral_centroid' + + spectral_spread = vec(sqrt.(sum((higher_moment_tmp .^ 2) .* s, dims = 1) ./ sum_x1')) + + return spectral_spread +end + function speech_detector( - x::AbstractVector{T}, - sr::Int64 -) where {T<:AbstractFloat} - # window, unused = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2]) - # la window la devo fare, perchè questa funzione è di utilità fatta prima del feature extraction - window, unused = gencoswin(:hann, Int(round(0.03 * sr)), :periodic) + x_in::AbstractVector{Float64}, + sr::Int64; #thresholds +) + window, _ = gencoswin(:hann, Int(round(0.03 * sr)), :periodic) frame_length = size(window, 1) merge_distance = frame_length * 5 @@ -232,7 +258,13 @@ function speech_detector( #----------------------------------------------------------------------------------# # step 1: extract short-term spectral spread and energy from whole signal # #----------------------------------------------------------------------------------# - sig_max = maximum(abs.(x)) + sig_max = maximum(abs.(x_in)) + + x = deepcopy(x_in) + # normalize + if sig_max > 0 + x = x ./ sig_max + end # buffer signal frames = buffer(x, frame_length, frame_length) @@ -240,22 +272,24 @@ function speech_detector( # determine short term energy energy = vec(window' .^ 2 * frames .^ 2) # filter the short term energy twice - filtered_energy = moving_mean(moving_mean(energy, smoothing_filter_length), smoothing_filter_length) + filtered_energy = moving_mean( + moving_mean(energy, smoothing_filter_length), smoothing_filter_length) # get spectral spread spec_spread = spectral_spread( x, sr, - fft_length=2 * frame_length, - window_length=frame_length, - overlap_length=0, - spectrum_type=:magnitude + fft_length = 2 * frame_length, + window_length = frame_length, + overlap_length = 0, + spectrum_type = :magnitude ) # normalize the feature spec_spread = spec_spread / (sr / 2) # set spectral spread value to 0 for frames with low energy - spec_spread[energy. 1 - testmask = idx_p1[2:end] .- idx_m1[1:length(idx_p1)-1] .<= merge_distance + testmask = idx_p1[2:end] .- idx_m1[1:(length(idx_p1) - 1)] .<= merge_distance else testmask = falses(0, 1) end @@ -315,15 +348,19 @@ function speech_detector( else # arrange output idx_p2 = idx_p1[2:end, :] - idx_m2 = idx_m1[1:length(idx_p1)-1, :] + idx_m2 = idx_m1[1:(length(idx_p1) - 1), :] amask = .!testmask outidx = reshape([idx_p1[1]; idx_p2[amask]; idx_m2[amask]; idx_m1[end]], :, 2) end y = [] for i in eachrow(outidx) - y = [y; x[i[1]:i[2]]] + y = [y; x_in[i[1]:i[2]]] end - return Float64.(y) + return Float64.(y), outidx +end + +function speech_detector(x_in::AbstractVector{<:AbstractFloat}, sr::Int64) + speech_detector(Float64.(x_in), sr) end \ No newline at end of file diff --git a/src/windowing/windowing.jl b/src/windowing/windowing.jl index 8c54741..e057a39 100644 --- a/src/windowing/windowing.jl +++ b/src/windowing/windowing.jl @@ -1,22 +1,18 @@ # include("windows.jl") function buffer( - x::Union{AbstractVector{T},AbstractArray{T}}, - windowLength::Int, - hopLength::Int -) where {T<:Real} - - xLength = size(x, 1) - numChan = size(x, 2) - numHops = Int(floor((xLength - windowLength) / hopLength) + 1) - - y = zeros(eltype(x), windowLength, numHops * numChan) - - for c = 1:numChan - for nH = 1:numHops - for w = 1:windowLength - y[w, nH+(c-1)*numHops] = x[w+hopLength*(nH-1), c] - end + x::AbstractVector{Float64}, + window_length::Int64, + hop_length::Int64 +) + x_length = size(x, 1) + num_hops = floor(Int, (x_length - window_length) / hop_length) + 1 + + y = zeros(Float64, window_length, num_hops) + + for j in 1:num_hops + for i in 1:window_length + y[i, j] = x[i + hop_length * (j - 1)] end end @@ -24,29 +20,28 @@ function buffer( end # function buffer function logEnergyCoeffs( - x::AbstractArray{T} -) where {T<:Real} - + x::AbstractArray{T} +) where {T <: Real} DT = eltype(x) - E = sum(x .^ 2, dims=1) # eleva tutti gli elementi ^2 e li somma per colonna - E[E.==0] .= floatmin(DT) # se un valore è zero, lo sostituisce col valore più piccolo positivo possibile, in accordo col tipo utilizzato + E = sum(x .^ 2, dims = 1) # eleva tutti gli elementi ^2 e li somma per colonna + E[E .== 0] .= floatmin(DT) # se un valore è zero, lo sostituisce col valore più piccolo positivo possibile, in accordo col tipo utilizzato logE = log.(E) # fa il log di tutti gli elementi end # logEnergyCoeffs function windowing( - x::Union{AbstractVector{T},AbstractArray{T}}, - fftLength::Int64=256, - winType::Symbol=:hann, - winParam::Symbol=:symmetric, - logEnergy::Bool=false -) where {T<:Real} - + x::Union{AbstractVector{T}, AbstractArray{T}}, + fftLength::Int64 = 256, + winType::Symbol = :hann, + winParam::Symbol = :symmetric, + logEnergy::Bool = false +) where {T <: Real} xLength = size(x, 1) # lunghezza audio nChan = size(x, 2) # numero canali (mono, stereo) DT = eltype(x) # restituisce il tipo degli elementi # parto con un if then ma sarebbe bello implementare un Dict - if (winType == :hann || winType == :hamming || winType == :blackman || winType == :flattopwin) + if (winType == :hann || winType == :hamming || winType == :blackman || + winType == :flattopwin) win, hL = gencoswin(winType, fftLength, winParam) elseif (winType == :rect) win, hL = rectwin(fftLength) @@ -65,11 +60,10 @@ function windowing( end # function windowing function fade( - x::Union{AbstractVector{T},AbstractArray{T}}, - fftLength::Int64, - type::Symbol -) where {T<:Real} - + x::Union{AbstractVector{T}, AbstractArray{T}}, + fftLength::Int64, + type::Symbol +) where {T <: Real} xLength = size(x, 1) # lunghezza audio nChan = size(x, 2) # numero canali (mono, stereo) DT = eltype(x) # restituisce il tipo degli elementi @@ -80,12 +74,12 @@ function fade( # for c = 1:numChan if (type == :in) - for w = 1:Int(round(fftLength / 2)) + for w in 1:Int(round(fftLength / 2)) x[w] *= wincast[w] end elseif (type == :out) - for w = Int(round(fftLength / 2)):-1:fftLength - x[end-fftLength+w] *= wincast[w] + for w in Int(round(fftLength / 2)):-1:fftLength + x[end - fftLength + w] *= wincast[w] end end # end diff --git a/test/cwt_features.jl b/test/cwt_features.jl deleted file mode 100644 index a7c6dd9..0000000 --- a/test/cwt_features.jl +++ /dev/null @@ -1,66 +0,0 @@ -using Audio911 - -using SpecialFunctions -using Statistics, Roots -using FFTW -using Parameters -using Plots -include("/home/paso/.julia/dev/Audio911.jl/src/wavelet/cwt.jl") - -TESTPATH = joinpath(dirname(pathof(Audio911)), "..", "test") - -sr_setup = 8000 -x, sr = load_audio("$TESTPATH/common_voice_en_23616312.wav", sr=sr_setup) - -window_length = 256 -frequency_range=(80, 3000) -# mel_bands = 26 -# num_coeffs = 13 - -setup = AudioSetup( - sr=sr, - # fft - window_type=(:hann, :periodic), - window_length=window_length, - overlap_length=Int(round(window_length * 0.500)), - window_norm=true, - # spectrum - frequency_range=frequency_range, - spectrum_type=:power, # :power, :magnitude - # mel - mel_style=:htk, # :htk, :slaney - mel_bands=mel_bands, - filterbank_design_domain=:linear, - filterbank_normalization=:bandwidth, # :bandwidth, :area, :none - frequency_scale=:mel, - # mfcc - num_coeffs=num_coeffs, - normalization_type=:dithered, # :standard, :dithered - rectification=:log, - log_energy_source=:standard, # :standard (after windowing), :mfcc - log_energy_pos=:none, #:append, :replace, :none - delta_window_length=9, - delta_matrix=:standard, # :standard, :transposed - # spectral - spectral_spectrum=:mel # :linear, :linear_focused, :mel -) - -data = AudioData( - x=Float64.(x) -) - -get_fft!(data, setup) - -cwt_spectrum, _ = cwt(data.x, setup.sr, frequency_range=(80,3000)) -cwt_spectrum = abs.(cwt_spectrum') - -data.mel_spectrogram = cwt_windowing(cwt_spectrum, 32) -setup.mel_bands = setup.num_coeffs = size(data.mel_spectrogram, 2) - -# mel_spectrogram(data, setup) -_mfcc(data, setup) -# lin_spectrogram(data, setup) -spectral_features(data, setup) -f0(data, setup) - -# setup.frequency_range = (80, 1000) \ No newline at end of file