From c183da739775d200574551bd8b769601b8454ea3 Mon Sep 17 00:00:00 2001 From: paso Date: Sun, 12 May 2024 22:11:44 +0200 Subject: [PATCH] modular architecture started --- Project.toml | 1 + src/Audio911.jl | 4 + src/audioFeaturesExtractor.jl | 71 +++-- src/fft/fft.jl | 240 ++++++++++++----- src/fft/lin.jl | 2 +- src/fft/mel.jl | 11 - src/fft/spectral.jl | 250 ++++++++++-------- src/signalDataStructure.jl | 481 ++-------------------------------- src/utils/histogram.jl | 107 ++++++++ src/utils/speech_detector.jl | 286 +++++--------------- src/windowing/windowing.jl | 37 +++ src/windowing/windows.jl | 2 + test/features_extraction.jl | 50 +++- 13 files changed, 661 insertions(+), 881 deletions(-) create mode 100644 src/utils/histogram.jl diff --git a/Project.toml b/Project.toml index 044b970..b2dba7d 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StatGeochem = "df4de05a-b714-11e8-3c2a-c30fb13e804c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] diff --git a/src/Audio911.jl b/src/Audio911.jl index 305f22c..e075911 100644 --- a/src/Audio911.jl +++ b/src/Audio911.jl @@ -37,6 +37,7 @@ include("fft/lin.jl") include("fft/mel.jl") include("fft/spectral.jl") # utils +include("utils/histogram.jl") include("utils/speech_detector.jl") include("utils/in_out.jl") include("utils/trimaudio.jl") @@ -60,4 +61,7 @@ get_fft! extractfeatures = export extractfeatures +# modular +export get_frames, get_frames!, get_stft, get_stft! + end # module Audio911 diff --git a/src/audioFeaturesExtractor.jl b/src/audioFeaturesExtractor.jl index 6d08302..e530a3f 100644 --- a/src/audioFeaturesExtractor.jl +++ b/src/audioFeaturesExtractor.jl @@ -126,22 +126,24 @@ function audio_setup( sr::Int64; # fft - fft_length::Int64 = 256, + fft_length::Int64 = sr <= 8000 ? 256 : 512, + stft_freq::AbstractVector{Float64} = Float64[], + window::AbstractVector{Float64} = Float64[], window_type::Tuple{Symbol, Symbol} = (:hann, :periodic), - window_length::Int64 = 0, - overlap_length::Int64 = 0, + window_length::Int64 = fft_length, # standard setting: round(Int, 0.03 * sr) + overlap_length::Int64 = round(Int, fft_length / 2), # standard setting: round(Int, 0.02 * sr) window_norm::Bool = false, # spectrum - frequency_range::Tuple{Int64, Int64} = (0, 0), - spectrum_type::Symbol = :power, # :power, :magnitude + frequency_range::Tuple{Int64, Int64} = (0, floor(Int, sr / 2)), + spectrum_type::Symbol = :power, # :power, :magnitude # mel - mel_style::Symbol = :htk, # :htk, :slaney, :tuned + mel_style::Symbol = :htk, # :htk, :slaney, :tuned mel_bands::Int64 = 26, filterbank_design_domain::Symbol = :linear, - filterbank_normalization::Symbol = :bandwidth, # :bandwidth, :area, :none - frequency_scale::Symbol = :mel, # TODO :mel, :bark, :erb + filterbank_normalization::Symbol = :bandwidth, # :bandwidth, :area, :none + frequency_scale::Symbol = :mel, # TODO :mel, :bark, :erb st_peak_range::Tuple{Int64, Int64} = (200, 700), # chroma @@ -151,15 +153,15 @@ function audio_setup( # mfcc num_coeffs::Int64 = 13, - normalization_type::Symbol = :dithered, # :standard, :dithered - rectification::Symbol = :log, # :log, :cubic_root - log_energy_source::Symbol = :standard, # :standard (after windowing), :mfcc - log_energy_pos::Symbol = :none, #:append, :replace, :none + normalization_type::Symbol = :dithered, # :standard, :dithered + rectification::Symbol = :log, # :log, :cubic_root + log_energy_source::Symbol = :standard, # :standard (after windowing), :mfcc + log_energy_pos::Symbol = :none, #:append, :replace, :none delta_window_length::Int64 = 9, - delta_matrix::Symbol = :transposed, # :standard, :transposed + delta_matrix::Symbol = :transposed, # :standard, :transposed # spectral - spectral_spectrum::Symbol = :lin, # :lin, :mel + spectral_spectrum::Symbol = :lin, # :lin, :mel # f0 f0_method::Symbol = :nfc, @@ -170,13 +172,6 @@ function audio_setup( freq_limits::Tuple{Float64, Float64} = (0.0, 0.0), transform_type::Symbol = :full, ) - - window_length == 0 ? window_length = fft_length : window_length - overlap_length == 0 ? overlap_length = round(Int, fft_length / 2) : overlap_length - # window_length == 0 ? window_length = round(Int, 0.03 * sr) : window_length - # overlap_length == 0 ? overlap_length = round(Int, 0.02 * sr) : overlap_length - frequency_range == (0, 0) ? frequency_range = (0, floor(Int, sr / 2)) : frequency_range - # TODO metti warning ed errori AudioSetup( @@ -184,6 +179,8 @@ function audio_setup( # fft fft_length = fft_length, + stft_freq = stft_freq, + window = window, window_type = window_type, window_length = window_length, overlap_length = overlap_length, @@ -243,7 +240,7 @@ function audio_obj( if preemphasis !== nothing zi = 2 * x[1] - x[2] - filt!(x, [1.0, -preemphasis], 1.0, x, [zi]) + filt!(x, [1.0, - preemphasis], 1.0, x, [zi]) # # aclai preemphasis # x = filt(PolynomialRatio([1.0, -preemphasis], [1.0]), x) end @@ -274,7 +271,7 @@ end function audio_obj( filepath::String, - sr::Int64, + sr::Int64; preemphasis = nothing, kwargs..., ) @@ -448,7 +445,7 @@ function get_features( feat::Symbol = :full, ) func_call = Dict([ - :full => get_full, + :full => get_full, :fft => get_fft, :lin => get_lin_spec, :mel => get_mel_spec, @@ -457,7 +454,31 @@ function get_features( :mfcc_delta => get_mfcc_delta, :spectral => get_spectrals, :f0 => get_f0, - :cqt => get_cqt]) + :cqt => get_cqt, + :age_set => + (x) -> begin + get_full(x) + return hcat( + ( + # audio_obj.data.mel_spectrogram, + # audio_obj.data.log_mel, + audio_obj.data.mfcc_coeffs, + audio_obj.data.mfcc_delta, + audio_obj.data.mfcc_deltadelta, + # audio_obj.data.spectral_centroid, + # audio_obj.data.spectral_crest, + # audio_obj.data.spectral_decrease, + # audio_obj.data.spectral_entropy, + # audio_obj.data.spectral_flatness, + # audio_obj.data.spectral_flux, + # audio_obj.data.spectral_kurtosis, + # audio_obj.data.spectral_rolloff, + # audio_obj.data.spectral_skewness, + # audio_obj.data.spectral_slope, + # audio_obj.data.spectral_spread, + # audio_obj.data.f0, + )...) + end]) if !isnothing(audio_obj) if haskey(func_call, feat) diff --git a/src/fft/fft.jl b/src/fft/fft.jl index bfbde53..129cbc8 100644 --- a/src/fft/fft.jl +++ b/src/fft/fft.jl @@ -1,70 +1,178 @@ -# reference: ETSI ES 201 108 V1.1.2 (2000-04) -# https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_13/Docs/PDF/SP-010566.pdf +function get_onesided_fft_range(fft_length::Int64) + if mod(fft_length, 2) == 0 + return collect(1:Int(fft_length / 2 + 1)) # EVEN + else + return collect(1:Int((fft_length + 1) / 2)) # ODD + end +end # get_onesided_fft_range -# use FFTW package +#------------------------------------------------------------------------------# +# fft version 1 as used in audio features extraction # +#------------------------------------------------------------------------------# +function _get_fft(x::AbstractArray{Float64}, setup::AudioSetup) + hop_length = setup.window_length - setup.overlap_length + if isempty(setup.window) + setup.window, _ = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2]) + end -# function get_fft_length( # switch case da ricordare!!! -# sr::Int64, -# ) -# sr <= 4000 && return 128 -# sr <= 8000 && return 256 -# sr <= 16000 && return 512 -# return 1024 -# end # get_fft_length + # split in windows + y = buffer(x, setup.window_length, setup.window_length - setup.overlap_length) -function get_onesided_fft_range(fft_length::Int64) - if mod(fft_length, 2) == 0 - return collect(1:Int(fft_length / 2 + 1)) # EVEN - else - return collect(1:Int((fft_length + 1) / 2)) # ODD - end -end # get_onesided_fft_range + # apply window and take fft + Z = fft(y .* setup.window, (1,)) + + # take one side + logical_ossb = falses(setup.fft_length) + logical_ossb[get_onesided_fft_range(setup.fft_length)] .= true + Z = Z[logical_ossb, :] + + # log energy + # reference: ETSI ES 201 108 V1.1.2 (2000-04) + # https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_13/Docs/PDF/SP-010566.pdf + if setup.log_energy_pos != :none && setup.log_energy_source == :standard + log_energy = sum(eachrow(y .^ 2)) + + if setup.normalization_type == :standard + log_energy[log_energy.==0] .= floatmin(Float64) + elseif setup.normalization_type == :dithered + log_energy[log_energy.<1e-8] .= 1e-8 + end + log_energy = log.(log_energy) + else + log_energy = Vector{Float64}() + end + + if setup.spectrum_type == :power + real(Z .* conj(Z)), log_energy + elseif setup.spectrum_type == :magnitude + abs.(Z), log_energy + else + error("Unknown spectrum type: $(setup.spectrum_type)") + end +end + +get_fft!(setup::AudioSetup, data::AudioData) = data.fft, data.log_energy = _get_fft(data.x, setup) + +#------------------------------------------------------------------------------# +# stft # +#------------------------------------------------------------------------------# +function _get_stft( + x::AbstractArray{Float64}, + sr::Int64; + fft_length::Int64, + window::AbstractVector{Float64}, + frequency_range::Tuple{Int64, Int64}, + spectrum_type::Symbol, +) + @assert fft_length >= size(x, 1) "fft_length must be > window length. Got fft_length = $fft_length, window length = $(size(x,1))." + + # ensure x is of length fft_length + # if the FFT window is larger than the window, the audio data will be zero-padded to match the size of the FFT window. + # this zero-padding in the time domain results in an interpolation in the frequency domain, + # which can provide a more detailed view of the spectral content of the signal. + x = size(x, 1) < fft_length ? vcat(x, zeros(eltype(x), fft_length - size(x, 1), size(x, 2))) : x[1:fft_length, :] + + # get fft + Y = fft(x, (1,)) + + # post process + # trim to desired range + bin_low = ceil(Int, frequency_range[1] * fft_length / sr + 1) + bin_high = floor(Int, frequency_range[2] * fft_length / sr + 1) + bins = collect(bin_low:bin_high) + y = Y[bins, :] + + # convert to half-sided power or magnitude spectrum + spectrum_funcs = Dict( + :power => x -> real.((x .* conj.(x)) / (0.5 * sum(window.^2))), + :magnitude => x -> abs.(x) / (0.5 * sum(window)), + ) + # check if spectrum_type is valid + @assert haskey(spectrum_funcs, spectrum_type) "Unknown spectrum_type: $spectrum_type." + + y = spectrum_funcs[spectrum_type](y) + + # trim borders + # halve the first bin if it's the lowest bin + bin_low == 1 && (y[1, :] *= 0.5) + # halve the last bin if it's the Nyquist bin and FFT length is even + bin_high == fld(fft_length, 2) + 1 && iseven(fft_length) && (y[end, :] *= 0.5) + + # create frequency vector + stft_freq = (sr / fft_length) * (bins .- 1) + # shift final bin if fftLength is odd and the final range is full to fs/2. + if fft_length % 2 != 0 && bin_high == floor(fftLength / 2 + 1) + stft_freq[end] = sr * (fft_length - 1) / (2 * fft_length) + end + + return y, stft_freq +end + +_get_stft(x::AbstractArray{Float64}, s::AudioSetup) = _get_stft( + x, + s.sr, + fft_length = s.fft_length, + window = s.window, + frequency_range = s.frequency_range, + spectrum_type = s.spectrum_type, +) + +function get_stft( + x::AbstractArray{<:AbstractFloat}, + sr::Int64, + window::AbstractVector{Float64}=Float64[]; + fft_length::Int64 = sr <= 8000 ? 256 : 512, + frequency_range::Tuple{Int64, Int64} = (0, floor(Int, sr / 2)), + spectrum_type::Symbol = :power, # :power, :magnitude +) + @assert !isempty(window) "Window missing as third parameter." + @assert sr > 0 "Sample rate must be > 0." + @assert 0 <= frequency_range[1] < frequency_range[2] <= sr / 2 "Frequency range must be (0, sr/2)." + + stft_spec, stft_freq = _get_stft( + x, + sr, + fft_length = fft_length, + window = window, + frequency_range = frequency_range, + spectrum_type = spectrum_type, + ) +end + +function get_stft( + x::AbstractVector{<:AbstractFloat}, + sr::Int64; + fft_length::Int64 = sr <= 8000 ? 256 : 512, + window_type::Tuple{Symbol, Symbol} = (:hann, :periodic), + window_length::Int64 = fft_length, + overlap_length::Int64 = round(Int, fft_length / 2), + frequency_range::Tuple{Int64, Int64} = (0, floor(Int, sr / 2)), + spectrum_type::Symbol = :power, # :power, :magnitude +) + @assert sr > 0 "Sample rate must be > 0." + @assert 0 <= frequency_range[1] < frequency_range[2] <= sr / 2 "Frequency range must be (0, sr/2)." + @assert 0 < overlap_length < window_length "Overlap length must be < window length." + + frames, window = _get_frames( + eltype(x) == Float64 ? x : Float64.(x), + window_type = window_type, + window_length = window_length, + overlap_length = overlap_length, + ) + + stft_spec, stft_freq = _get_stft( + frames .* window, + sr, + fft_length = fft_length, + window = window, + frequency_range = frequency_range, + spectrum_type = spectrum_type, + ) +end -################################################################################ -# main # -################################################################################ -function get_fft!(setup::AudioSetup, data::AudioData) - # TODO validate FFT length, validate overlap length (audioFeatureExtractor.m line 1324) - - setup.fft_length = setup.window_length # definisce la fft pari alla finestra - hop_length = setup.window_length - setup.overlap_length - data.fft_window, _ = gencoswin(setup.window_type[1], setup.window_length, setup.window_type[2]) - - # split in windows - y = buffer(data.x, setup.window_length, hop_length) - - # apply window and take fft - Z = fft(y .* data.fft_window, (1,)) - - # take one side - logical_ossb = falses(setup.fft_length) - logical_ossb[get_onesided_fft_range(setup.fft_length)] .= true - Z = Z[logical_ossb, :] - - # if setup.spectrum_type == :power - # data.fft = real(Z .* conj(Z)) - # elseif setup.spectrum_type == :magnitude - # data.fft = sqrt(real(Z .* conj(Z))) - # else - # data.fft = abs.(Z) - # end - - ######### NON è nel punto corretto: la fft deve essere a numeri complessi, poi convertita in reali nel calcolo degli spettrogrammi!!! - ########### DA CAMBIARE ANCHE la struttura dati... FORSE - setup.spectrum_type == :power ? data.fft = real(Z .* conj(Z)) : data.fft = abs.(Z) - - # log energy - # reference: ETSI ES 201 108 V1.1.2 (2000-04) - # https://www.3gpp.org/ftp/tsg_sa/TSG_SA/TSGS_13/Docs/PDF/SP-010566.pdf - if setup.log_energy_source == :standard - log_energy = sum(eachrow(y .^ 2)) - - if setup.normalization_type == :standard - log_energy[log_energy.==0] .= floatmin(Float64) - elseif setup.normalization_type == :dithered - log_energy[log_energy.<1e-8] .= 1e-8 - end - - data.log_energy = log.(log_energy) - end -end # get_fft! \ No newline at end of file +function get_stft!(a::AudioObj) + if isempty(a.data.frames) + a.data.frames, a.setup.window = _get_frames(a.data.x, a.setup) + end + a.data.stft, a.setup.stft_freq = _get_stft(a.data.frames .* a.setup.window, a.setup) +end diff --git a/src/fft/lin.jl b/src/fft/lin.jl index 845cec7..94cb888 100644 --- a/src/fft/lin.jl +++ b/src/fft/lin.jl @@ -58,7 +58,7 @@ function lin_spectrogram!( end # calculate linear spectrum and normalization - linear_norm_factor = get_lin_norm_factor(setup.spectrum_type, data.fft_window) + linear_norm_factor = get_lin_norm_factor(setup.spectrum_type, setup.window) full_spectrum ? data.lin_spectrogram = data.fft * linear_norm_factor : data.lin_spectrogram = data.fft[bins_logical, :] * linear_norm_factor if adjust_bins diff --git a/src/fft/mel.jl b/src/fft/mel.jl index 3fbd7d2..4c2a93a 100644 --- a/src/fft/mel.jl +++ b/src/fft/mel.jl @@ -104,17 +104,6 @@ function design_filterbank(data::AudioData, setup::AudioSetup) band_edges = mel2hz(LinRange(melRange[1], melRange[end], setup.mel_bands + 2), setup.mel_style) elseif setup.mel_style == :tuned band_edges = mel2hz(LinRange(melRange[1], melRange[end], setup.mel_bands + 2), :htk) - # elseif setup.mel_style == :semitones - # if isempty(data.lin_spectrogram) - # lin_spectrogram!(setup, data) - # end - - # _, loudest_index = catch_loudest_index(data.lin_spectrogram, data.lin_frequencies, setup.st_peak_range) - - # minsemitone = hz2semitone(setup.st_peak_range[1], data.lin_frequencies[loudest_index]) - # maxsemitone = hz2semitone(setup.st_peak_range[2], data.lin_frequencies[loudest_index]) - - # band_edges = semitone2hz.(minsemitone .+ collect(0:(setup.mel_bands+1)) / (setup.mel_bands + 1) * (maxsemitone - minsemitone), loudest_index) else error("Unknown mel_style $(setup.mel_style).") end diff --git a/src/fft/spectral.jl b/src/fft/spectral.jl index c61918f..7a403ab 100644 --- a/src/fft/spectral.jl +++ b/src/fft/spectral.jl @@ -1,150 +1,182 @@ function get_spectrum(setup::AudioSetup, data::AudioData) - setup.spectral_spectrum == :mel && return data.mel_spectrogram', data.mel_frequencies - setup.spectral_spectrum == :lin && return data.lin_spectrogram', data.lin_frequencies - error("Unknown spectral spectrum") + setup.spectral_spectrum == :mel && return data.mel_spectrogram', data.mel_frequencies + setup.spectral_spectrum == :lin && return data.lin_spectrogram', data.lin_frequencies + error("Unknown spectral spectrum") end function spectral_crest( - s::AbstractArray{Float64}, - data::AudioData, - sum_x1::Vector{Float64}, - arithmetic_mean::Vector{Float64} + s::AbstractArray{Float64}, + data::AudioData, + sum_x1::Vector{Float64}, + arithmetic_mean::Vector{Float64}, ) - # # calculate spectral mean - # m = sum(real(s), dims=1) ./ size(s, 1) - # calculate spectral peak - peak = maximum(s, dims=1) - # calculate spectral crest - data.spectral_crest = vec(peak ./ arithmetic_mean') + # # calculate spectral mean + # m = sum(real(s), dims=1) ./ size(s, 1) + # calculate spectral peak + peak = maximum(s, dims = 1) + # calculate spectral crest + data.spectral_crest = vec(peak ./ arithmetic_mean') end function spectral_decrease(s::AbstractArray{Float64}, data::AudioData) - # calculate decrease - data.spectral_decrease = vec(real(sum((s[2:end, :] .- s[1, :]') ./ (1:size(s, 1)-1), dims=1) ./ sum(s[2:end, :], dims=1))) + # calculate decrease + data.spectral_decrease = vec(real(sum((s[2:end, :] .- s[1, :]') ./ (1:size(s, 1)-1), dims = 1) ./ sum(s[2:end, :], dims = 1))) end function spectral_entropy( - s::AbstractArray{Float64}, - data::AudioData, - sum_x1::Vector{Float64} + s::AbstractArray{Float64}, + data::AudioData, + sum_x1::Vector{Float64}, ) - # calculate entropy - X = s ./ repeat(sum_x1', size(s, 1), 1) - t = replace!(-sum(X .* log2.(X), dims=1), NaN => 0) - data.spectral_entropy = vec(t ./ log2(size(s, 1))) + # calculate entropy + X = s ./ repeat(sum_x1', size(s, 1), 1) + t = replace!(-sum(X .* log2.(X), dims = 1), NaN => 0) + data.spectral_entropy = vec(t ./ log2(size(s, 1))) end function spectral_flatness( - s::AbstractArray{Float64}, - data::AudioData, - sum_x1::Vector{Float64}, - arithmetic_mean::Vector{Float64} + s::AbstractArray{Float64}, + data::AudioData, + sum_x1::Vector{Float64}, + arithmetic_mean::Vector{Float64}, ) - # calculate geometric mean, arithmetic mean, and flatness - geometric_mean = exp.(sum(log.(s .+ eps(Float64)), dims=1) / size(s, 1)) - data.spectral_flatness = vec(geometric_mean ./ arithmetic_mean') + # calculate geometric mean, arithmetic mean, and flatness + geometric_mean = exp.(sum(log.(s .+ eps(Float64)), dims = 1) / size(s, 1)) + data.spectral_flatness = vec(geometric_mean ./ arithmetic_mean') end function spectral_flux(s::AbstractArray{Float64}, data::AudioData) - initial_condition = s[:, 1] - # calculate flux - temp = diff(hcat(initial_condition, s), dims=2) - fl = [] - for i in axes(temp, 2) - append!(fl, norm(temp[:, i])) - end - data.spectral_flux = fl + initial_condition = s[:, 1] + # calculate flux + temp = diff(hcat(initial_condition, s), dims = 2) + fl = [] + for i in axes(temp, 2) + append!(fl, norm(temp[:, i])) + end + data.spectral_flux = fl end function spectral_kurtosis( - s::AbstractArray{Float64}, - data::AudioData, - # sum_x1::Vector{Float64}, - # centroid::Vector{Float64}, - higher_moment_tmp::AbstractArray{Float64}, - higher_moment_denom::Vector{Float64}, - higher_momement_num::AbstractArray{Float64} + s::AbstractArray{Float64}, + data::AudioData, + # sum_x1::Vector{Float64}, + # centroid::Vector{Float64}, + higher_moment_tmp::AbstractArray{Float64}, + higher_moment_denom::Vector{Float64}, + higher_momement_num::AbstractArray{Float64}, ) - # calculate centroid - # centroid = vec(sum(s .* setup.fft_frequencies, dims=1) ./ sum(s, dims=1)) - # # calculate spread - # temp = (setup.fft_frequencies .- centroid') .^ 2 - # spread = sqrt.(sum((temp) .* s, dims=1) ./ sum(s, dims=1)) - # # calculate kurtosis - # data.spectral_kurtosis = vec(real(sum(((temp .^ 2) .* s), dims=1) ./ ((spread .^ 4) .* sum(s, dims=1)))) - data.spectral_kurtosis = vec(sum(higher_momement_num .* higher_moment_tmp, dims=1) ./ (higher_moment_denom .* data.spectral_spread)') + # calculate centroid + # centroid = vec(sum(s .* setup.fft_frequencies, dims=1) ./ sum(s, dims=1)) + # # calculate spread + # temp = (setup.fft_frequencies .- centroid') .^ 2 + # spread = sqrt.(sum((temp) .* s, dims=1) ./ sum(s, dims=1)) + # # calculate kurtosis + # data.spectral_kurtosis = vec(real(sum(((temp .^ 2) .* s), dims=1) ./ ((spread .^ 4) .* sum(s, dims=1)))) + data.spectral_kurtosis = vec(sum(higher_momement_num .* higher_moment_tmp, dims = 1) ./ (higher_moment_denom .* data.spectral_spread)') end function spectral_rolloff(s::AbstractArray{Float64}, data::AudioData, fft_frequencies::Vector{Float64}) - # calculate rolloff point - threshold = 0.95 - c = cumsum(s, dims=1) - d = c[end, :] * threshold - idx = zeros(size(d)) - for i in eachindex(d) - idx[i] = findfirst(real(c[:, i]) .>= real(d[i])) - end - data.spectral_rolloff = fft_frequencies[Int.(idx)] + # calculate rolloff point + threshold = 0.95 + c = cumsum(s, dims = 1) + d = c[end, :] * threshold + idx = zeros(size(d)) + for i in eachindex(d) + idx[i] = findfirst(real(c[:, i]) .>= real(d[i])) + end + data.spectral_rolloff = fft_frequencies[Int.(idx)] end function spectral_skewness( - s::AbstractArray{Float64}, - data::AudioData, - sum_x1::Vector{Float64}, - centroid::Vector{Float64}, - higher_moment_denom::Vector{Float64}, - higher_momement_num::AbstractArray{Float64} + s::AbstractArray{Float64}, + data::AudioData, + sum_x1::Vector{Float64}, + centroid::Vector{Float64}, + higher_moment_denom::Vector{Float64}, + higher_momement_num::AbstractArray{Float64}, ) - # # calculate centroid - # centroid = vec(sum(s .* setup.fft_frequencies, dims=1) ./ sum(s, dims=1)) - # # calculate spread - # temp = (setup.fft_frequencies .- centroid') - # spread = sqrt.(sum((temp .^ 2) .* s, dims=1) ./ sum(s, dims=1)) - # # calculate skewness - # data.spectral_skewness = vec(real(sum((temp .^ 3) .* s, dims=1) ./ ((spread .^ 3) .* sum(s, dims=1)))) - data.spectral_skewness = vec(sum(higher_momement_num, dims=1) ./ higher_moment_denom') + # # calculate centroid + # centroid = vec(sum(s .* setup.fft_frequencies, dims=1) ./ sum(s, dims=1)) + # # calculate spread + # temp = (setup.fft_frequencies .- centroid') + # spread = sqrt.(sum((temp .^ 2) .* s, dims=1) ./ sum(s, dims=1)) + # # calculate skewness + # data.spectral_skewness = vec(real(sum((temp .^ 3) .* s, dims=1) ./ ((spread .^ 3) .* sum(s, dims=1)))) + data.spectral_skewness = vec(sum(higher_momement_num, dims = 1) ./ higher_moment_denom') end function spectral_slope( - s::AbstractArray{Float64}, - data::AudioData, - sum_x1::Vector{Float64}, - arithmetic_mean::Vector{Float64}, - fft_frequencies::Vector{Float64} + s::AbstractArray{Float64}, + data::AudioData, + sum_x1::Vector{Float64}, + arithmetic_mean::Vector{Float64}, + fft_frequencies::Vector{Float64}, ) - # calculate slope - f_minus_mu_f = fft_frequencies .- sum(fft_frequencies, dims=1) ./ size(s, 1) - X_minus_mu_X = s .- arithmetic_mean' - data.spectral_slope = vec(real(sum(X_minus_mu_X .* f_minus_mu_f, dims=1) ./ sum(f_minus_mu_f .^ 2))) + # calculate slope + f_minus_mu_f = fft_frequencies .- sum(fft_frequencies, dims = 1) ./ size(s, 1) + X_minus_mu_X = s .- arithmetic_mean' + data.spectral_slope = vec(real(sum(X_minus_mu_X .* f_minus_mu_f, dims = 1) ./ sum(f_minus_mu_f .^ 2))) +end + +function _get_spread(x::AbstractArray{Float64}, setup::AudioSetup) + end ################################################################################ # main # ################################################################################ function get_spectrals!( - setup::AudioSetup, - data::AudioData - ) - s, freq = get_spectrum(setup, data) - - # common data - size_x1 = size(s, 1) - sum_x1 = vec(sum(s, dims=1)) - arithmetic_mean = sum_x1 ./ size_x1 - data.spectral_centroid = vec(sum(s .* freq, dims=1) ./ sum_x1') - data.spectral_centroid = replace!(data.spectral_centroid, NaN => 0) - higher_moment_tmp = freq .- data.spectral_centroid' - data.spectral_spread = vec(sqrt.(sum((higher_moment_tmp .^ 2) .* s, dims=1) ./ sum_x1')) - higher_moment_denom = (data.spectral_spread .^ 3) .* sum_x1 - higher_momement_num = (higher_moment_tmp .^ 3) .* s - - spectral_crest(s, data, sum_x1, arithmetic_mean) - spectral_entropy(s, data, sum_x1) - spectral_flatness(s, data, sum_x1, arithmetic_mean) - spectral_flux(s, data) - spectral_kurtosis(s, data, higher_moment_tmp, higher_moment_denom, higher_momement_num) - spectral_rolloff(s, data, freq) - spectral_skewness(s, data, sum_x1, data.spectral_centroid, higher_moment_denom, higher_momement_num) - spectral_decrease(s, data) - spectral_slope(s, data, sum_x1, arithmetic_mean, freq) + setup::AudioSetup, + data::AudioData, +) + s, freq = get_spectrum(setup, data) + + # common data + size_x1 = size(s, 1) + sum_x1 = vec(sum(s, dims = 1)) + arithmetic_mean = sum_x1 ./ size_x1 + data.spectral_centroid = vec(sum(s .* freq, dims = 1) ./ sum_x1') + data.spectral_centroid = replace!(data.spectral_centroid, NaN => 0) + higher_moment_tmp = freq .- data.spectral_centroid' + data.spectral_spread = vec(sqrt.(sum((higher_moment_tmp .^ 2) .* s, dims = 1) ./ sum_x1')) + higher_moment_denom = (data.spectral_spread .^ 3) .* sum_x1 + higher_momement_num = (higher_moment_tmp .^ 3) .* s + + spectral_crest(s, data, sum_x1, arithmetic_mean) + spectral_entropy(s, data, sum_x1) + spectral_flatness(s, data, sum_x1, arithmetic_mean) + spectral_flux(s, data) + spectral_kurtosis(s, data, higher_moment_tmp, higher_moment_denom, higher_momement_num) + spectral_rolloff(s, data, freq) + spectral_skewness(s, data, sum_x1, data.spectral_centroid, higher_moment_denom, higher_momement_num) + spectral_decrease(s, data) + spectral_slope(s, data, sum_x1, arithmetic_mean, freq) +end + +# ---------------------------------------------------------------------------------- # +# utilities # +# ---------------------------------------------------------------------------------- # +# Calculate sums +sum_s = (s) -> sum(s, dims = 1) +sum_sfreq = (s, sfreq) -> sum(s .* sfreq, dims = 1) + +# ---------------------------------------------------------------------------------- # +# spectral centroid # +# ---------------------------------------------------------------------------------- # +function _get_spec_centroid( + s::AbstractArray{Float64}, + sfreq::AbstractVector{Float64}, +) + vec(sum_sfreq(s, sfreq) ./ sum_s(s)) + # replace!(vec(sum(s .* sfreq, dims = 1) ./ sum(x1, dims=1)), NaN => 0) +end +# ---------------------------------------------------------------------------------- # +# spectral spread # +# ---------------------------------------------------------------------------------- # +function _get_spec_spread( + s::AbstractArray{Float64}, + sfreq::AbstractVector{Float64}, +) + centroid = _get_spec_centroid(s, sfreq) + vec(sqrt.(sum_s(s .* (sfreq .- centroid').^2) ./ sum_s(s))) end \ No newline at end of file diff --git a/src/signalDataStructure.jl b/src/signalDataStructure.jl index a83c4d0..e85a663 100644 --- a/src/signalDataStructure.jl +++ b/src/signalDataStructure.jl @@ -6,11 +6,22 @@ AudioSetup stores all datas that has to be shared in Audio911 module AudioData stores all results from signal analysis """ +### da implementare forse +mutable struct Spectrals + # spectral + spec_source::Symbol +end +### + @with_kw mutable struct AudioSetup sr::Int64 # fft fft_length::Int64 + stft_freq::AbstractVector{Float64} + + # windowing + window::AbstractVector{Float64} window_type::Tuple{Symbol, Symbol} window_length::Int64 overlap_length::Int64 @@ -43,6 +54,7 @@ delta_matrix::Symbol # spectral + # spectrals::Spectrals spectral_spectrum::Symbol # f0 @@ -57,17 +69,18 @@ end @with_kw mutable struct AudioData x::AbstractVector{Float64} + frames::AbstractArray{Float64} = [] # fft fft::AbstractArray{Float64} = [] - fft_window::Vector{Float64} = [] + stft::AbstractArray{Float64} = [] # linear spectrum - lin_frequencies::Vector{Float64} = [] + lin_frequencies::AbstractVector{Float64} = [] lin_spectrogram::AbstractArray{Float64} = [] # mel_spectrum - mel_frequencies::Vector{Float64} = [] + mel_frequencies::AbstractVector{Float64} = [] mel_spectrogram::AbstractArray{Float64} = [] # logaritmic mel @@ -77,23 +90,23 @@ end mfcc_coeffs::AbstractArray{Float64} = [] mfcc_delta::AbstractArray{Float64} = [] mfcc_deltadelta::AbstractArray{Float64} = [] - log_energy::Vector{Float64} = [] + log_energy::AbstractVector{Float64} = [] # spectral - spectral_centroid::Vector{Float64} = [] - spectral_crest::Vector{Float64} = [] - spectral_decrease::Vector{Float64} = [] - spectral_entropy::Vector{Float64} = [] - spectral_flatness::Vector{Float64} = [] - spectral_flux::Vector{Float64} = [] - spectral_kurtosis::Vector{Float64} = [] - spectral_rolloff::Vector{Float64} = [] - spectral_skewness::Vector{Float64} = [] - spectral_slope::Vector{Float64} = [] - spectral_spread::Vector{Float64} = [] + spectral_centroid::AbstractVector{Float64} = [] + spectral_crest::AbstractVector{Float64} = [] + spectral_decrease::AbstractVector{Float64} = [] + spectral_entropy::AbstractVector{Float64} = [] + spectral_flatness::AbstractVector{Float64} = [] + spectral_flux::AbstractVector{Float64} = [] + spectral_kurtosis::AbstractVector{Float64} = [] + spectral_rolloff::AbstractVector{Float64} = [] + spectral_skewness::AbstractVector{Float64} = [] + spectral_slope::AbstractVector{Float64} = [] + spectral_spread::AbstractVector{Float64} = [] # f0 - f0::Vector{Float64} = [] + f0::AbstractVector{Float64} = [] # constant-q transform cqt_spec::AbstractArray{Float64} = [] @@ -103,439 +116,3 @@ mutable struct AudioObj setup::AudioSetup data::AudioData end - -# reference: -# https://www.functionalnoise.com/pages/2023-01-31-julia-class/ -# mutable struct AudioObj -# setup::AudioSetup -# data::AudioData - - # const get_fft::Function - # const get_lin_spec::Function - # const get_mel_spec::Function - # const get_log_mel::Function - # const get_mfcc::Function - # const get_spectrals::Function - # const get_f0::Function - # const get_cqt::Function - # const get_features::Function - - # function get_fft(self::AudioObj) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - - # return self.data.fft' - # end - - # function get_lin_spec(self::AudioObj) - # if isempty(self.data.lin_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # lin_spectrogram!(self.setup, self.data) - # end - - # return self.data.lin_spectrogram - # end - - # function get_mel_spec(self::AudioObj) - # if isempty(self.data.mel_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # get_mel_spec!(self.setup, self.data) - # end - - # return self.data.mel_spectrogram - # end - - # function get_log_mel(self::AudioObj) - # if isempty(self.data.log_mel) - # if isempty(self.data.mel_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # get_mel_spec!(self.setup, self.data) - # end - # get_log_mel!(self.setup, self.data) - # end - - # return self.data.log_mel - # end - - # function get_mfcc(self::AudioObj) - # if isempty(self.data.mfcc_coeffs) - # if isempty(self.data.mel_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # get_mel_spec!(self.setup, self.data) - # end - # get_mfcc!(self.setup, self.data) - # get_mfcc_deltas!(self.setup, self.data) - # end - - # return hcat((self.data.mfcc_coeffs, self.data.mfcc_delta, self.data.mfcc_deltadelta)...) - # end - - # function get_spectrals(self::AudioObj) - # if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # lin_spectrogram!(self.setup, self.data) - # elseif self.setup.spectral_spectrum == :mel && isempty(self.data.mel_spectrogram) - # if isempty(self.data.fft) - # get_fft!(self.setup, self.data) - # end - # mel_spectrogram!(self.setup, self.data) - # end - # if isempty(self.data.spectral_centroid) - # get_spectrals!(self.setup, self.data) - # end - - # return hcat( - # ( - # self.data.spectral_centroid, - # self.data.spectral_crest, - # self.data.spectral_decrease, - # self.data.spectral_entropy, - # self.data.spectral_flatness, - # self.data.spectral_flux, - # self.data.spectral_kurtosis, - # self.data.spectral_rolloff, - # self.data.spectral_skewness, - # self.data.spectral_slope, - # self.data.spectral_spread, - # )..., - # ) - # end - - # function get_f0(self::AudioObj) - # if isempty(self.data.f0) - # get_f0!(self.setup, self.data) - # end - - # return self.data.f0 - # end - - # function get_cqt(self::AudioObj) - # if isempty(self.data.cqt_spec) - # get_cqt_spec!(self.setup, self.data) - # end - - # return self.data.cqt_spec - # end - -# # --------------------------------------------------------------------------- # -# # get features profiles # -# # --------------------------------------------------------------------------- # -# function get_features(self::AudioObj, profile::Symbol = :full) -# if profile == :full -# if isempty(self.data.fft) -# get_fft!(self.setup, self.data) -# end -# if isempty(self.data.mel_spectrogram) -# get_mel_spec!(self.setup, self.data) -# end -# if isempty(self.data.log_mel) -# get_log_mel!(self.setup, self.data) -# end -# if isempty(self.data.mfcc_coeffs) -# get_mfcc!(self.setup, self.data) -# get_mfcc_deltas!(self.setup, self.data) -# end -# if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# lin_spectrogram!(self.setup, self.data) -# end -# if isempty(self.data.spectral_centroid) -# get_spectrals!(self.setup, self.data) -# end -# if isempty(self.data.f0) -# get_f0!(self.setup, self.data) -# end - -# return hcat( -# ( -# self.data.mel_spectrogram, -# self.data.log_mel, -# self.data.mfcc_coeffs, -# self.data.mfcc_delta, -# self.data.mfcc_deltadelta, -# self.data.spectral_centroid, -# self.data.spectral_crest, -# self.data.spectral_decrease, -# self.data.spectral_entropy, -# self.data.spectral_flatness, -# self.data.spectral_flux, -# self.data.spectral_kurtosis, -# self.data.spectral_rolloff, -# self.data.spectral_skewness, -# self.data.spectral_slope, -# self.data.spectral_spread, -# self.data.f0, -# )..., -# ) - -# # --------------------------------------------------------------------------- # -# # emotion work in progress # -# # --------------------------------------------------------------------------- # -# elseif profile == :emotion -# if isempty(self.data.fft) -# get_fft!(self.setup, self.data) -# end -# if isempty(self.data.mel_spectrogram) -# get_mel_spec!(self.setup, self.data) -# end -# if isempty(self.data.log_mel) -# get_log_mel!(self.setup, self.data) -# end -# if isempty(self.data.mfcc_coeffs) -# get_mfcc!(self.setup, self.data) -# get_mfcc_deltas!(self.setup, self.data) -# end -# if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# lin_spectrogram!(self.setup, self.data) -# end -# if isempty(self.data.spectral_centroid) -# get_spectrals!(self.setup, self.data) -# end -# if isempty(self.data.f0) -# get_f0!(self.setup, self.data) -# end - -# return hcat( -# ( -# # self.data.mel_spectrogram, -# self.data.log_mel, -# # self.data.mfcc_coeffs, -# self.data.mfcc_delta, -# self.data.mfcc_deltadelta, -# self.data.spectral_centroid, -# self.data.spectral_crest, -# self.data.spectral_decrease, -# self.data.spectral_entropy, -# self.data.spectral_flatness, -# self.data.spectral_flux, -# self.data.spectral_kurtosis, -# self.data.spectral_rolloff, -# self.data.spectral_skewness, -# self.data.spectral_slope, -# self.data.spectral_spread, -# self.data.f0, -# )..., -# ) - -# # elseif profile == :emotion_set_1 -# # if isempty(self.data.fft) -# # get_fft!(self.setup, self.data) -# # end -# # if isempty(self.data.mel_spectrogram) -# # get_mel_spec!(self.setup, self.data) -# # end -# # if isempty(self.data.mfcc_coeffs) -# # get_mfcc!(self.setup, self.data) -# # # get_mfcc_deltas!(self.setup, self.data) -# # end -# # if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# # lin_spectrogram!(self.setup, self.data) -# # end -# # if isempty(self.data.spectral_centroid) -# # get_spectrals!(self.setup, self.data) -# # end -# # # if isempty(self.data.f0) -# # # get_f0!(self.setup, self.data) -# # # end - -# # return vcat( -# # ( -# # self.data.mel_spectrogram', -# # self.data.mfcc_coeffs', -# # # self.data.mfcc_delta', -# # # self.data.mfcc_deltadelta', -# # self.data.spectral_centroid', -# # self.data.spectral_crest', -# # self.data.spectral_decrease', -# # self.data.spectral_entropy', -# # self.data.spectral_flatness', -# # self.data.spectral_flux', -# # self.data.spectral_kurtosis', -# # self.data.spectral_rolloff', -# # self.data.spectral_skewness', -# # self.data.spectral_slope', -# # self.data.spectral_spread', -# # # self.data.f0' -# # )..., -# # ) - -# # elseif profile == :emotion_set_2 -# # if isempty(self.data.fft) -# # get_fft!(self.setup, self.data) -# # end -# # if isempty(self.data.mfcc_coeffs) -# # get_mel_spec!(self.setup, self.data) -# # get_mfcc!(self.setup, self.data) -# # get_mfcc_deltas!(self.setup, self.data) -# # end -# # if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# # lin_spectrogram!(self.setup, self.data) -# # end -# # if isempty(self.data.spectral_centroid) -# # get_spectrals!(self.setup, self.data) -# # end -# # if isempty(self.data.f0) -# # get_f0!(self.setup, self.data) -# # end - -# # return vcat( -# # ( -# # # self.data.mel_spectrogram', -# # self.data.mfcc_coeffs', -# # self.data.mfcc_delta', -# # self.data.mfcc_deltadelta', -# # self.data.spectral_centroid', -# # self.data.spectral_crest', -# # self.data.spectral_decrease', -# # self.data.spectral_entropy', -# # self.data.spectral_flatness', -# # self.data.spectral_flux', -# # self.data.spectral_kurtosis', -# # self.data.spectral_rolloff', -# # self.data.spectral_skewness', -# # self.data.spectral_slope', -# # self.data.spectral_spread', -# # self.data.f0', -# # )..., -# # ) - -# # elseif profile == :emotion_set_3 -# if isempty(self.data.fft) -# get_fft!(self.setup, self.data) -# end -# if isempty(self.data.mel_spectrogram) -# get_mel_spec!(self.setup, self.data) -# end -# if isempty(self.data.mfcc_coeffs) -# get_mfcc!(self.setup, self.data) -# get_mfcc_deltas!(self.setup, self.data) -# end -# if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# lin_spectrogram!(self.setup, self.data) -# end -# if isempty(self.data.spectral_centroid) -# get_spectrals!(self.setup, self.data) -# end -# if isempty(self.data.f0) -# get_f0!(self.setup, self.data) -# end - -# return vcat( -# ( -# self.data.mel_spectrogram', -# self.data.mfcc_coeffs', -# self.data.mfcc_delta', -# self.data.mfcc_deltadelta', -# self.data.spectral_centroid', -# self.data.spectral_crest', -# self.data.spectral_decrease', -# self.data.spectral_entropy', -# self.data.spectral_flatness', -# self.data.spectral_flux', -# self.data.spectral_kurtosis', -# self.data.spectral_rolloff', -# self.data.spectral_skewness', -# self.data.spectral_slope', -# self.data.spectral_spread', -# self.data.f0', -# )..., -# ) - -# # --------------------------------------------------------------------------- # -# # Gio Paglia # -# # --------------------------------------------------------------------------- # - -# elseif profile == :kdd -# if isempty(self.data.fft) -# get_fft!(self.setup, self.data) -# end -# if isempty(self.data.mel_spectrogram) -# get_mel_spec!(self.setup, self.data) -# end -# if isempty(self.data.mfcc_coeffs) -# get_mfcc!(self.setup, self.data) -# # get_mfcc_deltas!(self.setup, self.data) -# end -# # if self.setup.spectral_spectrum == :lin && isempty(self.data.lin_spectrogram) -# # lin_spectrogram!(self.setup, self.data) -# # end -# # if isempty(self.data.spectral_centroid) -# # get_spectrals!(self.setup, self.data) -# # end -# if isempty(self.data.f0) -# self.setup.f0_range = (200, 700) -# get_f0!(self.setup, self.data) -# end - -# return hcat( -# ( -# # self.data.mel_spectrogram, -# self.data.mfcc_coeffs, -# # self.data.mfcc_delta, -# # self.data.mfcc_deltadelta, -# # self.data.spectral_centroid, -# # self.data.spectral_crest, -# # self.data.spectral_decrease, -# # self.data.spectral_entropy, -# # self.data.spectral_flatness, -# # self.data.spectral_flux, -# # self.data.spectral_kurtosis, -# # self.data.spectral_rolloff, -# # self.data.spectral_skewness, -# # self.data.spectral_slope, -# # self.data.spectral_spread, -# self.data.f0, -# )..., -# ) -# elseif profile == :kdd2 -# if isempty(self.data.fft) -# get_fft!(self.setup, self.data) -# end -# if isempty(self.data.mel_spectrogram) -# get_mel_spec!(self.setup, self.data) -# end -# if isempty(self.data.log_mel) -# get_log_mel!(self.setup, self.data) -# end -# if isempty(self.data.f0) -# self.setup.f0_range = (200, 700) -# get_f0!(self.setup, self.data) -# end - -# return hcat( -# ( -# self.data.log_mel, -# self.data.f0, -# )..., -# ) -# else -# @error("Unknown $profile profile.") -# end -# end - -# function AudioObj(setup::AudioSetup, data::AudioData) -# obj = new( -# setup, data, -# () -> get_fft(obj), -# () -> get_lin_spec(obj), -# () -> get_mel_spec(obj), -# () -> get_log_mel(obj), -# () -> get_mfcc(obj), -# () -> get_spectrals(obj), -# () -> get_f0(obj), -# () -> get_cqt(obj), -# (x) -> get_features(obj, x), -# ) -# # return obj -# end -# end \ No newline at end of file diff --git a/src/utils/histogram.jl b/src/utils/histogram.jl new file mode 100644 index 0000000..9cb87c3 --- /dev/null +++ b/src/utils/histogram.jl @@ -0,0 +1,107 @@ +function binpicker( + xmin::Float64, + xmax::Float64, + nbins::Int64, + raw_bins_width::Float64, +) + xscale = max(abs(xmin), abs(xmax)) + xrange = xmax - xmin + raw_bins_width = max(raw_bins_width, eps(xscale)) + + # if the data are not constant, place the bins at "nice" locations + if xrange > max(sqrt(eps(Float64)) * xscale, floatmin(Float64)) + # choose the bin width as a "nice" value. + pow_of_ten = 10 .^ floor(log10(raw_bins_width)) # next lower power of 10 + rel_size = raw_bins_width / pow_of_ten # guaranteed in [1, 10) + + # automatic rule specified + # if isempty(nbins) nbins viene passato quindi questo codice non serve + # if rel_size < 1.5 + # bin_width = 1*pow_of_ten + # elseif rel_size < 2.5 + # bin_width = 2*pow_of_ten + # elseif rel_size < 4 + # bin_width = 3*pow_of_ten + # elseif rel_size < 7.5 + # bin_width = 5*pow_of_ten + # else + # bin_width = 10*pow_of_ten + # end + + # put the bin edges at multiples of the bin width, covering x. + # the actual number of bins used may not be exactly equal to the requested rule. + # left_edge = max(min(bin_width*floor(xmin ./ bin_width), xmin),-realmax(class(xmax))) + # nbinsActual = max(1, ceil((xmax-left_edge) ./ bin_width)) + # rightEdge = min(max(left_edge + nbinsActual.*bin_width, xmax),realmax(class(xmax))) + + # number of bins specified + # else + # temporarily set a raw bin_width to a nice power of 10. + # bin_width will be set again to a different value if more than 1 bin. + bin_width = pow_of_ten * floor(rel_size) + # set the left edge at multiples of the raw bin width. + # then adjust bin width such that all bins are of the same size and xmax fall into the rightmost bin. + left_edge = max(min(bin_width * floor(xmin ./ bin_width), xmin), -floatmax(Float64)) + if nbins > 1 + ll = (xmax - left_edge) / nbins # bin_width lower bound, xmax + # on right edge of last bin + ul = (xmax - left_edge) / (nbins - 1) # bin_width upper bound, + # xmax on left edge of last bin + p10 = 10^floor(log10(ul - ll)) + bin_width = p10 * ceil(ll ./ p10) # bin_width-ll < p10 <= ul-ll + # thus, bin_width < ul + end + + nbins_actual = nbins + right_edge = min( + max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64)) + # end + + else # the data are nearly constant + # for automatic rules, use a single bin. + if isempty(nbins) + nbins = 1 + end + + # there's no way to know what scale the caller has in mind, just create + # something simple that covers the data. + + # make the bins cover a unit width, or as small an integer width as + # possible without the individual bin width being zero relative to xscale. + # put the left edge on an integer or half integer below xmin, with the data in the middle 50% of the bin. + # put the right edge similarly above xmax. + bin_range = max(1, ceil(nbins * eps(xscale))) + left_edge = floor(2 * (xmin - bin_range ./ 4)) / 2 + right_edge = ceil(2 * (xmax + bin_range ./ 4)) / 2 + + bin_width = (right_edge - left_edge) ./ nbins + nbins_actual = nbins + end + + if !isfinite(bin_width) + # if binWidth overflows, don't worry about nice bin edges anymore + edges = LinRange(left_edge, right_edge, nbins_actual + 1) + else + edges = union( + left_edge, left_edge .+ (1:(nbins_actual-1)) .* bin_width, right_edge) + step = round(minimum(diff(edges)), digits = 8) + edges = range(edges[1], edges[end], step = step) + end + + return edges +end + +function histcounts( + feature::Vector{Float64}, + hist_bins::Int64, +) + xmin, xmax = extrema(filter(x -> !isnan(x), feature)) + range_feature = xmax - xmin + raw_bins_width = range_feature / hist_bins + + edges = binpicker(xmin, xmax, hist_bins, raw_bins_width) + + n, _ = histcountindices(feature, edges) + + return n, edges +end \ No newline at end of file diff --git a/src/utils/speech_detector.jl b/src/utils/speech_detector.jl index 2a69312..28bebda 100644 --- a/src/utils/speech_detector.jl +++ b/src/utils/speech_detector.jl @@ -1,128 +1,8 @@ -function moving_mean( - x::Vector{Float64}, - w::Int64, -) - # w must be odd! - x_length = size(x, 1) - m = zeros(x_length) - pad = Int(floor(w / 2)) - for i in eachindex(x) - index1 = i - pad < 1 ? 1 : i - pad - index2 = i + pad > x_length ? x_length : i + pad - m[i] = median(x[index1:index2]) - end - return m -end - -function binpicker( - xmin::Float64, - xmax::Float64, - nbins::Int64, - raw_bins_width::Float64, -) - xscale = max(abs(xmin), abs(xmax)) - xrange = xmax - xmin - raw_bins_width = max(raw_bins_width, eps(xscale)) - - # if the data are not constant, place the bins at "nice" locations - if xrange > max(sqrt(eps(Float64)) * xscale, floatmin(Float64)) - # choose the bin width as a "nice" value. - pow_of_ten = 10 .^ floor(log10(raw_bins_width)) # next lower power of 10 - rel_size = raw_bins_width / pow_of_ten # guaranteed in [1, 10) - - # automatic rule specified - # if isempty(nbins) nbins viene passato quindi questo codice non serve - # if rel_size < 1.5 - # bin_width = 1*pow_of_ten - # elseif rel_size < 2.5 - # bin_width = 2*pow_of_ten - # elseif rel_size < 4 - # bin_width = 3*pow_of_ten - # elseif rel_size < 7.5 - # bin_width = 5*pow_of_ten - # else - # bin_width = 10*pow_of_ten - # end - - # put the bin edges at multiples of the bin width, covering x. - # the actual number of bins used may not be exactly equal to the requested rule. - # left_edge = max(min(bin_width*floor(xmin ./ bin_width), xmin),-realmax(class(xmax))) - # nbinsActual = max(1, ceil((xmax-left_edge) ./ bin_width)) - # rightEdge = min(max(left_edge + nbinsActual.*bin_width, xmax),realmax(class(xmax))) - - # number of bins specified - # else - # temporarily set a raw bin_width to a nice power of 10. - # bin_width will be set again to a different value if more than 1 bin. - bin_width = pow_of_ten * floor(rel_size) - # set the left edge at multiples of the raw bin width. - # then adjust bin width such that all bins are of the same size and xmax fall into the rightmost bin. - left_edge = max(min(bin_width * floor(xmin ./ bin_width), xmin), -floatmax(Float64)) - if nbins > 1 - ll = (xmax - left_edge) / nbins # bin_width lower bound, xmax - # on right edge of last bin - ul = (xmax - left_edge) / (nbins - 1) # bin_width upper bound, - # xmax on left edge of last bin - p10 = 10^floor(log10(ul - ll)) - bin_width = p10 * ceil(ll ./ p10) # bin_width-ll < p10 <= ul-ll - # thus, bin_width < ul - end - - nbins_actual = nbins - right_edge = min( - max(left_edge + nbins_actual .* bin_width, xmax), floatmax(Float64)) - # end - - else # the data are nearly constant - # for automatic rules, use a single bin. - if isempty(nbins) - nbins = 1 - end - - # there's no way to know what scale the caller has in mind, just create - # something simple that covers the data. - - # make the bins cover a unit width, or as small an integer width as - # possible without the individual bin width being zero relative to xscale. - # put the left edge on an integer or half integer below xmin, with the data in the middle 50% of the bin. - # put the right edge similarly above xmax. - bin_range = max(1, ceil(nbins * eps(xscale))) - left_edge = floor(2 * (xmin - bin_range ./ 4)) / 2 - right_edge = ceil(2 * (xmax + bin_range ./ 4)) / 2 - - bin_width = (right_edge - left_edge) ./ nbins - nbins_actual = nbins - end - - if !isfinite(bin_width) - # if binWidth overflows, don't worry about nice bin edges anymore - edges = LinRange(left_edge, right_edge, nbins_actual + 1) - else - edges = union( - left_edge, left_edge .+ (1:(nbins_actual-1)) .* bin_width, right_edge) - step = round(minimum(diff(edges)), digits = 8) - edges = range(edges[1], edges[end], step = step) - end - - return edges -end - -function histcounts( - feature::Vector{Float64}, - hist_bins::Int64, -) - edgestransposed = false - - xmin = minimum(feature) - xmax = maximum(feature) - range_feature = xmax - xmin - raw_bins_width = range_feature / hist_bins - - edges = binpicker(xmin, xmax, hist_bins, raw_bins_width) - - n, _ = histcountindices(feature, edges) - - return n, edges +function moving_mean(x::AbstractVector{Float64}, w::Int64) + @assert isodd(w) "Window size must be odd" + n = length(x) + pad = w ÷ 2 + map(i -> median(@view x[max(1, i - pad):min(n, i + pad)]), 1:n) end function f_peaks(n::Vector{T}) where {T <: AbstractFloat} @@ -210,103 +90,82 @@ function get_threshs_from_feature( return M1, M2 end -function spectral_spread( - x::Vector{Float64}, - sr::Int64; - fft_length::Int64, - window_length::Int64, - overlap_length::Int64, - window_norm::Bool = true, - spectrum_type::Symbol = :magnitude, -) - X = audio_features_obj( - x, sr, - fft_length = fft_length, - window_length = window_length, - overlap_length = overlap_length, - window_norm = window_norm, - spectrum_type = spectrum_type, - ) - s = X.get_lin_spec() - - freq = X.data.lin_frequencies - - sum_x1 = vec(sum(s, dims = 1)) - spectral_centroid = vec(sum(s .* freq, dims = 1) ./ sum_x1') - spectral_centroid = replace!(spectral_centroid, NaN => 0) - higher_moment_tmp = freq .- spectral_centroid' - - spectral_spread = vec(sqrt.(sum((higher_moment_tmp .^ 2) .* s, dims = 1) ./ sum_x1')) - - return spectral_spread -end - function speech_detector( - x_in::AbstractVector{Float64}, - sr::Int64; #thresholds + x::AbstractVector{Float64}, + sr::Int64; + window_type::Tuple{Symbol, Symbol} = (:hann, :periodic), + window_length::Int64 = round(Int, 0.03 * sr), + overlap_length::Int64 = 0, + thresholds::Tuple{Float64, Float64} = (-Inf, -Inf), + merge_distance::Int64 = window_length * 5, ) - window, _ = gencoswin(:hann, Int(round(0.03 * sr)), :periodic) - frame_length = size(window, 1) - merge_distance = frame_length * 5 - - ## helper detect speech - W = 5 # weight for finding local maxima - bins = 15 # number of bins for histograms - spectral_spread_threshold = 0.05 # threshold used for not counting spectral spread when under this energy - lower_spread_threshold_factor = 0.8 # factor to lower the spectral spread threshold. - smoothing_filter_length = 5 # after getting spectral spread and energy data, these features are smoothed by filters of this length - - #----------------------------------------------------------------------------------# - # step 1: extract short-term spectral spread and energy from whole signal # - #----------------------------------------------------------------------------------# - sig_max = maximum(abs.(x_in)) + # ---------------------------------------------------------------------------------- # + # parameters # + # ---------------------------------------------------------------------------------- # + # options = SdOptions(threshold, merge_distance) + + weight = 5 # weight for finding local maxima + bins = 15 # number of bins for histograms + spread_threshold = 0.05 # threshold used for not counting spectral spread when under this energy + lower_spread_threshold_factor = 0.8 # factor to lower the spectral spread threshold. + smoothing_filter_length = 5 # after getting spectral spread and energy data, + # these features are smoothed by filters of this length + + # ---------------------------------------------------------------------------------- # + # step 1: extract short-term spectral spread and energy from whole signal # + # ---------------------------------------------------------------------------------- # - x = deepcopy(x_in) # normalize - if sig_max > 0 - x = x ./ sig_max - end - - # buffer signal - frames = buffer(x, frame_length, frame_length) + x = normalize_audio(x) - # determine short term energy - energy = vec(window' .^ 2 * frames .^ 2) - # filter the short term energy twice - filtered_energy = moving_mean( - moving_mean(energy, smoothing_filter_length), smoothing_filter_length) - # get spectral spread - spec_spread = spectral_spread( + # get stft + frames, window = _get_frames( x, + window_type = window_type, + window_length = window_length, + overlap_length = overlap_length, + ) + s, sfreq = _get_stft( + frames .* window, sr, - fft_length = 2 * frame_length, - window_length = frame_length, - overlap_length = 0, + fft_length = 2 * window_length, + window = window, + frequency_range = (0, floor(Int, sr / 2)), spectrum_type = :magnitude, ) - # normalize the feature - spec_spread = spec_spread / (sr / 2) + + # determine short term energy + energy = vec(window' .^ 2 * frames .^ 2) + + # filter the short term energy twice + filtered_energy = moving_mean(moving_mean(energy, smoothing_filter_length), smoothing_filter_length) + + # get spectral spread and normalize + spread = _get_spec_spread(s, sfreq) / (sr / 2) + # set spectral spread value to 0 for frames with low energy - spec_spread[energy. windowing > stft +buffer_3, win_3 = get_frames(x) +stft_3, stft_freq_3 = get_stft(buffer_3, sr, win_3) + +# example 4, utilizing audio_obj +audio_4 = audio_obj(x, sr) +get_stft!(audio_4) +# display(audio_4.data.stft) + +# example 5, utilizing audio_obj with args +audio_5 = audio_obj(x, + sr, + fft_length = 512, + spectrum_type = :magnitude, + window_type = (:hamming, :symmetric), + frequency_range = (100, 3000)) +get_stft!(audio_5) +# display(audio_5.data.stft) + +# example 6, utilizing audio_obj modular +audio_6 = audio_obj(x, sr) +get_frames!(audio_6) +get_stft!(audio_6) +display(audio_6.data.stft)