mfcc added

aclai-lab · Nov 13, 2023 · ad2954c · ad2954c
1 parent ad53437
commit ad2954c
Show file tree

Hide file tree

Showing 4 changed files with 379 additions and 33 deletions.
diff --git a/src/mfcc.jl b/src/mfcc.jl
@@ -0,0 +1,312 @@
+using FFTW
+using DSP
+
+include("windowing.jl")
+
+mutable struct mfccOptions{T<:Real}
+    window::Vector{Any}
+    windowLength::Vector{Any}
+    overlapLength::Int64
+    FFTLength::Int64
+    bandEdges::AbstractVector{T}
+    numCoeffs::Int64
+    rectification::Symbol
+    logEnergy::Symbol
+    deltaWindowLength::Int64
+end
+
+function getDefaultBandEdges()
+    # default band edges
+    factor = 133.33333333333333
+    bE = zeros(1, 42)
+    for i in 1:13
+        bE[i] = factor + (factor / 2) * (i - 1)
+    end
+    for i in 14:42
+        bE[i] = bE[i-1] * 1.0711703
+    end
+
+    return vec(bE)
+end
+
+function designMelFilterBank(
+    sr::Int64,
+    bandEdges::AbstractVector{T},
+    NFFT::Int64,
+    sumExponent::Int64,
+    normalization::Symbol,
+    designDomain::Symbol,
+    dataType::DataType,
+    keepTwoSided::Bool,
+    FilterBankDesignDomain::Symbol,
+    melStyle::Symbol
+) where {T<:AbstractFloat}
+
+    zeroCast = convert(dataType, 0)
+    oneCast = convert(dataType, 1)
+    twoCast = convert(dataType, 2)
+    threeCast = convert(dataType, 3)
+    NFFTCast = convert(dataType, NFFT)
+    bandEdgesCast = convert.(dataType, bandEdges)
+    fsCast = convert(dataType, sr)
+
+    # Determine the number of bands
+    numEdges = convert(dataType, length(bandEdgesCast))
+    numBands = numEdges - twoCast
+
+    # Determine the number of valid bands
+    validNumEdges = sum((bandEdgesCast .- (fsCast / twoCast)) .< sqrt(eps(dataType)))
+    validNumBands = validNumEdges - twoCast
+
+    # Preallocate the filter bank
+    filterBank = zeros(dataType, NFFT, Int(numBands))
+    centerFrequencies = bandEdgesCast[2:end-1]
+
+    # Set this flag to true if the number of FFT length is insufficient to
+    # compute the specified number of mel bands
+    FFTLengthTooSmall = false
+
+    if (designDomain == :bin)
+
+    else # designDomain = :Hz
+        linFq = collect(zeroCast:NFFTCast-oneCast) / NFFTCast * fsCast
+
+        # Determine inflection points
+        @assert(validNumEdges <= numEdges)
+        p = zeros(dataType, validNumEdges, 1)
+
+        # da valutarne il risultato se Float64
+        for edgeNumber in 1:validNumEdges
+            for index in eachindex(linFq)
+                if linFq[index] > bandEdgesCast[edgeNumber]
+                    p[edgeNumber] = index
+                    break
+                end
+            end
+        end
+
+        bandEdgesCastMod = bandEdgesCast
+        FqMod = linFq
+
+        if (FilterBankDesignDomain == :mel || FilterBankDesignDomain == :warped)
+            #da fare!!!
+        elseif (FilterBankDesignDomain == :bark)
+            # da fare
+        end
+
+        bw = diff(bandEdgesCastMod)
+
+        for k in 1:Int(validNumBands)
+            for j in Int(p[k]):Int(p[k+1])-1
+                filterBank[j, k] = (FqMod[j] - bandEdgesCastMod[k]) / bw[k]
+            end
+
+            for j = Int(p[k+1]):Int(p[k+2])-1
+                filterBank[j, k] = (bandEdgesCastMod[k+2] - FqMod[j]) / bw[k+1]
+            end
+            emptyRange1 = p[k] .> p[k+1] - 1
+            emptyRange2 = p[k+1] .> p[k+2] - 1
+            if (!FFTLengthTooSmall && (emptyRange1 || emptyRange2))
+                FFTLengthTooSmall = true
+            end
+        end
+
+        # Apply normalization
+        weightPerBand = ones(dataType, 1, Int(numBands))
+
+        if (normalization == :area)
+            # da fare
+        elseif (normalization == :bandwidth)
+            filterBandWidth = bandEdgesCast[3:end] .- bandEdgesCast[1:end-2]
+            weightPerBand = filterBandWidth / 2
+        end
+
+        for i in 1:Int(numBands)
+            filterBank[:, i] = filterBank[:, i] / weightPerBand[i]
+        end
+
+        # if keepTwoSided
+        #     range = audio.internal.getOnesidedFFTRange(NFFT);
+        #     range = range(2:end);
+        #     filterBank(end:-1:end-numel(range)+1,:) = filterBank(range,:);
+        # end
+    end
+    return filterBank
+end # function designMelFilterBank
+
+function createDCTmatrix(
+    numCoeffs::Int64,
+    numFilters::Int64,
+    DT::DataType
+)
+    # create DCT matrix
+    N = convert(DT, numCoeffs)
+    K = numFilters
+    matrix = zeros(DT, Int(N), numFilters)
+    A = sqrt(1 / K)
+    B = sqrt(2 / K)
+    C = 2 * K
+    piCCast = convert(DT, 2 * pi / C)
+
+    matrix[1, :] .= A
+    for k in 1:K
+        for n in 2:Int(N)
+            matrix[n, k] = B * cos(piCCast * (n - 1) * (k - 0.5))
+        end
+    end
+
+    return matrix
+end
+
+function cepstralCoefficients(
+    S::AbstractMatrix{T},
+    numCoeffs::Int64,
+    rectification::Symbol
+) where {T<:AbstractFloat}
+    DT = eltype(S)
+    # Rectify
+    if (rectification == :log)
+        amin = floatmin(DT)
+        S[S.==0] .= amin
+        S = log10.(S)
+    end
+    # Reshape spectrogram to matrix for vectorized matrix multiplication
+    L, M = size(S)
+    N = 1
+    S = reshape(S, L, M * N)
+    # Design DCT matrix
+    DCTmatrix = createDCTmatrix(numCoeffs, L, DT)
+    # Apply DCT matrix
+    coeffs = DCTmatrix * S
+
+    return permutedims(coeffs, [2 1])
+
+    # # Unpack matrix back to 3d array DA EVITARE già da prima
+    # coeffs = reshape(coeffs,numCoeffs,M,N)
+    # # Put time along the first dimension
+    # coeffs = permutedims(coeffs,[2 1 3])
+end # function cepstralCoefficients
+
+function audioDelta(
+    x::AbstractMatrix{T},
+    windowLength::Int64
+) where {T<:AbstractFloat}
+
+    DT = eltype(x)
+    M = convert(DT, floor(windowLength / 2))
+    b = collect(M:-1:-M) ./ sum((1:M) .^ 2)
+
+    return delta = filt(b, 1, x)
+end
+
+function mfcc(
+    x::Union{AbstractVector{T},AbstractArray{T}},
+    sr::Int64;
+    numCoeffs::Int64=13,
+    rectification::Symbol=:log,
+    logEnergy::Symbol=:append,
+    overlapLength::Int64=Int(round(0.02 * sr)),
+    FFTLength::Int64=Int(round(0.02 * sr))
+) where {T<:AbstractFloat}
+
+    x = convert.(Float64, x) # mantiene compatibilità con Matlab che scricchiola con Float32, da verificare se è il caso.
+    # costruisco struttura deltaWindowLength
+    window = []
+    windowLength = []
+    bandEdges = getDefaultBandEdges()
+    deltaWindowLength = 9
+
+    options = mfccOptions(
+        window,
+        windowLength,
+        overlapLength,
+        FFTLength,
+        bandEdges,
+        numCoeffs,
+        rectification,
+        logEnergy,
+        deltaWindowLength
+    )
+
+    # win, winLength, overlapLength = iGetWindow(x, sr, options.window, options.windowLength, options.overlapLength)
+    win, hL = gencoswin(:hamming, Int(round(0.03 * sr)), :periodic)
+    win = convert.(eltype(x), win)
+    winLength = length(win)
+
+    # Convenience variables
+    r = size(x, 1)
+    DT = eltype(x)
+    hopLength = winLength - overlapLength
+
+    # y = windowing(x, FFTLength, :hamming, :periodic, true)
+    numChan = size(x, 2)
+    nHops = Integer(floor((r - FFTLength) / hopLength) + 1) # numero delle window necessarie
+    y = buffer(x, FFTLength, hopLength, nHops, 1)
+
+    # log energy
+    if (options.logEnergy != :ignore)
+        E = sum(eachrow(y .^ 2)) # somma per righe
+        E[E.==0] .= floatmin(DT) # il minimo float al posto di zero
+        logE = log.(E)
+    end
+
+    # Multiply each segment by the window, and take the FFT
+    wincast = convert.(DT, win[:]) # casta al tipo originale del file audio
+    Z = abs.(fft(y .* wincast, (1,)))
+
+    # Design the mel filter bank
+    numValidEdges = sum(options.bandEdges .<= convert(DT, sr) / convert(DT, 2))
+    if (numValidEdges < length(options.bandEdges))
+        edges = options.bandEdges[1:numValidEdges+1]
+        edges[end] = convert(DT, sr) / convert(DT, 2)
+    else
+        edges = options.bandEdges
+    end
+    filterBank = designMelFilterBank(sr, edges, FFTLength, 1, :bandwidth, :Hz, DT, false, :linear, :default)
+    # Calculate cepstral coefficients
+    coeffs = cepstralCoefficients(filterBank' * Z, options.numCoeffs, options.rectification)
+
+    # fermi a 177
+
+    if (options.logEnergy == :append)
+        # coeffs = (logE',coeffs)
+        coeffs = hcat(logE, coeffs)
+    elseif (options.logEnergy == :replace)
+        # coeffs = [logE.',coeffs(:,2:end)];
+        # da fare
+    end
+
+    # coeffs = reshape(coeffs, Int(size(coeffs, 1) / numChan), numChan, size(coeffs, 2))
+    # coeffs = permutedims(coeffs,[1 3 2])
+
+    # METTERE IL CASO CHE le delta vengono calcolate solo se necessario
+    delta = audioDelta(coeffs, options.deltaWindowLength)
+    deltaDelta = audioDelta(delta, options.deltaWindowLength)
+    loc = collect(0:(nHops-1)) * hopLength .+ winLength
+
+    return coeffs, delta, deltaDelta, loc
+end
+
+## Main
+# if (!@isdefined(x)) # carica solo se non è definita mono, serve giusto per debug
+#     # cosi ogni volta che lancio non ricarica tutto
+#     using PyCall
+#     librosa = pyimport("librosa")
+#     sr_src = 8000
+#     x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true)
+# end
+# numCoeffs = 13
+# rectification = :log
+# logEnergy = :append
+# overlapLength = Int(round(0.02 * sr))
+# FFTLength = Int(round(0.03 * sr))
+
+# coeffs, delta, deltaDelta, loc = mfcc(
+#     x,
+#     sr,
+#     numCoeffs=numCoeffs,
+#     rectification=rectification,
+#     logEnergy=logEnergy,
+#     overlapLength=overlapLength,
+#     FFTLength=FFTLength
+# )
diff --git a/src/test_wavelet.jl b/src/test_wavelet.jl
@@ -0,0 +1,34 @@
+using DSP
+using MFCC
+# using Plots
+
+include("wpspectrum.jl")
+include("cwt.jl")
+
+## main
+using PyCall
+librosa = pyimport("librosa")
+soundfile = pyimport("soundfile")
+plt = pyimport("matplotlib.pyplot")
+sr_src = 8000
+x, sr = librosa.load("/home/riccardopasini/Documents/Julia/test_mono.wav", sr=sr_src, mono=true)
+
+depth = 2
+wname = "sym6"
+entType=:shannon
+entPar=0.0
+
+t = wpdec(x, depth, wname, entType, entPar)
+
+spec, freqs, times = wpspectrum(x, sr, depth, wname)
+
+soundfile.write("test1.wav",spec[1,:],samplerate=sr,subtype="PCM_16")
+soundfile.write("test2.wav",spec[2,:],samplerate=sr,subtype="PCM_16")
+soundfile.write("test3.wav",spec[3,:],samplerate=sr,subtype="PCM_16")
+soundfile.write("test4.wav",spec[4,:],samplerate=sr,subtype="PCM_16")
+
+# heatmap(1:size(spec, 1),
+#     1:size(spec, 2), spec,
+#     c=cgrad([:blue, :white, :red, :yellow]),
+#     xlabel="x values", ylabel="y values",
+#     title="My title")
diff --git a/src/test_wavelet_mfcc.jl b/src/test_wavelet_mfcc.jl
@@ -80,7 +80,7 @@ using PyCall
 librosa = pyimport("librosa")
 plt = pyimport("matplotlib.pyplot")
 sr_src = 8000
-x, sr = librosa.load("/home/paso/Documents/Julia_additional_files/test.wav", sr=sr_src, mono=true)
+x, sr = librosa.load("/home/riccardopasini/Documents/Julia/test_mono.wav", sr=sr_src, mono=true)
 
 numcep = 13
 lifterexp = -22
@@ -95,7 +95,7 @@ fbtype = :htkmel
 usecmp = false
 modelorder = 0
 
-depth = 6
+depth = 2
 wname = "sym6"
 spec, freqs, times = wpspectrum(x, sr, depth, wname)
 
@@ -104,14 +104,14 @@ a = wmfcc(spec, sr)
 
 b, z, zz = mfcc(x, sr)
 
-# heatmap(1:size(a, 1),
-#     1:size(a, 2), a,
-#     c=cgrad([:blue, :white, :red, :yellow]),
-#     xlabel="x values", ylabel="y values",
-#     title="My title")
-
-heatmap(1:size(b, 1),
-    1:size(b, 2), b,
+heatmap(1:size(a, 1),
+    1:size(a, 2), a,
     c=cgrad([:blue, :white, :red, :yellow]),
     xlabel="x values", ylabel="y values",
-    title="My title")
+    title="My title")
+
+# heatmap(1:size(b, 1),
+#     1:size(b, 2), b,
+#     c=cgrad([:blue, :white, :red, :yellow]),
+#     xlabel="x values", ylabel="y values",
+#     title="My title")