-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ad53437
commit ad2954c
Showing
4 changed files
with
379 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
using FFTW | ||
using DSP | ||
|
||
include("windowing.jl") | ||
|
||
mutable struct mfccOptions{T<:Real} | ||
window::Vector{Any} | ||
windowLength::Vector{Any} | ||
overlapLength::Int64 | ||
FFTLength::Int64 | ||
bandEdges::AbstractVector{T} | ||
numCoeffs::Int64 | ||
rectification::Symbol | ||
logEnergy::Symbol | ||
deltaWindowLength::Int64 | ||
end | ||
|
||
function getDefaultBandEdges() | ||
# default band edges | ||
factor = 133.33333333333333 | ||
bE = zeros(1, 42) | ||
for i in 1:13 | ||
bE[i] = factor + (factor / 2) * (i - 1) | ||
end | ||
for i in 14:42 | ||
bE[i] = bE[i-1] * 1.0711703 | ||
end | ||
|
||
return vec(bE) | ||
end | ||
|
||
function designMelFilterBank( | ||
sr::Int64, | ||
bandEdges::AbstractVector{T}, | ||
NFFT::Int64, | ||
sumExponent::Int64, | ||
normalization::Symbol, | ||
designDomain::Symbol, | ||
dataType::DataType, | ||
keepTwoSided::Bool, | ||
FilterBankDesignDomain::Symbol, | ||
melStyle::Symbol | ||
) where {T<:AbstractFloat} | ||
|
||
zeroCast = convert(dataType, 0) | ||
oneCast = convert(dataType, 1) | ||
twoCast = convert(dataType, 2) | ||
threeCast = convert(dataType, 3) | ||
NFFTCast = convert(dataType, NFFT) | ||
bandEdgesCast = convert.(dataType, bandEdges) | ||
fsCast = convert(dataType, sr) | ||
|
||
# Determine the number of bands | ||
numEdges = convert(dataType, length(bandEdgesCast)) | ||
numBands = numEdges - twoCast | ||
|
||
# Determine the number of valid bands | ||
validNumEdges = sum((bandEdgesCast .- (fsCast / twoCast)) .< sqrt(eps(dataType))) | ||
validNumBands = validNumEdges - twoCast | ||
|
||
# Preallocate the filter bank | ||
filterBank = zeros(dataType, NFFT, Int(numBands)) | ||
centerFrequencies = bandEdgesCast[2:end-1] | ||
|
||
# Set this flag to true if the number of FFT length is insufficient to | ||
# compute the specified number of mel bands | ||
FFTLengthTooSmall = false | ||
|
||
if (designDomain == :bin) | ||
|
||
else # designDomain = :Hz | ||
linFq = collect(zeroCast:NFFTCast-oneCast) / NFFTCast * fsCast | ||
|
||
# Determine inflection points | ||
@assert(validNumEdges <= numEdges) | ||
p = zeros(dataType, validNumEdges, 1) | ||
|
||
# da valutarne il risultato se Float64 | ||
for edgeNumber in 1:validNumEdges | ||
for index in eachindex(linFq) | ||
if linFq[index] > bandEdgesCast[edgeNumber] | ||
p[edgeNumber] = index | ||
break | ||
end | ||
end | ||
end | ||
|
||
bandEdgesCastMod = bandEdgesCast | ||
FqMod = linFq | ||
|
||
if (FilterBankDesignDomain == :mel || FilterBankDesignDomain == :warped) | ||
#da fare!!! | ||
elseif (FilterBankDesignDomain == :bark) | ||
# da fare | ||
end | ||
|
||
bw = diff(bandEdgesCastMod) | ||
|
||
for k in 1:Int(validNumBands) | ||
for j in Int(p[k]):Int(p[k+1])-1 | ||
filterBank[j, k] = (FqMod[j] - bandEdgesCastMod[k]) / bw[k] | ||
end | ||
|
||
for j = Int(p[k+1]):Int(p[k+2])-1 | ||
filterBank[j, k] = (bandEdgesCastMod[k+2] - FqMod[j]) / bw[k+1] | ||
end | ||
emptyRange1 = p[k] .> p[k+1] - 1 | ||
emptyRange2 = p[k+1] .> p[k+2] - 1 | ||
if (!FFTLengthTooSmall && (emptyRange1 || emptyRange2)) | ||
FFTLengthTooSmall = true | ||
end | ||
end | ||
|
||
# Apply normalization | ||
weightPerBand = ones(dataType, 1, Int(numBands)) | ||
|
||
if (normalization == :area) | ||
# da fare | ||
elseif (normalization == :bandwidth) | ||
filterBandWidth = bandEdgesCast[3:end] .- bandEdgesCast[1:end-2] | ||
weightPerBand = filterBandWidth / 2 | ||
end | ||
|
||
for i in 1:Int(numBands) | ||
filterBank[:, i] = filterBank[:, i] / weightPerBand[i] | ||
end | ||
|
||
# if keepTwoSided | ||
# range = audio.internal.getOnesidedFFTRange(NFFT); | ||
# range = range(2:end); | ||
# filterBank(end:-1:end-numel(range)+1,:) = filterBank(range,:); | ||
# end | ||
end | ||
return filterBank | ||
end # function designMelFilterBank | ||
|
||
function createDCTmatrix( | ||
numCoeffs::Int64, | ||
numFilters::Int64, | ||
DT::DataType | ||
) | ||
# create DCT matrix | ||
N = convert(DT, numCoeffs) | ||
K = numFilters | ||
matrix = zeros(DT, Int(N), numFilters) | ||
A = sqrt(1 / K) | ||
B = sqrt(2 / K) | ||
C = 2 * K | ||
piCCast = convert(DT, 2 * pi / C) | ||
|
||
matrix[1, :] .= A | ||
for k in 1:K | ||
for n in 2:Int(N) | ||
matrix[n, k] = B * cos(piCCast * (n - 1) * (k - 0.5)) | ||
end | ||
end | ||
|
||
return matrix | ||
end | ||
|
||
function cepstralCoefficients( | ||
S::AbstractMatrix{T}, | ||
numCoeffs::Int64, | ||
rectification::Symbol | ||
) where {T<:AbstractFloat} | ||
DT = eltype(S) | ||
# Rectify | ||
if (rectification == :log) | ||
amin = floatmin(DT) | ||
S[S.==0] .= amin | ||
S = log10.(S) | ||
end | ||
# Reshape spectrogram to matrix for vectorized matrix multiplication | ||
L, M = size(S) | ||
N = 1 | ||
S = reshape(S, L, M * N) | ||
# Design DCT matrix | ||
DCTmatrix = createDCTmatrix(numCoeffs, L, DT) | ||
# Apply DCT matrix | ||
coeffs = DCTmatrix * S | ||
|
||
return permutedims(coeffs, [2 1]) | ||
|
||
# # Unpack matrix back to 3d array DA EVITARE già da prima | ||
# coeffs = reshape(coeffs,numCoeffs,M,N) | ||
# # Put time along the first dimension | ||
# coeffs = permutedims(coeffs,[2 1 3]) | ||
end # function cepstralCoefficients | ||
|
||
function audioDelta( | ||
x::AbstractMatrix{T}, | ||
windowLength::Int64 | ||
) where {T<:AbstractFloat} | ||
|
||
DT = eltype(x) | ||
M = convert(DT, floor(windowLength / 2)) | ||
b = collect(M:-1:-M) ./ sum((1:M) .^ 2) | ||
|
||
return delta = filt(b, 1, x) | ||
end | ||
|
||
function mfcc( | ||
x::Union{AbstractVector{T},AbstractArray{T}}, | ||
sr::Int64; | ||
numCoeffs::Int64=13, | ||
rectification::Symbol=:log, | ||
logEnergy::Symbol=:append, | ||
overlapLength::Int64=Int(round(0.02 * sr)), | ||
FFTLength::Int64=Int(round(0.02 * sr)) | ||
) where {T<:AbstractFloat} | ||
|
||
x = convert.(Float64, x) # mantiene compatibilità con Matlab che scricchiola con Float32, da verificare se è il caso. | ||
# costruisco struttura deltaWindowLength | ||
window = [] | ||
windowLength = [] | ||
bandEdges = getDefaultBandEdges() | ||
deltaWindowLength = 9 | ||
|
||
options = mfccOptions( | ||
window, | ||
windowLength, | ||
overlapLength, | ||
FFTLength, | ||
bandEdges, | ||
numCoeffs, | ||
rectification, | ||
logEnergy, | ||
deltaWindowLength | ||
) | ||
|
||
# win, winLength, overlapLength = iGetWindow(x, sr, options.window, options.windowLength, options.overlapLength) | ||
win, hL = gencoswin(:hamming, Int(round(0.03 * sr)), :periodic) | ||
win = convert.(eltype(x), win) | ||
winLength = length(win) | ||
|
||
# Convenience variables | ||
r = size(x, 1) | ||
DT = eltype(x) | ||
hopLength = winLength - overlapLength | ||
|
||
# y = windowing(x, FFTLength, :hamming, :periodic, true) | ||
numChan = size(x, 2) | ||
nHops = Integer(floor((r - FFTLength) / hopLength) + 1) # numero delle window necessarie | ||
y = buffer(x, FFTLength, hopLength, nHops, 1) | ||
|
||
# log energy | ||
if (options.logEnergy != :ignore) | ||
E = sum(eachrow(y .^ 2)) # somma per righe | ||
E[E.==0] .= floatmin(DT) # il minimo float al posto di zero | ||
logE = log.(E) | ||
end | ||
|
||
# Multiply each segment by the window, and take the FFT | ||
wincast = convert.(DT, win[:]) # casta al tipo originale del file audio | ||
Z = abs.(fft(y .* wincast, (1,))) | ||
|
||
# Design the mel filter bank | ||
numValidEdges = sum(options.bandEdges .<= convert(DT, sr) / convert(DT, 2)) | ||
if (numValidEdges < length(options.bandEdges)) | ||
edges = options.bandEdges[1:numValidEdges+1] | ||
edges[end] = convert(DT, sr) / convert(DT, 2) | ||
else | ||
edges = options.bandEdges | ||
end | ||
filterBank = designMelFilterBank(sr, edges, FFTLength, 1, :bandwidth, :Hz, DT, false, :linear, :default) | ||
# Calculate cepstral coefficients | ||
coeffs = cepstralCoefficients(filterBank' * Z, options.numCoeffs, options.rectification) | ||
|
||
# fermi a 177 | ||
|
||
if (options.logEnergy == :append) | ||
# coeffs = (logE',coeffs) | ||
coeffs = hcat(logE, coeffs) | ||
elseif (options.logEnergy == :replace) | ||
# coeffs = [logE.',coeffs(:,2:end)]; | ||
# da fare | ||
end | ||
|
||
# coeffs = reshape(coeffs, Int(size(coeffs, 1) / numChan), numChan, size(coeffs, 2)) | ||
# coeffs = permutedims(coeffs,[1 3 2]) | ||
|
||
# METTERE IL CASO CHE le delta vengono calcolate solo se necessario | ||
delta = audioDelta(coeffs, options.deltaWindowLength) | ||
deltaDelta = audioDelta(delta, options.deltaWindowLength) | ||
loc = collect(0:(nHops-1)) * hopLength .+ winLength | ||
|
||
return coeffs, delta, deltaDelta, loc | ||
end | ||
|
||
## Main | ||
# if (!@isdefined(x)) # carica solo se non è definita mono, serve giusto per debug | ||
# # cosi ogni volta che lancio non ricarica tutto | ||
# using PyCall | ||
# librosa = pyimport("librosa") | ||
# sr_src = 8000 | ||
# x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true) | ||
# end | ||
# numCoeffs = 13 | ||
# rectification = :log | ||
# logEnergy = :append | ||
# overlapLength = Int(round(0.02 * sr)) | ||
# FFTLength = Int(round(0.03 * sr)) | ||
|
||
# coeffs, delta, deltaDelta, loc = mfcc( | ||
# x, | ||
# sr, | ||
# numCoeffs=numCoeffs, | ||
# rectification=rectification, | ||
# logEnergy=logEnergy, | ||
# overlapLength=overlapLength, | ||
# FFTLength=FFTLength | ||
# ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
using DSP | ||
using MFCC | ||
# using Plots | ||
|
||
include("wpspectrum.jl") | ||
include("cwt.jl") | ||
|
||
## main | ||
using PyCall | ||
librosa = pyimport("librosa") | ||
soundfile = pyimport("soundfile") | ||
plt = pyimport("matplotlib.pyplot") | ||
sr_src = 8000 | ||
x, sr = librosa.load("/home/riccardopasini/Documents/Julia/test_mono.wav", sr=sr_src, mono=true) | ||
|
||
depth = 2 | ||
wname = "sym6" | ||
entType=:shannon | ||
entPar=0.0 | ||
|
||
t = wpdec(x, depth, wname, entType, entPar) | ||
|
||
spec, freqs, times = wpspectrum(x, sr, depth, wname) | ||
|
||
soundfile.write("test1.wav",spec[1,:],samplerate=sr,subtype="PCM_16") | ||
soundfile.write("test2.wav",spec[2,:],samplerate=sr,subtype="PCM_16") | ||
soundfile.write("test3.wav",spec[3,:],samplerate=sr,subtype="PCM_16") | ||
soundfile.write("test4.wav",spec[4,:],samplerate=sr,subtype="PCM_16") | ||
|
||
# heatmap(1:size(spec, 1), | ||
# 1:size(spec, 2), spec, | ||
# c=cgrad([:blue, :white, :red, :yellow]), | ||
# xlabel="x values", ylabel="y values", | ||
# title="My title") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.