-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
different speech/silence thresholds (#19)
* Different speech/silence thresholds: use different VAD probability thresholds for speaking and silence. Also add multiple VAD checks when the probability is close to the speaking/silence thresholds. * analyze-files example: add support for processing individual files and outputting VAD timestamps, which can be used to evaluate VAD parameters.
- Loading branch information
Showing
9 changed files
with
205 additions
and
128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
const fs = require("fs"); | ||
const path = require("path"); | ||
const { SpeechRecorder } = require("../dist/index"); | ||
|
||
/**
 * Computes the q-th quantile of a list of numbers, linearly interpolating
 * between the two nearest ranks of the sorted values.
 *
 * @param {number[]} elements - Values to analyze. The array is not modified.
 * @param {number} q - Quantile to compute, expected in [0, 1] (0.5 is the median).
 * @returns {number} The interpolated quantile, or NaN when `elements` is empty.
 */
const quantile = (elements, q) => {
  if (elements.length === 0) {
    return NaN;
  }

  // Sort a copy: Array.prototype.sort works in place and would otherwise
  // reorder the caller's array as a side effect.
  const sorted = [...elements].sort((a, b) => a - b);
  const position = (sorted.length - 1) * q;
  const base = Math.floor(position);
  const rest = position - base;
  const next = sorted[base + 1];

  // At the last rank there is no upper neighbour to interpolate towards.
  if (next === undefined) {
    return sorted[base];
  }

  return sorted[base] + rest * (next - sorted[base]);
};
|
||
// Both the directory of .wav files and the JSON labels file are required.
if (process.argv.length < 4) {
  console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels");
  process.exit(1);
}

const sampleRate = 16000;

// Maps each processed file name to { speech: [[start, stop], ...] }, in seconds.
const results = {};

// Parsed labels file, indexed by file name; each entry is read via `.speech`.
// NOTE(review): a non-empty `.speech` is treated below as [start, stop] in
// seconds -- confirm against the labels file format.
const labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8"));

fs.readdir(process.argv[2], async (error, files) => {
  if (error) {
    console.error(`Could not read directory ${process.argv[2]}: ${error.message}`);
    process.exit(1);
  }

  // Run the recorder over every .wav file, one at a time, collecting the
  // timestamps at which it reports a speech chunk starting and ending.
  for (const file of files) {
    if (!file.endsWith(".wav")) {
      continue;
    }

    let samples = 0;
    const segments = [];
    results[file] = { speech: segments };

    const recorder = new SpeechRecorder();
    await recorder.processFile(path.join(process.argv[2], file), {
      onAudio: (audio) => {
        // Assumes 16-bit samples (two bytes each) -- TODO confirm.
        samples += audio.length / 2;
      },

      onChunkStart: () => {
        segments.push([samples / sampleRate]);
      },

      onChunkEnd: () => {
        segments[segments.length - 1].push(samples / sampleRate);
      },
    });
  }

  const speechWindowTooSmall = [];
  const noiseWasSpeech = [];
  const extra = [];
  let noise = 0;
  let speech = 0;

  for (const name of Object.keys(results)) {
    const entry = labels[name];
    if (entry == null) {
      // A .wav file without a label cannot be scored; skip it rather than crash.
      console.warn(`No label found for ${name}; skipping.`);
      continue;
    }

    const label = entry.speech;
    const result = results[name].speech;

    if (label.length === 0) {
      noise++;
    } else {
      speech++;
    }

    if (label.length === 0 && result.length > 0) {
      console.log("Noise was speech:", name);
      console.log("VAD:", result);
      noiseWasSpeech.push(name);
    }

    // NOTE(review): a labelled speech file for which the VAD detected nothing
    // (result.length === 0) is not counted in any bucket -- confirm this is intended.
    if (label.length > 0 && result.length > 0) {
      const start = Math.min(...result.map((segment) => segment[0]));
      const stop = Math.max(...result.map((segment) => segment[1]));

      // A segment that never received its end timestamp yields NaN here.
      if (Number.isNaN(start) || Number.isNaN(stop)) {
        continue;
      }

      const tolerance = 0.05;
      // The 0.4 offset presumably allows for VAD onset latency -- TODO confirm.
      if (start - 0.4 > label[0] + tolerance || stop < label[1] - tolerance) {
        console.log("Speech window too small:", name);
        console.log("Label:", label);
        console.log("VAD:", result, start, stop);
        speechWindowTooSmall.push(name);
      } else if (stop > label[1]) {
        extra.push(stop - label[1]);
      }
    }
  }

  console.log(
    `\nSpeech window too small: ${(speechWindowTooSmall.length / speech).toFixed(2)} (${
      speechWindowTooSmall.length
    } / ${speech})`
  );

  console.log(
    `Noise was speech: ${(noiseWasSpeech.length / noise).toFixed(2)} (${
      noiseWasSpeech.length
    } / ${noise})`
  );

  // With no measurements, the average, quantiles and max are undefined;
  // reduce() without an initial value would also throw on an empty array.
  if (extra.length === 0) {
    console.log("No extra speech measurements.");
    return;
  }

  const totalExtra = extra.reduce((sum, value) => sum + value, 0);
  console.log(`Average extra speech: ${(totalExtra / extra.length).toFixed(2)}`);
  console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`);
  // The line is labelled p90, so compute the 0.9 quantile (it previously used 0.75).
  console.log(`p90 extra speech: ${quantile(extra, 0.9).toFixed(2)}`);
  console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`);
});
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
const fs = require("fs"); | ||
const { SpeechRecorder } = require("../dist/index"); | ||
const { WaveFile } = require("wavefile"); | ||
|
||
// The output .wav path is the only required argument.
if (process.argv.length < 3) {
  console.log("Usage: node record.js /path/to/output.wav");
  process.exit(1);
}

// Record five seconds of mono 16-bit audio at 16 kHz.
const recordingSampleRate = 16000;
const recordingSeconds = 5;
const targetSamples = recordingSampleRate * recordingSeconds;

const buffer = [];
const recorder = new SpeechRecorder();
console.log("Ready...");
setTimeout(() => {
  console.log("Go!");
  recorder.start({
    onAudio: (audio) => {
      // Decode little-endian 16-bit samples; stop before a trailing odd byte,
      // which readInt16LE could not read.
      for (let i = 0; i + 1 < audio.length; i += 2) {
        buffer.push(audio.readInt16LE(i));
      }

      // Use >= rather than an exact match: if chunk sizes do not divide the
      // target evenly, the buffer would step past it and never stop.
      if (buffer.length >= targetSamples) {
        const wav = new WaveFile();
        wav.fromScratch(1, recordingSampleRate, "16", buffer.slice(0, targetSamples));
        fs.writeFileSync(process.argv[2], wav.toBuffer());
        // The recording was written successfully, so exit with a success code.
        process.exit(0);
      }
    },
  });
}, 1000);
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.