different speech/silence thresholds (#19)
* different speech/silence thresholds

Use different VAD probability thresholds for detecting speech and for returning to silence.
Also run multiple VAD checks, and average their probabilities, when the probability is
close to the speaking/silence threshold.
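
A minimal sketch of the resulting hysteresis, using the defaults introduced in this commit (vadSpeechThreshold = 0.3, vadSilenceThreshold = 0.1). This is an illustration of the behavior, not the library's exact code:

  const vadSpeechThreshold = 0.3; // probability needed to start a speech chunk
  const vadSilenceThreshold = 0.1; // probability needed to keep a chunk open

  let speaking = false;
  const update = (probability) => {
    // While silent, require the higher speech threshold to start; once speaking,
    // only return to silence when the probability falls below the lower threshold.
    speaking = speaking ? probability > vadSilenceThreshold : probability > vadSpeechThreshold;
    return speaking;
  };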

* analyze-files example

Add support for processing individual files and outputting VAD timestamps, which can be
used to evaluate VAD parameters.
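
For reference, the labels argument is a JSON file keyed by .wav filename, where each entry's "speech" field is either an empty array (a noise-only clip) or a [start, stop] window in seconds. This is inferred from how analyze-files.js reads the file below; the filenames here are made up:

  node analyze-files.js /path/to/wav/files /path/to/labels

  {
    "utterance.wav": { "speech": [0.8, 2.4] },
    "background-noise.wav": { "speech": [] }
  }
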
tmacwill authored Dec 8, 2021
1 parent 809fba9 commit 5fdf03f
Showing 9 changed files with 205 additions and 128 deletions.
106 changes: 106 additions & 0 deletions examples/analyze-files.js
@@ -0,0 +1,106 @@
const fs = require("fs");
const path = require("path");
const { SpeechRecorder } = require("../dist/index");

// Linear-interpolated quantile of a list of numbers (q in [0, 1]).
const quantile = (elements, q) => {
  const sorted = elements.sort((a, b) => a - b);
  const p = (sorted.length - 1) * q;
  const base = Math.floor(p);
  const rest = p - base;
  if (sorted[base + 1] !== undefined) {
    return sorted[base] + rest * (sorted[base + 1] - sorted[base]);
  } else {
    return sorted[base];
  }
};

if (process.argv.length < 4) {
  console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels");
  process.exit(1);
}

const sampleRate = 16000;
let results = {};
let labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8"));
fs.readdir(process.argv[2], async (error, files) => {
  // Run the recorder over each .wav file and collect [start, stop] speech timestamps.
  for (const file of files) {
    if (!file.endsWith(".wav")) {
      continue;
    }

    let samples = 0;
    results[file] = { speech: [] };
    const recorder = new SpeechRecorder();
    await recorder.processFile(path.join(process.argv[2], file), {
      onAudio: (audio) => {
        samples += audio.length / 2;
      },

      onChunkStart: (audio) => {
        results[file].speech.push([]);
        results[file].speech[results[file].speech.length - 1].push(samples / sampleRate);
      },

      onChunkEnd: () => {
        results[file].speech[results[file].speech.length - 1].push(samples / sampleRate);
      },
    });
  }

  // Compare the VAD output against the labels.
  let speechWindowTooSmall = [];
  let noiseWasSpeech = [];
  let noise = 0;
  let speech = 0;
  let extra = [];
  for (const i of Object.keys(results)) {
    const label = labels[i].speech;
    const result = results[i].speech;

    if (label.length == 0) {
      noise++;
    } else {
      speech++;
    }

    if (label.length == 0 && result.length > 0) {
      console.log("Noise was speech:", i);
      console.log("VAD:", result);
      noiseWasSpeech.push(i);
    }

    if (label.length > 0 && result.length > 0) {
      const start = Math.min(...result.map((e) => e[0]));
      const stop = Math.max(...result.map((e) => e[1]));
      if (isNaN(start) || isNaN(stop)) {
        continue;
      }

      const tolerance = 0.05;
      if (start - 0.4 > label[0] + tolerance || stop < label[1] - tolerance) {
        console.log("Speech window too small:", i);
        console.log("Label:", label);
        console.log("VAD:", result, start, stop);
        speechWindowTooSmall.push(i);
      } else if (stop > label[1]) {
        extra.push(stop - label[1]);
      }
    }
  }

  console.log(
    `\nSpeech window too small: ${(speechWindowTooSmall.length / speech).toFixed(2)} (${
      speechWindowTooSmall.length
    } / ${speech})`
  );

  console.log(
    `Noise was speech: ${(noiseWasSpeech.length / noise).toFixed(2)} (${
      noiseWasSpeech.length
    } / ${noise})`
  );

  console.log(`Average extra speech: ${(extra.reduce((a, b) => a + b) / extra.length).toFixed(2)}`);
  console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`);
  console.log(`p90 extra speech: ${quantile(extra, 0.9).toFixed(2)}`);
  console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`);
});
11 changes: 0 additions & 11 deletions examples/audio.js

This file was deleted.

28 changes: 0 additions & 28 deletions examples/file.js

This file was deleted.

46 changes: 0 additions & 46 deletions examples/padding.js

This file was deleted.

29 changes: 29 additions & 0 deletions examples/record.js
@@ -0,0 +1,29 @@
const fs = require("fs");
const { SpeechRecorder } = require("../dist/index");
const { WaveFile } = require("wavefile");

if (process.argv.length < 3) {
  console.log("Usage: node record.js /path/to/output.wav");
  process.exit(1);
}

let buffer = [];
const recorder = new SpeechRecorder();
console.log("Ready...");
setTimeout(() => {
  console.log("Go!");
  recorder.start({
    onAudio: (audio) => {
      // audio is a buffer of 16-bit little-endian samples; collect them until
      // we have 5 seconds at 16kHz, then write the result as a WAV file.
      for (let i = 0; i < audio.length; i += 2) {
        buffer.push(audio.readInt16LE(i));
      }

      if (buffer.length == 16000 * 5) {
        let wav = new WaveFile();
        wav.fromScratch(1, 16000, "16", buffer);
        fs.writeFileSync(process.argv[2], wav.toBuffer());
        process.exit(1);
      }
    },
  });
}, 1000);
14 changes: 0 additions & 14 deletions examples/silence.js

This file was deleted.

85 changes: 69 additions & 16 deletions index.ts
@@ -4,7 +4,6 @@ import * as os from "os";
 import { WaveFile } from "wavefile";
 import { Readable } from "stream";
 import WebrtcVad from "webrtcvad";
-import uuid from "uuid/v4";
 import SileroVad from "./vad";

 const portAudioPath = `${__dirname}/../build/Release/portaudio.node`;
@@ -67,21 +66,24 @@ export class SpeechRecorder {
   private highWaterMark: number = 64000;
   private leadingBuffer: Buffer[] = [];
   private leadingPadding: number = 20;
-  private minimumVolume: number = 200;
+  private minimumVolume: number = 1;
   private sampleRate: number = 16000;
   private speaking: boolean = false;
   private speakingThreshold: number = 1;
-  private silenceThreshold: number = 10;
+  private silenceThreshold: number = 5;
   private triggers: Trigger[] = [];
-  private webrtcVad: WebrtcVad;
   private vad = new SileroVad();
   private vadBuffer: number[] = [];
-  // 250 ms so that it's consistent with the silero python example.
-  private vadBufferSize: number = 4000;
-  private vadRateLimit: number = 5;
-  private vadLastSpeaking: boolean = false;
+  private vadBufferSize: number;
   private vadLastProbability: number = 0;
-  private vadThreshold: number = 0.75;
+  private vadLastSpeaking: boolean = false;
+  private vadRateLimit: number = 3;
+  private vadRefinementWindow: number = 2;
+  private vadSilenceThreshold: number = 0.1;
+  private vadSpeechThreshold: number = 0.3;
+  private webrtcVad: WebrtcVad;
+  private webrtcResultsBuffer: boolean[] = [];
+  private webrtcResultsBufferSize: number = 3;

   constructor(options: any = {}) {
     if (options.disableSecondPass !== undefined) {
@@ -92,6 +94,10 @@ export class SpeechRecorder {
       this.error = options.error;
     }

+    if (options.firstPassResultsBufferSize !== undefined) {
+      this.webrtcResultsBufferSize = options.firstPassResultsBufferSize;
+    }

     if (options.framesPerBuffer !== undefined) {
       this.framesPerBuffer = options.framesPerBuffer;
     }
@@ -128,11 +134,20 @@ export class SpeechRecorder {
       this.vadRateLimit = options.vadRateLimit;
     }

-    if (options.vadThreshold !== undefined) {
-      this.vadThreshold = options.vadThreshold;
+    if (options.vadRefinementWindow !== undefined) {
+      this.vadRefinementWindow = options.vadRefinementWindow;
     }

+    if (options.vadSilenceThreshold !== undefined) {
+      this.vadSilenceThreshold = options.vadSilenceThreshold;
+    }

+    if (options.vadSpeechThreshold !== undefined) {
+      this.vadSpeechThreshold = options.vadSpeechThreshold;
+    }

-    this.webrtcVad = new WebrtcVad(this.sampleRate, options.firstPassLevel || 3);
+    this.vadBufferSize = this.sampleRate / 4 + this.framesPerBuffer * this.vadRefinementWindow;
+    this.webrtcVad = new WebrtcVad(this.sampleRate, options.firstPassLevel || 2);
   }

   async load() {
@@ -159,10 +174,18 @@ export class SpeechRecorder {
       this.vadBuffer.splice(0, this.vadBuffer.length - this.vadBufferSize);
     }

-    // until we've filled up the VAD buffer, ignore the results of both VADs
     const volume = Math.floor(Math.sqrt(sum / (audio.length / 2)));
+    this.webrtcResultsBuffer.push(this.webrtcVad.process(audio));
+    if (this.webrtcResultsBuffer.length > this.webrtcResultsBufferSize) {
+      this.webrtcResultsBuffer.splice(
+        0,
+        this.webrtcResultsBuffer.length - this.webrtcResultsBufferSize
+      );
+    }

+    // until we've filled up the VAD buffer, ignore the results of both VADs
     let speaking = !!(
-      this.webrtcVad.process(audio) &&
+      this.webrtcResultsBuffer.some((e) => e) &&
       volume > this.minimumVolume &&
       this.vadBuffer.length == this.vadBufferSize
     );
@@ -172,9 +195,39 @@ export class SpeechRecorder {
     if (speaking && !this.disableSecondPass && this.vad.ready) {
       // cache values of probability and speaking for buffersUntilVad frames
       if (this.buffersUntilVad == 0) {
-        this.vadLastProbability = await this.vad.process(this.vadBuffer);
-        this.vadLastSpeaking = this.vadLastProbability > this.vadThreshold;
         this.buffersUntilVad = this.vadRateLimit;
+        this.vadLastProbability = await this.vad.process(
+          this.vadBuffer.slice(this.framesPerBuffer * this.vadRefinementWindow)
+        );

+        const tolerance = 0.15;
+        if (
+          (!this.speaking &&
+            Math.abs(this.vadLastProbability - this.vadSpeechThreshold) <
+              tolerance * this.vadSpeechThreshold) ||
+          (this.speaking &&
+            Math.abs(this.vadLastProbability - this.vadSilenceThreshold) <
+              tolerance * this.vadSilenceThreshold)
+        ) {
+          let probabilities = [];
+          for (let i = 0; i < this.vadRefinementWindow; i++) {
+            probabilities.push(
+              await this.vad.process(
+                this.vadBuffer.slice(
+                  this.framesPerBuffer * i,
+                  this.vadBufferSize - this.framesPerBuffer * (this.vadRefinementWindow - i)
+                )
+              )
+            );
+          }

+          probabilities.push(this.vadLastProbability);
+          this.vadLastProbability = probabilities.reduce((a, b) => a + b, 0) / probabilities.length;
+        }

+        this.vadLastSpeaking = this.speaking
+          ? this.vadLastProbability > this.vadSilenceThreshold
+          : this.vadLastProbability > this.vadSpeechThreshold;
       }

       speaking = this.vadLastSpeaking;