different speech/silence thresholds (#19)
* different speech/silence thresholds

Use different VAD probability thresholds for detecting speech and for returning to silence.
Also run multiple VAD checks, and average their probabilities, when the probability is
close to the speaking/silence threshold.
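
A minimal sketch of the resulting hysteresis, using the defaults introduced in this commit (vadSpeechThreshold = 0.3, vadSilenceThreshold = 0.1). This is an illustration of the behavior, not the library's exact code:

  const vadSpeechThreshold = 0.3; // probability needed to start a speech chunk
  const vadSilenceThreshold = 0.1; // probability needed to keep a chunk open

  let speaking = false;
  const update = (probability) => {
    // While silent, require the higher speech threshold to start; once speaking,
    // only return to silence when the probability falls below the lower threshold.
    speaking = speaking ? probability > vadSilenceThreshold : probability > vadSpeechThreshold;
    return speaking;
  };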

* analyze-files example

Add support for processing individual files and outputting VAD timestamps, which can be
used to evaluate VAD parameters.
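
For reference, the labels argument is a JSON file keyed by .wav filename, where each entry's "speech" field is either an empty array (a noise-only clip) or a [start, stop] window in seconds. This is inferred from how analyze-files.js reads the file below; the filenames here are made up:

  node analyze-files.js /path/to/wav/files /path/to/labels

  {
    "utterance.wav": { "speech": [0.8, 2.4] },
    "background-noise.wav": { "speech": [] }
  }
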
tmacwill authored Dec 8, 2021
1 parent 809fba9 commit 5fdf03f
Showing 9 changed files with 205 additions and 128 deletions.
106 changes: 106 additions & 0 deletions examples/analyze-files.js
@@ -0,0 +1,106 @@
const fs = require("fs");
const path = require("path");
const { SpeechRecorder } = require("../dist/index");

// Linear-interpolated quantile of a list of numbers (q in [0, 1]).
const quantile = (elements, q) => {
  const sorted = elements.sort((a, b) => a - b);
  const p = (sorted.length - 1) * q;
  const base = Math.floor(p);
  const rest = p - base;
  if (sorted[base + 1] !== undefined) {
    return sorted[base] + rest * (sorted[base + 1] - sorted[base]);
  } else {
    return sorted[base];
  }
};

if (process.argv.length < 4) {
  console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels");
  process.exit(1);
}

const sampleRate = 16000;
let results = {};
let labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8"));
fs.readdir(process.argv[2], async (error, files) => {
  // Run the recorder over each .wav file and collect [start, stop] speech timestamps.
  for (const file of files) {
    if (!file.endsWith(".wav")) {
      continue;
    }

    let samples = 0;
    results[file] = { speech: [] };
    const recorder = new SpeechRecorder();
    await recorder.processFile(path.join(process.argv[2], file), {
      onAudio: (audio) => {
        samples += audio.length / 2;
      },

      onChunkStart: (audio) => {
        results[file].speech.push([]);
        results[file].speech[results[file].speech.length - 1].push(samples / sampleRate);
      },

      onChunkEnd: () => {
        results[file].speech[results[file].speech.length - 1].push(samples / sampleRate);
      },
    });
  }

  // Compare the VAD output against the labels.
  let speechWindowTooSmall = [];
  let noiseWasSpeech = [];
  let noise = 0;
  let speech = 0;
  let extra = [];
  for (const i of Object.keys(results)) {
    const label = labels[i].speech;
    const result = results[i].speech;

    if (label.length == 0) {
      noise++;
    } else {
      speech++;
    }

    if (label.length == 0 && result.length > 0) {
      console.log("Noise was speech:", i);
      console.log("VAD:", result);
      noiseWasSpeech.push(i);
    }

    if (label.length > 0 && result.length > 0) {
      const start = Math.min(...result.map((e) => e[0]));
      const stop = Math.max(...result.map((e) => e[1]));
      if (isNaN(start) || isNaN(stop)) {
        continue;
      }

      const tolerance = 0.05;
      if (start - 0.4 > label[0] + tolerance || stop < label[1] - tolerance) {
        console.log("Speech window too small:", i);
        console.log("Label:", label);
        console.log("VAD:", result, start, stop);
        speechWindowTooSmall.push(i);
      } else if (stop > label[1]) {
        extra.push(stop - label[1]);
      }
    }
  }

  console.log(
    `\nSpeech window too small: ${(speechWindowTooSmall.length / speech).toFixed(2)} (${
      speechWindowTooSmall.length
    } / ${speech})`
  );

  console.log(
    `Noise was speech: ${(noiseWasSpeech.length / noise).toFixed(2)} (${
      noiseWasSpeech.length
    } / ${noise})`
  );

  console.log(`Average extra speech: ${(extra.reduce((a, b) => a + b) / extra.length).toFixed(2)}`);
  console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`);
  console.log(`p90 extra speech: ${quantile(extra, 0.9).toFixed(2)}`);
  console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`);
});
11 changes: 0 additions & 11 deletions examples/audio.js

This file was deleted.

28 changes: 0 additions & 28 deletions examples/file.js

This file was deleted.

46 changes: 0 additions & 46 deletions examples/padding.js

This file was deleted.

29 changes: 29 additions & 0 deletions examples/record.js
@@ -0,0 +1,29 @@
const fs = require("fs");
const { SpeechRecorder } = require("../dist/index");
const { WaveFile } = require("wavefile");

if (process.argv.length < 3) {
  console.log("Usage: node record.js /path/to/output.wav");
  process.exit(1);
}

let buffer = [];
const recorder = new SpeechRecorder();
console.log("Ready...");
setTimeout(() => {
  console.log("Go!");
  recorder.start({
    onAudio: (audio) => {
      // audio is a buffer of 16-bit little-endian samples; collect them until
      // we have 5 seconds at 16kHz, then write the result as a WAV file.
      for (let i = 0; i < audio.length; i += 2) {
        buffer.push(audio.readInt16LE(i));
      }

      if (buffer.length == 16000 * 5) {
        let wav = new WaveFile();
        wav.fromScratch(1, 16000, "16", buffer);
        fs.writeFileSync(process.argv[2], wav.toBuffer());
        process.exit(1);
      }
    },
  });
}, 1000);
14 changes: 0 additions & 14 deletions examples/silence.js

This file was deleted.

85 changes: 69 additions & 16 deletions index.ts
@@ -4,7 +4,6 @@ import * as os from "os";
 import { WaveFile } from "wavefile";
 import { Readable } from "stream";
 import WebrtcVad from "webrtcvad";
-import uuid from "uuid/v4";
 import SileroVad from "./vad";

 const portAudioPath = `${__dirname}/../build/Release/portaudio.node`;
@@ -67,21 +66,24 @@ export class SpeechRecorder {
   private highWaterMark: number = 64000;
   private leadingBuffer: Buffer[] = [];
   private leadingPadding: number = 20;
-  private minimumVolume: number = 200;
+  private minimumVolume: number = 1;
   private sampleRate: number = 16000;
   private speaking: boolean = false;
   private speakingThreshold: number = 1;
-  private silenceThreshold: number = 10;
+  private silenceThreshold: number = 5;
   private triggers: Trigger[] = [];
-  private webrtcVad: WebrtcVad;
   private vad = new SileroVad();
   private vadBuffer: number[] = [];
-  // 250 ms so that it's consistent with the silero python example.
-  private vadBufferSize: number = 4000;
-  private vadRateLimit: number = 5;
-  private vadLastSpeaking: boolean = false;
+  private vadBufferSize: number;
   private vadLastProbability: number = 0;
-  private vadThreshold: number = 0.75;
+  private vadLastSpeaking: boolean = false;
+  private vadRateLimit: number = 3;
+  private vadRefinementWindow: number = 2;
+  private vadSilenceThreshold: number = 0.1;
+  private vadSpeechThreshold: number = 0.3;
+  private webrtcVad: WebrtcVad;
+  private webrtcResultsBuffer: boolean[] = [];
+  private webrtcResultsBufferSize: number = 3;

   constructor(options: any = {}) {
     if (options.disableSecondPass !== undefined) {
@@ -92,6 +94,10 @@ export class SpeechRecorder {
       this.error = options.error;
     }

+    if (options.firstPassResultsBufferSize !== undefined) {
+      this.webrtcResultsBufferSize = options.firstPassResultsBufferSize;
+    }

     if (options.framesPerBuffer !== undefined) {
       this.framesPerBuffer = options.framesPerBuffer;
     }
@@ -128,11 +134,20 @@ export class SpeechRecorder {
       this.vadRateLimit = options.vadRateLimit;
     }

-    if (options.vadThreshold !== undefined) {
-      this.vadThreshold = options.vadThreshold;
+    if (options.vadRefinementWindow !== undefined) {
+      this.vadRefinementWindow = options.vadRefinementWindow;
     }

+    if (options.vadSilenceThreshold !== undefined) {
+      this.vadSilenceThreshold = options.vadSilenceThreshold;
+    }

+    if (options.vadSpeechThreshold !== undefined) {
+      this.vadSpeechThreshold = options.vadSpeechThreshold;
+    }

-    this.webrtcVad = new WebrtcVad(this.sampleRate, options.firstPassLevel || 3);
+    this.vadBufferSize = this.sampleRate / 4 + this.framesPerBuffer * this.vadRefinementWindow;
+    this.webrtcVad = new WebrtcVad(this.sampleRate, options.firstPassLevel || 2);
   }

   async load() {
@@ -159,10 +174,18 @@ export class SpeechRecorder {
       this.vadBuffer.splice(0, this.vadBuffer.length - this.vadBufferSize);
     }

-    // until we've filled up the VAD buffer, ignore the results of both VADs
     const volume = Math.floor(Math.sqrt(sum / (audio.length / 2)));
+    this.webrtcResultsBuffer.push(this.webrtcVad.process(audio));
+    if (this.webrtcResultsBuffer.length > this.webrtcResultsBufferSize) {
+      this.webrtcResultsBuffer.splice(
+        0,
+        this.webrtcResultsBuffer.length - this.webrtcResultsBufferSize
+      );
+    }

+    // until we've filled up the VAD buffer, ignore the results of both VADs
     let speaking = !!(
-      this.webrtcVad.process(audio) &&
+      this.webrtcResultsBuffer.some((e) => e) &&
       volume > this.minimumVolume &&
       this.vadBuffer.length == this.vadBufferSize
     );
@@ -172,9 +195,39 @@ export class SpeechRecorder {
     if (speaking && !this.disableSecondPass && this.vad.ready) {
       // cache values of probability and speaking for buffersUntilVad frames
       if (this.buffersUntilVad == 0) {
-        this.vadLastProbability = await this.vad.process(this.vadBuffer);
-        this.vadLastSpeaking = this.vadLastProbability > this.vadThreshold;
         this.buffersUntilVad = this.vadRateLimit;
+        this.vadLastProbability = await this.vad.process(
+          this.vadBuffer.slice(this.framesPerBuffer * this.vadRefinementWindow)
+        );

+        const tolerance = 0.15;
+        if (
+          (!this.speaking &&
+            Math.abs(this.vadLastProbability - this.vadSpeechThreshold) <
+              tolerance * this.vadSpeechThreshold) ||
+          (this.speaking &&
+            Math.abs(this.vadLastProbability - this.vadSilenceThreshold) <
+              tolerance * this.vadSilenceThreshold)
+        ) {
+          let probabilities = [];
+          for (let i = 0; i < this.vadRefinementWindow; i++) {
+            probabilities.push(
+              await this.vad.process(
+                this.vadBuffer.slice(
+                  this.framesPerBuffer * i,
+                  this.vadBufferSize - this.framesPerBuffer * (this.vadRefinementWindow - i)
+                )
+              )
+            );
+          }

+          probabilities.push(this.vadLastProbability);
+          this.vadLastProbability = probabilities.reduce((a, b) => a + b, 0) / probabilities.length;
+        }

+        this.vadLastSpeaking = this.speaking
+          ? this.vadLastProbability > this.vadSilenceThreshold
+          : this.vadLastProbability > this.vadSpeechThreshold;
       }

       speaking = this.vadLastSpeaking;