PanktiSelector with transcriber support (WebSpeechApi + MSFT) #415

Open
wants to merge 10 commits into base: main
226 changes: 219 additions & 7 deletions frontend/package-lock.json

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion frontend/package.json
@@ -2,10 +2,12 @@
"name": "@shabados/viewer--frontend",
"version": "0.1.0",
"dependencies": {
"events": "^3.0.0",
"gurmukhi-utils": "^3.2.2",
"jotai": "^1.8.4",
"lodash": "^4.17.21",
"lucide-react": "^0.88.0",
"microsoft-cognitiveservices-speech-sdk": "^1.29.0",
"new-github-issue-url": "^1.0.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
@@ -14,7 +16,8 @@
"react-router-dom": "^6.3.0",
"screenfull": "^6.0.2",
"swr": "^1.3.0",
"use-debounce": "^8.0.4"
"use-debounce": "^8.0.4",
"util": "^0.11.1"
},
"scripts": {
"start": "vite --host --port 52525",
56 changes: 56 additions & 0 deletions frontend/src/lib/speech/MicrosoftCognitiveServicesSpeechTranscriber.ts
@@ -0,0 +1,56 @@
import { AudioConfig, ResultReason, SpeechConfig, SpeechRecognizer } from 'microsoft-cognitiveservices-speech-sdk'

import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'

// This calls the Azure Cognitive Services Speech API, so the user needs to create their
// own Speech resource in the Azure portal and supply its key and region.
// Note: the WebSpeechApi in Edge uses the same service under the hood and gives similar
// results for free, so only use this class if you want the MSFT provider on a non-Edge browser.
export class MicrosoftCognitiveServicesSpeechTranscriber extends Transcriber {
private speechRecognizer: SpeechRecognizer | undefined

constructor(
callback: ResultCallback,
recordingCallback: RecordingStateChangeCallback,
speechKey: string,
speechRegion: string
) {
super( callback, recordingCallback )

const speechConfig = SpeechConfig.fromSubscription(
speechKey,
speechRegion
)
speechConfig.speechRecognitionLanguage = 'hi-IN'
const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
this.speechRecognizer = new SpeechRecognizer( speechConfig, audioConfig )

this.speechRecognizer.recognizing = ( s, e ) => {
if ( e.result.reason === ResultReason.RecognizingSpeech ) {
console.log( `Interim Transcription: ${e.result.text}` )
callback( e.result.text )
}
}

this.speechRecognizer.sessionStarted = ( s, e ) => {
this.m_recordingStateChangeCallback( true )
}

this.speechRecognizer.sessionStopped = ( s, e ) => {
this.m_recordingStateChangeCallback( false )
}
}

StartRecording(): void {
this.speechRecognizer?.startContinuousRecognitionAsync( () => {
console.log( 'Transcription started.' )
} )
}

StopRecording(): void {
this.speechRecognizer?.stopContinuousRecognitionAsync( () => {
console.log( 'Transcription stopped.' )
this.speechRecognizer?.close()
} )
}
}
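
Not part of this diff, but for reviewers: a minimal sketch of how this transcriber could be driven on its own, assuming the Azure Speech key and region are read from the same `SPEECH_KEY` / `SPEECH_REGION` cookies that `PanktiSelector` uses. The callbacks and the `readCookie` helper are illustrative only.

```ts
import { MicrosoftCognitiveServicesSpeechTranscriber } from './MicrosoftCognitiveServicesSpeechTranscriber'

// Illustrative helper, not part of the PR: read a cookie value by name.
const readCookie = ( name: string ): string => document.cookie
  .split( '; ' )
  .map( ( entry ) => entry.split( '=' ) )
  .find( ( [ key ] ) => key === name )?.[ 1 ] ?? ''

const transcriber = new MicrosoftCognitiveServicesSpeechTranscriber(
  ( text ) => console.log( 'Interim transcription:', text ), // receives interim recognition results
  ( isRecording ) => console.log( 'Recording:', isRecording ), // receives recording state changes
  readCookie( 'SPEECH_KEY' ), // Azure Speech resource key
  readCookie( 'SPEECH_REGION' ), // Azure Speech resource region, e.g. 'centralindia'
)

transcriber.StartRecording()
// ...later, when the user toggles transcription off:
transcriber.StopRecording()
```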
210 changes: 210 additions & 0 deletions frontend/src/lib/speech/PanktiSelector.ts
@@ -0,0 +1,210 @@
import { MicrosoftCognitiveServicesSpeechTranscriber } from './MicrosoftCognitiveServicesSpeechTranscriber'
import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'
import { WebSpeechApiTranscriber } from './WebSpeechApiTranscriber'

export type PositionCallback = ( page: number, line: number ) => void

export class PanktiSelector {
private static readonly transcriberNameCookieKey = 'TRANSCRIBER_NAME'
private static readonly transcriberLang = 'TRANSCRIBER_LANG'
private static readonly msftApiKeyCookieKey = 'SPEECH_KEY'
private static readonly msftApiRegionCookieKey = 'SPEECH_REGION'
private static readonly openaiApiSecretCookieKey = 'OPENAI_SECRET'

private static readonly LOOKBACK_LENGTH = 75

private combinedLines = ''
private linePositions: number[] = []
private currentPage = 0

private prevTranscription = ''

private usePunjabi: boolean

private lineCallback: PositionCallback

private transcriber: Transcriber | undefined

private isRunning = false

private GetPositionFromTranscriptionResult: ResultCallback = ( newText: string ): void => {
let needle = ( this.usePunjabi ? '' : this.prevTranscription ) + this.transcriber.TransformOutput( newText )
needle = needle.length <= PanktiSelector.LOOKBACK_LENGTH
? needle
: needle.substring( needle.length - PanktiSelector.LOOKBACK_LENGTH )

this.prevTranscription = needle

const result = PanktiSelector.fuzzyMatch( needle, this.combinedLines )
const endPos = result[ 2 ]

console.log( 'GetPositionFromTranscriptionResult: result=', result, ' needle=', needle, ' match=', this.combinedLines.substring( result[ 1 ], endPos ) )

for ( let i = 0; i < this.linePositions.length; i += 1 ) {
if ( this.linePositions[ i ] >= endPos ) {
this.lineCallback( this.currentPage, i )
break
}
}
}

constructor( setPanktiSelectorRunningState: ( isRunning: boolean ) => void ) {
const transcriberName = PanktiSelector.getCookie( PanktiSelector.transcriberNameCookieKey )

this.usePunjabi = PanktiSelector.getCookie( PanktiSelector.transcriberLang ) === 'pa'

console.log( 'transcriberName:', transcriberName )

const recordingCallback: RecordingStateChangeCallback = ( newVal: boolean ): void => {
this.isRunning = newVal
console.log( 'Setting state of running to ', newVal )

setPanktiSelectorRunningState( newVal )
}

switch ( transcriberName ) {
case 'WebSpeechApi':
case null:
this.transcriber = new WebSpeechApiTranscriber(
this.GetPositionFromTranscriptionResult,
recordingCallback,
this.usePunjabi
)
break
case 'MSFT':
this.transcriber = new MicrosoftCognitiveServicesSpeechTranscriber(
this.GetPositionFromTranscriptionResult,
recordingCallback,
PanktiSelector.getCookie( PanktiSelector.msftApiKeyCookieKey ),
PanktiSelector.getCookie( PanktiSelector.msftApiRegionCookieKey )
)
break
default:
console.log( `'${transcriberName}' is not a valid transcriber name. No transcriber will be created.` )
}
}

SetCallback( callback: PositionCallback ): void {
this.lineCallback = callback
}

SetLines( page: number, lines: string[] ): void {
if ( !( this.transcriber ) ) {
console.log( 'not setting lines as transcriber is', this.transcriber )
return
}

let newCombinedLines = ''
const newLinePositions: number[] = []

lines.forEach( ( line ) => {
newCombinedLines += `${this.transcriber.TransformInput( line )} `
newLinePositions.push( newCombinedLines.length )
} )

this.combinedLines = newCombinedLines
this.linePositions = newLinePositions
this.currentPage = page

console.log( 'combinedLines:', this.combinedLines )
console.log( 'linePositions:', this.linePositions )
}

ToggleRunningState(): void {
if ( !( this.transcriber ) ) {
console.log( 'looks like Transcriber is not yet set.' )
return
}

console.log( 'Previous state of running was ', this.isRunning )

if ( this.isRunning ) {
this.transcriber.StopRecording()
} else {
this.transcriber.StartRecording()
}
}

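// Approximate ("infix") match of `needle` inside `haystack` via an edit-distance DP:
// unmatched haystack characters before and after the match are free, so the needle can
// land anywhere in the haystack. Returns the cheapest match cost and its [start, end) span.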
private static fuzzyMatch( needle: string, haystack: string ) : [number, number, number] {
const tuple = PanktiSelector.fuzzyMatchAllResults( needle, haystack )
const prev = tuple[ 0 ]
const prevBacktrace = tuple[ 1 ]

// get the farthest match with same value
// Note: don't use prev[0] as that value is just to assist the Dynamic Programming
let pos = prev.length - 1
for ( let i = prev.length - 2; i > 0; i -= 1 ) {
if ( prev[ pos ] > prev[ i ] ) {
pos = i
}
}

// (cost, startPosition, endPosition) where the match is substring [startPosition, endPosition)
return [ prev[ pos ], prevBacktrace[ pos ], pos ]
}

private static fuzzyMatchAllResults( needle: string, haystack: string ) : [number[], number[]] {
const arrayLength = haystack.length + 1
let prev: number[] = Array<number>( arrayLength ).fill( 0 )
let prevBacktrace: number[] = Array<number>( arrayLength ).fill( 0 ).map( ( _, i ) => i )

let current: number[] = Array<number>( arrayLength ).fill( 0 )
let currentBacktrace: number[] = Array<number>( arrayLength ).fill( 0 )

for ( let i = 0; i < needle.length; i += 1 ) {
const cost = 1 // TODO: update this to use different cost based on what the character is.

current[ 0 ] = prev[ 0 ] + 1
currentBacktrace[ 0 ] = 0

for ( let j = 0; j < arrayLength - 1; j += 1 ) {
// cost of deleting. If you make this the default option, it will
// encapsulate the longest match found with backtrace (including garbage chars on edge)
current[ j + 1 ] = current[ j ] + cost
currentBacktrace[ j + 1 ] = currentBacktrace[ j ]

// cost of substitute if needed. If you make this the default option, it will
// decrease the collection of garbage characters on the edges in
// the match found with backtrace
const substitutionCost = prev[ j ] + ( needle[ i ] !== haystack[ j ] ? cost : 0 )
if ( current[ j + 1 ] > substitutionCost ) {
current[ j + 1 ] = substitutionCost
currentBacktrace[ j + 1 ] = prevBacktrace[ j ]
}

// cost of inserting
const insertionCost = prev[ j + 1 ] + cost
if ( current[ j + 1 ] > insertionCost ) {
current[ j + 1 ] = insertionCost
currentBacktrace[ j + 1 ] = prevBacktrace[ j + 1 ]
}
}
const temp = prev
prev = current
current = temp

const tempBacktrace = prevBacktrace
prevBacktrace = currentBacktrace
currentBacktrace = tempBacktrace
}
return [ prev, prevBacktrace ]
}

private static getCookie( name: string ): string | null {
const cookies = document.cookie.split( '; ' )
for ( let i = 0; i < cookies.length; i++ ) {
const [ cookieName, cookieValue ] = cookies[ i ].split( '=' )
if ( cookieName === name ) {
return cookieValue
}
}
return null
}

public static setCookie( name: string, value: string, days = 1 ): void {
const expirationDate = new Date()
expirationDate.setTime( expirationDate.getTime() + days * 24 * 60 * 60 * 1000 )
const expires = `expires=${expirationDate.toUTCString()}`
document.cookie = `${name}=${value};${expires};path=/`
}
}
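
Also not part of the diff: a rough usage sketch of the public surface above, roughly as the UI would drive it. The state setter, page number, and line contents are placeholders.

```ts
import { PanktiSelector } from './PanktiSelector'

// Optionally pick the provider and language up front (defaults to WebSpeechApi / Hindi).
PanktiSelector.setCookie( 'TRANSCRIBER_NAME', 'WebSpeechApi' )
PanktiSelector.setCookie( 'TRANSCRIBER_LANG', 'pa' )

// Placeholder for the React state setter the UI passes in.
const setRunning = ( isRunning: boolean ) => console.log( 'Pankti selector running:', isRunning )

const selector = new PanktiSelector( setRunning )

// Called whenever the transcription matches a line on the current page.
selector.SetCallback( ( page, line ) => console.log( `Highlight page ${page}, line ${line}` ) )

// Feed in the lines of the currently open page (placeholder content).
selector.SetLines( 1, [ 'pahilaa pankti text', 'doojaa pankti text' ] )

// Start listening; calling again stops.
selector.ToggleRunningState()
```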
25 changes: 25 additions & 0 deletions frontend/src/lib/speech/Transcriber.ts
@@ -0,0 +1,25 @@
import { stripEndings, stripVishraams, toHindi, toUnicode } from 'gurmukhi-utils'

export type ResultCallback = ( result: string ) => void
export type RecordingStateChangeCallback = ( newState: boolean ) => void

export abstract class Transcriber {
protected m_callback: ResultCallback
protected m_recordingStateChangeCallback: RecordingStateChangeCallback

constructor(
callback: ResultCallback,
recordingStateChangeCallback: RecordingStateChangeCallback
) {
this.m_callback = callback
this.m_recordingStateChangeCallback = recordingStateChangeCallback
}

abstract StartRecording() : void
abstract StopRecording() : void

TransformInput( input: string ): string {
return toHindi( stripEndings( stripVishraams( toUnicode( input ) ) ) )
}
TransformOutput( input: string ): string { return input }
}
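
For context (not in the diff), a sketch of what the default `TransformInput` pipeline does step by step, using the same `gurmukhi-utils` helpers. The sample line is a placeholder and the exact output is not asserted here.

```ts
import { stripEndings, stripVishraams, toHindi, toUnicode } from 'gurmukhi-utils'

// Placeholder ASCII-encoded Gurmukhi input.
const asciiLine = 'vwihgurU ]'

const unicodeLine = toUnicode( asciiLine ) // 1. ASCII Gurmukhi -> Unicode Gurmukhi
const noVishraams = stripVishraams( unicodeLine ) // 2. remove vishraam (pause) markers
const noEndings = stripEndings( noVishraams ) // 3. remove trailing endings / numbering
const forMatching = toHindi( noEndings ) // 4. transliterate to Devanagari, matching the hi-IN transcriber output

console.log( forMatching )
```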
75 changes: 75 additions & 0 deletions frontend/src/lib/speech/WebSpeechApiTranscriber.ts
@@ -0,0 +1,75 @@
import { stripEndings, stripVishraams, toUnicode } from 'gurmukhi-utils'

import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'

// This uses the browser's built-in WebSpeechApi, which may be backed by a different speech
// service depending on the browser, so some browsers perform better than others.
//
// Example:
// - Chrome uses a version of Google Cloud Speech-to-Text
// - Edge uses Microsoft Cognitive Services Speech-to-Text
//
// In my tests, Chrome gives faster results but stops after some time (under a minute) and
// needs to be restarted, while Edge is slower but keeps working.
export class WebSpeechApiTranscriber extends Transcriber {
private recognition

private usePunjabi

constructor(
callback: ResultCallback,
recordingCallback: RecordingStateChangeCallback,
usePunjabi: boolean
) {
super( callback, recordingCallback )

this.usePunjabi = usePunjabi

if ( usePunjabi ) {
console.log( 'WebSpeechApiTranscriber: Using Punjabi language' )
} else {
console.log( 'WebSpeechApiTranscriber: Using Hindi language' )
}

const RecognitionClass = window.SpeechRecognition || window.webkitSpeechRecognition
this.recognition = new RecognitionClass()
this.recognition.continuous = true
this.recognition.interimResults = true
this.recognition.lang = usePunjabi ? 'pa-Guru-IN' : 'hi-IN'

this.recognition.onstart = () => {
this.m_recordingStateChangeCallback( true )
}

this.recognition.onerror = ( event ) => {
console.log( 'onerror in recording (WebSpeechApi): ', event )
}

this.recognition.onend = () => {
this.m_recordingStateChangeCallback( false )
}

this.recognition.onresult = ( event ) => {
let interimTranscript = ''
for ( let i = event.resultIndex; i < event.results.length; i += 1 ) {
interimTranscript += event.results[ i ][ 0 ].transcript
}
callback( interimTranscript )
}
}

TransformInput( input: string ): string {
if ( this.usePunjabi ) {
return stripEndings( stripVishraams( toUnicode( input ) ) )
}
return super.TransformInput( input )
}

StartRecording(): void {
this.recognition.start()
}

StopRecording(): void {
this.recognition.stop()
}
}
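
One typing note, outside the diff: `window.SpeechRecognition` / `window.webkitSpeechRecognition` are not in the default DOM lib typings, so this file presumably relies on an ambient declaration (or a similar typing escape) elsewhere in the project. A minimal hypothetical sketch of such a declaration:

```ts
// Hypothetical ambient declaration so the constructor lookup above type-checks.
export {}

declare global {
  interface Window {
    SpeechRecognition: any
    webkitSpeechRecognition: any
  }
}
```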