PanktiSelector with transcriber support (WebSpeechApi + MSFT) #415

Open
wants to merge 10 commits into base: main
226 changes: 219 additions & 7 deletions frontend/package-lock.json

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion frontend/package.json
@@ -2,10 +2,12 @@
"name": "@shabados/viewer--frontend",
"version": "0.1.0",
"dependencies": {
"events": "^3.0.0",
"gurmukhi-utils": "^3.2.2",
"jotai": "^1.8.4",
"lodash": "^4.17.21",
"lucide-react": "^0.88.0",
"microsoft-cognitiveservices-speech-sdk": "^1.29.0",
"new-github-issue-url": "^1.0.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
@@ -14,7 +16,8 @@
"react-router-dom": "^6.3.0",
"screenfull": "^6.0.2",
"swr": "^1.3.0",
"use-debounce": "^8.0.4"
"use-debounce": "^8.0.4",
"util": "^0.11.1"
},
"scripts": {
"start": "vite --host --port 52525",
56 changes: 56 additions & 0 deletions frontend/src/lib/speech/MicrosoftCognitiveServicesSpeechTranscriber.ts
@@ -0,0 +1,56 @@
import { AudioConfig, ResultReason, SpeechConfig, SpeechRecognizer } from 'microsoft-cognitiveservices-speech-sdk'

import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'

// This calls the Azure Cognitive Services Speech API, so the user needs to create their
// own Speech resource in the Azure portal and supply its key and region.
// Note: the WebSpeechApi in Edge uses the same service under the hood and gives similar
// results for free, so only use this class if you want the MSFT provider on a non-Edge browser.
export class MicrosoftCognitiveServicesSpeechTranscriber extends Transcriber {
private speechRecognizer: SpeechRecognizer | undefined

constructor(
callback: ResultCallback,
recordingCallback: RecordingStateChangeCallback,
speechKey: string,
speechRegion: string
) {
super( callback, recordingCallback )

const speechConfig = SpeechConfig.fromSubscription(
speechKey,
speechRegion
)
speechConfig.speechRecognitionLanguage = 'hi-IN'
const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
this.speechRecognizer = new SpeechRecognizer( speechConfig, audioConfig )

this.speechRecognizer.recognizing = ( s, e ) => {
if ( e.result.reason === ResultReason.RecognizingSpeech ) {
console.log( `Interim Transcription: ${e.result.text}` )
callback( e.result.text )
}
}

this.speechRecognizer.sessionStarted = ( s, e ) => {
this.m_recordingStateChangeCallback( true )
}

this.speechRecognizer.sessionStopped = ( s, e ) => {
this.m_recordingStateChangeCallback( false )
}
}

StartRecording(): void {
this.speechRecognizer?.startContinuousRecognitionAsync( () => {
console.log( 'Transcription started.' )
} )
}

StopRecording(): void {
this.speechRecognizer?.stopContinuousRecognitionAsync( () => {
console.log( 'Transcription stopped.' )
this.speechRecognizer?.close()
} )
}
}
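
Not part of this diff, but for reviewers: a minimal sketch of how this transcriber could be driven on its own, assuming the Azure Speech key and region are read from the same `SPEECH_KEY` / `SPEECH_REGION` cookies that `PanktiSelector` uses. The callbacks and the `readCookie` helper are illustrative only.

```ts
import { MicrosoftCognitiveServicesSpeechTranscriber } from './MicrosoftCognitiveServicesSpeechTranscriber'

// Illustrative helper, not part of the PR: read a cookie value by name.
const readCookie = ( name: string ): string => document.cookie
  .split( '; ' )
  .map( ( entry ) => entry.split( '=' ) )
  .find( ( [ key ] ) => key === name )?.[ 1 ] ?? ''

const transcriber = new MicrosoftCognitiveServicesSpeechTranscriber(
  ( text ) => console.log( 'Interim transcription:', text ), // receives interim recognition results
  ( isRecording ) => console.log( 'Recording:', isRecording ), // receives recording state changes
  readCookie( 'SPEECH_KEY' ), // Azure Speech resource key
  readCookie( 'SPEECH_REGION' ), // Azure Speech resource region, e.g. 'centralindia'
)

transcriber.StartRecording()
// ...later, when the user toggles transcription off:
transcriber.StopRecording()
```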
210 changes: 210 additions & 0 deletions frontend/src/lib/speech/PanktiSelector.ts
@@ -0,0 +1,210 @@
import { MicrosoftCognitiveServicesSpeechTranscriber } from './MicrosoftCognitiveServicesSpeechTranscriber'
import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'
import { WebSpeechApiTranscriber } from './WebSpeechApiTranscriber'

export type PositionCallback = ( page: number, line: number ) => void

export class PanktiSelector {
private static readonly transcriberNameCookieKey = 'TRANSCRIBER_NAME'
private static readonly transcriberLang = 'TRANSCRIBER_LANG'
private static readonly msftApiKeyCookieKey = 'SPEECH_KEY'
private static readonly msftApiRegionCookieKey = 'SPEECH_REGION'
private static readonly openaiApiSecretCookieKey = 'OPENAI_SECRET'

private static readonly LOOKBACK_LENGTH = 75

private combinedLines = ''
private linePositions: number[] = []
private currentPage = 0

private prevTranscription = ''

private usePunjabi: boolean

private lineCallback: PositionCallback

private transcriber: Transcriber | undefined

private isRunning = false

private GetPositionFromTranscriptionResult: ResultCallback = ( newText: string ): void => {
let needle = ( this.usePunjabi ? '' : this.prevTranscription ) + this.transcriber.TransformOutput( newText )
needle = needle.length <= PanktiSelector.LOOKBACK_LENGTH
? needle
: needle.substring( needle.length - PanktiSelector.LOOKBACK_LENGTH )

this.prevTranscription = needle

const result = PanktiSelector.fuzzyMatch( needle, this.combinedLines )
const endPos = result[ 2 ]

console.log( 'GetPositionFromTranscriptionResult: result=', result, ' needle=', needle, ' match=', this.combinedLines.substring( result[ 1 ], endPos ) )

for ( let i = 0; i < this.linePositions.length; i += 1 ) {
if ( this.linePositions[ i ] >= endPos ) {
this.lineCallback( this.currentPage, i )
break
}
}
}

constructor( setPanktiSelectorRunningState: ( isRunning: boolean ) => void ) {
const transcriberName = PanktiSelector.getCookie( PanktiSelector.transcriberNameCookieKey )

this.usePunjabi = PanktiSelector.getCookie( PanktiSelector.transcriberLang ) === 'pa'

console.log( 'transcriberName:', transcriberName )

const recordingCallback: RecordingStateChangeCallback = ( newVal: boolean ): void => {
this.isRunning = newVal
console.log( 'Setting state of running to ', newVal )

setPanktiSelectorRunningState( newVal )
}

switch ( transcriberName ) {
case 'WebSpeechApi':
case null:
this.transcriber = new WebSpeechApiTranscriber(
this.GetPositionFromTranscriptionResult,
recordingCallback,
this.usePunjabi
)
break
case 'MSFT':
this.transcriber = new MicrosoftCognitiveServicesSpeechTranscriber(
this.GetPositionFromTranscriptionResult,
recordingCallback,
PanktiSelector.getCookie( PanktiSelector.msftApiKeyCookieKey ),
PanktiSelector.getCookie( PanktiSelector.msftApiRegionCookieKey )
)
break
default:
console.log( `'${transcriberName}' is not a valid transcriber name. No transcriber will be created.` )
}
}

SetCallback( callback: PositionCallback ): void {
this.lineCallback = callback
}

SetLines( page: number, lines: string[] ): void {
if ( !( this.transcriber ) ) {
console.log( 'not setting lines as transcriber is', this.transcriber )
return
}

let newCombinedLines = ''
const newLinePositions: number[] = []

lines.forEach( ( line ) => {
newCombinedLines += `${this.transcriber.TransformInput( line )} `
newLinePositions.push( newCombinedLines.length )
} )

this.combinedLines = newCombinedLines
this.linePositions = newLinePositions
this.currentPage = page

console.log( 'combinedLines:', this.combinedLines )
console.log( 'linePositions:', this.linePositions )
}

ToggleRunningState(): void {
if ( !( this.transcriber ) ) {
console.log( 'looks like Transcriber is not yet set.' )
return
}

console.log( 'Previous state of running was ', this.isRunning )

if ( this.isRunning ) {
this.transcriber.StopRecording()
} else {
this.transcriber.StartRecording()
}
}

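// Approximate ("infix") match of `needle` inside `haystack` via an edit-distance DP:
// unmatched haystack characters before and after the match are free, so the needle can
// land anywhere in the haystack. Returns the cheapest match cost and its [start, end) span.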
private static fuzzyMatch( needle: string, haystack: string ) : [number, number, number] {
const tuple = PanktiSelector.fuzzyMatchAllResults( needle, haystack )
const prev = tuple[ 0 ]
const prevBacktrace = tuple[ 1 ]

// get the farthest match with same value
// Note: don't use prev[0] as that value is just to assist the Dynamic Programming
let pos = prev.length - 1
for ( let i = prev.length - 2; i > 0; i -= 1 ) {
if ( prev[ pos ] > prev[ i ] ) {
pos = i
}
}

// (cost, startPosition, endPosition) where the match is substring [startPosition, endPosition)
return [ prev[ pos ], prevBacktrace[ pos ], pos ]
}

private static fuzzyMatchAllResults( needle: string, haystack: string ) : [number[], number[]] {
const arrayLength = haystack.length + 1
let prev: number[] = Array<number>( arrayLength ).fill( 0 )
let prevBacktrace: number[] = Array<number>( arrayLength ).fill( 0 ).map( ( _, i ) => i )

let current: number[] = Array<number>( arrayLength ).fill( 0 )
let currentBacktrace: number[] = Array<number>( arrayLength ).fill( 0 )

for ( let i = 0; i < needle.length; i += 1 ) {
const cost = 1 // TODO: update this to use different cost based on what the character is.

current[ 0 ] = prev[ 0 ] + 1
currentBacktrace[ 0 ] = 0

for ( let j = 0; j < arrayLength - 1; j += 1 ) {
// cost of deleting. If you make this the default option, it will
// encapsulate the longest match found with backtrace (including garbage chars on edge)
current[ j + 1 ] = current[ j ] + cost
currentBacktrace[ j + 1 ] = currentBacktrace[ j ]

// cost of substitute if needed. If you make this the default option, it will
// decrease the collection of garbage characters on the edges in
// the match found with backtrace
const substitutionCost = prev[ j ] + ( needle[ i ] !== haystack[ j ] ? cost : 0 )
if ( current[ j + 1 ] > substitutionCost ) {
current[ j + 1 ] = substitutionCost
currentBacktrace[ j + 1 ] = prevBacktrace[ j ]
}

// cost of inserting
const insertionCost = prev[ j + 1 ] + cost
if ( current[ j + 1 ] > insertionCost ) {
current[ j + 1 ] = insertionCost
currentBacktrace[ j + 1 ] = prevBacktrace[ j + 1 ]
}
}
const temp = prev
prev = current
current = temp

const tempBacktrace = prevBacktrace
prevBacktrace = currentBacktrace
currentBacktrace = tempBacktrace
}
return [ prev, prevBacktrace ]
}

private static getCookie( name: string ): string | null {
const cookies = document.cookie.split( '; ' )
for ( let i = 0; i < cookies.length; i++ ) {
const [ cookieName, cookieValue ] = cookies[ i ].split( '=' )
if ( cookieName === name ) {
return cookieValue
}
}
return null
}

public static setCookie( name: string, value: string, days = 1 ): void {
const expirationDate = new Date()
expirationDate.setTime( expirationDate.getTime() + days * 24 * 60 * 60 * 1000 )
const expires = `expires=${expirationDate.toUTCString()}`
document.cookie = `${name}=${value};${expires};path=/`
}
}
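
Also not part of the diff: a rough usage sketch of the public surface above, roughly as the UI would drive it. The state setter, page number, and line contents are placeholders.

```ts
import { PanktiSelector } from './PanktiSelector'

// Optionally pick the provider and language up front (defaults to WebSpeechApi / Hindi).
PanktiSelector.setCookie( 'TRANSCRIBER_NAME', 'WebSpeechApi' )
PanktiSelector.setCookie( 'TRANSCRIBER_LANG', 'pa' )

// Placeholder for the React state setter the UI passes in.
const setRunning = ( isRunning: boolean ) => console.log( 'Pankti selector running:', isRunning )

const selector = new PanktiSelector( setRunning )

// Called whenever the transcription matches a line on the current page.
selector.SetCallback( ( page, line ) => console.log( `Highlight page ${page}, line ${line}` ) )

// Feed in the lines of the currently open page (placeholder content).
selector.SetLines( 1, [ 'pahilaa pankti text', 'doojaa pankti text' ] )

// Start listening; calling again stops.
selector.ToggleRunningState()
```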
25 changes: 25 additions & 0 deletions frontend/src/lib/speech/Transcriber.ts
@@ -0,0 +1,25 @@
import { stripEndings, stripVishraams, toHindi, toUnicode } from 'gurmukhi-utils'

export type ResultCallback = ( result: string ) => void
export type RecordingStateChangeCallback = ( newState: boolean ) => void

export abstract class Transcriber {
protected m_callback: ResultCallback
protected m_recordingStateChangeCallback: RecordingStateChangeCallback

constructor(
callback: ResultCallback,
recordingStateChangeCallback: RecordingStateChangeCallback
) {
this.m_callback = callback
this.m_recordingStateChangeCallback = recordingStateChangeCallback
}

abstract StartRecording() : void
abstract StopRecording() : void

TransformInput( input: string ): string {
return toHindi( stripEndings( stripVishraams( toUnicode( input ) ) ) )
}
TransformOutput( input: string ): string { return input }
}
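
For context (not in the diff), a sketch of what the default `TransformInput` pipeline does step by step, using the same `gurmukhi-utils` helpers. The sample line is a placeholder and the exact output is not asserted here.

```ts
import { stripEndings, stripVishraams, toHindi, toUnicode } from 'gurmukhi-utils'

// Placeholder ASCII-encoded Gurmukhi input.
const asciiLine = 'vwihgurU ]'

const unicodeLine = toUnicode( asciiLine ) // 1. ASCII Gurmukhi -> Unicode Gurmukhi
const noVishraams = stripVishraams( unicodeLine ) // 2. remove vishraam (pause) markers
const noEndings = stripEndings( noVishraams ) // 3. remove trailing endings / numbering
const forMatching = toHindi( noEndings ) // 4. transliterate to Devanagari, matching the hi-IN transcriber output

console.log( forMatching )
```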
75 changes: 75 additions & 0 deletions frontend/src/lib/speech/WebSpeechApiTranscriber.ts
@@ -0,0 +1,75 @@
import { stripEndings, stripVishraams, toUnicode } from 'gurmukhi-utils'

import { RecordingStateChangeCallback, ResultCallback, Transcriber } from './Transcriber'

// This uses the browser's built-in WebSpeechApi, which may be backed by a different speech
// service depending on the browser, so some browsers perform better than others.
//
// Example:
// - Chrome uses a version of Google Cloud Speech-to-Text
// - Edge uses Microsoft Cognitive Services Speech-to-Text
//
// In my tests, Chrome gives faster results but stops after some time (under a minute) and
// needs to be restarted, while Edge is slower but keeps working.
export class WebSpeechApiTranscriber extends Transcriber {
private recognition

private usePunjabi

constructor(
callback: ResultCallback,
recordingCallback: RecordingStateChangeCallback,
usePunjabi: boolean
) {
super( callback, recordingCallback )

this.usePunjabi = usePunjabi

if ( usePunjabi ) {
console.log( 'WebSpeechApiTranscriber: Using Punjabi language' )
} else {
console.log( 'WebSpeechApiTranscriber: Using Hindi language' )
}

const RecognitionClass = window.SpeechRecognition || window.webkitSpeechRecognition
this.recognition = new RecognitionClass()
this.recognition.continuous = true
this.recognition.interimResults = true
this.recognition.lang = usePunjabi ? 'pa-Guru-IN' : 'hi-IN'

this.recognition.onstart = () => {
this.m_recordingStateChangeCallback( true )
}

this.recognition.onerror = ( event ) => {
console.log( 'onerror in recording (WebSpeechApi): ', event )
}

this.recognition.onend = () => {
this.m_recordingStateChangeCallback( false )
}

this.recognition.onresult = ( event ) => {
let interimTranscript = ''
for ( let i = event.resultIndex; i < event.results.length; i += 1 ) {
interimTranscript += event.results[ i ][ 0 ].transcript
}
callback( interimTranscript )
}
}

TransformInput( input: string ): string {
if ( this.usePunjabi ) {
return stripEndings( stripVishraams( toUnicode( input ) ) )
}
return super.TransformInput( input )
}

StartRecording(): void {
this.recognition.start()
}

StopRecording(): void {
this.recognition.stop()
}
}
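
One typing note, outside the diff: `window.SpeechRecognition` / `window.webkitSpeechRecognition` are not in the default DOM lib typings, so this file presumably relies on an ambient declaration (or a similar typing escape) elsewhere in the project. A minimal hypothetical sketch of such a declaration:

```ts
// Hypothetical ambient declaration so the constructor lookup above type-checks.
export {}

declare global {
  interface Window {
    SpeechRecognition: any
    webkitSpeechRecognition: any
  }
}
```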