diff --git a/index.html b/index.html index 5e010b9..ae4a6b9 100644 --- a/index.html +++ b/index.html @@ -13,8 +13,6 @@
- - @@ -28,11 +26,9 @@ import { DagloAPI } from 'https://actionpower.github.io/dagloapi-js-beta/lib/daglo-api.module.js'; document.getElementById('enableButton').addEventListener('click', async (event) => { - const baseURL = document?.getElementById('host').value?.trim(); const dagloToken = document?.getElementById('token').value?.trim(); let client = new DagloAPI({ - host: baseURL, apiToken: dagloToken }); let transcriber = client.stream.transcriber(); @@ -46,8 +42,6 @@ document.getElementById('transcripts').append(span); } }) - // transcriber.start(); - // or let stream; try { diff --git a/lib/daglo-api.module.js b/lib/daglo-api.module.js index 8dea94a..5885447 100644 --- a/lib/daglo-api.module.js +++ b/lib/daglo-api.module.js @@ -16,10 +16,17 @@ import { AudioClassifier, FilesetResolver } from "https://actionpower.github.io/dagloapi-js-beta/lib/mediapipe-audio.module.js"; export class DagloAPI { - constructor({apiToken = '', host = 'https://apis.daglo.ai'}) { - host = host.replace(/\/+$/gim, ''); + // Declare static class variables for common configuration + static DEFAULT_CONFIG = { + threshold: 0.65, + concatLevel: 3, + bufferSize: 8192, + frameSize: 3, + maxDuration : 20, + sampleRate: 44100, + }; + constructor({apiToken = ''}) { this.apiToken = apiToken; - this.host = host; } stream = { @@ -27,8 +34,8 @@ export class DagloAPI { let lib = { model: { audioClassifier: undefined, - audioCtx: undefined, - audio: undefined, + audioCtx: undefined, + audio: undefined, }, utils: { __callback: { @@ -71,25 +78,25 @@ export class DagloAPI { view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); } } - + const numOfChannels = 1; // 단일 채널 (모노) const numOfFrames = buffer.length; const bytesPerSample = 2; // 16비트 PCM - + // WAV 파일의 헤더 크기와 데이터 크기 계산 const headerSize = 44; const dataSize = numOfFrames * numOfChannels * bytesPerSample; const totalSize = headerSize + dataSize; - + // WAV 파일을 위한 Uint8Array 생성 const wavBuffer = new ArrayBuffer(totalSize); const view = new DataView(wavBuffer); - + // RIFF 헤더 작성 writeString(view, 0, 'RIFF'); view.setUint32(4, totalSize - 8, true); writeString(view, 8, 'WAVE'); - + // fmt 서브체크 작성 writeString(view, 12, 'fmt '); view.setUint32(16, 16, true); // 서브체크 크기 (16바이트) @@ -99,14 +106,14 @@ export class DagloAPI { view.setUint32(28, sampleRate * numOfChannels * bytesPerSample, true); // 초당 바이트 수 view.setUint16(32, numOfChannels * bytesPerSample, true); // 블록 정렬 view.setUint16(34, bytesPerSample * 8, true); // 비트 수 (16비트) - + // data 서브체크 작성 writeString(view, 36, 'data'); view.setUint32(40, dataSize, true); - + // Float32Array 데이터를 16비트 PCM으로 변환하여 삽입 floatTo16BitPCM(view, 44, buffer); - + return new Blob([view], { type: 'audio/wav' }); }, }, @@ -129,63 +136,87 @@ export class DagloAPI { modelAssetPath: 'https://actionpower.github.io/dagloapi-js-beta/lib/audio.tflite' } }) - + if (typeof(lib.utils.__callback['open']) == 'function') { lib.utils.__callback['open'](lib.utils.nanoid()); } }, - start: async ({threshold = 0.65, concatLevel = 3, sttConfig = undefined} = {}) => { - const constraints = { audio: true }; - let stream; - - const bufferSize = 8192; - const frameSize = 3; - + connect: async (stream) => { + const threshold = DagloAPI.DEFAULT_CONFIG.threshold; + const concatLevel = DagloAPI.DEFAULT_CONFIG.concatLevel; + const bufferSize = DagloAPI.DEFAULT_CONFIG.bufferSize; + const frameSize = DagloAPI.DEFAULT_CONFIG.frameSize; + let audioBuffer = new Float32Array(bufferSize * frameSize); let SpeechBuffer = []; let buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); - + let bufferIdx = 0; let nonSpeechTime = -1; - - if (sttConfig && sttConfig?.keywordBoost.enable == undefined && !sttConfig?.keywordBoost.keywords?.length) { - sttConfig.keywordBoost.enable = true; - } - if (sttConfig?.keywordBoost?.boost) { - sttConfig.keywordBoost.boost = Math.max(Math.min(15, Math.floor(sttConfig.keywordBoost.boost * 15)), 1); - } - - try { - stream = await navigator.mediaDevices.getUserMedia(constraints); - } - catch (err) { - console.log("The following error occured: " + err); - return alert("getUserMedia not supported on your browser"); - } - + if (!lib.model.audioCtx) { - lib.model.audioCtx = new AudioContext({ sampleRate: 44100 }); - + lib.model.audioCtx = new AudioContext({ sampleRate: DagloAPI.DEFAULT_CONFIG.sampleRate }); + const source = lib.model.audioCtx.createMediaStreamSource(stream); const scriptNode = lib.model.audioCtx.createScriptProcessor(bufferSize, 1, 1); - + + const processAudioBuffer = () => { + if (typeof(lib.utils.__callback['transcript']) == 'function' || + typeof(lib.utils.__callback['vad']) == 'function') { + + let audioBuffer = new Float32Array(bufferSize * (SpeechBuffer.length - Math.floor(concatLevel / 2))); + + for (let i = 0; i < SpeechBuffer.length - Math.floor(concatLevel / 2); i++) { + audioBuffer.set(SpeechBuffer[i], i * bufferSize); + } + + const wavBlob = lib.utils.float32ToWav(audioBuffer, DagloAPI.DEFAULT_CONFIG.sampleRate); + + if (typeof(lib.utils.__callback['vad']) == 'function') { + lib.utils.__callback['vad'](audioBuffer, wavBlob); + } + + const formData = new FormData(); + formData.append('file', wavBlob, 'recording.wav'); + + fetch(`https://apis.daglo.ai/stt/v1/sync/transcripts`, { + method: 'POST', + headers: { + 'Authorization': 'Bearer ' + this.apiToken + }, + body: formData, + }) + .then(response => response.json()) + .then(data => { + if (typeof(lib.utils.__callback['transcript']) == 'function') { + lib.utils.__callback['transcript']({text: data?.sttResult?.transcript}); + } + }) + .catch(error => { + console.error('Error:', error); + }); + } + }; + scriptNode.onaudioprocess = (audioProcessingEvent) => { const inputBuffer = audioProcessingEvent.inputBuffer; let inputData = inputBuffer.getChannelData(0); let _inputData = Object.assign([], inputData); buffers[bufferIdx % frameSize] = _inputData; - + + const MAX_SAMPLES = DagloAPI.DEFAULT_CONFIG.maxDuration * DagloAPI.DEFAULT_CONFIG.sampleRate; + for (let i = 0; i < frameSize; i++) { audioBuffer.set(buffers[(i + bufferIdx + 1) % frameSize], i * bufferSize); } - + bufferIdx++; - + const result = lib.model.audioClassifier.classify(audioBuffer); const categories = result[0].classifications[0].categories; const output = categories.filter((item) => item?.score >= threshold); const speechIdx = output.findIndex((item)=>item.categoryName == 'Speech'); - + if (speechIdx != -1) { SpeechBuffer.push(_inputData); nonSpeechTime = 0; @@ -193,47 +224,7 @@ export class DagloAPI { if (++nonSpeechTime < concatLevel) { SpeechBuffer.push(_inputData); } else { - if (typeof(lib.utils.__callback['transcript']) == 'function' || typeof(lib.utils.__callback['vad']) == 'function') { - let audioBuffer = new Float32Array(bufferSize * (SpeechBuffer.length - Math.floor(concatLevel / 2))); - - for (let i = 0; i < SpeechBuffer.length - Math.floor(concatLevel / 2); i++) { - audioBuffer.set(SpeechBuffer[i], i * bufferSize); - } - - const wavBlob = lib.utils.float32ToWav(audioBuffer, 44100); - - if (typeof(lib.utils.__callback['vad']) == 'function') { - lib.utils.__callback['vad'](audioBuffer, wavBlob); - } - - // FormData 객체 생성 - const formData = new FormData(); - formData.append('file', wavBlob, 'record.wav'); - - if (sttConfig) { - formData.append('sttConfig', JSON.stringify(sttConfig)); - } - - // Daglo transcript API - fetch(`${this.host}/stt/v1/sync/transcripts`, { - method: 'POST', - headers: { - 'Authorization': 'Bearer ' + this.apiToken - }, - body: formData, - }) - .then(response => response.json()) - .then(data => { - if (typeof(lib.utils.__callback['transcript']) == 'function') { - lib.utils.__callback['transcript']({text: data?.sttResult?.transcript}); //, audioBuffer, wavBlob - } - }) - .catch(error => { - console.error('Error:', error); - }); - } - - // buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); + processAudioBuffer(); nonSpeechTime = -1; SpeechBuffer = []; } @@ -242,125 +233,11 @@ export class DagloAPI { SpeechBuffer = []; SpeechBuffer.push(_inputData); } - - if (typeof(lib.utils.__callback['debug']) == 'function') { - lib.utils.__callback['debug'](output); - } - }; - source.connect(scriptNode); - scriptNode.connect(lib.model.audioCtx.destination); - } - else if (lib.model.audioCtx.state === "running") { - await lib.model.audioCtx.suspend(); - - if (typeof(lib.utils.__callback['stop']) == 'function') { - lib.utils.__callback['stop'](); - } - - return; - } - - audioBuffer = new Float32Array(bufferSize * frameSize); - SpeechBuffer = []; - buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); - - bufferIdx = 0; - nonSpeechTime = -1; - - await lib.model.audioCtx.resume(); - - if (typeof(lib.utils.__callback['start']) == 'function') { - lib.utils.__callback['start'](); - } - }, - connect: async (stream) => { - const threshold = 0.65; - const concatLevel = 3; - const bufferSize = 8192; - const frameSize = 3; - - let audioBuffer = new Float32Array(bufferSize * frameSize); - let SpeechBuffer = []; - let buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); - - let bufferIdx = 0; - let nonSpeechTime = -1; - - if (!lib.model.audioCtx) { - lib.model.audioCtx = new AudioContext({ sampleRate: 44100 }); - - const source = lib.model.audioCtx.createMediaStreamSource(stream); - const scriptNode = lib.model.audioCtx.createScriptProcessor(bufferSize, 1, 1); - - scriptNode.onaudioprocess = (audioProcessingEvent) => { - const inputBuffer = audioProcessingEvent.inputBuffer; - let inputData = inputBuffer.getChannelData(0); - let _inputData = Object.assign([], inputData); - buffers[bufferIdx % frameSize] = _inputData; - - for (let i = 0; i < frameSize; i++) { - audioBuffer.set(buffers[(i + bufferIdx + 1) % frameSize], i * bufferSize); - } - - bufferIdx++; - - const result = lib.model.audioClassifier.classify(audioBuffer); - const categories = result[0].classifications[0].categories; - const output = categories.filter((item) => item?.score >= threshold); - const speechIdx = output.findIndex((item) => item.categoryName == 'Speech'); - - if (speechIdx != -1) { - SpeechBuffer.push(_inputData); - nonSpeechTime = 0; - } else if (nonSpeechTime != -1) { - if (++nonSpeechTime < concatLevel) { - SpeechBuffer.push(_inputData); - } else { - if (typeof(lib.utils.__callback['transcript']) == 'function' || typeof(lib.utils.__callback['vad']) == 'function') { - let audioBuffer = new Float32Array(bufferSize * (SpeechBuffer.length - Math.floor(concatLevel / 2))); - - for (let i = 0; i < SpeechBuffer.length - Math.floor(concatLevel / 2); i++) { - audioBuffer.set(SpeechBuffer[i], i * bufferSize); - } - - const wavBlob = lib.utils.float32ToWav(audioBuffer, 44100); - - if (typeof(lib.utils.__callback['vad']) == 'function') { - lib.utils.__callback['vad'](audioBuffer, wavBlob); - } - - // FormData 객체 생성 - const formData = new FormData(); - formData.append('file', wavBlob, 'recording.wav'); // 서버로 전송할 때 파일 이름 지정 - - // API 호출 (예시: fetch 사용) - fetch(`${this.host}/stt/v1/sync/transcripts`, { - method: 'POST', - headers: { - 'Authorization': 'Bearer ' + this.apiToken - }, - body: formData, - }) - .then(response => response.json()) - .then(data => { - console.log(data); - if (typeof(lib.utils.__callback['transcript']) == 'function') { - lib.utils.__callback['transcript']({text: data?.sttResult?.transcript}); //, audioBuffer, wavBlob - } - }) - .catch(error => { - console.error('Error:', error); - }); - } - - // buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); - nonSpeechTime = -1; - SpeechBuffer = []; - } - } else { + + if (SpeechBuffer.length * bufferSize >= MAX_SAMPLES) { + processAudioBuffer(); nonSpeechTime = -1; SpeechBuffer = []; - SpeechBuffer.push(_inputData); } }; source.connect(scriptNode); @@ -368,20 +245,19 @@ export class DagloAPI { } else if (lib.model.audioCtx.state === "running") { await lib.model.audioCtx.suspend(); - return; } - + audioBuffer = new Float32Array(bufferSize * frameSize); SpeechBuffer = []; buffers = Array(frameSize).fill(null).map(() => Array(bufferSize).fill(0)); - + bufferIdx = 0; nonSpeechTime = -1; - + await lib.model.audioCtx.resume(); } - } + }; lib.init();