Skip to content

Commit

Permalink
Add translation functionality to transcribed text
Browse files Browse the repository at this point in the history
---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/collabora/WhisperLive?shareId=XXXX-XXXX-XXXX-XXXX).
  • Loading branch information
ywy366607 committed Oct 21, 2024
1 parent be71657 commit 8ead775
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 5 deletions.
24 changes: 24 additions & 0 deletions Audio-Transcription-Chrome/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ async function startCapture(options) {
task: options.task,
modelSize: options.modelSize,
useVad: options.useVad,
targetLanguage: options.targetLanguage, // Added target language for translation
},
});
} else {
Expand Down Expand Up @@ -188,6 +189,25 @@ async function stopCapture() {
}


/**
* Sends transcribed text to a large model for translation and sends the translated text to the client.
* @param {string} text - The transcribed text to be translated.
* @param {string} targetLanguage - The target language for translation.
* @returns {Promise<string>} - A Promise that resolves to the translated text.
*/
async function translateText(text, targetLanguage) {
// Placeholder function for sending transcribed text to a large model for translation
// Implement the actual translation logic here
return new Promise((resolve) => {
// Simulate translation delay
setTimeout(() => {
const translatedText = `Translated (${targetLanguage}): ${text}`;
resolve(translatedText);
}, 1000);
});
}


/**
* Listens for messages from the runtime and performs corresponding actions.
* @param {Object} message - The message received from the runtime.
Expand All @@ -205,6 +225,10 @@ chrome.runtime.onMessage.addListener(async (message) => {
chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false });
chrome.storage.local.set({ capturingState: { isCapturing: false } })
stopCapture();
} else if (message.action === "translateText") {
const { text, targetLanguage } = message;
const translatedText = await translateText(text, targetLanguage);
chrome.runtime.sendMessage({ action: "translatedText", translatedText });
}
});

Expand Down
3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ async function startRecord(option) {
language: option.language,
task: option.task,
model: option.modelSize,
use_vad: option.useVad
use_vad: option.useVad,
targetLanguage: option.targetLanguage, // Added target language for translation
})
);
};
Expand Down
21 changes: 17 additions & 4 deletions whisper_live/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,6 @@ def generate_segments(
# NOTE: This loop is obscurely flattened to make the diff readable.
# A later commit should turn this into a simpler nested loop.
# for seek_clip_start, seek_clip_end in seek_clips:
# while seek < seek_clip_end
while clip_idx < len(seek_clips):
seek_clip_start, seek_clip_end = seek_clips[clip_idx]
if seek_clip_end > content_frames:
Expand Down Expand Up @@ -1025,7 +1024,7 @@ def add_word_timestamps(
and segment["start"] - 0.5 > words[0]["start"]
):
words[0]["start"] = max(
0, min(words[0]["end"] - median_duration, segment["start"])
0, min(words[0]["end"] - max_duration, segment["start"])
)
else:
segment["start"] = words[0]["start"]
Expand All @@ -1041,8 +1040,6 @@ def add_word_timestamps(
else:
segment["end"] = words[-1]["end"]

last_speech_timestamp = segment["end"]

segment["words"] = words

def find_alignment(
Expand Down Expand Up @@ -1102,6 +1099,22 @@ def find_alignment(
)
]

def translate(
self,
text: str,
target_language: str,
model: str = "Helsinki-NLP/opus-mt-en-ROMANCE",
) -> str:
from transformers import MarianMTModel, MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained(model)
model = MarianMTModel.from_pretrained(model)

translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

return translated_text


def restore_speech_timestamps(
segments: Iterable[Segment],
Expand Down
5 changes: 5 additions & 0 deletions whisper_live/transcriber_tensorrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,11 @@ def transcribe(
prediction = re.sub(r'<\|.*?\|>', '', prediction)
return prediction.strip()

def translate(self, text, target_language):
# Placeholder function for sending transcribed text to a large model for translation
# Implement the actual translation logic here
return f"Translated ({target_language}): {text}"


def decode_wav_file(
model,
Expand Down

0 comments on commit 8ead775

Please sign in to comment.