model_runner.py
import io
import textwrap
import time

import numpy as np
import soundfile
import stable_whisper
from colorama import init as colorama_init
from colorama import Fore
from colorama import Style

from audio_rec import audio_data
from strings_and_consts import data

# Enable ANSI color codes on Windows terminals
colorama_init()

baseModel = None
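
# model_runner() is the transcription loop: it polls the shared audio_data
# queue, loads the configured Whisper backend on first use, transcribes each
# captured utterance, and writes the wrapped text to the configured output
# file.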
def model_runner():
    global baseModel
    while True:
        # CPUs don't like doing stuff forever, so let's give it a break.
        time.sleep(0.1)
        if not audio_data.empty() and data["start"]:
            # Load the selected backend once, instead of reloading the model
            # for every utterance.
            if baseModel is None:
                if data["set_backend"] == data["backend"][0]:
                    baseModel = stable_whisper.load_hf_whisper(data["set_transcribe_model"])
                elif data["set_backend"] == data["backend"][1]:
                    baseModel = stable_whisper.load_faster_whisper(data["set_transcribe_model"])
                elif data["set_backend"] == data["backend"][2]:
                    print(f"{Fore.LIGHTRED_EX}Whisper.CPP is not supported yet.{Style.RESET_ALL}")
                    # No model was loaded, so skip transcription this pass.
                    continue
            start_time = time.time()
            print(f"{Fore.LIGHTGREEN_EX}Audio received. Processing...{Style.RESET_ALL}")
            # While the queue is not empty, get each item and append it
            recording = audio_data.get()
            while not audio_data.empty():
                new_audio_data = audio_data.get()
                # Ensure both arrays have the same number of dimensions
                # before concatenating
                if recording.ndim > new_audio_data.ndim:
                    new_audio_data = np.expand_dims(new_audio_data, axis=0)
                elif recording.ndim < new_audio_data.ndim:
                    recording = np.expand_dims(recording, axis=0)
                recording = np.concatenate((recording, new_audio_data))
            # Drop any singleton dimensions left over from the stacking above
            recording = np.squeeze(recording)
            # Convert the numpy array to PCM_16 WAV bytes in memory
            with io.BytesIO() as f:
                soundfile.write(f, recording, 44100, 'PCM_16', format="WAV")
                audio_bytes = f.getvalue()
            user_text = baseModel.transcribe(audio_bytes, vad=data["set_vad"],
                                             word_timestamps=True,
                                             denoiser=data["set_denoiser"],
                                             batch_size=int(data["set_batch_size"]),
                                             use_word_position=True)
            elapsed = time.time() - start_time
            print(f"{Fore.LIGHTGREEN_EX}Audio processed. Time taken: {elapsed:.2f} seconds.{Style.RESET_ALL}")
            print(f"{Fore.LIGHTGREEN_EX}Transcription: {user_text.text}{Style.RESET_ALL}")
            data["currentText"] = user_text.text
            # Add newlines, limiting the text to 30 characters per line
            split_text = textwrap.fill(user_text.text, width=30)
            # Overwrite the output file with the freshly wrapped text
            with open(data["set_working_folder"], "w") as f:
                f.write(split_text)
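
# Usage sketch (an assumption, not shown in this file): model_runner() blocks
# forever, so the caller presumably starts it on its own thread alongside the
# audio_rec recorder, e.g.:
#
#   import threading
#   threading.Thread(target=model_runner, daemon=True).start()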