Commit
Support faster-whisper
ddean2009 committed Jul 22, 2024
1 parent 7b8ef29 commit 2764e3c
Showing 15 changed files with 269 additions and 42 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ final/*
 resource/*
 venv
 =2.13.3
-last_published_cn.txt
+last_published_cn.txt
+fasterwhisper
2 changes: 2 additions & 0 deletions chattts/README.md
@@ -0,0 +1,2 @@
+.pt file download address:
+https://modelscope.cn/studios/ttwwwaa/ChatTTS_Speaker
9 changes: 8 additions & 1 deletion config/config.example.yml
@@ -11,9 +11,16 @@ audio:
   access_key_secret: ACCESS_KEY_SECRET
   app_key: APP_KEY
   provider: Azure
-  loca_tts:
+  local_tts:
     provider: chatTTS
     server_location: http://127.0.0.1:8080/
+  local_recognition:
+    provider: fasterwhisper
+    fasterwhisper:
+      model_name: tiny
+      device_type: cuda
+      compute_type: int8
+

 captioning:
   provider: Azure
6 changes: 6 additions & 0 deletions config/config.py
@@ -3,6 +3,12 @@

 from tools.file_utils import read_yaml, save_yaml

+local_audio_tts_providers = ['chatTTS', ]
+local_audio_recognition_providers = ['fasterwhisper', ]
+local_audio_recognition_fasterwhisper_module_names = ['large-v3', 'large-v2', 'large-v1', 'distil-large-v3', 'distil-large-v2', 'medium', 'base', 'small', 'tiny']
+local_audio_recognition_fasterwhisper_device_types = ['cuda', 'cpu', 'auto']
+local_audio_recognition_fasterwhisper_compute_types = ['int8', 'int8_float16', 'float16']
+
 audio_types = {'remote': "云服务", 'local': "本地模型" }
 languages = {'zh-CN': "简体中文", 'en': "english", 'zh-TW': "繁體中文"}
 audio_languages = {'zh-CN': "中文", 'en-US': "english"}
4 changes: 4 additions & 0 deletions fasterwhisper/README.md
@@ -0,0 +1,4 @@
+Place the faster-whisper models in this directory.
+Model download address: https://huggingface.co/Systran
+For example, cd into this directory and run:
+git clone https://huggingface.co/Systran/faster-whisper-tiny
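Once a model has been cloned, faster-whisper can load it straight from this directory. A minimal sketch, assuming the tiny model was cloned as above and an audio file `sample.wav` exists (both paths are illustrative):

```python
from faster_whisper import WhisperModel

# Point faster-whisper at the cloned model directory; local_files_only=True
# prevents any attempt to download the model from the Hugging Face Hub.
model = WhisperModel("fasterwhisper/faster-whisper-tiny",
                     device="cpu", compute_type="int8",
                     local_files_only=True)

segments, info = model.transcribe("sample.wav", beam_size=5)
print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```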
103 changes: 100 additions & 3 deletions gui.py
@@ -1,5 +1,7 @@
 import streamlit as st
-from config.config import my_config, save_config, languages, test_config
+from config.config import my_config, save_config, languages, test_config, local_audio_tts_providers, \
+    local_audio_recognition_providers, local_audio_recognition_fasterwhisper_module_names, \
+    local_audio_recognition_fasterwhisper_device_types, local_audio_recognition_fasterwhisper_compute_types
 from pages.common import common_ui
 from tools.tr_utils import tr

@@ -51,8 +53,26 @@ def set_local_audio_tts_provider():
     save_config()


+def set_local_audio_recognition_provider():
+    test_config(my_config, "audio", "local_recognition", 'provider')
+    my_config['audio']['local_recognition']['provider'] = st.session_state['local_audio_recognition_provider']
+    save_config()
+
+
+def get_recognition_value(key):
+    recognition_provider = st.session_state['local_audio_recognition_provider']
+    return my_config['audio'].get('local_recognition', {}).get(recognition_provider, {}).get(key, '')
+
+
+def set_recognition_value(key, session_key):
+    recognition_provider = st.session_state['local_audio_recognition_provider']
+    test_config(my_config, "audio", "local_recognition", recognition_provider, key)
+    my_config['audio']['local_recognition'][recognition_provider][key] = st.session_state[session_key]
+    save_config()
+
+
 def get_chatTTS_server_location():
-    return my_config['audio'].get('local_tts', {}).get('server_location','')
+    return my_config['audio'].get('local_tts', {}).get('server_location', '')


 def set_chatTTS_server_location():
@@ -160,9 +180,9 @@ def set_llm_model_name(provider, key):
 with audio_container:
     st.info(tr("Audio Provider Info"))

+    # local TTS config
     local_tts_container = st.container(border=True)
     with local_tts_container:
-        local_audio_tts_providers = ['chatTTS', ]
         selected_local_audio_tts_provider = my_config['audio'].get('local_tts', {}).get('provider', '')
         if not selected_local_audio_tts_provider:
             selected_local_audio_tts_provider = 'chatTTS'
@@ -181,6 +201,83 @@ def set_llm_model_name(provider, key):
                               value=get_chatTTS_server_location(),
                               key="chatTTS_server_location", on_change=set_chatTTS_server_location)

+    # local recognition config
+    local_recognition_container = st.container(border=True)
+    with local_recognition_container:
+        selected_local_audio_recognition_provider = my_config['audio'].get('local_recognition', {}).get('provider', '')
+        if not selected_local_audio_recognition_provider:
+            selected_local_audio_recognition_provider = 'fasterwhisper'
+            st.session_state['local_audio_recognition_provider'] = selected_local_audio_recognition_provider
+            set_local_audio_recognition_provider()
+        selected_local_audio_recognition_provider_index = 0
+        for i, provider in enumerate(local_audio_recognition_providers):
+            if provider == selected_local_audio_recognition_provider:
+                selected_local_audio_recognition_provider_index = i
+                break
+
+        local_audio_recognition_provider = st.selectbox(tr("Local Audio recognition Provider"),
+                                                        options=local_audio_recognition_providers,
+                                                        index=selected_local_audio_recognition_provider_index,
+                                                        key='local_audio_recognition_provider',
+                                                        on_change=set_local_audio_recognition_provider)
+        recognition_columns = st.columns(3)
+        with recognition_columns[0]:
+            selected_local_audio_recognition_module = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('model_name', '')
+            if not selected_local_audio_recognition_module:
+                selected_local_audio_recognition_module = 'tiny'
+                st.session_state['recognition_model_name'] = selected_local_audio_recognition_module
+                set_recognition_value('model_name', 'recognition_model_name')
+            selected_local_audio_recognition_module_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_module_names):
+                if module_name == selected_local_audio_recognition_module:
+                    selected_local_audio_recognition_module_index = i
+                    break
+            st.selectbox(tr("model name"),
+                         options=local_audio_recognition_fasterwhisper_module_names,
+                         index=selected_local_audio_recognition_module_index,
+                         key='recognition_model_name',
+                         on_change=set_recognition_value, args=('model_name', 'recognition_model_name',))
+        with recognition_columns[1]:
+            selected_local_audio_recognition_device = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('device_type', '')
+            if not selected_local_audio_recognition_device:
+                selected_local_audio_recognition_device = 'cpu'
+                st.session_state['recognition_device_type'] = selected_local_audio_recognition_device
+                set_recognition_value('device_type', 'recognition_device_type')
+            selected_local_audio_recognition_device_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_device_types):
+                if module_name == selected_local_audio_recognition_device:
+                    selected_local_audio_recognition_device_index = i
+                    break
+            st.selectbox(tr("device type"),
+                         options=local_audio_recognition_fasterwhisper_device_types,
+                         index=selected_local_audio_recognition_device_index,
+                         key='recognition_device_type',
+                         on_change=set_recognition_value, args=('device_type', 'recognition_device_type',))
+        with recognition_columns[2]:
+            selected_local_audio_recognition_compute = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('compute_type', '')
+            if not selected_local_audio_recognition_compute:
+                selected_local_audio_recognition_compute = 'int8'
+                st.session_state['recognition_compute_type'] = selected_local_audio_recognition_compute
+                set_recognition_value('compute_type', 'recognition_compute_type')
+            selected_local_audio_recognition_compute_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_compute_types):
+                if module_name == selected_local_audio_recognition_compute:
+                    selected_local_audio_recognition_compute_index = i
+                    break
+            st.selectbox(tr("compute type"),
+                         options=local_audio_recognition_fasterwhisper_compute_types,
+                         index=selected_local_audio_recognition_compute_index,
+                         key='recognition_compute_type',
+                         on_change=set_recognition_value, args=('compute_type', 'recognition_compute_type',))
+

     # remote Audio config
     audio_providers = ['Azure', 'Ali', 'Tencent']
     selected_audio_provider = my_config['audio']['provider']
     selected_audio_provider_index = 0
8 changes: 6 additions & 2 deletions locales/zh-CN.json
@@ -15,6 +15,7 @@
   "Audio Provider Info": "配置音频库信息",
   "Remote Audio Provider": "云服务音频库",
   "Local Audio TTS Provider": "本地语音TTS",
+  "Local Audio recognition Provider": "本地语音识别",
   "ChatTTS http server location": "ChatTTS Http Server地址(如: http://127.0.0.1:8000/)",
   "Generate Video dubbing": "生成视频配音",
   "Testing Audio": "试听声音",
@@ -32,8 +33,11 @@
   "Video content language": "视频文案语言",
   "Video length": "视频时长",

-  "Video Captioning": "视频配音区",
+  "Video Captioning": "视频TTS语音合成区",
   "Audio language" : "配音语言",
+  "Choose recognition type": "选择语音识别类型",
+  "Choose recognition source": "选择语音识别源",
+  "Audio recognition": "语音识别配置",
   "Audio voice": "配音语音",
   "Audio speed": "配音语速",
   "Video Background Music": "背景音乐",
@@ -42,7 +46,7 @@
   "Background music": "背景音乐",
   "Background music volume": "背景音乐音量(默认0.3)",
   "Enable background music": "是否开启背景音乐",
-  "Choose audio type": "选择配音类型",
+  "Choose TTS audio type": "选择TTS语音合成类型",
   "Refine text": "是否口语化",
   "Audio Temperature": "Audio Temperature波动性",
   "top_P": "top_P相关性",
2 changes: 1 addition & 1 deletion main.py
@@ -253,7 +253,7 @@ def main_generate_subtitle():
     random_name = random_with_system_time()
     captioning_output = os.path.join(audio_output_dir, f"{random_name}.srt")
     st.session_state["captioning_output"] = captioning_output
-    audio_output_file = get_must_session_option("audio_output_file", "请先生成配音文件")
+    audio_output_file = get_must_session_option("audio_output_file", "请先生成视频对应的语音文件")
     generate_caption()

25 changes: 13 additions & 12 deletions pages/01_auto_video.py
@@ -1,15 +1,15 @@
 import streamlit as st

-from config.config import my_config, save_config, languages, audio_languages, transition_types, \
-    fade_list, audio_types
+from config.config import my_config, save_config, languages, audio_languages, transition_types, \
+    fade_list, audio_types
 from main import main_generate_video_content, main_generate_ai_video, main_generate_video_dubbing, \
     main_get_video_resource, main_generate_subtitle, main_try_test_audio, get_audio_voices, main_try_test_local_audio
 from pages.common import common_ui
 from tools.tr_utils import tr

 import os

-from tools.utils import get_file_map_from_dir
+from tools.utils import get_file_map_from_dir

 # Absolute path of the current script
 script_path = os.path.abspath(__file__)
@@ -26,13 +26,6 @@
 default_chattts_dir = os.path.abspath(default_chattts_dir)


-# def select_folder():
-#     root = tk.Tk()
-#     root.withdraw()
-#     folder_path = filedialog.askdirectory(master=root)
-#     root.destroy()
-#     return folder_path
-
 def save_to_config(region, key):
     value = st.session_state.get(key)
     if value:
@@ -64,6 +57,7 @@ def generate_video_dubbing():
 def try_test_audio():
     main_try_test_audio()

+
 def try_test_local_audio():
     main_try_test_local_audio()
@@ -106,7 +100,7 @@ def generate_video(video_generator):
 llm_columns = st.columns(4)
 with llm_columns[0]:
-    st.selectbox(label=tr("Choose audio type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+    st.selectbox(label=tr("Choose TTS audio type"), options=audio_types, format_func=lambda x: audio_types.get(x),
                  key="audio_type")

 if st.session_state.get("audio_type") == "remote":
@@ -176,8 +170,15 @@
     st.button(label=tr("Testing Audio"), type="primary", on_click=try_test_local_audio)


+recognition_container = st.container(border=True)
+with recognition_container:
+    # speech recognition
+    st.subheader(tr("Audio recognition"))
+
+    llm_columns = st.columns(4)
+    with llm_columns[0]:
+        st.selectbox(label=tr("Choose recognition type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+                     key="recognition_audio_type")

 # background music
 bg_music_container = st.container(border=True)
9 changes: 9 additions & 0 deletions pages/02_mix_video.py
@@ -187,6 +187,15 @@ def generate_video_for_mix(video_generator):
 with llm_columns[3]:
     st.button(label=tr("Testing Audio"), type="primary", on_click=try_test_local_audio)

+recognition_container = st.container(border=True)
+with recognition_container:
+    # speech recognition
+    st.subheader(tr("Audio recognition"))
+    llm_columns = st.columns(4)
+    with llm_columns[0]:
+        st.selectbox(label=tr("Choose recognition type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+                     key="recognition_audio_type")
+
 # background music
 bg_music_container = st.container(border=True)
 with bg_music_container:
3 changes: 2 additions & 1 deletion requirements.txt
@@ -18,4 +18,5 @@ pyperclip
 pyaudio
 torch==2.3.1
 pybase16384
-numpy==1.26.4
+numpy==1.26.4
+faster-whisper==1.0.3
69 changes: 69 additions & 0 deletions services/audio/faster_whisper_recognition_service.py
@@ -0,0 +1,69 @@
+import os
+from typing import List
+
+from config.config import my_config
+from tools.utils import must_have_value
+from faster_whisper import WhisperModel
+
+# allow duplicate OpenMP runtimes (torch and ctranslate2 both ship one)
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+# Absolute path of the current script
+script_path = os.path.abspath(__file__)
+
+# print("Absolute path of the current script:", script_path)
+
+# Directory containing this script
+script_dir = os.path.dirname(script_path)
+# Directory holding the downloaded faster-whisper models
+module_output_dir = os.path.join(script_dir, "../../fasterwhisper")
+module_output_dir = os.path.abspath(module_output_dir)
+
+
+def convert_module_to_path(module_name):
+    return_path = os.path.join(module_output_dir, module_name)
+    print(return_path, os.path.isdir(return_path))
+    return return_path
+
+
+class FasterWhisperRecognitionResult:
+    def __init__(self, text, begin_time, end_time):
+        self.text = text
+        self.begin_time = begin_time
+        self.end_time = end_time
+
+    def __str__(self):
+        return f"{self.text} {self.begin_time} {self.end_time}"
+
+
+class FasterWhisperRecognitionService:
+    def __init__(self):
+        super().__init__()
+        self.model_name = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('model_name')
+        must_have_value(self.model_name, "请设置语音识别model_name")
+        self.device_type = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('device_type')
+        self.compute_type = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('compute_type')
+        must_have_value(self.device_type, "请设置语音识别device_type")
+        must_have_value(self.compute_type, "请设置语音识别compute_type")
+
+    def process(self, audioFile, language) -> List[FasterWhisperRecognitionResult]:
+        result_list = []
+
+        # load the local model with the configured device and compute type
+        model = WhisperModel(convert_module_to_path(self.model_name), device=self.device_type,
+                             compute_type=self.compute_type, local_files_only=True)
+
+        # or run on GPU with INT8:
+        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+        # or run on CPU with INT8:
+        # model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+        segments, info = model.transcribe(audioFile, beam_size=5)
+
+        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+        for segment in segments:
+            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+            result_list.append(
+                FasterWhisperRecognitionResult(segment.text, segment.start, segment.end))
+
+        return result_list
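Downstream, the subtitle step in main.py can turn these results into an SRT file. A minimal usage sketch, assuming the config above is in place; the `seconds_to_srt_time` helper and the `demo.wav` path are illustrative, and note that `process()` currently ignores its `language` argument:

```python
from services.audio.faster_whisper_recognition_service import FasterWhisperRecognitionService


def seconds_to_srt_time(seconds: float) -> str:
    # SRT timestamps use the form HH:MM:SS,mmm.
    ms = int(round(seconds * 1000))
    hours, rem = divmod(ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"


service = FasterWhisperRecognitionService()  # reads model_name/device_type/compute_type from config
results = service.process("demo.wav", "zh-CN")

for i, result in enumerate(results, start=1):
    print(i)
    print(f"{seconds_to_srt_time(result.begin_time)} --> {seconds_to_srt_time(result.end_time)}")
    print(result.text.strip())
    print()
```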