Commit
Support faster-whisper
ddean2009 committed Jul 22, 2024
1 parent 7b8ef29 commit 2764e3c
Showing 15 changed files with 269 additions and 42 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ final/*
 resource/*
 venv
 =2.13.3
-last_published_cn.txt
+last_published_cn.txt
+fasterwhisper
2 changes: 2 additions & 0 deletions chattts/README.md
@@ -0,0 +1,2 @@
+.pt file download address:
+https://modelscope.cn/studios/ttwwwaa/ChatTTS_Speaker
9 changes: 8 additions & 1 deletion config/config.example.yml
@@ -11,9 +11,16 @@ audio:
   access_key_secret: ACCESS_KEY_SECRET
   app_key: APP_KEY
   provider: Azure
-  loca_tts:
+  local_tts:
     provider: chatTTS
     server_location: http://127.0.0.1:8080/
+  local_recognition:
+    provider: fasterwhisper
+    fasterwhisper:
+      model_name: tiny
+      device_type: cuda
+      compute_type: int8
+

 captioning:
   provider: Azure
6 changes: 6 additions & 0 deletions config/config.py
@@ -3,6 +3,12 @@

 from tools.file_utils import read_yaml, save_yaml

+local_audio_tts_providers = ['chatTTS', ]
+local_audio_recognition_providers = ['fasterwhisper', ]
+local_audio_recognition_fasterwhisper_module_names = ['large-v3', 'large-v2', 'large-v1', 'distil-large-v3', 'distil-large-v2', 'medium', 'base', 'small', 'tiny']
+local_audio_recognition_fasterwhisper_device_types = ['cuda', 'cpu', 'auto']
+local_audio_recognition_fasterwhisper_compute_types = ['int8', 'int8_float16', 'float16']
+
 audio_types = {'remote': "云服务", 'local': "本地模型" }
 languages = {'zh-CN': "简体中文", 'en': "english", 'zh-TW': "繁體中文"}
 audio_languages = {'zh-CN': "中文", 'en-US': "english"}
4 changes: 4 additions & 0 deletions fasterwhisper/README.md
@@ -0,0 +1,4 @@
+Place the faster-whisper models in this directory.
+Model download address: https://huggingface.co/Systran
+For example, cd into this directory and run:
+git clone https://huggingface.co/Systran/faster-whisper-tiny
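Once a model has been cloned, faster-whisper can load it straight from this directory. A minimal sketch, assuming the tiny model was cloned as above and an audio file `sample.wav` exists (both paths are illustrative):

```python
from faster_whisper import WhisperModel

# Point faster-whisper at the cloned model directory; local_files_only=True
# prevents any attempt to download the model from the Hugging Face Hub.
model = WhisperModel("fasterwhisper/faster-whisper-tiny",
                     device="cpu", compute_type="int8",
                     local_files_only=True)

segments, info = model.transcribe("sample.wav", beam_size=5)
print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```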
103 changes: 100 additions & 3 deletions gui.py
@@ -1,5 +1,7 @@
 import streamlit as st
-from config.config import my_config, save_config, languages, test_config
+from config.config import my_config, save_config, languages, test_config, local_audio_tts_providers, \
+    local_audio_recognition_providers, local_audio_recognition_fasterwhisper_module_names, \
+    local_audio_recognition_fasterwhisper_device_types, local_audio_recognition_fasterwhisper_compute_types
 from pages.common import common_ui
 from tools.tr_utils import tr

@@ -51,8 +53,26 @@ def set_local_audio_tts_provider():
     save_config()


+def set_local_audio_recognition_provider():
+    test_config(my_config, "audio", "local_recognition", 'provider')
+    my_config['audio']['local_recognition']['provider'] = st.session_state['local_audio_recognition_provider']
+    save_config()
+
+
+def get_recognition_value(key):
+    recognition_provider = st.session_state['local_audio_recognition_provider']
+    return my_config['audio'].get('local_recognition', {}).get(recognition_provider, {}).get(key, '')
+
+
+def set_recognition_value(key, session_key):
+    recognition_provider = st.session_state['local_audio_recognition_provider']
+    test_config(my_config, "audio", "local_recognition", recognition_provider, key)
+    my_config['audio']['local_recognition'][recognition_provider][key] = st.session_state[session_key]
+    save_config()
+
+
 def get_chatTTS_server_location():
-    return my_config['audio'].get('local_tts', {}).get('server_location','')
+    return my_config['audio'].get('local_tts', {}).get('server_location', '')


 def set_chatTTS_server_location():
@@ -160,9 +180,9 @@ def set_llm_model_name(provider, key):
 with audio_container:
     st.info(tr("Audio Provider Info"))

+    # local TTS config
     local_tts_container = st.container(border=True)
     with local_tts_container:
-        local_audio_tts_providers = ['chatTTS', ]
         selected_local_audio_tts_provider = my_config['audio'].get('local_tts', {}).get('provider', '')
         if not selected_local_audio_tts_provider:
             selected_local_audio_tts_provider = 'chatTTS'
@@ -181,6 +201,83 @@ def set_llm_model_name(provider, key):
                               value=get_chatTTS_server_location(),
                               key="chatTTS_server_location", on_change=set_chatTTS_server_location)

+    # local recognition config
+    local_recognition_container = st.container(border=True)
+    with local_recognition_container:
+        selected_local_audio_recognition_provider = my_config['audio'].get('local_recognition', {}).get('provider', '')
+        if not selected_local_audio_recognition_provider:
+            selected_local_audio_recognition_provider = 'fasterwhisper'
+            st.session_state['local_audio_recognition_provider'] = selected_local_audio_recognition_provider
+            set_local_audio_recognition_provider()
+        selected_local_audio_recognition_provider_index = 0
+        for i, provider in enumerate(local_audio_recognition_providers):
+            if provider == selected_local_audio_recognition_provider:
+                selected_local_audio_recognition_provider_index = i
+                break
+
+        local_audio_recognition_provider = st.selectbox(tr("Local Audio recognition Provider"),
+                                                        options=local_audio_recognition_providers,
+                                                        index=selected_local_audio_recognition_provider_index,
+                                                        key='local_audio_recognition_provider',
+                                                        on_change=set_local_audio_recognition_provider)
+        recognition_columns = st.columns(3)
+        with recognition_columns[0]:
+            selected_local_audio_recognition_module = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('model_name', '')
+            if not selected_local_audio_recognition_module:
+                selected_local_audio_recognition_module = 'tiny'
+                st.session_state['recognition_model_name'] = selected_local_audio_recognition_module
+                set_recognition_value('model_name', 'recognition_model_name')
+            selected_local_audio_recognition_module_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_module_names):
+                if module_name == selected_local_audio_recognition_module:
+                    selected_local_audio_recognition_module_index = i
+                    break
+            st.selectbox(tr("model name"),
+                         options=local_audio_recognition_fasterwhisper_module_names,
+                         index=selected_local_audio_recognition_module_index,
+                         key='recognition_model_name',
+                         on_change=set_recognition_value, args=('model_name', 'recognition_model_name',))
+        with recognition_columns[1]:
+            selected_local_audio_recognition_device = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('device_type', '')
+            if not selected_local_audio_recognition_device:
+                selected_local_audio_recognition_device = 'cpu'
+                st.session_state['recognition_device_type'] = selected_local_audio_recognition_device
+                set_recognition_value('device_type', 'recognition_device_type')
+            selected_local_audio_recognition_device_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_device_types):
+                if module_name == selected_local_audio_recognition_device:
+                    selected_local_audio_recognition_device_index = i
+                    break
+            st.selectbox(tr("device type"),
+                         options=local_audio_recognition_fasterwhisper_device_types,
+                         index=selected_local_audio_recognition_device_index,
+                         key='recognition_device_type',
+                         on_change=set_recognition_value, args=('device_type', 'recognition_device_type',))
+        with recognition_columns[2]:
+            selected_local_audio_recognition_compute = my_config['audio'].get('local_recognition', {}).get(
+                st.session_state['local_audio_recognition_provider'], {}).get('compute_type', '')
+            if not selected_local_audio_recognition_compute:
+                selected_local_audio_recognition_compute = 'int8'
+                st.session_state['recognition_compute_type'] = selected_local_audio_recognition_compute
+                set_recognition_value('compute_type', 'recognition_compute_type')
+            selected_local_audio_recognition_compute_index = 0
+            for i, module_name in enumerate(local_audio_recognition_fasterwhisper_compute_types):
+                if module_name == selected_local_audio_recognition_compute:
+                    selected_local_audio_recognition_compute_index = i
+                    break
+            st.selectbox(tr("compute type"),
+                         options=local_audio_recognition_fasterwhisper_compute_types,
+                         index=selected_local_audio_recognition_compute_index,
+                         key='recognition_compute_type',
+                         on_change=set_recognition_value, args=('compute_type', 'recognition_compute_type',))
+

     # remote Audio config
     audio_providers = ['Azure', 'Ali', 'Tencent']
     selected_audio_provider = my_config['audio']['provider']
     selected_audio_provider_index = 0
8 changes: 6 additions & 2 deletions locales/zh-CN.json
@@ -15,6 +15,7 @@
   "Audio Provider Info": "配置音频库信息",
   "Remote Audio Provider": "云服务音频库",
   "Local Audio TTS Provider": "本地语音TTS",
+  "Local Audio recognition Provider": "本地语音识别",
   "ChatTTS http server location": "ChatTTS Http Server地址(如: http://127.0.0.1:8000/)",
   "Generate Video dubbing": "生成视频配音",
   "Testing Audio": "试听声音",
@@ -32,8 +33,11 @@
   "Video content language": "视频文案语言",
   "Video length": "视频时长",

-  "Video Captioning": "视频配音区",
+  "Video Captioning": "视频TTS语音合成区",
   "Audio language" : "配音语言",
+  "Choose recognition type": "选择语音识别类型",
+  "Choose recognition source": "选择语音识别源",
+  "Audio recognition": "语音识别配置",
   "Audio voice": "配音语音",
   "Audio speed": "配音语速",
   "Video Background Music": "背景音乐",
@@ -42,7 +46,7 @@
   "Background music": "背景音乐",
   "Background music volume": "背景音乐音量(默认0.3)",
   "Enable background music": "是否开启背景音乐",
-  "Choose audio type": "选择配音类型",
+  "Choose TTS audio type": "选择TTS语音合成类型",
   "Refine text": "是否口语化",
   "Audio Temperature": "Audio Temperature波动性",
   "top_P": "top_P相关性",
2 changes: 1 addition & 1 deletion main.py
@@ -253,7 +253,7 @@ def main_generate_subtitle():
     random_name = random_with_system_time()
     captioning_output = os.path.join(audio_output_dir, f"{random_name}.srt")
     st.session_state["captioning_output"] = captioning_output
-    audio_output_file = get_must_session_option("audio_output_file", "请先生成配音文件")
+    audio_output_file = get_must_session_option("audio_output_file", "请先生成视频对应的语音文件")
     generate_caption()

25 changes: 13 additions & 12 deletions pages/01_auto_video.py
@@ -1,15 +1,15 @@
 import streamlit as st

-from config.config import my_config, save_config, languages, audio_languages, transition_types, \
-    fade_list, audio_types
+from config.config import my_config, save_config, languages, audio_languages, transition_types, \
+    fade_list, audio_types
 from main import main_generate_video_content, main_generate_ai_video, main_generate_video_dubbing, \
     main_get_video_resource, main_generate_subtitle, main_try_test_audio, get_audio_voices, main_try_test_local_audio
 from pages.common import common_ui
 from tools.tr_utils import tr

 import os

-from tools.utils import get_file_map_from_dir
+from tools.utils import get_file_map_from_dir

 # Absolute path of the current script
 script_path = os.path.abspath(__file__)
@@ -26,13 +26,6 @@
 default_chattts_dir = os.path.abspath(default_chattts_dir)


-# def select_folder():
-#     root = tk.Tk()
-#     root.withdraw()
-#     folder_path = filedialog.askdirectory(master=root)
-#     root.destroy()
-#     return folder_path
-
 def save_to_config(region, key):
     value = st.session_state.get(key)
     if value:
@@ -64,6 +57,7 @@ def generate_video_dubbing():
 def try_test_audio():
     main_try_test_audio()

+
 def try_test_local_audio():
     main_try_test_local_audio()
@@ -106,7 +100,7 @@ def generate_video(video_generator):
 llm_columns = st.columns(4)
 with llm_columns[0]:
-    st.selectbox(label=tr("Choose audio type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+    st.selectbox(label=tr("Choose TTS audio type"), options=audio_types, format_func=lambda x: audio_types.get(x),
                  key="audio_type")

 if st.session_state.get("audio_type") == "remote":
@@ -176,8 +170,15 @@
     st.button(label=tr("Testing Audio"), type="primary", on_click=try_test_local_audio)


+recognition_container = st.container(border=True)
+with recognition_container:
+    # speech recognition
+    st.subheader(tr("Audio recognition"))
+
+    llm_columns = st.columns(4)
+    with llm_columns[0]:
+        st.selectbox(label=tr("Choose recognition type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+                     key="recognition_audio_type")

 # background music
 bg_music_container = st.container(border=True)
9 changes: 9 additions & 0 deletions pages/02_mix_video.py
@@ -187,6 +187,15 @@ def generate_video_for_mix(video_generator):
 with llm_columns[3]:
     st.button(label=tr("Testing Audio"), type="primary", on_click=try_test_local_audio)

+recognition_container = st.container(border=True)
+with recognition_container:
+    # speech recognition
+    st.subheader(tr("Audio recognition"))
+    llm_columns = st.columns(4)
+    with llm_columns[0]:
+        st.selectbox(label=tr("Choose recognition type"), options=audio_types, format_func=lambda x: audio_types.get(x),
+                     key="recognition_audio_type")
+
 # background music
 bg_music_container = st.container(border=True)
 with bg_music_container:
3 changes: 2 additions & 1 deletion requirements.txt
@@ -18,4 +18,5 @@ pyperclip
 pyaudio
 torch==2.3.1
 pybase16384
-numpy==1.26.4
+numpy==1.26.4
+faster-whisper==1.0.3
69 changes: 69 additions & 0 deletions services/audio/faster_whisper_recognition_service.py
@@ -0,0 +1,69 @@
+import os
+from typing import List
+
+from config.config import my_config
+from tools.utils import must_have_value
+from faster_whisper import WhisperModel
+
+# allow duplicate OpenMP runtimes (torch and ctranslate2 both ship one)
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+# Absolute path of the current script
+script_path = os.path.abspath(__file__)
+
+# print("Absolute path of the current script:", script_path)
+
+# Directory containing this script
+script_dir = os.path.dirname(script_path)
+# Directory holding the downloaded faster-whisper models
+module_output_dir = os.path.join(script_dir, "../../fasterwhisper")
+module_output_dir = os.path.abspath(module_output_dir)
+
+
+def convert_module_to_path(module_name):
+    return_path = os.path.join(module_output_dir, module_name)
+    print(return_path, os.path.isdir(return_path))
+    return return_path
+
+
+class FasterWhisperRecognitionResult:
+    def __init__(self, text, begin_time, end_time):
+        self.text = text
+        self.begin_time = begin_time
+        self.end_time = end_time
+
+    def __str__(self):
+        return f"{self.text} {self.begin_time} {self.end_time}"
+
+
+class FasterWhisperRecognitionService:
+    def __init__(self):
+        super().__init__()
+        self.model_name = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('model_name')
+        must_have_value(self.model_name, "请设置语音识别model_name")
+        self.device_type = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('device_type')
+        self.compute_type = my_config['audio'].get('local_recognition', {}).get('fasterwhisper', {}).get('compute_type')
+        must_have_value(self.device_type, "请设置语音识别device_type")
+        must_have_value(self.compute_type, "请设置语音识别compute_type")
+
+    def process(self, audioFile, language) -> List[FasterWhisperRecognitionResult]:
+        result_list = []
+
+        # load the local model with the configured device and compute type
+        model = WhisperModel(convert_module_to_path(self.model_name), device=self.device_type,
+                             compute_type=self.compute_type, local_files_only=True)
+
+        # or run on GPU with INT8:
+        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+        # or run on CPU with INT8:
+        # model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+        segments, info = model.transcribe(audioFile, beam_size=5)
+
+        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+        for segment in segments:
+            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+            result_list.append(
+                FasterWhisperRecognitionResult(segment.text, segment.start, segment.end))
+
+        return result_list
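Downstream, the subtitle step in main.py can turn these results into an SRT file. A minimal usage sketch, assuming the config above is in place; the `seconds_to_srt_time` helper and the `demo.wav` path are illustrative, and note that `process()` currently ignores its `language` argument:

```python
from services.audio.faster_whisper_recognition_service import FasterWhisperRecognitionService


def seconds_to_srt_time(seconds: float) -> str:
    # SRT timestamps use the form HH:MM:SS,mmm.
    ms = int(round(seconds * 1000))
    hours, rem = divmod(ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"


service = FasterWhisperRecognitionService()  # reads model_name/device_type/compute_type from config
results = service.process("demo.wav", "zh-CN")

for i, result in enumerate(results, start=1):
    print(i)
    print(f"{seconds_to_srt_time(result.begin_time)} --> {seconds_to_srt_time(result.end_time)}")
    print(result.text.strip())
    print()
```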