Skip to content

Commit

Permalink
增加说明 增加图片 增加输出字幕
Browse files Browse the repository at this point in the history
  • Loading branch information
m986883511 committed Aug 1, 2021
1 parent b66c592 commit 81ce5f8
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 29 deletions.
65 changes: 59 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,78 @@
# extract-video-subtittle
使用深度学习框架提取视频硬字幕
使用深度学习框架提取视频硬字幕

本地识别无需联网;

CPU识别速度可观;

容器提供API接口;



# 运行环境

本项目运行环境非常好搭建,我做好了docker容器免安装各种深度学习包
本项目运行环境非常好搭建,我做好了docker容器免安装各种深度学习包;

提供windows界面操作;

提供界面操作
容器为CPU版本;



## 视频演示

xxxx
bilibili



## 程序说明

1、先启动后端容器实例

```shell
docker run -d -p 6666:6666 m986883511/extract_subtitles
```

![image-20210801214757813](image/docker-run.png)

2、启动程序

简单介绍页面

1:点击左边按钮连接第一步启动的容器;

2:视频提取字幕的总进度

3:当前视频帧显示的位置,就是视频进度条

4:识别出来的文字会在这里显示一下

![image-20210801215010179](image/2-run-exe.png)

![image-20210801215258761](image/3-view.png)

3、点击选择视频确认字幕位置

点击选择视频按钮,这时你可以拖动进度条到有字幕的位置;然后点击选择字幕区域;在视频中画一个矩形;

![image-20210801215258761](image/4-rect.png)

4、点击测试连接API

![image-20210801220206554](image/5-connect.png)

后端没问题的话,会显示已连通;此时所有步骤准备就绪

5、开始识别

点击请先完成前几步按钮,内部分为这几个步骤

1. 本地通过ffmpeg提取视频声音保存到temp目录(0%-10%)
2. api通信将声音文件发送到容器内,容器内spleeter库提取声音中人声,结果保存在容器内temp目录,很耗时间,吃CPU和内存(10%-30%)
3. api通信,将人声根据停顿分片,返回分片结果,耗较短的时间(30%-40%)
4. 根据说话分片时间开始识别字幕(40%-100%)

当100%的时候查看temp目录就生成了和视频同名的srt字幕文件



Expand Down Expand Up @@ -47,7 +100,7 @@ C:/Python/Python38-32/Scripts/pyinstaller.exe main.spec

```

# 深度学习后端
本项目中深度学习源代码为/docker/backend.tar
# 参考资料
本项目中深度学习源代码为/docker/backend

原作者为:https://github.com/YaoFANGUK/video-subtitle-extractor
Binary file added image/2-run-exe.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/3-view.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/4-rect.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/5-connect.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/docker-run.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/view.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added image/weixin.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
81 changes: 72 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime
import json
import os
import time
import sys
Expand All @@ -7,16 +9,60 @@
sys.path.append(os.path.join(current_dir, 'ffmpeg'))

import cv2
import srt
from PyQt5 import QtGui, QtWidgets
from PyQt5.QtCore import *
from PyQt5.QtGui import QPixmap
from PyQt5.QtWidgets import QTextEdit, QLabel, QFileDialog, QMessageBox
from PyQt5.QtWidgets import QSlider, QVBoxLayout, QLineEdit, QWidget
from PyQt5.QtWidgets import QSlider, QVBoxLayout, QLineEdit, QWidget, QHBoxLayout
from PyQt5.QtWidgets import QPushButton, QGridLayout, QDialog, QTabWidget

from interface import ExtractSubtitleApi
from custom_component import PaintRectLabel
from scripts import get_extract_voice_progress
from scripts import get_extract_voice_progress, get_data_time


class SaveResult:
    """Accumulate recognized subtitle lines for one video and persist them.

    Results are written next to the program in ``temp/`` as a JSON dump
    (``save_json``) and an SRT subtitle file (``save_srt``), both named
    after the source video registered via ``set_video_msg``.
    """

    def __init__(self):
        # 'video' holds video metadata; 'text'/'sub_tittle' holds the
        # recognized lines (key spelling kept for backward compatibility).
        self.result = {
            'video': {},
            'text': {
                'sub_tittle': []
            }
        }

    def get_data_time(self, str_time):
        """Parse an 'HH:MM:SS.mmm' timestamp into a datetime.timedelta.

        Bug fix: the previous slices read minutes as [4:5], seconds as
        [7:8] and milliseconds as [9:11], which dropped digits and parsed
        the wrong columns; the correct offsets for 'HH:MM:SS.mmm' are used
        here (matching the module-level helper in scripts.py).
        """
        return datetime.timedelta(hours=int(str_time[0:2]), minutes=int(str_time[3:5]),
                                  seconds=int(str_time[6:8]), milliseconds=int(str_time[9:]))

    def add_sub_tittle(self, start_time, end_time, text):
        """Append one recognized subtitle entry; empty text is ignored."""
        if text:
            self.result['text']['sub_tittle'].append(
                {'start_time': start_time, 'end_time': end_time, 'text': text})

    def set_video_msg(self, video_path):
        # Remember the source video path; output file names derive from it.
        self.result['video_path'] = video_path

    def _output_path(self, extension):
        """Return the temp-dir output path for the video, or '' if no
        video has been registered (shared by save_json/save_srt)."""
        video_path = self.result.get('video_path', '')
        video_name = ''.join(os.path.basename(video_path).split('.')[:-1])
        if not video_name:
            return ''
        return os.path.join(current_dir, 'temp', video_name + extension)

    def save_json(self):
        """Write the accumulated result as pretty-printed UTF-8 JSON."""
        save_path = self._output_path('.json')
        if save_path:
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(self.result, indent=4, ensure_ascii=False))
        else:
            print('not set video name')

    def save_srt(self):
        """Write the accumulated subtitles as an SRT file.

        Bug fixes: the SRT index previously received the subtitle *dict*
        itself instead of a 1-based counter, and the file was opened
        without an explicit encoding (mojibake risk on Windows).
        """
        save_path = self._output_path('.srt')
        if not save_path:
            print('not set video name')
            return
        sub_tittles = [
            srt.Subtitle(index,
                         start=self.get_data_time(item['start_time']),
                         end=self.get_data_time(item['end_time']),
                         content=item['text'])
            for index, item in enumerate(self.result['text']['sub_tittle'], start=1)
        ]
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(srt.compose(sub_tittles))


class SoftData:
Expand Down Expand Up @@ -105,11 +151,13 @@ def __init__(self, load_picture_thread: LoadVideoPicture):
def get_current_sub_tittle_base64_img(self):
rect_frame = SoftData.Video.current_frame[SoftData.Video.rect[1]:SoftData.Video.rect[3],
SoftData.Video.rect[0]:SoftData.Video.rect[2]]
image = cv2.imencode('.jpg',rect_frame)[1]
image = cv2.imencode('.jpg', rect_frame)[1]
image_code = str(base64.b64encode(image))[2:-1]
return image_code

def run(self):
save_result = SaveResult()
save_result.set_video_msg(SoftData.Path.video)
# 1、视频人声和背景声音分离
self.log_signal.emit('开始提取人声结果,可能等待较长时间......')
result = self.api_interface.extract_human_voice_from_sound(SoftData.Path.sound)
Expand All @@ -131,21 +179,31 @@ def run(self):
zheng = int(time_zhong / (1000 / SoftData.Video.fps))
self.load_picture_thread.set_value(zheng)
result = self.api_interface.text_recognition(self.get_current_sub_tittle_base64_img())
self.log_signal.emit('{}'.format(value))
self.log_signal.emit('{}'.format(result))
text = ';'.join([value[1] for value in result ])
time_point='{}-{}'.format(time.strftime("%H:%M:%S", time.gmtime(value[0]/100)),
time.strftime("%H:%M:%S", time.gmtime(value[1]/100)))
text = ';'.join([value[1] for value in result])
start_time = time.strftime("%H:%M:%S", time.gmtime(value[0] // 1000))
start_time = start_time + '.{}'.format(value[0] % 1000)
end_time = time.strftime("%H:%M:%S", time.gmtime(value[1] // 1000))
end_time = end_time + '.{}'.format(value[1] % 1000)
print(value, start_time, end_time)
time_point = '{}-{}'.format(start_time, end_time)
self.load_picture_thread.subtittle_result_label.setText('{}: {}'.format(time_point, text))
text = '{}'.format(text)
save_result.add_sub_tittle(start_time, end_time, text)
else:
self.progress_signal.emit(100)
self.log_signal.emit('视频字幕提取成功')
save_result.save_json()
save_result.save_srt()


class MainUi(QDialog):
process = QProcess()

def __init__(self):
super().__init__()
self.setWindowTitle('视频硬字幕提取 https://github.com/m986883511/extract-video-subtittle')
self.api_interface = ExtractSubtitleApi()
self.init_ui()
self.video_capture = None
Expand Down Expand Up @@ -195,15 +253,20 @@ def get_background_print_tab(self):

def get_thank_author_tab(self):
thank_author_tab = QWidget()
layout = QVBoxLayout()
self.thank_author_label = QLabel('一张感谢作者的二维码转账图,哈哈哈')
layout = QHBoxLayout()
self.thank_author_label = QLabel('软件觉得不错,可以打赏')
self.thank_author_picture = QLabel()
picture = os.path.join(current_dir, 'image', 'weixin.jpg')
self.thank_author_picture.setScaledContents(True)
self.thank_author_picture.setPixmap(QPixmap(picture))
layout.addWidget(self.thank_author_label)
layout.addWidget(self.thank_author_picture)
thank_author_tab.setLayout(layout)
return thank_author_tab, '感谢作者'

def init_ui(self):
self.setMinimumWidth(1000)
self.setMinimumHeight(618)
self.setMinimumHeight(800)
layout = QVBoxLayout()
self.video_path_button = QPushButton(SoftData.Button.select_video_path)
self.video_path_button.clicked.connect(self.open_dialog_select_video_file)
Expand Down
2 changes: 1 addition & 1 deletion requeirments.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ pyqt5
requests
opencv-python
requests_toolbelt

srt
19 changes: 6 additions & 13 deletions scripts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime

import cv2


Expand Down Expand Up @@ -26,16 +28,7 @@ def get_extract_voice_progress(input_str: str)->int:
progress = get_seconds(Value.now_time) / get_seconds(Value.all_time) * 100
return int(progress)

# def CropImage4File(image):
# sp = image.shape # 获取图像形状:返回【行数值,列数值】列表
# sz1 = sp[0] # 图像的高度(行 范围)
# sz2 = sp[1] # 图像的宽度(列 范围)
# # sz3 = sp[2] #像素值由【RGB】三原色组成
#
# # 你想对文件的操作
# a = int(sz1 / 2 - 64) # x start
# b = int(sz1 / 2 + 64) # x end
# c = int(sz2 / 2 - 64) # y start
# d = int(sz2 / 2 + 64) # y end
# cropImg = image[a:b, c:d] # 裁剪图像
# cv2.imwrite(dest, cropImg) # 写入图像路径

def get_data_time(str_time):
    """Parse an 'HH:MM:SS.mmm' timestamp string into a datetime.timedelta.

    Fixed column offsets are assumed: two-digit hours, minutes and
    seconds, with everything after the dot taken as milliseconds.
    """
    hours = int(str_time[0:2])
    minutes = int(str_time[3:5])
    seconds = int(str_time[6:8])
    millis = int(str_time[9:])
    return datetime.timedelta(hours=hours, minutes=minutes,
                              seconds=seconds, milliseconds=millis)

0 comments on commit 81ce5f8

Please sign in to comment.