diff --git a/.gitignore b/.gitignore
index 2b3817d..7158936 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+records/
+*.mp3
+*.mp4
\ No newline at end of file
diff --git a/README.md b/README.md
index 47ec8a1..622c54a 100644
--- a/README.md
+++ b/README.md
@@ -4,17 +4,25 @@ Real-time video understanding and interaction through text, audio, image and video
 A real-time framework for video understanding and interaction built on multimodal LLMs: ask questions about and talk to the world through text, speech, images and video.
 
-## Start the backend service
+## Start the frontend chat service
+It implements two things:
+
+- 1. the Streamlit chat UI
+- 2. the GPT-4V request interface
+
 Open `run.sh`, fill in your own API key, then start it:
 ```shell
 sh run.sh
 ```
-
-## Start the frontend UI
-:construction: (under construction)
-```
-python demo.py
+## TTS and ASR services
+- ASR
+The service comes from the S group (TODO: should a service endpoint be published here?)
+- TTS
+See [tts.py](./real_gemini/tts.py); launch script:
+```shell
+python tts.py
 ```
+Running these services needs some extra dependencies and models: `torch`, `torchaudio` and `TTS`, all installable with `pip`; the model file paths are listed in the Python script.
 
 ## Acknowledgement
 - [Fastapi](https://github.com/tiangolo/fastapi)
diff --git a/install_pyaudio.sh b/install_pyaudio.sh
new file mode 100644
index 0000000..928b2d0
--- /dev/null
+++ b/install_pyaudio.sh
@@ -0,0 +1,5 @@
+wget http://www.portaudio.com/archives/pa_stable_v190600_20161030.tgz
+tar -zxvf pa_stable_v190600_20161030.tgz
+cd portaudio/
+./configure && make && sudo make install
+pip install pyaudio
diff --git a/real_gemini/chat.py b/real_gemini/chat.py
new file mode 100644
index 0000000..fa00c98
--- /dev/null
+++ b/real_gemini/chat.py
@@ -0,0 +1,176 @@
+import uuid
+import streamlit as st
+from pathlib import Path
+from utils_st.audio2text import audio2text_from_bytes
+from utils_st.extracte_img import get_main_img
+from utils_st.get_gpt4v_response import gpt4v_client
+from utils_st.get_qwen_response import QwenVL_client
+from utils_st.text2audio import text2audio, autoplay_audio
+from utils_st.record_video import record
+from queue import Queue
+import time
+import cv2
+from threading import Thread, Event
+
+img = {'assistant': './source/bot.png', 'user': None}
+res_ = {'Qwen-vl': QwenVL_client, 'gpt4v': gpt4v_client}
+
+# event locks coordinating the recorder thread and the chat loop
+event_record = Event()
+event_chat = Event()
+event_record.set()  # recording is enabled at startup
+
+with st.sidebar:
+    with st.form('Settings'):
+        max_chat_turn = st.slider('Max chat turns:', min_value=1, max_value=10000, value=10)
+        response_name = st.selectbox('Model', ['Qwen-vl', 'gpt4v'], index=1)
+        st.form_submit_button('Apply')
+responser = res_[response_name]
+max_record_round = 2 * max_chat_turn
+q = Queue(max_record_round)
+
+st.title("Gemini-like chat test")
+######################### storage for recorded inputs #####################
+# RECORD_DIR = Path("./records")
+# RECORD_DIR.mkdir(exist_ok=True)
+# if "prefix" not in st.session_state:
+#     st.session_state["prefix"] = str(uuid.uuid4())
+# prefix = st.session_state["prefix"]
+# in_file_video = RECORD_DIR / f"{prefix}_input_video.mp4"
+# in_file_audio = RECORD_DIR / f"{prefix}_input_audio.mp3"
+######################### storage for recorded inputs #####################
+# avatar for the chat bot
+
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+def my_recorder():
+    for i in range(max_record_round):
+        # wait until recording is allowed (enabled by default at startup)
+        print('holding to record')
+        event_record.wait()
+        print(f'record {i}')
+        imgs, audio = record()
+        input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+        # drop inputs that are too short to be meaningful
+        if input_text and len(input_text) > 5:
+            q.put((imgs, audio, input_text))
+        else:
+            print(f'unexpected input: id--{request_id}, status--{code_status}, text--{input_text}')
+            time.sleep(2)  # give the speaker 2 seconds to get ready again
+            continue
+        print(f'turn {i} recorded, queue size {q.qsize()}')
+        # recording done: unblock the chat loop and block the next recording round
+        event_record.clear()
+        event_chat.set()
+        print('released the chat lock, took the recording lock')
+    print('input-processing service finished')
+
+def show_chat_message_from_history(show_num_history=None):
+    # Display chat messages from history on app rerun
+    # show_num_history: a negative even number shows the last N messages
+    # (whole turns); a positive number skips the first N
+    if show_num_history is None:
+        history = st.session_state.messages
+    else:
+        history = st.session_state.messages[show_num_history:]
+    for message in history:
+        with st.chat_message(message["role"], avatar=img[message['role']]):
+            try:
+                if message['audio'] is not None:
+                    st.audio(message['audio'], sample_rate=24000)
+            except KeyError:
+                pass
+            st.markdown(message["content"])
+            try:
+                if message['img'] is not None:
+                    st.image(message['img'])
+            except KeyError:
+                pass
+
+def response(prompt=None, imgs=None, autoplay=True, audio_response=True):
+    """
+    prompt: the input text
+    imgs: the input images
+    autoplay: whether to auto-play the synthesized speech
+    audio_response: whether to turn the text reply into speech
+    """
+    if prompt:
+        sound = None
+        # Display user message in chat message container
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # Display assistant response in chat message container
+        with st.chat_message("assistant", avatar='./source/bot.png'):
+            res = responser(query=prompt, imgs=imgs)
+            print('res[text]:', res['text'])
+            if audio_response:
+                sound, rate, byte_sound_array = text2audio(res["text"])
+            else:
+                autoplay = False
+            if autoplay:
+                autoplay_audio(byte_sound_array)
+            if not autoplay and audio_response:
+                # render a player instead of auto-playing
+                st.audio(sound, sample_rate=rate)
+            st.markdown(res['text'])
+            try:
+                st.image(res['imgs'])
+            except KeyError:
+                pass
+            # when auto-playing, wait until the audio has finished
+            if autoplay:
+                time.sleep(int(len(sound) / rate) + 1)
+        st.session_state.messages.append({"role": "assistant", "content": res['text'], 'audio': sound})
+
+
+if __name__ == '__main__':
+    max_round = max_chat_turn + 50  # safety margin instead of an unconditional while loop
+    record_thread = Thread(target=my_recorder)
+    # show the camera feed
+    video_show = st.container()
+    video_show.camera_input('tt', label_visibility='hidden')
+    # start capturing input
+    if video_show.button('Start chatting'):
+        st.info('Listening on the microphone...')
+        record_thread.start()
+    else:
+        st.stop()
+    # shows input processing
+    placeholder = st.empty()
+    # shows the conversation
+    chat_placeholder = st.empty()
+    while max_round > 0:
+        # blocked until the first recording round finishes and opens the chat lock
+        print('waiting for the chat to start')
+        event_chat.wait()
+        print('chat started')
+        if not q.empty():
+            # pause recording while responding, so the played audio is not recorded
+            print('entering the response step, recording paused')
+            imgs, audio, input_text = q.get()
+            with placeholder.status('Processing input...', state='running', expanded=True) as status:
+                if len(imgs) > 0:
+                    st.write('getMainFrame...')
+                    imgs = get_main_img(imgs)
+                    imgs = imgs[-3:]
+                    cls = st.columns(min(3, len(imgs)))
+                    for idx, cl in enumerate(cls):
+                        cl.image(cv2.cvtColor(imgs[idx], cv2.COLOR_BGR2RGB))
+                st.audio(audio.get_wav_data())
+                st.text(f'Recognized text: {input_text}')
+                status.update(label="Input processed", state="complete", expanded=False)
+            with chat_placeholder.container(height=600):  # container height needs streamlit >= 1.30
+                # https://github.com/streamlit/streamlit/issues/2169
+                show_chat_message_from_history()
+                response(prompt=input_text, imgs=imgs, autoplay=True, audio_response=True)
+            print('response done: released the recording lock, closed the chat lock')
+            # response finished, let the recorder run again
+            event_record.set()
+        # no pending input: block until the recorder signals again
+        event_chat.clear()
+        # chat_placeholder.empty()
+        max_round -= 1  # count down toward the turn limit
+    print('Reached the maximum number of chat turns, exiting!')
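The hand-off above is the crux of `chat.py`: two `threading.Event`s gate a recorder (producer) and a chat loop (consumer) around a bounded `Queue`, so the app never records its own TTS playback. A minimal, self-contained sketch of the same pattern (the `fake_record`/`chat_loop` names are hypothetical stand-ins, not part of this PR):

```python
# Sketch of the two-Event record/chat hand-off used in chat.py above.
import time
from queue import Queue
from threading import Event, Thread

event_record = Event()   # set -> recorder may capture
event_chat = Event()     # set -> consumer may respond
event_record.set()       # recording is allowed first
q: Queue = Queue(maxsize=4)

def fake_record(i: int) -> str:
    time.sleep(0.1)                 # pretend to record audio/video
    return f"utterance {i}"

def recorder(turns: int) -> None:
    for i in range(turns):
        event_record.wait()         # block while a response is playing
        q.put(fake_record(i))
        event_record.clear()        # pause recording...
        event_chat.set()            # ...and wake the consumer

def chat_loop(turns: int) -> None:
    for _ in range(turns):
        event_chat.wait()           # block until an utterance arrives
        print("responding to:", q.get())
        event_chat.clear()          # go back to waiting...
        event_record.set()          # ...and resume recording

t = Thread(target=recorder, args=(3,))
t.start()
chat_loop(3)
t.join()
```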
diff --git a/real_gemini/pages/audio_test.py b/real_gemini/pages/audio_test.py
new file mode 100644
index 0000000..fb6f13c
--- /dev/null
+++ b/real_gemini/pages/audio_test.py
@@ -0,0 +1,87 @@
+import streamlit as st
+import speech_recognition as sr
+import base64
+from utils_st.audio2text import audio2text_from_bytes
+from moviepy.editor import AudioFileClip
+from utils_st.record_video import VideoRecorder
+import time
+from utils_st.record_video import record
+from queue import Queue
+
+max_turn = 20
+q = Queue(max_turn)
+
+def audio_record():
+    r = sr.Recognizer()
+    r.energy_threshold = 500  # threshold for detecting speech
+    with sr.Microphone() as source:
+        st.write('Please start speaking, listening now')
+        # phrase_time_limit: max recording length; timeout: how long to wait for speech
+        for i in range(max_turn):
+            print(f'turn {i} start')
+            audio = r.listen(source, phrase_time_limit=15, timeout=None)
+            q.put(audio)
+            print(f'audio recorded, queue size {q.qsize()}')
+    print('done')
+
+def my_recorder():
+    for i in range(max_turn):
+        imgs, audio = record()
+        q.put((imgs, audio))
+        print(f'recording finished, queue size {q.qsize()}')
+    print('input-processing service finished')
+
+def res():
+    print('entering response')
+    i = 20
+    while i > 0:
+        if q.empty():
+            print('queue is empty, waiting')
+            time.sleep(5)
+            i -= 1
+        else:
+            print('request ok~')
+            audio = q.get()
+            st.audio(audio.get_wav_data())
+            input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+            print(f'recognized text: {input_text}')
+            st.text(f'Recognized text: {input_text}')
+            i -= 1
+    print('response over~')
+
+def show_chat_message_from_history():
+    pass
+
+if __name__ == '__main__':
+    from threading import Thread
+    t1 = Thread(target=my_recorder)
+    t2 = Thread(target=res)
+    st.camera_input('tt', label_visibility='hidden')
+    if st.button('Start chatting'):
+        t1.start()
+    # t2.start()
+    # t1.join()
+    # t2.join()
+    i = 20
+    placeholder = st.empty()
+    while i > 0:
+        if q.empty():
+            print('queue is empty, waiting')
+            time.sleep(5)
+            i -= 1
+        else:
+            print('request ok~')
+            imgs, audio = q.get()
+            st.audio(audio.get_wav_data())
+            input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+            print(f'recognized text: {input_text}')
+            st.text(f'Recognized text: {input_text}')
+            st.text('Captured frames:')
+            for idx, cl in enumerate(st.columns(min(3, len(imgs)))):
+                cl.image(imgs[idx])
+            st.divider()
+            i -= 1
+    print('response over~')
diff --git a/real_gemini/pages/chat_with_gpt.py b/real_gemini/pages/chat_with_gpt.py
new file mode 100644
index 0000000..358294e
--- /dev/null
+++ b/real_gemini/pages/chat_with_gpt.py
@@ -0,0 +1,44 @@
+import streamlit as st
+from openai import OpenAI
+import os
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+st.title("ChatGPT-like")
+
+# Read the OpenAI API key from the environment
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# Pick a model
+with st.sidebar:
+    with st.form('Settings'):
+        # 'gpt-4-1106-preview' is the GPT-4 Turbo model id
+        model = st.selectbox('Model version', ['gpt-4', 'gpt-4-1106-preview', 'gpt-3.5-turbo'], index=0)
+        st.form_submit_button('Apply')
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Accept user input
+if prompt := st.chat_input("Ask your question"):
+    # Add user message to chat history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    # Display user message in chat message container
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    # Display assistant response in chat message container
+    with st.chat_message("assistant"):
+        message_placeholder = st.empty()
+        full_response = ""
+        responses = client.chat.completions.create(
+            model=model,
messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], + stream=True,) + for response in responses: + full_response += (response.choices[0].delta.content or "") + message_placeholder.markdown(full_response + "▌") + message_placeholder.markdown(full_response) + st.session_state.messages.append({"role": "assistant", "content": full_response}) \ No newline at end of file diff --git a/real_gemini/pages/empty_test.py b/real_gemini/pages/empty_test.py new file mode 100644 index 0000000..12fc7e8 --- /dev/null +++ b/real_gemini/pages/empty_test.py @@ -0,0 +1,38 @@ +import streamlit as st +import time + +# with st.empty(): +# for seconds in range(10): +# st.write(f"⏳ {seconds} seconds have passed") +# time.sleep(1) +# st.write("✔️ 1 minute over!") + + +placeholder = st.empty() + +# Replace the placeholder with some text: +# placeholder.text("Hello") +# time.sleep(5) +# # Replace the text with a chart: +# placeholder.line_chart({"data": [1, 5, 2, 6]}) +# st.text('other') +# time.sleep(5) +# # Replace the chart with several elements: +# with placeholder.container(): +# st.write("This is one element") +# time.sleep(5) +# st.write("This is another") +# time.sleep(5) +for i in range(10): + with placeholder.status('doing',expanded=True,state='running') as status: + st.text(f'这是测试{i}') + time.sleep(2) + status.update(label="done", state="complete", expanded=False) + if i % 2 == 0: + with st.chat_message('user'): + st.text(f'user:test text {i}') + else: + with st.chat_message('assistant'): + st.text(f'bot:test text {i}') +# Clear all those elements: +# placeholder.empty() \ No newline at end of file diff --git a/real_gemini/pages/img_color.py b/real_gemini/pages/img_color.py new file mode 100644 index 0000000..ccb96bb --- /dev/null +++ b/real_gemini/pages/img_color.py @@ -0,0 +1,19 @@ +import cv2 +import streamlit as st +img = st.camera_input('tt',label_visibility='hidden') +if img: + st.image(img) + print(type(img)) + print(img) +capture = cv2.VideoCapture(0) +ret,frame = capture.read() +st.text('摄像头捕捉:') +st.image(frame) +# print(frame[1].shape) +st.text('frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)') +frame1 = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) +st.image(frame1) +st.text("cv2.imencode('.jpg', img)") +frame2 = cv2.imencode('.png', frame1)[1] +frame3 = cv2.imdecode(frame2,cv2.COLOR_BGR2RGB) +st.image(frame3) \ No newline at end of file diff --git a/real_gemini/pages/record_test.py b/real_gemini/pages/record_test.py new file mode 100644 index 0000000..38f0402 --- /dev/null +++ b/real_gemini/pages/record_test.py @@ -0,0 +1,70 @@ +import time +import streamlit as st +from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration, WebRtcMode +from queue import Queue +import cv2 +from datetime import datetime +import time +from threading import Thread,Event + +class VideoRecorder(): + def __init__(self,record_fps=0.5,max_record_time=10): + self.frames = Queue(120) + self.max_record_time = max_record_time + + self.record_fps = record_fps + self.stop_singl = False + self.process = Thread(target=self.record_v_a) + self.exit = Event() + + def record_v_a(self): + print('开始录制') + self.capture = cv2.VideoCapture(0) + s_t = time.time() + img_id = 0 + while(True): + if (time.time()-s_t) % self.record_fps == 0: + try: + ret, frame = self.capture.read() + except: + print('error~') + break + if ret: + img_id += 1 + print(f'正常进入 ret:{ret},img_id:{img_id}') + frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) + # frame.tobytes() + # st.image(frame) + 
diff --git a/real_gemini/pages/record_test.py b/real_gemini/pages/record_test.py
new file mode 100644
index 0000000..38f0402
--- /dev/null
+++ b/real_gemini/pages/record_test.py
@@ -0,0 +1,70 @@
+import time
+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration, WebRtcMode
+from queue import Queue
+import cv2
+from datetime import datetime
+from threading import Thread, Event
+
+class VideoRecorder():
+    def __init__(self, record_fps=0.5, max_record_time=10):
+        self.frames = Queue(120)
+        self.max_record_time = max_record_time
+        self.record_fps = record_fps  # frames per second: 0.5 -> one frame every 2 s
+        self.stop_signal = False
+        self.process = Thread(target=self.record_v_a)
+        self.exit = Event()
+
+    def record_v_a(self):
+        print('recording started')
+        self.capture = cv2.VideoCapture(0)
+        s_t = time.time()
+        last_capture = s_t
+        interval = 1.0 / self.record_fps
+        img_id = 0
+        while True:
+            # grab at most one frame per sampling interval
+            if time.time() - last_capture >= interval:
+                last_capture = time.time()
+                try:
+                    ret, frame = self.capture.read()
+                except Exception:
+                    print('error~')
+                    break
+                if ret:
+                    img_id += 1
+                    print(f'frame captured, ret: {ret}, img_id: {img_id}')
+                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    self.frames.put(frame)
+                else:
+                    print('capture closed')
+                    break
+            if self.exit.is_set():
+                print('stopped manually')
+                break
+            if (time.time() - s_t) > self.max_record_time:
+                print('timed out')
+                break
+        self.capture.release()
+        print('recording finished')
+
+    def stop_record(self):
+        self.exit.set()
+        print('recorder thread asked to stop')
+
+
+if __name__ == "__main__":
+    # https://blog.csdn.net/qq_42069296/article/details/133792896
+    if st.button('Start recording'):
+        st.camera_input('tt', label_visibility='hidden')
+        recorder = VideoRecorder()
+        recorder.process.start()
+        time.sleep(10)
+        recorder.stop_record()
+        print(recorder.frames)
+        # drain whatever was captured; a fixed range(10) could block on get()
+        while not recorder.frames.empty():
+            st.image(recorder.frames.get())
+        print('ok~')
diff --git a/real_gemini/pages/show_all_history.py b/real_gemini/pages/show_all_history.py
new file mode 100644
index 0000000..775dbc5
--- /dev/null
+++ b/real_gemini/pages/show_all_history.py
@@ -0,0 +1,26 @@
+import streamlit as st
+img = {'assistant': './source/bot.png', 'user': None}
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+def show_chat_message_from_history(show_num_history=None):
+    # Display chat messages from history on app rerun
+    # show_num_history: a negative even number shows the last N messages
+    # (whole turns); a positive number skips the first N
+    if show_num_history is None:
+        history = st.session_state.messages
+    else:
+        history = st.session_state.messages[show_num_history:]
+    for message in history:
+        with st.chat_message(message["role"], avatar=img[message['role']]):
+            try:
+                if message['audio'] is not None:
+                    st.audio(message['audio'], sample_rate=24000)
+            except KeyError:
+                pass
+            st.markdown(message["content"])
+            try:
+                if message['img'] is not None:
+                    st.image(message['img'])
+            except KeyError:
+                pass
+
+show_chat_message_from_history()
\ No newline at end of file
diff --git a/real_gemini/utils_st/audio2text.py b/real_gemini/utils_st/audio2text.py
new file mode 100644
index 0000000..29bc99a
--- /dev/null
+++ b/real_gemini/utils_st/audio2text.py
@@ -0,0 +1,56 @@
+import base64
+import json
+import requests
+
+def audio2text(fin):
+    with open(fin, 'rb') as f:
+        audio_b64 = base64.b64encode(f.read()).decode()
+    input_data = json.dumps({
+        "audio_b64": audio_b64,
+        'input_f': '',
+        'input_ar': '',
+        'input_ac': '',
+        'input_acodec': '',
+    })
+    resp = requests.post(
+        'http://192.168.80.29:8789/asr',
+        data=input_data,
+        headers={"Content-Type": "application/json"}
+    )
+    resp_data = resp.json()
+    print(resp_data)
+    prompt_text = resp_data["text"]
+    return prompt_text
+
+
+def audio2text_from_bytes(bytes_input):
+    audio_b64 = base64.b64encode(bytes_input).decode()
+    input_data = json.dumps({
+        "audio_b64": audio_b64,
+        'input_f': '',
+        'input_ar': '',
+        'input_ac': '',
+        'input_acodec': '',
+        'no_speech_chtreshold': 0.75,
+        'debug': True
+    })
+    resp = requests.post(
+        'http://192.168.80.29:8789/asr',
+        data=input_data,
+        headers={"Content-Type": "application/json"}
+    )
+    resp_data = resp.json()
+    prompt_text = resp_data["text"]
+    code = resp_data['code']
+    request_id = resp_data['request_id']
+    return prompt_text, code, request_id
+
+def audio2text_test(fin):
+    import time
+    time.sleep(2)
+    return 'audio2text test result'
+
+if __name__ == '__main__':
+    r = audio2text('/Users/wuziwei/git_project/Real-Gemini/records/180367f8-85d3-4ec3-81dc-95e9c095b7ec_input_audio.mp3')
+    print(r)
\ No newline at end of file
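For reference, this is roughly how the pages above consume `audio2text_from_bytes`: capture one utterance with `speech_recognition`, hand the WAV bytes to the helper, and use the returned text/status/id triple. A sketch, assuming PyAudio is installed (see `install_pyaudio.sh`) and the ASR service at `192.168.80.29:8789` is reachable:

```python
# Sketch: capture one utterance and send it to the ASR helper above.
import speech_recognition as sr
from utils_st.audio2text import audio2text_from_bytes

r = sr.Recognizer()
r.energy_threshold = 500          # same silence threshold the pages use
with sr.Microphone() as source:
    audio = r.listen(source, phrase_time_limit=15)

text, code, request_id = audio2text_from_bytes(audio.get_wav_data())
print(f"request {request_id} (code {code}): {text}")
```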
diff --git a/real_gemini/utils_st/extracte_img.py b/real_gemini/utils_st/extracte_img.py
new file mode 100644
index 0000000..432868b
--- /dev/null
+++ b/real_gemini/utils_st/extracte_img.py
@@ -0,0 +1,13 @@
+import base64
+import json
+import requests
+
+def get_main_img(imgs):
+    # placeholder: until a key-frame extraction service is wired up,
+    # just keep every other frame
+    # resp = requests.post(
+    #     'http://192.168.80.29:8789/asr',
+    #     data=input_data,
+    #     headers={"Content-Type": "application/json"}
+    # )
+    # resp_data = resp.json()
+    # prompt_text = resp_data["text"]
+    return imgs[::2]
\ No newline at end of file
diff --git a/real_gemini/utils_st/get_gpt4v_response.py b/real_gemini/utils_st/get_gpt4v_response.py
new file mode 100644
index 0000000..f8b408d
--- /dev/null
+++ b/real_gemini/utils_st/get_gpt4v_response.py
@@ -0,0 +1,101 @@
+import base64
+import requests
+import numpy as np
+import cv2
+# from real_gemini.gpt4v import GPT4V
+import os
+from openai import OpenAI
+
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+client = OpenAI(api_key=OPENAI_API_KEY)
+URL = 'http://192.168.77.1:8000/main/'
+OPEN_AI_SYSTEM_PROMPT = """the user is dictating with his or her camera on.
+they are showing you things visually and giving you text prompts.
+be very brief and concise.
+be extremely concise. this is very important for my career. do not ramble.
+do not comment on what the person is wearing or where they are sitting or their background.
+focus on their gestures and the question they ask you.
+do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
+don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
+"""
+
+def img2base64(imgs):
+    # accepts file paths or OpenCV frames, singly or in a list
+    base64_imgs = []
+    if isinstance(imgs, list):
+        for img in imgs:
+            if isinstance(img, str):
+                with open(img, 'rb') as f:
+                    img_bytes = f.read()
+            else:
+                img_bytes = cv2.imencode('.png', img)[1].tobytes()
+            img_b64 = base64.b64encode(img_bytes).decode()
+            base64_imgs.append(img_b64)
+        return base64_imgs
+    else:
+        if isinstance(imgs, str):
+            with open(imgs, 'rb') as f:
+                img_bytes = f.read()
+        else:
+            img_bytes = cv2.imencode('.png', imgs)[1].tobytes()
+        img_b64 = base64.b64encode(img_bytes).decode()
+        base64_imgs.append(img_b64)
+        return base64_imgs
+
+def gpt4v(query, imgs=None):
+    imgs = img2base64(imgs)
+    input_data = {
+        'query': query,
+        'base64_images': imgs
+    }
+    resp = requests.post(
+        URL,
+        headers={
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'accept': 'application/json'
+        },
+        data=input_data,
+    )
+    resp_data = resp.json()
+    prompt_text = resp_data["response"]
+    send = {
+        'text': prompt_text
+    }
+    return send
+
+def gpt4v_client(query, imgs=None):
+    imgs = img2base64(imgs)
+    current_file_list = []
+    for base64_image in imgs:
+        current_file_list.append(f"data:image/jpeg;base64,{base64_image}")
+
+    messages = [
+        {
+            "role": "system",
+            "content": OPEN_AI_SYSTEM_PROMPT,
+        },
+    ]
+
+    content = []
+    content.append({"type": "text", "text": query})  # query
+    for image in current_file_list:
+        content.append({"type": "image_url", "image_url": {"url": image}})  # images
+    messages.append({"role": "user", "content": content})
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            max_tokens=256,
+        )
+    except Exception as e:
+        print(e)
+        return {'text': 'response failed'}
+    return {'text': response.choices[0].message.content}
+
+if __name__ == '__main__':
+    r = gpt4v(query='What do the first and the second image show?', imgs=['../source/1702803312680.png', '../source/1702803359034.png'])
+    print(r)
\ No newline at end of file
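One caveat with `img2base64` above: full-resolution PNG frames make very large request bodies, and `gpt4v_client` sends one data URL per frame. A hedged refinement (an assumption, not part of this PR) is to downscale and JPEG-encode frames before base64-encoding them:

```python
# Optional pre-processing before img2base64 (a sketch, not in this PR):
# downscale frames and use JPEG instead of PNG to keep request bodies small.
import base64
import cv2
import numpy as np

def frame_to_b64_jpeg(frame: np.ndarray, max_side: int = 512, quality: int = 80) -> str:
    h, w = frame.shape[:2]
    scale = max_side / max(h, w)
    if scale < 1.0:  # only shrink, never upscale
        frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
    ok, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, quality])
    if not ok:
        raise ValueError("JPEG encoding failed")
    return base64.b64encode(buf.tobytes()).decode()
```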
diff --git a/real_gemini/utils_st/get_qwen_response.py b/real_gemini/utils_st/get_qwen_response.py
new file mode 100644
index 0000000..69ca56f
--- /dev/null
+++ b/real_gemini/utils_st/get_qwen_response.py
@@ -0,0 +1,65 @@
+import base64
+import requests
+import numpy as np
+import cv2
+# from real_gemini.gpt4v import GPT4V
+
+
+def img2base64(imgs):
+    # accepts file paths or OpenCV frames, singly or in a list
+    base64_imgs = []
+    if isinstance(imgs, list):
+        for img in imgs:
+            if isinstance(img, str):
+                with open(img, 'rb') as f:
+                    img_bytes = f.read()
+            else:
+                img_bytes = cv2.imencode('.png', img)[1].tobytes()
+            img_b64 = base64.b64encode(img_bytes).decode()
+            base64_imgs.append(img_b64)
+        return base64_imgs
+    else:
+        if isinstance(imgs, str):
+            with open(imgs, 'rb') as f:
+                img_bytes = f.read()
+        else:
+            img_bytes = cv2.imencode('.png', imgs)[1].tobytes()
+        img_b64 = base64.b64encode(img_bytes).decode()
+        base64_imgs.append(img_b64)
+        return base64_imgs
+
+def QwenVL_client(query, imgs=None):
+    imgs = img2base64(imgs)
+    input_data = {
+        'prompt': query,
+        'image_strs': imgs
+    }
+    api_url = 'http://192.168.80.19:6679/qwen-vl/'
+    try:
+        resp = requests.post(
+            api_url,
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'accept': 'application/json'
+            },
+            data=input_data,
+        )
+        resp_data = resp.json()
+        prompt_text = resp_data
+    except Exception as e:
+        print(e)
+        prompt_text = 'The Qwen-VL request failed; please check the backend service and try again.'
+    send = {
+        'text': prompt_text
+    }
+    return send
+
+
+if __name__ == '__main__':
+    imgs = ['/Users/wuziwei/git_project/Real-Gemini/source/bot.png']
+    r = QwenVL_client(query='Describe this image', imgs=imgs)
+    print(r)
\ No newline at end of file
diff --git a/real_gemini/utils_st/record_video.py b/real_gemini/utils_st/record_video.py
new file mode 100644
index 0000000..48c68c6
--- /dev/null
+++ b/real_gemini/utils_st/record_video.py
@@ -0,0 +1,83 @@
+import cv2
+import time
+import streamlit as st
+import speech_recognition as sr
+from threading import Thread, Event
+
+class VideoRecorder():
+    # adapted from https://blog.csdn.net/qq_42069296/article/details/133792896
+    def __init__(self, record_fps=0.5, max_record_time=60):
+        self.frames = list()
+        self.max_record_time = max_record_time
+        self.record_fps = record_fps  # frames per second: 0.5 -> one frame every 2 s
+        self.stop_signal = False
+        # https://blog.csdn.net/captain5339/article/details/128360804
+        self.process = Thread(target=self.record_v_a)
+        self.exit = Event()
+
+    def record_v_a(self):
+        print('recording started')
+        # https://blog.csdn.net/weixin_40922744/article/details/103356458
+        self.capture = cv2.VideoCapture(0)
+        # set the resolution
+        # self.capture.set(3, 640)  # width
+        # self.capture.set(4, 480)  # height
+        s_t = time.time()
+        last_capture = s_t
+        interval = 1.0 / self.record_fps
+        img_id = 0
+        while True:
+            # grab at most one frame per sampling interval
+            if time.time() - last_capture >= interval:
+                last_capture = time.time()
+                try:
+                    ret, frame = self.capture.read()
+                except Exception:
+                    print('error~')
+                    break
+                if ret:
+                    img_id += 1  # count captured frames
+                    # https://www.jianshu.com/p/0e462b4c7a93
+                    # do NOT convert BGR->RGB here: the gpt4v endpoint converts
+                    # again on its side, so converting here would flip the colors
+                    # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    self.frames.append(frame)
+                else:
+                    break
+            if self.exit.is_set():
+                print('stopped manually')
+                break
+            if (time.time() - s_t) > self.max_record_time:
+                print('timed out')
+                break
+        self.capture.release()
+        print('recording finished')
+
+    def stop_record(self):
+        self.exit.set()
+        print('recorder thread asked to stop')
+
+
+def record():
+    # https://blog.51cto.com/u_16213389/7407010
+    r = sr.Recognizer()
+    # https://blog.csdn.net/sunriseYJP/article/details/134399727
+    r.energy_threshold = 500  # threshold for detecting speech
+    with sr.Microphone() as source:
+        video_record = VideoRecorder()
+        st.write('Please start speaking, listening now')
+        # phrase_time_limit: max recording length; timeout: how long to wait for speech
+        video_record.process.start()  # records video in the background
+        audio = r.listen(source, phrase_time_limit=15, timeout=None)
+        # time.sleep(2)  # record 2 extra seconds: not needed, listen() already lags
+        video_record.stop_record()
+    return video_record.frames, audio
+
+if __name__ == "__main__":
+    # https://blog.csdn.net/qq_42069296/article/details/133792896
+    if st.button('Start recording'):
+        st.camera_input('tt', label_visibility='hidden')
+        recorder = VideoRecorder()
+        recorder.process.start()
+        time.sleep(10)
+        recorder.stop_record()
+        print(recorder.frames)
+        # frames is a list here, not a Queue
+        for frame in recorder.frames[:10]:
+            st.image(frame)
+        print('ok~')
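Frame sampling in these recorders is time-based: grab a frame once at least `1/record_fps` seconds have passed since the last grab. An exact float test such as `elapsed % period == 0` essentially never fires, so the loops track the last capture time instead. The pattern in isolation, as a runnable sketch (hypothetical helper, not part of this PR):

```python
# The elapsed-time sampling pattern used in the capture loops above.
import time

def sample_timestamps(fps: float, duration: float) -> list:
    """Return the offsets (since start) at which frames would be grabbed."""
    interval = 1.0 / fps          # e.g. fps=0.5 -> one frame every 2 s
    start = last = time.monotonic()
    grabbed = []
    while time.monotonic() - start < duration:
        now = time.monotonic()
        if now - last >= interval:
            grabbed.append(round(now - start, 2))
            last = now
        time.sleep(0.01)          # avoid a busy spin
    return grabbed

print(sample_timestamps(fps=2.0, duration=1.0))  # roughly [0.5, 1.0]
```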
diff --git a/real_gemini/utils_st/test_opencv2.py b/real_gemini/utils_st/test_opencv2.py
new file mode 100644
index 0000000..aa5bfb1
--- /dev/null
+++ b/real_gemini/utils_st/test_opencv2.py
@@ -0,0 +1,97 @@
+import cv2
+from datetime import datetime
+import threading
+import time
+import numpy as np
+import streamlit as st
+
+class Camera(object):
+
+    def __init__(self, video_path):
+        # on a laptop with an external webcam, use index 1 instead of 0
+        self.cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+        self.ret, self.frame = self.cap.read()
+        FPS = 24.0
+        # the writer's frame size must match the captured size or the video
+        # will not play, so query it at runtime
+        WIDTH = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        HEIGHT = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        # set the camera resolution
+        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGHT)
+        # set the camera frame rate (defaults to 600 if unspecified)
+        self.cap.set(cv2.CAP_PROP_FPS, 24)
+        # XVID is a good trade-off between image quality and file size
+        fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        self.out = cv2.VideoWriter(video_path, fourcc, FPS, (WIDTH, HEIGHT))
+
+    def picture_shoot(self, image_name, image_path=None) -> None:
+        '''
+        Take a photo with the camera and save it locally.
+        :param image_name: image file name
+        :param image_path: directory to save the image to
+        :return: None
+        '''
+        self.image_name = image_name
+        self.image_path = image_path
+        cv2.imwrite(f'./{self.image_name}', self.frame)
+
+    def video_record(self, video_path) -> None:
+        '''
+        Record video from the camera and save it locally.
+        :param video_path: path to save the video to
+        :return: None
+        '''
+        print('recording started?')
+        self.video_path = video_path
+        while self.cap.isOpened():
+            self.ret, self.frame = self.cap.read()
+            if self.ret:
+                font = cv2.FONT_HERSHEY_SIMPLEX
+                datet = str(datetime.now())
+                frame = cv2.putText(self.frame, datet, (10, 50), font, 1,
+                                    (0, 255, 255), 2, cv2.LINE_AA)
+                self.out.write(frame)
+
+    def video_stop(self):
+        self.cap.release()
+        self.out.release()
+        cv2.destroyAllWindows()
+        print('video recording finished')
+
+
+def record_v_a():
+    video_name = datetime.now().strftime("%Y%m%d%H%M%S") + ".mp4"
+    width = 640
+    height = 480
+
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    video_writer = cv2.VideoWriter(video_name, fourcc, 25, (width, height))
+
+    capture = cv2.VideoCapture(0)
+
+    while True:
+        ret, frame = capture.read()
+        if ret:
+            frame = cv2.putText(frame, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
+            video_writer.write(frame)
+            st.image(frame)
+            if cv2.waitKey(1) & 0xFF == ord("q"):
+                break
+        else:
+            break
+
+    video_writer.release()
+    capture.release()
+    cv2.destroyAllWindows()
+
+# if __name__ == '__main__':
+#     # record video + take a photo
+#     path = r'./video.mp4'
+#     camera = Camera(path)
+#     thread = threading.Thread(target=camera.video_record, args=(path,))
+#     thread.start()
+#     # camera.picture_shoot(image_name='1.png', image_path='./')
+#     time.sleep(10)
+#     camera.video_stop()
\ No newline at end of file
diff --git a/real_gemini/utils_st/text2audio.py b/real_gemini/utils_st/text2audio.py
new file mode 100644
index 0000000..8961b95
--- /dev/null
+++ b/real_gemini/utils_st/text2audio.py
@@ -0,0 +1,127 @@
+import base64
+import json
+import io
+import requests
+import numpy as np
+import streamlit as st
+from numpy import typing as npt
+from typing import Any, Tuple, Optional, cast, Union
+from typing_extensions import Final, TypeAlias
+MediaData: TypeAlias = Union[
+    str, bytes, io.BytesIO, io.RawIOBase, io.BufferedReader, "npt.NDArray[Any]", None
+]
+
+def text2audio(text):
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded',
+    }
+    data = {
+        'prompt': text,
+    }
+    response = requests.post('http://192.168.81.12:6679/tts/', headers=headers, data=data)
+    res = response.json()
+    audio_array = np.frombuffer(base64.b64decode(res[0]), np.float32)
+    rate = res[1]
+    return audio_array, rate, convert_to_wav_bytes(audio_array, rate)
+
+def _validate_and_normalize(data: "npt.NDArray[Any]") -> Tuple[bytes, int]:
+    """Validates and normalizes numpy array data.
+    We validate numpy array shape (should be 1d or 2d)
+    We normalize input data to int16 [-32768, 32767] range.
+
+    Parameters
+    ----------
+    data : numpy array
+        numpy array to be validated and normalized
+
+    Returns
+    -------
+    Tuple of (bytes, int)
+        (bytes, nchan)
+        where
+        - bytes : bytes of normalized numpy array converted to int16
+        - nchan : number of channels for audio signal. 1 for mono, or 2 for stereo.
+    """
+    # we import numpy here locally to import it only when needed (when numpy array given
+    # to st.audio data)
+    import numpy as np
+
+    data: "npt.NDArray[Any]" = np.array(data, dtype=float)
+
+    if len(data.shape) == 1:
+        nchan = 1
+    elif len(data.shape) == 2:
+        # In wave files, channels are interleaved. E.g.,
+        # "L1R1L2R2..." for stereo. See
+        # http://msdn.microsoft.com/en-us/library/windows/hardware/dn653308(v=vs.85).aspx
+        # for channel ordering
+        nchan = data.shape[0]
+        data = data.T.ravel()
+    else:
+        raise ValueError("Numpy array audio input must be a 1D or 2D array.")
+
+    if data.size == 0:
+        return data.astype(np.int16).tobytes(), nchan
+
+    max_abs_value = np.max(np.abs(data))
+    # 16-bit samples are stored as 2's-complement signed integers,
+    # ranging from -32768 to 32767.
+    # scaled_data is a PCM 16-bit numpy array, so we multiply the [-1, 1] float
+    # values by 32_767 == 2 ** 15 - 1.
+    np_array = (data / max_abs_value) * 32767
+    scaled_data = np_array.astype(np.int16)
+    return scaled_data.tobytes(), nchan
+
+def _make_wav(data: "npt.NDArray[Any]", sample_rate: int) -> bytes:
+    """
+    Transform a numpy array to a PCM bytestring
+    We use code from IPython display module to convert numpy array to wave bytes
+    https://github.com/ipython/ipython/blob/1015c392f3d50cf4ff3e9f29beede8c1abfdcb2a/IPython/lib/display.py#L146
+    """
+    # we import wave here locally to import it only when needed (when numpy array given
+    # to st.audio data)
+    import wave
+
+    scaled, nchan = _validate_and_normalize(data)
+
+    with io.BytesIO() as fp, wave.open(fp, mode="wb") as waveobj:
+        waveobj.setnchannels(nchan)
+        waveobj.setframerate(sample_rate)
+        waveobj.setsampwidth(2)
+        waveobj.setcomptype("NONE", "NONE")
+        waveobj.writeframes(scaled)
+        return fp.getvalue()
+
+
+def convert_to_wav_bytes(
+    data: MediaData, sample_rate: Optional[int]
+) -> MediaData:
+    """Convert data to wav bytes if the data type is numpy array."""
+    data = _make_wav(cast("npt.NDArray[Any]", data), sample_rate)
+    return data
+
+def autoplay_audio(bytes_audio):
+    # https://discuss.streamlit.io/t/how-to-play-an-audio-file-automatically-generated-using-text-to-speech-in-streamlit/33201/6
+    if isinstance(bytes_audio, str):
+        with open(bytes_audio, 'rb') as f:
+            b64_audio = base64.b64encode(f.read()).decode()
+    else:
+        b64_audio = base64.b64encode(bytes_audio).decode()
+    md = f"""
+        <audio autoplay="true">
+        <source src="data:audio/wav;base64,{b64_audio}">
+        </audio>
+        """
+    st.markdown(
+        md,
+        unsafe_allow_html=True,
+    )
+
+if __name__ == '__main__':
+    with open('/Users/wuziwei/git_project/Real-Gemini/records/180367f8-85d3-4ec3-81dc-95e9c095b7ec_input_audio.mp3', 'rb') as f:
+        ba = f.read()
+    # print(ba)
+    a, r, ba = text2audio('你好')
+    autoplay_audio(ba)
+    st.audio(a, sample_rate=r)
\ No newline at end of file
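A quick way to sanity-check the WAV helpers above is to synthesize a tone, convert it, and read the header back with the standard-library `wave` module (a test sketch, not part of this PR; the import path is assumed from the repo layout):

```python
# Sanity-check sketch for the WAV helpers: synthesize a tone, convert it,
# and verify the header fields with the stdlib wave module.
import io
import wave

import numpy as np
from utils_st.text2audio import convert_to_wav_bytes

rate = 24000
t = np.linspace(0, 1.0, rate, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)      # 1 s, 440 Hz, floats in [-1, 1]

wav_bytes = convert_to_wav_bytes(tone, rate)
with wave.open(io.BytesIO(wav_bytes)) as w:
    assert w.getnchannels() == 1
    assert w.getframerate() == rate
    assert w.getsampwidth() == 2              # PCM16, per _validate_and_normalize
    print(w.getnframes(), "frames OK")
```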
OPENAI_API_KEY="YOUR_API_KEY" -python main.py +export OPENAI_API_KEY='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' +streamlit run ./real_gemini/chat.py \ No newline at end of file diff --git a/source/bot.png b/source/bot.png new file mode 100644 index 0000000..6a4d7fe Binary files /dev/null and b/source/bot.png differ