diff --git a/.gitignore b/.gitignore
index 2b3817d..7158936 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+records/
+*.mp3
+*.mp4
\ No newline at end of file
diff --git a/README.md b/README.md
index 47ec8a1..622c54a 100644
--- a/README.md
+++ b/README.md
@@ -4,17 +4,25 @@ Real-time video understanding and interaction through text, audio, image and video
 A real-time framework for video understanding and interaction built on multimodal LLMs: ask questions about and talk to the world through text, speech, images and video.
 
-## Start the backend service
+## Start the frontend chat service
+It implements two things:
+
+- 1. the Streamlit chat UI
+- 2. the GPT-4V request interface
+
 Open `run.sh`, fill in your own API key, then start it:
 ```shell
 sh run.sh
 ```
-
-## Start the frontend UI
-:construction: (under construction)
-```
-python demo.py
+## TTS and ASR services
+- ASR
+The service comes from the S group (TODO: should a service endpoint be published here?)
+- TTS
+See [tts.py](./real_gemini/tts.py); launch script:
+```shell
+python tts.py
 ```
+Running these services needs some extra dependencies and models: `torch`, `torchaudio` and `TTS`, all installable with `pip`; the model file paths are listed in the Python script.
 
 ## Acknowledgement
 - [Fastapi](https://github.com/tiangolo/fastapi)
diff --git a/install_pyaudio.sh b/install_pyaudio.sh
new file mode 100644
index 0000000..928b2d0
--- /dev/null
+++ b/install_pyaudio.sh
@@ -0,0 +1,5 @@
+wget http://www.portaudio.com/archives/pa_stable_v190600_20161030.tgz
+tar -zxvf pa_stable_v190600_20161030.tgz
+cd portaudio/
+./configure && make && sudo make install
+pip install pyaudio
diff --git a/real_gemini/chat.py b/real_gemini/chat.py
new file mode 100644
index 0000000..fa00c98
--- /dev/null
+++ b/real_gemini/chat.py
@@ -0,0 +1,176 @@
+import uuid
+import streamlit as st
+from pathlib import Path
+from utils_st.audio2text import audio2text_from_bytes
+from utils_st.extracte_img import get_main_img
+from utils_st.get_gpt4v_response import gpt4v_client
+from utils_st.get_qwen_response import QwenVL_client
+from utils_st.text2audio import text2audio, autoplay_audio
+from utils_st.record_video import record
+from queue import Queue
+import time
+import cv2
+from threading import Thread, Event
+
+img = {'assistant': './source/bot.png', 'user': None}
+res_ = {'Qwen-vl': QwenVL_client, 'gpt4v': gpt4v_client}
+
+# event locks coordinating the recorder thread and the chat loop
+event_record = Event()
+event_chat = Event()
+event_record.set()  # recording is enabled at startup
+
+with st.sidebar:
+    with st.form('Settings'):
+        max_chat_turn = st.slider('Max chat turns:', min_value=1, max_value=10000, value=10)
+        response_name = st.selectbox('Model', ['Qwen-vl', 'gpt4v'], index=1)
+        st.form_submit_button('Apply')
+responser = res_[response_name]
+max_record_round = 2 * max_chat_turn
+q = Queue(max_record_round)
+
+st.title("Gemini-like chat test")
+######################### storage for recorded inputs #####################
+# RECORD_DIR = Path("./records")
+# RECORD_DIR.mkdir(exist_ok=True)
+# if "prefix" not in st.session_state:
+#     st.session_state["prefix"] = str(uuid.uuid4())
+# prefix = st.session_state["prefix"]
+# in_file_video = RECORD_DIR / f"{prefix}_input_video.mp4"
+# in_file_audio = RECORD_DIR / f"{prefix}_input_audio.mp3"
+######################### storage for recorded inputs #####################
+# avatar for the chat bot
+
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+def my_recorder():
+    for i in range(max_record_round):
+        # wait until recording is allowed (enabled by default at startup)
+        print('holding to record')
+        event_record.wait()
+        print(f'record {i}')
+        imgs, audio = record()
+        input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+        # drop inputs that are too short to be meaningful
+        if input_text and len(input_text) > 5:
+            q.put((imgs, audio, input_text))
+        else:
+            print(f'unexpected input: id--{request_id}, status--{code_status}, text--{input_text}')
+            time.sleep(2)  # give the speaker 2 seconds to get ready again
+            continue
+        print(f'turn {i} recorded, queue size {q.qsize()}')
+        # recording done: unblock the chat loop and block the next recording round
+        event_record.clear()
+        event_chat.set()
+        print('released the chat lock, took the recording lock')
+    print('input-processing service finished')
+
+def show_chat_message_from_history(show_num_history=None):
+    # Display chat messages from history on app rerun
+    # show_num_history: a negative even number shows the last N messages
+    # (whole turns); a positive number skips the first N
+    if show_num_history is None:
+        history = st.session_state.messages
+    else:
+        history = st.session_state.messages[show_num_history:]
+    for message in history:
+        with st.chat_message(message["role"], avatar=img[message['role']]):
+            try:
+                if message['audio'] is not None:
+                    st.audio(message['audio'], sample_rate=24000)
+            except KeyError:
+                pass
+            st.markdown(message["content"])
+            try:
+                if message['img'] is not None:
+                    st.image(message['img'])
+            except KeyError:
+                pass
+
+def response(prompt=None, imgs=None, autoplay=True, audio_response=True):
+    """
+    prompt: the input text
+    imgs: the input images
+    autoplay: whether to auto-play the synthesized speech
+    audio_response: whether to turn the text reply into speech
+    """
+    if prompt:
+        sound = None
+        # Display user message in chat message container
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # Display assistant response in chat message container
+        with st.chat_message("assistant", avatar='./source/bot.png'):
+            res = responser(query=prompt, imgs=imgs)
+            print('res[text]:', res['text'])
+            if audio_response:
+                sound, rate, byte_sound_array = text2audio(res["text"])
+            else:
+                autoplay = False
+            if autoplay:
+                autoplay_audio(byte_sound_array)
+            if not autoplay and audio_response:
+                # render a player instead of auto-playing
+                st.audio(sound, sample_rate=rate)
+            st.markdown(res['text'])
+            try:
+                st.image(res['imgs'])
+            except KeyError:
+                pass
+            # when auto-playing, wait until the audio has finished
+            if autoplay:
+                time.sleep(int(len(sound) / rate) + 1)
+        st.session_state.messages.append({"role": "assistant", "content": res['text'], 'audio': sound})
+
+
+if __name__ == '__main__':
+    max_round = max_chat_turn + 50  # safety margin instead of an unconditional while loop
+    record_thread = Thread(target=my_recorder)
+    # show the camera feed
+    video_show = st.container()
+    video_show.camera_input('tt', label_visibility='hidden')
+    # start capturing input
+    if video_show.button('Start chatting'):
+        st.info('Listening on the microphone...')
+        record_thread.start()
+    else:
+        st.stop()
+    # shows input processing
+    placeholder = st.empty()
+    # shows the conversation
+    chat_placeholder = st.empty()
+    while max_round > 0:
+        # blocked until the first recording round finishes and opens the chat lock
+        print('waiting for the chat to start')
+        event_chat.wait()
+        print('chat started')
+        if not q.empty():
+            # pause recording while responding, so the played audio is not recorded
+            print('entering the response step, recording paused')
+            imgs, audio, input_text = q.get()
+            with placeholder.status('Processing input...', state='running', expanded=True) as status:
+                if len(imgs) > 0:
+                    st.write('getMainFrame...')
+                    imgs = get_main_img(imgs)
+                    imgs = imgs[-3:]
+                    cls = st.columns(min(3, len(imgs)))
+                    for idx, cl in enumerate(cls):
+                        cl.image(cv2.cvtColor(imgs[idx], cv2.COLOR_BGR2RGB))
+                st.audio(audio.get_wav_data())
+                st.text(f'Recognized text: {input_text}')
+                status.update(label="Input processed", state="complete", expanded=False)
+            with chat_placeholder.container(height=600):  # container height needs streamlit >= 1.30
+                # https://github.com/streamlit/streamlit/issues/2169
+                show_chat_message_from_history()
+                response(prompt=input_text, imgs=imgs, autoplay=True, audio_response=True)
+            print('response done: released the recording lock, closed the chat lock')
+            # response finished, let the recorder run again
+            event_record.set()
+        # no pending input: block until the recorder signals again
+        event_chat.clear()
+        # chat_placeholder.empty()
+        max_round -= 1  # count down toward the turn limit
+    print('Reached the maximum number of chat turns, exiting!')
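The hand-off above is the crux of `chat.py`: two `threading.Event`s gate a recorder (producer) and a chat loop (consumer) around a bounded `Queue`, so the app never records its own TTS playback. A minimal, self-contained sketch of the same pattern (the `fake_record`/`chat_loop` names are hypothetical stand-ins, not part of this PR):

```python
# Sketch of the two-Event record/chat hand-off used in chat.py above.
import time
from queue import Queue
from threading import Event, Thread

event_record = Event()   # set -> recorder may capture
event_chat = Event()     # set -> consumer may respond
event_record.set()       # recording is allowed first
q: Queue = Queue(maxsize=4)

def fake_record(i: int) -> str:
    time.sleep(0.1)                 # pretend to record audio/video
    return f"utterance {i}"

def recorder(turns: int) -> None:
    for i in range(turns):
        event_record.wait()         # block while a response is playing
        q.put(fake_record(i))
        event_record.clear()        # pause recording...
        event_chat.set()            # ...and wake the consumer

def chat_loop(turns: int) -> None:
    for _ in range(turns):
        event_chat.wait()           # block until an utterance arrives
        print("responding to:", q.get())
        event_chat.clear()          # go back to waiting...
        event_record.set()          # ...and resume recording

t = Thread(target=recorder, args=(3,))
t.start()
chat_loop(3)
t.join()
```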
diff --git a/real_gemini/pages/audio_test.py b/real_gemini/pages/audio_test.py
new file mode 100644
index 0000000..fb6f13c
--- /dev/null
+++ b/real_gemini/pages/audio_test.py
@@ -0,0 +1,87 @@
+import streamlit as st
+import speech_recognition as sr
+import base64
+from utils_st.audio2text import audio2text_from_bytes
+from moviepy.editor import AudioFileClip
+from utils_st.record_video import VideoRecorder
+import time
+from utils_st.record_video import record
+from queue import Queue
+
+max_turn = 20
+q = Queue(max_turn)
+
+def audio_record():
+    r = sr.Recognizer()
+    r.energy_threshold = 500  # threshold for detecting speech
+    with sr.Microphone() as source:
+        st.write('Please start speaking, listening now')
+        # phrase_time_limit: max recording length; timeout: how long to wait for speech
+        for i in range(max_turn):
+            print(f'turn {i} start')
+            audio = r.listen(source, phrase_time_limit=15, timeout=None)
+            q.put(audio)
+            print(f'audio recorded, queue size {q.qsize()}')
+    print('done')
+
+def my_recorder():
+    for i in range(max_turn):
+        imgs, audio = record()
+        q.put((imgs, audio))
+        print(f'recording finished, queue size {q.qsize()}')
+    print('input-processing service finished')
+
+def res():
+    print('entering response')
+    i = 20
+    while i > 0:
+        if q.empty():
+            print('queue is empty, waiting')
+            time.sleep(5)
+            i -= 1
+        else:
+            print('request ok~')
+            audio = q.get()
+            st.audio(audio.get_wav_data())
+            input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+            print(f'recognized text: {input_text}')
+            st.text(f'Recognized text: {input_text}')
+            i -= 1
+    print('response over~')
+
+def show_chat_message_from_history():
+    pass
+
+if __name__ == '__main__':
+    from threading import Thread
+    t1 = Thread(target=my_recorder)
+    t2 = Thread(target=res)
+    st.camera_input('tt', label_visibility='hidden')
+    if st.button('Start chatting'):
+        t1.start()
+    # t2.start()
+    # t1.join()
+    # t2.join()
+    i = 20
+    placeholder = st.empty()
+    while i > 0:
+        if q.empty():
+            print('queue is empty, waiting')
+            time.sleep(5)
+            i -= 1
+        else:
+            print('request ok~')
+            imgs, audio = q.get()
+            st.audio(audio.get_wav_data())
+            input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
+            print(f'recognized text: {input_text}')
+            st.text(f'Recognized text: {input_text}')
+            st.text('Captured frames:')
+            for idx, cl in enumerate(st.columns(min(3, len(imgs)))):
+                cl.image(imgs[idx])
+            st.divider()
+            i -= 1
+    print('response over~')
diff --git a/real_gemini/pages/chat_with_gpt.py b/real_gemini/pages/chat_with_gpt.py
new file mode 100644
index 0000000..358294e
--- /dev/null
+++ b/real_gemini/pages/chat_with_gpt.py
@@ -0,0 +1,44 @@
+import streamlit as st
+from openai import OpenAI
+import os
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+st.title("ChatGPT-like")
+
+# Read the OpenAI API key from the environment
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# Pick a model
+with st.sidebar:
+    with st.form('Settings'):
+        # 'gpt-4-1106-preview' is the GPT-4 Turbo model id
+        model = st.selectbox('Model version', ['gpt-4', 'gpt-4-1106-preview', 'gpt-3.5-turbo'], index=0)
+        st.form_submit_button('Apply')
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Accept user input
+if prompt := st.chat_input("Ask your question"):
+    # Add user message to chat history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    # Display user message in chat message container
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    # Display assistant response in chat message container
+    with st.chat_message("assistant"):
+        message_placeholder = st.empty()
+        full_response = ""
+        responses = client.chat.completions.create(
+            model=model,
messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], + stream=True,) + for response in responses: + full_response += (response.choices[0].delta.content or "") + message_placeholder.markdown(full_response + "▌") + message_placeholder.markdown(full_response) + st.session_state.messages.append({"role": "assistant", "content": full_response}) \ No newline at end of file diff --git a/real_gemini/pages/empty_test.py b/real_gemini/pages/empty_test.py new file mode 100644 index 0000000..12fc7e8 --- /dev/null +++ b/real_gemini/pages/empty_test.py @@ -0,0 +1,38 @@ +import streamlit as st +import time + +# with st.empty(): +# for seconds in range(10): +# st.write(f"⏳ {seconds} seconds have passed") +# time.sleep(1) +# st.write("✔️ 1 minute over!") + + +placeholder = st.empty() + +# Replace the placeholder with some text: +# placeholder.text("Hello") +# time.sleep(5) +# # Replace the text with a chart: +# placeholder.line_chart({"data": [1, 5, 2, 6]}) +# st.text('other') +# time.sleep(5) +# # Replace the chart with several elements: +# with placeholder.container(): +# st.write("This is one element") +# time.sleep(5) +# st.write("This is another") +# time.sleep(5) +for i in range(10): + with placeholder.status('doing',expanded=True,state='running') as status: + st.text(f'这是测试{i}') + time.sleep(2) + status.update(label="done", state="complete", expanded=False) + if i % 2 == 0: + with st.chat_message('user'): + st.text(f'user:test text {i}') + else: + with st.chat_message('assistant'): + st.text(f'bot:test text {i}') +# Clear all those elements: +# placeholder.empty() \ No newline at end of file diff --git a/real_gemini/pages/img_color.py b/real_gemini/pages/img_color.py new file mode 100644 index 0000000..ccb96bb --- /dev/null +++ b/real_gemini/pages/img_color.py @@ -0,0 +1,19 @@ +import cv2 +import streamlit as st +img = st.camera_input('tt',label_visibility='hidden') +if img: + st.image(img) + print(type(img)) + print(img) +capture = cv2.VideoCapture(0) +ret,frame = capture.read() +st.text('摄像头捕捉:') +st.image(frame) +# print(frame[1].shape) +st.text('frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)') +frame1 = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) +st.image(frame1) +st.text("cv2.imencode('.jpg', img)") +frame2 = cv2.imencode('.png', frame1)[1] +frame3 = cv2.imdecode(frame2,cv2.COLOR_BGR2RGB) +st.image(frame3) \ No newline at end of file diff --git a/real_gemini/pages/record_test.py b/real_gemini/pages/record_test.py new file mode 100644 index 0000000..38f0402 --- /dev/null +++ b/real_gemini/pages/record_test.py @@ -0,0 +1,70 @@ +import time +import streamlit as st +from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration, WebRtcMode +from queue import Queue +import cv2 +from datetime import datetime +import time +from threading import Thread,Event + +class VideoRecorder(): + def __init__(self,record_fps=0.5,max_record_time=10): + self.frames = Queue(120) + self.max_record_time = max_record_time + + self.record_fps = record_fps + self.stop_singl = False + self.process = Thread(target=self.record_v_a) + self.exit = Event() + + def record_v_a(self): + print('开始录制') + self.capture = cv2.VideoCapture(0) + s_t = time.time() + img_id = 0 + while(True): + if (time.time()-s_t) % self.record_fps == 0: + try: + ret, frame = self.capture.read() + except: + print('error~') + break + if ret: + img_id += 1 + print(f'正常进入 ret:{ret},img_id:{img_id}') + frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) + # frame.tobytes() + # st.image(frame) + 
diff --git a/real_gemini/pages/record_test.py b/real_gemini/pages/record_test.py
new file mode 100644
index 0000000..38f0402
--- /dev/null
+++ b/real_gemini/pages/record_test.py
@@ -0,0 +1,70 @@
+import time
+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration, WebRtcMode
+from queue import Queue
+import cv2
+from datetime import datetime
+from threading import Thread, Event
+
+class VideoRecorder():
+    def __init__(self, record_fps=0.5, max_record_time=10):
+        self.frames = Queue(120)
+        self.max_record_time = max_record_time
+        self.record_fps = record_fps  # frames per second: 0.5 -> one frame every 2 s
+        self.stop_signal = False
+        self.process = Thread(target=self.record_v_a)
+        self.exit = Event()
+
+    def record_v_a(self):
+        print('recording started')
+        self.capture = cv2.VideoCapture(0)
+        s_t = time.time()
+        last_capture = s_t
+        interval = 1.0 / self.record_fps
+        img_id = 0
+        while True:
+            # grab at most one frame per sampling interval
+            if time.time() - last_capture >= interval:
+                last_capture = time.time()
+                try:
+                    ret, frame = self.capture.read()
+                except Exception:
+                    print('error~')
+                    break
+                if ret:
+                    img_id += 1
+                    print(f'frame captured, ret: {ret}, img_id: {img_id}')
+                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    self.frames.put(frame)
+                else:
+                    print('capture closed')
+                    break
+            if self.exit.is_set():
+                print('stopped manually')
+                break
+            if (time.time() - s_t) > self.max_record_time:
+                print('timed out')
+                break
+        self.capture.release()
+        print('recording finished')
+
+    def stop_record(self):
+        self.exit.set()
+        print('recorder thread asked to stop')
+
+
+if __name__ == "__main__":
+    # https://blog.csdn.net/qq_42069296/article/details/133792896
+    if st.button('Start recording'):
+        st.camera_input('tt', label_visibility='hidden')
+        recorder = VideoRecorder()
+        recorder.process.start()
+        time.sleep(10)
+        recorder.stop_record()
+        print(recorder.frames)
+        # drain whatever was captured; a fixed range(10) could block on get()
+        while not recorder.frames.empty():
+            st.image(recorder.frames.get())
+        print('ok~')
diff --git a/real_gemini/pages/show_all_history.py b/real_gemini/pages/show_all_history.py
new file mode 100644
index 0000000..775dbc5
--- /dev/null
+++ b/real_gemini/pages/show_all_history.py
@@ -0,0 +1,26 @@
+import streamlit as st
+img = {'assistant': './source/bot.png', 'user': None}
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+def show_chat_message_from_history(show_num_history=None):
+    # Display chat messages from history on app rerun
+    # show_num_history: a negative even number shows the last N messages
+    # (whole turns); a positive number skips the first N
+    if show_num_history is None:
+        history = st.session_state.messages
+    else:
+        history = st.session_state.messages[show_num_history:]
+    for message in history:
+        with st.chat_message(message["role"], avatar=img[message['role']]):
+            try:
+                if message['audio'] is not None:
+                    st.audio(message['audio'], sample_rate=24000)
+            except KeyError:
+                pass
+            st.markdown(message["content"])
+            try:
+                if message['img'] is not None:
+                    st.image(message['img'])
+            except KeyError:
+                pass
+
+show_chat_message_from_history()
\ No newline at end of file
diff --git a/real_gemini/utils_st/audio2text.py b/real_gemini/utils_st/audio2text.py
new file mode 100644
index 0000000..29bc99a
--- /dev/null
+++ b/real_gemini/utils_st/audio2text.py
@@ -0,0 +1,56 @@
+import base64
+import json
+import requests
+
+def audio2text(fin):
+    with open(fin, 'rb') as f:
+        audio_b64 = base64.b64encode(f.read()).decode()
+    input_data = json.dumps({
+        "audio_b64": audio_b64,
+        'input_f': '',
+        'input_ar': '',
+        'input_ac': '',
+        'input_acodec': '',
+    })
+    resp = requests.post(
+        'http://192.168.80.29:8789/asr',
+        data=input_data,
+        headers={"Content-Type": "application/json"}
+    )
+    resp_data = resp.json()
+    print(resp_data)
+    prompt_text = resp_data["text"]
+    return prompt_text
+
+
+def audio2text_from_bytes(bytes_input):
+    audio_b64 = base64.b64encode(bytes_input).decode()
+    input_data = json.dumps({
+        "audio_b64": audio_b64,
+        'input_f': '',
+        'input_ar': '',
+        'input_ac': '',
+        'input_acodec': '',
+        'no_speech_chtreshold': 0.75,
+        'debug': True
+    })
+    resp = requests.post(
+        'http://192.168.80.29:8789/asr',
+        data=input_data,
+        headers={"Content-Type": "application/json"}
+    )
+    resp_data = resp.json()
+    prompt_text = resp_data["text"]
+    code = resp_data['code']
+    request_id = resp_data['request_id']
+    return prompt_text, code, request_id
+
+def audio2text_test(fin):
+    import time
+    time.sleep(2)
+    return 'audio2text test result'
+
+if __name__ == '__main__':
+    r = audio2text('/Users/wuziwei/git_project/Real-Gemini/records/180367f8-85d3-4ec3-81dc-95e9c095b7ec_input_audio.mp3')
+    print(r)
\ No newline at end of file
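For reference, this is roughly how the pages above consume `audio2text_from_bytes`: capture one utterance with `speech_recognition`, hand the WAV bytes to the helper, and use the returned text/status/id triple. A sketch, assuming PyAudio is installed (see `install_pyaudio.sh`) and the ASR service at `192.168.80.29:8789` is reachable:

```python
# Sketch: capture one utterance and send it to the ASR helper above.
import speech_recognition as sr
from utils_st.audio2text import audio2text_from_bytes

r = sr.Recognizer()
r.energy_threshold = 500          # same silence threshold the pages use
with sr.Microphone() as source:
    audio = r.listen(source, phrase_time_limit=15)

text, code, request_id = audio2text_from_bytes(audio.get_wav_data())
print(f"request {request_id} (code {code}): {text}")
```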
diff --git a/real_gemini/utils_st/extracte_img.py b/real_gemini/utils_st/extracte_img.py
new file mode 100644
index 0000000..432868b
--- /dev/null
+++ b/real_gemini/utils_st/extracte_img.py
@@ -0,0 +1,13 @@
+import base64
+import json
+import requests
+
+def get_main_img(imgs):
+    # placeholder: until a key-frame extraction service is wired up,
+    # just keep every other frame
+    # resp = requests.post(
+    #     'http://192.168.80.29:8789/asr',
+    #     data=input_data,
+    #     headers={"Content-Type": "application/json"}
+    # )
+    # resp_data = resp.json()
+    # prompt_text = resp_data["text"]
+    return imgs[::2]
\ No newline at end of file
diff --git a/real_gemini/utils_st/get_gpt4v_response.py b/real_gemini/utils_st/get_gpt4v_response.py
new file mode 100644
index 0000000..f8b408d
--- /dev/null
+++ b/real_gemini/utils_st/get_gpt4v_response.py
@@ -0,0 +1,101 @@
+import base64
+import requests
+import numpy as np
+import cv2
+# from real_gemini.gpt4v import GPT4V
+import os
+from openai import OpenAI
+
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+client = OpenAI(api_key=OPENAI_API_KEY)
+URL = 'http://192.168.77.1:8000/main/'
+OPEN_AI_SYSTEM_PROMPT = """the user is dictating with his or her camera on.
+they are showing you things visually and giving you text prompts.
+be very brief and concise.
+be extremely concise. this is very important for my career. do not ramble.
+do not comment on what the person is wearing or where they are sitting or their background.
+focus on their gestures and the question they ask you.
+do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
+don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
+"""
+
+def img2base64(imgs):
+    # accepts file paths or OpenCV frames, singly or in a list
+    base64_imgs = []
+    if isinstance(imgs, list):
+        for img in imgs:
+            if isinstance(img, str):
+                with open(img, 'rb') as f:
+                    img_bytes = f.read()
+            else:
+                img_bytes = cv2.imencode('.png', img)[1].tobytes()
+            img_b64 = base64.b64encode(img_bytes).decode()
+            base64_imgs.append(img_b64)
+        return base64_imgs
+    else:
+        if isinstance(imgs, str):
+            with open(imgs, 'rb') as f:
+                img_bytes = f.read()
+        else:
+            img_bytes = cv2.imencode('.png', imgs)[1].tobytes()
+        img_b64 = base64.b64encode(img_bytes).decode()
+        base64_imgs.append(img_b64)
+        return base64_imgs
+
+def gpt4v(query, imgs=None):
+    imgs = img2base64(imgs)
+    input_data = {
+        'query': query,
+        'base64_images': imgs
+    }
+    resp = requests.post(
+        URL,
+        headers={
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'accept': 'application/json'
+        },
+        data=input_data,
+    )
+    resp_data = resp.json()
+    prompt_text = resp_data["response"]
+    send = {
+        'text': prompt_text
+    }
+    return send
+
+def gpt4v_client(query, imgs=None):
+    imgs = img2base64(imgs)
+    current_file_list = []
+    for base64_image in imgs:
+        current_file_list.append(f"data:image/jpeg;base64,{base64_image}")
+
+    messages = [
+        {
+            "role": "system",
+            "content": OPEN_AI_SYSTEM_PROMPT,
+        },
+    ]
+
+    content = []
+    content.append({"type": "text", "text": query})  # query
+    for image in current_file_list:
+        content.append({"type": "image_url", "image_url": {"url": image}})  # images
+    messages.append({"role": "user", "content": content})
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=messages,
+            max_tokens=256,
+        )
+    except Exception as e:
+        print(e)
+        return {'text': 'response failed'}
+    return {'text': response.choices[0].message.content}
+
+if __name__ == '__main__':
+    r = gpt4v(query='What do the first and the second image show?', imgs=['../source/1702803312680.png', '../source/1702803359034.png'])
+    print(r)
\ No newline at end of file
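One caveat with `img2base64` above: full-resolution PNG frames make very large request bodies, and `gpt4v_client` sends one data URL per frame. A hedged refinement (an assumption, not part of this PR) is to downscale and JPEG-encode frames before base64-encoding them:

```python
# Optional pre-processing before img2base64 (a sketch, not in this PR):
# downscale frames and use JPEG instead of PNG to keep request bodies small.
import base64
import cv2
import numpy as np

def frame_to_b64_jpeg(frame: np.ndarray, max_side: int = 512, quality: int = 80) -> str:
    h, w = frame.shape[:2]
    scale = max_side / max(h, w)
    if scale < 1.0:  # only shrink, never upscale
        frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
    ok, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, quality])
    if not ok:
        raise ValueError("JPEG encoding failed")
    return base64.b64encode(buf.tobytes()).decode()
```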
diff --git a/real_gemini/utils_st/get_qwen_response.py b/real_gemini/utils_st/get_qwen_response.py
new file mode 100644
index 0000000..69ca56f
--- /dev/null
+++ b/real_gemini/utils_st/get_qwen_response.py
@@ -0,0 +1,65 @@
+import base64
+import requests
+import numpy as np
+import cv2
+# from real_gemini.gpt4v import GPT4V
+
+
+def img2base64(imgs):
+    # accepts file paths or OpenCV frames, singly or in a list
+    base64_imgs = []
+    if isinstance(imgs, list):
+        for img in imgs:
+            if isinstance(img, str):
+                with open(img, 'rb') as f:
+                    img_bytes = f.read()
+            else:
+                img_bytes = cv2.imencode('.png', img)[1].tobytes()
+            img_b64 = base64.b64encode(img_bytes).decode()
+            base64_imgs.append(img_b64)
+        return base64_imgs
+    else:
+        if isinstance(imgs, str):
+            with open(imgs, 'rb') as f:
+                img_bytes = f.read()
+        else:
+            img_bytes = cv2.imencode('.png', imgs)[1].tobytes()
+        img_b64 = base64.b64encode(img_bytes).decode()
+        base64_imgs.append(img_b64)
+        return base64_imgs
+
+def QwenVL_client(query, imgs=None):
+    imgs = img2base64(imgs)
+    input_data = {
+        'prompt': query,
+        'image_strs': imgs
+    }
+    api_url = 'http://192.168.80.19:6679/qwen-vl/'
+    try:
+        resp = requests.post(
+            api_url,
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'accept': 'application/json'
+            },
+            data=input_data,
+        )
+        resp_data = resp.json()
+        prompt_text = resp_data
+    except Exception as e:
+        print(e)
+        prompt_text = 'The Qwen-VL request failed; please check the backend service and try again.'
+    send = {
+        'text': prompt_text
+    }
+    return send
+
+
+if __name__ == '__main__':
+    imgs = ['/Users/wuziwei/git_project/Real-Gemini/source/bot.png']
+    r = QwenVL_client(query='Describe this image', imgs=imgs)
+    print(r)
\ No newline at end of file
diff --git a/real_gemini/utils_st/record_video.py b/real_gemini/utils_st/record_video.py
new file mode 100644
index 0000000..48c68c6
--- /dev/null
+++ b/real_gemini/utils_st/record_video.py
@@ -0,0 +1,83 @@
+import cv2
+import time
+import streamlit as st
+import speech_recognition as sr
+from threading import Thread, Event
+
+class VideoRecorder():
+    # adapted from https://blog.csdn.net/qq_42069296/article/details/133792896
+    def __init__(self, record_fps=0.5, max_record_time=60):
+        self.frames = list()
+        self.max_record_time = max_record_time
+        self.record_fps = record_fps  # frames per second: 0.5 -> one frame every 2 s
+        self.stop_signal = False
+        # https://blog.csdn.net/captain5339/article/details/128360804
+        self.process = Thread(target=self.record_v_a)
+        self.exit = Event()
+
+    def record_v_a(self):
+        print('recording started')
+        # https://blog.csdn.net/weixin_40922744/article/details/103356458
+        self.capture = cv2.VideoCapture(0)
+        # set the resolution
+        # self.capture.set(3, 640)  # width
+        # self.capture.set(4, 480)  # height
+        s_t = time.time()
+        last_capture = s_t
+        interval = 1.0 / self.record_fps
+        img_id = 0
+        while True:
+            # grab at most one frame per sampling interval
+            if time.time() - last_capture >= interval:
+                last_capture = time.time()
+                try:
+                    ret, frame = self.capture.read()
+                except Exception:
+                    print('error~')
+                    break
+                if ret:
+                    img_id += 1  # count captured frames
+                    # https://www.jianshu.com/p/0e462b4c7a93
+                    # do NOT convert BGR->RGB here: the gpt4v endpoint converts
+                    # again on its side, so converting here would flip the colors
+                    # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    self.frames.append(frame)
+                else:
+                    break
+            if self.exit.is_set():
+                print('stopped manually')
+                break
+            if (time.time() - s_t) > self.max_record_time:
+                print('timed out')
+                break
+        self.capture.release()
+        print('recording finished')
+
+    def stop_record(self):
+        self.exit.set()
+        print('recorder thread asked to stop')
+
+
+def record():
+    # https://blog.51cto.com/u_16213389/7407010
+    r = sr.Recognizer()
+    # https://blog.csdn.net/sunriseYJP/article/details/134399727
+    r.energy_threshold = 500  # threshold for detecting speech
+    with sr.Microphone() as source:
+        video_record = VideoRecorder()
+        st.write('Please start speaking, listening now')
+        # phrase_time_limit: max recording length; timeout: how long to wait for speech
+        video_record.process.start()  # records video in the background
+        audio = r.listen(source, phrase_time_limit=15, timeout=None)
+        # time.sleep(2)  # record 2 extra seconds: not needed, listen() already lags
+        video_record.stop_record()
+    return video_record.frames, audio
+
+if __name__ == "__main__":
+    # https://blog.csdn.net/qq_42069296/article/details/133792896
+    if st.button('Start recording'):
+        st.camera_input('tt', label_visibility='hidden')
+        recorder = VideoRecorder()
+        recorder.process.start()
+        time.sleep(10)
+        recorder.stop_record()
+        print(recorder.frames)
+        # frames is a list here, not a Queue
+        for frame in recorder.frames[:10]:
+            st.image(frame)
+        print('ok~')
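Frame sampling in these recorders is time-based: grab a frame once at least `1/record_fps` seconds have passed since the last grab. An exact float test such as `elapsed % period == 0` essentially never fires, so the loops track the last capture time instead. The pattern in isolation, as a runnable sketch (hypothetical helper, not part of this PR):

```python
# The elapsed-time sampling pattern used in the capture loops above.
import time

def sample_timestamps(fps: float, duration: float) -> list:
    """Return the offsets (since start) at which frames would be grabbed."""
    interval = 1.0 / fps          # e.g. fps=0.5 -> one frame every 2 s
    start = last = time.monotonic()
    grabbed = []
    while time.monotonic() - start < duration:
        now = time.monotonic()
        if now - last >= interval:
            grabbed.append(round(now - start, 2))
            last = now
        time.sleep(0.01)          # avoid a busy spin
    return grabbed

print(sample_timestamps(fps=2.0, duration=1.0))  # roughly [0.5, 1.0]
```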
diff --git a/real_gemini/utils_st/test_opencv2.py b/real_gemini/utils_st/test_opencv2.py
new file mode 100644
index 0000000..aa5bfb1
--- /dev/null
+++ b/real_gemini/utils_st/test_opencv2.py
@@ -0,0 +1,97 @@
+import cv2
+from datetime import datetime
+import threading
+import time
+import numpy as np
+import streamlit as st
+
+class Camera(object):
+
+    def __init__(self, video_path):
+        # on a laptop with an external webcam, use index 1 instead of 0
+        self.cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+        self.ret, self.frame = self.cap.read()
+        FPS = 24.0
+        # the writer's frame size must match the captured size or the video
+        # will not play, so query it at runtime
+        WIDTH = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        HEIGHT = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        # set the camera resolution
+        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGHT)
+        # set the camera frame rate (defaults to 600 if unspecified)
+        self.cap.set(cv2.CAP_PROP_FPS, 24)
+        # XVID is a good trade-off between image quality and file size
+        fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        self.out = cv2.VideoWriter(video_path, fourcc, FPS, (WIDTH, HEIGHT))
+
+    def picture_shoot(self, image_name, image_path=None) -> None:
+        '''
+        Take a photo with the camera and save it locally.
+        :param image_name: image file name
+        :param image_path: directory to save the image to
+        :return: None
+        '''
+        self.image_name = image_name
+        self.image_path = image_path
+        cv2.imwrite(f'./{self.image_name}', self.frame)
+
+    def video_record(self, video_path) -> None:
+        '''
+        Record video from the camera and save it locally.
+        :param video_path: path to save the video to
+        :return: None
+        '''
+        print('recording started?')
+        self.video_path = video_path
+        while self.cap.isOpened():
+            self.ret, self.frame = self.cap.read()
+            if self.ret:
+                font = cv2.FONT_HERSHEY_SIMPLEX
+                datet = str(datetime.now())
+                frame = cv2.putText(self.frame, datet, (10, 50), font, 1,
+                                    (0, 255, 255), 2, cv2.LINE_AA)
+                self.out.write(frame)
+
+    def video_stop(self):
+        self.cap.release()
+        self.out.release()
+        cv2.destroyAllWindows()
+        print('video recording finished')
+
+
+def record_v_a():
+    video_name = datetime.now().strftime("%Y%m%d%H%M%S") + ".mp4"
+    width = 640
+    height = 480
+
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    video_writer = cv2.VideoWriter(video_name, fourcc, 25, (width, height))
+
+    capture = cv2.VideoCapture(0)
+
+    while True:
+        ret, frame = capture.read()
+        if ret:
+            frame = cv2.putText(frame, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
+            video_writer.write(frame)
+            st.image(frame)
+            if cv2.waitKey(1) & 0xFF == ord("q"):
+                break
+        else:
+            break
+
+    video_writer.release()
+    capture.release()
+    cv2.destroyAllWindows()
+
+# if __name__ == '__main__':
+#     # record video + take a photo
+#     path = r'./video.mp4'
+#     camera = Camera(path)
+#     thread = threading.Thread(target=camera.video_record, args=(path,))
+#     thread.start()
+#     # camera.picture_shoot(image_name='1.png', image_path='./')
+#     time.sleep(10)
+#     camera.video_stop()
\ No newline at end of file
diff --git a/real_gemini/utils_st/text2audio.py b/real_gemini/utils_st/text2audio.py
new file mode 100644
index 0000000..8961b95
--- /dev/null
+++ b/real_gemini/utils_st/text2audio.py
@@ -0,0 +1,127 @@
+import base64
+import json
+import io
+import requests
+import numpy as np
+import streamlit as st
+from numpy import typing as npt
+from typing import Any, Tuple, Optional, cast, Union
+from typing_extensions import Final, TypeAlias
+MediaData: TypeAlias = Union[
+    str, bytes, io.BytesIO, io.RawIOBase, io.BufferedReader, "npt.NDArray[Any]", None
+]
+
+def text2audio(text):
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded',
+    }
+    data = {
+        'prompt': text,
+    }
+    response = requests.post('http://192.168.81.12:6679/tts/', headers=headers, data=data)
+    res = response.json()
+    audio_array = np.frombuffer(base64.b64decode(res[0]), np.float32)
+    rate = res[1]
+    return audio_array, rate, convert_to_wav_bytes(audio_array, rate)
+
+def _validate_and_normalize(data: "npt.NDArray[Any]") -> Tuple[bytes, int]:
+    """Validates and normalizes numpy array data.
+    We validate numpy array shape (should be 1d or 2d)
+    We normalize input data to int16 [-32768, 32767] range.
+
+    Parameters
+    ----------
+    data : numpy array
+        numpy array to be validated and normalized
+
+    Returns
+    -------
+    Tuple of (bytes, int)
+        (bytes, nchan)
+        where
+        - bytes : bytes of normalized numpy array converted to int16
+        - nchan : number of channels for audio signal. 1 for mono, or 2 for stereo.
+    """
+    # we import numpy here locally to import it only when needed (when numpy array given
+    # to st.audio data)
+    import numpy as np
+
+    data: "npt.NDArray[Any]" = np.array(data, dtype=float)
+
+    if len(data.shape) == 1:
+        nchan = 1
+    elif len(data.shape) == 2:
+        # In wave files, channels are interleaved. E.g.,
+        # "L1R1L2R2..." for stereo. See
+        # http://msdn.microsoft.com/en-us/library/windows/hardware/dn653308(v=vs.85).aspx
+        # for channel ordering
+        nchan = data.shape[0]
+        data = data.T.ravel()
+    else:
+        raise ValueError("Numpy array audio input must be a 1D or 2D array.")
+
+    if data.size == 0:
+        return data.astype(np.int16).tobytes(), nchan
+
+    max_abs_value = np.max(np.abs(data))
+    # 16-bit samples are stored as 2's-complement signed integers,
+    # ranging from -32768 to 32767.
+    # scaled_data is a PCM 16-bit numpy array, so we multiply the [-1, 1] float
+    # values by 32_767 == 2 ** 15 - 1.
+    np_array = (data / max_abs_value) * 32767
+    scaled_data = np_array.astype(np.int16)
+    return scaled_data.tobytes(), nchan
+
+def _make_wav(data: "npt.NDArray[Any]", sample_rate: int) -> bytes:
+    """
+    Transform a numpy array to a PCM bytestring
+    We use code from IPython display module to convert numpy array to wave bytes
+    https://github.com/ipython/ipython/blob/1015c392f3d50cf4ff3e9f29beede8c1abfdcb2a/IPython/lib/display.py#L146
+    """
+    # we import wave here locally to import it only when needed (when numpy array given
+    # to st.audio data)
+    import wave
+
+    scaled, nchan = _validate_and_normalize(data)
+
+    with io.BytesIO() as fp, wave.open(fp, mode="wb") as waveobj:
+        waveobj.setnchannels(nchan)
+        waveobj.setframerate(sample_rate)
+        waveobj.setsampwidth(2)
+        waveobj.setcomptype("NONE", "NONE")
+        waveobj.writeframes(scaled)
+        return fp.getvalue()
+
+
+def convert_to_wav_bytes(
+    data: MediaData, sample_rate: Optional[int]
+) -> MediaData:
+    """Convert data to wav bytes if the data type is numpy array."""
+    data = _make_wav(cast("npt.NDArray[Any]", data), sample_rate)
+    return data
+
+def autoplay_audio(bytes_audio):
+    # https://discuss.streamlit.io/t/how-to-play-an-audio-file-automatically-generated-using-text-to-speech-in-streamlit/33201/6
+    if isinstance(bytes_audio, str):
+        with open(bytes_audio, 'rb') as f:
+            b64_audio = base64.b64encode(f.read()).decode()
+    else:
+        b64_audio = base64.b64encode(bytes_audio).decode()
+    md = f"""
+        <audio autoplay="true">
+        <source src="data:audio/wav;base64,{b64_audio}">
+        </audio>
+        """
+    st.markdown(
+        md,
+        unsafe_allow_html=True,
+    )
+
+if __name__ == '__main__':
+    with open('/Users/wuziwei/git_project/Real-Gemini/records/180367f8-85d3-4ec3-81dc-95e9c095b7ec_input_audio.mp3', 'rb') as f:
+        ba = f.read()
+    # print(ba)
+    a, r, ba = text2audio('你好')
+    autoplay_audio(ba)
+    st.audio(a, sample_rate=r)
\ No newline at end of file
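A quick way to sanity-check the WAV helpers above is to synthesize a tone, convert it, and read the header back with the standard-library `wave` module (a test sketch, not part of this PR; the import path is assumed from the repo layout):

```python
# Sanity-check sketch for the WAV helpers: synthesize a tone, convert it,
# and verify the header fields with the stdlib wave module.
import io
import wave

import numpy as np
from utils_st.text2audio import convert_to_wav_bytes

rate = 24000
t = np.linspace(0, 1.0, rate, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)      # 1 s, 440 Hz, floats in [-1, 1]

wav_bytes = convert_to_wav_bytes(tone, rate)
with wave.open(io.BytesIO(wav_bytes)) as w:
    assert w.getnchannels() == 1
    assert w.getframerate() == rate
    assert w.getsampwidth() == 2              # PCM16, per _validate_and_normalize
    print(w.getnframes(), "frames OK")
```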
OPENAI_API_KEY="YOUR_API_KEY" -python main.py +export OPENAI_API_KEY='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' +streamlit run ./real_gemini/chat.py \ No newline at end of file diff --git a/source/bot.png b/source/bot.png new file mode 100644 index 0000000..6a4d7fe Binary files /dev/null and b/source/bot.png differ