def build_tesseract_config(psm='3', blacklist=None, whitelist=None, oem=3):
    """Build the Tesseract config string used for an OCR call.

    Args:
        psm: Page-segmentation mode; a str or an int is accepted.
        blacklist: Characters Tesseract must not emit, or None/"" for none.
        whitelist: Characters Tesseract is restricted to, or None/"" for none.
        oem: OCR engine mode (the original code always used 3).

    Returns:
        A string such as ``"--oem 3 --psm 11 -c tessedit_char_blacklist=@*"``.
    """
    options = [f"--oem {oem}", f"--psm {psm}"]
    for variable, chars in (("tessedit_char_whitelist", whitelist),
                            ("tessedit_char_blacklist", blacklist)):
        # The config string is whitespace-separated, so embedded whitespace
        # would cut the character list short; strip it out.
        # NOTE(review): pytesseract appears to tokenise this string
        # shell-style, so quote/backslash characters in the lists are
        # presumably unsupported -- confirm before relying on them.
        cleaned = "".join(chars.split()) if chars else ""
        if cleaned:
            options.append(f"-c {variable}={cleaned}")
    return " ".join(options)


def tesseract_scanner(showimg: bool = False,
                      lang: str = 'eng',
                      blacklist: Optional[str] = None,
                      whitelist: Optional[str] = None,
                      psm: str = '3',
                      hrate: float = 0.2,
                      key: Optional[str] = None
                      ) -> Optional[str]:
    """Render the camera component and OCR the frame the user captures.

    The frontend component returns a base64 data URL. That image is decoded,
    converted to grayscale, binarised with Otsu thresholding and handed to
    Tesseract.

    Args:
        showimg: When True, display the pre-processed image with ``st.image``.
        lang: Tesseract language spec, e.g. ``'eng'`` or ``'vie+eng'``.
        blacklist: Characters Tesseract must not output.
        whitelist: Characters Tesseract is restricted to.
        psm: Tesseract page-segmentation mode (an int works as well).
        hrate: Forwarded to the frontend component as ``hrate``.
        key: Optional Streamlit widget key.

    Returns:
        The recognised text, or None when the component has no value yet or
        the received payload cannot be decoded into an image.
    """
    b64_data: Optional[str] = _component_func(hrate=hrate, key=key)
    if b64_data is None:
        return None

    # A data URL looks like "data:image/png;base64,<payload>"; keep the payload.
    _, _, raw_data = b64_data.partition(",")
    if not raw_data:
        return None

    # np.frombuffer replaces the deprecated np.fromstring for binary input.
    encoded = np.frombuffer(base64.b64decode(raw_data), dtype=np.uint8)
    if encoded.size == 0:
        return None
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals an undecodable buffer by returning None.
        return None

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.threshold(image, 0, 255,
                          cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    if showimg:
        st.image(image)

    # Honour the caller's psm/blacklist/whitelist instead of hard-coding them.
    custom_config = build_tesseract_config(psm=psm, blacklist=blacklist,
                                           whitelist=whitelist)
    return pytesseract.image_to_string(image, lang=lang, config=custom_config)


def main():
    """Small demo: scan with Vietnamese+English and print the result."""
    st.write("## Example")

    blacklist = '@*|©_Ⓡ®¢§š'
    data = tesseract_scanner(showimg=False, lang='vie+eng',
                             blacklist=blacklist, psm=3)

    if data is not None:
        st.write(data)


if __name__ == "__main__":
    main()
+ + + +
// The `Streamlit` object exists because our html file includes
// `streamlit-component-lib.js`.
// If you get an error about "Streamlit" not being defined, that
// means you're missing that file.

/** Send a value back to the Python side of the component. */
function sendValue(value) {
  Streamlit.setComponentValue(value)
}

/**
 * The component's render function. This will be called immediately after
 * the component is initially loaded, and then again every time the
 * component gets new data from Python. Only the first call does any work:
 * it sizes the frame, starts the camera stream and wires up the handlers.
 */
function onRender(event) {
  // Only run the render code the first time the component is loaded.
  if (window.rendered) {
    return;
  }

  // `hrate` is reassigned by the height control, so it must stay mutable.
  let {hrate} = event.detail.args;

  const video = document.getElementById('video');
  const videoheight = document.getElementById('videoheight');
  const canvas = document.getElementById('canvas');

  video.setAttribute('width', '100%');

  // Declared locally; the original leaked `width`/`height` as globals.
  const width = video.clientWidth;

  // Tablets and phones get a portrait 16:9 frame, everything else 4:3.
  const ua = navigator.userAgent;
  const isTablet = /(tablet|ipad|playbook|silk)|(android(?!.*mobi))/i.test(ua);
  const isPhone = /Mobile|iP(hone|od)|Android|BlackBerry|IEMobile|Kindle|Silk-Accelerated|(hpw|web)OS|Opera M(obi|ini)/.test(ua);
  const height = (isTablet || isPhone) ? 16/9 * width : 3 / 4 * width;

  Streamlit.setFrameHeight(hrate * height);

  const constraints = { facingMode: 'environment', advanced : [{focusMode: "continuous"}]};
  if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
    navigator.mediaDevices.getUserMedia({ video: constraints })
      .then(function(stream) {
        video.srcObject = stream;
        video.play();
      })
      .catch(function(err) {
        console.log("An error occurred: " + err);
      });
  } else {
    // mediaDevices is undefined when the browser does not expose the camera
    // API to this page; report it instead of throwing a TypeError.
    console.log("An error occurred: " + "navigator.mediaDevices is not available");
  }

  /** Play the capture sound; playback failure must never block a capture. */
  function beep() {
    var snd = new Audio("data:audio/mpeg;base64,");
    var played = snd.play();
    // play() returns a promise in current browsers; swallow its rejection
    // so a missing or blocked sound does not surface as an unhandled error.
    if (played && typeof played.catch === 'function') {
      played.catch(function() {});
    }
  }

  /** Re-read the height control (a percentage) and resize the frame. */
  function hrateChange() {
    hrate = videoheight.value / 100;
    Streamlit.setFrameHeight(hrate * height);
  }

  /** Draw the current video frame to the canvas and send it as a PNG data URL. */
  function takePicture() {
    const context = canvas.getContext('2d');
    canvas.width = width;
    // The canvas is only hrate*height tall, so the frame drawn at full
    // height below is cropped to its top portion.
    canvas.height = hrate * height;
    context.drawImage(video, 0, 0, width, height);
    const data = canvas.toDataURL('image/png');
    beep();
    sendValue(data);
  }

  videoheight.addEventListener('change', hrateChange);
  video.addEventListener('click', takePicture);
  window.rendered = true
}

// Render the component whenever python send a "render event"
Streamlit.events.addEventListener(Streamlit.RENDER_EVENT, onRender)
// Tell Streamlit that the component is ready to receive events
Streamlit.setComponentReady()
// Start collapsed; onRender sets the real frame height on first render
Streamlit.setFrameHeight(0)
def search_knowledge(collection, query, timeout=30):
    """Search a knowledge collection through the GPT service.

    The service address and bearer token are read from the
    ``GPT_SERVICE_ADDRESS`` and ``GPT_SERVICE_TOKEN`` environment variables.

    Args:
        collection: Identifier of the knowledge collection to search.
        query: Free-text search query.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        A Markdown string with one "**Score** / content" section per hit,
        ``None`` when the search returned no hits, or a string starting with
        ``"Error searching knowledge: "`` when the request could not be
        completed. Failures are reported as text, not raised, because the
        calling page renders the return value directly.
    """
    gpt_address = os.getenv("GPT_SERVICE_ADDRESS")
    if not gpt_address:
        # Without this check the URL would become "None/knowledge/search".
        return "Error searching knowledge: GPT_SERVICE_ADDRESS is not configured"
    api_token = os.getenv("GPT_SERVICE_TOKEN")

    url = f"{gpt_address}/knowledge/search"
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_token}"
    }
    payload = {
        "collection": collection,
        "query": query
    }

    try:
        # A timeout keeps an unresponsive service from hanging the page.
        response = requests.post(url, headers=headers, json=payload,
                                 timeout=timeout)
    except requests.RequestException as exc:
        return f"Error searching knowledge: {exc}"

    if response.status_code != 200:
        return f"Error searching knowledge: {response.text}"

    try:
        hits = response.json()["result"]["data"]
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError covers a non-JSON body; the others a changed schema.
        return f"Error searching knowledge: unexpected response format ({exc})"

    if not hits:
        # Let the caller show its own "nothing found" message.
        return None

    def fmt(v):
        return f'**Score**: {v["score"]}\n\n{v["content"]}\n\n---\n\n'

    return "\n\n".join(fmt(v) for v in hits)
"cacheLocation": "sessionStorage", + "storeAuthStateInCookie": False + }, + login_button_text="Microsoft Account Login", + login_request={ + "scopes": [f"{app_id}/.default"] + }, + key="msal_token" + ) diff --git a/pages/02_Knowledge_Search.py b/pages/02_Knowledge_Search.py index 0dc0414..d4d48b8 100644 --- a/pages/02_Knowledge_Search.py +++ b/pages/02_Knowledge_Search.py @@ -1,13 +1,56 @@ import streamlit as st +import os +import sys +from dotenv import load_dotenv -# 在其他页面 -if 'authenticated' not in st.session_state or not st.session_state['authenticated']: - st.error("请先登录。") - st.stop() # 阻止未认证的用户访问页面内容 +sys.path.append(os.path.abspath('..')) +load_dotenv() +from libs.http import search_knowledge +from libs.msal import msal_auth + +if os.getenv("DEV_MODE") not in ["true", "1", "on"]: + value = msal_auth() + if value is None: + st.stop() + +knowledges = { + "青少年编程": "codeboy", + "对数课堂": "logbot", +} st.sidebar.markdown("# 知识库搜索") st.title("知识库搜索") -st.subheader("搜索知识库内容") st.divider() +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "欢迎使用知识库检索, 请输入主题"}] + +collection = st.selectbox("选择知识库", knowledges.keys()) +collection_value = knowledges[collection] + +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + +def clear_chat_history(): + st.session_state.messages = [{"role": "assistant", "content": "欢迎使用知识库检索,请输入主题"}] + + +st.sidebar.button('清除历史', on_click=clear_chat_history) + +if prompt := st.chat_input("输入检索主题"): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + +if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + response = search_knowledge(collection_value, prompt) + if response is None: + response = "没有找到相关知识" + st.markdown(response) + message = {"role": "assistant", "content": 
response} + st.session_state.messages.append(message) diff --git a/pages/03_Msal.py b/pages/03_Msal.py index 31c1ffd..4072357 100644 --- a/pages/03_Msal.py +++ b/pages/03_Msal.py @@ -1,33 +1,12 @@ import streamlit as st -from msal_streamlit_authentication import msal_authentication +import sys import os +from libs.msal import msal_auth from dotenv import load_dotenv +sys.path.append(os.path.abspath('..')) load_dotenv() -MSAL_TENANTID = os.getenv("MSAL_TENANTID") -MSAL_APPID = os.getenv("MSAL_APPID") - - -st.session_state - -if "token" in st.session_state and st.session_state["token"]: - st.write("Token", st.session_state["token"]) -else: - value = msal_authentication( - auth={ - "clientId": MSAL_APPID, - "authority": f"https://login.microsoftonline.com/{MSAL_TENANTID}", - "redirectUri": "/", - "postLogoutRedirectUri": "/" - }, - cache={ - "cacheLocation": "sessionStorage", - "storeAuthStateInCookie": False - }, - login_request={ - "scopes": [f"{MSAL_APPID}/.default"] - }, - key=1) - st.session_state["token"] = value +value = msal_auth() +st.write(value) diff --git a/pages/04_OCR.py b/pages/04_OCR.py new file mode 100644 index 0000000..3f00287 --- /dev/null +++ b/pages/04_OCR.py @@ -0,0 +1,10 @@ +import streamlit as st +from components.streamlit_tesseract_scanner import tesseract_scanner + +img_file_buffer = st.camera_input("Take a picture") + +blacklist='@*|©_Ⓡ®¢§š' +data = tesseract_scanner(showimg=True, lang='chi_sim+eng', psm=11) + +if data is not None: + st.write(data) diff --git a/requirements-gptstudio.txt b/requirements-gptstudio.txt index dfb6628..8f2fde7 100644 --- a/requirements-gptstudio.txt +++ b/requirements-gptstudio.txt @@ -12,3 +12,5 @@ graphviz jinja2 streamlit msal_streamlit_authentication +opencv-python-headless +