diff --git a/examples/langchain/.env.example b/examples/langchain/.env.example
index ee60bef7b6ecc..900fd5babe3f8 100644
--- a/examples/langchain/.env.example
+++ b/examples/langchain/.env.example
@@ -1,7 +1,4 @@
-OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
-CUBE_API_URL=https://anonymous-colstrip.gcp-us-central1.cubecloudapp.dev/cubejs-api/v1
-CUBE_API_SECRET=SECRET
-DATABASE_URL=postgresql://cube:PWD@anonymous-colstrip.sql.gcp-us-central1.cubecloudapp.dev:5432/anonymous-colstrip
-LANGCHAIN_TRACING_V2=true
-LANGCHAIN_ENDPOINT=https://api.smith.langchain.com
-LANGCHAIN_API_KEY=ls__XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
\ No newline at end of file
+OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXXXXXX
+CUBE_API_URL=https://example-url.gcp-us-central1.cubecloudapp.dev/cubejs-api/v1
+CUBE_API_SECRET=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+DATABASE_URL=postgresql://cube:xxx@example-url.gcp-us-central1.cubecloudapp.dev:5432/example
\ No newline at end of file
diff --git a/examples/langchain/.gitignore b/examples/langchain/.gitignore
new file mode 100644
index 0000000000000..0b6e7eff9dbfc
--- /dev/null
+++ b/examples/langchain/.gitignore
@@ -0,0 +1,3 @@
+.env
+__pycache__
+vectorstore.pkl
\ No newline at end of file
diff --git a/examples/langchain/README.md b/examples/langchain/README.md
index 2d9a0a4dd1aab..de99bdb928603 100644
--- a/examples/langchain/README.md
+++ b/examples/langchain/README.md
@@ -1,18 +1,27 @@
-# Cube and Langchain demo app
+# Tabular Data Retrieval
 
-This is an example of a chatbot built with Cube, Langchain, and Streamlit.
+This is an example of a chatbot built with Cube, Langchain, Snowflake, and Streamlit.
 
-[Why use a semantic layer with LLM for chatbots?](https://cube.dev/blog/semantic-layer-the-backbone-of-ai-powered-data-experiences)
+[Check out this app deployed on Streamlit Cloud.](https://cube-langchain.streamlit.app/)
 
-## Pre-requisites
+## Why a Semantic Layer for LLM-Powered Apps?
 
-- Valid Cube Cloud deployment. Your data model should have at least one view.
-- This example uses OpenAI API, so you'll need an OpenAI API key.
-- Python version `>=` 3.8
+When building text-to-SQL applications, it is crucial to provide the LLM with rich context about the underlying data model. Without enough context, it is hard even for humans to comprehend data, and an LLM will only compound that confusion and produce wrong answers.
 
-## How to run
+In many cases, it is not enough to feed an LLM the database schema and expect it to generate correct SQL. To operate correctly and execute trustworthy actions, it needs enough context and semantics about the data it consumes; it must understand the metrics, dimensions, entities, and relational aspects of the data by which it's powered. Basically, an LLM needs a semantic layer.
 
+![architecture](https://ucarecdn.com/32e98c8b-a920-4620-a8d2-05d57618db8e/)
+
+[Read more on why to use a semantic layer with LLM-powered apps.](https://cube.dev/blog/semantic-layer-the-backbone-of-ai-powered-data-experiences)
+
+## Getting Started
+
+- **Cube project**. If you don't have a Cube project already, you can follow [this tutorial](https://cube.dev/docs/product/getting-started/cloud) to get started with a sample e-commerce data model.
+- **OpenAI API**. This example uses the OpenAI API, so you'll need an OpenAI API key.
+- Make sure you have Python version >= 3.8
 - Install dependencies: `pip install -r requirements.txt`
-- Copy `.env.example` as `.env` and fill it in with your credentials
-- Run `python ingest.py`. It will use `CubeSemanticLoader` Langchain library to load metadata and save it in vectorstore
-- Run `streamlit run main.py`
+- Copy `.env.example` as `.env` and fill it in with your credentials. You need an OpenAI API key and credentials to access your Cube deployment.
+- Run `streamlit run streamlit_app.py`
+
+## Community
+If you have any questions or need help, please [join our Slack community](https://slack.cube.dev/?ref=langchain-example-readme) of amazing developers and data engineers.
diff --git a/examples/langchain/ingest.py b/examples/langchain/ingest.py
deleted file mode 100644
index 89573f0de6d7f..0000000000000
--- a/examples/langchain/ingest.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import pickle
-import jwt
-import os
-
-from dotenv import load_dotenv
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.vectorstores.faiss import FAISS
-
-from langchain.document_loaders import CubeSemanticLoader
-
-load_dotenv()
-
-
-def ingest_cube_meta():
-    api_url = os.environ["CUBE_API_URL"]
-    cubejs_api_secret = os.environ["CUBE_API_SECRET"]
-    security_context = {}
-    api_token = jwt.encode(security_context, cubejs_api_secret, algorithm="HS256")
-
-    loader = CubeSemanticLoader(api_url, api_token)
-    documents = loader.load()
-
-    embeddings = OpenAIEmbeddings()
-    vectorstore = FAISS.from_documents(documents, embeddings)
-
-    # Save vectorstore
-    with open("vectorstore.pkl", "wb") as f:
-        pickle.dump(vectorstore, f)
-
-
-if __name__ == "__main__":
-    ingest_cube_meta()
diff --git a/examples/langchain/main.py b/examples/langchain/main.py
deleted file mode 100644
index 980aaf3624e78..0000000000000
--- a/examples/langchain/main.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import streamlit as st
-import pandas as pd
-import os
-import json
-import re
-
-from dotenv import load_dotenv
-from langchain import OpenAI
-
-from utils import (
-    create_docs_from_values,
-    create_vectorstore,
-    init_vectorstore,
-    check_input,
-    log,
-    call_sql_api,
-    CUBE_SQL_API_PROMPT,
-    _NO_ANSWER_TEXT,
-    PROMPT_POSTFIX,
-)
-
-load_dotenv()
-
-llm = OpenAI(
-    temperature=0, openai_api_key=os.environ.get("OPENAI_API_KEY"), verbose=True
-)
-
-st.title("CubeSemanticLoader on LangChain")
-
-question = st.text_input(
-    "Your question: ", placeholder="Ask me anything ...", key="input"
-)
-
-if st.button("Submit", type="primary"):
-    check_input(question)
-    vectorstore = init_vectorstore()
-
-    # log("Quering vectorstore and building the prompt...")
-
-    docs = vectorstore.similarity_search(question)
-    # take the first document as the best guess
-    table_name = docs[0].metadata["table_name"]
-
-    # Columns
-    columns_question = "All available columns"
-    column_docs = vectorstore.similarity_search(
-        columns_question, filter=dict(table_name=table_name), k=15
-    )
-
-    lines = []
-    for column_doc in column_docs:
-        column_title = column_doc.metadata["column_title"]
-        column_name = column_doc.metadata["column_name"]
-        column_data_type = column_doc.metadata["column_data_type"]
-        print(column_name)
-        lines.append(
-            f"title: {column_title}, column name: {column_name}, datatype: {column_data_type}, member type: {column_doc.metadata['column_member_type']}"
-        )
-    columns = "\n\n".join(lines)
-
-    # Construct the prompt
-    prompt = CUBE_SQL_API_PROMPT.format(
-        input_question=question,
-        table_info=table_name,
-        columns_info=columns,
-        top_k=1000,
-        no_answer_text=_NO_ANSWER_TEXT,
-    )
-
-    # log("Prepared prompt")
-
-    # Call LLM API to get the SQL query
-    log("Calling LLM API to generate SQL query...")
-    llm_answer = llm(prompt + PROMPT_POSTFIX)
-    bare_llm_answer = re.sub(r"(?i)Answer:\s*", "", llm_answer)
-    print(prompt + PROMPT_POSTFIX)
-
-    # log("Got response from LLM:")
-    # st.info(bare_llm_answer)
-
-    if llm_answer.strip() == _NO_ANSWER_TEXT:
-        st.stop()
-
-    # Parse the response
-    parsed_data = json.loads(bare_llm_answer)
-
-    # Access the objects
-    sql_query = parsed_data["query"]
-    filters = parsed_data["filters"]
-
-    log("Query generated by LLM:")
-    st.info(sql_query)
-
-    if len(filters) > 0:
-        # log("Query has filters:")
-        # st.info(filters)
-
-        # Handling filters for better accuracy
-        # log("Processing filters for better accuracy...")
-        for filter in filters:
-            print(filter)
-            column_name = filter["column"]
-            operator = filter["operator"]
-            filter_value = f"{column_name}"
-            doc = vectorstore.similarity_search(
-                filter["column"], filter=dict(column_name=filter_value), k=1
-            )
-
-            if doc:
-                print("Creating docs from values...")
-                value_docs = create_docs_from_values(
-                    doc[0].metadata["column_values"], table_name, column_name
-                )
-
-                # Create vectorstore for values search
-                print(f"{column_name}: Creating vectorstore for values search...")
-                value_vectorstore = create_vectorstore(value_docs)
-
-                # Search for the value
-                print("Searching for the value...")
-                value_doc = value_vectorstore.similarity_search(filter["value"], k=1)
-                cleaned_value = value_doc[0].page_content
-
-                if cleaned_value and cleaned_value != filter["value"]:
-                    log("Replacing filter value with the closest match...")
-                    old_filter_sql = (
-                        f"{filter['column']} {filter['operator']} '{filter['value']}'"
-                    )
-                    new_filter_sql = (
-                        f"{filter['column']} {filter['operator']} '{cleaned_value}'"
-                    )
-                    sql_query = sql_query.replace(old_filter_sql, new_filter_sql)
-                    st.info(sql_query)
-
-    # Call Cube SQL API
-    columns, rows = call_sql_api(sql_query)
-    print("Result from SQL API:")
-    print(rows)
-
-    # Display the result
-    df = pd.DataFrame(rows, columns=columns)
-    st.table(df)
diff --git a/examples/langchain/requirements.txt b/examples/langchain/requirements.txt
index f49bf8a1450e1..1c0fee288b932 100644
--- a/examples/langchain/requirements.txt
+++ b/examples/langchain/requirements.txt
@@ -2,9 +2,9 @@ streamlit
 pandas
 python-dotenv
 langchain
-psycopg2
 pathlib
 PyJWT
-faiss-cpu
 openai
-tiktoken
\ No newline at end of file
+tiktoken
+faiss-cpu
+psycopg2-binary
\ No newline at end of file
diff --git a/examples/langchain/streamlit_app.py b/examples/langchain/streamlit_app.py
new file mode 100644
index 0000000000000..18c90cf10ce38
--- /dev/null
+++ b/examples/langchain/streamlit_app.py
@@ -0,0 +1,123 @@
+import streamlit as st
+import pandas as pd
+import os
+import re
+import pickle
+import jwt
+
+from dotenv import load_dotenv
+from langchain import OpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.faiss import FAISS
+from langchain.document_loaders import CubeSemanticLoader
+from pathlib import Path
+
+from utils import (
+    create_docs_from_values,
+    create_vectorstore,
+    init_vectorstore,
+    check_input,
+    log,
+    call_sql_api,
+    CUBE_SQL_API_PROMPT,
+    _NO_ANSWER_TEXT,
+    PROMPT_POSTFIX,
+)
+
+load_dotenv()
+
+def ingest_cube_meta():
+    security_context = {}
+    token = jwt.encode(security_context, os.environ["CUBE_API_SECRET"], algorithm="HS256")
+
+    loader = CubeSemanticLoader(os.environ["CUBE_API_URL"], token)
+    documents = loader.load()
+
+    embeddings = OpenAIEmbeddings()
+    vectorstore = FAISS.from_documents(documents, embeddings)
+
+    # Save vectorstore
+    with open("vectorstore.pkl", "wb") as f:
+        pickle.dump(vectorstore, f)
+
+if not Path("vectorstore.pkl").exists():
+    with st.spinner('Loading context from Cube API...'):
+        ingest_cube_meta()
+
+llm = OpenAI(
+    temperature=0, openai_api_key=os.environ.get("OPENAI_API_KEY"), verbose=True
+)
+
+st.title("Cube and LangChain demo 🤖🚀")
+
+multi = '''
+Follow [this tutorial on GitHub](https://github.com/cube-js/cube/tree/master/examples/langchain) to clone this project and run it locally.
+
+You can use these sample questions to quickly test the demo:
+* How many orders?
+* How many completed orders?
+* What are the top selling product categories?
+* What product category drives the highest average order value?
+'''
+st.markdown(multi)
+
+question = st.text_input(
+    "Your question: ", placeholder="Ask me anything ...", key="input"
+)
+
+if st.button("Submit", type="primary"):
+    check_input(question)
+    vectorstore = init_vectorstore()
+
+    # log("Querying vectorstore and building the prompt...")
+
+    docs = vectorstore.similarity_search(question)
+    # take the first document as the best guess
+    table_name = docs[0].metadata["table_name"]
+
+    # Columns
+    columns_question = "All available columns"
+    column_docs = vectorstore.similarity_search(
+        columns_question, filter=dict(table_name=table_name), k=15
+    )
+
+    lines = []
+    for column_doc in column_docs:
+        column_title = column_doc.metadata["column_title"]
+        column_name = column_doc.metadata["column_name"]
+        column_data_type = column_doc.metadata["column_data_type"]
+        print(column_name)
+        lines.append(
+            f"title: {column_title}, column name: {column_name}, datatype: {column_data_type}, member type: {column_doc.metadata['column_member_type']}"
+        )
+    columns = "\n\n".join(lines)
+
+    # Construct the prompt
+    prompt = CUBE_SQL_API_PROMPT.format(
+        input_question=question,
+        table_info=table_name,
+        columns_info=columns,
+        top_k=1000,
+        no_answer_text=_NO_ANSWER_TEXT,
+    )
+
+    # Call LLM API to get the SQL query
+    log("Calling LLM API to generate SQL query...")
+    llm_answer = llm(prompt)
+    bare_llm_answer = re.sub(r"(?i)Answer:\s*", "", llm_answer)
+
+    if llm_answer.strip() == _NO_ANSWER_TEXT:
+        st.stop()
+
+    sql_query = llm_answer
+
+    log("Query generated by LLM:")
+    st.info(sql_query)
+
+    # Call Cube SQL API
+    log("Sending the above query to Cube...")
+    columns, rows = call_sql_api(sql_query)
+
+    # Display the result
+    df = pd.DataFrame(rows, columns=columns)
+    st.table(df)
diff --git a/examples/langchain/utils.py b/examples/langchain/utils.py
index 25f41502a4cf4..1011635b5a23b 100644
--- a/examples/langchain/utils.py
+++ b/examples/langchain/utils.py
@@ -106,8 +106,7 @@ def call_sql_api(sql_query: str):
     # Initializing Cube SQL API connection)
     connection = psycopg2.connect(CONN_STR)
-
-    log("Running query...")
+
     cursor = connection.cursor()
     cursor.execute(sql_query)
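
Note: the `utils.py` hunk above shows only a fragment of `call_sql_api`. For readers following along without the full file, here is a minimal sketch of what such a helper might look like, assuming `CONN_STR` is derived from the `DATABASE_URL` value in `.env`. Everything outside the visible fragment (the result extraction and cleanup) is an assumption rather than the repo's actual code:

```python
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()

# Assumption: the connection string comes straight from DATABASE_URL in .env;
# the real utils.py may assemble it from separate credentials.
CONN_STR = os.environ["DATABASE_URL"]


def call_sql_api(sql_query: str):
    # Cube's SQL API speaks the Postgres wire protocol, so a plain
    # psycopg2 connection works (matching the fragment in the diff above).
    connection = psycopg2.connect(CONN_STR)

    cursor = connection.cursor()
    cursor.execute(sql_query)

    # The Streamlit app does `columns, rows = call_sql_api(sql_query)` and
    # builds a DataFrame, so return column names alongside the rows.
    columns = [desc[0] for desc in cursor.description]
    rows = cursor.fetchall()

    cursor.close()
    connection.close()
    return columns, rows
```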