From de3dbf345b3111c971f5993db1376eda6f5243ed Mon Sep 17 00:00:00 2001 From: lvalics Date: Sun, 18 Feb 2024 09:28:02 +0000 Subject: [PATCH] - The conversational retrieval functionality is now operating as expected. It successfully sends the conversation history to the language model, allowing the context from previous interactions to be utilized effectively. - Added support for Ollama as the Language Model (LLM). Ensure Ollama is specified in the .env configuration and the model is preloaded on the server. --- .gitignore | 2 + dj_backend_server/.gitignore | 3 +- dj_backend_server/CHANGELOG.MD | 4 ++ dj_backend_server/api/utils/get_embeddings.py | 18 +++--- dj_backend_server/api/utils/get_openai_llm.py | 62 +++++++------------ dj_backend_server/api/utils/make_chain.py | 1 + dj_backend_server/api/views/views_chat.py | 14 ++--- dj_backend_server/api/views/views_message.py | 32 ++++------ dj_backend_server/example.env | 11 ++-- dj_backend_server/requirements.txt | 7 ++- .../web/services/chat_history_service.py | 39 +++++++----- 11 files changed, 92 insertions(+), 101 deletions(-) diff --git a/.gitignore b/.gitignore index c9ae65c9..2a1061ec 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ dj_backend_server/nginx/nginx.conf dj_backend_server.code-workspace .aider* .aiderignore +dj_backend_server/.vscode/settings.json + diff --git a/dj_backend_server/.gitignore b/dj_backend_server/.gitignore index 6244bad9..c4b9e9a6 100644 --- a/dj_backend_server/.gitignore +++ b/dj_backend_server/.gitignore @@ -37,4 +37,5 @@ pip-delete-this-directory.txt website_data_sources/* venv open-llama-7B-open-instruct.ggmlv3.q4_K_M.bin -llama-2-7b-chat.ggmlv3.q4_K_M.bin \ No newline at end of file +llama-2-7b-chat.ggmlv3.q4_K_M.bin +.vscode/ \ No newline at end of file diff --git a/dj_backend_server/CHANGELOG.MD b/dj_backend_server/CHANGELOG.MD index ee854117..9a720f01 100644 --- a/dj_backend_server/CHANGELOG.MD +++ b/dj_backend_server/CHANGELOG.MD @@ -1,3 +1,7 @@ +2.18.2024 +- The conversational retrieval functionality is now operating as expected. It successfully sends the conversation history to the language model, allowing the context from previous interactions to be utilized effectively. +- Added support for Ollama as the Language Model (LLM). Ensure Ollama is specified in the .env configuration and the model is preloaded on the server. + 2.17.2024 - Incorporate 'Ollama' into your example.env configuration and make sure to reflect these changes in your .env file for compatibility. - We've expanded the logging capabilities within settings.py by deploying logging.debug for more detailed insights, although it remains inactive when the DEBUG mode is off. 
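For readers applying this patch, a minimal sketch (not part of the patch itself) of how the new Ollama path in get_ollama_llm() is exercised once OPENAI_API_TYPE=ollama selects that branch in get_llm(): the OLLAMA_URL and OLLAMA_MODEL_NAME variables come from example.env, while the host, model, and prompt below are placeholder values, assuming the model has already been pulled on the Ollama server (for example "ollama pull llama2").

import os

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.chat_models import ChatOllama

# Placeholder values; in the project these are read from the .env file.
os.environ.setdefault("OLLAMA_URL", "http://localhost:11434")  # no trailing slash
os.environ.setdefault("OLLAMA_MODEL_NAME", "llama2")           # must already be pulled on the server

llm = ChatOllama(
    base_url=os.environ["OLLAMA_URL"],
    model=os.environ["OLLAMA_MODEL_NAME"],
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

# Tokens stream to stdout through the callback handler; the full reply is returned.
reply = llm.invoke("Say hello in one short sentence.")
print(reply.content)
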
diff --git a/dj_backend_server/api/utils/get_embeddings.py b/dj_backend_server/api/utils/get_embeddings.py index 61d8b19e..f378ba8c 100644 --- a/dj_backend_server/api/utils/get_embeddings.py +++ b/dj_backend_server/api/utils/get_embeddings.py @@ -18,8 +18,8 @@ def get_azure_embedding(): deployment = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_NAME") openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY") client = os.environ.get("AZURE_OPENAI_API_TYPE") - openai_api_base = os.environ['AZURE_OPENAI_API_BASE'] - openai_api_version = os.environ['AZURE_OPENAI_API_VERSION'] + openai_api_base = os.environ["AZURE_OPENAI_API_BASE"] + openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"] return OpenAIEmbeddings( openai_api_key=openai_api_key, @@ -27,14 +27,14 @@ def get_azure_embedding(): client=client, chunk_size=8, openai_api_base=openai_api_base, - openai_api_version=openai_api_version + openai_api_version=openai_api_version, ) def get_openai_embedding(): """Gets embeddings using the OpenAI embedding provider.""" openai_api_key = os.environ.get("OPENAI_API_KEY") - return OpenAIEmbeddings(openai_api_key=openai_api_key, chunk_size=1) + return OpenAIEmbeddings(openai_api_key=openai_api_key, chunk_size=1) def get_llama2_embedding(): @@ -48,15 +48,17 @@ def choose_embedding_provider(): if embedding_provider == EmbeddingProvider.azure.value: return get_azure_embedding() - + elif embedding_provider == EmbeddingProvider.OPENAI.value: return get_openai_embedding() - + elif embedding_provider == EmbeddingProvider.llama2.value: return get_llama2_embedding() else: - available_providers = ", ".join([service.value for service in EmbeddingProvider]) + available_providers = ", ".join( + [service.value for service in EmbeddingProvider] + ) raise ValueError( f"Embedding service '{embedding_provider}' is not currently available. 
" f"Available services: {available_providers}" @@ -66,4 +68,4 @@ def choose_embedding_provider(): # Main function to get embeddings def get_embeddings() -> Embeddings: """Gets embeddings using the chosen embedding provider.""" - return choose_embedding_provider() \ No newline at end of file + return choose_embedding_provider() diff --git a/dj_backend_server/api/utils/get_openai_llm.py b/dj_backend_server/api/utils/get_openai_llm.py index 87cabca3..95ce348e 100644 --- a/dj_backend_server/api/utils/get_openai_llm.py +++ b/dj_backend_server/api/utils/get_openai_llm.py @@ -5,15 +5,11 @@ from django.utils.timezone import make_aware from datetime import datetime, timezone from uuid import uuid4 -from ollama import Client -from openai import OpenAI from django.conf import settings from langchain_openai.chat_models import ChatOpenAI -from langchain_community.llms import Ollama +from langchain_community.chat_models import ChatOllama from langchain_community.llms import AzureOpenAI from langchain_community.llms import LlamaCpp -from langchain.prompts import PromptTemplate -from langchain.chains import LLMChain from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from web.models.failed_jobs import FailedJob @@ -62,12 +58,7 @@ def get_llama_llm(): def get_azure_openai_llm(): """Returns AzureOpenAI instance configured from environment variables""" try: - if settings.DEBUG: - openai_api_type = "openai" # JUST FOR DEVELOPMENT - logging.debug(f"DEVELOPMENT Using API Type: {openai_api_type}") - else: - openai_api_type = os.environ["AZURE_OPENAI_API_TYPE"] - + openai_api_type = os.environ["AZURE_OPENAI_API_TYPE"] openai_api_key = os.environ["AZURE_OPENAI_API_KEY"] openai_deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] openai_model_name = os.environ["AZURE_OPENAI_COMPLETION_MODEL"] @@ -134,22 +125,18 @@ def get_openai_llm(): traceback.print_exc() -def get_ollama_llm(sanitized_question): - """Returns an Ollama Server instance configured from environment variables""" - llm = Client(host=os.environ.get("OLLAMA_URL")) - # Use the client to make a request +def get_ollama_llm(): + """Returns an Ollama instance configured from environment variables""" try: - if sanitized_question: - response = llm.chat( - model=os.environ.get("OLLAMA_MODEL_NAME"), - messages=[{"role": "user", "content": sanitized_question}], - ) - else: - raise ValueError("Question cannot be None.") - if response: - return response - else: - raise ValueError("Invalid response from Ollama.") + base_url = os.environ.get("OLLAMA_URL") + model = os.environ.get("OLLAMA_MODEL_NAME", "llama2") + + llm = ChatOllama( + base_url=base_url, + model=model, + callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), + ) + return llm except Exception as e: logger.debug(f"Exception in get_ollama_llm: {e}") @@ -157,7 +144,7 @@ def get_ollama_llm(sanitized_question): uuid=str(uuid4()), connection="default", queue="default", - payload="get_openai_llm", + payload="get_ollama_llm", exception=str(e), failed_at=make_aware(datetime.now(), timezone.utc), ) @@ -176,29 +163,26 @@ def get_llm(): "ollama": lambda: get_ollama_llm(), } + # DEVENV + # if settings.DEBUG: + # api_type = "ollama" api_type = os.environ.get("OPENAI_API_TYPE", "openai") + if api_type not in clients: raise ValueError(f"Invalid OPENAI_API_TYPE: {api_type}") logging.debug(f"Using LLM: {api_type}") if api_type in clients: - if api_type == "ollama": - return clients[api_type]() - elif api_type != 
"ollama": - return clients[api_type]() + llm_instance = clients[api_type]() + if llm_instance is None: + logger.error(f"LLM instance for {api_type} could not be created.") + return None + return llm_instance else: raise ValueError(f"Invalid OPENAI_API_TYPE: {api_type}") except Exception as e: - failed_job = FailedJob( - uuid=str(uuid4()), - connection="default", - queue="default", - payload="get_llm", - exception=str(e), - failed_at=datetime.now(), - ) failed_job = FailedJob( uuid=str(uuid4()), connection="default", diff --git a/dj_backend_server/api/utils/make_chain.py b/dj_backend_server/api/utils/make_chain.py index a3fdc870..f8e0a71d 100644 --- a/dj_backend_server/api/utils/make_chain.py +++ b/dj_backend_server/api/utils/make_chain.py @@ -98,6 +98,7 @@ def getConversationRetrievalChain( retriever=vector_store.as_retriever(), verbose=True, combine_docs_chain_kwargs={"prompt": prompt}, + return_source_documents=True, ) logger.debug(f"ConversationalRetrievalChain {llm}, created: {chain}") return chain diff --git a/dj_backend_server/api/views/views_chat.py b/dj_backend_server/api/views/views_chat.py index 166354c7..00a87214 100644 --- a/dj_backend_server/api/views/views_chat.py +++ b/dj_backend_server/api/views/views_chat.py @@ -165,21 +165,15 @@ def get_completion_response( elif chain_type == "conversation_retrieval": chain = getConversationRetrievalChain(vector_store, mode, initial_prompt) logger.debug("getConversationRetrievalChain") - chat_history_json = json.dumps( - get_chat_history_for_retrieval_chain( - session_id, limit=20, initial_prompt=initial_prompt - ), - ensure_ascii=False, + chat_history = get_chat_history_for_retrieval_chain( + session_id, limit=20, initial_prompt=initial_prompt ) - chat_history_json = "" - logger.debug(f"Formatted Chat_history {chat_history_json}") + logger.debug(f"Formatted Chat_history {chat_history}") response = chain.invoke( - {"question": sanitized_question, "chat_history": chat_history_json} + {"question": sanitized_question, "chat_history": chat_history}, ) - logger.debug(f"response from chain.invoke: {response}") response_text = response.get("answer") - logger.debug(f"response_text : {response_text}") try: # Attempt to parse the response_text as JSON response_text = json.loads(response_text) diff --git a/dj_backend_server/api/views/views_message.py b/dj_backend_server/api/views/views_message.py index 6bbce3bb..18ac6faf 100644 --- a/dj_backend_server/api/views/views_message.py +++ b/dj_backend_server/api/views/views_message.py @@ -170,26 +170,23 @@ def send_chat(request): """ try: - if settings.DEBUG: - logger.debug("Entering send_chat function") + logger.debug("Entering send_chat function") # You can add additional validation for 'history' and 'content_type' if needed. bot_token = request.headers.get("X-Bot-Token") bot = get_object_or_404(Chatbot, token=bot_token) data = json.loads(request.body) - if settings.DEBUG: - logger.debug( - f"Request data: {data}" - ) # {'from': 'user', 'type': 'text', 'content': 'input text from chat'} + logger.debug( + f"Request data: {data}" + ) # {'from': 'user', 'type': 'text', 'content': 'input text from chat'} # Validate the request data content = data.get("content") history = data.get("history") - if settings.DEBUG: - logger.debug(f"Content: {content}") - logger.debug( - f"History: {history}" - ) # history is a list of chat history - None???? + logger.debug(f"Content: {content}") + logger.debug( + f"History: {history}" + ) # history is a list of chat history - None???? 
content_type = data.get("type") session_id = get_session_id(request=request, bot_id=bot.id) @@ -198,10 +195,9 @@ def send_chat(request): {"message": entry.message, "from_user": entry.from_user} for entry in history ] - if settings.DEBUG: - logger.debug( - f"History entries in JSON: {history_entries} - and history in text from DB: {history}" - ) + logger.debug( + f"History entries in JSON: {history_entries} - and history in text from DB: {history}" + ) # Implement the equivalent logic for validation if not content: @@ -211,8 +207,7 @@ def send_chat(request): ) # Implement the equivalent logic to send the HTTP request to the external API - if settings.DEBUG: - logger.debug(f"External API response START") + logger.debug(f"External API response START") response = requests.post( os.getenv("APP_URL") + "/api/chat/", json={ @@ -226,8 +221,7 @@ def send_chat(request): }, timeout=200, ) - if settings.DEBUG: - logger.debug(f"External API response: {response.text} and {response}") + logger.debug(f"External API response: {response.text} and {response}") """ This block will first check if the response content is not empty. If it is empty, diff --git a/dj_backend_server/example.env b/dj_backend_server/example.env index 4fee96cd..5e9dcec7 100644 --- a/dj_backend_server/example.env +++ b/dj_backend_server/example.env @@ -19,7 +19,7 @@ OPENAI_API_TYPE=openai OPENAI_API_MODEL=gpt-4-1106-preview OPENAI_API_TEMPERATURE=1 -# azure | openai | llama2 | ollama +# azure | openai | llama2 - change only if you know what you do EMBEDDING_PROVIDER=openai # If using azure @@ -30,22 +30,20 @@ EMBEDDING_PROVIDER=openai # AZURE_OPENAI_DEPLOYMENT_NAME= # AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo - +# OLLAMA_URL="" #no trailing slash at the end or will not work. +# OLLAMA_MODEL_NAME="" # ex openchat, llama2 - Be sure you have this on server downloaded "ollama pull openchat" # Vector Store, PINECONE|QDRANT STORE=QDRANT - # if using pinecone # PINECONE_API_KEY= # PINECONE_ENV= # VECTOR_STORE_INDEX_NAME= - # if using qdrant QDRANT_URL=http://qdrant:6333 - # optional, defaults to 15 MAX_PAGES_CRAWL=150 @@ -73,5 +71,4 @@ OCR_LLM = '1' # retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai # if you want to use the conversation_retrieval | retrieval_qa chain -CHAIN_TYPE=conversation_retrieval - +CHAIN_TYPE=conversation_retrieval \ No newline at end of file diff --git a/dj_backend_server/requirements.txt b/dj_backend_server/requirements.txt index d772269a..e91d0037 100644 --- a/dj_backend_server/requirements.txt +++ b/dj_backend_server/requirements.txt @@ -25,6 +25,9 @@ drf-spectacular==0.27.1 drf_spectacular.extensions==0.0.2 exceptiongroup==1.1.2 frozenlist==1.4.0 +filelock==3.13.1 +fsspec==2024.2.0 +huggingface-hub==0.20.3 grpcio==1.56.2 grpcio-tools==1.56.2 h11==0.14.0 @@ -71,6 +74,7 @@ qdrant-client==1.7.0 redis==4.6.0 regex==2023.6.3 requests==2.31.0 +safetensors==0.4.2 six==1.16.0 sniffio==1.3.0 soupsieve==2.4.1 @@ -79,6 +83,8 @@ sqlparse==0.4.4 tenacity==8.2.2 tiktoken==0.6.0 tqdm==4.65.0 +tokenizers==0.15.2 +transformers==4.37.2 typing-inspect==0.9.0 typing_extensions==4.7.1 tzdata==2023.3 @@ -88,4 +94,3 @@ wcwidth==0.2.6 yarl==1.9.2 django-cors-headers==4.3.1 - diff --git a/dj_backend_server/web/services/chat_history_service.py b/dj_backend_server/web/services/chat_history_service.py index 8a66f940..6e1e011c 100644 --- a/dj_backend_server/web/services/chat_history_service.py +++ b/dj_backend_server/web/services/chat_history_service.py @@ -1,13 +1,18 @@ from typing import List, Optional, 
Tuple, NamedTuple from web.models.chat_histories import ChatHistory from langchain.schema import BaseMessage, AIMessage, HumanMessage +from langchain.memory import ConversationSummaryBufferMemory from django.conf import settings +from api.utils.get_openai_llm import get_llm import logging logging.config.dictConfig(settings.LOGGING) logger = logging.getLogger(__name__) +from langchain.schema import HumanMessage, AIMessage + + def get_chat_history_for_retrieval_chain( session_id: str, limit: Optional[int] = None, initial_prompt: Optional[str] = None ) -> List[dict]: @@ -25,28 +30,30 @@ def get_chat_history_for_retrieval_chain( query = ChatHistory.objects.filter(session_id=session_id).order_by("created_at") if limit: query = query[:limit] - for entry in query: - role = "user" if entry.from_user == "True" else "assistant" - logger.debug(f"Chat history entry: {entry}, role: {role}") - - chat_history = [ - { - "role": "system", - "content": "This is an initial system message setting up the context.", - } - ] - user_query = None - # Directly interpret the from_user flag to assign roles correctly + chat_history = [] + user_query = None + llm = get_llm() + # Now, chat_history is properly defined and can be used to initialize the memory + memory = ConversationSummaryBufferMemory( + llm=llm, + max_token_limit=1024, + memory_key=session_id, + return_messages=True, + ) + # Assuming chat_history is meant to be a list of messages for the memory + # Here you should convert your query results into the desired format for chat_history + # For example, appending dicts to chat_history list as shown below might be what you intended for entry in query: if entry.from_user == "True": - # This entry is a user query; store it to pair with the next bot response + chat_history.append(HumanMessage(content=entry.message)) user_query = entry.message else: - # This entry is a bot response; pair it with the last user query if available + chat_history.append(AIMessage(content=entry.message)) if user_query is not None: - chat_history.append({"role": "user", "content": user_query}) - chat_history.append({"role": "assistant", "content": entry.message}) + memory.save_context({"input": user_query}, {"output": entry.message}) user_query = None + logger.debug(f"Memory PRINT: {memory}") + # chat_history = memory.load_memory_variables({}) return chat_history
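To make the conversational-retrieval data flow above easier to follow, here is a small illustrative sketch (not part of the patch): it mirrors the conversion that get_chat_history_for_retrieval_chain() now performs and shows how views_chat.py hands the result to the chain. The sample rows are hypothetical stand-ins for ChatHistory records, and the chain call is left commented out because it requires a configured vector store.

from langchain.schema import AIMessage, HumanMessage

# Hypothetical rows standing in for ChatHistory.objects.filter(session_id=...)
rows = [
    {"from_user": "True", "message": "What plans do you offer?"},
    {"from_user": "False", "message": "We offer a free plan and a pro plan."},
]

# Same conversion the service performs: user rows become HumanMessage,
# bot rows become AIMessage, in chronological order.
chat_history = [
    HumanMessage(content=row["message"])
    if row["from_user"] == "True"
    else AIMessage(content=row["message"])
    for row in rows
]

# views_chat.py then passes this list directly to the chain:
# response = chain.invoke(
#     {"question": sanitized_question, "chat_history": chat_history}
# )
# response_text = response.get("answer")
# sources = response.get("source_documents")  # available because make_chain.py
#                                             # now sets return_source_documents=True

print(chat_history)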