From 0f0296552b2cc893266fe5b939daaf5afeda99bf Mon Sep 17 00:00:00 2001 From: Clemens Siebler Date: Wed, 5 Jul 2023 09:21:57 +0200 Subject: [PATCH] Moved LLM/Embeddings Model into LLMHelper to further decouple code --- backend/utilities/DocumentProcessor.py | 30 ++++------------------- backend/utilities/LLMHelper.py | 34 ++++++++++++++++++++++++++ backend/utilities/QuestionHandler.py | 18 +++----------- 3 files changed, 42 insertions(+), 40 deletions(-) create mode 100644 backend/utilities/LLMHelper.py diff --git a/backend/utilities/DocumentProcessor.py b/backend/utilities/DocumentProcessor.py index e270f648c..4bd6f019e 100644 --- a/backend/utilities/DocumentProcessor.py +++ b/backend/utilities/DocumentProcessor.py @@ -1,24 +1,21 @@ -from .formrecognizer import AzureFormRecognizerClient -from .azureblobstorage import AzureBlobStorageClient import os -import openai from dotenv import load_dotenv import logging import re import hashlib from typing import Optional -from langchain.embeddings.openai import OpenAIEmbeddings from langchain.vectorstores.base import VectorStore from langchain.document_loaders.base import BaseLoader from langchain.document_loaders import WebBaseLoader from langchain.text_splitter import TokenTextSplitter, TextSplitter from langchain.document_loaders.base import BaseLoader -from opencensus.ext.azure.log_exporter import AzureLogHandler - +from .formrecognizer import AzureFormRecognizerClient +from .azureblobstorage import AzureBlobStorageClient from .azuresearch import AzureSearch +from .LLMHelper import LLMHelper from .ConfigHelper import ConfigHelper import pandas as pd @@ -28,10 +25,7 @@ class DocumentProcessor: - def __init__( - self - ): - + def __init__(self): self.pdf_parser: AzureFormRecognizerClient = AzureFormRecognizerClient() self.blob_client: AzureBlobStorageClient = AzureBlobStorageClient() self.user_agent: UserAgent = UserAgent() @@ -48,21 +42,7 @@ def __init__( self.azure_search_key: str = os.getenv("AZURE_SEARCH_KEY") 
self.index_name: str = os.getenv("AZURE_SEARCH_INDEX") - os.environ["OPENAI_API_BASE"] = f"https://{os.getenv('AZURE_OPENAI_RESOURCE')}.openai.azure.com/" - os.environ["OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_KEY") - os.environ["OPENAI_API_VERSION"] = os.getenv("AZURE_OPENAI_API_VERSION") - - openai.api_type = "azure" - openai.api_base = os.getenv("OPENAI_API_BASE") - openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") - openai.api_key = os.getenv("OPENAI_API_KEY") - - # Azure OpenAI settings - self.api_base = openai.api_base - self.api_version = openai.api_version - - self.model: str = os.getenv("OPENAI_EMBEDDINGS_ENGINE_DOC", "text-embedding-ada-002") - self.embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=self.model, chunk_size=1) + self.embeddings = LLMHelper().get_embedding_model() self.vector_store: VectorStore = AzureSearch( azure_cognitive_search_name=self.azure_search_endpoint, diff --git a/backend/utilities/LLMHelper.py b/backend/utilities/LLMHelper.py new file mode 100644 index 000000000..bddb551f8 --- /dev/null +++ b/backend/utilities/LLMHelper.py @@ -0,0 +1,34 @@
+import os
+import openai
+from langchain.chat_models import AzureChatOpenAI
+from langchain.embeddings.openai import OpenAIEmbeddings
+from dotenv import load_dotenv
+
+class LLMHelper:
+    """Configure the Azure OpenAI client from the environment and build the chat and embedding models."""
+    def __init__(self):
+        """Load settings, configure the `openai` module, and create both models."""
+        load_dotenv()  # load a local .env file (if present) so the helper does not rely on its callers
+        # os.environ rejects None, so AZURE_OPENAI_KEY and AZURE_OPENAI_API_VERSION must be set.
+        os.environ["OPENAI_API_BASE"] = f"https://{os.getenv('AZURE_OPENAI_RESOURCE')}.openai.azure.com/"
+        os.environ["OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_KEY")
+        os.environ["OPENAI_API_VERSION"] = os.getenv("AZURE_OPENAI_API_VERSION")
+
+        # Configure OpenAI API
+        openai.api_type = "azure"
+        openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")
+        openai.api_base = os.getenv('OPENAI_API_BASE')
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+
+        # Environment values are strings: convert the token limit, and pass None when unset or empty.
+        max_tokens = os.getenv('AZURE_OPENAI_MAX_TOKENS')
+        self.llm = AzureChatOpenAI(deployment_name=os.getenv("AZURE_OPENAI_MODEL"), temperature=0, max_tokens=int(max_tokens) if max_tokens else None, openai_api_version=openai.api_version)
+        self.embedding_model = OpenAIEmbeddings(model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"), chunk_size=1)
+
+    def get_llm(self):
+        """Return the chat model created in __init__."""
+        return self.llm
+
+    def get_embedding_model(self):
+        """Return the embedding model created in __init__."""
+        return self.embedding_model
diff --git a/backend/utilities/QuestionHandler.py b/backend/utilities/QuestionHandler.py index 1986f2780..9798e6d6e 100644 --- a/backend/utilities/QuestionHandler.py +++ b/backend/utilities/QuestionHandler.py @@ -5,11 +5,8 @@ import json from azuresearch import AzureSearch from langchain.chains.qa_with_sources import load_qa_with_sources_chain -from langchain.chat_models import AzureChatOpenAI -from langchain.embeddings.openai import OpenAIEmbeddings from dotenv import load_dotenv from langchain.chains.llm import LLMChain -from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT from langchain.chains import ConversationalRetrievalChain from langchain.prompts import PromptTemplate from langchain.callbacks import get_openai_callback @@ -17,6 +14,7 @@ from .azuresearch import AzureSearch from .ConfigHelper import ConfigHelper +from .LLMHelper import LLMHelper from .azureblobstorage import AzureBlobStorageClient @@ -32,19 +30,9 @@ class QuestionHandler: def __init__(self): load_dotenv() - - os.environ["OPENAI_API_BASE"] = f"https://{os.getenv('AZURE_OPENAI_RESOURCE')}.openai.azure.com/" - os.environ["OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_KEY") - os.environ["OPENAI_API_VERSION"] = os.getenv("AZURE_OPENAI_API_VERSION") - - # Configure OpenAI API - openai.api_type = "azure" - openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") - openai.api_base = os.getenv('OPENAI_API_BASE') - openai.api_key = os.getenv("OPENAI_API_KEY") - self.llm = AzureChatOpenAI(deployment_name=os.getenv("AZURE_OPENAI_MODEL"), temperature=0, max_tokens=os.getenv('AZURE_OPENAI_MAX_TOKENS', None), openai_api_version=openai.api_version) - self.embeddings = OpenAIEmbeddings(model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"), chunk_size=1) + self.llm = LLMHelper().get_llm() +
self.embeddings = LLMHelper().get_embedding_model() # Connect to search self.vector_store = AzureSearch(