From 6ba8a5a1e4a886ebe44af90d9ce3eb842b24b697 Mon Sep 17 00:00:00 2001 From: Javier Puerto Date: Sun, 17 Mar 2024 11:21:07 +0100 Subject: [PATCH 1/3] Add support for Text Embeddings Inference (TEI). --- src/initialize.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/initialize.py b/src/initialize.py index 42dd93b..63e2c15 100644 --- a/src/initialize.py +++ b/src/initialize.py @@ -6,7 +6,7 @@ import yaml from langchain.chains import RetrievalQA from langchain.chat_models import ChatOpenAI -from langchain.embeddings import HuggingFaceEmbeddings +from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain.prompts import ( ChatPromptTemplate, HumanMessagePromptTemplate, @@ -82,10 +82,7 @@ def _init_vector_store_pinecone(config_loader): ) index_name = config_loader["vector_store_index_name"] index = pinecone.Index(index_name) - embeddings = HuggingFaceEmbeddings( - model_name=config_loader["embeddings_model_name"], - model_kwargs={"device": "cpu"}, - ) + embeddings = _init_embeddings(config_loader=config_loader) vector_store = Pinecone(index, embeddings.embed_query, "text") logger.info(pinecone.describe_index(index_name)) logger.info(index.describe_index_stats()) @@ -103,10 +100,7 @@ def _init_vector_store_supabase(config_loader): supabase_key=os.environ.get("SUPABASE_API_KEY"), options=ClientOptions(postgrest_client_timeout=60), ) - embeddings = HuggingFaceEmbeddings( - model_name=config_loader["embeddings_model_name"], - model_kwargs={"device": "cpu"}, - ) + embeddings = _init_embeddings(config_loader) vector_store = StandardSupabaseVectorStore( client=supabase_client, embedding=embeddings, @@ -116,7 +110,6 @@ def _init_vector_store_supabase(config_loader): logger.info("Initialized vector store") return vector_store - def _init_vector_stores_qdrant(config_loader): logger = lg.getLogger(_init_vector_stores_qdrant.__name__) logger.info("Initializing vector stores") @@ -125,10 
+118,7 @@ def _init_vector_stores_qdrant(config_loader): api_key=os.environ["QDRANT_API_KEY"], prefer_grpc=True, ) - embeddings = HuggingFaceEmbeddings( - model_name=config_loader["embeddings_model_name"], - model_kwargs={"device": "cpu"}, - ) + embeddings = _init_embeddings(config_loader) vector_stores = {} for collection_name in config_loader["collections"]: if not _exists_collection(qdrant_client, collection_name): @@ -145,6 +135,16 @@ def _init_vector_stores_qdrant(config_loader): logger.info("Initialized vector store for collection [%s]", collection_name) return vector_stores +def _init_embeddings(config_loader): + model: str = config_loader["embeddings_model_name"] + if model.startswith('http'): + return HuggingFaceHubEmbeddings(model=model) + else: + return HuggingFaceEmbeddings( + model_name=config_loader["embeddings_model_name"], + model_kwargs={"device": "cpu"}, + ) + def _init_openai_client(): logger = lg.getLogger(_init_openai_client.__name__) From ac9a5e447c60cde58cbe4f02dd84567f9b619824 Mon Sep 17 00:00:00 2001 From: Javier Puerto Date: Sun, 17 Mar 2024 12:30:29 +0100 Subject: [PATCH 2/3] Update requirements and documentation. --- doc/use_tei_for_embeddings.md | 27 +++++++++++++++++++++++++++ requirements.txt | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 doc/use_tei_for_embeddings.md diff --git a/doc/use_tei_for_embeddings.md b/doc/use_tei_for_embeddings.md new file mode 100644 index 0000000..a067764 --- /dev/null +++ b/doc/use_tei_for_embeddings.md @@ -0,0 +1,27 @@ +# Use Text Embeddings Inference (TEI) support + +TEI is an optimized toolkit for deploying and serving text embeddings and sequence classification models. + +See more information in the [TEI documentation](https://huggingface.co/docs/text-embeddings-inference/index). + +**Current limitation:** The `chunk_size` option must be 510 or lower to work. I was not able to configure a higher size. 
+ +## How to use + +It is simple: just run a Docker image suitable for your [compatible hardware](https://huggingface.co/docs/text-embeddings-inference/supported_models), like the following: + +```shell +docker run --gpus all -e HUGGING_FACE_HUB_TOKEN= -p 8080:80 -v :/data ghcr.io/huggingface/text-embeddings-inference:turing-1.1 --model-id dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn --max-client-batch-size 64 +``` + +The previous command will start a new service with the model `dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn` ready to generate embeddings. + +In justicio's configuration, limit `chunk_size` to 510 and change the `embeddings_model_name` to the URL where the TEI service is running and listening, like *http://localhost:8080*. + +You will need to set an environment variable with the HF token in the environment where justicio is running. + +```shell +HUGGINGFACEHUB_API_TOKEN= python -m src.etls.boja.load dates 2024/01/01 2024/01/31 +``` + +Embeddings will be generated using TEI and embedded into the configured vector database (only tested with Qdrant). 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt index dab0ff7..1617773 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,9 +13,9 @@ retry==0.9.2 typer==0.9.0 schedule==1.2.1 -langchain==0.0.305 +langchain==0.0.354 # langchainplus-sdk==0.0.20 -langsmith==0.0.41 +langsmith==0.0.92 qdrant-client==1.5.4 supabase==1.0.2 pinecone-client==2.2.2 From 3552d329aebdb6b453c6e477347455126df709a4 Mon Sep 17 00:00:00 2001 From: Javier Puerto Date: Sun, 17 Mar 2024 15:34:31 +0100 Subject: [PATCH 3/3] Update src/initialize.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Darío López Padial --- src/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/initialize.py b/src/initialize.py index 63e2c15..16e03e0 100644 --- a/src/initialize.py +++ b/src/initialize.py @@ -141,7 +141,7 @@ def _init_embeddings(config_loader): return HuggingFaceHubEmbeddings(model=model) else: return HuggingFaceEmbeddings( - model_name=config_loader["embeddings_model_name"], + model_name=model, model_kwargs={"device": "cpu"}, )