From 5146d2c1d1bdff3bed65d3022af78428b8bf6a9b Mon Sep 17 00:00:00 2001 From: Micko Date: Tue, 1 Oct 2024 18:54:15 +0700 Subject: [PATCH 1/5] add 0.1.0 opensearch Embedding for langchain community --- .../embeddings/test_opensearch.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 libs/community/tests/integration_tests/embeddings/test_opensearch.py diff --git a/libs/community/tests/integration_tests/embeddings/test_opensearch.py b/libs/community/tests/integration_tests/embeddings/test_opensearch.py new file mode 100644 index 0000000000000..c60d7ef8218ba --- /dev/null +++ b/libs/community/tests/integration_tests/embeddings/test_opensearch.py @@ -0,0 +1,49 @@ +import pytest +from opensearchpy import OpenSearch +from langchain_community.embeddings.opensearch import OpenSearchEmbedding + + +@pytest.fixture +def model_id() -> str: + """Fixture to provide model ID.""" + return "some-model-id" + + +@pytest.fixture +def opensearch_client() -> OpenSearch: + """Fixture to provide OpenSearch client connection.""" + return OpenSearch( + hosts=[{'host': "localhost", 'port': 9200}], # Remove sensitive info + http_auth=("username", "password"), # Remove sensitive info + use_ssl=True, + verify_certs=False + ) + + +@pytest.fixture +def opensearch_embedding(opensearch_client, model_id) -> OpenSearchEmbedding: + return OpenSearchEmbedding.from_opensearch_connection(opensearch_client, model_id) + + +def test_opensearch_embedding_documents(opensearch_embedding: OpenSearchEmbedding) -> None: + """ + Test OpenSearch embedding documents. + Convert a list of strings, into a list of floats with the shape of its element and its + embedding vector dimensions. 
+ """ + documents = ["foo bar", "bar foo", "foo"] + output = opensearch_embedding.embed_documents(documents) + assert len(output) == 3 + assert len(output[0]) == 768 # Change 768 to the expected embedding size + assert len(output[1]) == 768 # Change 768 to the expected embedding size + assert len(output[2]) == 768 # Change 768 to the expected embedding size + + +def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbedding) -> None: + """ + Test OpenSearch embedding documents. + Convert strings, into floats with the shape of its embedding vector dimensions. + """ + document = "foo bar" + output = opensearch_embedding.embed_query(document) + assert len(output) == 768 From 13af9ab8022ceb14df252d0b400fa792111cee22 Mon Sep 17 00:00:00 2001 From: Micko Date: Tue, 1 Oct 2024 18:59:33 +0700 Subject: [PATCH 2/5] add 0.1.0 opensearch Embedding for langchain community --- .../embeddings/opensearch.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 libs/community/langchain_community/embeddings/opensearch.py diff --git a/libs/community/langchain_community/embeddings/opensearch.py b/libs/community/langchain_community/embeddings/opensearch.py new file mode 100644 index 0000000000000..fdb771a0cf16b --- /dev/null +++ b/libs/community/langchain_community/embeddings/opensearch.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + from opensearchpy import OpenSearch + +import json +from langchain_core.embeddings import Embeddings + +class OpenSearchEmbedding(Embeddings): + def __init__( + self, + client: OpenSearch, + model_id: str, + ): + self.client = client + self.model_id = model_id + + @classmethod + def from_opensearch_connection( + cls, + opensearch_connection: OpenSearch, + model_id: str, + ) -> OpenSearchEmbedding: + """ + Class method to create an OpenSearchEmbedding object from an OpenSearch connection. 
+ + Args: + opensearch_connection (OpenSearch): The OpenSearch connection. + model_id (str): The ML model ID for generating embeddings. + input_field (str, optional): The input field for the text (default: 'text_field'). + + Returns: + OpenSearchEmbedding: An instance of the OpenSearchEmbedding class. + """ + return cls(opensearch_connection, model_id) + + def _embedding_func(self, texts: List[str]) -> List[List[float]]: + """ + Internal method that sends a request to OpenSearch's text embedding endpoint + and retrieves embeddings for the provided texts. + + Args: + texts (List[str]): A list of strings to be embedded. + + Returns: + List[List[float]]: A list of embeddings, where each embedding is a list of floats. + """ + endpoint = f"/_plugins/_ml/_predict/text_embedding/{self.model_id}" + body = { + "text_docs": texts, + "return_number": True, + "target_response": ["sentence_embedding"] + } + + response = self.client.transport.perform_request( + method="POST", + url=endpoint, + body=json.dumps(body), + ) + # Extract embeddings from the response + embeddings = [item['output'][0]['data'] for item in response['inference_results']] + return embeddings + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """ + Generate embeddings for a list of documents. + + Args: + texts (List[str]): A list of text documents to embed. + + Returns: + List[List[float]]: A list of embeddings for each document. + """ + return self._embedding_func(texts) + + def embed_query(self, text: str) -> List[float]: + """ + Generate an embedding for a single query. + + Args: + text (str): The text query to embed. + + Returns: + List[float]: The embedding for the query. 
+ """ + return self._embedding_func([text])[0] From 7feca1688cf0a15592e3ea9c21783df3803cc44f Mon Sep 17 00:00:00 2001 From: Micko Date: Tue, 1 Oct 2024 22:22:36 +0700 Subject: [PATCH 3/5] Fix lint and formatting in opnesearch and its test --- .../embeddings/opensearch.py | 31 ++++++------ .../embeddings/test_opensearch.py | 47 ++++++++++++------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/libs/community/langchain_community/embeddings/opensearch.py b/libs/community/langchain_community/embeddings/opensearch.py index fdb771a0cf16b..498813e53af92 100644 --- a/libs/community/langchain_community/embeddings/opensearch.py +++ b/libs/community/langchain_community/embeddings/opensearch.py @@ -1,14 +1,15 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING, List +from langchain_core.embeddings import Embeddings + if TYPE_CHECKING: from opensearchpy import OpenSearch -import json -from langchain_core.embeddings import Embeddings -class OpenSearchEmbedding(Embeddings): +class OpenSearchEmbeddings(Embeddings): def __init__( self, client: OpenSearch, @@ -18,40 +19,41 @@ def __init__( self.model_id = model_id @classmethod - def from_opensearch_connection( + def from_connection( cls, opensearch_connection: OpenSearch, model_id: str, - ) -> OpenSearchEmbedding: + ) -> OpenSearchEmbeddings: """ - Class method to create an OpenSearchEmbedding object from an OpenSearch connection. + Class method to create an OpenSearchEmbeddings object + from an OpenSearch connection. Args: opensearch_connection (OpenSearch): The OpenSearch connection. model_id (str): The ML model ID for generating embeddings. - input_field (str, optional): The input field for the text (default: 'text_field'). Returns: - OpenSearchEmbedding: An instance of the OpenSearchEmbedding class. + OpenSearchEmbeddings: An instance of the OpenSearchEmbedding class. 
""" return cls(opensearch_connection, model_id) def _embedding_func(self, texts: List[str]) -> List[List[float]]: """ - Internal method that sends a request to OpenSearch's text embedding endpoint - and retrieves embeddings for the provided texts. + Internal method that sends a request to OpenSearch's text + embedding endpoint and retrieves embeddings for the provided texts. Args: texts (List[str]): A list of strings to be embedded. Returns: - List[List[float]]: A list of embeddings, where each embedding is a list of floats. + List[List[float]]: A list of embeddings, + where each embedding is a list of floats. """ endpoint = f"/_plugins/_ml/_predict/text_embedding/{self.model_id}" body = { "text_docs": texts, "return_number": True, - "target_response": ["sentence_embedding"] + "target_response": ["sentence_embedding"], } response = self.client.transport.perform_request( @@ -59,8 +61,9 @@ def _embedding_func(self, texts: List[str]) -> List[List[float]]: url=endpoint, body=json.dumps(body), ) - # Extract embeddings from the response - embeddings = [item['output'][0]['data'] for item in response['inference_results']] + embeddings = [ + item["output"][0]["data"] for item in response["inference_results"] + ] return embeddings def embed_documents(self, texts: List[str]) -> List[List[float]]: diff --git a/libs/community/tests/integration_tests/embeddings/test_opensearch.py b/libs/community/tests/integration_tests/embeddings/test_opensearch.py index c60d7ef8218ba..98ceb332df035 100644 --- a/libs/community/tests/integration_tests/embeddings/test_opensearch.py +++ b/libs/community/tests/integration_tests/embeddings/test_opensearch.py @@ -1,6 +1,9 @@ +from typing import List + import pytest from opensearchpy import OpenSearch -from langchain_community.embeddings.opensearch import OpenSearchEmbedding + +from langchain_community.embeddings.opensearch import OpenSearchEmbeddings @pytest.fixture @@ -10,40 +13,48 @@ def model_id() -> str: @pytest.fixture -def opensearch_client() 
-> OpenSearch: +def client() -> OpenSearch: """Fixture to provide OpenSearch client connection.""" return OpenSearch( - hosts=[{'host': "localhost", 'port': 9200}], # Remove sensitive info + hosts=[{"host": "localhost", "port": 9200}], # Remove sensitive info http_auth=("username", "password"), # Remove sensitive info use_ssl=True, - verify_certs=False + verify_certs=False, ) @pytest.fixture -def opensearch_embedding(opensearch_client, model_id) -> OpenSearchEmbedding: - return OpenSearchEmbedding.from_opensearch_connection(opensearch_client, model_id) +def opensearch_embedding(client: OpenSearch, model_id: str) -> OpenSearchEmbeddings: + """Fixture to provide OpenSearch embeddings connection.""" + return OpenSearchEmbeddings.from_connection(client, model_id) + + +@pytest.fixture +def documents() -> List[str]: + """Fixture for test documents.""" + return ["foo bar", "bar foo", "foo"] -def test_opensearch_embedding_documents(opensearch_embedding: OpenSearchEmbedding) -> None: +def test_opensearch_embedding_documents( + opensearch_embedding: OpenSearchEmbeddings, documents: List[str] +) -> None: """ Test OpenSearch embedding documents. - Convert a list of strings, into a list of floats with the shape of its element and its - embedding vector dimensions. + Convert a list of strings into a list of floats, + with each element having the shape of its embedding vector dimensions. 
""" - documents = ["foo bar", "bar foo", "foo"] output = opensearch_embedding.embed_documents(documents) - assert len(output) == 3 - assert len(output[0]) == 768 # Change 768 to the expected embedding size - assert len(output[1]) == 768 # Change 768 to the expected embedding size - assert len(output[2]) == 768 # Change 768 to the expected embedding size + assert len(output) == len(documents) + for embedding in output: + assert len(embedding) == 768 # Expected embedding size -def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbedding) -> None: +def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbeddings) -> None: """ - Test OpenSearch embedding documents. - Convert strings, into floats with the shape of its embedding vector dimensions. + Test OpenSearch embedding query. + Convert a string into a float array, with the shape + corresponding to its embedding vector dimensions. """ document = "foo bar" output = opensearch_embedding.embed_query(document) - assert len(output) == 768 + assert len(output) == 768 # Expected embedding size From 74e07ce4f94cb6e327aa95118301e5584c449cea Mon Sep 17 00:00:00 2001 From: Micko Date: Tue, 1 Oct 2024 22:34:48 +0700 Subject: [PATCH 4/5] adding OpenSearch.ipynb to /docs/docs/text_embedding --- .../text_embedding/opensearch.ipynb | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/docs/integrations/text_embedding/opensearch.ipynb diff --git a/docs/docs/integrations/text_embedding/opensearch.ipynb b/docs/docs/integrations/text_embedding/opensearch.ipynb new file mode 100644 index 0000000000000..da37d568d1b7c --- /dev/null +++ b/docs/docs/integrations/text_embedding/opensearch.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "766a731c-fdc8-43a9-ab6a-aae8b3d82720", + "metadata": {}, + "source": [ + "# OpenSearch\n", + "\n", + "A guide to using embeddings with OpenSearch ML Plugins. 
Ensure that your OpenSearch cluster has the embedding plugins installed.\n", + "\n", + "For more information, visit: https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b0f624d-b469-4974-acd0-a8c8b74b5f48", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.embeddings.opensearch import OpenSearchEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "213122f3-169e-4fa6-99cb-c8a3bc77aff8", + "metadata": {}, + "outputs": [], + "source": [ + "#Let's initialized opensearch client using opensearchpy\n", + "from opensearchpy import OpenSearch\n", + "\n", + "client = OpenSearch(\n", + " hosts=[{'host': \"localhost\", 'port': 9200}],\n", + " http_auth=(\"username\", \"password\"),\n", + " use_ssl=True,\n", + " verify_certs=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "029f030e-06b0-40ec-8848-f1a91c8762f6", + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"embedding_model_id\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "013eabdf-9fbf-41f9-a932-7b580f2ece49", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenSearchEmbeddings.from_opensearch_connection(opensearch_client, model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "1c34d540-f642-48ef-ba10-be3f8948b6c7", + "metadata": {}, + "source": [ + "### Embedding documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8432efd-6315-4dcf-92a4-1a772c5caa9d", + "metadata": {}, + "outputs": [], + "source": [ + "documents = [\"Foo\", \"Bar\", \"Foo Bar\"]\n", + "embedded_documents = embeddings.embed_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "829001a3-2213-4eb3-9942-ee2583ff5577", + "metadata": {}, + "outputs": [], + "source": [ + "for i, doc in enumerate(documents):\n", + " 
print(f\"Document: {doc}\")\n", + " print(f\"Embedding: {embedded_documents[i][:5]}...\") # Show first 5 values to avoid overwhelming output\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "4b831983-1c2f-4b75-a36e-e1fea374cb1c", + "metadata": {}, + "source": [ + "### Embedding a query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c13f2f2-2357-4e31-a432-9a34d70bcc9a", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Hello World!\"\n", + "embedded_query = embeddings.embed_query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ad525df-e411-4f9c-a796-f8c388b21d7e", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Query Embedding:\")\n", + "print(f\"Query: {query}\")\n", + "print(f\"Embedding: {embedded_query[:5]}...\") # Show first 5 values of the embedding" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "vllm-langchain", + "language": "python", + "name": "vllm-langchain" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 742e1fc6b6de262368f430ac289865a94eb59f2a Mon Sep 17 00:00:00 2001 From: Micko Date: Thu, 3 Oct 2024 14:39:02 +0700 Subject: [PATCH 5/5] fixing error from_opensearch_connection to from_connection --- docs/docs/integrations/text_embedding/opensearch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations/text_embedding/opensearch.ipynb b/docs/docs/integrations/text_embedding/opensearch.ipynb index da37d568d1b7c..fb5aea1904217 100644 --- a/docs/docs/integrations/text_embedding/opensearch.ipynb +++ b/docs/docs/integrations/text_embedding/opensearch.ipynb @@ -57,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "embeddings = 
OpenSearchEmbeddings.from_opensearch_connection(opensearch_client, model_id)" + "embeddings = OpenSearchEmbeddings.from_connection(client, model_id)" ] }, {