Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Opensearch embedding #27025

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions docs/docs/integrations/text_embedding/opensearch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "766a731c-fdc8-43a9-ab6a-aae8b3d82720",
"metadata": {},
"source": [
"# OpenSearch\n",
"\n",
"A guide to using embeddings with OpenSearch ML Plugins. Ensure that your OpenSearch cluster has the embedding plugins installed.\n",
"\n",
"For more information, visit: https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0f624d-b469-4974-acd0-a8c8b74b5f48",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings.opensearch import OpenSearchEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "213122f3-169e-4fa6-99cb-c8a3bc77aff8",
"metadata": {},
"outputs": [],
"source": [
"#Let's initialized opensearch client using opensearchpy\n",
"from opensearchpy import OpenSearch\n",
"\n",
"client = OpenSearch(\n",
" hosts=[{'host': \"localhost\", 'port': 9200}],\n",
" http_auth=(\"username\", \"password\"),\n",
" use_ssl=True,\n",
" verify_certs=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "029f030e-06b0-40ec-8848-f1a91c8762f6",
"metadata": {},
"outputs": [],
"source": [
"model_id = \"embedding_model_id\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "013eabdf-9fbf-41f9-a932-7b580f2ece49",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenSearchEmbeddings.from_connection(opensearch_client, model_id)"
]
},
{
"cell_type": "markdown",
"id": "1c34d540-f642-48ef-ba10-be3f8948b6c7",
"metadata": {},
"source": [
"### Embedding documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8432efd-6315-4dcf-92a4-1a772c5caa9d",
"metadata": {},
"outputs": [],
"source": [
"documents = [\"Foo\", \"Bar\", \"Foo Bar\"]\n",
"embedded_documents = embeddings.embed_documents(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "829001a3-2213-4eb3-9942-ee2583ff5577",
"metadata": {},
"outputs": [],
"source": [
"for i, doc in enumerate(documents):\n",
" print(f\"Document: {doc}\")\n",
" print(f\"Embedding: {embedded_documents[i][:5]}...\") # Show first 5 values to avoid overwhelming output\n",
" print(\"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "4b831983-1c2f-4b75-a36e-e1fea374cb1c",
"metadata": {},
"source": [
"### Embedding a query"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c13f2f2-2357-4e31-a432-9a34d70bcc9a",
"metadata": {},
"outputs": [],
"source": [
"query = \"Hello World!\"\n",
"embedded_query = embeddings.embed_query(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ad525df-e411-4f9c-a796-f8c388b21d7e",
"metadata": {},
"outputs": [],
"source": [
"print(\"Query Embedding:\")\n",
"print(f\"Query: {query}\")\n",
"print(f\"Embedding: {embedded_query[:5]}...\") # Show first 5 values of the embedding"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "vllm-langchain",
"language": "python",
"name": "vllm-langchain"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
91 changes: 91 additions & 0 deletions libs/community/langchain_community/embeddings/opensearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING, List

from langchain_core.embeddings import Embeddings

if TYPE_CHECKING:
from opensearchpy import OpenSearch


class OpenSearchEmbeddings(Embeddings):
    """Embeddings backed by an OpenSearch cluster's ML Commons plugin.

    Texts are sent to the cluster's ``_plugins/_ml/_predict/text_embedding``
    endpoint and the resulting sentence-embedding vectors are returned. The
    cluster must have the ML Commons plugin installed and a text-embedding
    model deployed under the given ``model_id``.
    """

    def __init__(
        self,
        client: OpenSearch,
        model_id: str,
    ) -> None:
        """Initialize with an OpenSearch client and a deployed model ID.

        Args:
            client (OpenSearch): An initialized OpenSearch client.
            model_id (str): The ML model ID used to generate embeddings.
        """
        self.client = client
        self.model_id = model_id

    @classmethod
    def from_connection(
        cls,
        opensearch_connection: OpenSearch,
        model_id: str,
    ) -> OpenSearchEmbeddings:
        """
        Class method to create an OpenSearchEmbeddings object
        from an OpenSearch connection.

        Args:
            opensearch_connection (OpenSearch): The OpenSearch connection.
            model_id (str): The ML model ID for generating embeddings.

        Returns:
            OpenSearchEmbeddings: An instance of the OpenSearchEmbeddings class.
        """
        return cls(opensearch_connection, model_id)

    def _embedding_func(self, texts: List[str]) -> List[List[float]]:
        """
        Internal method that sends a request to OpenSearch's text
        embedding endpoint and retrieves embeddings for the provided texts.

        Args:
            texts (List[str]): A list of strings to be embedded.

        Returns:
            List[List[float]]: A list of embeddings,
            where each embedding is a list of floats.
        """
        # Avoid a network round-trip (and a potential server-side error)
        # for an empty batch.
        if not texts:
            return []

        endpoint = f"/_plugins/_ml/_predict/text_embedding/{self.model_id}"
        body = {
            "text_docs": texts,
            "return_number": True,
            "target_response": ["sentence_embedding"],
        }

        response = self.client.transport.perform_request(
            method="POST",
            url=endpoint,
            body=json.dumps(body),
        )
        # One inference result per input text; the first output tensor of
        # each result holds the sentence-embedding values.
        return [
            item["output"][0]["data"] for item in response["inference_results"]
        ]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of documents.

        Args:
            texts (List[str]): A list of text documents to embed.

        Returns:
            List[List[float]]: A list of embeddings for each document.
        """
        return self._embedding_func(texts)

    def embed_query(self, text: str) -> List[float]:
        """
        Generate an embedding for a single query.

        Args:
            text (str): The text query to embed.

        Returns:
            List[float]: The embedding for the query.
        """
        return self._embedding_func([text])[0]
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import List

import pytest
from opensearchpy import OpenSearch

from langchain_community.embeddings.opensearch import OpenSearchEmbeddings


@pytest.fixture
def model_id() -> str:
    """Return the ID of the deployed embedding model under test."""
    deployed_model = "some-model-id"
    return deployed_model


@pytest.fixture
def client() -> OpenSearch:
    """Build an OpenSearch client for the integration tests.

    Connection details default to a local cluster but can be overridden via
    the OPENSEARCH_HOST / OPENSEARCH_PORT / OPENSEARCH_USERNAME /
    OPENSEARCH_PASSWORD environment variables, so real credentials never
    need to be hard-coded in the test file.
    """
    import os

    host = os.environ.get("OPENSEARCH_HOST", "localhost")
    port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
    username = os.environ.get("OPENSEARCH_USERNAME", "username")
    password = os.environ.get("OPENSEARCH_PASSWORD", "password")
    return OpenSearch(
        hosts=[{"host": host, "port": port}],
        http_auth=(username, password),
        use_ssl=True,
        # Test clusters commonly use self-signed certs; skip verification.
        verify_certs=False,
    )


@pytest.fixture
def opensearch_embedding(client: OpenSearch, model_id: str) -> OpenSearchEmbeddings:
    """Wire the client and model ID into an embeddings instance."""
    embeddings = OpenSearchEmbeddings.from_connection(client, model_id)
    return embeddings


@pytest.fixture
def documents() -> List[str]:
    """Provide a small batch of sample texts to embed."""
    sample_texts = ["foo bar", "bar foo", "foo"]
    return sample_texts


def test_opensearch_embedding_documents(
    opensearch_embedding: OpenSearchEmbeddings, documents: List[str]
) -> None:
    """
    Embed a batch of documents and check the result shape: one vector per
    input document, each with the model's expected dimensionality.
    """
    vectors = opensearch_embedding.embed_documents(documents)
    assert len(vectors) == len(documents)
    # Every vector should match the deployed model's embedding size.
    assert all(len(vector) == 768 for vector in vectors)


def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbeddings) -> None:
    """
    Embed a single query string and check that the returned vector has the
    model's expected dimensionality.
    """
    query = "foo bar"
    vector = opensearch_embedding.embed_query(query)
    # The deployed model is expected to emit 768-dimensional embeddings.
    assert len(vector) == 768