diff --git a/docs/docs/integrations/text_embedding/opensearch.ipynb b/docs/docs/integrations/text_embedding/opensearch.ipynb
new file mode 100644
index 0000000000000..fb5aea1904217
--- /dev/null
+++ b/docs/docs/integrations/text_embedding/opensearch.ipynb
@@ -0,0 +1,148 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "766a731c-fdc8-43a9-ab6a-aae8b3d82720",
+   "metadata": {},
+   "source": [
+    "# OpenSearch\n",
+    "\n",
+    "A guide to using embeddings with OpenSearch ML Plugins. Ensure that your OpenSearch cluster has the embedding plugins installed.\n",
+    "\n",
+    "For more information, visit: https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b0f624d-b469-4974-acd0-a8c8b74b5f48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.embeddings.opensearch import OpenSearchEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "213122f3-169e-4fa6-99cb-c8a3bc77aff8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize an OpenSearch client using opensearch-py\n",
+    "from opensearchpy import OpenSearch\n",
+    "\n",
+    "client = OpenSearch(\n",
+    "    hosts=[{'host': \"localhost\", 'port': 9200}],\n",
+    "    http_auth=(\"username\", \"password\"),\n",
+    "    use_ssl=True,\n",
+    "    verify_certs=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "029f030e-06b0-40ec-8848-f1a91c8762f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"embedding_model_id\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "013eabdf-9fbf-41f9-a932-7b580f2ece49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = OpenSearchEmbeddings.from_connection(client, model_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1c34d540-f642-48ef-ba10-be3f8948b6c7",
+   "metadata": {},
+   "source": [
+    "### Embedding documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8432efd-6315-4dcf-92a4-1a772c5caa9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "documents = [\"Foo\", \"Bar\", \"Foo Bar\"]\n",
+    "embedded_documents = embeddings.embed_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "829001a3-2213-4eb3-9942-ee2583ff5577",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, doc in enumerate(documents):\n",
+    "    print(f\"Document: {doc}\")\n",
+    "    print(f\"Embedding: {embedded_documents[i][:5]}...\") # Show first 5 values to avoid overwhelming output\n",
+    "    print(\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b831983-1c2f-4b75-a36e-e1fea374cb1c",
+   "metadata": {},
+   "source": [
+    "### Embedding a query"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c13f2f2-2357-4e31-a432-9a34d70bcc9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"Hello World!\"\n",
+    "embedded_query = embeddings.embed_query(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ad525df-e411-4f9c-a796-f8c388b21d7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Query Embedding:\")\n",
+    "print(f\"Query: {query}\")\n",
+    "print(f\"Embedding: {embedded_query[:5]}...\") # Show first 5 values of the embedding"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "vllm-langchain",
+   "language": "python",
+   "name": "vllm-langchain"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/community/langchain_community/embeddings/opensearch.py b/libs/community/langchain_community/embeddings/opensearch.py
new file mode 100644
index 0000000000000..498813e53af92
--- /dev/null
+++
b/libs/community/langchain_community/embeddings/opensearch.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, List
+
+from langchain_core.embeddings import Embeddings
+
+if TYPE_CHECKING:
+    from opensearchpy import OpenSearch
+
+
+class OpenSearchEmbeddings(Embeddings):
+    """Embed texts with a model deployed via OpenSearch ML Commons.
+
+    Requires an OpenSearch cluster with the ML Commons plugin and a
+    deployed text-embedding model, identified by ``model_id``.
+    """
+
+    def __init__(
+        self,
+        client: OpenSearch,
+        model_id: str,
+    ):
+        """Initialize the embedder.
+
+        Args:
+            client (OpenSearch): Client connected to the cluster.
+            model_id (str): ID of the deployed ML embedding model.
+        """
+        self.client = client
+        self.model_id = model_id
+
+    @classmethod
+    def from_connection(
+        cls,
+        opensearch_connection: OpenSearch,
+        model_id: str,
+    ) -> OpenSearchEmbeddings:
+        """
+        Class method to create an OpenSearchEmbeddings object
+        from an OpenSearch connection.
+
+        Args:
+            opensearch_connection (OpenSearch): The OpenSearch connection.
+            model_id (str): The ML model ID for generating embeddings.
+
+        Returns:
+            OpenSearchEmbeddings: An instance of the OpenSearchEmbeddings class.
+        """
+        return cls(opensearch_connection, model_id)
+
+    def _embedding_func(self, texts: List[str]) -> List[List[float]]:
+        """
+        Internal method that sends a request to OpenSearch's text
+        embedding endpoint and retrieves embeddings for the provided texts.
+
+        Args:
+            texts (List[str]): A list of strings to be embedded.
+
+        Returns:
+            List[List[float]]: A list of embeddings,
+            where each embedding is a list of floats.
+        """
+        # Nothing to embed; skip the round-trip the ML endpoint would reject.
+        if not texts:
+            return []
+
+        endpoint = f"/_plugins/_ml/_predict/text_embedding/{self.model_id}"
+        body = {
+            "text_docs": texts,
+            "return_number": True,
+            "target_response": ["sentence_embedding"],
+        }
+
+        response = self.client.transport.perform_request(
+            method="POST",
+            url=endpoint,
+            body=json.dumps(body),
+        )
+        # One inference result is returned per input text; each vector
+        # lives at output[0]["data"].
+        return [
+            item["output"][0]["data"] for item in response["inference_results"]
+        ]
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for a list of documents.
+
+        Args:
+            texts (List[str]): A list of text documents to embed.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each document.
+        """
+        return self._embedding_func(texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Generate an embedding for a single query.
+
+        Args:
+            text (str): The text query to embed.
+
+        Returns:
+            List[float]: The embedding for the query.
+        """
+        return self._embedding_func([text])[0]
diff --git a/libs/community/tests/integration_tests/embeddings/test_opensearch.py b/libs/community/tests/integration_tests/embeddings/test_opensearch.py
new file mode 100644
index 0000000000000..98ceb332df035
--- /dev/null
+++ b/libs/community/tests/integration_tests/embeddings/test_opensearch.py
@@ -0,0 +1,63 @@
+from typing import List
+
+import pytest
+from opensearchpy import OpenSearch
+
+from langchain_community.embeddings.opensearch import OpenSearchEmbeddings
+
+# Vector dimension of the deployed model; adjust to match your model.
+EXPECTED_DIM = 768
+
+
+@pytest.fixture
+def model_id() -> str:
+    """Fixture to provide model ID."""
+    return "some-model-id"
+
+
+@pytest.fixture
+def client() -> OpenSearch:
+    """Fixture to provide OpenSearch client connection."""
+    return OpenSearch(
+        hosts=[{"host": "localhost", "port": 9200}],  # Replace with your host
+        http_auth=("username", "password"),  # Replace with real credentials
+        use_ssl=True,
+        verify_certs=False,
+    )
+
+
+@pytest.fixture
+def opensearch_embedding(client: OpenSearch, model_id: str) -> OpenSearchEmbeddings:
+    """Fixture to provide OpenSearch embeddings connection."""
+    return OpenSearchEmbeddings.from_connection(client, model_id)
+
+
+@pytest.fixture
+def documents() -> List[str]:
+    """Fixture for test documents."""
+    return ["foo bar", "bar foo", "foo"]
+
+
+def test_opensearch_embedding_documents(
+    opensearch_embedding: OpenSearchEmbeddings, documents: List[str]
+) -> None:
+    """
+    Test OpenSearch embedding documents.
+    Convert a list of strings into a list of floats,
+    with each element having the shape of its embedding vector dimensions.
+    """
+    output = opensearch_embedding.embed_documents(documents)
+    assert len(output) == len(documents)
+    for embedding in output:
+        assert len(embedding) == EXPECTED_DIM
+
+
+def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbeddings) -> None:
+    """
+    Test OpenSearch embedding query.
+    Convert a string into a float array, with the shape
+    corresponding to its embedding vector dimensions.
+    """
+    document = "foo bar"
+    output = opensearch_embedding.embed_query(document)
+    assert len(output) == EXPECTED_DIM