Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Opensearch embedding #27025

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions docs/docs/integrations/text_embedding/opensearch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "766a731c-fdc8-43a9-ab6a-aae8b3d82720",
"metadata": {},
"source": [
"# OpenSearch\n",
"\n",
"A guide to using embeddings with OpenSearch ML Plugins. Ensure that your OpenSearch cluster has the embedding plugins installed.\n",
"\n",
"For more information, visit: https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0f624d-b469-4974-acd0-a8c8b74b5f48",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings.opensearch import OpenSearchEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "213122f3-169e-4fa6-99cb-c8a3bc77aff8",
"metadata": {},
"outputs": [],
"source": [
"#Let's initialized opensearch client using opensearchpy\n",
"from opensearchpy import OpenSearch\n",
"\n",
"client = OpenSearch(\n",
" hosts=[{'host': \"localhost\", 'port': 9200}],\n",
" http_auth=(\"username\", \"password\"),\n",
" use_ssl=True,\n",
" verify_certs=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "029f030e-06b0-40ec-8848-f1a91c8762f6",
"metadata": {},
"outputs": [],
"source": [
"model_id = \"embedding_model_id\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "013eabdf-9fbf-41f9-a932-7b580f2ece49",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenSearchEmbeddings.from_connection(opensearch_client, model_id)"
]
},
{
"cell_type": "markdown",
"id": "1c34d540-f642-48ef-ba10-be3f8948b6c7",
"metadata": {},
"source": [
"### Embedding documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8432efd-6315-4dcf-92a4-1a772c5caa9d",
"metadata": {},
"outputs": [],
"source": [
"documents = [\"Foo\", \"Bar\", \"Foo Bar\"]\n",
"embedded_documents = embeddings.embed_documents(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "829001a3-2213-4eb3-9942-ee2583ff5577",
"metadata": {},
"outputs": [],
"source": [
"for i, doc in enumerate(documents):\n",
" print(f\"Document: {doc}\")\n",
" print(f\"Embedding: {embedded_documents[i][:5]}...\") # Show first 5 values to avoid overwhelming output\n",
" print(\"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "4b831983-1c2f-4b75-a36e-e1fea374cb1c",
"metadata": {},
"source": [
"### Embedding a query"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c13f2f2-2357-4e31-a432-9a34d70bcc9a",
"metadata": {},
"outputs": [],
"source": [
"query = \"Hello World!\"\n",
"embedded_query = embeddings.embed_query(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ad525df-e411-4f9c-a796-f8c388b21d7e",
"metadata": {},
"outputs": [],
"source": [
"print(\"Query Embedding:\")\n",
"print(f\"Query: {query}\")\n",
"print(f\"Embedding: {embedded_query[:5]}...\") # Show first 5 values of the embedding"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "vllm-langchain",
"language": "python",
"name": "vllm-langchain"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
91 changes: 91 additions & 0 deletions libs/community/langchain_community/embeddings/opensearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING, List

from langchain_core.embeddings import Embeddings

if TYPE_CHECKING:
from opensearchpy import OpenSearch


class OpenSearchEmbeddings(Embeddings):
    """Embeddings backed by an OpenSearch cluster's ML Commons plugin.

    Texts are sent to the cluster's ``_plugins/_ml/_predict/text_embedding``
    endpoint and the resulting sentence-embedding vectors are returned. The
    cluster must have the ML Commons plugin installed and a text-embedding
    model deployed under the given ``model_id``.
    """

    def __init__(
        self,
        client: OpenSearch,
        model_id: str,
    ) -> None:
        """Initialize with an OpenSearch client and a deployed model ID.

        Args:
            client (OpenSearch): An initialized OpenSearch client.
            model_id (str): The ML model ID used to generate embeddings.
        """
        self.client = client
        self.model_id = model_id

    @classmethod
    def from_connection(
        cls,
        opensearch_connection: OpenSearch,
        model_id: str,
    ) -> OpenSearchEmbeddings:
        """
        Class method to create an OpenSearchEmbeddings object
        from an OpenSearch connection.

        Args:
            opensearch_connection (OpenSearch): The OpenSearch connection.
            model_id (str): The ML model ID for generating embeddings.

        Returns:
            OpenSearchEmbeddings: An instance of the OpenSearchEmbeddings class.
        """
        return cls(opensearch_connection, model_id)

    def _embedding_func(self, texts: List[str]) -> List[List[float]]:
        """
        Internal method that sends a request to OpenSearch's text
        embedding endpoint and retrieves embeddings for the provided texts.

        Args:
            texts (List[str]): A list of strings to be embedded.

        Returns:
            List[List[float]]: A list of embeddings,
            where each embedding is a list of floats.
        """
        # Avoid a network round-trip (and a potential server-side error)
        # for an empty batch.
        if not texts:
            return []

        endpoint = f"/_plugins/_ml/_predict/text_embedding/{self.model_id}"
        body = {
            "text_docs": texts,
            "return_number": True,
            "target_response": ["sentence_embedding"],
        }

        response = self.client.transport.perform_request(
            method="POST",
            url=endpoint,
            body=json.dumps(body),
        )
        # One inference result per input text; the first output tensor of
        # each result holds the sentence-embedding values.
        return [
            item["output"][0]["data"] for item in response["inference_results"]
        ]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of documents.

        Args:
            texts (List[str]): A list of text documents to embed.

        Returns:
            List[List[float]]: A list of embeddings for each document.
        """
        return self._embedding_func(texts)

    def embed_query(self, text: str) -> List[float]:
        """
        Generate an embedding for a single query.

        Args:
            text (str): The text query to embed.

        Returns:
            List[float]: The embedding for the query.
        """
        return self._embedding_func([text])[0]
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import List

import pytest
from opensearchpy import OpenSearch

from langchain_community.embeddings.opensearch import OpenSearchEmbeddings


@pytest.fixture
def model_id() -> str:
    """Return the ID of the deployed embedding model under test."""
    deployed_model = "some-model-id"
    return deployed_model


@pytest.fixture
def client() -> OpenSearch:
    """Build an OpenSearch client for the integration tests.

    Connection details default to a local cluster but can be overridden via
    the OPENSEARCH_HOST / OPENSEARCH_PORT / OPENSEARCH_USERNAME /
    OPENSEARCH_PASSWORD environment variables, so real credentials never
    need to be hard-coded in the test file.
    """
    import os

    host = os.environ.get("OPENSEARCH_HOST", "localhost")
    port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
    username = os.environ.get("OPENSEARCH_USERNAME", "username")
    password = os.environ.get("OPENSEARCH_PASSWORD", "password")
    return OpenSearch(
        hosts=[{"host": host, "port": port}],
        http_auth=(username, password),
        use_ssl=True,
        # Test clusters commonly use self-signed certs; skip verification.
        verify_certs=False,
    )


@pytest.fixture
def opensearch_embedding(client: OpenSearch, model_id: str) -> OpenSearchEmbeddings:
    """Wire the client and model ID into an embeddings instance."""
    embeddings = OpenSearchEmbeddings.from_connection(client, model_id)
    return embeddings


@pytest.fixture
def documents() -> List[str]:
    """Provide a small batch of sample texts to embed."""
    sample_texts = ["foo bar", "bar foo", "foo"]
    return sample_texts


def test_opensearch_embedding_documents(
    opensearch_embedding: OpenSearchEmbeddings, documents: List[str]
) -> None:
    """
    Embed a batch of documents and check the result shape: one vector per
    input document, each with the model's expected dimensionality.
    """
    vectors = opensearch_embedding.embed_documents(documents)
    assert len(vectors) == len(documents)
    # Every vector should match the deployed model's embedding size.
    assert all(len(vector) == 768 for vector in vectors)


def test_opensearch_embedding_query(opensearch_embedding: OpenSearchEmbeddings) -> None:
    """
    Embed a single query string and check that the returned vector has the
    model's expected dimensionality.
    """
    query = "foo bar"
    vector = opensearch_embedding.embed_query(query)
    # The deployed model is expected to emit 768-dimensional embeddings.
    assert len(vector) == 768