elastic · maxjakob · Apr 30, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 18, 2024
diff --git a/elasticsearch/store/__init__.py b/elasticsearch/store/__init__.py
diff --git a/elasticsearch/store/_utilities.py b/elasticsearch/store/_utilities.py
@@ -0,0 +1,153 @@
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+from elasticsearch import (
+    AsyncElasticsearch,
+    BadRequestError,
+    ConflictError,
+    NotFoundError,
+)
+
+Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
+
+
+def create_elasticsearch_client(
+    agent_header: str,
+    client: Optional[AsyncElasticsearch] = None,
+    url: Optional[str] = None,
+    cloud_id: Optional[str] = None,
+    api_key: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    client_params: Optional[Dict[str, Any]] = None,
+) -> AsyncElasticsearch:
+    """Return an AsyncElasticsearch client that sends `agent_header` as User-Agent.
+
+    A truthy `client` is reused. Otherwise a new client is built from exactly
+    one of `url` / `cloud_id`, authenticated with `api_key` or, failing that,
+    `username` plus `password`; `client_params` entries are applied last and
+    override the derived settings.
+
+    Raises:
+        ValueError: without `client`, if both or neither of `url`/`cloud_id` are set.
+        TypeError: if the resulting client is not an AsyncElasticsearch.
+    """
+    if not client:
+        if url and cloud_id:
+            raise ValueError(
+                "Both es_url and cloud_id are defined. Please provide only one."
+            )
+        if not (url or cloud_id):
+            raise ValueError("Please provide either elasticsearch_url or cloud_id.")
+        settings: Dict[str, Any] = {"hosts": [url]} if url else {"cloud_id": cloud_id}
+        if api_key:
+            settings["api_key"] = api_key
+        elif username and password:
+            settings["basic_auth"] = (username, password)
+        if client_params is not None:
+            settings.update(client_params)
+        client = AsyncElasticsearch(**settings)
+
+    if not isinstance(client, AsyncElasticsearch):
+        raise TypeError("Elasticsearch client must be AsyncElasticsearch client")
+
+    # Tag requests with the integration's user agent (usage tracking in Elastic
+    # Cloud); `options()` hands back the client to use from here on.
+    return client.options(headers={"User-Agent": agent_header})
+
+
+async def model_must_be_deployed_async(
+    client: AsyncElasticsearch, model_id: str
+) -> None:
+    """Probe `model_id` with a dummy inference call and raise if it is unavailable.
+
+    Raises:
+        NotFoundError: propagated from the call, or converted from a ConflictError.
+    """
+    probe = [{"x": "y"}]
+    try:
+        await client.ml.infer_trained_model(model_id=model_id, docs=probe)
+    except NotFoundError:
+        raise
+    except ConflictError as exc:
+        reason = f"model '{model_id}' not found, please deploy it first"
+        raise NotFoundError(reason, meta=exc.meta, body=exc.body) from exc
+    except BadRequestError:
+        # Expected: the real document shape is unknown, so the dummy doc is rejected.
+        pass
+    return None
+
+
+async def model_is_deployed_async(es_client: AsyncElasticsearch, model_id: str) -> bool:
+    try:  # NotFoundError from the check means "not deployed"; others propagate
+        await model_must_be_deployed_async(es_client, model_id)
+    except NotFoundError:
+        return False
+    return True
+
+
+def maximal_marginal_relevance(
+    query_embedding: list,
+    embedding_list: list,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[int]:
+    """Pick up to `k` indices into `embedding_list` by maximal marginal relevance.
+
+    The embedding most similar to the query is taken first. Every further pick
+    maximises `lambda_mult * sim_to_query - (1 - lambda_mult) * max_sim_to_picked`;
+    ties go to the lowest index.
+    """
+    query = np.array(query_embedding)
+    limit = min(k, len(embedding_list))
+    if limit <= 0:
+        return []
+    if query.ndim == 1:
+        query = np.expand_dims(query, axis=0)
+    relevance = _cosine_similarity(query, embedding_list)[0]
+    first = int(np.argmax(relevance))
+    picked = [first]
+    picked_vectors = np.array([embedding_list[first]])
+    while len(picked) < limit:
+        redundancy = _cosine_similarity(embedding_list, picked_vectors)
+        winner, winner_score = -1, -np.inf
+        for i, rel in enumerate(relevance):
+            if i in picked:
+                continue
+            score = lambda_mult * rel - (1 - lambda_mult) * max(redundancy[i])
+            if score > winner_score:
+                winner, winner_score = i, score
+        picked.append(winner)
+        picked_vectors = np.append(picked_vectors, [embedding_list[winner]], axis=0)
+    return picked
+
+
+def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
+    """Return the cosine similarity of every row of X with every row of Y.
+
+    Uses simsimd when it can be imported and plain NumPy otherwise. An empty
+    input yields an empty array; mismatched column counts raise ValueError.
+    """
+    if len(X) == 0 or len(Y) == 0:
+        return np.array([])
+    lhs, rhs = np.array(X), np.array(Y)
+    if lhs.shape[1] != rhs.shape[1]:
+        raise ValueError(
+            f"Number of columns in X and Y must be the same. X has shape {lhs.shape} "
+            f"and Y has shape {rhs.shape}."
+        )
+    try:
+        import simsimd as simd  # type: ignore
+
+        lhs = np.array(lhs, dtype=np.float32)
+        rhs = np.array(rhs, dtype=np.float32)
+        result = 1 - simd.cdist(lhs, rhs, metric="cosine")
+        return np.array([result]) if isinstance(result, float) else np.array(result)
+    except ImportError:
+        lhs_norms = np.linalg.norm(lhs, axis=1)
+        rhs_norms = np.linalg.norm(rhs, axis=1)
+        # Zero-norm rows divide by zero; mute the warnings and zero them below.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            scores = np.dot(lhs, rhs.T) / np.outer(lhs_norms, rhs_norms)
+        scores[np.isnan(scores) | np.isinf(scores)] = 0.0
+        return scores
diff --git a/elasticsearch/store/embedding_service.py b/elasticsearch/store/embedding_service.py
@@ -0,0 +1,132 @@
+import asyncio
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+import nest_asyncio  # type: ignore
+from elasticsearch import AsyncElasticsearch
+
+from elasticsearch.store._utilities import create_elasticsearch_client
+
+
+class EmbeddingService(ABC):
+    @abstractmethod
+    async def embed_documents_async(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a list of documents.
+
+        Args:
+            texts: A list of document strings to generate embeddings for.
+
+        Returns:
+            A list of embeddings, one for each document in the input.
+        """
+
+    @abstractmethod
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a list of documents.
+
+        Args:
+            texts: A list of document strings to generate embeddings for.
+
+        Returns:
+            A list of embeddings, one for each document in the input.
+        """
+
+    @abstractmethod
+    async def embed_query_async(self, query: str) -> List[float]:
+        """Generate an embedding for a single query text.
+
+        Args:
+            query: The query text to generate an embedding for.
+
+        Returns:
+            The embedding for the input query text.
+        """
+
+    @abstractmethod
+    def embed_query(self, query: str) -> List[float]:
+        """Generate an embedding for a single query text.
+
+        Args:
+            query: The query text to generate an embedding for.
+
+        Returns:
+            The embedding for the input query text.
+        """
+
+
+class ElasticsearchEmbeddings(EmbeddingService):
+    """Embedding service backed by a trained model deployed in Elasticsearch.
+
+    You need to have an embedding model downloaded and deployed in Elasticsearch:
+    - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-trained-model.html
+    - https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-deploy-models.html
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        agent_header: str,
+        model_id: str,
+        input_field: str = "text_field",
+        num_dimensions: Optional[int] = None,
+        # Connection params
+        es_client: Optional[AsyncElasticsearch] = None,
+        es_url: Optional[str] = None,
+        es_cloud_id: Optional[str] = None,
+        es_api_key: Optional[str] = None,
+        es_user: Optional[str] = None,
+        es_password: Optional[str] = None,
+    ):
+        """
+        Args:
+            agent_header: user agent header specific to the 3rd party integration.
+                Used for usage tracking in Elastic Cloud.
+            model_id: The model_id of the model deployed in the Elasticsearch cluster.
+            input_field: The name of the key for the input text field in the
+                document. Defaults to 'text_field'.
+            num_dimensions: The number of embedding dimensions. Only stored here; if
+                None, it is presumably inferred from an example inference call.
+            es_client: Elasticsearch client connection. Alternatively specify the
+                Elasticsearch connection with the other es_* parameters.
+        """
+        nest_asyncio.apply()
+
+        # Keep only the client's `ml` namespace; it is all this class calls.
+        self.client = create_elasticsearch_client(
+            agent_header=agent_header,
+            client=es_client,
+            url=es_url,
+            cloud_id=es_cloud_id,
+            api_key=es_api_key,
+            username=es_user,
+            password=es_password,
+        ).ml
+        self.model_id = model_id
+        self.input_field = input_field
+        self._num_dimensions = num_dimensions
+
+    async def embed_documents_async(self, texts: List[str]) -> List[List[float]]:
+        """Return the embeddings produced for `texts`."""
+        return await self._embedding_func_async(texts)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Blocking wrapper around `embed_documents_async`."""
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.embed_documents_async(texts))
+
+    async def embed_query_async(self, text: str) -> List[float]:
+        """Return the embedding produced for the single query `text`."""
+        embeddings = await self._embedding_func_async([text])
+        return embeddings[0]
+
+    def embed_query(self, query: str) -> List[float]:
+        """Blocking wrapper around `embed_query_async`."""
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.embed_query_async(query))
+
+    async def _embedding_func_async(self, texts: List[str]) -> List[List[float]]:
+        """Run inference on `texts` and collect each result's `predicted_value`."""
+        docs = [{self.input_field: text} for text in texts]
+        response = await self.client.infer_trained_model(
+            model_id=self.model_id, docs=docs
+        )
+        return [doc["predicted_value"] for doc in response["inference_results"]]