From 62607c237b14b8286be75291c864594eb115daa5 Mon Sep 17 00:00:00 2001
From: Dmitrii Ogn
Date: Wed, 14 Aug 2024 15:59:22 +0300
Subject: [PATCH] Fix to avoid overfloat and get rid of model_max_length (#319)

* Fix to avoid overfloat and get rid of model_max_length

* Fixes for max_length vs model_max_length logic

Jupter warning disabled

* Support of https://github.com/jwodder/versioningit/issues/48

* Update fastembed/common/preprocessor_utils.py

---------

Co-authored-by: George
---
 fastembed/common/model_management.py   |  4 ++-
 fastembed/common/preprocessor_utils.py | 16 +++++++---
 fastembed/text/onnx_embedding.py       |  8 ++---
 tests/__init__.py                      |  4 +++
 tests/test_text_onnx_embeddings.py     | 43 +++++++-------------------
 5 files changed, 31 insertions(+), 44 deletions(-)

diff --git a/fastembed/common/model_management.py b/fastembed/common/model_management.py
index 0259d414..8e3d2dc5 100644
--- a/fastembed/common/model_management.py
+++ b/fastembed/common/model_management.py
@@ -148,7 +148,9 @@ def decompress_to_cache(cls, targz_path: str, cache_dir: str):
             # Open the tar.gz file
             with tarfile.open(targz_path, "r:gz") as tar:
                 # Extract all files into the cache directory
-                tar.extractall(path=cache_dir)
+                tar.extractall(
+                    path=cache_dir,
+                )
         except tarfile.TarError as e:
             # If any error occurs while opening or extracting the tar.gz file,
             # delete the cache directory (if it was created in this function)
diff --git a/fastembed/common/preprocessor_utils.py b/fastembed/common/preprocessor_utils.py
index 1b734046..db2432f3 100644
--- a/fastembed/common/preprocessor_utils.py
+++ b/fastembed/common/preprocessor_utils.py
@@ -1,7 +1,6 @@
 import json
 from pathlib import Path
 from typing import Tuple
-
 from tokenizers import AddedToken, Tokenizer
 
 from fastembed.image.transform.operators import Compose
@@ -18,7 +17,7 @@ def load_special_tokens(model_dir: Path) -> dict:
     return tokens_map
 
 
-def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tuple[Tokenizer, dict]:
+def load_tokenizer(model_dir: Path) -> Tuple[Tokenizer, dict]:
     config_path = model_dir / "config.json"
     if not config_path.exists():
         raise ValueError(f"Could not find config.json in {model_dir}")
@@ -36,13 +35,20 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tuple[Tokenizer, d
 
     with open(str(tokenizer_config_path)) as tokenizer_config_file:
         tokenizer_config = json.load(tokenizer_config_file)
+    assert (
+        "model_max_length" in tokenizer_config or "max_length" in tokenizer_config
+    ), "Models without model_max_length or max_length are not supported."
+    if "model_max_length" not in tokenizer_config:
+        max_context = tokenizer_config["max_length"]
+    elif "max_length" not in tokenizer_config:
+        max_context = tokenizer_config["model_max_length"]
+    else:
+        max_context = min(tokenizer_config["model_max_length"], tokenizer_config["max_length"])
 
     tokens_map = load_special_tokens(model_dir)
 
     tokenizer = Tokenizer.from_file(str(tokenizer_path))
-    tokenizer.enable_truncation(
-        max_length=min(tokenizer_config["model_max_length"], max_length)
-    )
+    tokenizer.enable_truncation(max_length=max_context)
     tokenizer.enable_padding(
         pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]
     )
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
index 614a24fe..7e1f35c3 100644
--- a/fastembed/text/onnx_embedding.py
+++ b/fastembed/text/onnx_embedding.py
@@ -244,9 +244,7 @@ def _preprocess_onnx_input(
         """
         return onnx_input
 
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
         embeddings = output.model_output
         return normalize(embeddings[:, 0]).astype(np.float32)
 
@@ -258,6 +256,4 @@ def init_embedding(
         cache_dir: str,
         **kwargs,
     ) -> OnnxTextEmbedding:
-        return OnnxTextEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs
-        )
+        return OnnxTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)
diff --git a/tests/__init__.py b/tests/__init__.py
index e69de29b..9e794cff 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1,4 @@
+import os
+
+# disable DeprecationWarning https://github.com/jupyter/jupyter_core/issues/398
+os.environ["JUPYTER_PLATFORM_DIRS"] = "1"
diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py
index b0590594..019a42f5 100644
--- a/tests/test_text_onnx_embeddings.py
+++ b/tests/test_text_onnx_embeddings.py
@@ -32,32 +32,20 @@
     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array(
         [0.0094, 0.0184, 0.0328, 0.0072, -0.0351]
     ),
-    "intfloat/multilingual-e5-large": np.array(
-        [0.0098, 0.0045, 0.0066, -0.0354, 0.0070]
-    ),
+    "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
     "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array(
         [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299]
     ),
-    "jinaai/jina-embeddings-v2-small-en": np.array(
-        [-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]
-    ),
-    "jinaai/jina-embeddings-v2-base-en": np.array(
-        [-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]
-    ),
-    "jinaai/jina-embeddings-v2-base-de": np.array(
-        [-0.0085, 0.0417, 0.0342, 0.0309, -0.0149]
-    ),
-    "jinaai/jina-embeddings-v2-base-code": np.array(
-        [0.0145, -0.0164, 0.0136, -0.0170, 0.0734]
-    ),
-    "nomic-ai/nomic-embed-text-v1": np.array(
-        [0.3708 , 0.2031, -0.3406, -0.2114, -0.3230]
-    ),
+    "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
+    "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
+    "jinaai/jina-embeddings-v2-base-de": np.array([-0.0085, 0.0417, 0.0342, 0.0309, -0.0149]),
+    "jinaai/jina-embeddings-v2-base-code": np.array([0.0145, -0.0164, 0.0136, -0.0170, 0.0734]),
+    "nomic-ai/nomic-embed-text-v1": np.array([0.3708, 0.2031, -0.3406, -0.2114, -0.3230]),
     "nomic-ai/nomic-embed-text-v1.5": np.array(
         [-0.15407836, -0.03053198, -3.9138033, 0.1910364, 0.13224715]
     ),
     "nomic-ai/nomic-embed-text-v1.5-Q": np.array(
-        [-0.12525563, 0.38030425, -3.961622 , 0.04176439, -0.0758301]
+        [-0.12525563, 0.38030425, -3.961622, 0.04176439, -0.0758301]
     ),
     "thenlper/gte-large": np.array(
         [-0.01920587, 0.00113156, -0.00708992, -0.00632304, -0.04025577]
@@ -65,21 +53,13 @@
     "mixedbread-ai/mxbai-embed-large-v1": np.array(
         [0.02295546, 0.03196154, 0.016512, -0.04031524, -0.0219634]
     ),
-    "snowflake/snowflake-arctic-embed-xs": np.array(
-        [0.0092, 0.0619, 0.0196, 0.009, -0.0114]
-    ),
-    "snowflake/snowflake-arctic-embed-s": np.array(
-        [-0.0416, -0.0867, 0.0209, 0.0554, -0.0272]
-    ),
-    "snowflake/snowflake-arctic-embed-m": np.array(
-        [-0.0329, 0.0364, 0.0481, 0.0016, 0.0328]
-    ),
+    "snowflake/snowflake-arctic-embed-xs": np.array([0.0092, 0.0619, 0.0196, 0.009, -0.0114]),
+    "snowflake/snowflake-arctic-embed-s": np.array([-0.0416, -0.0867, 0.0209, 0.0554, -0.0272]),
+    "snowflake/snowflake-arctic-embed-m": np.array([-0.0329, 0.0364, 0.0481, 0.0016, 0.0328]),
     "snowflake/snowflake-arctic-embed-m-long": np.array(
         [0.0080, -0.0266, -0.0335, 0.0282, 0.0143]
     ),
-    "snowflake/snowflake-arctic-embed-l": np.array(
-        [0.0189, -0.0673, 0.0183, 0.0124, 0.0146]
-    ),
+    "snowflake/snowflake-arctic-embed-l": np.array([0.0189, -0.0673, 0.0183, 0.0124, 0.0146]),
     "Qdrant/clip-ViT-B-32-text": np.array([0.0083, 0.0103, -0.0138, 0.0199, -0.0069]),
 }
 
@@ -94,7 +74,6 @@ def test_embedding():
         dim = model_desc["dim"]
 
         model = TextEmbedding(model_name=model_desc["model"])
-
         docs = ["hello world", "flag embedding"]
         embeddings = list(model.embed(docs))
         embeddings = np.stack(embeddings, axis=0)
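
For readability, the sketch below (not part of the patch) restates the new truncation-length resolution from load_tokenizer as a standalone function. The helper name resolve_max_context and the sample config values are illustrative only; the actual code keeps this logic inline and passes the result to tokenizer.enable_truncation.

# Standalone sketch of the max_context resolution introduced in load_tokenizer above.
# resolve_max_context is a hypothetical helper name used only for illustration.
def resolve_max_context(tokenizer_config: dict) -> int:
    # At least one of the two limits must be declared by the model.
    assert (
        "model_max_length" in tokenizer_config or "max_length" in tokenizer_config
    ), "Models without model_max_length or max_length are not supported."

    if "model_max_length" not in tokenizer_config:
        return tokenizer_config["max_length"]
    if "max_length" not in tokenizer_config:
        return tokenizer_config["model_max_length"]
    # Both present: take the smaller limit so truncation never exceeds either bound.
    return min(tokenizer_config["model_max_length"], tokenizer_config["max_length"])


# Example (illustrative values): model_max_length is sometimes a huge sentinel,
# while max_length gives the usable context size; the smaller one wins.
print(resolve_max_context({"model_max_length": int(1e30), "max_length": 512}))  # -> 512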