Skip to content

Commit

Permalink
Fix paraphrase minilm (#436)
Browse files: browse the repository at this point in the history
* fix: Fix the paraphrase-multilingual-MiniLM-L12-v2 model by moving it from the ONNX embedding models to the pooled embedding models

* tests: Updated minilm paraphrase canonical vector

* chore: Added a warning message for updating the model

* chore: Added the version in which support for the previous model variant (without mean pooling) will be removed
  • Loading branch information
hh-space-invader authored Jan 27, 2025
1 parent b05877d commit c2f6fd1
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 12 deletions.
11 changes: 0 additions & 11 deletions fastembed/text/onnx_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,6 @@
},
"model_file": "model_optimized.onnx",
},
{
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"dim": 384,
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",
"license": "apache-2.0",
"size_in_GB": 0.22,
"sources": {
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
},
"model_file": "model_optimized.onnx",
},
{
"model": "thenlper/gte-large",
"dim": 1024,
Expand Down
11 changes: 11 additions & 0 deletions fastembed/text/pooled_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@
},
"model_file": "onnx/model.onnx",
},
{
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"dim": 384,
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",
"license": "apache-2.0",
"size_in_GB": 0.22,
"sources": {
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
},
"model_file": "model_optimized.onnx",
},
]


Expand Down
8 changes: 8 additions & 0 deletions fastembed/text/text_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ def __init__(
UserWarning,
stacklevel=2,
)
if model_name == "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2":
warnings.warn(
"The model 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' has been updated to "
"include a mean pooling layer. Please ensure your usage aligns with the new functionality. "
"Support for the previous version without mean pooling will be removed as of version 0.5.2.",
UserWarning,
stacklevel=2,
)
for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
supported_models = EMBEDDING_MODEL_TYPE.list_supported_models()
if any(model_name.lower() == model["model"].lower() for model in supported_models):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_text_onnx_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
[-0.034478, 0.03102, 0.00673, 0.02611, -0.039362]
),
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array(
[0.0094, 0.0184, 0.0328, 0.0072, -0.0351]
[0.0361, 0.1862, 0.2776, 0.2461, -0.1904]
),
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array(
Expand Down

0 comments on commit c2f6fd1

Please sign in to comment.