Support of special characters
I8dNLo committed Sep 19, 2024
1 parent fd79f3f commit 400ffb1
Showing 4 changed files with 34 additions and 16 deletions.
4 changes: 3 additions & 1 deletion fastembed/common/model_management.py
@@ -198,7 +198,9 @@ def retrieve_model_gcs(cls, model_name: str, source_url: str, cache_dir: str) ->
         return model_dir
 
     @classmethod
-    def download_model(cls, model: Dict[str, Any], cache_dir: Path, retries=3, **kwargs) -> Path:
+    def download_model(
+        cls, model: Dict[str, Any], cache_dir: Path, retries: object = 3, **kwargs: object
+    ) -> Path:
         """
         Downloads a model from HuggingFace Hub or Google Cloud Storage.
14 changes: 1 addition & 13 deletions fastembed/sparse/bm25.py
@@ -17,6 +17,7 @@
     SparseEmbedding,
     SparseTextEmbeddingBase,
 )
+from fastembed.sparse.utils.tokenizer import SimpleTokenizer
 
 supported_languages = [
     "arabic",
@@ -70,19 +71,6 @@ def append_to_file(file_path, token):
         file.write(token + "\n")
 
 
-class SimpleTokenizer:
-    def __init__(self):
-        pass
-
-    def tokenize(text: str):
-        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
-        # text = re.sub(r'[^\w]', ' ', text.lower())
-        text = re.sub(r"\s+", " ", text)
-
-        # Optionally, strip leading/trailing spaces
-        return text.strip().split()
-
-
 def remove_non_alphanumeric(text):
     return re.sub(r"[^\w\s]", " ", text, flags=re.UNICODE)
 
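The deleted tokenizer is the crux of this commit: its ASCII-only character class [^a-zA-Z0-9] turned every accented letter into a space and shattered non-English words (the commented-out [^\w] line already hinted at the fix). A minimal standalone sketch of the difference, not part of the commit itself:

    import re

    text = "Über den größten Flüssen Österreichs"

    # Old pattern: ASCII-only, so ü, ö, ß are replaced by spaces and words shatter.
    old = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z0-9]", " ", text.lower()))
    print(old.strip().split())  # ['ber', 'den', 'gr', 'ten', 'fl', 'ssen', 'sterreichs']

    # New pattern: \w is Unicode-aware for Python 3 strings, so accented letters survive.
    new = re.sub(r"\s+", " ", re.sub(r"[^\w]", " ", text.lower()))
    print(new.strip().split())  # ['über', 'den', 'größten', 'flüssen', 'österreichs']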
14 changes: 12 additions & 2 deletions fastembed/sparse/utils/tokenizer.py
@@ -4,6 +4,17 @@
 from typing import List
 
 
+class SimpleTokenizer:
+    def __init__(self):
+        pass
+
+    def tokenize(text: str):
+        text = re.sub(r"[^\w]", " ", text.lower())
+        text = re.sub(r"\s+", " ", text)
+
+        return text.strip().split()
+
+
 class WordTokenizer:
     """The tokenizer is "destructive" such that the regexes applied will munge the
     input string to a state beyond re-construction.
@@ -68,8 +79,7 @@ class WordTokenizer:
         )
     ]
     CONTRACTIONS3 = [
-        re.compile(pattern)
-        for pattern in (r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b")
+        re.compile(pattern) for pattern in (r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b")
     ]
 
     @classmethod
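Note that the relocated tokenize is defined without self or a @staticmethod decorator, so it only works when accessed on the class itself; a usage sketch under that assumption:

    from fastembed.sparse.utils.tokenizer import SimpleTokenizer

    # Works: the function is fetched from the class, so the string binds to `text`.
    print(SimpleTokenizer.tokenize("L'élève français s'écrie !"))
    # ['l', 'élève', 'français', 's', 'écrie']

    # Raises TypeError: the instance would be passed as `text`, with the string left over.
    # SimpleTokenizer().tokenize("...")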
18 changes: 18 additions & 0 deletions tests/test_attention_embeddings.py
@@ -102,3 +102,21 @@ def test_multilanguage(model_name):

     assert embeddings[1].values.shape == (4,)
     assert embeddings[1].indices.shape == (4,)
+
+
+@pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
+def test_special_characters(model_name):
+    docs = [
+        "Über den größten Flüssen Österreichs äußern sich Experten häufig: Öko-Systeme müssen geschützt werden!",
+        "L'élève français s'écrie : « Où est mon crayon ? J'ai besoin de finir cet exercice avant la récréation!",
+        "Într-o zi însorită, Ștefan și Ioana au mâncat mămăligă cu brânză și au băut țuică la cabană.",
+        "Üzgün öğretmen öğrencilere seslendi: Lütfen gürültü yapmayın, sınavınızı bitirmeye çalışıyorum!",
+        "Ο Ξενοφών είπε: «Ψάχνω για ένα ωραίο δώρο για τη γιαγιά μου. Ίσως ένα φυτό ή ένα βιβλίο;»",
+        "Hola! ¿Cómo estás? Estoy muy emocionado por el cumpleaños de mi hermano, ¡va a ser increíble! También quiero comprar un pastel de chocolate con fresas y un regalo especial: un libro titulado «Cien años de soledad",
+    ]
+
+    model = SparseTextEmbedding(model_name=model_name, language="english")
+    embeddings = list(model.embed(docs))
+    for idx, shape in enumerate([14, 18, 15, 10, 15]):
+        assert embeddings[idx].values.shape == (shape,)
+        assert embeddings[idx].indices.shape == (shape,)
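The expected shapes are the per-document counts of unique surviving tokens, assuming Qdrant/bm25 emits one sparse index per unique token and none of these non-English words match its English stopword list; the first expectation can be sanity-checked by hand:

    from fastembed.sparse.utils.tokenizer import SimpleTokenizer

    doc = ("Über den größten Flüssen Österreichs äußern sich Experten häufig: "
           "Öko-Systeme müssen geschützt werden!")
    print(len(set(SimpleTokenizer.tokenize(doc))))  # 14 unique tokens -> shape (14,)

Note that the loop asserts only the first five documents; the sixth (Spanish) entry is embedded but never checked.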
