Support of special characters
I8dNLo committed Sep 19, 2024
1 parent fd79f3f commit 400ffb1
Showing 4 changed files with 34 additions and 16 deletions.
4 changes: 3 additions & 1 deletion fastembed/common/model_management.py
@@ -198,7 +198,9 @@ def retrieve_model_gcs(cls, model_name: str, source_url: str, cache_dir: str) ->
         return model_dir
 
     @classmethod
-    def download_model(cls, model: Dict[str, Any], cache_dir: Path, retries=3, **kwargs) -> Path:
+    def download_model(
+        cls, model: Dict[str, Any], cache_dir: Path, retries: object = 3, **kwargs: object
+    ) -> Path:
         """
         Downloads a model from HuggingFace Hub or Google Cloud Storage.
14 changes: 1 addition & 13 deletions fastembed/sparse/bm25.py
@@ -17,6 +17,7 @@
     SparseEmbedding,
     SparseTextEmbeddingBase,
 )
+from fastembed.sparse.utils.tokenizer import SimpleTokenizer
 
 supported_languages = [
     "arabic",
@@ -70,19 +71,6 @@ def append_to_file(file_path, token):
         file.write(token + "\n")
 
 
-class SimpleTokenizer:
-    def __init__(self):
-        pass
-
-    def tokenize(text: str):
-        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
-        # text = re.sub(r'[^\w]', ' ', text.lower())
-        text = re.sub(r"\s+", " ", text)
-
-        # Optionally, strip leading/trailing spaces
-        return text.strip().split()
-
-
 def remove_non_alphanumeric(text):
     return re.sub(r"[^\w\s]", " ", text, flags=re.UNICODE)
 
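The deleted tokenizer is the crux of this commit: its ASCII-only character class [^a-zA-Z0-9] turned every accented letter into a space and shattered non-English words (the commented-out [^\w] line already hinted at the fix). A minimal standalone sketch of the difference, not part of the commit itself:

    import re

    text = "Über den größten Flüssen Österreichs"

    # Old pattern: ASCII-only, so ü, ö, ß are replaced by spaces and words shatter.
    old = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z0-9]", " ", text.lower()))
    print(old.strip().split())  # ['ber', 'den', 'gr', 'ten', 'fl', 'ssen', 'sterreichs']

    # New pattern: \w is Unicode-aware for Python 3 strings, so accented letters survive.
    new = re.sub(r"\s+", " ", re.sub(r"[^\w]", " ", text.lower()))
    print(new.strip().split())  # ['über', 'den', 'größten', 'flüssen', 'österreichs']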
14 changes: 12 additions & 2 deletions fastembed/sparse/utils/tokenizer.py
@@ -4,6 +4,17 @@
 from typing import List
 
 
+class SimpleTokenizer:
+    def __init__(self):
+        pass
+
+    def tokenize(text: str):
+        text = re.sub(r"[^\w]", " ", text.lower())
+        text = re.sub(r"\s+", " ", text)
+
+        return text.strip().split()
+
+
 class WordTokenizer:
     """The tokenizer is "destructive" such that the regexes applied will munge the
     input string to a state beyond re-construction.
@@ -68,8 +79,7 @@ class WordTokenizer:
         )
     ]
     CONTRACTIONS3 = [
-        re.compile(pattern)
-        for pattern in (r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b")
+        re.compile(pattern) for pattern in (r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b")
     ]
 
     @classmethod
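Note that the relocated tokenize is defined without self or a @staticmethod decorator, so it only works when accessed on the class itself; a usage sketch under that assumption:

    from fastembed.sparse.utils.tokenizer import SimpleTokenizer

    # Works: the function is fetched from the class, so the string binds to `text`.
    print(SimpleTokenizer.tokenize("L'élève français s'écrie !"))
    # ['l', 'élève', 'français', 's', 'écrie']

    # Raises TypeError: the instance would be passed as `text`, with the string left over.
    # SimpleTokenizer().tokenize("...")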
18 changes: 18 additions & 0 deletions tests/test_attention_embeddings.py
@@ -102,3 +102,21 @@ def test_multilanguage(model_name):

     assert embeddings[1].values.shape == (4,)
     assert embeddings[1].indices.shape == (4,)
+
+
+@pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
+def test_special_characters(model_name):
+    docs = [
+        "Über den größten Flüssen Österreichs äußern sich Experten häufig: Öko-Systeme müssen geschützt werden!",
+        "L'élève français s'écrie : « Où est mon crayon ? J'ai besoin de finir cet exercice avant la récréation!",
+        "Într-o zi însorită, Ștefan și Ioana au mâncat mămăligă cu brânză și au băut țuică la cabană.",
+        "Üzgün öğretmen öğrencilere seslendi: Lütfen gürültü yapmayın, sınavınızı bitirmeye çalışıyorum!",
+        "Ο Ξενοφών είπε: «Ψάχνω για ένα ωραίο δώρο για τη γιαγιά μου. Ίσως ένα φυτό ή ένα βιβλίο;»",
+        "Hola! ¿Cómo estás? Estoy muy emocionado por el cumpleaños de mi hermano, ¡va a ser increíble! También quiero comprar un pastel de chocolate con fresas y un regalo especial: un libro titulado «Cien años de soledad",
+    ]
+
+    model = SparseTextEmbedding(model_name=model_name, language="english")
+    embeddings = list(model.embed(docs))
+    for idx, shape in enumerate([14, 18, 15, 10, 15]):
+        assert embeddings[idx].values.shape == (shape,)
+        assert embeddings[idx].indices.shape == (shape,)
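The expected shapes are the per-document counts of unique surviving tokens, assuming Qdrant/bm25 emits one sparse index per unique token and none of these non-English words match its English stopword list; the first expectation can be sanity-checked by hand:

    from fastembed.sparse.utils.tokenizer import SimpleTokenizer

    doc = ("Über den größten Flüssen Österreichs äußern sich Experten häufig: "
           "Öko-Systeme müssen geschützt werden!")
    print(len(set(SimpleTokenizer.tokenize(doc))))  # 14 unique tokens -> shape (14,)

Note that the loop asserts only the first five documents; the sixth (Spanish) entry is embedded but never checked.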
