From 942ab0bbfefc4469ec20aa060ba4533cd982f118 Mon Sep 17 00:00:00 2001
From: xhluca
Date: Sun, 15 Sep 2024 20:50:11 -0400
Subject: [PATCH] Add new examples using numba

---
 examples/index_and_retrieve_with_numba.py     | 51 +++++++++++++++++++
 ...mba.py => retrieve_with_numba_advanced.py} |  0
 examples/retrieve_with_numba_hf.py            | 45 ++++++++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 examples/index_and_retrieve_with_numba.py
 rename examples/{retrieve_with_numba.py => retrieve_with_numba_advanced.py} (100%)
 create mode 100644 examples/retrieve_with_numba_hf.py

diff --git a/examples/index_and_retrieve_with_numba.py b/examples/index_and_retrieve_with_numba.py
new file mode 100644
index 0000000..bf83510
--- /dev/null
+++ b/examples/index_and_retrieve_with_numba.py
@@ -0,0 +1,51 @@
+"""
+# Example: Use Numba to speed up the retrieval process
+
+```bash
+pip install "bm25s[full]" numba
+```
+
+This script downloads the BEIR scifact dataset, builds a BM25 index with the numba backend, and retrieves the top-k results for two example queries.
+
+To run this script, execute:
+```bash
+python examples/index_and_retrieve_with_numba.py
+```
+"""
+import os
+import Stemmer
+
+import bm25s.hf
+import bm25s
+
+def main(dataset='scifact', dataset_dir='./datasets'):
+    queries = [
+        "Is chemotherapy effective for treating cancer?",
+        "Is cardiac injury common in critical cases of COVID-19?",
+    ]
+
+    bm25s.utils.beir.download_dataset(dataset=dataset, save_dir=dataset_dir)
+    corpus: dict = bm25s.utils.beir.load_corpus(dataset=dataset, save_dir=dataset_dir)
+    corpus_records = [
+        {'id': k, 'title': v["title"], 'text': v["text"]} for k, v in corpus.items()
+    ]
+    corpus_lst = [r["title"] + " " + r["text"] for r in corpus_records]
+
+    # corpus=corpus_records is optional; it is only needed if you want retrieve() to return the documents themselves
+    retriever = bm25s.BM25(corpus=corpus_records, backend='numba')
+    retriever.index(corpus_lst)
+
+    # Tokenize the queries
+    stemmer = Stemmer.Stemmer("english")
+    tokenizer = bm25s.tokenization.Tokenizer(stemmer=stemmer)
+    queries_tokenized = tokenizer.tokenize(queries)
+    # Retrieve the top-k results
+    results = retriever.retrieve(queries_tokenized, k=3)
+    # Show the first result of the first query
+    result = results.documents[0]
+    print(f"First score (# 1 result): {results.scores[0, 0]:.4f}")
+    print(f"First result id (# 1 result): {result[0]['id']}")
+    print(f"First result title (# 1 result): {result[0]['title']}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/retrieve_with_numba.py b/examples/retrieve_with_numba_advanced.py
similarity index 100%
rename from examples/retrieve_with_numba.py
rename to examples/retrieve_with_numba_advanced.py
diff --git a/examples/retrieve_with_numba_hf.py b/examples/retrieve_with_numba_hf.py
new file mode 100644
index 0000000..2f690b5
--- /dev/null
+++ b/examples/retrieve_with_numba_hf.py
@@ -0,0 +1,45 @@
+"""
+# Example: Use Numba to speed up the retrieval process
+
+```bash
+pip install "bm25s[full]" numba
+```
+
+To build an index, please refer to the `examples/index_and_upload_to_hf.py` script.
+
+Now, to run this script, execute:
+```bash
+python examples/retrieve_with_numba_hf.py
+```
+"""
+import os
+import Stemmer
+
+import bm25s.hf
+
+def main(repo_name="xhluca/bm25s-fiqa-index"):
+    queries = [
+        "Is chemotherapy effective for treating cancer?",
+        "Is cardiac injury common in critical cases of COVID-19?",
+    ]
+
+    retriever = bm25s.hf.BM25HF.load_from_hub(
+        repo_name, load_corpus=False, mmap=False
+    )
+
+    retriever.backend = "numba"  # the backend can also be set when initializing the retriever
+
+    # Tokenize the queries
+    stemmer = Stemmer.Stemmer("english")
+    tokenizer = bm25s.tokenization.Tokenizer(stemmer=stemmer)
+    queries_tokenized = tokenizer.tokenize(queries)
+
+    # Retrieve the top-k results
+    results = retriever.retrieve(queries_tokenized, k=3)
+    # Show the first result of the first query
+    result = results.documents[0]
+    print(f"First score (# 1 result): {results.scores[0, 0]:.4f}")
+    print(f"First result (# 1 result): {result[0]}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
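Because `retrieve_with_numba_hf.py` loads the index with `load_corpus=False`, `results.documents` only contains positions in the corpus rather than the stored records. The sketch below (not part of the patch) shows how full documents could be returned instead; it assumes the `xhluca/bm25s-fiqa-index` repo also hosts the saved corpus, and that `load_corpus=True` / `mmap=True` fit your memory budget.

```python
# Minimal sketch, assuming the corpus was uploaded alongside the index.
import Stemmer
import bm25s.hf

# Load the index together with its corpus; mmap=True memory-maps the index
# arrays instead of reading them fully into RAM.
retriever = bm25s.hf.BM25HF.load_from_hub(
    "xhluca/bm25s-fiqa-index", load_corpus=True, mmap=True
)
retriever.backend = "numba"

# Tokenize a single query with the same stemmer-based tokenizer as in the examples
stemmer = Stemmer.Stemmer("english")
tokenizer = bm25s.tokenization.Tokenizer(stemmer=stemmer)
query_tokens = tokenizer.tokenize(["Is chemotherapy effective for treating cancer?"])

# With the corpus loaded, results.documents holds the stored records
# rather than bare indices.
results = retriever.retrieve(query_tokens, k=3)
for score, doc in zip(results.scores[0], results.documents[0]):
    print(f"{score:.4f}", doc)
```

If the corpus was not uploaded with the index, keep `load_corpus=False` as in the patch and use the returned indices to look the documents up in your own copy of the corpus.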