From e12e58fdcb367733e1dbfd297cb6d4768f04c254 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 28 Aug 2024 02:33:56 -0700 Subject: [PATCH] Added rerank pipeline --- CHANGELOG.md | 1 + README.md | 19 ++++++++++++++++ lib/informers/models.rb | 7 ++++++ lib/informers/pipelines.rb | 43 +++++++++++++++++++++++++++++++++++++ lib/informers/tokenizers.rb | 6 ++++++ test/model_test.rb | 17 +++++++++++++++ test/pipeline_test.rb | 8 +++++++ 7 files changed, 101 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b016d81..901acae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 1.0.2 (unreleased) +- Added `rerank` pipeline - Added support for `nomic-ai/nomic-embed-text-v1` - Added support for `intfloat/e5-base-v2` to `Model` - Added support for `BAAI/bge-base-en-v1.5` to `Model` diff --git a/README.md b/README.md index b59c9ce..dad6022 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,18 @@ model = Informers.pipeline("feature-extraction", "BAAI/bge-base-en-v1.5", quanti embeddings = model.(input, pooling: "mean", normalize: true) ``` +### mixedbread-ai/mxbai-rerank-base-v1 + +[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1) [unreleased] + +```ruby +query = "How many people live in London?" +docs = ["Around 9 Million people live in London", "London is known for its financial district"] + +model = Informers.pipeline("rerank", "mixedbread-ai/mxbai-rerank-base-v1", quantized: false) +result = model.(query, docs) +``` + ### Other You can use the feature extraction pipeline directly. @@ -171,6 +183,13 @@ extractor = Informers.pipeline("feature-extraction") extractor.("We are very happy to show you the 🤗 Transformers library.") ``` +Reranking [unreleased] + +```ruby +ranker = Informers.pipeline("rerank") +ranker.("Who created Ruby?", ["Matz created Ruby", "Another doc"]) +``` + ## Credits This library was ported from [Transformers.js](https://github.com/xenova/transformers.js) and is available under the same license. diff --git a/lib/informers/models.rb b/lib/informers/models.rb index de882c5..c85f0f3 100644 --- a/lib/informers/models.rb +++ b/lib/informers/models.rb @@ -205,6 +205,12 @@ class NomicBertPreTrainedModel < PreTrainedModel class NomicBertModel < NomicBertPreTrainedModel end + class DebertaV2PreTrainedModel < PreTrainedModel + end + + class DebertaV2Model < DebertaV2PreTrainedModel + end + class DistilBertPreTrainedModel < PreTrainedModel end @@ -226,6 +232,7 @@ def call(model_inputs) MODEL_MAPPING_NAMES_ENCODER_ONLY = { "bert" => ["BertModel", BertModel], "nomic_bert" => ["NomicBertModel", NomicBertModel], + "deberta-v2" => ["DebertaV2Model", DebertaV2Model], "distilbert" => ["DistilBertModel", DistilBertModel] } diff --git a/lib/informers/pipelines.rb b/lib/informers/pipelines.rb index b2a0e10..ba59a0c 100644 --- a/lib/informers/pipelines.rb +++ b/lib/informers/pipelines.rb @@ -308,6 +308,40 @@ def call( end end + class RerankPipeline < Pipeline + def initialize(**options) + super(**options) + end + + def call( + query, + documents, + return_documents: false, + top_k: nil + ) + model_inputs = @tokenizer.([query] * documents.size, + text_pair: documents, + padding: true, + truncation: true + ) + + outputs = @model.(model_inputs) + + result = + Utils.sigmoid(outputs[0].map(&:first)) + .map.with_index { |s, i| {doc_id: i, score: s} } + .sort_by { |v| -v[:score] } + + if return_documents + result.each do |v| + v[:text] = documents[v[:doc_id]] + end + end + + top_k ? result.first(top_k) : result + end + end + SUPPORTED_TASKS = { "text-classification" => { tokenizer: AutoTokenizer, @@ -344,6 +378,15 @@ def call( model: "Xenova/all-MiniLM-L6-v2" }, type: "text" + }, + "rerank" => { + tokenizer: AutoTokenizer, + pipeline: RerankPipeline, + model: AutoModel, + default: { + model: "mixedbread-ai/mxbai-rerank-base-v1" + }, + type: "text" } } diff --git a/lib/informers/tokenizers.rb b/lib/informers/tokenizers.rb index b135c6e..35cc71f 100644 --- a/lib/informers/tokenizers.rb +++ b/lib/informers/tokenizers.rb @@ -83,12 +83,18 @@ class BertTokenizer < PreTrainedTokenizer # self.return_token_type_ids = true end + class DebertaV2Tokenizer < PreTrainedTokenizer + # TODO + # self.return_token_type_ids = true + end + class DistilBertTokenizer < PreTrainedTokenizer end class AutoTokenizer TOKENIZER_CLASS_MAPPING = { "BertTokenizer" => BertTokenizer, + "DebertaV2Tokenizer" => DebertaV2Tokenizer, "DistilBertTokenizer" => DistilBertTokenizer } diff --git a/test/model_test.rb b/test/model_test.rb index bd9594f..f9935f4 100644 --- a/test/model_test.rb +++ b/test/model_test.rb @@ -120,4 +120,21 @@ def test_bge_base assert_elements_in_delta [0.00029264, -0.0619305, -0.06199387], embeddings[0][..2] assert_elements_in_delta [-0.07482512, -0.0770234, 0.03398684], embeddings[-1][..2] end + + # https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1 + def test_mxbai_rerank + query = "How many people live in London?" + docs = ["Around 9 Million people live in London", "London is known for its financial district"] + + model = Informers.pipeline("rerank", "mixedbread-ai/mxbai-rerank-base-v1", quantized: false) + result = model.(query, docs, return_documents: true) + + assert_equal 0, result[0][:doc_id] + assert_in_delta 0.984, result[0][:score] + assert_equal docs[0], result[0][:text] + + assert_equal 1, result[1][:doc_id] + assert_in_delta 0.139, result[1][:score] + assert_equal docs[1], result[1][:text] + end end diff --git a/test/pipeline_test.rb b/test/pipeline_test.rb index 4dd4b99..ece5360 100644 --- a/test/pipeline_test.rb +++ b/test/pipeline_test.rb @@ -58,6 +58,14 @@ def test_feature_extraction assert_in_delta (-0.3130), output[-1][-1][-1] end + def test_rerank + ranker = Informers.pipeline("rerank") + result = ranker.("Who created Ruby?", ["Matz created Ruby", "Another doc"]) + assert_equal 2, result.size + assert_equal 0, result[0][:doc_id] + assert_equal 1, result[1][:doc_id] + end + def test_progress_callback msgs = [] extractor = Informers.pipeline("feature-extraction", progress_callback: ->(msg) { msgs << msg })