[feat] MultipleNegativesBatchSampler #2960

Draft · wants to merge 17 commits into master
Changes from all commits
4 changes: 2 additions & 2 deletions sentence_transformers/SentenceTransformer.py
@@ -658,7 +658,7 @@ def encode(

return all_embeddings

- def forward(self, input: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
+ def forward(self, input: dict[str, Tensor], **kwargs) -> dict[str, Tensor]:
if self.module_kwargs is None:
return super().forward(input)

@@ -1023,7 +1023,7 @@ def tokenize(self, texts: list[str] | list[dict] | list[tuple[str, str]]) -> dic
"""
return self._first_module().tokenize(texts)

- def get_sentence_features(self, *features) -> dict[Literal["sentence_embedding"], torch.Tensor]:
+ def get_sentence_features(self, *features) -> dict[Literal["sentence_embedding"], Tensor]:
return self._first_module().get_sentence_features(*features)

def get_sentence_embedding_dimension(self) -> int | None:
17 changes: 17 additions & 0 deletions sentence_transformers/evaluation/InformationRetrievalEvaluator.py
@@ -133,6 +133,10 @@ def __init__(
SimilarityFunction.DOT_PRODUCT.value: dot_score,
}, # Score function, higher=more similar
main_score_function: str | SimilarityFunction | None = None,
query_prompt: str | None = None,
query_prompt_name: str | None = None,
corpus_prompt: str | None = None,
corpus_prompt_name: str | None = None,
) -> None:
"""
Initializes the InformationRetrievalEvaluator.
@@ -154,6 +158,10 @@ def __init__(
truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to {SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score}.
main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
query_prompt (str, optional): A prompt to use for the queries. Defaults to None.
query_prompt_name (str, optional): A name for the query prompt. Defaults to None.
corpus_prompt (str, optional): A prompt to use for the corpus. Defaults to None.
corpus_prompt_name (str, optional): A name for the corpus prompt. Defaults to None.
"""
super().__init__()
self.queries_ids = []
@@ -166,6 +174,11 @@ def __init__(
self.corpus_ids = list(corpus.keys())
self.corpus = [corpus[cid] for cid in self.corpus_ids]

self.query_prompt = query_prompt
self.query_prompt_name = query_prompt_name
self.corpus_prompt = corpus_prompt
self.corpus_prompt_name = corpus_prompt_name

self.relevant_docs = relevant_docs
self.corpus_chunk_size = corpus_chunk_size
self.mrr_at_k = mrr_at_k
@@ -294,6 +307,8 @@ def compute_metrices(
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
query_embeddings = model.encode(
self.queries,
prompt_name=self.query_prompt_name,
prompt=self.query_prompt,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_tensor=True,
@@ -316,6 +331,8 @@
):
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
prompt_name=self.corpus_prompt_name,
prompt=self.corpus_prompt,
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
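To illustrate the new prompt arguments above, here is a minimal sketch of how the evaluator might be configured once this branch is installed. The model name and the toy queries, corpus, and relevance judgments are placeholders, not part of the PR:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Placeholder model; any embedding model that expects instruction prompts could be used.
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Toy retrieval data, purely for illustration.
queries = {"q1": "how do prompts affect retrieval quality?"}
corpus = {
    "d1": "Prompts can steer instruction-tuned embedding models.",
    "d2": "An unrelated document about cooking pasta.",
}
relevant_docs = {"q1": {"d1"}}

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="toy-ir",
    # New in this PR: prompts prepended when encoding queries and corpus documents.
    query_prompt="query: ",
    corpus_prompt="passage: ",
)
print(evaluator(model))
```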
2 changes: 1 addition & 1 deletion sentence_transformers/models/Pooling.py
@@ -57,7 +57,7 @@ def __init__(
pooling_mode_mean_sqrt_len_tokens: bool = False,
pooling_mode_weightedmean_tokens: bool = False,
pooling_mode_lasttoken: bool = False,
- include_prompt=True,
+ include_prompt: bool = True,
) -> None:
super().__init__()

92 changes: 92 additions & 0 deletions sentence_transformers/sampler.py
@@ -213,6 +213,98 @@ def __len__(self) -> int:
return (len(self.dataset) + self.batch_size - 1) // self.batch_size


class MultipleNegativesBatchSampler(SetEpochMixin, BatchSampler):
def __init__(
self,
dataset: Dataset,
batch_size: int,
drop_last: bool,
valid_label_columns: list[str] = [],
generator: torch.Generator = None,
seed: int = 0,
) -> None:
"""
This sampler creates batches such that a sample's negatives are never present among the positives
already sampled into the batch. This is useful when using a loss with in-batch negatives, as it
avoids a positive also appearing as a negative for the same anchor.
Using this sampler also avoids positives becoming duplicated in the batch, as each sample's
hard negatives are part of that same sample.

Recommended for:
- :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss`
- :class:`~sentence_transformers.losses.CachedMultipleNegativesRankingLoss`
- :class:`~sentence_transformers.losses.MegaBatchMarginLoss`
- :class:`~sentence_transformers.losses.GISTEmbedLoss`
- :class:`~sentence_transformers.losses.CachedGISTEmbedLoss`

Args:
dataset (Dataset): The dataset to sample from.
batch_size (int): Number of samples per batch.
drop_last (bool): If True, drop the last incomplete batch if the dataset size
is not divisible by the batch size.
valid_label_columns (List[str]): List of column names to check for labels.
The first column name from ``valid_label_columns`` found in the dataset will
be used as the label column.
generator (torch.Generator, optional): Optional random number generator for shuffling
the indices.
seed (int, optional): Seed for the random number generator to ensure reproducibility.
"""
super().__init__(dataset, batch_size, drop_last)
if label_columns := set(dataset.column_names) & (set(valid_label_columns) | {"dataset_name"}):
dataset = dataset.remove_columns(label_columns)
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.generator = generator
self.seed = seed

def __iter__(self) -> Iterator[list[int]]:
"""
Iterate over the remaining, not-yet-yielded indices. For each index, check whether the sample's values are
already in the batch. If not, add the sample's values to the batch and keep going until the batch is full.
Once the batch is full, yield the batch indices and continue with the next batch.
"""
if self.generator and self.seed:
self.generator.manual_seed(self.seed + self.epoch)
anchor_column = self.dataset.column_names[0]
positive_column = self.dataset.column_names[1]
negative_columns = [self.dataset.column_names[i] for i in range(2, len(self.dataset.column_names))]

remaining_indices = set(torch.randperm(len(self.dataset), generator=self.generator).tolist())

while remaining_indices:
batch_values = set()
batch_indices = []
for index in remaining_indices:
sample = self.dataset[index]
# Ensure the sample's negatives (or its positive, if there are no negative columns) are not among the anchors/positives already in the batch
if negative_columns:
negatives = set([sample[negative_column] for negative_column in negative_columns])
if negatives & batch_values:
continue
elif sample[positive_column] in batch_values:
continue
batch_indices.append(index)
if len(batch_indices) == self.batch_size:
yield batch_indices
break

batch_values.add(sample[anchor_column])
batch_values.add(sample[positive_column])
else:
# NOTE: the for-loop finished without filling the batch; conflicting indices were skipped and remain in remaining_indices for the next pass
if not self.drop_last:
yield batch_indices

remaining_indices -= set(batch_indices)

def __len__(self) -> int:
if self.drop_last:
return len(self.dataset) // self.batch_size
else:
return (len(self.dataset) + self.batch_size - 1) // self.batch_size


class RoundRobinBatchSampler(SetEpochMixin, BatchSampler):
"""
Batch sampler that yields batches in a round-robin fashion from multiple batch samplers, until one is exhausted.
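For a rough sense of the batching behaviour the new MultipleNegativesBatchSampler above implements, here is a sketch that runs it on a toy triplet dataset, assuming this branch is installed. The column names and values are invented; the only layout the sampler assumes is the (anchor, positive, *negatives) column order:

```python
import torch
from datasets import Dataset

from sentence_transformers.sampler import MultipleNegativesBatchSampler  # added in this PR

# Toy (anchor, positive, negative) triplets. Row 1's negative ("paris") is also
# row 0's positive, so the sampler may defer one of them to a later batch.
dataset = Dataset.from_dict(
    {
        "anchor": ["capital of france", "capital of italy", "capital of spain", "capital of england"],
        "positive": ["paris", "rome", "madrid", "london"],
        "negative": ["london", "paris", "lisbon", "dublin"],
    }
)

sampler = MultipleNegativesBatchSampler(
    dataset=dataset,
    batch_size=2,
    drop_last=False,
    generator=torch.Generator(),
    seed=12,
)

# A row is skipped for the current batch if any of its negatives matches an anchor
# or positive already placed in that batch; skipped rows are retried in later batches.
for batch_indices in sampler:
    print(batch_indices)
```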
9 changes: 9 additions & 0 deletions sentence_transformers/trainer.py
@@ -24,6 +24,7 @@
from sentence_transformers.sampler import (
DefaultBatchSampler,
GroupByLabelBatchSampler,
MultipleNegativesBatchSampler,
NoDuplicatesBatchSampler,
ProportionalBatchSampler,
RoundRobinBatchSampler,
@@ -526,6 +527,14 @@ def get_batch_sampler(
batch_size=batch_size,
drop_last=drop_last,
)
if self.args.batch_sampler == BatchSamplers.MULTIPLE_NEGATIVES:
return MultipleNegativesBatchSampler(
dataset=dataset,
batch_size=batch_size,
drop_last=drop_last,
valid_label_columns=valid_label_columns,
generator=generator,
)

def get_multi_dataset_batch_sampler(
self,
1 change: 1 addition & 0 deletions sentence_transformers/training_args.py
@@ -74,6 +74,7 @@ class BatchSamplers(ExplicitEnum):
BATCH_SAMPLER = "batch_sampler"
NO_DUPLICATES = "no_duplicates"
GROUP_BY_LABEL = "group_by_label"
MULTIPLE_NEGATIVES = "multiple_negatives"


class MultiDatasetBatchSamplers(ExplicitEnum):
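Finally, a hedged sketch of selecting the new sampler through the trainer, which is what the `trainer.py` and `training_args.py` changes above enable. The model, training triplets, and output directory below are placeholders:

```python
from datasets import Dataset

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Placeholder (anchor, positive, negative) training triplets.
train_dataset = Dataset.from_dict(
    {
        "anchor": ["what is the capital of france?", "who wrote hamlet?"],
        "positive": ["Paris is the capital of France.", "Hamlet was written by Shakespeare."],
        "negative": ["Rome is the capital of Italy.", "Moby-Dick was written by Melville."],
    }
)

args = SentenceTransformerTrainingArguments(
    output_dir="tmp-multiple-negatives-demo",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    # New enum value from this PR; routes get_batch_sampler to MultipleNegativesBatchSampler.
    batch_sampler=BatchSamplers.MULTIPLE_NEGATIVES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=MultipleNegativesRankingLoss(model),
)
trainer.train()
```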