diff --git a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py
index 6199683c0..21d15746f 100644
--- a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py
+++ b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py
@@ -158,10 +158,10 @@ def __init__(
             truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
             score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to {SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score}.
             main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
-            query_prompt (str, optional): A prompt to use for the queries. Defaults to None.
-            query_prompt_name (str, optional): A name for the query prompt. Defaults to None.
-            corpus_prompt (str, optional): A prompt to use for the corpus. Defaults to None.
-            corpus_prompt_name (str, optional): A name for the corpus prompt. Defaults to None.
+            query_prompt (str, optional): The prompt to be used when encoding the queries. Defaults to None.
+            query_prompt_name (str, optional): The name of the prompt to be used when encoding the queries. Defaults to None.
+            corpus_prompt (str, optional): The prompt to be used when encoding the corpus. Defaults to None.
+            corpus_prompt_name (str, optional): The name of the prompt to be used when encoding the corpus. Defaults to None.
         """
         super().__init__()
         self.queries_ids = []
@@ -290,7 +290,7 @@ def __call__(
         return metrics

     def compute_metrices(
-        self, model: SentenceTransformer, corpus_model=None, corpus_embeddings: Tensor = None
+        self, model: SentenceTransformer, corpus_model=None, corpus_embeddings: Tensor | None = None
     ) -> dict[str, float]:
         if corpus_model is None:
             corpus_model = model
@@ -309,8 +309,8 @@ def compute_metrices(
                 self.queries,
                 prompt_name=self.query_prompt_name,
                 prompt=self.query_prompt,
-                show_progress_bar=self.show_progress_bar,
                 batch_size=self.batch_size,
+                show_progress_bar=self.show_progress_bar,
                 convert_to_tensor=True,
             )

@@ -333,8 +333,8 @@ def compute_metrices(
                     self.corpus[corpus_start_idx:corpus_end_idx],
                     prompt_name=self.corpus_prompt_name,
                     prompt=self.corpus_prompt,
-                    show_progress_bar=False,
                     batch_size=self.batch_size,
+                    show_progress_bar=False,
                     convert_to_tensor=True,
                 )
             else:
diff --git a/sentence_transformers/evaluation/NanoBEIREvaluator.py b/sentence_transformers/evaluation/NanoBEIREvaluator.py
new file mode 100644
index 000000000..0b2972ffd
--- /dev/null
+++ b/sentence_transformers/evaluation/NanoBEIREvaluator.py
@@ -0,0 +1,415 @@
+from __future__ import annotations
+
+import logging
+import os
+from typing import TYPE_CHECKING, Callable, Literal
+
+import numpy as np
+from torch import Tensor
+from tqdm import tqdm
+
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.evaluation.InformationRetrievalEvaluator import InformationRetrievalEvaluator
+from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
+from sentence_transformers.similarity_functions import SimilarityFunction
+from sentence_transformers.util import cos_sim, dot_score, is_datasets_available
+
+if TYPE_CHECKING:
+    from sentence_transformers.SentenceTransformer import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+DatasetNameType = Literal[
+    "climatefever",
+    "dbpedia",
+    "fever",
+    "fiqa2018",
"hotpotqa", + "msmarco", + "nfcorpus", + "nq", + "quoraretrieval", + "scidocs", + "arguana", + "scifact", + "touche2020", +] + + +dataset_name_to_id = { + "climatefever": "zeta-alpha-ai/NanoClimateFEVER", + "dbpedia": "zeta-alpha-ai/NanoDBPedia", + "fever": "zeta-alpha-ai/NanoFEVER", + "fiqa2018": "zeta-alpha-ai/NanoFiQA2018", + "hotpotqa": "zeta-alpha-ai/NanoHotpotQA", + "msmarco": "zeta-alpha-ai/NanoMSMARCO", + "nfcorpus": "zeta-alpha-ai/NanoNFCorpus", + "nq": "zeta-alpha-ai/NanoNQ", + "quoraretrieval": "zeta-alpha-ai/NanoQuoraRetrieval", + "scidocs": "zeta-alpha-ai/NanoSCIDOCS", + "arguana": "zeta-alpha-ai/NanoArguAna", + "scifact": "zeta-alpha-ai/NanoSciFact", + "touche2020": "zeta-alpha-ai/NanoTouche2020", +} + +dataset_name_to_human_readable = { + "climatefever": "ClimateFEVER", + "dbpedia": "DBPedia", + "fever": "FEVER", + "fiqa2018": "FiQA2018", + "hotpotqa": "HotpotQA", + "msmarco": "MSMARCO", + "nfcorpus": "NFCorpus", + "nq": "NQ", + "quoraretrieval": "QuoraRetrieval", + "scidocs": "SCIDOCS", + "arguana": "ArguAna", + "scifact": "SciFact", + "touche2020": "Touche2020", +} + + +class NanoBEIREvaluator(SentenceEvaluator): + """ + This class evaluates the performance of a SentenceTransformer Model on the NanoBEIR collection of datasets. + + The collection is a set of datasets based on the BEIR collection, but with a significantly smaller size, so it can be used for quickly evaluating the retrieval performance of a model before commiting to a full evaluation. + The datasets are available on HuggingFace at https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6 + The Evaluator will return the same metrics as the InformationRetrievalEvaluator (i.e., MRR, nDCG, Recall@k), for each dataset and on average. + + + Example: + :: + + from sentence_transformers import SentenceTransformer + from sentence_transformers.evaluation import NanoBEIREvaluator + + # Load a model + model = SentenceTransformer('all-mpnet-base-v2') + + datasets = ["QuoraRetrieval", "MSMARCO"] + query_prompts = { + "QuoraRetrieval": "Instruct: Given a question, retrieve questions that are semantically equivalent to the given question\nQuery: ", + "MSMARCO": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: " + } + + evaluator = NanoBEIREvaluator( + dataset_names=datasets, + name="NanoBEIR", + query_prompts=query_prompts, + ) + + results = evaluator(model) + ''' + NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset: + Evaluating NanoBeIRNanoQuoraRetrieval + Evaluating NanoBeIRNanoMSMARCO + + Average Queries: 50.0 + Average Corpus: 5044.5 + + Aggregated for Score Function: cosine + Accuracy@1: 39.00% + Accuracy@3: 57.00% + Accuracy@5: 66.00% + Accuracy@10: 77.00% + Precision@1: 39.00% + Recall@1: 34.03% + Precision@3: 20.67% + Recall@3: 54.07% + Precision@5: 15.00% + Recall@5: 64.27% + Precision@10: 8.90% + Recall@10: 75.97% + MRR@10: 0.5004 + NDCG@10: 0.5513 + Aggregated for Score Function: dot + Accuracy@1: 39.00% + Accuracy@3: 57.00% + Accuracy@5: 66.00% + Accuracy@10: 77.00% + Precision@1: 39.00% + Recall@1: 34.03% + Precision@3: 20.67% + Recall@3: 54.07% + Precision@5: 15.00% + Recall@5: 64.27% + Precision@10: 8.90% + Recall@10: 75.97% + MRR@10: 0.5004 + NDCG@10: 0.5513 + ''' + logger.info(evaluator.primary_metric) + # => "cosine_ndcg@10" + logger.info(results["mean"][evaluator.primary_metric]) + # => 0.5512516989358924 + """ + + def __init__( + self, + dataset_names: list[DatasetNameType] | None = None, + mrr_at_k: list[int] = 
+        mrr_at_k: list[int] = [10],
+        ndcg_at_k: list[int] = [10],
+        accuracy_at_k: list[int] = [1, 3, 5, 10],
+        precision_recall_at_k: list[int] = [1, 3, 5, 10],
+        map_at_k: list[int] = [100],
+        show_progress_bar: bool = False,
+        batch_size: int = 32,
+        write_csv: bool = True,
+        truncate_dim: int | None = None,
+        score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] = {
+            SimilarityFunction.COSINE.value: cos_sim,
+            SimilarityFunction.DOT_PRODUCT.value: dot_score,
+        },  # Score function, higher=more similar
+        main_score_function: str | SimilarityFunction | None = None,
+        aggregate_fn: Callable[[list[float]], float] = np.mean,
+        aggregate_key: str = "mean",
+        query_prompts: str | dict[str, str] | None = None,
+        corpus_prompts: str | dict[str, str] | None = None,
+    ):
+        """
+        Initializes the NanoBEIREvaluator.
+
+        Args:
+            dataset_names (List[str]): The names of the datasets to evaluate on.
+            mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
+            ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
+            accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
+            precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
+            map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
+            show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
+            batch_size (int): The batch size for evaluation. Defaults to 32.
+            write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
+            truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
+            score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to {SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score}.
+            main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
+            aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
+            aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".
+            query_prompts (str | dict[str, str], optional): The prompts to add to the queries. If a string, will add the same prompt to all queries. If a dict, expects that all datasets in dataset_names are keys.
+            corpus_prompts (str | dict[str, str], optional): The prompts to add to the corpus. If a string, will add the same prompt to the entire corpus. If a dict, expects that all datasets in dataset_names are keys.
+        """
+        super().__init__()
+        if dataset_names is None:
+            dataset_names = list(dataset_name_to_id.keys())
+        self.dataset_names = dataset_names
+        self.aggregate_fn = aggregate_fn
+        self.aggregate_key = aggregate_key
+        self.write_csv = write_csv
+        self.query_prompts = query_prompts
+        self.corpus_prompts = corpus_prompts
+        self.show_progress_bar = show_progress_bar
+        self.write_csv = write_csv
+        self.score_functions = score_functions
+        self.score_function_names = sorted(list(self.score_functions.keys()))
+        self.main_score_function = main_score_function
+        self.truncate_dim = truncate_dim
+        self.name = f"NanoBEIR_{aggregate_key}"
+        if self.truncate_dim:
+            self.name += f"_{self.truncate_dim}"
+
+        self.mrr_at_k = mrr_at_k
+        self.ndcg_at_k = ndcg_at_k
+        self.accuracy_at_k = accuracy_at_k
+        self.precision_recall_at_k = precision_recall_at_k
+        self.map_at_k = map_at_k
+
+        self._validate_dataset_names()
+        self._validate_prompts()
+
+        ir_evaluator_kwargs = {
+            "mrr_at_k": mrr_at_k,
+            "ndcg_at_k": ndcg_at_k,
+            "accuracy_at_k": accuracy_at_k,
+            "precision_recall_at_k": precision_recall_at_k,
+            "map_at_k": map_at_k,
+            "show_progress_bar": show_progress_bar,
+            "batch_size": batch_size,
+            "write_csv": write_csv,
+            "truncate_dim": truncate_dim,
+            "score_functions": score_functions,
+            "main_score_function": main_score_function,
+        }
+
+        self.evaluators = [self._load_dataset(name, **ir_evaluator_kwargs) for name in self.dataset_names]
+
+        self.csv_file: str = f"NanoBEIR_evaluation_{aggregate_key}_results.csv"
+        self.csv_headers = ["epoch", "steps"]
+
+        for score_name in self.score_function_names:
+            for k in accuracy_at_k:
+                self.csv_headers.append(f"{score_name}-Accuracy@{k}")
+
+            for k in precision_recall_at_k:
+                self.csv_headers.append(f"{score_name}-Precision@{k}")
+                self.csv_headers.append(f"{score_name}-Recall@{k}")
+
+            for k in mrr_at_k:
+                self.csv_headers.append(f"{score_name}-MRR@{k}")
+
+            for k in ndcg_at_k:
+                self.csv_headers.append(f"{score_name}-NDCG@{k}")
+
+            for k in map_at_k:
+                self.csv_headers.append(f"{score_name}-MAP@{k}")
+
+    def __call__(
+        self, model: SentenceTransformer, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs
+    ) -> dict[str, float]:
+        per_metric_results = {}
+        per_dataset_results = {}
+        if epoch != -1:
+            if steps == -1:
+                out_txt = f" after epoch {epoch}"
+            else:
+                out_txt = f" in epoch {epoch} after {steps} steps"
+        else:
+            out_txt = ""
+        if self.truncate_dim is not None:
+            out_txt += f" (truncated to {self.truncate_dim})"
+        logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:")
+        for evaluator in tqdm(self.evaluators, desc="Evaluating datasets", disable=not self.show_progress_bar):
+            logger.info(f"Evaluating {evaluator.name}")
+            evaluation = evaluator(model, output_path, epoch, steps)
+            for k in evaluation:
+                if self.truncate_dim:
+                    dataset, _, metric = k.split("_", maxsplit=2)
+                else:
+                    dataset, metric = k.split("_", maxsplit=1)
+                if metric not in per_metric_results:
+                    per_metric_results[metric] = []
+                per_dataset_results[dataset + "_" + metric] = evaluation[k]
+                per_metric_results[metric].append(evaluation[k])
+
+        agg_results = {}
+        for metric in per_metric_results:
+            agg_results[metric] = self.aggregate_fn(per_metric_results[metric])
+            per_dataset_results[self.aggregate_key + "_" + metric] = agg_results[metric]
+
+        if output_path is not None and self.write_csv:
+            csv_path = os.path.join(output_path, self.csv_file)
+            if not os.path.isfile(csv_path):
+                fOut = open(csv_path, mode="w", encoding="utf-8")
+                fOut.write(",".join(self.csv_headers))
+                fOut.write("\n")
+
+            else:
+                fOut = open(csv_path, mode="a", encoding="utf-8")
+
+            output_data = [epoch, steps]
+            for name in self.score_function_names:
+                for k in self.accuracy_at_k:
+                    output_data.append(agg_results[f"{name}_accuracy@{k}"])
+
+                for k in self.precision_recall_at_k:
+                    output_data.append(agg_results[f"{name}_precision@{k}"])
+                    output_data.append(agg_results[f"{name}_recall@{k}"])
+
+                for k in self.mrr_at_k:
+                    output_data.append(agg_results[f"{name}_mrr@{k}"])
+
+                for k in self.ndcg_at_k:
+                    output_data.append(agg_results[f"{name}_ndcg@{k}"])
+
+                for k in self.map_at_k:
+                    output_data.append(agg_results[f"{name}_map@{k}"])
+
+            fOut.write(",".join(map(str, output_data)))
+            fOut.write("\n")
+            fOut.close()
+
+        if not self.primary_metric:
+            if self.main_score_function is None:
+                score_function = max(
+                    [(name, agg_results[f"{name}_ndcg@{max(self.ndcg_at_k)}"]) for name in self.score_function_names],
+                    key=lambda x: x[1],
+                )[0]
+                self.primary_metric = f"{self.aggregate_key}_{score_function}_ndcg@{max(self.ndcg_at_k)}"
+            else:
+                self.primary_metric = (
+                    f"{self.aggregate_key}_{self.main_score_function.value}_ndcg@{max(self.ndcg_at_k)}"
+                )
+
+        self.store_metrics_in_model_card_data(model, agg_results)
+
+        avg_queries = np.mean([len(evaluator.queries) for evaluator in self.evaluators])
+        avg_corpus = np.mean([len(evaluator.corpus) for evaluator in self.evaluators])
+        logger.info(f"\nAverage Queries: {avg_queries}")
+        logger.info(f"Average Corpus: {avg_corpus}\n")
+
+        for name in self.score_function_names:
+            logger.info(f"Aggregated for Score Function: {name}")
+            for k in self.accuracy_at_k:
+                logger.info("Accuracy@{}: {:.2f}%".format(k, agg_results[f"{name}_accuracy@{k}"] * 100))
+
+            for k in self.precision_recall_at_k:
+                logger.info("Precision@{}: {:.2f}%".format(k, agg_results[f"{name}_precision@{k}"] * 100))
+                logger.info("Recall@{}: {:.2f}%".format(k, agg_results[f"{name}_recall@{k}"] * 100))
+
+            for k in self.mrr_at_k:
+                logger.info("MRR@{}: {:.4f}".format(k, agg_results[f"{name}_mrr@{k}"]))
+
+            for k in self.ndcg_at_k:
+                logger.info("NDCG@{}: {:.4f}".format(k, agg_results[f"{name}_ndcg@{k}"]))
+        return per_dataset_results
+
+    def _get_human_readable_name(self, dataset_name: DatasetNameType) -> str:
+        human_readable_name = f"Nano{dataset_name_to_human_readable[dataset_name.lower()]}"
+        if self.truncate_dim is not None:
+            human_readable_name += f"_{self.truncate_dim}"
+        return human_readable_name
+
+    def _load_dataset(self, dataset_name: DatasetNameType, **ir_evaluator_kwargs) -> InformationRetrievalEvaluator:
+        if not is_datasets_available():
+            raise ValueError("datasets is not available. Please install it to use the NanoBEIREvaluator.")
+        from datasets import load_dataset
+
+        dataset_path = dataset_name_to_id[dataset_name.lower()]
+        corpus = load_dataset(dataset_path, "corpus", split="train")
+        queries = load_dataset(dataset_path, "queries", split="train")
+        qrels = load_dataset(dataset_path, "qrels", split="train")
+        corpus_dict = {sample["_id"]: sample["text"] for sample in corpus if len(sample["text"]) > 0}
+        queries_dict = {sample["_id"]: sample["text"] for sample in queries if len(sample["text"]) > 0}
+        qrels_dict = {}
+        for sample in qrels:
+            if sample["query-id"] not in qrels_dict:
+                qrels_dict[sample["query-id"]] = set()
+            qrels_dict[sample["query-id"]].add(sample["corpus-id"])
+
+        if self.query_prompts is not None:
+            ir_evaluator_kwargs["query_prompt"] = self.query_prompts.get(dataset_name, None)
+        if self.corpus_prompts is not None:
+            ir_evaluator_kwargs["corpus_prompt"] = self.corpus_prompts.get(dataset_name, None)
+        human_readable_name = self._get_human_readable_name(dataset_name)
+        return InformationRetrievalEvaluator(
+            queries=queries_dict,
+            corpus=corpus_dict,
+            relevant_docs=qrels_dict,
+            name=human_readable_name,
+            **ir_evaluator_kwargs,
+        )
+
+    def _validate_dataset_names(self):
+        if missing_datasets := [
+            dataset_name for dataset_name in self.dataset_names if dataset_name.lower() not in dataset_name_to_id
+        ]:
+            raise ValueError(
+                f"Dataset(s) {missing_datasets} not found in the NanoBEIR collection. "
+                f"Valid dataset names are: {list(dataset_name_to_id.keys())}"
+            )
+
+    def _validate_prompts(self):
+        error_msg = ""
+        if self.query_prompts is not None:
+            if missing_query_prompts := [
+                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.query_prompts
+            ]:
+                error_msg += f"The following datasets are missing query prompts: {missing_query_prompts}\n"
+
+        if self.corpus_prompts is not None:
+            if missing_corpus_prompts := [
+                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.corpus_prompts
+            ]:
+                error_msg += f"The following datasets are missing corpus prompts: {missing_corpus_prompts}\n"
+
+        if error_msg:
+            raise ValueError(error_msg.strip())
diff --git a/sentence_transformers/evaluation/__init__.py b/sentence_transformers/evaluation/__init__.py
index da05aa40e..821c9c4a3 100644
--- a/sentence_transformers/evaluation/__init__.py
+++ b/sentence_transformers/evaluation/__init__.py
@@ -6,6 +6,7 @@
 from .LabelAccuracyEvaluator import LabelAccuracyEvaluator
 from .MSEEvaluator import MSEEvaluator
 from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame
+from .NanoBEIREvaluator import NanoBEIREvaluator
 from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator
 from .RerankingEvaluator import RerankingEvaluator
 from .SentenceEvaluator import SentenceEvaluator
@@ -28,4 +29,5 @@
     "TranslationEvaluator",
     "TripletEvaluator",
     "RerankingEvaluator",
+    "NanoBEIREvaluator",
 ]
diff --git a/sentence_transformers/model_card.py b/sentence_transformers/model_card.py
index 99da35d96..2f98562a4 100644
--- a/sentence_transformers/model_card.py
+++ b/sentence_transformers/model_card.py
@@ -816,7 +816,7 @@ def try_to_pure_python(value: Any) -> Any:
                     task_name=description,
                     task_type=description.lower().replace(" ", "-"),
                     dataset_type=dataset_name or "unknown",
-                    dataset_name=dataset_name.replace("_", " ").replace("-", " ") or "Unknown",
+                    dataset_name=dataset_name.replace("_", " ").replace("-", " ") if dataset_name else "Unknown",
                     metric_name=metric_key.replace("_", " ").title(),
                     metric_type=metric_key,
                     metric_value=metric_value,
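
Usage sketch for the new evaluator (illustrative only, not part of the patch): it assumes the datasets package is installed and uses an arbitrary small checkpoint; only the API introduced in this diff is called::

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.evaluation import NanoBEIREvaluator

    # Any Sentence Transformers checkpoint works; all-MiniLM-L6-v2 is only an example.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Restrict the run to two of the thirteen NanoBEIR subsets to keep it quick.
    evaluator = NanoBEIREvaluator(dataset_names=["msmarco", "nfcorpus"])
    results = evaluator(model)

    # primary_metric is the aggregated nDCG@10 for the best score function, e.g. "mean_cosine_ndcg@10";
    # per-dataset keys such as "NanoMSMARCO_cosine_ndcg@10" are also present in the returned dict.
    print(evaluator.primary_metric, results[evaluator.primary_metric])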