Skip to content

Commit

Permalink
Merge pull request #50 from pinecone-io/recall
Browse files Browse the repository at this point in the history
Add --pinecone-recall option to report Recall
  • Loading branch information
daverigby authored Feb 29, 2024
2 parents 53bd554 + 3bf2952 commit 79a4bbe
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 14 deletions.
20 changes: 20 additions & 0 deletions dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,26 @@ def split_dataframe(df, batch_size):
batch = df.iloc[i: i + batch_size]
yield batch

@staticmethod
def recall(actual_matches: list, expected_matches: list):
# Recall@K : how many relevant items were returned against how many
# relevant items exist in the entire dataset. Defined as:
# truePositives / (truePositives + falseNegatives)
#
# To allow us to calculate Recall when the count of actual_matches from
# the query differs from expected_matches (e.g. when Query is
# executed with a top_k different to what the Dataset was built with),
# limit denominator to the minimum of the expected & actual.
# (This allows use to use a Dataset with say 100 exact nearest
# neighbours and still test the quality of results when querying at
# top_k==10 as-if only the 10 exact nearest neighbours had been
# provided).
relevent_size = min(len(actual_matches), len(expected_matches))
expected_matches = expected_matches[:relevent_size]
true_positives = len(set(expected_matches).intersection(set(actual_matches)))
recall = true_positives / relevent_size
return recall

def __init__(self, name: str = "", cache_dir: str = ""):
self.name = name
self.cache = pathlib.Path(cache_dir)
Expand Down
51 changes: 37 additions & 14 deletions locustfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ def _(parser):
"'if-count-mismatch': Populate if the number of items in the index differs from the "
"number of items in th dataset, otherwise skip population. "
"(default: %(default)s).")
pc_options.add_argument("--pinecone-recall", action=argparse.BooleanOptionalAction,
help="Report the Recall score (out of 100) instead of latency (reported on UI / console as 'latency'")
pc_options.add_argument("--pinecone-dataset-cache", type=str, default=".dataset_cache",
help="Path to directory to cache downloaded datasets (default: %(default)s).")
pc_options.add_argument("--pinecone-throughput-per-user", type=float, default=0,
Expand Down Expand Up @@ -265,6 +267,17 @@ def __init__(self, environment):
# Wait until the dataset has been loaded for this environment (Runner)
environment.setup_dataset_greenlet.join()

# Check for compatibility between different options.
# --pinecone-recall can only be used if the query set contains the
# exact top-K vectors.
if environment.parsed_options.pinecone_recall:
query = self._query_vector()
if "blob" not in query or "nearest_neighbors" not in query["blob"]:
logging.error(
"--pinecone-recall specified but query set does not "
"contain nearest neighbours - cannot calculate Recall")
sys.exit(1)

def wait_time(self):
if self.target_throughput > 0:
return constant_throughput(self.target_throughput)(self)
Expand All @@ -274,7 +287,7 @@ def wait_time(self):
@task
def vectorQuery(self):
self.client.query(name="Vector (Query only)",
q_vector=self._query_vector(), top_k=self.top_k)
query=self._query_vector(), top_k=self.top_k)

@tag('fetch')
@task
Expand All @@ -293,15 +306,15 @@ def deleteById(self):
def vectorMetadataQuery(self):
metadata = dict(color=random.choices(word_list))
self.client.query(name="Vector + Metadata",
q_vector=self._query_vector(),
query=self._query_vector(),
top_k=self.top_k,
q_filter={"color": metadata['color'][0]})

@tag('query_namespace')
@task
def vectorNamespaceQuery(self):
self.client.query(name="Vector + Namespace (namespace1)",
q_vector=self._query_vector(),
query=self._query_vector(),
top_k=self.top_k,
namespace="namespace1")

Expand All @@ -310,7 +323,7 @@ def vectorNamespaceQuery(self):
def vectorMetadataNamespaceQuery(self):
metadata = dict(color=random.choices(word_list))
self.client.query(name="Vector + Metadata + Namespace (namespace1)",
q_vector=self._query_vector(),
query=self._query_vector(),
top_k=self.top_k,
q_filter={"color": metadata['color'][0]},
namespace="namespace1")
Expand All @@ -322,8 +335,10 @@ def _query_vector(self):
"""
if not self.environment.dataset.queries.empty:
record = self.environment.dataset.queries.sample(n=1).iloc[0]
return record['vector']
return ((np.random.random_sample(self.dimensions) * 2.0) - 1.0).tolist()
else:
record = dict()
record["vector"] = ((np.random.random_sample(self.dimensions) * 2.0) - 1.0).tolist()
return record


class PineconeRest(FastHttpUser):
Expand All @@ -339,8 +354,8 @@ def __init__(self, environment):
self.host = environment.host
super().__init__(environment)

def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace=None):
json = {"vector": q_vector,
def query(self, name: str, query: dict, top_k: int, q_filter=None, namespace=None):
json = {"vector": query["vector"],
"topK": top_k,
"includeMetadata": includeMetadataValue,
"includeValues": includeValuesValue}
Expand Down Expand Up @@ -387,11 +402,11 @@ def __init__(self, environment, use_grpc : bool = False):

self.index = self.pinecone.Index(host=self.host)

def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace=None):
args = {'vector': q_vector,
'top_k': top_k,
'include_values': includeValuesValue,
'include_metadata': includeValuesValue}
def query(self, name: str, query: dict, top_k: int, q_filter=None, namespace=None):
args = {'vector': query['vector'],
'top_k': top_k,
'include_values': includeValuesValue,
'include_metadata': includeValuesValue}
if q_filter:
args['filter'] = q_filter
if namespace:
Expand All @@ -404,10 +419,18 @@ def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace=
response_time = (stop - start) * 1000.0
match_count = len(result.matches)

if self.environment.parsed_options.pinecone_recall:
expected_neighbours = query['blob']['nearest_neighbors']
actual_neighbours = [r['id'] for r in result.matches]
recall_n = Dataset.recall(actual_neighbours, expected_neighbours)
metric = recall_n * 100
else:
metric = response_time

events.request.fire(request_type=self.request_type,
name=name,
response_length=match_count,
response_time=response_time)
response_time=metric)

def fetch(self, id : str):
start = time.time()
Expand Down
19 changes: 19 additions & 0 deletions tests/integration/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,25 @@ def test_dataset_load_empty_queries(self, index_host):
"--pinecone-populate-index", "always",
"--pinecone-dataset-ignore-queries"])

def test_recall(self, index_host):
    # Smoke-test for --pinecone-recall: the option should be accepted
    # and the query request should complete without errors.
    args = [
        "--pinecone-dataset", "ANN_MNIST_d784_euclidean",
        "--pinecone-dataset-limit", "10",
        "--pinecone-recall",
    ]
    self.do_request(index_host, "sdk", 'query', 'Vector (Query only)',
                    extra_args=args)

def test_recall_requires_nearest_neighbours(self, index_host):
    # Without a dataset the query set has no nearest-neighbour
    # information, so --pinecone-recall must be rejected with an error.
    proc, _, stderr = spawn_locust(
        host=index_host,
        mode="sdk",
        timeout=10,
        extra_args=["--tags", "query", "--pinecone-recall"],
    )
    assert "cannot calculate Recall" in stderr
    assert proc.returncode == 1

@pytest.mark.parametrize("mode", ["rest", "sdk", "sdk+grpc"])
class TestPineconeModes(TestPineconeBase):
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,30 @@ def test_limit(self):

dataset.load(limit=limit, load_queries=False)
assert len(dataset.documents) == limit

def test_recall_equal(self):
    # recall() with actual and expected lists of the same length.
    cases = [
        (["1"], ["1"], 1.0),
        (["0"], ["1"], 0),
        (["1", "3"], ["1", "2"], 0.5),
        (["3", "1"], ["1", "2"], 0.5),
        (["1", "2"], ["2", "1"], 1),
        (["2", "3", "4", "5"], ["1", "2", "3", "4"], 0.75),
    ]
    for actual, expected, score in cases:
        assert Dataset.recall(actual, expected) == score

def test_recall_actual_fewer_expected(self):
    # recall() when there are fewer actual matches than expected ones -
    # i.e. the query ran with a lower top_k. Only the first k expected
    # matches should be considered.
    cases = [
        (["1"], ["1", "2"], 1.0),
        (["2"], ["1", "2"], 0),
        (["1"], ["1", "2", "3"], 1.0),
        (["1", "2"], ["1", "2", "3"], 1.0),
    ]
    for actual, expected, score in cases:
        assert Dataset.recall(actual, expected) == score

def test_recall_actual_more_expected(self):
    # recall() when there are more actual matches than expected ones -
    # i.e. the query ran with a higher top_k. The full expected_matches
    # list is still the comparison set.
    cases = [
        (["1", "2"], ["1"], 1.0),
        (["1", "2"], ["2"], 1.0),
        (["1", "3"], ["2"], 0),
        (["1", "2", "3"], ["3"], 1),
    ]
    for actual, expected, score in cases:
        assert Dataset.recall(actual, expected) == score

0 comments on commit 79a4bbe

Please sign in to comment.