diff --git a/dataset.py b/dataset.py index 8129573..0943a15 100644 --- a/dataset.py +++ b/dataset.py @@ -25,6 +25,15 @@ def split_dataframe(df, batch_size): batch = df.iloc[i: i + batch_size] yield batch + @staticmethod + def recall(actual_matches: set, expected_matches: set): + # Recall@K : how many relevant items were returned against how many + # relevant items exist in the entire dataset. Defined as: + # truePositives / (truePositives + falseNegatives) + true_positives = len(expected_matches.intersection(actual_matches)) + recall = true_positives / len(expected_matches) + return recall + def __init__(self, name: str = "", cache_dir: str = ""): self.name = name self.cache = pathlib.Path(cache_dir) diff --git a/locustfile.py b/locustfile.py index 301c20f..0d05a23 100644 --- a/locustfile.py +++ b/locustfile.py @@ -86,6 +86,8 @@ def _(parser): "'if-count-mismatch': Populate if the number of items in the index differs from the " "number of items in th dataset, otherwise skip population. " "(default: %(default)s).") + pc_options.add_argument("--pinecone-recall", action=argparse.BooleanOptionalAction, + help="Report the Recall score (out of 100) instead of latency (reported on UI / console as 'latency').") pc_options.add_argument("--pinecone-dataset-cache", type=str, default=".dataset_cache", help="Path to directory to cache downloaded datasets (default: %(default)s).") pc_options.add_argument("--pinecone-throughput-per-user", type=float, default=0, @@ -265,6 +267,17 @@ def __init__(self, environment): # Wait until the datset has been loaded for this environment (Runner) environment.setup_dataset_greenlet.join() + # Check for compatibility between different options. # --pinecone-recall can only be used if the query set contains the # exact top-K vectors. 
+ if environment.parsed_options.pinecone_recall: + query = self._query_vector() + if "blob" not in query or "nearest_neighbors" not in query["blob"]: + logging.error( + "--pinecone-recall specified but query set does not " + "contain nearest neighbours - cannot calculate Recall") + sys.exit(1) + def wait_time(self): if self.target_throughput > 0: return constant_throughput(self.target_throughput)(self) @@ -274,7 +287,7 @@ def wait_time(self): @task def vectorQuery(self): self.client.query(name="Vector (Query only)", - q_vector=self._query_vector(), top_k=self.top_k) + query=self._query_vector(), top_k=self.top_k) @tag('fetch') @task @@ -293,7 +306,7 @@ def deleteById(self): def vectorMetadataQuery(self): metadata = dict(color=random.choices(word_list)) self.client.query(name="Vector + Metadata", - q_vector=self._query_vector(), + query=self._query_vector(), top_k=self.top_k, q_filter={"color": metadata['color'][0]}) @@ -301,7 +314,7 @@ def vectorMetadataQuery(self): @task def vectorNamespaceQuery(self): self.client.query(name="Vector + Namespace (namespace1)", - q_vector=self._query_vector(), + query=self._query_vector(), top_k=self.top_k, namespace="namespace1") @@ -310,7 +323,7 @@ def vectorNamespaceQuery(self): def vectorMetadataNamespaceQuery(self): metadata = dict(color=random.choices(word_list)) self.client.query(name="Vector + Metadata + Namespace (namespace1)", - q_vector=self._query_vector(), + query=self._query_vector(), top_k=self.top_k, q_filter={"color": metadata['color'][0]}, namespace="namespace1") @@ -322,8 +335,10 @@ def _query_vector(self): """ if not self.environment.dataset.queries.empty: record = self.environment.dataset.queries.sample(n=1).iloc[0] - return record['vector'] - return ((np.random.random_sample(self.dimensions) * 2.0) - 1.0).tolist() + else: + record = dict() + record["vector"] = ((np.random.random_sample(self.dimensions) * 2.0) - 1.0).tolist() + return record class PineconeRest(FastHttpUser): @@ -339,8 +354,8 @@ def 
__init__(self, environment): self.host = environment.host super().__init__(environment) - def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace=None): - json = {"vector": q_vector, + def query(self, name: str, query: dict, top_k: int, q_filter=None, namespace=None): + json = {"vector": query["vector"], "topK": top_k, "includeMetadata": includeMetadataValue, "includeValues": includeValuesValue} @@ -387,11 +402,11 @@ def __init__(self, environment, use_grpc : bool = False): self.index = self.pinecone.Index(host=self.host) - def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace=None): - args = {'vector': q_vector, - 'top_k': top_k, - 'include_values': includeValuesValue, - 'include_metadata': includeValuesValue} + def query(self, name: str, query: dict, top_k: int, q_filter=None, namespace=None): + args = {'vector': query['vector'], + 'top_k': top_k, + 'include_values': includeValuesValue, + 'include_metadata': includeValuesValue} if q_filter: args['filter'] = q_filter if namespace: @@ -404,10 +419,18 @@ def query(self, name: str, q_vector: list, top_k: int, q_filter=None, namespace= response_time = (stop - start) * 1000.0 match_count = len(result.matches) + if self.environment.parsed_options.pinecone_recall: + expected_neighbours = set(query['blob']['nearest_neighbors']) + actual_neighbours = set([r['id'] for r in result.matches]) + recall_n = Dataset.recall(actual_neighbours, expected_neighbours) + metric = recall_n * 100 + else: + metric = response_time + events.request.fire(request_type=self.request_type, name=name, response_length=match_count, - response_time=response_time) + response_time=metric) def fetch(self, id : str): start = time.time() diff --git a/tests/integration/test_requests.py b/tests/integration/test_requests.py index 9a014da..e133cbb 100644 --- a/tests/integration/test_requests.py +++ b/tests/integration/test_requests.py @@ -135,6 +135,25 @@ def test_dataset_load_empty_queries(self, index_host): 
"--pinecone-populate-index", "always", "--pinecone-dataset-ignore-queries"]) + def test_recall(self, index_host): + # Simple smoke-test for --pinecone-recall; check it is accepted + # and no errors occur. + test_dataset = "ANN_MNIST_d784_euclidean" + self.do_request(index_host, "sdk", 'query', 'Vector (Query only)', + extra_args=["--pinecone-dataset", test_dataset, + "--pinecone-dataset-limit", "10", + "--pinecone-recall"]) + + def test_recall_requires_nearest_neighbours(self, index_host): + # --pinecone-recall is incompatible with a dataset without + # nearest-neighbour information in the query set - e.g. when not + # specifying a dataset. + (proc, _, stderr) = spawn_locust(host=index_host, + mode="sdk", + timeout=10, + extra_args=["--tags", "query", "--pinecone-recall"]) + assert "cannot calculate Recall" in stderr + assert proc.returncode == 1 @pytest.mark.parametrize("mode", ["rest", "sdk", "sdk+grpc"]) class TestPineconeModes(TestPineconeBase):