diff --git a/openai_vector/challenges/default.json b/openai_vector/challenges/default.json
index 79e0ae157..a23158302 100644
--- a/openai_vector/challenges/default.json
+++ b/openai_vector/challenges/default.json
@@ -42,7 +42,7 @@
         "retry-until-success": true,
         "include-in-reporting": false
       }
-    },
+    }
     {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
     {
       "name": "post-ingest-sleep",
@@ -50,53 +50,48 @@
         "operation-type": "sleep",
         "duration": {{ post_ingest_sleep_duration|default(30) }}
       }
-    },
+    }
     {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
+    {%- for i in range(p_search_ops|length) %},
     {
-      "name": "standalone-search-knn-10-100-single-client",
-      "operation": "knn-search-10-100",
-      "warmup-iterations": 100,
-      "iterations": {{ standalone_search_iterations | default(10000) | int }}
-    },
-    {
-      "name": "standalone-knn-search-100-1000-single-client",
-      "operation": "knn-search-100-1000",
-      "warmup-iterations": 100,
+      {%- if p_search_ops[i][2] > 0 -%}
+      "name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-single-client",
+      "operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
+      {%- else -%}
+      "name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-single-client",
+      "operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
+      {%- endif -%},
+      "warmup-iterations": 1000,
       "iterations": {{ standalone_search_iterations | default(10000) | int }}
     },
     {
-      "name": "standalone-search-knn-10-100-multiple-clients",
-      "operation": "knn-search-10-100",
-      "warmup-iterations": 100,
+      {%- if p_search_ops[i][2] > 0 -%}
+      "name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-multiple-clients",
+      "operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
+      {%- else -%}
+      "name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-multiple-clients",
+      "operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
+      {%- endif -%},
+      "warmup-iterations": 1000,
       "clients": {{ standalone_search_clients | default(8) | int }},
       "iterations": {{ standalone_search_iterations | default(10000) | int }}
-    },
+    }
+    {%- endfor %},
     {
-      "name": "standalone-search-knn-100-1000-multiple-clients",
-      "operation": "knn-search-100-1000",
-      "warmup-iterations": 100,
-      "clients": {{ standalone_search_clients | default(8) | int }},
-      "iterations": {{ standalone_search_iterations | default(10000) | int }}
-    },
+      "name": "parallel-documents-indexing-bulk",
+      "operation": "parallel-documents-indexing",
+      "warmup-time-period": 60,
+      "clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
+      "target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
+    }
+    {%- for i in range(p_search_ops|length) %},
     {
-      "parallel": {
-        "tasks": [
-          {
-            "name": "parallel-documents-indexing-bulk",
-            "operation": "parallel-documents-indexing",
-            "clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
-            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
-            "target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
-          },
-          {
-            "name": "parallel-documents-indexing-search-knn-10-100",
-            "operation": "knn-search-10-100",
-            "clients": {{ parallel_indexing_search_clients | default(3) | int }},
-            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
-            "target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
-          }
-        ]
-      }
+      {%- if p_search_ops[i][2] > 0 -%}
+      "operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
+      {%- else -%}
+      "operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
+      {%- endif -%}
     }
+    {%- endfor %}
   ]
 }
diff --git a/openai_vector/index-vectors-only-mapping-with-docid-mapping.json b/openai_vector/index-vectors-only-mapping-with-docid-mapping.json
new file mode 100644
index 000000000..af5e1b66e
--- /dev/null
+++ b/openai_vector/index-vectors-only-mapping-with-docid-mapping.json
@@ -0,0 +1,29 @@
+{
+  "settings": {
+    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
+    {% if preload_pagecache %}
+    "index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
+    {% endif %}
+    "index.number_of_shards": {{number_of_shards | default(1)}},
+    "index.number_of_replicas": {{number_of_replicas | default(0)}}
+    {%- endif -%}{# non-serverless-index-settings-marker-end #}
+  },
+  "mappings": {
+    "dynamic": false,
+    "properties": {
+      "docid": {
+        "type": "keyword"
+      },
+      "emb": {
+        "type": "dense_vector",
+        "element_type": "float",
+        "dims": 1536,
+        "index": true,
+        "similarity": "dot_product",
+        "index_options": {
+          "type": {{ vector_index_type | default("hnsw") | tojson }}
+        }
+      }
+    }
+  }
+}
diff --git a/openai_vector/index-vectors-only-mapping.json b/openai_vector/index-vectors-only-mapping.json
index 77ebe0e3a..a619bad02 100644
--- a/openai_vector/index-vectors-only-mapping.json
+++ b/openai_vector/index-vectors-only-mapping.json
@@ -2,7 +2,7 @@
   "settings": {
     {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
     {% if preload_pagecache %}
-    "index.store.preload": [ "vec", "vex", "vem"],
+    "index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
     {% endif %}
     "index.number_of_shards": {{number_of_shards | default(1)}},
     "index.number_of_replicas": {{number_of_replicas | default(0)}}
diff --git a/openai_vector/index-vectors-with-text-mapping.json b/openai_vector/index-vectors-with-text-mapping.json
index 3d07ae258..a0ea414ac 100644
--- a/openai_vector/index-vectors-with-text-mapping.json
+++ b/openai_vector/index-vectors-with-text-mapping.json
@@ -2,7 +2,7 @@
   "settings": {
     {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
     {% if preload_pagecache %}
-    "index.store.preload": [ "vec", "vex", "vem"],
+    "index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
     {% endif %}
     "index.number_of_shards": {{number_of_shards | default(1)}},
     "index.number_of_replicas": {{number_of_replicas | default(0)}}
diff --git a/openai_vector/open_ai_true_top_1000.json.bz2 b/openai_vector/open_ai_true_top_1000.json.bz2
new file mode 100644
index 000000000..f2bf17de3
Binary files /dev/null and b/openai_vector/open_ai_true_top_1000.json.bz2 differ
diff --git a/openai_vector/operations/default.json b/openai_vector/operations/default.json
index ebcf3219b..d9b18bb60 100644
--- a/openai_vector/operations/default.json
+++ b/openai_vector/operations/default.json
@@ -24,18 +24,31 @@
   "corpora": "openai-parallel-indexing",
   "bulk-size": {{parallel_indexing_bulk_size | default(500)}},
   "ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
-},
+}
+{%- set p_search_ops = (search_ops | default([(10, 20, 0), (10, 20, 1), (10, 20, 2), (10, 50, 1), (10, 50, 2), (10, 100, 1), (100, 120, 1), (100, 120, 2), (100, 200, 1), (100, 200, 2), (100, 500, 1), (100, 500, 2)]))%}
+{%- for i in range(p_search_ops|length) %},
 {
-  "name": "knn-search-10-100",
+  {%- if p_search_ops[i][2] > 0 -%}
+  "name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
+  {%- else -%}
+  "name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
+  {%- endif -%},
   "operation-type": "search",
   "param-source": "knn-param-source",
-  "k": 10,
-  "num-candidates": 100
+  "k": {{p_search_ops[i][0]}},
+  "num-candidates": {{p_search_ops[i][1]}},
+  "num-rescore": {{p_search_ops[i][2]}}
 },
 {
-  "name": "knn-search-100-1000",
-  "operation-type": "search",
-  "param-source": "knn-param-source",
-  "k": 100,
-  "num-candidates": 1000
+  {%- if p_search_ops[i][2] > 0 -%}
+  "name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
+  {%- else -%}
+  "name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
+  {%- endif -%},
+  "operation-type": "knn-recall",
+  "param-source": "knn-recall-param-source",
+  "k": {{p_search_ops[i][0]}},
+  "num-candidates": {{p_search_ops[i][1]}},
+  "num-rescore": {{p_search_ops[i][2]}}
 }
+{%- endfor %}
\ No newline at end of file
diff --git a/openai_vector/track.py b/openai_vector/track.py
index 09f44b4ab..803572d55 100644
--- a/openai_vector/track.py
+++ b/openai_vector/track.py
@@ -1,9 +1,21 @@
 import bz2
 import json
 import os
+import statistics
+import logging
+from typing import Any, List
 
+logger = logging.getLogger(__name__)
 QUERIES_FILENAME: str = "queries.json.bz2"
+TRUE_KNN_FILENAME: str = "open_ai_true_top_1000.json.bz2"
 
+def compute_percentile(data: List[Any], percentile):
+    size = len(data)
+    if size <= 0:
+        return None
+    sorted_data = sorted(data)
+    index = int(round(percentile * size / 100)) - 1
+    return sorted_data[max(min(index, size - 1), 0)]
 
 class KnnParamSource:
     def __init__(self, track, params, **kwargs):
@@ -32,24 +44,133 @@ def partition(self, partition_index, total_partitions):
         return self
 
     def params(self):
         result = {"index": self._index_name, "cache": self._params.get("cache", False), "size": self._params.get("k", 10)}
-
-        result["body"] = {
-            "knn": {
-                "field": "emb",
-                "query_vector": self._queries[self._iters],
-                "k": self._params.get("k", 10),
-                "num_candidates": self._params.get("num-candidates", 50),
-            },
-            "_source": False,
-        }
+        num_candidates = self._params.get("num-candidates", 50)
+        num_rescore = self._params.get("num-rescore", 0)
+        query_vec = self._queries[self._iters]
+        knn_query = {"knn":{
+            "field": "emb",
+            "query_vector": query_vec,
+            "k": result["size"],
+            "num_candidates": num_candidates,
+        }}
         if "filter" in self._params:
-            result["body"]["knn"]["filter"] = self._params["filter"]
-
+            knn_query["knn"]["filter"] = self._params["filter"]
+        if num_rescore > 0:
+            knn_query["knn"]["rescore_vector"] = {"num_candidates_factor": num_rescore}
+        result["body"] = {"query": knn_query, "_source": False}
         self._iters += 1
         if self._iters >= self._maxIters:
             self._iters = 0
         return result
 
+class KnnVectorStore:
+    def __init__(self):
+        cwd = os.path.dirname(__file__)
+        self._query_nearest_neighbor_docids = []
+        self._queries = []
+        with bz2.open(os.path.join(cwd, TRUE_KNN_FILENAME), "r") as queries_file:
+            for docids in queries_file:
+                self._query_nearest_neighbor_docids.append(json.loads(docids))
+        with bz2.open(os.path.join(cwd, QUERIES_FILENAME), "r") as queries_file:
+            for vector_query in queries_file:
+                self._queries.append(json.loads(vector_query))
+
+    def get_query_vectors(self) -> List[List[float]]:
+        return self._queries
+
+    def get_neighbors_for_query(self, query_id: int, size: int) -> List[str]:
+        if (query_id < 0) or (query_id >= len(self._query_nearest_neighbor_docids)):
+            raise ValueError(f"Unknown query with id: '{query_id}' provided")
+        if (size < 0) or (size > len(self._query_nearest_neighbor_docids[query_id])):
+            raise ValueError(f"Invalid size: '{size}' provided for query with id: '{query_id}'")
+        return self._query_nearest_neighbor_docids[query_id][:size]
+
+class KnnRecallParamSource:
+    def __init__(self, track, params, **kwargs):
+        if len(track.indices) == 1:
+            default_index = track.indices[0].name
+        else:
+            default_index = "_all"
+
+        self._index_name = params.get("index", default_index)
+        self._cache = params.get("cache", False)
+        self._params = params
+        self.infinite = True
+        cwd = os.path.dirname(__file__)
+
+    def partition(self, partition_index, total_partitions):
+        return self
+
+    def params(self):
+        return {
+            "index": self._index_name,
+            "cache": self._params.get("cache", False),
+            "size": self._params.get("k", 10),
+            "num_candidates": self._params.get("num-candidates", 50),
+            "num_rescore": self._params.get("num-rescore", 0),
+            "knn_vector_store": KnnVectorStore()
+        }
+
+# Used in tandem with the KnnRecallParamSource.
+# reads the queries, executes knn search and compares the results with the true nearest neighbors
+class KnnRecallRunner:
+
+    def get_knn_query(self, query_vec, k, num_candidates, num_rescore):
+        knn_query = {"knn":{
+            "field": "emb",
+            "query_vector": query_vec,
+            "k": k,
+            "num_candidates": num_candidates,
+        }}
+        if num_rescore > 0:
+            knn_query["knn"]["rescore_vector"] = {"num_candidates_factor": num_rescore}
+        return {"query": knn_query, "_source": False}
+
+
+    async def __call__(self, es, params):
+        k = params["size"]
+        num_candidates = params["num_candidates"]
+        index = params["index"]
+        request_cache = params["cache"]
+        recall_total = 0
+        exact_total = 0
+        min_recall = k
+        max_recall = 0
+
+        knn_vector_store: KnnVectorStore = params["knn_vector_store"]
+        for query_id, query_vector in enumerate(knn_vector_store.get_query_vectors()):
+            knn_body = self.get_knn_query(query_vector, k, num_candidates, params["num_rescore"])
+            knn_body["_source"] = False
+            knn_body["docvalue_fields"] = ["docid"]
+            knn_result = await es.search(
+                body=knn_body,
+                index=index,
+                request_cache=request_cache,
+                size=k,
+            )
+            knn_hits = [hit["fields"]["docid"][0] for hit in knn_result["hits"]["hits"]]
+            true_neighbors = knn_vector_store.get_neighbors_for_query(query_id, k)[:k]
+            current_recall = len(set(knn_hits).intersection(set(true_neighbors)))
+            recall_total += current_recall
+            exact_total += len(true_neighbors)
+            min_recall = min(min_recall, current_recall)
+            max_recall = max(max_recall, current_recall)
+        to_return = {
+            "avg_recall": recall_total / exact_total,
+            "min_recall": min_recall,
+            "max_recall": max_recall,
+            "k": k,
+            "num_candidates": num_candidates,
+            "num_rescore": params["num_rescore"]
+        }
+        logger.info(f"Recall results: {to_return}")
+        return (to_return)
+
+    def __repr__(self, *args, **kwargs):
+        return "knn-recall"
+
 def register(registry):
     registry.register_param_source("knn-param-source", KnnParamSource)
+    registry.register_param_source("knn-recall-param-source", KnnRecallParamSource)
+    registry.register_runner("knn-recall", KnnRecallRunner(), async_runner=True)
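
Note for anyone trying the parameterized operations locally: the search_ops triples of (k, num-candidates, num-rescore) consumed by the templates above can be overridden through Rally's track parameters. The file name and the values below are purely illustrative and not part of this change; a minimal sketch assuming a standard esrally setup:

    {
      "search_ops": [[10, 100, 0], [10, 100, 2], [100, 1000, 1]],
      "standalone_search_iterations": 1000
    }

Saved as, say, params.json, this could be passed with something like: esrally race --track=openai_vector --track-params=params.json. Triples with a num-rescore of 0 keep the plain kNN query, while values above 0 add rescore_vector.num_candidates_factor to the generated query body and a "-<num-rescore>" suffix to the operation and task names.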