Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to list details of available datasets, rename =help to =list #24

Merged
merged 1 commit into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions locustfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from pinecone import Pinecone
from pinecone.grpc import PineconeGRPC
import tempfile
import tabulate
from tqdm import tqdm, trange
import sys

Expand Down Expand Up @@ -50,9 +51,10 @@ def _(parser):
"'rest': Pinecone REST API (via a normal HTTP client). "
"'sdk': Pinecone Python SDK ('pinecone-client'). "
"'sdk+grpc': Pinecone Python SDK using gRPC as the underlying transport.")
pc_options.add_argument("--pinecone-dataset", type=str, metavar="<dataset_name> | 'help'", default=None,
pc_options.add_argument("--pinecone-dataset", type=str, metavar="<dataset_name> | 'list' | 'list-details'", default=None,
help="The dataset to use for index population and/or query generation. "
"Pass the value 'help' to list available datasets.")
"Pass the value 'list' to list available datasets, or pass 'list-details' to"
" list full details of available datasets.")
pc_options.add_argument("--pinecone-populate-index", choices=["always", "never", "if-count-mismatch"],
default="if-count-mismatch",
help="Should the index be populated with the dataset before issuing requests. Choices: "
Expand Down Expand Up @@ -96,15 +98,22 @@ def setup_dataset(environment: Environment, skip_download_and_populate: bool = F
if not dataset_name:
environment.dataset = Dataset()
return
if dataset_name == "help":
if dataset_name in ("list", "list-details"):
# Print out the list of available datasets, then exit.
print("Fetching list of available datasets for --pinecone-dataset...")
available = Dataset.list()
# Copy the 'dimensions' model field from 'dense_model' into the top level
for a in available:
a['dimension'] = a['dense_model']['dimension']
df = pandas.DataFrame(available, columns=['name', 'documents', 'queries', 'dimension'])
print(df.to_markdown(index=False, headers=["Name", "Documents", "Queries", "Dimension"], tablefmt="simple"))
if dataset_name == "list":
# Discard the more detailed fields before printing, however
# copy the 'dimensions' model field from 'dense_model' into the top
# level as that's a key piece of information (needed to create an
# index).
brief = []
for a in available:
a['dimension'] = a['dense_model']['dimension']
summary = {key.capitalize(): a[key] for key in ['name', 'documents', 'queries', 'dimension']}
brief.append(summary)
available = brief
print(tabulate.tabulate(available, headers="keys", tablefmt="simple"))
print()
sys.exit(1)

Expand Down
16 changes: 15 additions & 1 deletion tests/integration/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_datasets_list(self):
(proc, stdout, stderr) = spawn_locust(host="unused",
mode="rest",
timeout=20,
extra_args=["--pinecone-dataset=help"])
extra_args=["--pinecone-dataset=list"])
# Check that stdout contains a list of datasets (don't want to hardcode
# complete set as that just makes the test brittle if any new datasets are added).
for dataset in ["ANN_MNIST_d784_euclidean 60000 10000 784",
Expand All @@ -87,6 +87,20 @@ def test_datasets_list(self):
assert dataset in stdout
assert proc.returncode == 1

def test_datasets_list_details(self):
    # Fetching the dataset list can take longer than the default 4s
    # timeout, so give the subprocess extra headroom.
    (proc, stdout, stderr) = spawn_locust(
        host="unused",
        mode="rest",
        timeout=20,
        extra_args=["--pinecone-dataset=list-details"],
    )
    # Spot-check a handful of detail fields instead of the complete
    # listing — hardcoding the full set would make the test brittle
    # whenever a new dataset is published.
    expected_details = (
        "gs://pinecone-datasets-dev/ANN_DEEP1B_d96_angular",
        "https://github.com/erikbern/ann-benchmarks",
        "sentence-transformers/all-MiniLM-L6-v2",
    )
    for detail in expected_details:
        assert detail in stdout
    assert proc.returncode == 1

def test_dataset_load(self, index_host):
# Choosing a small dataset ("only" 60,000 documents) which also
# has a non-zero queries set.
Expand Down