Merge pull request #44 from pinecone-io/dataset_limit
Add --pinecone-dataset-limit option
daverigby authored Feb 29, 2024
2 parents 60a4171 + cd697e3 commit 5b2e646
Showing 4 changed files with 36 additions and 4 deletions.
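
For context: the new flag caps how many document vectors are loaded from the chosen dataset, with 0 (the default) loading everything. A hypothetical invocation, assuming the standard Locust CLI and the dataset named later in this commit, might look like:

locust -f locustfile.py --pinecone-dataset ANN_MNIST_d784_euclidean --pinecone-dataset-limit 1000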
9 changes: 6 additions & 3 deletions dataset.py
@@ -46,16 +46,17 @@ def list():
return datasets

def load(self, skip_download: bool = False, load_queries: bool = True,
limit: int = 0,
doc_sample_fraction: float = 1.0):
"""
Load the dataset, populating the 'documents' and 'queries' DataFrames.
"""
if not skip_download:
self._download_dataset_files()

# Load all the parquet dataset (made up of one or more parquet files),
# Load the parquet dataset (made up of one or more parquet files),
# to use for documents into a pandas dataframe.
self.documents = self._load_parquet_dataset("documents")
self.documents = self._load_parquet_dataset("documents", limit=limit)

# If there is an explicit 'queries' dataset, then load that and use
# for querying, otherwise use documents directly.
@@ -151,7 +152,7 @@ def should_download(blob):
blob.download_to_filename(self.cache / blob.name)
pbar.update(blob.size)

def _load_parquet_dataset(self, kind):
def _load_parquet_dataset(self, kind, limit=0):
parquet_files = [f for f in (self.cache / self.name).glob(kind + '/*.parquet')]
if not len(parquet_files):
return pandas.DataFrame
@@ -172,6 +173,8 @@ def _load_parquet_dataset(self, kind):
# and hence significantly reduces memory usage when we later prune away the underlying
# pyarrow data (see prune_documents).
df = dataset.read(columns=columns).to_pandas(types_mapper=pandas.ArrowDtype)
if limit:
df = df.iloc[:limit]

# And drop any columns which all values are missing - e.g. not all
# datasets have sparse_values, but the parquet file may still have
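
A minimal standalone sketch (illustrative only, assuming nothing beyond pandas being installed) of the truncation semantics introduced above: a limit of 0 is falsy and leaves the DataFrame untouched, while a positive limit keeps just the first N rows via iloc slicing.

import pandas

def apply_limit(df: pandas.DataFrame, limit: int = 0) -> pandas.DataFrame:
    # Mirrors the guard added to _load_parquet_dataset(): 0 means "no limit".
    return df.iloc[:limit] if limit else df

frame = pandas.DataFrame({"id": range(10)})
assert len(apply_limit(frame)) == 10     # default: full dataset retained
assert len(apply_limit(frame, 3)) == 3   # only the first 3 rows kept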
4 changes: 4 additions & 0 deletions locustfile.py
@@ -71,6 +71,8 @@ def _(parser):
" list full details of available datasets.")
pc_options.add_argument("--pinecone-dataset-ignore-queries", action=argparse.BooleanOptionalAction,
help="Ignore and do not load the 'queries' table from the specified dataset.")
pc_options.add_argument("--pinecone-dataset-limit", type=int, default=0,
help="If non-zero, limit the dataset to the first N vectors.")
pc_options.add_argument("--pinecone-dataset-docs-sample-for-query", type=float, default=0.01,
metavar="<fraction> (0.0 - 1.0)",
help="Specify the fraction of docs which should be sampled when the documents vectorset "
@@ -141,8 +143,10 @@ def setup_dataset(environment: Environment, skip_download_and_populate: bool = F
environment.dataset = Dataset(dataset_name, environment.parsed_options.pinecone_dataset_cache)
ignore_queries = environment.parsed_options.pinecone_dataset_ignore_queries
sample_ratio = environment.parsed_options.pinecone_dataset_docs_sample_for_query
limit = environment.parsed_options.pinecone_dataset_limit
environment.dataset.load(skip_download=skip_download_and_populate,
load_queries=not ignore_queries,
limit=limit,
doc_sample_fraction=sample_ratio)
populate = environment.parsed_options.pinecone_populate_index
if not skip_download_and_populate and populate != "never":
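
A hedged sketch of how the new option parses; the standalone argparse parser below is illustrative only, whereas in the commit the argument is registered on Locust's own parser inside the hook above.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--pinecone-dataset-limit", type=int, default=0,
                    help="If non-zero, limit the dataset to the first N vectors.")

assert parser.parse_args([]).pinecone_dataset_limit == 0   # default: no limit
assert parser.parse_args(["--pinecone-dataset-limit", "123"]).pinecone_dataset_limit == 123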
5 changes: 4 additions & 1 deletion tests/integration/test_requests.py
@@ -104,13 +104,16 @@ def test_datasets_list_details(self):
def test_dataset_load(self, index_host):
# Choosing a small dataset ("only" 60,000 documents) which also
# has a non-zero queries set.
# We also test the --pinecone-dataset-limit option here (which has the
# bonus effect of speeding up the test - note that complete
# dataset loading is tested in test_dataset_load_multiprocess).
test_dataset = "ANN_MNIST_d784_euclidean"
self.do_request(index_host, "sdk", 'query', 'Vector (Query only)',
timeout=60,
extra_args=["--pinecone-dataset", test_dataset,
"--pinecone-dataset-limit", "123",
"--pinecone-populate-index", "always"])


def test_dataset_load_multiprocess(self, index_host):
# Choosing a small dataset ("only" 60,000 documents) which also
# has a non-zero queries set.
22 changes: 22 additions & 0 deletions tests/unit/test_dataset.py
@@ -0,0 +1,22 @@
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from dataset import Dataset
import pytest


class TestDataset:

def test_limit(self):
limit = 123
name = "langchain-python-docs-text-embedding-ada-002"
dataset = Dataset(name)
# Sanity check that the complete dataset size is greater than what
# we are going to limit to.
dataset_info = ([d for d in dataset.list() if d["name"] == name][0])
assert dataset_info["documents"] > limit, \
"Too few documents in dataset to be able to limit"

dataset.load(limit=limit, load_queries=False)
assert len(dataset.documents) == limit
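
The new unit test can be run on its own with something like pytest tests/unit/test_dataset.py -k test_limit; since it calls Dataset.load() without skip_download, it will fetch the dataset's parquet files over the network (subject to the local cache) on first run.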
