Dataset.load(): Additional logging
Add additional logging to indicate how the query vectors are
generated.
daverigby committed Feb 29, 2024
1 parent 8cc2a5a commit 162e530
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions dataset.py
@@ -61,8 +61,10 @@ def load(self, skip_download: bool = False, load_queries: bool = True,
         # for querying, otherwise use documents directly.
         if load_queries:
             self.queries = self._load_parquet_dataset("queries")
-        if self.queries.empty:
-            logging.debug("Using complete documents dataset for query data")
+        if not self.queries.empty:
+            logging.info(
+                f"Using {len(self.queries)} query vectors loaded from dataset 'queries' table")
+        else:
             # Queries expect a different schema than documents.
             # Documents looks like:
             #     ["id", "values", "sparse_values", "metadata"]
@@ -78,6 +80,9 @@ def load(self, skip_download: bool = False, load_queries: bool = True,
             # keeping a large complete dataset in memory for each
             # worker process).
             self.queries = self.queries.sample(frac=doc_sample_fraction, random_state=1)
+            logging.info(
+                f"Using {doc_sample_fraction * 100}% of documents' dataset "
+                f"for query data ({len(self.queries)} sampled)")
 
     def upsert_into_index(self, index_host, api_key, skip_if_count_identical: bool = False):
         """
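For illustration, a minimal standalone sketch of the sampling-and-logging pattern added in the second hunk is shown below. The DataFrame contents, the doc_sample_fraction value, and the logging configuration are assumptions made for this example, not code from the repository.

import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)

# Hypothetical stand-in for the documents table; the real Dataset class
# loads this from parquet files via _load_parquet_dataset().
documents = pd.DataFrame({"id": range(1000), "values": [[0.1, 0.2]] * 1000})

doc_sample_fraction = 0.5  # illustrative value only

# Sample a reproducible subset of documents to act as query vectors,
# mirroring the sampling call and new INFO log message in the diff above.
queries = documents.sample(frac=doc_sample_fraction, random_state=1)
logging.info(
    f"Using {doc_sample_fraction * 100}% of documents' dataset "
    f"for query data ({len(queries)} sampled)")

When a dataset does provide a non-empty 'queries' table, the other new INFO message reports how many query vectors were loaded from it instead.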
