From 4219c4298c113170856a989b3c4ea8193a3ba372 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Mon, 29 Jan 2024 12:01:47 -0500 Subject: [PATCH] Improve clusters and several bug fixes (#1141) https://huggingface.co/spaces/lilacai/daniel_staging Clustering (backend): - Lower `min_cluster_size` to 5 for the categories, so we can have more coherent categories - Add timeout of 7 sec (99%-tile response latency for OpenAI is like 3-4 sec) to avoid the 10min timeout. We can now title clusters of 1M docs (11k clusters) in 8mins. - Disable internal OpenAI retries (we used to have double retries) - Replace "request" with "snippet" in the prompt to avoid biasing towards user's requests -- improves forum/email/text clustering UI - make the histograms reactive to the currently selected group in "group by" - make pivot reactive to searches (e.g. keyword search, metadata search) - remember the schema and nav bar state when flipping between cluster view and item view - Fix a bug with search box state, after page refresh --- lilac/__init__.py | 3 - lilac/data/clustering.py | 141 +++++++++++------- lilac/data/clustering_test.py | 7 + lilac/data/dataset.py | 5 + lilac/data/dataset_duckdb.py | 46 ++++-- lilac/formats/__init__.py | 3 +- lilac/router_dataset.py | 4 + .../lib/components/datasetView/Dataset.svelte | 8 +- .../datasetView/DatasetPivotResult.svelte | 2 +- .../datasetView/DatasetPivotViewer.svelte | 14 +- .../datasetView/GroupByPanel.svelte | 19 ++- .../datasetView/PrefetchRowItem.svelte | 20 +-- .../components/schemaView/FieldDetails.svelte | 13 +- .../src/lib/queries/datasetQueries.ts | 8 +- web/lib/fastapi_client/models/PivotOptions.ts | 5 + .../models/SelectGroupsOptions.ts | 5 + 16 files changed, 198 insertions(+), 105 deletions(-) diff --git a/lilac/__init__.py b/lilac/__init__.py index 776126d86..7ffc96db1 100644 --- a/lilac/__init__.py +++ b/lilac/__init__.py @@ -18,7 +18,6 @@ from .env import * # noqa: F403 from .env import LilacEnvironment, get_project_dir, set_project_dir from .formats import * # noqa: F403 -from .formats import OpenChat, ShareGPT from .load import load from .load_dataset import create_dataset, from_dicts, from_huggingface from .project import init @@ -73,8 +72,6 @@ 'deploy_project', 'deploy_config', 'SpanVector', - 'ShareGPT', - 'OpenChat', 'download', 'upload', 'register_embedding', diff --git a/lilac/data/clustering.py b/lilac/data/clustering.py index 7b57a73dd..a12b9011f 100644 --- a/lilac/data/clustering.py +++ b/lilac/data/clustering.py @@ -44,8 +44,13 @@ _SHORTEN_LEN = 400 _TOP_K_CENTRAL_DOCS = 7 -_TOP_K_CENTRAL_TITLES = 15 +_TOP_K_CENTRAL_TITLES = 20 _NUM_THREADS = 32 +_NUM_RETRIES = 16 +# OpenAI rate limits you on `max_tokens` so we ideally want to guess the right value. If ChatGPT +# fails to generate a title within the `max_tokens` limit, we will retry with a higher value. +_INITIAL_MAX_TOKENS = 50 +_FINAL_MAX_TOKENS = 200 CLUSTER_ID = 'cluster_id' CLUSTER_MEMBERSHIP_PROB = 'cluster_membership_prob' @@ -57,7 +62,8 @@ FIELD_SUFFIX = 'cluster' -MIN_CLUSTER_SIZE = 5 +MIN_CLUSTER_SIZE = 10 +MIN_CLUSTER_SIZE_CATEGORY = 5 UMAP_DIM = 5 UMAP_SEED = 42 HDBSCAN_SELECTION_EPS = 0.05 @@ -76,7 +82,10 @@ def _openai_client() -> Any: 'Please install it with `pip install openai`.' ) - return instructor.patch(openai.OpenAI()) + # OpenAI requests sometimes hang, without any errors, and the default connection timeout is 10 + # mins, which is too long. Set it to 7 seconds (99%-tile for latency is 3-4 sec). 
Also set + # `max_retries` to 0 to disable internal retries so we handle retries ourselves. + return instructor.patch(openai.OpenAI(timeout=7, max_retries=0)) def _snippet_to_prefix_and_suffix(text: str) -> str: @@ -88,7 +97,7 @@ def _snippet_to_prefix_and_suffix(text: str) -> str: class Title(BaseModel): - """A 4-5 word title for the group of related requests.""" + """A 4-5 word title for the group of related snippets.""" title: str @@ -97,7 +106,7 @@ def summarize_request(ranked_docs: list[tuple[str, float]]) -> str: """Summarize a group of requests in a title of at most 5 words.""" # Get the top 5 documents. docs = [doc for doc, _ in ranked_docs[:_TOP_K_CENTRAL_DOCS]] - texts = [f'BEGIN_REQUEST\n{_snippet_to_prefix_and_suffix(doc)}\nEND_REQUEST' for doc in docs] + texts = [f'BEGIN_SNIPPET\n{_snippet_to_prefix_and_suffix(doc)}\nEND_SNIPPET' for doc in docs] input = '\n'.join(texts) try: import openai @@ -109,13 +118,21 @@ def summarize_request(ranked_docs: list[tuple[str, float]]) -> str: ) @retry( - retry=retry_if_exception_type((openai.RateLimitError, openai.APITimeoutError)), + retry=retry_if_exception_type( + ( + openai.RateLimitError, + openai.APITimeoutError, + openai.APIConnectionError, + openai.ConflictError, + openai.InternalServerError, + ) + ), wait=wait_random_exponential(multiplier=0.5, max=60), - stop=stop_after_attempt(10), + stop=stop_after_attempt(_NUM_RETRIES), ) def request_with_retries() -> str: - max_tokens = 50 - while max_tokens <= 200: + max_tokens = _INITIAL_MAX_TOKENS + while max_tokens <= _FINAL_MAX_TOKENS: try: title = _openai_client().chat.completions.create( model='gpt-3.5-turbo-1106', @@ -126,11 +143,13 @@ def request_with_retries() -> str: { 'role': 'system', 'content': ( - 'You are a world-class title generator. Ignore the group of related requests ' - 'below, and generate a short title to describe the common theme. Some examples: ' + 'You are a world-class short title generator. Ignore the related snippets below ' + 'and generate a short title to describe their common theme. Some examples: ' '"YA book reviews", "Questions about South East Asia", "Translating English to ' - 'Polish", "Writing product descriptions", etc. Prefer using descriptive words. Do ' - 'not use vague words like "various", "assortment", "comments", "discussion", etc.' + 'Polish", "Writing product descriptions", etc. Use descriptive words. If the ' + "snippet's language is different than English, mention it in the title, e.g. " + '"Cooking questions in Spanish". Avoid vague words like "various", "assortment", ' + '"comments", "discussion", etc.' ), }, {'role': 'user', 'content': input}, @@ -138,10 +157,11 @@ def request_with_retries() -> str: ) return title.title except IncompleteOutputException: - max_tokens = max_tokens * 2 + max_tokens += _INITIAL_MAX_TOKENS log(f'Retrying with max_tokens={max_tokens}') log(f'Could not generate a short title for input:\n{input}') - return 'FAILED_TO_GENERATE' + # We return a string instead of None, since None is emitted when the text column is sparse. 
+ return 'FAILED_TO_TITLE' return request_with_retries() @@ -167,29 +187,45 @@ def generate_category(ranked_docs: list[tuple[str, float]]) -> str: ) @retry( - retry=retry_if_exception_type((openai.RateLimitError, openai.APITimeoutError)), + retry=retry_if_exception_type( + ( + openai.RateLimitError, + openai.APITimeoutError, + openai.APIConnectionError, + openai.ConflictError, + openai.InternalServerError, + ) + ), wait=wait_random_exponential(multiplier=0.5, max=60), - stop=stop_after_attempt(10), + stop=stop_after_attempt(_NUM_RETRIES), ) def request_with_retries() -> str: - category = _openai_client().chat.completions.create( - model='gpt-3.5-turbo-1106', - response_model=Category, - temperature=0.0, - max_tokens=50, - messages=[ - { - 'role': 'system', - 'content': ( - 'You are a world-class category labeler. Generate a short category name for the ' - 'provided titles. For example, given two titles "translating english to polish" and ' - '"translating korean to english", generate "Translation".' - ), - }, - {'role': 'user', 'content': input}, - ], - ) - return category.category + max_tokens = _INITIAL_MAX_TOKENS + while max_tokens <= _FINAL_MAX_TOKENS: + try: + category = _openai_client().chat.completions.create( + model='gpt-3.5-turbo-1106', + response_model=Category, + temperature=0.0, + max_tokens=max_tokens, + messages=[ + { + 'role': 'system', + 'content': ( + 'You are a world-class category labeler. Generate a short category name for the ' + 'provided titles. For example, given two titles "translating english to polish" ' + 'and "translating korean to english", generate "Translation".' + ), + }, + {'role': 'user', 'content': input}, + ], + ) + return category.category + except IncompleteOutputException: + max_tokens += _INITIAL_MAX_TOKENS + log(f'Retrying with max_tokens={max_tokens}') + log(f'Could not generate a short category for input:\n{input}') + return 'FAILED_TO_GENERATE' return request_with_retries() @@ -257,7 +293,7 @@ def cluster_impl( dataset: Dataset, input_fn_or_path: Union[Path, Callable[[Item], str], DatasetFormatInputSelector], output_path: Optional[Path] = None, - min_cluster_size: int = 5, + min_cluster_size: int = MIN_CLUSTER_SIZE, topic_fn: TopicFn = summarize_request, overwrite: bool = False, use_garden: bool = False, @@ -347,11 +383,11 @@ def extract_text(item: Item) -> Item: cluster_ids_exists = schema.has_field((*cluster_output_path, CLUSTER_ID)) if not cluster_ids_exists or overwrite: if task_info: - task_info.message = 'Computing clusters' + task_info.message = 'Clustering documents' task_info.total_progress = 0 task_info.total_len = None - def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: + def cluster_documents(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) docs: Iterator[Optional[str]] = (item.get(TEXT_COLUMN) for item in items) cluster_items = sparse_to_dense_compute( @@ -365,7 +401,7 @@ def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: # Compute the clusters. 
dataset.transform( - compute_clusters, + cluster_documents, input_path=cluster_output_path, output_path=cluster_output_path, overwrite=True, @@ -374,11 +410,11 @@ def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: cluster_titles_exist = schema.has_field((*cluster_output_path, CLUSTER_TITLE)) if not cluster_titles_exist or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing cluster titles' + task_info.message = 'Titling clusters' task_info.total_progress = 0 task_info.total_len = total_len - def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: + def title_clusters(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) titles = _compute_titles( items, @@ -392,7 +428,7 @@ def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: yield {**item, CLUSTER_TITLE: title} dataset.transform( - compute_cluster_titles, + title_clusters, input_path=cluster_output_path, output_path=cluster_output_path, sort_by=(*cluster_output_path, CLUSTER_ID), @@ -402,15 +438,15 @@ def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: category_id_exists = schema.has_field((*cluster_output_path, CATEGORY_ID)) if not category_id_exists or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing super clusters' + task_info.message = 'Clustering titles' task_info.total_progress = 0 task_info.total_len = None - def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: + def cluster_titles(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) docs = (item.get(CLUSTER_TITLE) for item in items) cluster_items = sparse_to_dense_compute( - docs, lambda x: _hdbscan_cluster(x, min_cluster_size, use_garden) + docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden) ) for item, cluster_item in zip(items2, cluster_items): item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1) @@ -419,7 +455,7 @@ def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: # Compute the clusters. dataset.transform( - compute_category_clusters, + cluster_titles, input_path=cluster_output_path, output_path=cluster_output_path, overwrite=True, @@ -429,11 +465,11 @@ def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: category_title_exists = schema.has_field(category_title_path) if not category_title_exists or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing category titles' + task_info.message = 'Titling categories' task_info.total_progress = 0 task_info.total_len = total_len - def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]: + def title_categories(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) titles = _compute_titles( items, @@ -445,11 +481,12 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]: ) for item, title in zip(items2, titles): # Drop the temporary newline-concatenated text column. 
- del item[TEXT_COLUMN] + if TEXT_COLUMN in item: + del item[TEXT_COLUMN] yield {**item, CATEGORY_TITLE: title} dataset.transform( - compute_category_titles, + title_categories, input_path=cluster_output_path, output_path=cluster_output_path, sort_by=(*cluster_output_path, CATEGORY_ID), @@ -494,7 +531,7 @@ def drop_temp_text_column(items: Iterator[Item]) -> Iterator[Item]: def _hdbscan_cluster( docs: Iterator[str], - min_cluster_size: int = MIN_CLUSTER_SIZE, + min_cluster_size: int, use_garden: bool = False, num_docs: Optional[int] = None, task_info: Optional[TaskInfo] = None, @@ -504,7 +541,7 @@ def _hdbscan_cluster( remote_fn = modal.Function.lookup('cluster', 'Cluster.cluster').remote with DebugTimer('Compressing docs for clustering remotely'): gzipped_docs = compress_docs(list(docs)) - response = remote_fn({'gzipped_docs': gzipped_docs}) + response = remote_fn({'gzipped_docs': gzipped_docs, 'min_cluster_size': min_cluster_size}) yield from response['clusters'] if task_info: diff --git a/lilac/data/clustering_test.py b/lilac/data/clustering_test.py index 6bc970220..9ec6f5178 100644 --- a/lilac/data/clustering_test.py +++ b/lilac/data/clustering_test.py @@ -87,6 +87,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'simplification' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') _mock_jina(mocker) @@ -236,6 +237,7 @@ def test_nested_clusters(make_test_data: TestDataMaker, mocker: MockerFixture) - {'text': 'Give me simplified version of this text'}, ], ] + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') dataset = make_test_data([{'texts': t} for t in texts]) @@ -307,6 +309,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'b_cluster' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster('texts.*', min_cluster_size=2, topic_fn=topic_fn) rows = list(dataset.select_rows(combine_columns=True)) @@ -356,6 +359,7 @@ def test_clusters_with_fn(make_test_data: TestDataMaker, mocker: MockerFixture) ] dataset = make_test_data([{'texts': t} for t in texts]) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) def topic_fn(docs: list[tuple[str, float]]) -> str: if 'summar' in docs[0][0]: @@ -440,6 +444,7 @@ def test_clusters_with_fn_output_is_under_a_dict( ] mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') dataset = make_test_data([{'texts': t, 'info': {'dummy': True}} for t in texts]) + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) def topic_fn(docs: list[tuple[str, float]]) -> str: if 'summar' in docs[0][0]: @@ -557,6 +562,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'time' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster( ShareGPT.human, @@ -655,6 +661,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: signal = TestSignal() dataset.compute_signal(signal, 'text') + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster('text', min_cluster_size=2, topic_fn=topic_fn) diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index fef6e4251..7aad43e81 100644 --- a/lilac/data/dataset.py +++ b/lilac/data/dataset.py @@ -609,6 
+609,8 @@ def select_groups( sort_order: Optional[SortOrder] = SortOrder.DESC, limit: Optional[int] = None, bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None, + include_deleted: bool = False, + searches: Optional[Sequence[Search]] = None, ) -> SelectGroupsResult: """Select grouped columns to power a histogram. @@ -620,6 +622,8 @@ def select_groups( sort_order: The sort order. limit: The maximum number of rows to return. bins: The bins to use when bucketizing a float column. + include_deleted: Whether to include deleted rows in the query. + searches: The searches to apply to the query. Returns: A `SelectGroupsResult` iterator where each row is a group. @@ -631,6 +635,7 @@ def pivot( self, outer_path: Path, inner_path: Path, + searches: Optional[Sequence[Search]] = None, filters: Optional[Sequence[FilterLike]] = None, sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT, sort_order: Optional[SortOrder] = SortOrder.DESC, diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index 9536c7cef..335c556e3 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -1739,6 +1739,7 @@ def select_groups( limit: Optional[int] = None, bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None, include_deleted: bool = False, + searches: Optional[Sequence[Search]] = None, ) -> SelectGroupsResult: if not leaf_path: raise ValueError('leaf_path must be provided') @@ -1815,6 +1816,9 @@ def select_groups( filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest) if not include_deleted and manifest.data_schema.has_field((DELETED_LABEL_NAME,)): filters.append(Filter(path=(DELETED_LABEL_NAME,), op='not_exists')) + + filters = self._add_searches_to_filters(searches or [], filters) + filter_queries = self._create_where(manifest, filters) where_query = '' @@ -1841,6 +1845,7 @@ def pivot( self, outer_path: Path, inner_path: Path, + searches: Optional[Sequence[Search]] = None, filters: Optional[Sequence[FilterLike]] = None, sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT, sort_order: Optional[SortOrder] = SortOrder.DESC, @@ -1853,7 +1858,7 @@ def pivot( outer_path = normalize_path(outer_path) pivot_key = (outer_path, inner_path, sort_by, sort_order) - use_cache = not filters + use_cache = not filters and not searches if use_cache and pivot_key in self._pivot_cache: return self._pivot_cache[pivot_key] @@ -1914,6 +1919,10 @@ def pivot( span_from=self._resolve_span(outer_path, manifest), ) filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest) + + # Add search where queries. + filters = self._add_searches_to_filters(searches or [], filters) + where_query = self._compile_select_options( DuckDBQueryParams(filters=filters, include_deleted=False) ) @@ -2123,20 +2132,7 @@ def select_rows( # Filtering and searching. where_query = '' filters, udf_filters = self._normalize_filters(filters, col_aliases, udf_aliases, manifest) - # Add search where queries. - for search in searches: - search_path = normalize_path(search.path) - if search.type == 'keyword': - filters.append(Filter(path=search_path, op='ilike', value=search.query)) - elif search.type == 'semantic' or search.type == 'concept': - # Semantic search and concepts don't yet filter. - continue - elif search.type == 'metadata': - # Make a regular filter query. 
- filter = Filter(path=search_path, op=search.op, value=search.value) - filters.append(filter) - else: - raise ValueError(f'Unknown search operator {search.type}.') + filters = self._add_searches_to_filters(searches, filters) if not include_deleted and manifest.data_schema.has_field((DELETED_LABEL_NAME,)): filters.append(Filter(path=(DELETED_LABEL_NAME,), op='not_exists')) @@ -3090,7 +3086,7 @@ def map( raise ValueError( f'{output_path} is not a map/cluster column and cannot be overwritten.' ) - else: + elif input_path != output_path: raise ValueError( f'Cannot map to path "{output_path}" which already exists in the dataset. ' 'Use overwrite=True to overwrite the column.' @@ -3460,6 +3456,24 @@ def _select_sql( selection = f'unnest({selection})' return selection + def _add_searches_to_filters( + self, searches: Sequence[Search], filters: list[Filter] + ) -> list[Filter]: + for search in searches: + search_path = normalize_path(search.path) + if search.type == 'keyword': + filters.append(Filter(path=search_path, op='ilike', value=search.query)) + elif search.type == 'semantic' or search.type == 'concept': + # Semantic search and concepts don't yet filter. + continue + elif search.type == 'metadata': + # Make a regular filter query. + filter = Filter(path=search_path, op=search.op, value=search.value) + filters.append(filter) + else: + raise ValueError(f'Unknown search operator {search.type}.') + return filters + def _escape_like_value(value: str) -> str: value = value.replace('%', '\\%').replace('_', '\\_') diff --git a/lilac/formats/__init__.py b/lilac/formats/__init__.py index c9e41254e..6232a7369 100644 --- a/lilac/formats/__init__.py +++ b/lilac/formats/__init__.py @@ -2,9 +2,10 @@ from .default_formats import register_default_formats +from .openai_json import OpenAIConversationJSON, OpenAIJSON from .openchat import OpenChat from .sharegpt import ShareGPT register_default_formats() -__all__ = ['ShareGPT', 'OpenChat'] +__all__ = ['ShareGPT', 'OpenChat', 'OpenAIJSON', 'OpenAIConversationJSON'] diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py index 8509a4274..ffa789736 100644 --- a/lilac/router_dataset.py +++ b/lilac/router_dataset.py @@ -233,6 +233,7 @@ class SelectGroupsOptions(BaseModel): leaf_path: Path filters: Sequence[Filter] = [] + searches: Sequence[SearchPy] = [] sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT sort_order: Optional[SortOrder] = SortOrder.DESC limit: Optional[int] = 100 @@ -255,6 +256,7 @@ def select_groups( options.sort_order, options.limit, options.bins, + searches=options.searches, ) @@ -264,6 +266,7 @@ class PivotOptions(BaseModel): inner_path: Path outer_path: Path filters: Sequence[Filter] = [] + searches: Sequence[SearchPy] = [] @router.post('/{namespace}/{dataset_name}/pivot') @@ -277,6 +280,7 @@ def pivot(namespace: str, dataset_name: str, options: PivotOptions) -> PivotResu options.outer_path, options.inner_path, filters=sanitized_filters, + searches=options.searches, ) diff --git a/web/blueprint/src/lib/components/datasetView/Dataset.svelte b/web/blueprint/src/lib/components/datasetView/Dataset.svelte index 5dce749b9..faa996e01 100644 --- a/web/blueprint/src/lib/components/datasetView/Dataset.svelte +++ b/web/blueprint/src/lib/components/datasetView/Dataset.svelte @@ -51,6 +51,12 @@ let showCopyToast = false; $: link = datasetLink(namespace, datasetName, $navState); + $: backToItemsLink = datasetLink(namespace, datasetName, $navState, { + ...$datasetViewStore, + pivot: undefined, + viewPivot: false, + groupBy: undefined + }); // 
Determine whether the dataset has clusters. $: clusterFields = childFields($schema.data).filter(f => isClusterRootField(f)); @@ -69,7 +75,7 @@ pivot: {outerPath: clusterOuterPath, innerPath: clusterInnerPath} }) : null; - $: clusterToggleLink = $datasetViewStore.viewPivot ? link : clusterLink; + $: clusterToggleLink = $datasetViewStore.viewPivot ? backToItemsLink : clusterLink; diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte index 7cd98056a..5413a69ae 100644 --- a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte +++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte @@ -181,7 +181,7 @@ width: 100%; overflow: hidden; display: -webkit-box; - -webkit-line-clamp: 3; + -webkit-line-clamp: 2; -webkit-box-orient: vertical; } .card-title { diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte index d538f4ed5..da81a47de 100644 --- a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte +++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte @@ -76,7 +76,8 @@ $: pivotQuery = pivotQueryFn($store.namespace, $store.datasetName, { outer_path: outerLeafPath!, inner_path: innerLeafPath!, - filters: selectOptions.filters + filters: selectOptions.filters, + searches: selectOptions.searches }); // The search text after a user presses the search button or enter. @@ -159,10 +160,7 @@ } // The bound input text from the search box. - let inputSearchText: string | undefined = undefined; - function searchInput(e: Event) { - inputSearchText = (e.target as HTMLInputElement)?.value; - } + $: inputSearchText = searchText; function search() { $store.pivot = {...$store.pivot, searchText: inputSearchText}; @@ -190,17 +188,17 @@
[Template markup for this hunk (the pivot viewer's search box and "Explore" button) and the diff header for web/blueprint/src/lib/components/datasetView/GroupByPanel.svelte were lost in extraction; the GroupByPanel.svelte script changes follow.]
import {querySelectGroups} from '$lib/queries/datasetQueries'; - import {getDatasetViewContext, type GroupByState} from '$lib/stores/datasetViewStore'; + import { + getDatasetViewContext, + getSelectRowsOptions, + type GroupByState + } from '$lib/stores/datasetViewStore'; import {shortFieldName} from '$lib/view_utils'; import { formatValue, @@ -16,18 +20,18 @@ export let groupBy: GroupByState; export let schema: LilacSchema; - let store = getDatasetViewContext(); + const store = getDatasetViewContext(); $: value = groupBy.value; $: field = getField(schema, groupBy.path)!; $: sortBy = isNumeric(field.dtype) && !field.categorical ? 'value' : 'count'; $: sortOrder = sortBy === 'value' ? 'ASC' : 'DESC'; - + $: selectOptions = getSelectRowsOptions($store, schema); $: groupsQuery = querySelectGroups($store.namespace, $store.datasetName, { leaf_path: groupBy.path, sort_by: sortBy as GroupsSortBy, sort_order: sortOrder as SortOrder, - filters: $store.query.filters, + filters: selectOptions.filters, // Explicitly set the limit to null to get all the groups, not just the top 100. limit: null }); @@ -46,8 +50,11 @@ if (value == null || allCounts == null || valueIndex == null) { return; } - const newValue = - direction === 'next' ? allCounts[valueIndex + 1][0] : allCounts[valueIndex - 1][0]; + const newIndex = direction === 'next' ? valueIndex + 1 : valueIndex - 1; + if (newIndex < 0 || newIndex >= allCounts.length) { + return; + } + const newValue = direction === 'next' ? allCounts[newIndex][0] : allCounts[newIndex][0]; store.setGroupBy(groupBy.path, newValue); } function onKeyDown(key: KeyboardEvent) { diff --git a/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte b/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte index a562c6332..6483310f5 100644 --- a/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte +++ b/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte @@ -23,19 +23,19 @@ getSelectRowsSchemaOptions($datasetViewStore, $schema.data) ); $: selectOptions = getSelectRowsOptions($datasetViewStore, $selectRowsSchema.data?.schema); - $: rowQuery = + $: readyToQueryRow = !$selectRowsSchema.isFetching && $selectRowsSchema?.data?.schema != null && selectOptions != null && - rowId != null - ? queryRowMetadata( - namespace, - datasetName, - rowId, - selectOptions, - $selectRowsSchema.data.schema - ) - : null; + rowId != null; + $: rowQuery = queryRowMetadata( + namespace, + datasetName, + rowId, + selectOptions, + $selectRowsSchema.data?.schema, + readyToQueryRow + ); {#if $rowQuery?.data != null} diff --git a/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte b/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte index 7d93412c5..08df69810 100644 --- a/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte +++ b/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte @@ -1,6 +1,10 @@
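For reference, a minimal standalone sketch of the titling retry strategy introduced in clustering.py: tenacity retries on transient OpenAI errors, while max_tokens starts small and grows by _INITIAL_MAX_TOKENS whenever instructor reports an incomplete structured output. This is a sketch, not the patched code itself: the IncompleteOutputException import path may differ across instructor versions, and the helper names (_client, title_snippets) and the shortened system prompt are illustrative.

from typing import Any

import instructor
import openai
from instructor.exceptions import IncompleteOutputException  # import path may differ by version
from pydantic import BaseModel
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

_INITIAL_MAX_TOKENS = 50
_FINAL_MAX_TOKENS = 200
_NUM_RETRIES = 16


class Title(BaseModel):
  """A 4-5 word title for the group of related snippets."""

  title: str


def _client() -> Any:
  # Short timeout (OpenAI requests occasionally hang) and no internal retries, so tenacity
  # below is the single place where retries happen.
  return instructor.patch(openai.OpenAI(timeout=7, max_retries=0))


@retry(
  retry=retry_if_exception_type(
    (
      openai.RateLimitError,
      openai.APITimeoutError,
      openai.APIConnectionError,
      openai.ConflictError,
      openai.InternalServerError,
    )
  ),
  wait=wait_random_exponential(multiplier=0.5, max=60),
  stop=stop_after_attempt(_NUM_RETRIES),
)
def title_snippets(snippets: str) -> str:
  # Start with a small max_tokens (OpenAI rate-limits on it) and grow only when the
  # structured output gets cut off.
  max_tokens = _INITIAL_MAX_TOKENS
  while max_tokens <= _FINAL_MAX_TOKENS:
    try:
      result = _client().chat.completions.create(
        model='gpt-3.5-turbo-1106',
        response_model=Title,
        temperature=0.0,
        max_tokens=max_tokens,
        messages=[
          {'role': 'system', 'content': 'Generate a 4-5 word title for the snippets below.'},
          {'role': 'user', 'content': snippets},
        ],
      )
      return result.title
    except IncompleteOutputException:
      max_tokens += _INITIAL_MAX_TOKENS
  # A sentinel string (not None), so failures stay distinguishable from sparse text columns.
  return 'FAILED_TO_TITLE'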
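Finally, a short usage sketch of the API surface this PR touches. dataset.cluster and the new searches parameter on select_groups/pivot are what the patch adds or changes; the dataset namespace/name, the topic_fn stub, the output leaf path, and the KeywordSearch export/shape are illustrative assumptions.

import lilac as ll

ll.set_project_dir('./my_project')                # hypothetical project directory
dataset = ll.get_dataset('local', 'my_dataset')   # hypothetical namespace/name


def topic_fn(docs: list[tuple[str, float]]) -> str:
  # The real pipeline uses summarize_request (OpenAI); a stub keeps the sketch offline.
  return docs[0][0][:30]


# min_cluster_size now defaults to MIN_CLUSTER_SIZE (10); category (super-cluster) grouping
# uses the new MIN_CLUSTER_SIZE_CATEGORY (5) internally.
dataset.cluster('text', min_cluster_size=10, topic_fn=topic_fn)

# Histograms and the pivot view are now reactive to searches, e.g. a keyword search.
groups = dataset.select_groups(
  leaf_path=('text__cluster', 'cluster_title'),  # illustrative output path
  searches=[ll.KeywordSearch(path=('text',), query='translation')],  # assumed export/shape
)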