From 4219c4298c113170856a989b3c4ea8193a3ba372 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Mon, 29 Jan 2024 12:01:47 -0500 Subject: [PATCH] Improve clusters and several bug fixes (#1141) https://huggingface.co/spaces/lilacai/daniel_staging Clustering (backend): - Lower `min_cluster_size` to 5 for the categories, so we can have more coherent categories - Add timeout of 7 sec (99%-tile response latency for OpenAI is like 3-4 sec) to avoid the 10min timeout. We can now title clusters of 1M docs (11k clusters) in 8mins. - Disable internal OpenAI retries (we used to have double retries) - Replace "request" with "snippet" in the prompt to avoid biasing towards user's requests -- improves forum/email/text clustering UI - make the histograms reactive to the currently selected group in "group by" - make pivot reactive to searches (e.g. keyword search, metadata search) - remember the schema and nav bar state when flipping between cluster view and item view - Fix a bug with search box state, after page refresh --- lilac/__init__.py | 3 - lilac/data/clustering.py | 141 +++++++++++------- lilac/data/clustering_test.py | 7 + lilac/data/dataset.py | 5 + lilac/data/dataset_duckdb.py | 46 ++++-- lilac/formats/__init__.py | 3 +- lilac/router_dataset.py | 4 + .../lib/components/datasetView/Dataset.svelte | 8 +- .../datasetView/DatasetPivotResult.svelte | 2 +- .../datasetView/DatasetPivotViewer.svelte | 14 +- .../datasetView/GroupByPanel.svelte | 19 ++- .../datasetView/PrefetchRowItem.svelte | 20 +-- .../components/schemaView/FieldDetails.svelte | 13 +- .../src/lib/queries/datasetQueries.ts | 8 +- web/lib/fastapi_client/models/PivotOptions.ts | 5 + .../models/SelectGroupsOptions.ts | 5 + 16 files changed, 198 insertions(+), 105 deletions(-) diff --git a/lilac/__init__.py b/lilac/__init__.py index 776126d86..7ffc96db1 100644 --- a/lilac/__init__.py +++ b/lilac/__init__.py @@ -18,7 +18,6 @@ from .env import * # noqa: F403 from .env import LilacEnvironment, get_project_dir, set_project_dir from .formats import * # noqa: F403 -from .formats import OpenChat, ShareGPT from .load import load from .load_dataset import create_dataset, from_dicts, from_huggingface from .project import init @@ -73,8 +72,6 @@ 'deploy_project', 'deploy_config', 'SpanVector', - 'ShareGPT', - 'OpenChat', 'download', 'upload', 'register_embedding', diff --git a/lilac/data/clustering.py b/lilac/data/clustering.py index 7b57a73dd..a12b9011f 100644 --- a/lilac/data/clustering.py +++ b/lilac/data/clustering.py @@ -44,8 +44,13 @@ _SHORTEN_LEN = 400 _TOP_K_CENTRAL_DOCS = 7 -_TOP_K_CENTRAL_TITLES = 15 +_TOP_K_CENTRAL_TITLES = 20 _NUM_THREADS = 32 +_NUM_RETRIES = 16 +# OpenAI rate limits you on `max_tokens` so we ideally want to guess the right value. If ChatGPT +# fails to generate a title within the `max_tokens` limit, we will retry with a higher value. +_INITIAL_MAX_TOKENS = 50 +_FINAL_MAX_TOKENS = 200 CLUSTER_ID = 'cluster_id' CLUSTER_MEMBERSHIP_PROB = 'cluster_membership_prob' @@ -57,7 +62,8 @@ FIELD_SUFFIX = 'cluster' -MIN_CLUSTER_SIZE = 5 +MIN_CLUSTER_SIZE = 10 +MIN_CLUSTER_SIZE_CATEGORY = 5 UMAP_DIM = 5 UMAP_SEED = 42 HDBSCAN_SELECTION_EPS = 0.05 @@ -76,7 +82,10 @@ def _openai_client() -> Any: 'Please install it with `pip install openai`.' ) - return instructor.patch(openai.OpenAI()) + # OpenAI requests sometimes hang, without any errors, and the default connection timeout is 10 + # mins, which is too long. Set it to 7 seconds (99%-tile for latency is 3-4 sec). 
Also set + # `max_retries` to 0 to disable internal retries so we handle retries ourselves. + return instructor.patch(openai.OpenAI(timeout=7, max_retries=0)) def _snippet_to_prefix_and_suffix(text: str) -> str: @@ -88,7 +97,7 @@ def _snippet_to_prefix_and_suffix(text: str) -> str: class Title(BaseModel): - """A 4-5 word title for the group of related requests.""" + """A 4-5 word title for the group of related snippets.""" title: str @@ -97,7 +106,7 @@ def summarize_request(ranked_docs: list[tuple[str, float]]) -> str: """Summarize a group of requests in a title of at most 5 words.""" # Get the top 5 documents. docs = [doc for doc, _ in ranked_docs[:_TOP_K_CENTRAL_DOCS]] - texts = [f'BEGIN_REQUEST\n{_snippet_to_prefix_and_suffix(doc)}\nEND_REQUEST' for doc in docs] + texts = [f'BEGIN_SNIPPET\n{_snippet_to_prefix_and_suffix(doc)}\nEND_SNIPPET' for doc in docs] input = '\n'.join(texts) try: import openai @@ -109,13 +118,21 @@ def summarize_request(ranked_docs: list[tuple[str, float]]) -> str: ) @retry( - retry=retry_if_exception_type((openai.RateLimitError, openai.APITimeoutError)), + retry=retry_if_exception_type( + ( + openai.RateLimitError, + openai.APITimeoutError, + openai.APIConnectionError, + openai.ConflictError, + openai.InternalServerError, + ) + ), wait=wait_random_exponential(multiplier=0.5, max=60), - stop=stop_after_attempt(10), + stop=stop_after_attempt(_NUM_RETRIES), ) def request_with_retries() -> str: - max_tokens = 50 - while max_tokens <= 200: + max_tokens = _INITIAL_MAX_TOKENS + while max_tokens <= _FINAL_MAX_TOKENS: try: title = _openai_client().chat.completions.create( model='gpt-3.5-turbo-1106', @@ -126,11 +143,13 @@ def request_with_retries() -> str: { 'role': 'system', 'content': ( - 'You are a world-class title generator. Ignore the group of related requests ' - 'below, and generate a short title to describe the common theme. Some examples: ' + 'You are a world-class short title generator. Ignore the related snippets below ' + 'and generate a short title to describe their common theme. Some examples: ' '"YA book reviews", "Questions about South East Asia", "Translating English to ' - 'Polish", "Writing product descriptions", etc. Prefer using descriptive words. Do ' - 'not use vague words like "various", "assortment", "comments", "discussion", etc.' + 'Polish", "Writing product descriptions", etc. Use descriptive words. If the ' + "snippet's language is different than English, mention it in the title, e.g. " + '"Cooking questions in Spanish". Avoid vague words like "various", "assortment", ' + '"comments", "discussion", etc.' ), }, {'role': 'user', 'content': input}, @@ -138,10 +157,11 @@ def request_with_retries() -> str: ) return title.title except IncompleteOutputException: - max_tokens = max_tokens * 2 + max_tokens += _INITIAL_MAX_TOKENS log(f'Retrying with max_tokens={max_tokens}') log(f'Could not generate a short title for input:\n{input}') - return 'FAILED_TO_GENERATE' + # We return a string instead of None, since None is emitted when the text column is sparse. 
+ return 'FAILED_TO_TITLE' return request_with_retries() @@ -167,29 +187,45 @@ def generate_category(ranked_docs: list[tuple[str, float]]) -> str: ) @retry( - retry=retry_if_exception_type((openai.RateLimitError, openai.APITimeoutError)), + retry=retry_if_exception_type( + ( + openai.RateLimitError, + openai.APITimeoutError, + openai.APIConnectionError, + openai.ConflictError, + openai.InternalServerError, + ) + ), wait=wait_random_exponential(multiplier=0.5, max=60), - stop=stop_after_attempt(10), + stop=stop_after_attempt(_NUM_RETRIES), ) def request_with_retries() -> str: - category = _openai_client().chat.completions.create( - model='gpt-3.5-turbo-1106', - response_model=Category, - temperature=0.0, - max_tokens=50, - messages=[ - { - 'role': 'system', - 'content': ( - 'You are a world-class category labeler. Generate a short category name for the ' - 'provided titles. For example, given two titles "translating english to polish" and ' - '"translating korean to english", generate "Translation".' - ), - }, - {'role': 'user', 'content': input}, - ], - ) - return category.category + max_tokens = _INITIAL_MAX_TOKENS + while max_tokens <= _FINAL_MAX_TOKENS: + try: + category = _openai_client().chat.completions.create( + model='gpt-3.5-turbo-1106', + response_model=Category, + temperature=0.0, + max_tokens=max_tokens, + messages=[ + { + 'role': 'system', + 'content': ( + 'You are a world-class category labeler. Generate a short category name for the ' + 'provided titles. For example, given two titles "translating english to polish" ' + 'and "translating korean to english", generate "Translation".' + ), + }, + {'role': 'user', 'content': input}, + ], + ) + return category.category + except IncompleteOutputException: + max_tokens += _INITIAL_MAX_TOKENS + log(f'Retrying with max_tokens={max_tokens}') + log(f'Could not generate a short category for input:\n{input}') + return 'FAILED_TO_GENERATE' return request_with_retries() @@ -257,7 +293,7 @@ def cluster_impl( dataset: Dataset, input_fn_or_path: Union[Path, Callable[[Item], str], DatasetFormatInputSelector], output_path: Optional[Path] = None, - min_cluster_size: int = 5, + min_cluster_size: int = MIN_CLUSTER_SIZE, topic_fn: TopicFn = summarize_request, overwrite: bool = False, use_garden: bool = False, @@ -347,11 +383,11 @@ def extract_text(item: Item) -> Item: cluster_ids_exists = schema.has_field((*cluster_output_path, CLUSTER_ID)) if not cluster_ids_exists or overwrite: if task_info: - task_info.message = 'Computing clusters' + task_info.message = 'Clustering documents' task_info.total_progress = 0 task_info.total_len = None - def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: + def cluster_documents(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) docs: Iterator[Optional[str]] = (item.get(TEXT_COLUMN) for item in items) cluster_items = sparse_to_dense_compute( @@ -365,7 +401,7 @@ def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: # Compute the clusters. 
dataset.transform( - compute_clusters, + cluster_documents, input_path=cluster_output_path, output_path=cluster_output_path, overwrite=True, @@ -374,11 +410,11 @@ def compute_clusters(items: Iterator[Item]) -> Iterator[Item]: cluster_titles_exist = schema.has_field((*cluster_output_path, CLUSTER_TITLE)) if not cluster_titles_exist or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing cluster titles' + task_info.message = 'Titling clusters' task_info.total_progress = 0 task_info.total_len = total_len - def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: + def title_clusters(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) titles = _compute_titles( items, @@ -392,7 +428,7 @@ def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: yield {**item, CLUSTER_TITLE: title} dataset.transform( - compute_cluster_titles, + title_clusters, input_path=cluster_output_path, output_path=cluster_output_path, sort_by=(*cluster_output_path, CLUSTER_ID), @@ -402,15 +438,15 @@ def compute_cluster_titles(items: Iterator[Item]) -> Iterator[Item]: category_id_exists = schema.has_field((*cluster_output_path, CATEGORY_ID)) if not category_id_exists or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing super clusters' + task_info.message = 'Clustering titles' task_info.total_progress = 0 task_info.total_len = None - def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: + def cluster_titles(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) docs = (item.get(CLUSTER_TITLE) for item in items) cluster_items = sparse_to_dense_compute( - docs, lambda x: _hdbscan_cluster(x, min_cluster_size, use_garden) + docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden) ) for item, cluster_item in zip(items2, cluster_items): item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1) @@ -419,7 +455,7 @@ def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: # Compute the clusters. dataset.transform( - compute_category_clusters, + cluster_titles, input_path=cluster_output_path, output_path=cluster_output_path, overwrite=True, @@ -429,11 +465,11 @@ def compute_category_clusters(items: Iterator[Item]) -> Iterator[Item]: category_title_exists = schema.has_field(category_title_path) if not category_title_exists or overwrite or recompute_titles: if task_info: - task_info.message = 'Computing category titles' + task_info.message = 'Titling categories' task_info.total_progress = 0 task_info.total_len = total_len - def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]: + def title_categories(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) titles = _compute_titles( items, @@ -445,11 +481,12 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]: ) for item, title in zip(items2, titles): # Drop the temporary newline-concatenated text column. 
- del item[TEXT_COLUMN] + if TEXT_COLUMN in item: + del item[TEXT_COLUMN] yield {**item, CATEGORY_TITLE: title} dataset.transform( - compute_category_titles, + title_categories, input_path=cluster_output_path, output_path=cluster_output_path, sort_by=(*cluster_output_path, CATEGORY_ID), @@ -494,7 +531,7 @@ def drop_temp_text_column(items: Iterator[Item]) -> Iterator[Item]: def _hdbscan_cluster( docs: Iterator[str], - min_cluster_size: int = MIN_CLUSTER_SIZE, + min_cluster_size: int, use_garden: bool = False, num_docs: Optional[int] = None, task_info: Optional[TaskInfo] = None, @@ -504,7 +541,7 @@ def _hdbscan_cluster( remote_fn = modal.Function.lookup('cluster', 'Cluster.cluster').remote with DebugTimer('Compressing docs for clustering remotely'): gzipped_docs = compress_docs(list(docs)) - response = remote_fn({'gzipped_docs': gzipped_docs}) + response = remote_fn({'gzipped_docs': gzipped_docs, 'min_cluster_size': min_cluster_size}) yield from response['clusters'] if task_info: diff --git a/lilac/data/clustering_test.py b/lilac/data/clustering_test.py index 6bc970220..9ec6f5178 100644 --- a/lilac/data/clustering_test.py +++ b/lilac/data/clustering_test.py @@ -87,6 +87,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'simplification' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') _mock_jina(mocker) @@ -236,6 +237,7 @@ def test_nested_clusters(make_test_data: TestDataMaker, mocker: MockerFixture) - {'text': 'Give me simplified version of this text'}, ], ] + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') dataset = make_test_data([{'texts': t} for t in texts]) @@ -307,6 +309,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'b_cluster' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster('texts.*', min_cluster_size=2, topic_fn=topic_fn) rows = list(dataset.select_rows(combine_columns=True)) @@ -356,6 +359,7 @@ def test_clusters_with_fn(make_test_data: TestDataMaker, mocker: MockerFixture) ] dataset = make_test_data([{'texts': t} for t in texts]) mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) def topic_fn(docs: list[tuple[str, float]]) -> str: if 'summar' in docs[0][0]: @@ -440,6 +444,7 @@ def test_clusters_with_fn_output_is_under_a_dict( ] mocker.patch.object(clustering, 'generate_category', return_value='MockCategory') dataset = make_test_data([{'texts': t, 'info': {'dummy': True}} for t in texts]) + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) def topic_fn(docs: list[tuple[str, float]]) -> str: if 'summar' in docs[0][0]: @@ -557,6 +562,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: return 'time' return 'other' + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster( ShareGPT.human, @@ -655,6 +661,7 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: signal = TestSignal() dataset.compute_signal(signal, 'text') + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) _mock_jina(mocker) dataset.cluster('text', min_cluster_size=2, topic_fn=topic_fn) diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index fef6e4251..7aad43e81 100644 --- a/lilac/data/dataset.py +++ b/lilac/data/dataset.py @@ -609,6 
+609,8 @@ def select_groups( sort_order: Optional[SortOrder] = SortOrder.DESC, limit: Optional[int] = None, bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None, + include_deleted: bool = False, + searches: Optional[Sequence[Search]] = None, ) -> SelectGroupsResult: """Select grouped columns to power a histogram. @@ -620,6 +622,8 @@ def select_groups( sort_order: The sort order. limit: The maximum number of rows to return. bins: The bins to use when bucketizing a float column. + include_deleted: Whether to include deleted rows in the query. + searches: The searches to apply to the query. Returns: A `SelectGroupsResult` iterator where each row is a group. @@ -631,6 +635,7 @@ def pivot( self, outer_path: Path, inner_path: Path, + searches: Optional[Sequence[Search]] = None, filters: Optional[Sequence[FilterLike]] = None, sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT, sort_order: Optional[SortOrder] = SortOrder.DESC, diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index 9536c7cef..335c556e3 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -1739,6 +1739,7 @@ def select_groups( limit: Optional[int] = None, bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None, include_deleted: bool = False, + searches: Optional[Sequence[Search]] = None, ) -> SelectGroupsResult: if not leaf_path: raise ValueError('leaf_path must be provided') @@ -1815,6 +1816,9 @@ def select_groups( filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest) if not include_deleted and manifest.data_schema.has_field((DELETED_LABEL_NAME,)): filters.append(Filter(path=(DELETED_LABEL_NAME,), op='not_exists')) + + filters = self._add_searches_to_filters(searches or [], filters) + filter_queries = self._create_where(manifest, filters) where_query = '' @@ -1841,6 +1845,7 @@ def pivot( self, outer_path: Path, inner_path: Path, + searches: Optional[Sequence[Search]] = None, filters: Optional[Sequence[FilterLike]] = None, sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT, sort_order: Optional[SortOrder] = SortOrder.DESC, @@ -1853,7 +1858,7 @@ def pivot( outer_path = normalize_path(outer_path) pivot_key = (outer_path, inner_path, sort_by, sort_order) - use_cache = not filters + use_cache = not filters and not searches if use_cache and pivot_key in self._pivot_cache: return self._pivot_cache[pivot_key] @@ -1914,6 +1919,10 @@ def pivot( span_from=self._resolve_span(outer_path, manifest), ) filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest) + + # Add search where queries. + filters = self._add_searches_to_filters(searches or [], filters) + where_query = self._compile_select_options( DuckDBQueryParams(filters=filters, include_deleted=False) ) @@ -2123,20 +2132,7 @@ def select_rows( # Filtering and searching. where_query = '' filters, udf_filters = self._normalize_filters(filters, col_aliases, udf_aliases, manifest) - # Add search where queries. - for search in searches: - search_path = normalize_path(search.path) - if search.type == 'keyword': - filters.append(Filter(path=search_path, op='ilike', value=search.query)) - elif search.type == 'semantic' or search.type == 'concept': - # Semantic search and concepts don't yet filter. - continue - elif search.type == 'metadata': - # Make a regular filter query. 
- filter = Filter(path=search_path, op=search.op, value=search.value) - filters.append(filter) - else: - raise ValueError(f'Unknown search operator {search.type}.') + filters = self._add_searches_to_filters(searches, filters) if not include_deleted and manifest.data_schema.has_field((DELETED_LABEL_NAME,)): filters.append(Filter(path=(DELETED_LABEL_NAME,), op='not_exists')) @@ -3090,7 +3086,7 @@ def map( raise ValueError( f'{output_path} is not a map/cluster column and cannot be overwritten.' ) - else: + elif input_path != output_path: raise ValueError( f'Cannot map to path "{output_path}" which already exists in the dataset. ' 'Use overwrite=True to overwrite the column.' @@ -3460,6 +3456,24 @@ def _select_sql( selection = f'unnest({selection})' return selection + def _add_searches_to_filters( + self, searches: Sequence[Search], filters: list[Filter] + ) -> list[Filter]: + for search in searches: + search_path = normalize_path(search.path) + if search.type == 'keyword': + filters.append(Filter(path=search_path, op='ilike', value=search.query)) + elif search.type == 'semantic' or search.type == 'concept': + # Semantic search and concepts don't yet filter. + continue + elif search.type == 'metadata': + # Make a regular filter query. + filter = Filter(path=search_path, op=search.op, value=search.value) + filters.append(filter) + else: + raise ValueError(f'Unknown search operator {search.type}.') + return filters + def _escape_like_value(value: str) -> str: value = value.replace('%', '\\%').replace('_', '\\_') diff --git a/lilac/formats/__init__.py b/lilac/formats/__init__.py index c9e41254e..6232a7369 100644 --- a/lilac/formats/__init__.py +++ b/lilac/formats/__init__.py @@ -2,9 +2,10 @@ from .default_formats import register_default_formats +from .openai_json import OpenAIConversationJSON, OpenAIJSON from .openchat import OpenChat from .sharegpt import ShareGPT register_default_formats() -__all__ = ['ShareGPT', 'OpenChat'] +__all__ = ['ShareGPT', 'OpenChat', 'OpenAIJSON', 'OpenAIConversationJSON'] diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py index 8509a4274..ffa789736 100644 --- a/lilac/router_dataset.py +++ b/lilac/router_dataset.py @@ -233,6 +233,7 @@ class SelectGroupsOptions(BaseModel): leaf_path: Path filters: Sequence[Filter] = [] + searches: Sequence[SearchPy] = [] sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT sort_order: Optional[SortOrder] = SortOrder.DESC limit: Optional[int] = 100 @@ -255,6 +256,7 @@ def select_groups( options.sort_order, options.limit, options.bins, + searches=options.searches, ) @@ -264,6 +266,7 @@ class PivotOptions(BaseModel): inner_path: Path outer_path: Path filters: Sequence[Filter] = [] + searches: Sequence[SearchPy] = [] @router.post('/{namespace}/{dataset_name}/pivot') @@ -277,6 +280,7 @@ def pivot(namespace: str, dataset_name: str, options: PivotOptions) -> PivotResu options.outer_path, options.inner_path, filters=sanitized_filters, + searches=options.searches, ) diff --git a/web/blueprint/src/lib/components/datasetView/Dataset.svelte b/web/blueprint/src/lib/components/datasetView/Dataset.svelte index 5dce749b9..faa996e01 100644 --- a/web/blueprint/src/lib/components/datasetView/Dataset.svelte +++ b/web/blueprint/src/lib/components/datasetView/Dataset.svelte @@ -51,6 +51,12 @@ let showCopyToast = false; $: link = datasetLink(namespace, datasetName, $navState); + $: backToItemsLink = datasetLink(namespace, datasetName, $navState, { + ...$datasetViewStore, + pivot: undefined, + viewPivot: false, + groupBy: undefined + }); // 
Determine whether the dataset has clusters. $: clusterFields = childFields($schema.data).filter(f => isClusterRootField(f)); @@ -69,7 +75,7 @@ pivot: {outerPath: clusterOuterPath, innerPath: clusterInnerPath} }) : null; - $: clusterToggleLink = $datasetViewStore.viewPivot ? link : clusterLink; + $: clusterToggleLink = $datasetViewStore.viewPivot ? backToItemsLink : clusterLink; diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte index 7cd98056a..5413a69ae 100644 --- a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte +++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte @@ -181,7 +181,7 @@ width: 100%; overflow: hidden; display: -webkit-box; - -webkit-line-clamp: 3; + -webkit-line-clamp: 2; -webkit-box-orient: vertical; } .card-title { diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte index d538f4ed5..da81a47de 100644 --- a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte +++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte @@ -76,7 +76,8 @@ $: pivotQuery = pivotQueryFn($store.namespace, $store.datasetName, { outer_path: outerLeafPath!, inner_path: innerLeafPath!, - filters: selectOptions.filters + filters: selectOptions.filters, + searches: selectOptions.searches }); // The search text after a user presses the search button or enter. @@ -159,10 +160,7 @@ } // The bound input text from the search box. - let inputSearchText: string | undefined = undefined; - function searchInput(e: Event) { - inputSearchText = (e.target as HTMLInputElement)?.value; - } + $: inputSearchText = searchText; function search() { $store.pivot = {...$store.pivot, searchText: inputSearchText}; @@ -190,17 +188,17 @@
[Template markup for this hunk (the pivot viewer's search box and "Explore" button) and the diff header for web/blueprint/src/lib/components/datasetView/GroupByPanel.svelte were lost in extraction; the GroupByPanel.svelte script changes follow.]
import {querySelectGroups} from '$lib/queries/datasetQueries'; - import {getDatasetViewContext, type GroupByState} from '$lib/stores/datasetViewStore'; + import { + getDatasetViewContext, + getSelectRowsOptions, + type GroupByState + } from '$lib/stores/datasetViewStore'; import {shortFieldName} from '$lib/view_utils'; import { formatValue, @@ -16,18 +20,18 @@ export let groupBy: GroupByState; export let schema: LilacSchema; - let store = getDatasetViewContext(); + const store = getDatasetViewContext(); $: value = groupBy.value; $: field = getField(schema, groupBy.path)!; $: sortBy = isNumeric(field.dtype) && !field.categorical ? 'value' : 'count'; $: sortOrder = sortBy === 'value' ? 'ASC' : 'DESC'; - + $: selectOptions = getSelectRowsOptions($store, schema); $: groupsQuery = querySelectGroups($store.namespace, $store.datasetName, { leaf_path: groupBy.path, sort_by: sortBy as GroupsSortBy, sort_order: sortOrder as SortOrder, - filters: $store.query.filters, + filters: selectOptions.filters, // Explicitly set the limit to null to get all the groups, not just the top 100. limit: null }); @@ -46,8 +50,11 @@ if (value == null || allCounts == null || valueIndex == null) { return; } - const newValue = - direction === 'next' ? allCounts[valueIndex + 1][0] : allCounts[valueIndex - 1][0]; + const newIndex = direction === 'next' ? valueIndex + 1 : valueIndex - 1; + if (newIndex < 0 || newIndex >= allCounts.length) { + return; + } + const newValue = direction === 'next' ? allCounts[newIndex][0] : allCounts[newIndex][0]; store.setGroupBy(groupBy.path, newValue); } function onKeyDown(key: KeyboardEvent) { diff --git a/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte b/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte index a562c6332..6483310f5 100644 --- a/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte +++ b/web/blueprint/src/lib/components/datasetView/PrefetchRowItem.svelte @@ -23,19 +23,19 @@ getSelectRowsSchemaOptions($datasetViewStore, $schema.data) ); $: selectOptions = getSelectRowsOptions($datasetViewStore, $selectRowsSchema.data?.schema); - $: rowQuery = + $: readyToQueryRow = !$selectRowsSchema.isFetching && $selectRowsSchema?.data?.schema != null && selectOptions != null && - rowId != null - ? queryRowMetadata( - namespace, - datasetName, - rowId, - selectOptions, - $selectRowsSchema.data.schema - ) - : null; + rowId != null; + $: rowQuery = queryRowMetadata( + namespace, + datasetName, + rowId, + selectOptions, + $selectRowsSchema.data?.schema, + readyToQueryRow + ); {#if $rowQuery?.data != null} diff --git a/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte b/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte index 7d93412c5..08df69810 100644 --- a/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte +++ b/web/blueprint/src/lib/components/schemaView/FieldDetails.svelte @@ -1,6 +1,10 @@
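For reference, a minimal standalone sketch of the titling retry strategy introduced in clustering.py: tenacity retries on transient OpenAI errors, while max_tokens starts small and grows by _INITIAL_MAX_TOKENS whenever instructor reports an incomplete structured output. This is a sketch, not the patched code itself: the IncompleteOutputException import path may differ across instructor versions, and the helper names (_client, title_snippets) and the shortened system prompt are illustrative.

from typing import Any

import instructor
import openai
from instructor.exceptions import IncompleteOutputException  # import path may differ by version
from pydantic import BaseModel
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

_INITIAL_MAX_TOKENS = 50
_FINAL_MAX_TOKENS = 200
_NUM_RETRIES = 16


class Title(BaseModel):
  """A 4-5 word title for the group of related snippets."""

  title: str


def _client() -> Any:
  # Short timeout (OpenAI requests occasionally hang) and no internal retries, so tenacity
  # below is the single place where retries happen.
  return instructor.patch(openai.OpenAI(timeout=7, max_retries=0))


@retry(
  retry=retry_if_exception_type(
    (
      openai.RateLimitError,
      openai.APITimeoutError,
      openai.APIConnectionError,
      openai.ConflictError,
      openai.InternalServerError,
    )
  ),
  wait=wait_random_exponential(multiplier=0.5, max=60),
  stop=stop_after_attempt(_NUM_RETRIES),
)
def title_snippets(snippets: str) -> str:
  # Start with a small max_tokens (OpenAI rate-limits on it) and grow only when the
  # structured output gets cut off.
  max_tokens = _INITIAL_MAX_TOKENS
  while max_tokens <= _FINAL_MAX_TOKENS:
    try:
      result = _client().chat.completions.create(
        model='gpt-3.5-turbo-1106',
        response_model=Title,
        temperature=0.0,
        max_tokens=max_tokens,
        messages=[
          {'role': 'system', 'content': 'Generate a 4-5 word title for the snippets below.'},
          {'role': 'user', 'content': snippets},
        ],
      )
      return result.title
    except IncompleteOutputException:
      max_tokens += _INITIAL_MAX_TOKENS
  # A sentinel string (not None), so failures stay distinguishable from sparse text columns.
  return 'FAILED_TO_TITLE'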
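Finally, a short usage sketch of the API surface this PR touches. dataset.cluster and the new searches parameter on select_groups/pivot are what the patch adds or changes; the dataset namespace/name, the topic_fn stub, the output leaf path, and the KeywordSearch export/shape are illustrative assumptions.

import lilac as ll

ll.set_project_dir('./my_project')                # hypothetical project directory
dataset = ll.get_dataset('local', 'my_dataset')   # hypothetical namespace/name


def topic_fn(docs: list[tuple[str, float]]) -> str:
  # The real pipeline uses summarize_request (OpenAI); a stub keeps the sketch offline.
  return docs[0][0][:30]


# min_cluster_size now defaults to MIN_CLUSTER_SIZE (10); category (super-cluster) grouping
# uses the new MIN_CLUSTER_SIZE_CATEGORY (5) internally.
dataset.cluster('text', min_cluster_size=10, topic_fn=topic_fn)

# Histograms and the pivot view are now reactive to searches, e.g. a keyword search.
groups = dataset.select_groups(
  leaf_path=('text__cluster', 'cluster_title'),  # illustrative output path
  searches=[ll.KeywordSearch(path=('text',), query='translation')],  # assumed export/shape
)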