diff --git a/lilac/concepts/concept.py b/lilac/concepts/concept.py
index 7580a369..de353761 100644
--- a/lilac/concepts/concept.py
+++ b/lilac/concepts/concept.py
@@ -66,8 +66,10 @@ class ExampleIn(BaseModel):
 
   @field_validator('text')
   @classmethod
-  def parse_text(cls, text: str) -> str:
+  def parse_text(cls, text: Optional[str]) -> Optional[str]:
     """Fixes surrogate errors in text: https://github.com/ijl/orjson/blob/master/README.md#str ."""
+    if not text:
+      return None
     return text.encode('utf-8', 'replace').decode('utf-8')
diff --git a/lilac/concepts/db_concept.py b/lilac/concepts/db_concept.py
index 45a40656..97d0513a 100644
--- a/lilac/concepts/db_concept.py
+++ b/lilac/concepts/db_concept.py
@@ -473,6 +473,8 @@ def _validate_examples(
     self, examples: List[Union[ExampleIn, Example]], type: ConceptType
   ) -> None:
     for example in examples:
+      if not example.text and not example.img:
+        raise ValueError('The example must have a text or image associated with it.')
       inferred_type = 'text' if example.text else 'unknown'
       if inferred_type != type:
         raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 0ba749a2..a65d51ff 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -1,9 +1,10 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
 from ..utils import log
 
 if TYPE_CHECKING:
@@ -16,7 +17,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,11 +70,15 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
       lambda docs: self._model.encode(docs)['dense_vecs'],
       docs,
       self.local_batch_size * 16,
-      chunker=clustering_spacy_chunker,
+      chunker=chunker,
     )
 
   @override
diff --git a/lilac/embeddings/cohere.py b/lilac/embeddings/cohere.py
index a95eb437..4fcff463 100644
--- a/lilac/embeddings/cohere.py
+++ b/lilac/embeddings/cohere.py
@@ -1,5 +1,5 @@
 """Cohere embeddings."""
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 import numpy as np
 from typing_extensions import override
@@ -7,9 +7,10 @@
 from ..env import env
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import TextChunk
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 if TYPE_CHECKING:
   from cohere import Client
@@ -65,6 +66,8 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
         ).embeddings
       ]
 
-    return chunked_compute_embedding(
-      _embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
     )
+    return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/gte.py b/lilac/embeddings/gte.py
index 1da5d509..074d57e3 100644
--- a/lilac/embeddings/gte.py
+++ b/lilac/embeddings/gte.py
@@ -1,7 +1,7 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
 import itertools
-from typing import TYPE_CHECKING, ClassVar, Iterator, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Iterator, Optional, cast
 
 import modal
 from typing_extensions import override
@@ -19,7 +19,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,8 +69,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
@@ -78,8 +82,12 @@ def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
     # Trim the docs to the max context size.
     trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     text_chunks: Iterator[tuple[int, TextChunk]] = (
-      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in clustering_spacy_chunker(doc)
+      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
     )
     text_chunks, text_chunks_2 = itertools.tee(text_chunks)
     chunk_texts = (chunk[0] for _, chunk in text_chunks)
diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
index 24f703ee..118f7974 100644
--- a/lilac/embeddings/nomic_embed.py
+++ b/lilac/embeddings/nomic_embed.py
@@ -1,10 +1,12 @@
 """Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
 import gc
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 import numpy as np
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
+
 if TYPE_CHECKING:
   from sentence_transformers import SentenceTransformer
@@ -14,7 +16,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -76,9 +78,11 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    return chunked_compute_embedding(
-      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
     )
+    return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)
 
   @override
   def teardown(self) -> None:
diff --git a/lilac/embeddings/openai.py b/lilac/embeddings/openai.py
index a9a70124..0da8653c 100644
--- a/lilac/embeddings/openai.py
+++ b/lilac/embeddings/openai.py
@@ -1,5 +1,5 @@
 """OpenAI embeddings."""
-from typing import ClassVar, Optional
+from typing import Callable, ClassVar, Optional, cast
 
 import numpy as np
 from tenacity import retry, stop_after_attempt, wait_random_exponential
@@ -8,9 +8,10 @@
 from ..env import env
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import TextChunk
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 API_NUM_PARALLEL_REQUESTS = 10
 API_OPENAI_BATCH_SIZE = 128
@@ -92,6 +93,8 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
       )
       return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]
 
-    return chunked_compute_embedding(
-      embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
     )
+    return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/sbert.py b/lilac/embeddings/sbert.py
index 3beba604..91c586e4 100644
--- a/lilac/embeddings/sbert.py
+++ b/lilac/embeddings/sbert.py
@@ -1,8 +1,9 @@
 """Sentence-BERT embeddings. Open-source models, designed to run on device."""
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, cast
 
 from typing_extensions import override
 
+from ..splitters.chunk_splitter import TextChunk
 from ..tasks import TaskExecutionType
 
 if TYPE_CHECKING:
@@ -12,7 +13,7 @@
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2`` is 5 times
@@ -47,8 +48,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = cast(
+      Callable[[str], list[TextChunk]],
+      clustering_spacy_chunker if self._split else identity_chunker,
+    )
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
diff --git a/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte b/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte
index 9a4dc032..f1e7942f 100644
--- a/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte
+++ b/web/blueprint/src/lib/components/datasetView/ItemMediaTextContent.svelte
@@ -76,7 +76,7 @@
 };
 $: {
   pathToSpans = {};
-  spanPaths.forEach(sp => {
+  (spanPaths || []).forEach(sp => {
     if (row == null) return;
    let valueNodes = getValueNodes(row, sp);
    const isSpanNestedUnder = pathMatchesPrefix(sp, path);
@@ -97,7 +97,7 @@ let spanPathToValueInfos: Record<string, SpanValueInfo[]> = {};
 $: {
   spanPathToValueInfos = {};
-  for (const spanValueInfo of spanValueInfos) {
+  for (const spanValueInfo of spanValueInfos || []) {
     const spanPathStr = serializePath(spanValueInfo.spanPath);
     if (spanPathToValueInfos[spanPathStr] == null) {
       spanPathToValueInfos[spanPathStr] = [];
     }
@@ -206,7 +206,7 @@
 $: {
   if (model != null && editor != null) {
     let minPosition: Monaco.Position | null = null;
-    for (const renderSpan of monacoSpans) {
+    for (const renderSpan of monacoSpans || []) {
       const span = L.span(renderSpan.span)!;

       const position = model.getPositionAt(span.start);
@@ -381,7 +381,7 @@
 const conceptQuery = queryConcepts();
 $: concepts = $conceptQuery.data;

-let conceptsInMenu: Set<string>;
+let conceptsInMenu: Set<string> = new Set();
 let addToConceptItems: DropdownItem[] = [];

 $: {
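Note, not part of the patch above: every embedding backend now picks between `clustering_spacy_chunker` and the new `identity_chunker` based on `self._split`. The `identity_chunker` is imported from `lilac/embeddings/embedding.py`, which is not included in these hunks. A minimal sketch of what such a chunker could look like, assuming `TextChunk` is a `(text, (start, end))` tuple like the chunks produced by `clustering_spacy_chunker`:

```python
# Sketch only: the real identity_chunker lives in lilac/embeddings/embedding.py and may differ.
# Assumes TextChunk is a (text, (start, end)) tuple covering a slice of the original document.
from lilac.splitters.chunk_splitter import TextChunk


def identity_chunker(text: str) -> list[TextChunk]:
  """Treat the whole document as a single chunk instead of splitting it."""
  return [(text, (0, len(text)))]
```

With this in place, a signal configured with `self._split` disabled embeds each document once, end to end, while the spaCy chunking path is unchanged.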