Skip to content

Commit

Permalink
save
Browse files Browse the repository at this point in the history
  • Loading branch information
dsmilkov committed Feb 27, 2024
1 parent 33dcf03 commit 1b9ddc8
Show file tree
Hide file tree
Showing 9 changed files with 27 additions and 77 deletions.
2 changes: 2 additions & 0 deletions lilac/concepts/db_concept.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,8 @@ def _validate_examples(
self, examples: List[Union[ExampleIn, Example]], type: ConceptType
) -> None:
for example in examples:
if not example.text and not example.img:
raise ValueError('The example must have a text or image associated with it.')
inferred_type = 'text' if example.text else 'unknown'
if inferred_type != type:
raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
Expand Down
5 changes: 3 additions & 2 deletions lilac/embeddings/bge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
Expand Down Expand Up @@ -69,11 +69,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(
lambda docs: self._model.encode(docs)['dense_vecs'],
docs,
self.local_batch_size * 16,
chunker=clustering_spacy_chunker,
chunker=chunker,
)

@override
Expand Down
7 changes: 3 additions & 4 deletions lilac/embeddings/cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker

if TYPE_CHECKING:
from cohere import Client
Expand Down Expand Up @@ -65,6 +65,5 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
).embeddings
]

return chunked_compute_embedding(
_embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
)
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
8 changes: 5 additions & 3 deletions lilac/embeddings/gte.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
Expand Down Expand Up @@ -69,17 +69,19 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(
self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
)

@override
def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
# Trim the docs to the max context size.

trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
chunker = clustering_spacy_chunker if self._split else identity_chunker
text_chunks: Iterator[tuple[int, TextChunk]] = (
(i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in clustering_spacy_chunker(doc)
(i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
)
text_chunks, text_chunks_2 = itertools.tee(text_chunks)
chunk_texts = (chunk[0] for _, chunk in text_chunks)
Expand Down
7 changes: 3 additions & 4 deletions lilac/embeddings/nomic_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
Expand Down Expand Up @@ -76,9 +76,8 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
return chunked_compute_embedding(
_encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
)
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)

@override
def teardown(self) -> None:
Expand Down
7 changes: 3 additions & 4 deletions lilac/embeddings/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker

API_NUM_PARALLEL_REQUESTS = 10
API_OPENAI_BATCH_SIZE = 128
Expand Down Expand Up @@ -92,6 +92,5 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
)
return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]

return chunked_compute_embedding(
embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
)
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
5 changes: 3 additions & 2 deletions lilac/embeddings/sbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from .embedding import chunked_compute_embedding
from .embedding import chunked_compute_embedding, identity_chunker
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times
Expand Down Expand Up @@ -47,8 +47,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
chunker = clustering_spacy_chunker if self._split else identity_chunker
return chunked_compute_embedding(
self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
)

@override
Expand Down
55 changes: 1 addition & 54 deletions web/blueprint/src/lib/components/concepts/ConceptView.svelte
Original file line number Diff line number Diff line change
@@ -1,27 +1,21 @@
<script lang="ts">
import {goto} from '$app/navigation';
import {
editConceptMutation,
queryConceptModels,
queryConcepts
} from '$lib/queries/conceptQueries';
import {queryAuthInfo} from '$lib/queries/serverQueries';
import {queryEmbeddings} from '$lib/queries/signalQueries';
import {createDatasetViewStore} from '$lib/stores/datasetViewStore';
import {getNavigationContext} from '$lib/stores/navigationStore';
import {datasetLink} from '$lib/utils';
import type {Concept} from '$lilac';
import {Button, ToastNotification} from 'carbon-components-svelte';
import {ToastNotification} from 'carbon-components-svelte';
import {View, ViewOff} from 'carbon-icons-svelte';
import ThumbsDownFilled from 'carbon-icons-svelte/lib/ThumbsDownFilled.svelte';
import ThumbsUpFilled from 'carbon-icons-svelte/lib/ThumbsUpFilled.svelte';
import {get} from 'svelte/store';
import Expandable from '../Expandable.svelte';
import {hoverTooltip} from '../common/HoverTooltip';
import ConceptExampleList from './ConceptExampleList.svelte';
import ConceptMetrics from './ConceptMetrics.svelte';
import ConceptPreview from './ConceptPreview.svelte';
import DatasetFieldEmbeddingSelector from './DatasetFieldEmbeddingSelector.svelte';
import ConceptLabeler from './labeler/ConceptLabeler.svelte';
export let concept: Concept;
Expand All @@ -30,7 +24,6 @@
$: userId = $authInfo.data?.user?.id;
const concepts = queryConcepts();
const navState = getNavigationContext();
$: conceptInfo = $concepts.data?.find(
c => c.namespace === concept.namespace && c.name === concept.concept_name
Expand All @@ -45,25 +38,6 @@
$: randomPositive = positiveExamples[Math.floor(Math.random() * positiveExamples.length)];
// Apply to a dataset.
let applyDataset: {namespace: string; name: string} | undefined | null = undefined;
let applyPath: string[] | undefined;
let applyEmbedding: string | undefined = undefined;
function openDataset() {
if (applyPath == null || applyEmbedding == null || applyDataset == null) {
return;
}
const store = createDatasetViewStore(applyDataset.namespace, applyDataset.name);
store.addSearch({
path: applyPath,
type: 'concept',
concept_namespace: concept.namespace,
concept_name: concept.concept_name,
embedding: applyEmbedding
});
goto(datasetLink(applyDataset.namespace, applyDataset.name, $navState, get(store)));
}
function remove(id: string) {
if (!concept.namespace || !concept.concept_name) return;
$conceptMutation.mutate([concept.namespace, concept.concept_name, {remove: [id]}]);
Expand Down Expand Up @@ -115,33 +89,6 @@
</div>
</Expandable>

<Expandable>
<div slot="above" class="text-md font-semibold">Apply to a dataset</div>
<div slot="below">
<DatasetFieldEmbeddingSelector
bind:dataset={applyDataset}
bind:path={applyPath}
bind:embedding={applyEmbedding}
/>
{#if applyDataset != null && applyPath != null && applyEmbedding != null}
<div class="mt-4">
<Button iconDescription={'Open dataset and apply concept.'} on:click={() => openDataset()}
>Search by concept
</Button>
</div>
{:else}
<ToastNotification
hideCloseButton
kind="warning"
fullWidth
lowContrast
title="Choose a dataset with a computed embedding"
caption={'Dataset has no fields with computed embeddings. ' +
'Please compute an embedding index before you can search by concept.'}
/>
{/if}
</div>
</Expandable>
<Expandable>
<div slot="above" class="text-md font-semibold">Collect labels</div>
<div slot="below" class="w-full">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
};
$: {
pathToSpans = {};
spanPaths.forEach(sp => {
(spanPaths || []).forEach(sp => {
if (row == null) return;
let valueNodes = getValueNodes(row, sp);
const isSpanNestedUnder = pathMatchesPrefix(sp, path);
Expand All @@ -97,7 +97,7 @@
let spanPathToValueInfos: Record<string, SpanValueInfo[]> = {};
$: {
spanPathToValueInfos = {};
for (const spanValueInfo of spanValueInfos) {
for (const spanValueInfo of spanValueInfos || []) {
const spanPathStr = serializePath(spanValueInfo.spanPath);
if (spanPathToValueInfos[spanPathStr] == null) {
spanPathToValueInfos[spanPathStr] = [];
Expand Down Expand Up @@ -206,7 +206,7 @@
$: {
if (model != null && editor != null) {
let minPosition: Monaco.Position | null = null;
for (const renderSpan of monacoSpans) {
for (const renderSpan of monacoSpans || []) {
const span = L.span(renderSpan.span)!;
const position = model.getPositionAt(span.start);
Expand Down Expand Up @@ -381,7 +381,7 @@
const conceptQuery = queryConcepts();
$: concepts = $conceptQuery.data;
let conceptsInMenu: Set<string>;
let conceptsInMenu: Set<string> = new Set();
let addToConceptItems: DropdownItem[] = [];
$: {
Expand Down

0 comments on commit 1b9ddc8

Please sign in to comment.