Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mean pooling #438

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions fastembed/common/pooling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
from numpy import ufunc


class LateInteractionPooler(object):
    """Reduce multi-vector embeddings along one axis with mean, max or min.

    ``agg_type`` selects the axis that is collapsed:

    * ``"row"`` reduces the last axis (``axis=-1``);
    * ``"col"`` reduces the second-to-last axis (``axis=-2``).

    ``agg`` selects the NumPy reduction: ``"mean"``, ``"max"`` or ``"min"``.

    Neither setting is validated at construction time; an unsupported value
    is reported when pooling is attempted.
    """

    # Supported aggregation names mapped to their NumPy reductions.
    _OPERATIONS = {"mean": np.mean, "max": np.max, "min": np.min}

    def __init__(self, agg_type="row", agg="mean"):
        self.agg = agg
        self.type = agg_type

    def _pick_operation(self):
        """Return the NumPy reduction function selected by ``self.agg``.

        The returned objects are ordinary NumPy functions accepting an
        ``axis`` keyword (they are not ``numpy.ufunc`` instances).

        Raises:
            NotImplementedError: if ``self.agg`` is not mean, min or max.
        """
        try:
            return self._OPERATIONS[self.agg]
        except (KeyError, TypeError):
            # TypeError covers unhashable values so that every unsupported
            # ``agg`` is reported with the same exception type.
            raise NotImplementedError(
                f"LateInteractionPooler only supports agg=mean,min,max, provided {self.agg}"
            ) from None

    def _reduce(self, embeddings_batch, axis):
        """Apply the selected reduction to ``embeddings_batch`` along ``axis``.

        An ndarray is reduced directly.  Any other iterable (list, tuple,
        generator) is materialised first: when all items share one shape they
        are reduced together as a single array; otherwise each item is
        reduced on its own, because NumPy cannot build a regular array from
        items of differing shapes.
        """
        operation = self._pick_operation()
        if isinstance(embeddings_batch, np.ndarray):
            return operation(embeddings_batch, axis=axis)

        items = [np.asarray(item) for item in embeddings_batch]
        if len({item.shape for item in items}) <= 1:
            return operation(np.asarray(items), axis=axis)

        # Ragged batch: reduce item by item.
        pooled = [operation(item, axis=axis) for item in items]
        if len({np.shape(value) for value in pooled}) == 1:
            # Per-item results line up again, so hand back a regular array.
            return np.stack(pooled)
        return pooled

    def pool(self, embeddings_batch) -> "np.ndarray | list[np.ndarray]":
        """Pool ``embeddings_batch`` according to ``self.type``.

        A single 2-D ndarray is treated as a batch of one, so the result then
        carries a leading axis of length 1.

        Args:
            embeddings_batch: an ndarray, or an iterable of array-likes.

        Returns:
            An ndarray of pooled values.  When the items have differing
            shapes and their pooled results cannot be stacked, a list with
            one ndarray per item is returned instead.

        Raises:
            ValueError: if ``self.type`` is neither ``"row"`` nor ``"col"``.
            NotImplementedError: if ``self.agg`` is not mean, min or max.
        """
        if isinstance(embeddings_batch, np.ndarray) and embeddings_batch.ndim == 2:
            embeddings_batch = [embeddings_batch]

        if self.type == "row":
            return self.pool_row(embeddings_batch)
        if self.type == "col":
            return self.pool_col(embeddings_batch)
        raise ValueError("type must be 'row' or 'col'")

    def pool_row(self, embeddings_batch) -> "np.ndarray | list[np.ndarray]":
        """Reduce ``embeddings_batch`` along its last axis (``axis=-1``)."""
        return self._reduce(embeddings_batch, axis=-1)

    def pool_col(self, embeddings_batch) -> "np.ndarray | list[np.ndarray]":
        """Reduce ``embeddings_batch`` along its second-to-last axis (``axis=-2``)."""
        return self._reduce(embeddings_batch, axis=-2)
34 changes: 34 additions & 0 deletions tests/test_pooling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os

import numpy as np

from fastembed.late_interaction.late_interaction_text_embedding import (
LateInteractionTextEmbedding,
)
from fastembed.common.pooling import LateInteractionPooler

from tests.utils import delete_model_cache

# Expected pooled values keyed by model name. test_batch_embedding compares the
# first element of the pooled result against these with atol=2e-3.
# NOTE(review): the name says COLUMN, but the test constructs
# LateInteractionPooler() with no arguments — confirm which pooling axis these
# reference values were produced with.
CANONICAL_COLUMN_VALUES = {
"colbert-ir/colbertv2.0": np.array(
[4.0727495e-03, -2.4026826e-03, -6.8204990e-04, -7.1383954e-05, 4.4963313e-03]
),
}

# Base documents for the test; test_batch_embedding repeats this list 10 times.
docs = ["Hello World"]


def test_batch_embedding():
    """Embed a repeated document in batches, pool it, and check reference values.

    For every model in ``CANONICAL_COLUMN_VALUES`` the documents are embedded
    with ``batch_size=6``, pooled with a default ``LateInteractionPooler``,
    and the first pooled entry is compared to the stored values
    (``atol=2e-3``).
    """
    is_ci = os.getenv("CI")
    docs_to_embed = docs * 10

    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
        print("evaluating", model_name)
        model = LateInteractionTextEmbedding(model_name=model_name)
        try:
            pooler = LateInteractionPooler()
            result = list(model.embed(docs_to_embed, batch_size=6))
            pooled_result = pooler.pool(result)
            assert np.allclose(pooled_result[0], expected_result, atol=2e-3)
        finally:
            # Clean up the model cache on CI even when embedding or the
            # assertion fails, so a failing run does not leave it behind.
            if is_ci:
                delete_model_cache(model.model._model_dir)
Loading