Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature to calculate similarity score between two embeddings without storing them in a collection #804

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ Commands:
embed Embed text and store or return the result
embed-models Manage available embedding models
embed-multi Store embeddings for multiple strings at once
embed-score Calculate similarity score between two embeddings without...
install Install packages from PyPI into the same environment as LLM
keys Manage stored API keys for different models
logs Tools for exploring logged prompts and responses
Expand Down Expand Up @@ -591,6 +592,31 @@ Options:
--help Show this message and exit.
```

(help-embed-score)=
### llm embed-score --help
```
Usage: llm embed-score [OPTIONS]

Calculate similarity score between two embeddings without storing them to a
collection.

Example usage:

llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
llm embed-score -i1 file1.txt -i2 file2.txt
llm embed-score -i1 image1.jpg -i2 image2.jpg --binary

Options:
-i1, --input1 PATH First file to embed
-i2, --input2 PATH Second file to embed
-c1, --content1 TEXT First content to embed
-c2, --content2 TEXT Second content to embed
--binary Treat input as binary data
-m, --model TEXT Embedding model to use
-f, --format [json|text] Output format
--help Show this message and exit.
```

(help-similar)=
### llm similar --help
```
Expand Down
101 changes: 101 additions & 0 deletions llm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Template,
UnknownModelError,
KeyModel,
cosine_similarity,
encode,
get_async_model,
get_default_model,
Expand Down Expand Up @@ -1882,6 +1883,106 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
collection_obj.embed_multi(tuples(), **embed_kwargs)


@cli.command(name="embed-score")
@click.option(
    "-i1",
    "--input1",
    type=click.Path(exists=True, readable=True, allow_dash=True),
    help="First file to embed",
)
@click.option(
    "-i2",
    "--input2",
    type=click.Path(exists=True, readable=True, allow_dash=True),
    help="Second file to embed",
)
@click.option("-c1", "--content1", help="First content to embed")
@click.option("-c2", "--content2", help="Second content to embed")
@click.option("--binary", is_flag=True, help="Treat input as binary data")
@click.option("-m", "--model", help="Embedding model to use")
@click.option(
    "format_",
    "-f",
    "--format",
    type=click.Choice(["json", "text"]),
    default="text",
    help="Output format",
)
def embed_score(input1, input2, content1, content2, binary, model, format_):
    """
    Calculate similarity score between two embeddings without storing them to a collection.

    Example usage:

    \b
    llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
    llm embed-score -i1 file1.txt -i2 file2.txt
    llm embed-score -i1 image1.jpg -i2 image2.jpg --binary
    """
    # Resolve the embedding model: explicit -m wins, else the configured default.
    if model is None:
        model = get_default_embedding_model()
    if model is None:
        raise click.ClickException(
            "You need to specify an embedding model (no default model is set)"
        )
    try:
        model_obj = get_embedding_model(model)
    except UnknownModelError:
        raise click.ClickException(f"Unknown embedding model: {model}")

    # stdin can only be consumed once: if both inputs were "-" the second read
    # would silently return empty content and yield a meaningless score.
    if input1 == "-" and input2 == "-":
        raise click.ClickException(
            "Only one of the two inputs can be read from standard input"
        )

    def resolve(content, input_path, label):
        # Inline -cN content takes precedence; otherwise read the -iN
        # file (or stdin for "-"), in binary mode when --binary is set.
        if content is not None:
            return content
        if input_path:
            if input_path == "-":
                source = sys.stdin.buffer if binary else sys.stdin
                return source.read()
            mode = "rb" if binary else "r"
            with open(input_path, mode) as f:
                return f.read()
        raise click.ClickException(f"No content provided for {label} input")

    content_1 = resolve(content1, input1, "first")
    content_2 = resolve(content2, input2, "second")

    # Embed both inputs with the same model so the vectors are comparable.
    embedding_1 = model_obj.embed(content_1)
    embedding_2 = model_obj.embed(content_2)

    # Cosine similarity of the two embedding vectors.
    score = cosine_similarity(embedding_1, embedding_2)

    # JSON output also exposes the raw embedding vectors; text output is just
    # the score on its own line.
    if format_ == "json":
        click.echo(
            json.dumps(
                {"score": score, "content1": embedding_1, "content2": embedding_2}
            )
        )
    else:
        click.echo(f"{score}")


@cli.command()
@click.argument("collection")
@click.argument("id", required=False)
Expand Down
107 changes: 107 additions & 0 deletions tests/test_embed_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import json
import pytest
from click.testing import CliRunner
from llm.cli import cli


def test_embed_score_with_content():
    """embed-score with -c1/-c2 prints the score; -f json adds the raw vectors."""
    runner = CliRunner()
    base_args = [
        "embed-score",
        "-c1",
        "This is text one",
        "-c2",
        "This is text seven",
        "-m",
        "embed-demo",
    ]

    # Default (text) format: stdout is just the cosine similarity score.
    text_result = runner.invoke(cli, base_args)
    assert text_result.exit_code == 0
    assert float(text_result.output.strip()) == pytest.approx(0.9734171683335759)

    # JSON format: score plus both embedding vectors.
    json_result = runner.invoke(cli, base_args + ["-f", "json"])
    assert json_result.exit_code == 0
    payload = json.loads(json_result.output.strip())
    assert payload == {
        "score": pytest.approx(0.9734171683335759),
        "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    }


def test_embed_score_with_files(tmp_path):
    """embed-score reads both inputs from files given via -i1/-i2."""
    # Two text fixtures with known embed-demo similarity.
    first = tmp_path / "file1.txt"
    second = tmp_path / "file2.txt"
    first.write_text("This is text one")
    second.write_text("This is text seven")

    result = CliRunner().invoke(
        cli,
        ["embed-score", "-i1", str(first), "-i2", str(second), "-m", "embed-demo"],
    )
    assert result.exit_code == 0
    assert float(result.output.strip()) == pytest.approx(0.9734171683335759)


def test_embed_score_binary_input(tmp_path):
    """embed-score with --binary embeds raw bytes read from the input files."""
    # Two small binary fixtures; embed-demo scores them as identical vectors.
    first = tmp_path / "file1.bin"
    second = tmp_path / "file2.bin"
    first.write_bytes(b"\x00\x01\x02")
    second.write_bytes(b"\x03\x04\x05")

    args = [
        "embed-score",
        "-i1",
        str(first),
        "-i2",
        str(second),
        "-m",
        "embed-demo",
        "--binary",
    ]
    result = CliRunner().invoke(cli, args)
    assert result.exit_code == 0
    assert float(result.output.strip()) == pytest.approx(1.0)


def test_embed_score_missing_inputs():
    """embed-score fails with a clear message when either input is absent."""
    runner = CliRunner()

    # Only the second input supplied -> complains about the first.
    only_second = runner.invoke(
        cli, ["embed-score", "-c2", "This is text two", "-m", "embed-demo"]
    )
    assert only_second.exit_code != 0
    assert "No content provided for first input" in only_second.output

    # Only the first input supplied -> complains about the second.
    only_first = runner.invoke(
        cli, ["embed-score", "-c1", "This is text one", "-m", "embed-demo"]
    )
    assert only_first.exit_code != 0
    assert "No content provided for second input" in only_first.output
Loading