From 3ed238212be7575844c94504b31063accec482fa Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Fri, 28 Feb 2025 18:03:05 +0900 Subject: [PATCH 1/4] add command to calculate similarity score between two contents --- llm/cli.py | 97 ++++++++++++++++++++++++++++++++++ tests/test_embed_score.py | 107 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 tests/test_embed_score.py diff --git a/llm/cli.py b/llm/cli.py index 73d508a3..aa4edc6f 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -16,6 +16,7 @@ Template, UnknownModelError, KeyModel, + cosine_similarity, encode, get_async_model, get_default_model, @@ -1882,6 +1883,102 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]: collection_obj.embed_multi(tuples(), **embed_kwargs) +@cli.command(name="embed-score") +@click.option( + "-i1", + "--input1", + type=click.Path(exists=True, readable=True, allow_dash=True), + help="First file to embed", +) +@click.option( + "-i2", + "--input2", + type=click.Path(exists=True, readable=True, allow_dash=True), + help="Second file to embed", +) +@click.option("-c1", "--content1", help="First content to embed") +@click.option("-c2", "--content2", help="Second content to embed") +@click.option("--binary", is_flag=True, help="Treat input as binary data") +@click.option("-m", "--model", help="Embedding model to use") +@click.option( + "format_", + "-f", + "--format", + type=click.Choice(["json", "text"]), + default="text", + help="Output format", +) +def embed_score(input1, input2, content1, content2, binary, model, format_): + """ + Calculate similarity score between two embeddings without storing them. + + Example usage: + + \b + llm embed-score -c1 "I like pelicans" -c2 "I love pelicans" + llm embed-score -i1 file1.txt -i2 file2.txt + llm embed-score -i1 image1.jpg -i2 image2.jpg --binary + """ + # Resolve the embedding model + if model is None: + model = get_default_embedding_model() + if model is None: + raise click.ClickException( + "You need to specify an embedding model (no default model is set)" + ) + try: + model_obj = get_embedding_model(model) + except UnknownModelError: + raise click.ClickException(f"Unknown embedding model: {model}") + + # Resolve first input + content_1 = None + if content1 is not None: + content_1 = content1 + elif input1: + if input1 == "-": + # Read from stdin + input_source = sys.stdin.buffer if binary else sys.stdin + content_1 = input_source.read() + else: + mode = "rb" if binary else "r" + with open(input1, mode) as f: + content_1 = f.read() + + if content_1 is None: + raise click.ClickException("No content provided for first input") + + # Resolve second input + content_2 = None + if content2 is not None: + content_2 = content2 + elif input2: + if input2 == "-": + # Read from stdin + input_source = sys.stdin.buffer if binary else sys.stdin + content_2 = input_source.read() + else: + mode = "rb" if binary else "r" + with open(input2, mode) as f: + content_2 = f.read() + + if content_2 is None: + raise click.ClickException("No content provided for second input") + + # Embed both inputs + embedding_1 = model_obj.embed(content_1) + embedding_2 = model_obj.embed(content_2) + + # Calculate similarity score + score = cosine_similarity(embedding_1, embedding_2) + + # Output the score in the requested format + if format_ == "json": + click.echo(json.dumps({"score": score, "content1": embedding_1, "content2": embedding_2})) + else: + click.echo(f"{score}") + + @cli.command() @click.argument("collection") @click.argument("id", required=False) diff --git a/tests/test_embed_score.py b/tests/test_embed_score.py new file mode 100644 index 00000000..b5a4525c --- /dev/null +++ b/tests/test_embed_score.py @@ -0,0 +1,107 @@ +import json +import pytest +from click.testing import CliRunner +from llm.cli import cli + + +def test_embed_score_with_content(): + """Test the embed-score command with content parameters""" + runner = CliRunner() + result = runner.invoke( + cli, + [ + "embed-score", + "-c1", + "This is text one", + "-c2", + "This is text seven", + "-m", + "embed-demo", + ], + ) + assert result.exit_code == 0 + assert float(result.output.strip()) == pytest.approx(0.9734171683335759) + + # Test with JSON output format + result = runner.invoke( + cli, + [ + "embed-score", + "-c1", + "This is text one", + "-c2", + "This is text seven", + "-f", + "json", + "-m", + "embed-demo", + ], + ) + assert result.exit_code == 0 + assert json.loads(result.output.strip()) == { + "score": pytest.approx(0.9734171683335759), + "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + } + + +def test_embed_score_with_files(tmp_path): + """Test the embed-score command with file inputs""" + # Create temporary test files + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + file1.write_text("This is text one") + file2.write_text("This is text seven") + + runner = CliRunner() + result = runner.invoke( + cli, + ["embed-score", "-i1", str(file1), "-i2", str(file2), "-m", "embed-demo"], + ) + assert result.exit_code == 0 + assert float(result.output.strip()) == pytest.approx(0.9734171683335759) + + +def test_embed_score_binary_input(tmp_path): + """Test the embed-score command with binary inputs""" + # Create temporary binary files + file1 = tmp_path / "file1.bin" + file2 = tmp_path / "file2.bin" + file1.write_bytes(b"\x00\x01\x02") + file2.write_bytes(b"\x03\x04\x05") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "embed-score", + "-i1", + str(file1), + "-i2", + str(file2), + "-m", + "embed-demo", + "--binary", + ], + ) + assert result.exit_code == 0 + assert float(result.output.strip()) == pytest.approx(1.0) + + +def test_embed_score_missing_inputs(): + """Test the embed-score command with missing inputs""" + runner = CliRunner() + + # Missing first input + result = runner.invoke( + cli, ["embed-score", "-c2", "This is text two", "-m", "embed-demo"] + ) + assert result.exit_code != 0 + assert "No content provided for first input" in result.output + + # Missing second input + result = runner.invoke( + cli, ["embed-score", "-c1", "This is text one", "-m", "embed-demo"] + ) + assert result.exit_code != 0 + assert "No content provided for second input" in result.output From d8b5c46b2a1385a8646bc90ff888db184b4c6fee Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Fri, 28 Feb 2025 18:08:46 +0900 Subject: [PATCH 2/4] update help message --- llm/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/cli.py b/llm/cli.py index aa4edc6f..2fe6c6a1 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1910,7 +1910,7 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]: ) def embed_score(input1, input2, content1, content2, binary, model, format_): """ - Calculate similarity score between two embeddings without storing them. + Calculate similarity score between two embeddings without storing them to a collection. Example usage: From 144e7d1524b77b196c9969606d78080b0df6acb7 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Fri, 28 Feb 2025 19:33:50 +0900 Subject: [PATCH 3/4] update docs/help.md --- docs/help.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/help.md b/docs/help.md index f266961b..c12104a4 100644 --- a/docs/help.md +++ b/docs/help.md @@ -71,6 +71,7 @@ Commands: embed Embed text and store or return the result embed-models Manage available embedding models embed-multi Store embeddings for multiple strings at once + embed-score Calculate similarity score between two embeddings without... install Install packages from PyPI into the same environment as LLM keys Manage stored API keys for different models logs Tools for exploring logged prompts and responses @@ -591,6 +592,31 @@ Options: --help Show this message and exit. ``` +(help-embed-score)= +### llm embed-score --help +``` +Usage: llm embed-score [OPTIONS] + + Calculate similarity score between two embeddings without storing them to a + collection. + + Example usage: + + llm embed-score -c1 "I like pelicans" -c2 "I love pelicans" + llm embed-score -i1 file1.txt -i2 file2.txt + llm embed-score -i1 image1.jpg -i2 image2.jpg --binary + +Options: + -i1, --input1 PATH First file to embed + -i2, --input2 PATH Second file to embed + -c1, --content1 TEXT First content to embed + -c2, --content2 TEXT Second content to embed + --binary Treat input as binary data + -m, --model TEXT Embedding model to use + -f, --format [json|text] Output format + --help Show this message and exit. +``` + (help-similar)= ### llm similar --help ``` From 99c270fdfb8f1c441e90788b764e243fc6c3a78f Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Fri, 28 Feb 2025 19:45:31 +0900 Subject: [PATCH 4/4] run black --- llm/cli.py | 6 +++++- tests/test_embed_score.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 2fe6c6a1..a7534fdd 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1974,7 +1974,11 @@ def embed_score(input1, input2, content1, content2, binary, model, format_): # Output the score in the requested format if format_ == "json": - click.echo(json.dumps({"score": score, "content1": embedding_1, "content2": embedding_2})) + click.echo( + json.dumps( + {"score": score, "content1": embedding_1, "content2": embedding_2} + ) + ) else: click.echo(f"{score}") diff --git a/tests/test_embed_score.py b/tests/test_embed_score.py index b5a4525c..35de6c51 100644 --- a/tests/test_embed_score.py +++ b/tests/test_embed_score.py @@ -41,7 +41,7 @@ def test_embed_score_with_content(): assert json.loads(result.output.strip()) == { "score": pytest.approx(0.9734171683335759), "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], }