Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature to calculate similarity score between two embeddings without storing them in a collection #804

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ Commands:
embed Embed text and store or return the result
embed-models Manage available embedding models
embed-multi Store embeddings for multiple strings at once
embed-score Calculate similarity score between two embeddings without...
install Install packages from PyPI into the same environment as LLM
keys Manage stored API keys for different models
logs Tools for exploring logged prompts and responses
Expand Down Expand Up @@ -591,6 +592,31 @@ Options:
--help Show this message and exit.
```

(help-embed-score)=
### llm embed-score --help
```
Usage: llm embed-score [OPTIONS]

Calculate similarity score between two embeddings without storing them to a
collection.

Example usage:

llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
llm embed-score -i1 file1.txt -i2 file2.txt
llm embed-score -i1 image1.jpg -i2 image2.jpg --binary

Options:
-i1, --input1 PATH First file to embed
-i2, --input2 PATH Second file to embed
-c1, --content1 TEXT First content to embed
-c2, --content2 TEXT Second content to embed
--binary Treat input as binary data
-m, --model TEXT Embedding model to use
-f, --format [json|text] Output format
--help Show this message and exit.
```

(help-similar)=
### llm similar --help
```
Expand Down
101 changes: 101 additions & 0 deletions llm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Template,
UnknownModelError,
KeyModel,
cosine_similarity,
encode,
get_async_model,
get_default_model,
Expand Down Expand Up @@ -1882,6 +1883,106 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
collection_obj.embed_multi(tuples(), **embed_kwargs)


@cli.command(name="embed-score")
@click.option(
    "-i1",
    "--input1",
    type=click.Path(exists=True, readable=True, allow_dash=True),
    help="First file to embed",
)
@click.option(
    "-i2",
    "--input2",
    type=click.Path(exists=True, readable=True, allow_dash=True),
    help="Second file to embed",
)
@click.option("-c1", "--content1", help="First content to embed")
@click.option("-c2", "--content2", help="Second content to embed")
@click.option("--binary", is_flag=True, help="Treat input as binary data")
@click.option("-m", "--model", help="Embedding model to use")
@click.option(
    "format_",
    "-f",
    "--format",
    type=click.Choice(["json", "text"]),
    default="text",
    help="Output format",
)
def embed_score(input1, input2, content1, content2, binary, model, format_):
    """
    Calculate similarity score between two embeddings without storing them to a collection.

    Example usage:

    \b
    llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
    llm embed-score -i1 file1.txt -i2 file2.txt
    llm embed-score -i1 image1.jpg -i2 image2.jpg --binary
    """
    # Resolve the embedding model: explicit -m wins, else the configured default.
    if model is None:
        model = get_default_embedding_model()
    if model is None:
        raise click.ClickException(
            "You need to specify an embedding model (no default model is set)"
        )
    try:
        model_obj = get_embedding_model(model)
    except UnknownModelError:
        raise click.ClickException(f"Unknown embedding model: {model}")

    # stdin can only be consumed once: if both inputs were "-" the second read
    # would silently return empty content and yield a meaningless score.
    if input1 == "-" and input2 == "-":
        raise click.ClickException(
            "Only one of the two inputs can be read from standard input"
        )

    def resolve(content, input_path, label):
        # Inline -cN content takes precedence; otherwise read the -iN
        # file (or stdin for "-"), in binary mode when --binary is set.
        if content is not None:
            return content
        if input_path:
            if input_path == "-":
                source = sys.stdin.buffer if binary else sys.stdin
                return source.read()
            mode = "rb" if binary else "r"
            with open(input_path, mode) as f:
                return f.read()
        raise click.ClickException(f"No content provided for {label} input")

    content_1 = resolve(content1, input1, "first")
    content_2 = resolve(content2, input2, "second")

    # Embed both inputs with the same model so the vectors are comparable.
    embedding_1 = model_obj.embed(content_1)
    embedding_2 = model_obj.embed(content_2)

    # Cosine similarity of the two embedding vectors.
    score = cosine_similarity(embedding_1, embedding_2)

    # JSON output also exposes the raw embedding vectors; text output is just
    # the score on its own line.
    if format_ == "json":
        click.echo(
            json.dumps(
                {"score": score, "content1": embedding_1, "content2": embedding_2}
            )
        )
    else:
        click.echo(f"{score}")


@cli.command()
@click.argument("collection")
@click.argument("id", required=False)
Expand Down
107 changes: 107 additions & 0 deletions tests/test_embed_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import json
import pytest
from click.testing import CliRunner
from llm.cli import cli


def test_embed_score_with_content():
    """embed-score with -c1/-c2 prints the score; -f json adds the raw vectors."""
    runner = CliRunner()
    base_args = [
        "embed-score",
        "-c1",
        "This is text one",
        "-c2",
        "This is text seven",
        "-m",
        "embed-demo",
    ]

    # Default (text) format: stdout is just the cosine similarity score.
    text_result = runner.invoke(cli, base_args)
    assert text_result.exit_code == 0
    assert float(text_result.output.strip()) == pytest.approx(0.9734171683335759)

    # JSON format: score plus both embedding vectors.
    json_result = runner.invoke(cli, base_args + ["-f", "json"])
    assert json_result.exit_code == 0
    payload = json.loads(json_result.output.strip())
    assert payload == {
        "score": pytest.approx(0.9734171683335759),
        "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    }


def test_embed_score_with_files(tmp_path):
    """embed-score reads both inputs from files given via -i1/-i2."""
    # Two text fixtures with known embed-demo similarity.
    first = tmp_path / "file1.txt"
    second = tmp_path / "file2.txt"
    first.write_text("This is text one")
    second.write_text("This is text seven")

    result = CliRunner().invoke(
        cli,
        ["embed-score", "-i1", str(first), "-i2", str(second), "-m", "embed-demo"],
    )
    assert result.exit_code == 0
    assert float(result.output.strip()) == pytest.approx(0.9734171683335759)


def test_embed_score_binary_input(tmp_path):
    """embed-score with --binary embeds raw bytes read from the input files."""
    # Two small binary fixtures; embed-demo scores them as identical vectors.
    first = tmp_path / "file1.bin"
    second = tmp_path / "file2.bin"
    first.write_bytes(b"\x00\x01\x02")
    second.write_bytes(b"\x03\x04\x05")

    args = [
        "embed-score",
        "-i1",
        str(first),
        "-i2",
        str(second),
        "-m",
        "embed-demo",
        "--binary",
    ]
    result = CliRunner().invoke(cli, args)
    assert result.exit_code == 0
    assert float(result.output.strip()) == pytest.approx(1.0)


def test_embed_score_missing_inputs():
    """embed-score fails with a clear message when either input is absent."""
    runner = CliRunner()

    # Only the second input supplied -> complains about the first.
    only_second = runner.invoke(
        cli, ["embed-score", "-c2", "This is text two", "-m", "embed-demo"]
    )
    assert only_second.exit_code != 0
    assert "No content provided for first input" in only_second.output

    # Only the first input supplied -> complains about the second.
    only_first = runner.invoke(
        cli, ["embed-score", "-c1", "This is text one", "-m", "embed-demo"]
    )
    assert only_first.exit_code != 0
    assert "No content provided for second input" in only_first.output
Loading