From 3ed238212be7575844c94504b31063accec482fa Mon Sep 17 00:00:00 2001
From: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
Date: Fri, 28 Feb 2025 18:03:05 +0900
Subject: [PATCH 1/4] add embed-score command to calculate the similarity
 score between two inputs

---
 llm/cli.py                |  97 ++++++++++++++++++++++++++++++++++
 tests/test_embed_score.py | 107 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 tests/test_embed_score.py

diff --git a/llm/cli.py b/llm/cli.py
index 73d508a3..aa4edc6f 100644
--- a/llm/cli.py
+++ b/llm/cli.py
@@ -16,6 +16,7 @@
     Template,
     UnknownModelError,
     KeyModel,
+    cosine_similarity,
     encode,
     get_async_model,
     get_default_model,
@@ -1882,6 +1883,102 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
         collection_obj.embed_multi(tuples(), **embed_kwargs)
 
 
+@cli.command(name="embed-score")
+@click.option(
+    "-i1",
+    "--input1",
+    type=click.Path(exists=True, readable=True, allow_dash=True),
+    help="First file to embed",
+)
+@click.option(
+    "-i2",
+    "--input2",
+    type=click.Path(exists=True, readable=True, allow_dash=True),
+    help="Second file to embed",
+)
+@click.option("-c1", "--content1", help="First content to embed")
+@click.option("-c2", "--content2", help="Second content to embed")
+@click.option("--binary", is_flag=True, help="Treat input as binary data")
+@click.option("-m", "--model", help="Embedding model to use")
+@click.option(
+    "format_",
+    "-f",
+    "--format",
+    type=click.Choice(["json", "text"]),
+    default="text",
+    help="Output format",
+)
+def embed_score(input1, input2, content1, content2, binary, model, format_):
+    """
+    Calculate similarity score between two embeddings without storing them.
+
+    Example usage:
+
+    \b
+        llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
+        llm embed-score -i1 file1.txt -i2 file2.txt
+        llm embed-score -i1 image1.jpg -i2 image2.jpg --binary
+    """
+    # Use -m/--model if given, otherwise fall back to the default embedding model
+    if model is None:
+        model = get_default_embedding_model()
+        if model is None:
+            raise click.ClickException(
+                "You need to specify an embedding model (no default model is set)"
+            )
+    try:
+        model_obj = get_embedding_model(model)
+    except UnknownModelError:
+        raise click.ClickException(f"Unknown embedding model: {model}")
+
+    # First input: inline -c1 text takes precedence over the -i1 path (or "-")
+    content_1 = None
+    if content1 is not None:
+        content_1 = content1
+    elif input1:
+        if input1 == "-":
+            # "-" means stdin: raw bytes via .buffer with --binary, else text
+            input_source = sys.stdin.buffer if binary else sys.stdin
+            content_1 = input_source.read()
+        else:
+            mode = "rb" if binary else "r"
+            with open(input1, mode) as f:
+                content_1 = f.read()
+
+    if content_1 is None:
+        raise click.ClickException("No content provided for first input")
+
+    # Second input: inline -c2 text takes precedence over the -i2 path (or "-")
+    content_2 = None
+    if content2 is not None:
+        content_2 = content2
+    elif input2:
+        if input2 == "-":
+            # "-" means stdin; NOTE(review): likely empty if -i1 - already read it
+            input_source = sys.stdin.buffer if binary else sys.stdin
+            content_2 = input_source.read()
+        else:
+            mode = "rb" if binary else "r"
+            with open(input2, mode) as f:
+                content_2 = f.read()
+
+    if content_2 is None:
+        raise click.ClickException("No content provided for second input")
+
+    # Embed each input separately with the same model object
+    embedding_1 = model_obj.embed(content_1)
+    embedding_2 = model_obj.embed(content_2)
+
+    # Compare the two embedding vectors with cosine_similarity
+    score = cosine_similarity(embedding_1, embedding_2)
+
+    # Output the score in the requested format
+    if format_ == "json":
+        click.echo(json.dumps({"score": score, "content1": embedding_1, "content2": embedding_2}))
+    else:
+        click.echo(f"{score}")
+
+
 @cli.command()
 @click.argument("collection")
 @click.argument("id", required=False)
diff --git a/tests/test_embed_score.py b/tests/test_embed_score.py
new file mode 100644
index 00000000..b5a4525c
--- /dev/null
+++ b/tests/test_embed_score.py
@@ -0,0 +1,107 @@
+import json
+import pytest
+from click.testing import CliRunner
+from llm.cli import cli
+
+
+def test_embed_score_with_content():
+    """embed-score with -c1/-c2 reports the score as plain text and as JSON"""
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        [
+            "embed-score",
+            "-c1",
+            "This is text one",
+            "-c2",
+            "This is text seven",
+            "-m",
+            "embed-demo",
+        ],
+    )
+    assert result.exit_code == 0
+    assert float(result.output.strip()) == pytest.approx(0.9734171683335759)
+
+    # Same inputs with -f json: output also includes both embedding vectors
+    result = runner.invoke(
+        cli,
+        [
+            "embed-score",
+            "-c1",
+            "This is text one",
+            "-c2",
+            "This is text seven",
+            "-f",
+            "json",
+            "-m",
+            "embed-demo",
+        ],
+    )
+    assert result.exit_code == 0
+    assert json.loads(result.output.strip()) == {
+        "score": pytest.approx(0.9734171683335759),
+        "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    }
+
+
+def test_embed_score_with_files(tmp_path):
+    """embed-score reads both inputs from text files given via -i1/-i2"""
+    # Same two strings as the -c1/-c2 test, so the expected score matches
+    file1 = tmp_path / "file1.txt"
+    file2 = tmp_path / "file2.txt"
+    file1.write_text("This is text one")
+    file2.write_text("This is text seven")
+
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        ["embed-score", "-i1", str(file1), "-i2", str(file2), "-m", "embed-demo"],
+    )
+    assert result.exit_code == 0
+    assert float(result.output.strip()) == pytest.approx(0.9734171683335759)
+
+
+def test_embed_score_binary_input(tmp_path):
+    """embed-score --binary on two small byte files exits 0 and scores 1.0"""
+    # Two three-byte files with different, non-printable contents
+    file1 = tmp_path / "file1.bin"
+    file2 = tmp_path / "file2.bin"
+    file1.write_bytes(b"\x00\x01\x02")
+    file2.write_bytes(b"\x03\x04\x05")
+
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        [
+            "embed-score",
+            "-i1",
+            str(file1),
+            "-i2",
+            str(file2),
+            "-m",
+            "embed-demo",
+            "--binary",
+        ],
+    )
+    assert result.exit_code == 0
+    assert float(result.output.strip()) == pytest.approx(1.0)
+
+
+def test_embed_score_missing_inputs():
+    """embed-score fails with a clear message when either input is absent"""
+    runner = CliRunner()
+
+    # Only -c2 supplied: the first input is reported as missing
+    result = runner.invoke(
+        cli, ["embed-score", "-c2", "This is text two", "-m", "embed-demo"]
+    )
+    assert result.exit_code != 0
+    assert "No content provided for first input" in result.output
+
+    # Only -c1 supplied: the second input is reported as missing
+    result = runner.invoke(
+        cli, ["embed-score", "-c1", "This is text one", "-m", "embed-demo"]
+    )
+    assert result.exit_code != 0
+    assert "No content provided for second input" in result.output

From d8b5c46b2a1385a8646bc90ff888db184b4c6fee Mon Sep 17 00:00:00 2001
From: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
Date: Fri, 28 Feb 2025 18:08:46 +0900
Subject: [PATCH 2/4] update help message

---
 llm/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/cli.py b/llm/cli.py
index aa4edc6f..2fe6c6a1 100644
--- a/llm/cli.py
+++ b/llm/cli.py
@@ -1910,7 +1910,7 @@ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
 )
 def embed_score(input1, input2, content1, content2, binary, model, format_):
     """
-    Calculate similarity score between two embeddings without storing them.
+    Calculate similarity score between two embeddings without storing them to a collection.
 
     Example usage:
 

From 144e7d1524b77b196c9969606d78080b0df6acb7 Mon Sep 17 00:00:00 2001
From: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
Date: Fri, 28 Feb 2025 19:33:50 +0900
Subject: [PATCH 3/4] update docs/help.md

---
 docs/help.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/docs/help.md b/docs/help.md
index f266961b..c12104a4 100644
--- a/docs/help.md
+++ b/docs/help.md
@@ -71,6 +71,7 @@ Commands:
   embed         Embed text and store or return the result
   embed-models  Manage available embedding models
   embed-multi   Store embeddings for multiple strings at once
+  embed-score   Calculate similarity score between two embeddings without...
   install       Install packages from PyPI into the same environment as LLM
   keys          Manage stored API keys for different models
   logs          Tools for exploring logged prompts and responses
@@ -591,6 +592,31 @@ Options:
   --help                       Show this message and exit.
 ```
 
+(help-embed-score)=
+### llm embed-score --help
+```
+Usage: llm embed-score [OPTIONS]
+
+  Calculate similarity score between two embeddings without storing them to a
+  collection.
+
+  Example usage:
+
+      llm embed-score -c1 "I like pelicans" -c2 "I love pelicans"
+      llm embed-score -i1 file1.txt -i2 file2.txt
+      llm embed-score -i1 image1.jpg -i2 image2.jpg --binary
+
+Options:
+  -i1, --input1 PATH        First file to embed
+  -i2, --input2 PATH        Second file to embed
+  -c1, --content1 TEXT      First content to embed
+  -c2, --content2 TEXT      Second content to embed
+  --binary                  Treat input as binary data
+  -m, --model TEXT          Embedding model to use
+  -f, --format [json|text]  Output format
+  --help                    Show this message and exit.
+```
+
 (help-similar)=
 ### llm similar --help
 ```

From 99c270fdfb8f1c441e90788b764e243fc6c3a78f Mon Sep 17 00:00:00 2001
From: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
Date: Fri, 28 Feb 2025 19:45:31 +0900
Subject: [PATCH 4/4] run black

---
 llm/cli.py                | 6 +++++-
 tests/test_embed_score.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llm/cli.py b/llm/cli.py
index 2fe6c6a1..a7534fdd 100644
--- a/llm/cli.py
+++ b/llm/cli.py
@@ -1974,7 +1974,11 @@ def embed_score(input1, input2, content1, content2, binary, model, format_):
 
     # Output the score in the requested format
     if format_ == "json":
-        click.echo(json.dumps({"score": score, "content1": embedding_1, "content2": embedding_2}))
+        click.echo(
+            json.dumps(
+                {"score": score, "content1": embedding_1, "content2": embedding_2}
+            )
+        )
     else:
         click.echo(f"{score}")
 
diff --git a/tests/test_embed_score.py b/tests/test_embed_score.py
index b5a4525c..35de6c51 100644
--- a/tests/test_embed_score.py
+++ b/tests/test_embed_score.py
@@ -41,7 +41,7 @@ def test_embed_score_with_content():
     assert json.loads(result.output.strip()) == {
         "score": pytest.approx(0.9734171683335759),
         "content1": [4, 2, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-        "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        "content2": [4, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     }