From c49fbe3a4d5e89677e31a54bba875572adc6c389 Mon Sep 17 00:00:00 2001
From: LastWhisper
Date: Sun, 9 Feb 2025 16:22:33 +0800
Subject: [PATCH] feat: Internal deduplication impl. (#1568)

---
 camel/utils/__init__.py          |   3 +
 camel/utils/deduplication.py     | 199 ++++++++++++++++++++++++++++++
 pyproject.toml                   |   1 +
 test/utils/test_deduplication.py | 202 +++++++++++++++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 camel/utils/deduplication.py
 create mode 100644 test/utils/test_deduplication.py

diff --git a/camel/utils/__init__.py b/camel/utils/__init__.py
index 481471f3c3..c08c62c142 100644
--- a/camel/utils/__init__.py
+++ b/camel/utils/__init__.py
@@ -40,6 +40,7 @@
     track_agent,
 )
 from .constants import Constants
+from .deduplication import DeduplicationResult, deduplicate_internally
 from .response_format import get_pydantic_model
 from .token_counting import (
     AnthropicTokenCounter,
@@ -82,6 +83,8 @@
     "get_pydantic_model",
     "download_github_subdirectory",
     "generate_prompt_for_structured_output",
+    "deduplicate_internally",
+    "DeduplicationResult",
     "retry_on_error",
     "BatchProcessor",
 ]
diff --git a/camel/utils/deduplication.py b/camel/utils/deduplication.py
new file mode 100644
index 0000000000..cbb28884d0
--- /dev/null
+++ b/camel/utils/deduplication.py
@@ -0,0 +1,199 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+
+from typing import Dict, List, Literal, Optional
+
+import numpy as np
+from pydantic import BaseModel
+from sklearn.metrics.pairwise import cosine_similarity
+
+from camel.embeddings.base import BaseEmbedding
+
+
+class DeduplicationResult(BaseModel):
+    """
+    The result of deduplication.
+
+    Attributes:
+        original_texts (List[str]): The original texts.
+        unique_ids (List[int]): A list of ids that are unique (not duplicates).
+        unique_embeddings_dict (Dict[int, List[float]]):
+            A mapping from the index of each unique text to its embedding.
+        duplicate_to_target_map (Dict[int, int]):
+            A mapping from the index of the duplicate text to the index
+            of the text it is considered a duplicate of.
+    """
+
+    original_texts: List[str]
+    unique_ids: List[int]
+    unique_embeddings_dict: Dict[int, List[float]]
+    duplicate_to_target_map: Dict[int, int]
+
+
+def deduplicate_internally(
+    texts: List[str],
+    threshold: float = 0.65,
+    embedding_instance: Optional[BaseEmbedding[str]] = None,
+    embeddings: Optional[List[List[float]]] = None,
+    strategy: Literal["top1", "llm-supervise"] = "top1",
+) -> DeduplicationResult:
+    """
+    Deduplicate a list of strings based on their cosine similarity.
+
+    You can either:
+    1) Provide a Camel `BaseEmbedding` instance via `embedding_instance` to
+       let this function handle the embedding internally, OR
+    2) Directly pass a list of pre-computed embeddings to `embeddings`.
+
+    If both `embedding_instance` and `embeddings` are provided, the function
+    will raise a ValueError to avoid ambiguous usage.
+
+    The `strategy` argument selects the deduplication method: "top1" maps each
+    text to the earlier text with the highest similarity above the threshold,
+    while "llm-supervise" would use an LLM to judge duplicates (not yet
+    implemented).
+
+    Args:
+        texts (List[str]): The list of texts to be deduplicated.
+        threshold (float, optional): The similarity threshold for considering
+            two texts as duplicates. Default is 0.65.
+        embedding_instance (Optional[BaseEmbedding[str]], optional):
+            A Camel embedding instance for automatic embedding. Defaults to
+            None.
+        embeddings (Optional[List[List[float]]], optional):
+            Pre-computed embeddings of `texts`. Each element in the list
+            corresponds to the embedding of the text in the same index of
+            `texts`. Defaults to None.
+        strategy (Literal["top1", "llm-supervise"], optional):
+            The strategy to use for deduplication. Defaults to "top1".
+
+    Returns:
+        DeduplicationResult: An object that contains:
+            - `original_texts`: The original texts.
+            - `unique_ids`: The unique ids after deduplication.
+            - `unique_embeddings_dict`: A dict mapping from (unique) text id
+              to its embedding.
+            - `duplicate_to_target_map`: A dict mapping from the id of a
+              duplicate text to the id of the text it is considered a
+              duplicate of.
+
+    Raises:
+        NotImplementedError: If the strategy is "llm-supervise", which is not
+            yet implemented.
+        ValueError: If neither embeddings nor embedding_instance is provided,
+            or if both are provided at the same time.
+        ValueError: If the length of `embeddings` does not match the length of
+            `texts`.
+
+    Example:
+        >>> from camel.embeddings.openai_embedding import OpenAIEmbedding
+        >>> # Suppose we have 5 texts, some of which may be duplicates
+        >>> texts = [
+        ...     "What is AI?",
+        ...     "Artificial Intelligence is about machines",
+        ...     "What is AI?",
+        ...     "Deep Learning is a subset of AI",
+        ...     "What is artificial intelligence?"
+        ... ]
+        >>> # or any other BaseEmbedding instance
+        >>> embedding_model = OpenAIEmbedding()
+        >>> result = deduplicate_internally(
+        ...     texts=texts,
+        ...     threshold=0.7,
+        ...     embedding_instance=embedding_model
+        ... )
+        >>> print("Unique ids:")
+        >>> for uid in result.unique_ids:
+        ...     print(texts[uid])
+        Unique ids:
+        What is AI?
+        Artificial Intelligence is about machines
+        Deep Learning is a subset of AI
+        What is artificial intelligence?
+
+        >>> print("Duplicate map:")
+        >>> print(result.duplicate_to_target_map)
+        {2: 0}
+        # This indicates the text at index 2 is considered
+        # a duplicate of index 0.
+    """
+    if strategy == "llm-supervise":
+        # TODO: Implement LLM-supervise deduplication.
+        raise NotImplementedError(
+            "LLM-supervise deduplication is not yet implemented."
+        )
+
+    # Check if the parameters are valid.
+    if embedding_instance is None and embeddings is None:
+        raise ValueError(
+            "Either 'embedding_instance' or 'embeddings' must be provided."
+        )
+    if embedding_instance is not None and embeddings is not None:
+        raise ValueError(
+            "Cannot provide both 'embedding_instance' and 'embeddings'. "
+            "Please choose only one way to supply embeddings."
+        )
+
+    if embedding_instance is not None:
+        # Use Camel's embedding_instance to vectorize.
+        embeddings = embedding_instance.embed_list(texts)
+    else:
+        # Use pre-supplied embeddings.
+        if embeddings and len(embeddings) != len(texts):
+            raise ValueError(
+                "The length of 'embeddings' does not match the length "
+                "of 'texts'."
+            )
+
+    # Calculate cosine similarity.
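+    # `cosine_similarity` returns an (n, n) matrix of pairwise similarities,
+    # where entry (i, j) is the cosine similarity between texts i and j.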
+    similarity_matrix = cosine_similarity(embeddings)
+    n = len(texts)
+
+    # Use the lower triangle to avoid redundant comparisons
+    # (or self-comparisons).
+    tril_mask = np.tril(np.ones((n, n)), k=-1)
+    similarity_matrix = similarity_matrix * tril_mask
+
+    # For each row, find the column with the highest similarity
+    # that exceeds the threshold. If no similarity exceeds the threshold,
+    # set the column index to -1.
+    masked_similarities = np.where(
+        similarity_matrix > threshold, similarity_matrix, -1
+    )
+    max_indices = masked_similarities.argmax(axis=1)
+
+    duplicate_to_target_map: Dict[int, int] = {}
+    above_threshold = similarity_matrix[np.arange(n), max_indices] > threshold
+
+    # Construct the "duplicate->target" mapping.
+    for i in range(n):
+        if above_threshold[i]:
+            duplicate_to_target_map[i] = max_indices[i]
+
+    # Get the actual unique ids and embeddings.
+    unique_ids = []
+    unique_embeddings_dict = {}
+
+    assert embeddings, "embeddings must be valid"
+
+    for i, (_, emb) in enumerate(zip(texts, embeddings)):
+        if i not in duplicate_to_target_map:
+            unique_ids.append(i)
+            unique_embeddings_dict[i] = emb
+
+    return DeduplicationResult(
+        original_texts=texts,
+        unique_ids=unique_ids,
+        unique_embeddings_dict=unique_embeddings_dict,
+        duplicate_to_target_map=duplicate_to_target_map,
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 990c8d331f..be8d51e04a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -533,6 +533,7 @@ module = [
     "tree-sitter-python",
     "tree-sitter",
     "pandasai",
+    "sklearn.metrics.pairwise",
     "sympy",
 ]
 ignore_missing_imports = true
\ No newline at end of file
diff --git a/test/utils/test_deduplication.py b/test/utils/test_deduplication.py
new file mode 100644
index 0000000000..4ba3b95fc1
--- /dev/null
+++ b/test/utils/test_deduplication.py
@@ -0,0 +1,202 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from typing import List
+
+import pytest
+
+from camel.embeddings.base import BaseEmbedding
+from camel.utils import DeduplicationResult, deduplicate_internally
+
+
+class MockEmbedding(BaseEmbedding[str]):
+    """
+    A mock embedding class that always returns the same embedding vector
+    for any input text. Useful for testing deduplication logic.
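+    With identical vectors every pairwise cosine similarity is exactly 1.0,
+    so any threshold below 1.0 should collapse all texts onto the first one.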
+    """
+
+    def embed(self, obj: str, **kwargs) -> List[float]:
+        return [0.5, 0.5, 0.5]
+
+    def embed_list(self, objs: List[str], **kwargs) -> List[List[float]]:
+        return [[0.5, 0.5, 0.5] for _ in objs]
+
+    def get_output_dim(self) -> int:
+        return 3
+
+
+def test_deduplicate_internally_with_mock_embedding():
+    texts = ["Hello world!", "Hello world!", "HELLO WORLD!", "Something else"]
+    mock_embedding_instance = MockEmbedding()
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.9,
+        embedding_instance=mock_embedding_instance,
+        strategy="top1",
+    )
+
+    # Since all embeddings are the same, the first two texts
+    # should be considered duplicates with very high similarity,
+    # likewise with the third text. So we expect only 1 unique ID
+    # if threshold is 0.9.
+    assert result.original_texts[0] == "Hello world!"
+    assert (
+        len(result.unique_ids) == 1
+    ), f"Expected 1 unique id, got {len(result.unique_ids)}"
+
+    # Check the mapping. Indices 1 and 2 should map to 0, as duplicates.
+    # Index 3 is a special case here: its embedding is also identical,
+    # so it should be a duplicate as well.
+    # So total texts = 4, unique = [0], duplicates = [1->0, 2->0, 3->0].
+    expected_duplicate_map = {1: 0, 2: 0, 3: 0}
+    assert result.duplicate_to_target_map == expected_duplicate_map, (
+        f"Expected duplicate map {expected_duplicate_map}, "
+        f"got {result.duplicate_to_target_map}"
+    )
+
+    # Also verify the returned embeddings
+    assert len(result.unique_embeddings_dict) == 1
+    assert list(result.unique_embeddings_dict.keys()) == [0]
+    assert result.unique_embeddings_dict[0] == [0.5, 0.5, 0.5]
+
+
+def test_deduplicate_internally_with_precomputed_embeddings():
+    texts = ["Text A", "Text B", "Text B (similar)", "Text C"]
+    # Embeddings:
+    # - index 0 -> [1, 0, 0]
+    # - index 1 -> [0, 1, 0]
+    # - index 2 -> [0, 0.99, 0] (nearly the same as index 1)
+    # - index 3 -> [0, 0, 1]
+    embeddings = [
+        [1.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0],
+        [0.0, 0.99, 0.0],
+        [0.0, 0.0, 1.0],
+    ]
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.95,
+        embeddings=embeddings,
+        # Not providing embedding_instance, so it will use precomputed.
+        strategy="top1",
+    )
+
+    # We expect "Text B" (index=1) and "Text B (similar)" (index=2) to be
+    # duplicates, since their embeddings have high cosine similarity (> 0.95).
+    # The others are distinct enough.
+    assert (
+        len(result.unique_ids) == 3
+    ), f"Expected 3 unique ids, got {len(result.unique_ids)}"
+    # The duplicates map should reflect that index 2 is mapped to 1
+    assert result.duplicate_to_target_map == {
+        2: 1
+    }, f"Expected {{2: 1}}, got {result.duplicate_to_target_map}"
+
+    # Check correctness of embeddings dictionary
+    # We have 3 unique IDs: e.g. [0, 1, 3]
+    # (the actual order might vary if threshold leads to a different mapping,
+    # but we expect to see them in the unique_embeddings_dict).
+    for uid in result.unique_ids:
+        assert (
+            uid in result.unique_embeddings_dict
+        ), f"Missing embedding for unique id {uid}"
+
+
+def test_deduplicate_internally_chain_scenario():
+    """
+    Test scenario:
+    - A <-> B similarity > threshold
+    - B <-> C similarity > threshold
+    - C <-> D similarity > threshold
+    But A <-> C, B <-> D, A <-> D are all < threshold.
+    According to the 'top1' strategy, each new text that is similar to
+    a previously seen text will be mapped to the closest one. This creates
+    a chain-like mapping where B -> A, C -> B, D -> C.
+    In the end, only A is considered truly unique, because every subsequent
+    text maps transitively back to A.
+    """
+
+    texts = ["A", "B", "C", "D"]
+    # Note: We rely on sklearn's cosine_similarity, which normalizes by
+    # vector norms. These 2D vectors are chosen so that consecutive pairs
+    # (A-B, B-C, C-D) have a cosine similarity > 0.8, while non-consecutive
+    # pairs remain < 0.8.
+    embeddings = [
+        [1.0, 0.0],  # A
+        [0.87, 0.5],  # B
+        [0.50, 0.87],  # C
+        [0.0, 1.0],  # D
+    ]
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.8,
+        embeddings=embeddings,
+        strategy="top1",
+    )
+
+    # We expect only index 0 ("A") to be truly unique.
+    # B (index=1) -> A, C (index=2) -> B, D (index=3) -> C
+    # which in the final data structure looks like:
+    # duplicate_to_target_map = {1: 0, 2: 1, 3: 2}
+
+    assert (
+        len(result.unique_ids) == 1
+    ), f"Expected exactly 1 unique id, got {len(result.unique_ids)}"
+    assert (
+        result.unique_ids[0] == 0
+    ), "Expected the only unique id to be index 0"
+
+    expected_map = {1: 0, 2: 1, 3: 2}
+    assert result.duplicate_to_target_map == expected_map, (
+        f"Expected chain map {expected_map}, "
+        f"but got {result.duplicate_to_target_map}"
+    )
+
+    # Also check embeddings
+    assert len(result.unique_embeddings_dict) == 1, (
+        "Expected 1 unique embedding, got "
+        f"{len(result.unique_embeddings_dict)}"
+    )
+    assert (
+        0 in result.unique_embeddings_dict
+    ), "Missing embedding for the unique text A."
+    # Optionally verify the embedding
+    assert result.unique_embeddings_dict[0] == [
+        1.0,
+        0.0,
+    ], "Expected 'A' to have embedding [1.0, 0.0]."
+
+
+def test_deduplicate_internally_with_llm_supervision():
+    with pytest.raises(NotImplementedError):
+        deduplicate_internally(
+            texts=["A", "B", "C"],
+            threshold=0.8,
+            embedding_instance=MockEmbedding(),
+            strategy="llm-supervise",
+        )
+
+
+def test_deduplicate_internally_with_inconsistent_embeddings():
+    with pytest.raises(ValueError):
+        deduplicate_internally(
+            texts=["A", "B", "C"],
+            threshold=0.8,
+            embeddings=[[1.0, 0.0], [0.0, 1.0]],  # The length of texts is 3,
+            # but the length of embeddings is 2.
+            strategy="top1",
+        )
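
For reference, a minimal usage sketch of the new `deduplicate_internally` API with pre-computed embeddings (no embedding model or API key needed); the toy texts and 2-D vectors below are illustrative only and are not taken from the tests above:

    from camel.utils import deduplicate_internally

    texts = ["What is AI?", "What is AI?", "Tell me about CAMEL."]
    # Toy embeddings: the first two vectors are identical, the third is
    # orthogonal to them, so only index 1 should be flagged as a duplicate.
    embeddings = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]

    result = deduplicate_internally(
        texts=texts,
        threshold=0.65,
        embeddings=embeddings,
        strategy="top1",
    )
    print(result.unique_ids)               # [0, 2]
    print(result.duplicate_to_target_map)  # {1: 0}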