From c49fbe3a4d5e89677e31a54bba875572adc6c389 Mon Sep 17 00:00:00 2001
From: LastWhisper
Date: Sun, 9 Feb 2025 16:22:33 +0800
Subject: [PATCH] feat: Internal deduplication impl. (#1568)

---
 camel/utils/__init__.py          |   3 +
 camel/utils/deduplication.py     | 199 ++++++++++++++++++++++++++++++
 pyproject.toml                   |   1 +
 test/utils/test_deduplication.py | 202 +++++++++++++++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 camel/utils/deduplication.py
 create mode 100644 test/utils/test_deduplication.py

diff --git a/camel/utils/__init__.py b/camel/utils/__init__.py
index 481471f3c3..c08c62c142 100644
--- a/camel/utils/__init__.py
+++ b/camel/utils/__init__.py
@@ -40,6 +40,7 @@
     track_agent,
 )
 from .constants import Constants
+from .deduplication import DeduplicationResult, deduplicate_internally
 from .response_format import get_pydantic_model
 from .token_counting import (
     AnthropicTokenCounter,
@@ -82,6 +83,8 @@
     "get_pydantic_model",
     "download_github_subdirectory",
     "generate_prompt_for_structured_output",
+    "deduplicate_internally",
+    "DeduplicationResult",
     "retry_on_error",
     "BatchProcessor",
 ]
diff --git a/camel/utils/deduplication.py b/camel/utils/deduplication.py
new file mode 100644
index 0000000000..cbb28884d0
--- /dev/null
+++ b/camel/utils/deduplication.py
@@ -0,0 +1,199 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+
+from typing import Dict, List, Literal, Optional
+
+import numpy as np
+from pydantic import BaseModel
+from sklearn.metrics.pairwise import cosine_similarity
+
+from camel.embeddings.base import BaseEmbedding
+
+
+class DeduplicationResult(BaseModel):
+    """
+    The result of deduplication.
+
+    Attributes:
+        original_texts (List[str]): The original texts.
+        unique_ids (List[int]): A list of ids that are unique (not duplicates).
+        unique_embeddings_dict (Dict[int, List[float]]):
+            A mapping from the index of each unique text to its embedding.
+        duplicate_to_target_map (Dict[int, int]):
+            A mapping from the index of the duplicate text to the index
+            of the text it is considered a duplicate of.
+    """
+
+    original_texts: List[str]
+    unique_ids: List[int]
+    unique_embeddings_dict: Dict[int, List[float]]
+    duplicate_to_target_map: Dict[int, int]
+
+
+def deduplicate_internally(
+    texts: List[str],
+    threshold: float = 0.65,
+    embedding_instance: Optional[BaseEmbedding[str]] = None,
+    embeddings: Optional[List[List[float]]] = None,
+    strategy: Literal["top1", "llm-supervise"] = "top1",
+) -> DeduplicationResult:
+    """
+    Deduplicate a list of strings based on their cosine similarity.
+
+    You can either:
+    1) Provide a Camel `BaseEmbedding` instance via `embedding_instance` to
+       let this function handle the embedding internally, OR
+    2) Directly pass a list of pre-computed embeddings to `embeddings`.
+
+    If both `embedding_instance` and `embeddings` are provided, the function
+    will raise a ValueError to avoid ambiguous usage.
+
+    The `strategy` argument selects the deduplication method: "top1" maps each
+    text to the earlier text with the highest similarity above the threshold,
+    while "llm-supervise" would use an LLM to judge duplicates (not yet
+    implemented).
+
+    Args:
+        texts (List[str]): The list of texts to be deduplicated.
+        threshold (float, optional): The similarity threshold for considering
+            two texts as duplicates. Default is 0.65.
+        embedding_instance (Optional[BaseEmbedding[str]], optional):
+            A Camel embedding instance for automatic embedding. Defaults to
+            None.
+        embeddings (Optional[List[List[float]]], optional):
+            Pre-computed embeddings of `texts`. Each element in the list
+            corresponds to the embedding of the text in the same index of
+            `texts`. Defaults to None.
+        strategy (Literal["top1", "llm-supervise"], optional):
+            The strategy to use for deduplication. Defaults to "top1".
+
+    Returns:
+        DeduplicationResult: An object that contains:
+            - `original_texts`: The original texts.
+            - `unique_ids`: The unique ids after deduplication.
+            - `unique_embeddings_dict`: A dict mapping from (unique) text id
+              to its embedding.
+            - `duplicate_to_target_map`: A dict mapping from the id of a
+              duplicate text to the id of the text it is considered a
+              duplicate of.
+
+    Raises:
+        NotImplementedError: If the strategy is "llm-supervise", which is not
+            yet implemented.
+        ValueError: If neither embeddings nor embedding_instance is provided,
+            or if both are provided at the same time.
+        ValueError: If the length of `embeddings` does not match the length of
+            `texts`.
+
+    Example:
+        >>> from camel.embeddings.openai_embedding import OpenAIEmbedding
+        >>> # Suppose we have 5 texts, some of which may be duplicates
+        >>> texts = [
+        ...     "What is AI?",
+        ...     "Artificial Intelligence is about machines",
+        ...     "What is AI?",
+        ...     "Deep Learning is a subset of AI",
+        ...     "What is artificial intelligence?"
+        ... ]
+        >>> # or any other BaseEmbedding instance
+        >>> embedding_model = OpenAIEmbedding()
+        >>> result = deduplicate_internally(
+        ...     texts=texts,
+        ...     threshold=0.7,
+        ...     embedding_instance=embedding_model
+        ... )
+        >>> print("Unique ids:")
+        >>> for uid in result.unique_ids:
+        ...     print(texts[uid])
+        Unique ids:
+        What is AI?
+        Artificial Intelligence is about machines
+        Deep Learning is a subset of AI
+        What is artificial intelligence?
+
+        >>> print("Duplicate map:")
+        >>> print(result.duplicate_to_target_map)
+        {2: 0}
+        # This indicates the text at index 2 is considered
+        # a duplicate of index 0.
+    """
+    if strategy == "llm-supervise":
+        # TODO: Implement LLM-supervise deduplication.
+        raise NotImplementedError(
+            "LLM-supervise deduplication is not yet implemented."
+        )
+
+    # Check if the parameters are valid.
+    if embedding_instance is None and embeddings is None:
+        raise ValueError(
+            "Either 'embedding_instance' or 'embeddings' must be provided."
+        )
+    if embedding_instance is not None and embeddings is not None:
+        raise ValueError(
+            "Cannot provide both 'embedding_instance' and 'embeddings'. "
+            "Please choose only one way to supply embeddings."
+        )
+
+    if embedding_instance is not None:
+        # Use Camel's embedding_instance to vectorize.
+        embeddings = embedding_instance.embed_list(texts)
+    else:
+        # Use pre-supplied embeddings.
+        if embeddings and len(embeddings) != len(texts):
+            raise ValueError(
+                "The length of 'embeddings' does not match the length "
+                "of 'texts'."
+            )
+
+    # Calculate cosine similarity.
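+    # `cosine_similarity` returns an (n, n) matrix of pairwise similarities,
+    # where entry (i, j) is the cosine similarity between texts i and j.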
+    similarity_matrix = cosine_similarity(embeddings)
+    n = len(texts)
+
+    # Use the lower triangle to avoid redundant comparisons
+    # (or self-comparisons).
+    tril_mask = np.tril(np.ones((n, n)), k=-1)
+    similarity_matrix = similarity_matrix * tril_mask
+
+    # For each row, find the column with the highest similarity
+    # that exceeds the threshold. If no similarity exceeds the threshold,
+    # set the column index to -1.
+    masked_similarities = np.where(
+        similarity_matrix > threshold, similarity_matrix, -1
+    )
+    max_indices = masked_similarities.argmax(axis=1)
+
+    duplicate_to_target_map: Dict[int, int] = {}
+    above_threshold = similarity_matrix[np.arange(n), max_indices] > threshold
+
+    # Construct the "duplicate->target" mapping.
+    for i in range(n):
+        if above_threshold[i]:
+            duplicate_to_target_map[i] = max_indices[i]
+
+    # Get the actual unique ids and embeddings.
+    unique_ids = []
+    unique_embeddings_dict = {}
+
+    assert embeddings, "embeddings must be valid"
+
+    for i, (_, emb) in enumerate(zip(texts, embeddings)):
+        if i not in duplicate_to_target_map:
+            unique_ids.append(i)
+            unique_embeddings_dict[i] = emb
+
+    return DeduplicationResult(
+        original_texts=texts,
+        unique_ids=unique_ids,
+        unique_embeddings_dict=unique_embeddings_dict,
+        duplicate_to_target_map=duplicate_to_target_map,
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 990c8d331f..be8d51e04a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -533,6 +533,7 @@ module = [
     "tree-sitter-python",
     "tree-sitter",
     "pandasai",
+    "sklearn.metrics.pairwise",
     "sympy",
 ]
 ignore_missing_imports = true
\ No newline at end of file
diff --git a/test/utils/test_deduplication.py b/test/utils/test_deduplication.py
new file mode 100644
index 0000000000..4ba3b95fc1
--- /dev/null
+++ b/test/utils/test_deduplication.py
@@ -0,0 +1,202 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from typing import List
+
+import pytest
+
+from camel.embeddings.base import BaseEmbedding
+from camel.utils import DeduplicationResult, deduplicate_internally
+
+
+class MockEmbedding(BaseEmbedding[str]):
+    """
+    A mock embedding class that always returns the same embedding vector
+    for any input text. Useful for testing deduplication logic.
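+    With identical vectors every pairwise cosine similarity is exactly 1.0,
+    so any threshold below 1.0 should collapse all texts onto the first one.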
+    """
+
+    def embed(self, obj: str, **kwargs) -> List[float]:
+        return [0.5, 0.5, 0.5]
+
+    def embed_list(self, objs: List[str], **kwargs) -> List[List[float]]:
+        return [[0.5, 0.5, 0.5] for _ in objs]
+
+    def get_output_dim(self) -> int:
+        return 3
+
+
+def test_deduplicate_internally_with_mock_embedding():
+    texts = ["Hello world!", "Hello world!", "HELLO WORLD!", "Something else"]
+    mock_embedding_instance = MockEmbedding()
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.9,
+        embedding_instance=mock_embedding_instance,
+        strategy="top1",
+    )
+
+    # Since all embeddings are the same, the first two texts
+    # should be considered duplicates with very high similarity,
+    # likewise with the third text. So we expect only 1 unique ID
+    # if threshold is 0.9.
+    assert result.original_texts[0] == "Hello world!"
+    assert (
+        len(result.unique_ids) == 1
+    ), f"Expected 1 unique id, got {len(result.unique_ids)}"
+
+    # Check the mapping. Indices 1 and 2 should map to 0, as duplicates.
+    # Index 3 is a special case here: its embedding is also identical,
+    # so it should be a duplicate as well.
+    # So total texts = 4, unique = [0], duplicates = [1->0, 2->0, 3->0].
+    expected_duplicate_map = {1: 0, 2: 0, 3: 0}
+    assert result.duplicate_to_target_map == expected_duplicate_map, (
+        f"Expected duplicate map {expected_duplicate_map}, "
+        f"got {result.duplicate_to_target_map}"
+    )
+
+    # Also verify the returned embeddings
+    assert len(result.unique_embeddings_dict) == 1
+    assert list(result.unique_embeddings_dict.keys()) == [0]
+    assert result.unique_embeddings_dict[0] == [0.5, 0.5, 0.5]
+
+
+def test_deduplicate_internally_with_precomputed_embeddings():
+    texts = ["Text A", "Text B", "Text B (similar)", "Text C"]
+    # Embeddings:
+    # - index 0 -> [1, 0, 0]
+    # - index 1 -> [0, 1, 0]
+    # - index 2 -> [0, 0.99, 0] (nearly the same as index 1)
+    # - index 3 -> [0, 0, 1]
+    embeddings = [
+        [1.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0],
+        [0.0, 0.99, 0.0],
+        [0.0, 0.0, 1.0],
+    ]
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.95,
+        embeddings=embeddings,
+        # Not providing embedding_instance, so it will use precomputed.
+        strategy="top1",
+    )
+
+    # We expect "Text B" (index=1) and "Text B (similar)" (index=2) to be
+    # duplicates, since their embeddings have high cosine similarity (> 0.95).
+    # The others are distinct enough.
+    assert (
+        len(result.unique_ids) == 3
+    ), f"Expected 3 unique ids, got {len(result.unique_ids)}"
+    # The duplicates map should reflect that index 2 is mapped to 1
+    assert result.duplicate_to_target_map == {
+        2: 1
+    }, f"Expected {{2: 1}}, got {result.duplicate_to_target_map}"
+
+    # Check correctness of embeddings dictionary
+    # We have 3 unique IDs: e.g. [0, 1, 3]
+    # (the actual order might vary if threshold leads to a different mapping,
+    # but we expect to see them in the unique_embeddings_dict).
+    for uid in result.unique_ids:
+        assert (
+            uid in result.unique_embeddings_dict
+        ), f"Missing embedding for unique id {uid}"
+
+
+def test_deduplicate_internally_chain_scenario():
+    """
+    Test scenario:
+    - A <-> B similarity > threshold
+    - B <-> C similarity > threshold
+    - C <-> D similarity > threshold
+    But A <-> C, B <-> D, A <-> D are all < threshold.
+    According to the 'top1' strategy, each new text that is similar to
+    a previously seen text will be mapped to the closest one. This creates
+    a chain-like mapping where B -> A, C -> B, D -> C.
+    In the end, only A is considered truly unique, because every subsequent
+    text maps transitively back to A.
+    """
+
+    texts = ["A", "B", "C", "D"]
+    # Note: We rely on sklearn's cosine_similarity, which normalizes by
+    # vector norms. These 2D vectors are chosen so that consecutive pairs
+    # (A-B, B-C, C-D) have a cosine similarity > 0.8, while non-consecutive
+    # pairs remain < 0.8.
+    embeddings = [
+        [1.0, 0.0],  # A
+        [0.87, 0.5],  # B
+        [0.50, 0.87],  # C
+        [0.0, 1.0],  # D
+    ]
+
+    result: DeduplicationResult = deduplicate_internally(
+        texts=texts,
+        threshold=0.8,
+        embeddings=embeddings,
+        strategy="top1",
+    )
+
+    # We expect only index 0 ("A") to be truly unique.
+    # B (index=1) -> A, C (index=2) -> B, D (index=3) -> C
+    # which in the final data structure looks like:
+    # duplicate_to_target_map = {1: 0, 2: 1, 3: 2}
+
+    assert (
+        len(result.unique_ids) == 1
+    ), f"Expected exactly 1 unique id, got {len(result.unique_ids)}"
+    assert (
+        result.unique_ids[0] == 0
+    ), "Expected the only unique id to be index 0"
+
+    expected_map = {1: 0, 2: 1, 3: 2}
+    assert result.duplicate_to_target_map == expected_map, (
+        f"Expected chain map {expected_map}, "
+        f"but got {result.duplicate_to_target_map}"
+    )
+
+    # Also check embeddings
+    assert len(result.unique_embeddings_dict) == 1, (
+        "Expected 1 unique embedding, got "
+        f"{len(result.unique_embeddings_dict)}"
+    )
+    assert (
+        0 in result.unique_embeddings_dict
+    ), "Missing embedding for the unique text A."
+    # Optionally verify the embedding
+    assert result.unique_embeddings_dict[0] == [
+        1.0,
+        0.0,
+    ], "Expected 'A' to have embedding [1.0, 0.0]."
+
+
+def test_deduplicate_internally_with_llm_supervision():
+    with pytest.raises(NotImplementedError):
+        deduplicate_internally(
+            texts=["A", "B", "C"],
+            threshold=0.8,
+            embedding_instance=MockEmbedding(),
+            strategy="llm-supervise",
+        )
+
+
+def test_deduplicate_internally_with_inconsistent_embeddings():
+    with pytest.raises(ValueError):
+        deduplicate_internally(
+            texts=["A", "B", "C"],
+            threshold=0.8,
+            embeddings=[[1.0, 0.0], [0.0, 1.0]],  # The length of texts is 3,
+            # but the length of embeddings is 2.
+            strategy="top1",
+        )
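
For reference, a minimal usage sketch of the new `deduplicate_internally` API with pre-computed embeddings (no embedding model or API key needed); the toy texts and 2-D vectors below are illustrative only and are not taken from the tests above:

    from camel.utils import deduplicate_internally

    texts = ["What is AI?", "What is AI?", "Tell me about CAMEL."]
    # Toy embeddings: the first two vectors are identical, the third is
    # orthogonal to them, so only index 1 should be flagged as a duplicate.
    embeddings = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]

    result = deduplicate_internally(
        texts=texts,
        threshold=0.65,
        embeddings=embeddings,
        strategy="top1",
    )
    print(result.unique_ids)               # [0, 2]
    print(result.duplicate_to_target_map)  # {1: 0}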