Skip to content

Commit

Permalink
Merge branch 'main' into metadata-nested-lists
Browse files Browse the repository at this point in the history
  • Loading branch information
bjchambers committed Feb 19, 2025
2 parents 24befd2 + b99bddc commit 9040f94
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 13 deletions.
8 changes: 4 additions & 4 deletions data/animals.jsonl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{"id": "aardvark", "text": "the aardvark is a nocturnal mammal known for its burrowing habits and long snout used to sniff out ants.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["burrowing", "nocturnal", "ants", "savanna"], "habitat": "savanna", "tags": [{"a": 5, "b": 7}, {"a": 8, "b": 10}]}}
{"id": "albatross", "text": "the albatross is a large seabird with the longest wingspan of any bird, allowing it to glide effortlessly over oceans.", "metadata": {"type": "bird", "number_of_legs": 2, "keywords": ["seabird", "wingspan", "ocean"], "habitat": "marine", "tags": [{"a": 5, "b": 8}, {"a": 8, "b": 10}]}}
{"id": "alligator", "text": "alligators are large reptiles with powerful jaws and are commonly found in freshwater wetlands.", "metadata": {"type": "reptile", "number_of_legs": 4, "keywords": ["reptile", "jaws", "wetlands"], "diet": "carnivorous"}}
{"id": "alpaca", "text": "alpacas are domesticated mammals valued for their soft wool and friendly demeanor.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["wool", "domesticated", "friendly"], "origin": "south america"}}
{"id": "ant", "text": "ants are social insects that live in colonies and are known for their teamwork and strength.", "metadata": {"type": "insect", "number_of_legs": 6, "keywords": ["social", "colonies", "strength", "pollinator"], "diet": "omnivorous"}}
{"id": "anteater", "text": "anteaters use their long tongues to eat thousands of ants and termites each day.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["ants", "tongue", "termites"], "diet": "insectivore"}}
{"id": "alligator", "text": "alligators are large reptiles with powerful jaws and are commonly found in freshwater wetlands.", "metadata": {"type": "reptile", "number_of_legs": 4, "keywords": ["reptile", "jaws", "wetlands"], "diet": "carnivorous", "nested": { "a": 5 }}}
{"id": "alpaca", "text": "alpacas are domesticated mammals valued for their soft wool and friendly demeanor.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["wool", "domesticated", "friendly"], "origin": "south america", "nested": { "a": 5 }}}
{"id": "ant", "text": "ants are social insects that live in colonies and are known for their teamwork and strength.", "metadata": {"type": "insect", "number_of_legs": 6, "keywords": ["social", "colonies", "strength", "pollinator"], "diet": "omnivorous", "nested": { "a": 6 }}}
{"id": "anteater", "text": "anteaters use their long tongues to eat thousands of ants and termites each day.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["ants", "tongue", "termites"], "diet": "insectivore", "nested": { "b": 5 }}}
{"id": "antelope", "text": "antelopes are graceful herbivorous mammals that are often prey for large predators in the wild.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["graceful", "herbivore", "prey"], "habitat": "grasslands"}}
{"id": "armadillo", "text": "armadillos have hard, protective shells and are known for their ability to roll into a ball.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["protective", "shell", "rolling"], "diet": "insectivore"}}
{"id": "baboon", "text": "baboons are highly social primates with complex group dynamics and strong bonds.", "metadata": {"type": "mammal", "number_of_legs": 4, "keywords": ["social", "primates", "group"], "diet": "omnivorous"}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,11 @@ def _matches(self, filter: dict[str, Any] | None, content: Content) -> bool:
return True

for key, filter_value in filter.items():
content_value = content.metadata.get(key, SENTINEL)
content_value = content.metadata
for key_part in key.split("."):
content_value = content_value.get(key_part, SENTINEL)
if content_value is SENTINEL:
break
if not self._value_matches(filter_value, content_value):
return False
return True
Expand Down
11 changes: 10 additions & 1 deletion packages/graph-retriever/src/graph_retriever/edges/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ class Id:
"""


def _nested_get(metadata: dict[str, Any], key: str) -> Any:
    """
    Look up a (possibly dotted) key in nested metadata.

    The key is split on ``"."`` and each part is used to descend one level
    into `metadata`, so ``"a.b"`` reads ``metadata["a"]["b"]``.

    Parameters
    ----------
    metadata :
        The metadata dictionary to search.
    key :
        The key to look up. Dots separate nesting levels.

    Returns
    -------
    :
        The value found at the path, or `SENTINEL` if any part of the path
        is missing or an intermediate value is not a dictionary.
    """
    value: Any = metadata
    for key_part in key.split("."):
        # A scalar or list cannot be descended into. Treat the path as absent
        # instead of letting `.get` raise AttributeError on a non-dict value.
        if not isinstance(value, dict):
            return SENTINEL
        value = value.get(key_part, SENTINEL)
        if value is SENTINEL:
            break
    return value


class MetadataEdgeFunction:
"""
Helper for extracting and encoding edges in metadata.
Expand Down Expand Up @@ -116,7 +125,7 @@ def mk_edge(v) -> Edge:
if isinstance(source_key, Id):
edges.add(mk_edge(id))
else:
value = metadata.get(source_key, SENTINEL)
value = _nested_get(metadata, source_key)
if isinstance(value, BASIC_TYPES):
edges.add(mk_edge(value))
elif isinstance(value, Iterable):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def assert_ids_any_order(
assert set(result_ids) == set(expected), "should contain exactly expected IDs"


@dataclass
@dataclass(kw_only=True)
class AdapterComplianceCase(abc.ABC):
"""
Base dataclass for test cases.
Expand All @@ -57,6 +57,9 @@ class AdapterComplianceCase(abc.ABC):
id: str
expected: list[str]

requires_nested: bool = False
requires_dict_in_list: bool = False


@dataclass
class GetCase(AdapterComplianceCase):
Expand Down Expand Up @@ -314,6 +317,7 @@ class AdjacentCase(AdapterComplianceCase):
expected=[
"aardvark",
],
requires_dict_in_list=True,
),
AdjacentCase(
id="dict_in_list_multiple",
Expand All @@ -326,6 +330,7 @@ class AdjacentCase(AdapterComplianceCase):
"aardvark",
"albatross",
],
requires_dict_in_list=True,
),
AdjacentCase(
id="absent_dict",
Expand All @@ -334,6 +339,47 @@ class AdjacentCase(AdapterComplianceCase):
MetadataEdge("tags", {"a": 5, "b": 10}),
},
expected=[],
requires_dict_in_list=True,
),
AdjacentCase(
id="nested",
query="domesticated hunters",
edges={
MetadataEdge("nested.a", 5),
},
expected=[
"alligator",
"alpaca",
],
requires_nested=True,
),
AdjacentCase(
id="nested_same_field",
query="domesticated hunters",
edges={
MetadataEdge("nested.a", 5),
MetadataEdge("nested.a", 6),
},
expected=[
"alligator",
"alpaca",
"ant",
],
requires_nested=True,
),
AdjacentCase(
id="nested_diff_field",
query="domesticated hunters",
edges={
MetadataEdge("nested.a", 5),
MetadataEdge("nested.b", 5),
},
expected=[
"alligator",
"alpaca",
"anteater",
],
requires_nested=True,
),
]

Expand All @@ -347,6 +393,14 @@ class AdapterComplianceSuite(abc.ABC):
loaded.
"""

def supports_nested_metadata(self) -> bool:
    """
    Report whether this suite expects nested metadata to work.

    Returns
    -------
    :
        `True` in this implementation.
    """
    return True

def supports_dict_in_list(self) -> bool:
    """
    Report whether dicts may appear inside list fields in metadata.

    Returns
    -------
    :
        `True` in this implementation.
    """
    return True

def expected(self, method: str, case: AdapterComplianceCase) -> list[str]:
"""
Override to change the expected behavior of a case.
Expand All @@ -373,6 +427,10 @@ def expected(self, method: str, case: AdapterComplianceCase) -> list[str]:
:
The expected animals.
"""
if not self.supports_nested_metadata() and case.requires_nested:
pytest.xfail("nested metadata not supported")
if not self.supports_dict_in_list() and case.requires_dict_in_list:
pytest.xfail("dict-in-list fields is not supported")
return case.expected

@pytest.fixture(params=GET_CASES, ids=lambda c: c.id)
Expand Down
8 changes: 8 additions & 0 deletions packages/graph-retriever/tests/edges/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ def test_edge_function():
)


def test_nested_edge():
    """Check that dotted keys in an edge definition read nested metadata."""
    extract_edges = MetadataEdgeFunction([("a.b", "b.c")])
    node = mk_node({"a": {"b": 5}, "b": {"c": 7}})

    actual = extract_edges(node)

    expected = Edges(
        {MetadataEdge("b.c", 7)},
        {MetadataEdge("b.c", 5)},
    )
    assert actual == expected


def test_link_to_id():
edge_function = MetadataEdgeFunction([("mentions", Id())])
result = edge_function(mk_node({"mentions": ["a", "c"]}))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ def _equals_or_contains(
True if and only if `metadata[key] == value` or `metadata[key]` is a
list containing `value`.
"""
actual = metadata.get(key, SENTINEL)
actual = metadata
for key_part in key.split("."):
actual = actual.get(key_part, SENTINEL)
if actual is SENTINEL:
break
if actual == value:
return True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def cluster(


class TestCassandraAdapter(AdapterComplianceSuite):
def supports_nested_metadata(self) -> bool:
    """Report that nested metadata is not expected to work in this suite."""
    return False

@pytest.fixture(scope="class")
def adapter(
self,
Expand Down
13 changes: 13 additions & 0 deletions packages/langchain-graph-retriever/tests/adapters/test_chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@


class TestChroma(AdapterComplianceSuite):
def supports_nested_metadata(self) -> bool:
    """Report that nested metadata is not expected to work in this suite."""
    return False

@pytest.fixture(scope="class")
def adapter(
self,
Expand All @@ -25,6 +28,16 @@ def adapter(
)

shredder = ShreddingTransformer()

# Chroma doesn't even support *writing* nested data currently, so we
# filter it out.
def remove_nested_metadata(doc: Document) -> Document:
    """Build a new document like `doc` but without the "nested" metadata entry."""
    kept = {
        name: value for name, value in doc.metadata.items() if name != "nested"
    }
    return Document(
        id=doc.id,
        page_content=doc.page_content,
        metadata=kept,
    )

animal_docs = [remove_nested_metadata(doc) for doc in animal_docs]

docs = list(shredder.transform_documents(animal_docs))
store = Chroma.from_documents(
docs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@


class TestOpenSearch(AdapterComplianceSuite):
def expected(self, method, case):
    """
    Mark the dict-valued edge cases as expected failures for adjacency queries.

    Parameters
    ----------
    method :
        Name of the adapter method under test; only "adjacent" and
        "aadjacent" are affected.
    case :
        The compliance case being run; it is matched by its `id`.

    Returns
    -------
    :
        Whatever the parent suite's `expected` returns for cases that are
        not marked as expected failures.
    """
    # Match on the case ID. Testing the case object itself for membership in
    # a tuple wrapping the set of IDs never succeeds, so the xfail was never
    # triggered.
    dict_edge_case_ids = {"dict_in_list", "dict_in_list_multiple", "absent_dict"}
    if method in ("adjacent", "aadjacent") and case.id in dict_edge_case_ids:
        pytest.xfail("OpenSearch doesn't support edges with dict values.")
    return super().expected(method, case)
def supports_nested_metadata(self) -> bool:
    """Report that nested metadata is not expected to work in this suite."""
    return False

def supports_dict_in_list(self) -> bool:
    """Report that dicts inside list fields are not supported in this suite."""
    return False

@pytest.fixture(scope="class")
def adapter(
Expand Down

0 comments on commit 9040f94

Please sign in to comment.