Skip to content

Commit

Permalink
Add support for fuzzy case-insensitive match in PropertyRef
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Chantavy <[email protected]>
  • Loading branch information
achantavy committed Nov 14, 2024
1 parent c86e388 commit 5a31913
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 1 deletion.
4 changes: 4 additions & 0 deletions cartography/graph/querybuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,18 @@ def _build_where_clause_for_rel_match(node_var: str, matcher: TargetNodeMatcher)
"""
match = Template("$node_var.$key = $prop_ref")
case_insensitive_match = Template("toLower($node_var.$key) = toLower($prop_ref)")
fuzzy_and_ignorecase_match = Template("toLower($node_var.$key) CONTAINS toLower($prop_ref)")

matcher_asdict = asdict(matcher)

result = []
for key, prop_ref in matcher_asdict.items():
if prop_ref.ignore_case:
prop_line = case_insensitive_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
elif prop_ref.fuzzy_and_ignore_case:
prop_line = fuzzy_and_ignorecase_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
else:
# Exact match (default; most efficient)
prop_line = match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
result.append(prop_line)
return ' AND\n'.join(result)
Expand Down
19 changes: 18 additions & 1 deletion cartography/models/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ class PropertyRef:
(PropertyRef.set_in_kwargs=True).
"""

def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_case=False):
def __init__(
self,
name: str,
set_in_kwargs=False,
extra_index=False,
ignore_case=False,
fuzzy_and_ignore_case=False,
):
"""
:param name: The name of the property
:param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the
Expand All @@ -33,11 +40,21 @@ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_cas
cartography catalog of GitHubUser nodes. Therefore, you would need `ignore_case=True` in the PropertyRef
that points to the GitHubUser node's name field, otherwise if one of your employees' GitHub usernames
contains capital letters, you would not be able to map them properly to a GitHubUser node in your graph.
:param fuzzy_and_ignore_case: If True, performs a fuzzy + case-insensitive match when comparing the value of
this property, using the `CONTAINS` operator on lowercased values (i.e. the target node's property only needs
to contain this value as a substring). Defaults to False. Cannot be combined with `ignore_case`. This only has
effect as part of a TargetNodeMatcher and is not supported for the sub resource relationship.
"""
self.name = name
self.set_in_kwargs = set_in_kwargs
self.extra_index = extra_index
self.ignore_case = ignore_case
self.fuzzy_and_ignore_case = fuzzy_and_ignore_case
if self.fuzzy_and_ignore_case and self.ignore_case:
    # The two matching modes are mutually exclusive: reject the combination up front.
    # Note the trailing space after "with": the two literals are concatenated implicitly,
    # so without it the message would read "...together withfuzzy_and_ignore_case".
    raise ValueError(
        f'Error setting PropertyRef "{self.name}": ignore_case cannot be used together with '
        'fuzzy_and_ignore_case. Pick one or the other.',
    )

def _parameterize_name(self) -> str:
return f"${self.name}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from dataclasses import dataclass

from cartography.models.core.common import PropertyRef
from cartography.models.core.nodes import CartographyNodeProperties
from cartography.models.core.nodes import CartographyNodeSchema
from cartography.models.core.relationships import CartographyRelProperties
from cartography.models.core.relationships import CartographyRelSchema
from cartography.models.core.relationships import LinkDirection
from cartography.models.core.relationships import make_target_node_matcher
from cartography.models.core.relationships import OtherRelationships
from cartography.models.core.relationships import TargetNodeMatcher


@dataclass(frozen=True)
class FakeEmp2ToGitHubUserRelProperties(CartographyRelProperties):
    """Properties set on the FakeEmployee2 -> GitHubUser relationship."""

    # Supplied through kwargs at load time rather than read from each data dict.
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)


@dataclass(frozen=True)
class FakeEmp2ToGitHubUser(CartographyRelSchema):
    """
    Outward IDENTITY_GITHUB relationship from the employee node to a GitHubUser node.

    The target is matched on GitHubUser.username against the employee's `github_username`
    value using the fuzzy, case-insensitive mode of PropertyRef.
    """

    target_node_label: str = 'GitHubUser'
    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
        {
            'username': PropertyRef('github_username', fuzzy_and_ignore_case=True),
        },
    )
    direction: LinkDirection = LinkDirection.OUTWARD
    rel_label: str = "IDENTITY_GITHUB"
    properties: FakeEmp2ToGitHubUserRelProperties = FakeEmp2ToGitHubUserRelProperties()


@dataclass(frozen=True)
class FakeEmp2NodeProperties(CartographyNodeProperties):
    """Node properties of the fake employee used by the fuzzy-match sample schema."""

    id: PropertyRef = PropertyRef('id')
    # Supplied through kwargs at load time rather than read from each data dict.
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
    email: PropertyRef = PropertyRef('email')
    github_username: PropertyRef = PropertyRef('github_username')


@dataclass(frozen=True)
class FakeEmp2Schema(CartographyNodeSchema):
    """Sample `FakeEmployee2` node schema whose only extra relationship is the fuzzy GitHubUser link."""

    label: str = 'FakeEmployee2'
    properties: FakeEmp2NodeProperties = FakeEmp2NodeProperties()
    other_relationships: OtherRelationships = OtherRelationships(
        [
            FakeEmp2ToGitHubUser(),
        ],
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from cartography.client.core.tx import load
from cartography.intel.github.users import load_organization_users
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_EMPLOYEE_DATA
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_ORG_DATA
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_USER_DATA
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
from tests.integration.util import check_rels

# Fixed update tag passed to both loaders below (as the GitHub users' update tag and as `lastupdated`).
TEST_UPDATE_TAG = 123456789


def test_load_team_members_data_fuzzy(neo4j_session):
    """Loading FakeEmployee2 nodes links them to GitHubUser nodes via a fuzzy, case-insensitive match."""
    # Arrange: the GitHubUser nodes must exist before the employees are loaded,
    # because the relationship is only created against nodes already in the graph.
    load_organization_users(
        neo4j_session,
        FAKE_GITHUB_USER_DATA,
        FAKE_GITHUB_ORG_DATA,
        TEST_UPDATE_TAG,
    )

    # Act: load the employees using the schema with the fuzzy matcher.
    load(
        neo4j_session,
        FakeEmp2Schema(),
        FAKE_EMPLOYEE_DATA,
        lastupdated=TEST_UPDATE_TAG,
    )

    # Assert: each employee is attached to the expected GitHubUser despite case/substring differences.
    actual_rels = check_rels(
        neo4j_session,
        'FakeEmployee2',
        'email',
        'GitHubUser',
        'username',
        'IDENTITY_GITHUB',
    )
    expected_rels = {
        ('[email protected]', 'HjsimPson'),
        ('[email protected]', 'mbsimp-son'),
    }
    assert actual_rels == expected_rels
33 changes: 33 additions & 0 deletions tests/unit/cartography/graph/test_querybuilder_simple.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from cartography.graph.querybuilder import build_ingestion_query
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers import FakeEmpSchema
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeWithSubResourceSchema
from tests.unit.cartography.graph.helpers import remove_leading_whitespace_and_empty_lines
Expand Down Expand Up @@ -90,3 +91,35 @@ def test_build_ingestion_query_case_insensitive_match():
actual_query = remove_leading_whitespace_and_empty_lines(query)
expected_query = remove_leading_whitespace_and_empty_lines(expected)
assert actual_query == expected_query


def test_build_ingestion_query_fuzzy_case_insensitive():
    """The ingestion query for a fuzzy matcher uses `toLower(...) CONTAINS toLower(...)` in its WHERE clause."""
    expected_cypher = """
UNWIND $DictList AS item
MERGE (i:FakeEmployee2{id: item.id})
ON CREATE SET i.firstseen = timestamp()
SET
i.lastupdated = $lastupdated,
i.email = item.email,
i.github_username = item.github_username
WITH i, item
CALL {
WITH i, item
OPTIONAL MATCH (n0:GitHubUser)
WHERE
toLower(n0.username) CONTAINS toLower(item.github_username)
WITH i, item, n0 WHERE n0 IS NOT NULL
MERGE (i)-[r0:IDENTITY_GITHUB]->(n0)
ON CREATE SET r0.firstseen = timestamp()
SET
r0.lastupdated = $lastupdated
}
"""
    generated_cypher = build_ingestion_query(FakeEmp2Schema())

    # Normalize both sides so the comparison ignores leading whitespace and empty lines.
    assert (
        remove_leading_whitespace_and_empty_lines(generated_cypher)
        == remove_leading_whitespace_and_empty_lines(expected_cypher)
    )

0 comments on commit 5a31913

Please sign in to comment.