Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for fuzzy case-insensitive match in PropertyRef #1383

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cartography/graph/querybuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,18 @@ def _build_where_clause_for_rel_match(node_var: str, matcher: TargetNodeMatcher)
"""
match = Template("$node_var.$key = $prop_ref")
case_insensitive_match = Template("toLower($node_var.$key) = toLower($prop_ref)")
fuzzy_and_ignorecase_match = Template("toLower($node_var.$key) CONTAINS toLower($prop_ref)")

matcher_asdict = asdict(matcher)

result = []
for key, prop_ref in matcher_asdict.items():
if prop_ref.ignore_case:
prop_line = case_insensitive_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
elif prop_ref.fuzzy_and_ignore_case:
prop_line = fuzzy_and_ignorecase_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
else:
# Exact match (default; most efficient)
prop_line = match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
result.append(prop_line)
return ' AND\n'.join(result)
Expand Down
19 changes: 18 additions & 1 deletion cartography/models/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ class PropertyRef:
(PropertyRef.set_in_kwargs=True).
"""

def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_case=False):
def __init__(
self,
name: str,
set_in_kwargs=False,
extra_index=False,
ignore_case=False,
fuzzy_and_ignore_case=False,
):
"""
:param name: The name of the property
:param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the
Expand All @@ -33,11 +40,21 @@ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_cas
cartography catalog of GitHubUser nodes. Therefore, you would need `ignore_case=True` in the PropertyRef
that points to the GitHubUser node's name field, otherwise if one of your employees' GitHub usernames
contains capital letters, you would not be able to map them properly to a GitHubUser node in your graph.
:param fuzzy_and_ignore_case: If True, performs a fuzzy + case-insensitive match when comparing the value of
this property, using the `CONTAINS` operator in the generated query. Defaults to False. This only has effect
as part of a TargetNodeMatcher and is not supported for the sub resource relationship. It cannot be combined
with `ignore_case`; setting both raises a ValueError.
"""
self.name = name
self.set_in_kwargs = set_in_kwargs
self.extra_index = extra_index
self.ignore_case = ignore_case
self.fuzzy_and_ignore_case = fuzzy_and_ignore_case
if self.fuzzy_and_ignore_case and self.ignore_case:
raise ValueError(
f'Error setting PropertyRef "{self.name}": ignore_case cannot be used together with'
'fuzzy_and_ignore_case. Pick one or the other.',
)

def _parameterize_name(self) -> str:
    """Return this property's name with a leading ``$`` prepended."""
    return "${}".format(self.name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,21 @@
'github_username': 'mbsimp-son', # pure lowercase
},
]

# Fake employee records whose `github_username` is only a fragment of a
# GitHub username, so they can only be linked to a GitHubUser node through a
# fuzzy (substring), case-insensitive match.
FAKE_EMPLOYEE2_DATA = [
    {
        'id': 123,
        'email': '[email protected]',
        'first_name': 'Homer',
        'last_name': 'Simpson',
        'name': 'Homer Simpson',
        # Substring of the GitHub username this employee should map to.
        'github_username': 'jsimpso',
    },
    {
        # Note: unlike the first record, this one carries no 'name' key.
        'id': 456,
        'email': '[email protected]',
        'first_name': 'Marge',
        'last_name': 'Simpson',
        # Substring of the GitHub username this employee should map to.
        'github_username': 'mbsimp',
    },
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from dataclasses import dataclass

from cartography.models.core.common import PropertyRef
from cartography.models.core.nodes import CartographyNodeProperties
from cartography.models.core.nodes import CartographyNodeSchema
from cartography.models.core.relationships import CartographyRelProperties
from cartography.models.core.relationships import CartographyRelSchema
from cartography.models.core.relationships import LinkDirection
from cartography.models.core.relationships import make_target_node_matcher
from cartography.models.core.relationships import OtherRelationships
from cartography.models.core.relationships import TargetNodeMatcher


@dataclass(frozen=True)
class FakeEmp2ToGitHubUserRelProperties(CartographyRelProperties):
    """Properties carried by the FakeEmployee2 -> GitHubUser relationship."""

    # `set_in_kwargs=True`: the value is not taken from the per-item data dict.
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)


@dataclass(frozen=True)
class FakeEmp2ToGitHubUser(CartographyRelSchema):
    """Relationship schema linking a FakeEmployee2 node to a GitHubUser node.

    The relationship is labelled ``IDENTITY_GITHUB`` and points outward from
    the employee node to the matched GitHubUser node.
    """

    target_node_label: str = 'GitHubUser'
    # `fuzzy_and_ignore_case=True` lets the two nodes be connected in a
    # "fuzzy" way: GitHubUser.username is compared against the employee's
    # `github_username` with a case-insensitive CONTAINS rather than equality.
    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
        {'username': PropertyRef('github_username', fuzzy_and_ignore_case=True)},
    )
    direction: LinkDirection = LinkDirection.OUTWARD
    rel_label: str = "IDENTITY_GITHUB"
    properties: FakeEmp2ToGitHubUserRelProperties = FakeEmp2ToGitHubUserRelProperties()


@dataclass(frozen=True)
class FakeEmp2NodeProperties(CartographyNodeProperties):
    """Properties stored on each FakeEmployee2 node."""

    id: PropertyRef = PropertyRef('id')
    # `set_in_kwargs=True`: the value is not taken from the per-item data dict.
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
    email: PropertyRef = PropertyRef('email')
    github_username: PropertyRef = PropertyRef('github_username')


@dataclass(frozen=True)
class FakeEmp2Schema(CartographyNodeSchema):
    """Node schema for ``FakeEmployee2`` with one extra relationship to GitHubUser."""

    label: str = 'FakeEmployee2'
    properties: FakeEmp2NodeProperties = FakeEmp2NodeProperties()
    # The only additional relationship is the fuzzy-matched link to GitHubUser.
    other_relationships: OtherRelationships = OtherRelationships([FakeEmp2ToGitHubUser()])
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from cartography.client.core.tx import load
from cartography.intel.github.users import load_organization_users
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_EMPLOYEE2_DATA
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_ORG_DATA
from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_USER_DATA
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
from tests.integration.util import check_rels

# Fixed update tag passed to both loaders below as the `lastupdated` value.
TEST_UPDATE_TAG = 123456789


def test_load_team_members_data_fuzzy(neo4j_session):
    """Check that FakeEmployee2 nodes get linked to GitHubUser nodes via a fuzzy, case-insensitive match."""
    # Arrange: populate the graph with the fake GitHubUser nodes.
    load_organization_users(neo4j_session, FAKE_GITHUB_USER_DATA, FAKE_GITHUB_ORG_DATA, TEST_UPDATE_TAG)

    # Act: load the fake employees, which also creates the IDENTITY_GITHUB relationships.
    load(
        neo4j_session,
        FakeEmp2Schema(),
        FAKE_EMPLOYEE2_DATA,
        lastupdated=TEST_UPDATE_TAG,
    )

    # Assert: each employee is attached to the GitHubUser whose username contains
    # the employee's (partial) github_username, regardless of case.
    actual_rels = check_rels(
        neo4j_session,
        'FakeEmployee2',
        'email',
        'GitHubUser',
        'username',
        'IDENTITY_GITHUB',
    )
    expected_rels = {
        ('[email protected]', 'HjsimPson'),
        ('[email protected]', 'mbsimp-son'),
    }
    assert actual_rels == expected_rels
33 changes: 33 additions & 0 deletions tests/unit/cartography/graph/test_querybuilder_simple.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from cartography.graph.querybuilder import build_ingestion_query
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers import FakeEmpSchema
from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeWithSubResourceSchema
from tests.unit.cartography.graph.helpers import remove_leading_whitespace_and_empty_lines
Expand Down Expand Up @@ -90,3 +91,35 @@ def test_build_ingestion_query_case_insensitive_match():
actual_query = remove_leading_whitespace_and_empty_lines(query)
expected_query = remove_leading_whitespace_and_empty_lines(expected)
assert actual_query == expected_query


def test_build_ingestion_query_fuzzy_case_insensitive():
    """Check the ingestion query rendered for a schema whose matcher uses fuzzy_and_ignore_case."""
    query = build_ingestion_query(FakeEmp2Schema())

    # This is what the final rendered query is expected to look like. The
    # `toLower(...) CONTAINS toLower(...)` line in the WHERE clause is the
    # fuzzy, case-insensitive match under test.
    expected = """
        UNWIND $DictList AS item
            MERGE (i:FakeEmployee2{id: item.id})
            ON CREATE SET i.firstseen = timestamp()
            SET
                i.lastupdated = $lastupdated,
                i.email = item.email,
                i.github_username = item.github_username
            WITH i, item
            CALL {
                WITH i, item
                OPTIONAL MATCH (n0:GitHubUser)
                WHERE
                    toLower(n0.username) CONTAINS toLower(item.github_username)
                WITH i, item, n0 WHERE n0 IS NOT NULL
                MERGE (i)-[r0:IDENTITY_GITHUB]->(n0)
                ON CREATE SET r0.firstseen = timestamp()
                SET
                    r0.lastupdated = $lastupdated
            }
    """

    # Assert: compare query outputs while ignoring leading whitespace.
    actual_query = remove_leading_whitespace_and_empty_lines(query)
    expected_query = remove_leading_whitespace_and_empty_lines(expected)
    assert actual_query == expected_query
Loading