diff --git a/cartography/graph/querybuilder.py b/cartography/graph/querybuilder.py
index e33d444265..909880092a 100644
--- a/cartography/graph/querybuilder.py
+++ b/cartography/graph/querybuilder.py
@@ -118,6 +118,7 @@ def _build_where_clause_for_rel_match(node_var: str, matcher: TargetNodeMatcher)
     """
     match = Template("$node_var.$key = $prop_ref")
     case_insensitive_match = Template("toLower($node_var.$key) = toLower($prop_ref)")
+    fuzzy_and_ignorecase_match = Template("toLower($node_var.$key) CONTAINS toLower($prop_ref)")
 
     matcher_asdict = asdict(matcher)
 
@@ -125,7 +126,10 @@ def _build_where_clause_for_rel_match(node_var: str, matcher: TargetNodeMatcher)
     for key, prop_ref in matcher_asdict.items():
         if prop_ref.ignore_case:
             prop_line = case_insensitive_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
+        elif prop_ref.fuzzy_and_ignore_case:
+            prop_line = fuzzy_and_ignorecase_match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
         else:
+            # Exact match (default; most efficient)
             prop_line = match.safe_substitute(node_var=node_var, key=key, prop_ref=prop_ref)
         result.append(prop_line)
     return ' AND\n'.join(result)
diff --git a/cartography/models/core/common.py b/cartography/models/core/common.py
index 6ee5d664c6..2a6976cee8 100644
--- a/cartography/models/core/common.py
+++ b/cartography/models/core/common.py
@@ -8,7 +8,14 @@ class PropertyRef:
     (PropertyRef.set_in_kwargs=True).
     """
 
-    def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_case=False):
+    def __init__(
+        self,
+        name: str,
+        set_in_kwargs=False,
+        extra_index=False,
+        ignore_case=False,
+        fuzzy_and_ignore_case=False,
+    ):
         """
         :param name: The name of the property
         :param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the
@@ -33,11 +40,21 @@ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_cas
             cartography catalog of GitHubUser nodes. Therefore, you would need `ignore_case=True` in the PropertyRef
             that points to the GitHubUser node's name field, otherwise if one of your employees' GitHub usernames
             contains capital letters, you would not be able to map them properly to a GitHubUser node in your graph.
+        :param fuzzy_and_ignore_case: If True, performs a fuzzy + case-insensitive match when comparing the value of
+        this property using the `CONTAINS` operator. Defaults to False. Cannot be combined with `ignore_case`
+        (a ValueError is raised). This only has effect as part of a TargetNodeMatcher and is not supported for the
+        sub resource relationship.
         """
         self.name = name
         self.set_in_kwargs = set_in_kwargs
         self.extra_index = extra_index
         self.ignore_case = ignore_case
+        self.fuzzy_and_ignore_case = fuzzy_and_ignore_case
+        if self.fuzzy_and_ignore_case and self.ignore_case:
+            raise ValueError(
+                f'Error setting PropertyRef "{self.name}": ignore_case cannot be used together with '
+                'fuzzy_and_ignore_case. Pick one or the other.',
+            )
 
     def _parameterize_name(self) -> str:
         return f"${self.name}"
diff --git a/tests/data/graph/querybuilder/sample_models/fake_emps_githubusers_fuzzy.py b/tests/data/graph/querybuilder/sample_models/fake_emps_githubusers_fuzzy.py
new file mode 100644
index 0000000000..5583fdd57e
--- /dev/null
+++ b/tests/data/graph/querybuilder/sample_models/fake_emps_githubusers_fuzzy.py
@@ -0,0 +1,44 @@
+from dataclasses import dataclass
+
+from cartography.models.core.common import PropertyRef
+from cartography.models.core.nodes import CartographyNodeProperties
+from cartography.models.core.nodes import CartographyNodeSchema
+from cartography.models.core.relationships import CartographyRelProperties
+from cartography.models.core.relationships import CartographyRelSchema
+from cartography.models.core.relationships import LinkDirection
+from cartography.models.core.relationships import make_target_node_matcher
+from cartography.models.core.relationships import OtherRelationships
+from cartography.models.core.relationships import TargetNodeMatcher
+
+
+@dataclass(frozen=True)
+class FakeEmp2ToGitHubUserRelProperties(CartographyRelProperties):  # properties set on the IDENTITY_GITHUB rel
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)  # taken from kwargs, not the data dict
+
+
+@dataclass(frozen=True)
+class FakeEmp2ToGitHubUser(CartographyRelSchema):  # (:FakeEmployee2)-[:IDENTITY_GITHUB]->(:GitHubUser)
+    target_node_label: str = 'GitHubUser'
+    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
+        {'username': PropertyRef('github_username', fuzzy_and_ignore_case=True)},  # CONTAINS match, ignoring case
+    )
+    direction: LinkDirection = LinkDirection.OUTWARD
+    rel_label: str = "IDENTITY_GITHUB"
+    properties: FakeEmp2ToGitHubUserRelProperties = FakeEmp2ToGitHubUserRelProperties()
+
+
+@dataclass(frozen=True)
+class FakeEmp2NodeProperties(CartographyNodeProperties):  # properties set on each FakeEmployee2 node
+    id: PropertyRef = PropertyRef('id')
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)  # taken from kwargs, not the data dict
+    email: PropertyRef = PropertyRef('email')
+    github_username: PropertyRef = PropertyRef('github_username')
+
+
+@dataclass(frozen=True)
+class FakeEmp2Schema(CartographyNodeSchema):  # FakeEmployee2 node with one outward rel to GitHubUser
+    label: str = 'FakeEmployee2'
+    properties: FakeEmp2NodeProperties = FakeEmp2NodeProperties()
+    other_relationships: OtherRelationships = OtherRelationships([
+        FakeEmp2ToGitHubUser(),
+    ])
diff --git a/tests/integration/cartography/graph/test_querybuilder_fuzzy_case_insensitive.py b/tests/integration/cartography/graph/test_querybuilder_fuzzy_case_insensitive.py
new file mode 100644
index 0000000000..d81b06d3d0
--- /dev/null
+++ b/tests/integration/cartography/graph/test_querybuilder_fuzzy_case_insensitive.py
@@ -0,0 +1,27 @@
+from cartography.client.core.tx import load
+from cartography.intel.github.users import load_organization_users
+from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_EMPLOYEE_DATA
+from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_ORG_DATA
+from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_USER_DATA
+from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
+from tests.integration.util import check_rels
+
+TEST_UPDATE_TAG = 123456789  # passed below as the update tag / lastupdated value
+
+
+def test_load_team_members_data_fuzzy(neo4j_session):
+    # Arrange: Load some fake GitHubUser nodes to the graph
+    load_organization_users(
+        neo4j_session,
+        FAKE_GITHUB_USER_DATA,
+        FAKE_GITHUB_ORG_DATA,
+        TEST_UPDATE_TAG,
+    )
+
+    # Act: Load the fake employees using the schema whose GitHubUser matcher is fuzzy and case-insensitive
+    load(neo4j_session, FakeEmp2Schema(), FAKE_EMPLOYEE_DATA, lastupdated=TEST_UPDATE_TAG)
+
+    # Assert we can create relationships using a fuzzy, case insensitive match
+    assert check_rels(neo4j_session, 'FakeEmployee2', 'email', 'GitHubUser', 'username', 'IDENTITY_GITHUB') == {
+        ('hjsimpson@example.com', 'HjsimPson'), ('mbsimpson@example.com', 'mbsimp-son'),
+    }
diff --git a/tests/unit/cartography/graph/test_querybuilder_simple.py b/tests/unit/cartography/graph/test_querybuilder_simple.py
index fcc5d4f191..937ce4a008 100644
--- a/tests/unit/cartography/graph/test_querybuilder_simple.py
+++ b/tests/unit/cartography/graph/test_querybuilder_simple.py
@@ -1,5 +1,6 @@
 from cartography.graph.querybuilder import build_ingestion_query
 from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers import FakeEmpSchema
+from tests.data.graph.querybuilder.sample_models.fake_emps_githubusers_fuzzy import FakeEmp2Schema
 from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema
 from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeWithSubResourceSchema
 from tests.unit.cartography.graph.helpers import remove_leading_whitespace_and_empty_lines
@@ -90,3 +91,35 @@ def test_build_ingestion_query_case_insensitive_match():
     actual_query = remove_leading_whitespace_and_empty_lines(query)
     expected_query = remove_leading_whitespace_and_empty_lines(expected)
     assert actual_query == expected_query
+
+
+def test_build_ingestion_query_fuzzy_case_insensitive():
+    query = build_ingestion_query(FakeEmp2Schema())  # schema whose rel matcher sets fuzzy_and_ignore_case=True
+
+    expected = """
+        UNWIND $DictList AS item
+            MERGE (i:FakeEmployee2{id: item.id})
+            ON CREATE SET i.firstseen = timestamp()
+            SET
+                i.lastupdated = $lastupdated,
+                i.email = item.email,
+                i.github_username = item.github_username
+
+            WITH i, item
+            CALL {
+                WITH i, item
+                OPTIONAL MATCH (n0:GitHubUser)
+                WHERE
+                    toLower(n0.username) CONTAINS toLower(item.github_username)
+                WITH i, item, n0 WHERE n0 IS NOT NULL
+                MERGE (i)-[r0:IDENTITY_GITHUB]->(n0)
+                ON CREATE SET r0.firstseen = timestamp()
+                SET
+                    r0.lastupdated = $lastupdated
+            }
+    """
+
+    # Assert: compare query outputs while ignoring leading whitespace and empty lines.
+    actual_query = remove_leading_whitespace_and_empty_lines(query)
+    expected_query = remove_leading_whitespace_and_empty_lines(expected)
+    assert actual_query == expected_query