From 61eece5c9c0b268b44dcec8fc6103fa29fb9b764 Mon Sep 17 00:00:00 2001 From: Daniel Brauer Date: Tue, 12 Nov 2024 08:44:48 -0500 Subject: [PATCH] organization node schema and fixes for lint and tests --- ...json => github_org_and_users_cleanup.json} | 0 cartography/intel/github/users.py | 43 ++++++++++++++++--- cartography/models/github/orgs.py | 32 ++++++++++++++ cartography/models/github/users.py | 41 ++++++++---------- docs/root/modules/github/schema.md | 6 +++ .../test_querybuilder_case_insensitive.py | 6 ++- 6 files changed, 97 insertions(+), 31 deletions(-) rename cartography/data/jobs/cleanup/{github_users_cleanup.json => github_org_and_users_cleanup.json} (100%) create mode 100644 cartography/models/github/orgs.py diff --git a/cartography/data/jobs/cleanup/github_users_cleanup.json b/cartography/data/jobs/cleanup/github_org_and_users_cleanup.json similarity index 100% rename from cartography/data/jobs/cleanup/github_users_cleanup.json rename to cartography/data/jobs/cleanup/github_org_and_users_cleanup.json diff --git a/cartography/intel/github/users.py b/cartography/intel/github/users.py index a5fcfe97f1..4d82d879df 100644 --- a/cartography/intel/github/users.py +++ b/cartography/intel/github/users.py @@ -10,7 +10,9 @@ from cartography.client.core.tx import load from cartography.intel.github.util import fetch_all from cartography.models.core.nodes import CartographyNodeSchema -from cartography.models.github.users import GitHubOrganizationUserSchema, GitHubUnaffiliatedUserSchema +from cartography.models.github.orgs import GitHubOrganizationSchema +from cartography.models.github.users import GitHubOrganizationUserSchema +from cartography.models.github.users import GitHubUnaffiliatedUserSchema from cartography.stats import get_stats_client from cartography.util import merge_module_sync_metadata from cartography.util import run_cleanup_job @@ -120,6 +122,7 @@ def _get_enterprise_owners_raw(token: str, api_url: str, organization: str) -> T ) return 
owners.edges, org + @timeit def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], List[Dict], Dict]: """ @@ -158,12 +161,12 @@ def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], processed_owner['MEMBER_OF'] = org['url'] owners_dict[processed_owner['url']] = processed_owner - affiliated_users = [] # users affiliated with the target org + affiliated_users = [] # users affiliated with the target org for url, user in users_dict.items(): user['isEnterpriseOwner'] = url in owners_dict affiliated_users.append(user) - unaffiliated_users = [] # users not affiliated with the target org + unaffiliated_users = [] # users not affiliated with the target org for url, owner in owners_dict.items(): if url not in users_dict: unaffiliated_users.append(owner) @@ -189,6 +192,22 @@ def load_users( ) +@timeit +def load_organization( + neo4j_session: neo4j.Session, + node_schema: CartographyNodeSchema, + org_data: List[Dict[str, Any]], + update_tag: int, +) -> None: + logger.info(f"Loading {len(org_data)} GitHub organization to the graph") + load( + neo4j_session, + node_schema, + org_data, + lastupdated=update_tag, + ) + + @timeit def sync( neo4j_session: neo4j.Session, @@ -199,10 +218,20 @@ def sync( ) -> None: logger.info("Syncing GitHub users") affiliated_user_data, unaffiliated_user_data, org_data = get_users(github_api_key, github_url, organization) - load_users(neo4j_session, GitHubOrganizationUserSchema(), affiliated_user_data, org_data, common_job_parameters['UPDATE_TAG']) - load_users(neo4j_session, GitHubUnaffiliatedUserSchema(), unaffiliated_user_data, org_data, common_job_parameters['UPDATE_TAG']) - # no automated cleanup job because user has no sub_resource_relationship - run_cleanup_job('github_users_cleanup.json', neo4j_session, common_job_parameters) + load_organization( + neo4j_session, GitHubOrganizationSchema(), [org_data], + common_job_parameters['UPDATE_TAG'], + ) + load_users( + neo4j_session, 
GitHubOrganizationUserSchema(), affiliated_user_data, org_data, + common_job_parameters['UPDATE_TAG'], + ) + load_users( + neo4j_session, GitHubUnaffiliatedUserSchema(), unaffiliated_user_data, org_data, + common_job_parameters['UPDATE_TAG'], + ) + # no automated cleanup job for users because user node has no sub_resource_relationship + run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters) merge_module_sync_metadata( neo4j_session, group_type='GitHubOrganization', diff --git a/cartography/models/github/orgs.py b/cartography/models/github/orgs.py new file mode 100644 index 0000000000..46dfa1caa3 --- /dev/null +++ b/cartography/models/github/orgs.py @@ -0,0 +1,32 @@ +""" +This schema does not handle the org's relationships. Those are handled by other schemas, for example: +* GitHubTeamSchema defines (GitHubOrganization)-[RESOURCE]->(GitHubTeam) +* The GitHubUser schemas define (GitHubUser)-[MEMBER_OF|UNAFFILIATED]->(GitHubOrganization) +(There may be others; these are just two examples.) 
+""" +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties + + +@dataclass(frozen=True) +class GitHubOrganizationNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('url') + username: PropertyRef = PropertyRef('login', extra_index=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class GitHubUserToOrganizationRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class GitHubOrganizationSchema(CartographyNodeSchema): + label: str = 'GitHubOrganization' + properties: GitHubOrganizationNodeProperties = GitHubOrganizationNodeProperties() + other_relationships = None + sub_resource_relationship = None diff --git a/cartography/models/github/users.py b/cartography/models/github/users.py index edf757de9f..8da967b061 100644 --- a/cartography/models/github/users.py +++ b/cartography/models/github/users.py @@ -1,20 +1,7 @@ -from dataclasses import dataclass -from typing import Optional - -from cartography.models.core.common import PropertyRef -from cartography.models.core.nodes import CartographyNodeProperties -from cartography.models.core.nodes import CartographyNodeSchema -from cartography.models.core.relationships import CartographyRelProperties -from cartography.models.core.relationships import CartographyRelSchema -from cartography.models.core.relationships import LinkDirection -from cartography.models.core.relationships import make_target_node_matcher -from cartography.models.core.relationships import OtherRelationships -from cartography.models.core.relationships import TargetNodeMatcher - """ RE: Tenant relationship between GitHubUser and 
GitHubOrganization -Note this relationship is implemented via 'other_relationships' and not via the 'sub_resource_relationship' +Note this relationship is implemented via 'other_relationships' and not via the 'sub_resource_relationship' as might be expected. The 'sub_resource_relationship' typically describes the relationship of a node to its tenant (the org, project, or @@ -22,9 +9,10 @@ away, all nodes related to it should be cleaned up. In GitHub, though the GitHubUser's tenant seems to be GitHubOrganization, users actually exist independently. There -is a concept of 'UNAFFILIATED' users, like Enterprise Owners who are related to an org even if they are not direct -members of it. You would not want them to be cleaned up, if an org goes away, and you could want them in your graph -even if they are not members of any org in the enterprise. +is a concept of 'UNAFFILIATED' users (https://docs.github.com/en/graphql/reference/enums#roleinorganization) like +Enterprise Owners who are related to an org even if they are not direct members of it. You would not want them to be +cleaned up, if an org goes away, and you could want them in your graph even if they are not members of any org in +the enterprise. To allow for this in the schema, this relationship is treated as any other node-to-node relationship, via 'other_relationships', instead of as the typical 'sub_resource_relationship'. @@ -39,11 +27,18 @@ The main importance of having two schemas is to allow the two sets of users to be loaded separately. If we are loading an unaffiliated user, but the user already exists in the graph (perhaps they are members of another GitHub orgs for example), then loading the unaffiliated user will not blank out the 'role' and 'has_2fa_enabled' properties. 
- -See: -* https://docs.github.com/en/graphql/reference/enums#roleinorganization -* https://docs.github.com/en/enterprise-cloud@latest/admin/managing-accounts-and-repositories/managing-users-in-your-enterprise/roles-in-an-enterprise#enterprise-owners """ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher @dataclass(frozen=True) @@ -59,6 +54,7 @@ class GitHubOrganizationUserNodeProperties(CartographyNodeProperties): has_2fa_enabled: PropertyRef = PropertyRef('hasTwoFactorEnabled') role: PropertyRef = PropertyRef('role') + @dataclass(frozen=True) class GitHubUnaffiliatedUserNodeProperties(CartographyNodeProperties): id: PropertyRef = PropertyRef('url') @@ -72,6 +68,7 @@ class GitHubUnaffiliatedUserNodeProperties(CartographyNodeProperties): # 'has_2fa_enabled' not specified for unaffiliated; GitHub api does not return this property for them # 'role' not specified for unaffiliated; they do not have a role in the target organization + @dataclass(frozen=True) class GitHubUserToOrganizationRelProperties(CartographyRelProperties): lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) @@ -120,4 +117,4 @@ class GitHubUnaffiliatedUserSchema(CartographyNodeSchema): GitHubUserUnaffiliatedOrganizationRel(), ], ) - sub_resource_relationship = None \ No newline at end of file + sub_resource_relationship = None diff --git a/docs/root/modules/github/schema.md 
b/docs/root/modules/github/schema.md index fcbde3f81c..b316f1e1d9 100644 --- a/docs/root/modules/github/schema.md +++ b/docs/root/modules/github/schema.md @@ -87,6 +87,12 @@ Representation of a single GitHubOrganization [organization object](https://deve (GitHubOrganization)-[RESOURCE]->(GitHubTeam) ``` +- GitHubUsers are members of an organization. In some cases there may be a user who is "unaffiliated" with an org, for example if the user is an enterprise owner of, but not a member of, the org. + + ``` + (GitHubUser)-[MEMBER_OF|UNAFFILIATED]->(GitHubOrganization) + ``` + ### GitHubTeam diff --git a/tests/integration/cartography/graph/test_querybuilder_case_insensitive.py b/tests/integration/cartography/graph/test_querybuilder_case_insensitive.py index 7a47110377..70c8b97676 100644 --- a/tests/integration/cartography/graph/test_querybuilder_case_insensitive.py +++ b/tests/integration/cartography/graph/test_querybuilder_case_insensitive.py @@ -1,5 +1,6 @@ from cartography.client.core.tx import load -from cartography.intel.github.users import load_organization_users +from cartography.intel.github.users import load_users +from cartography.models.github.users import GitHubOrganizationUserSchema from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_EMPLOYEE_DATA from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_ORG_DATA from tests.data.graph.querybuilder.sample_data.case_insensitive_prop_ref import FAKE_GITHUB_USER_DATA @@ -11,8 +12,9 @@ def test_load_team_members_data(neo4j_session): # Arrange: Load some fake GitHubUser nodes to the graph - load_organization_users( + load_users( neo4j_session, + GitHubOrganizationUserSchema(), FAKE_GITHUB_USER_DATA, FAKE_GITHUB_ORG_DATA, TEST_UPDATE_TAG,