diff --git a/cartography/intel/github/users.py b/cartography/intel/github/users.py index c17ed4be2f..3158e8720d 100644 --- a/cartography/intel/github/users.py +++ b/cartography/intel/github/users.py @@ -7,7 +7,10 @@ import neo4j +from cartography.client.core.tx import load from cartography.intel.github.util import fetch_all +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.github.users import GitHubOrganizationUserSchema, GitHubUnaffiliatedUserSchema from cartography.stats import get_stats_client from cartography.util import merge_module_sync_metadata from cartography.util import run_cleanup_job @@ -74,7 +77,7 @@ @timeit -def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: +def _get_users_raw(token: str, api_url: str, organization: str) -> Tuple[List[Dict], dict]: """ Retrieve a list of users from the given GitHub organization as described in https://docs.github.com/en/graphql/reference/objects#organizationmemberedge. @@ -84,7 +87,7 @@ def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], :return: A 2-tuple containing 1. a list of dicts representing users and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape of both + see tests.data.github.users.GITHUB_USER_DATA for shape of both """ users, org = fetch_all( token, @@ -98,7 +101,15 @@ def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], def _get_enterprise_owners_raw(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: """ - Function broken out for testing purposes. See 'get_enterprise_owners' for docs. + Retrieve a list of enterprise owners from the given GitHub organization as described in + https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge. + :param token: The Github API token as string. + :param api_url: The Github v4 API endpoint as string. 
+ :param organization: The name of the target Github organization as string. + :return: A 2-tuple containing + 1. a list of dicts representing users who are enterprise owners + 2. data on the owning GitHub organization + see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape """ owners, org = fetch_all( token, @@ -109,180 +120,93 @@ def _get_enterprise_owners_raw(token: str, api_url: str, organization: str) -> T ) return owners.edges, org - @timeit -def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], List[Dict], Dict]: +def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], List[Dict], Dict]: """ - Retrieve a list of enterprise owners from the given GitHub organization as described in - https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge. + Retrieve all users: + * organization users (users directly affiliated with an organization) + * unaffiliated users (users who, for example, are enterprise owners but not members of the target organization). + :param token: The Github API token as string. :param api_url: The Github v4 API endpoint as string. :param organization: The name of the target Github organization as string. - :return: A 3-tuple containing - 1. a list of dicts representing enterprise owners who are also users in the organization - 2. a list of dicts representing enterprise owners who are not users in the organization + :return: A 3-tuple containing + 1. a list of dicts representing users who are affiliated with the target org + see tests.data.github.users.GITHUB_USER_DATA for shape + 2. a list of dicts representing users who are not affiliated (e.g. enterprise owners who are not also in + the target org) — see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape 3. 
data on the owning GitHub organization - see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape """ + + users, org = _get_users_raw(token, api_url, organization) + users_dict = {} + for user in users: + processed_user = deepcopy(user['node']) + processed_user['role'] = user['role'] + processed_user['hasTwoFactorEnabled'] = user['hasTwoFactorEnabled'] + processed_user['MEMBER_OF'] = org['url'] + users_dict[processed_user['url']] = processed_user + owners, org = _get_enterprise_owners_raw(token, api_url, organization) - unaffiliated_owners = [] - affiliated_owners = [] + owners_dict = {} for owner in owners: + processed_owner = deepcopy(owner['node']) + processed_owner['isEnterpriseOwner'] = True if owner['organizationRole'] == 'UNAFFILIATED': - unaffiliated_owners.append(owner) + processed_owner['UNAFFILIATED'] = org['url'] else: - affiliated_owners.append(owner) - return affiliated_owners, unaffiliated_owners, org - - -def _mark_users_as_enterprise_owners( - user_data: List[Dict], - user_org_data: Dict, - affiliated_owner_data: List[Dict], - owner_org_data: Dict, -) -> list[Dict]: - """ - :param user_data: A list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA for shape. - :param user_org_data: A dict representing the organization for the user_data - see tests.data.github.users.GITHUB_ORG_DATA for shape. - :param affiliated_owner_data: A list of dicts representing affiliated enterprise owners - (owners who are also users in the org) - see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape. - :param owner_org_data: A dict representing the organization for the owner data - see tests.data.github.users.GITHUB_ORG_DATA for shape. - :return: A new list of user_data dicts updated with a new property, isEnterpriseOwner - """ + processed_owner['MEMBER_OF'] = org['url'] + owners_dict[processed_owner['url']] = processed_owner - # Guarding against accidental mixing of data from different orgs. 
Since user data and owner data are queried - # separately, there is at least a possibility of callers attempting to join data from different orgs. - if user_org_data['url'] != owner_org_data['url']: - raise ValueError(f"Organization URLs do not match: {user_org_data['url']} != {owner_org_data['url']}") - if user_org_data['login'] != owner_org_data['login']: - raise ValueError(f"Organization logins do not match: {user_org_data['login']} != {owner_org_data['login']}") + affiliated_users = [] # users affiliated with the target org + for url, user in users_dict.items(): + user['isEnterpriseOwner'] = url in owners_dict + affiliated_users.append(user) - result = [] - owner_urls = {entry['node']['url'] for entry in affiliated_owner_data} - for user in user_data: - user_copy = deepcopy(user) - user_copy['node']['isEnterpriseOwner'] = user['node']['url'] in owner_urls - result.append(user_copy) - return result + unaffiliated_users = [] # users not affiliated with the target org + for url, owner in owners_dict.items(): + if url not in users_dict: + unaffiliated_users.append(owner) - -@timeit -def load_organization_users( - neo4j_session: neo4j.Session, user_data: List[Dict], org_data: Dict, - update_tag: int, -) -> None: - query = """ - MERGE (org:GitHubOrganization{id: $OrgUrl}) - ON CREATE SET org.firstseen = timestamp() - SET org.username = $OrgLogin, - org.lastupdated = $UpdateTag - WITH org - - UNWIND $UserData as user - - MERGE (u:GitHubUser{id: user.node.url}) - ON CREATE SET u.firstseen = timestamp() - SET u.fullname = user.node.name, - u.username = user.node.login, - u.has_2fa_enabled = user.hasTwoFactorEnabled, - u.role = user.role, - u.is_site_admin = user.node.isSiteAdmin, - u.is_enterprise_owner = user.node.isEnterpriseOwner, - u.email = user.node.email, - u.company = user.node.company, - u.lastupdated = $UpdateTag - - MERGE (u)-[r:MEMBER_OF]->(org) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag - """ - neo4j_session.run( - query, - 
OrgUrl=org_data['url'], - OrgLogin=org_data['login'], - UserData=user_data, - UpdateTag=update_tag, - ) + return affiliated_users, unaffiliated_users, org @timeit -def load_unaffiliated_owners( - neo4j_session: neo4j.Session, owner_data: List[Dict], org_data: Dict, +def load_users( + neo4j_session: neo4j.Session, + node_schema: CartographyNodeSchema, + user_data: List[Dict], + org_data: Dict, update_tag: int, ) -> None: - """ - The owner_data here represents users who are enterprise owners but are not in the target org. - Note the subtle differences between what is loaded here and in load_organization_users: - 1. The user-org relationship is set to UNAFFILIATED - 2. 'role' is not set: these users have no role in the organization (i.e. they are neither 'MEMBER' nor 'ADMIN'). - 3. 'has_2fa_enabled' is not set (it is unavailable from the GraphQL query for these owners) - 4. 'is_enterprise_owner' is set to TRUE - - If the user does already exist in the graph (perhaps they are members of other orgs) then this merge will - update the user's node but leave 'role' and 'has_2fa_enabled' untouched. 
- """ - query = """ - MERGE (org:GitHubOrganization{id: $OrgUrl}) - ON CREATE SET org.firstseen = timestamp() - SET org.username = $OrgLogin, - org.lastupdated = $UpdateTag - WITH org - - UNWIND $UserData as user - - MERGE (u:GitHubUser{id: user.node.url}) - ON CREATE SET u.firstseen = timestamp() - SET u.fullname = user.node.name, - u.username = user.node.login, - u.is_site_admin = user.node.isSiteAdmin, - u.is_enterprise_owner = TRUE, - u.email = user.node.email, - u.company = user.node.company, - u.lastupdated = $UpdateTag - - MERGE (u)-[r:UNAFFILIATED]->(org) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag - """ - neo4j_session.run( - query, - OrgUrl=org_data['url'], - OrgLogin=org_data['login'], - UserData=owner_data, - UpdateTag=update_tag, + logger.info(f"Loading {len(user_data)} GitHub users to the graph") + load( + neo4j_session, + node_schema, + user_data, + lastupdated=update_tag, + org_url=org_data['url'], ) @timeit def sync( neo4j_session: neo4j.Session, - common_job_parameters: Dict[str, Any], + common_job_parameters: Dict, github_api_key: str, github_url: str, organization: str, ) -> None: logger.info("Syncing GitHub users") - user_data, user_org_data = get_users(github_api_key, github_url, organization) - affiliated_owner_data, unaffiliated_owner_data, owner_org_data = get_enterprise_owners( - github_api_key, - github_url, organization, - ) - processed_user_data = _mark_users_as_enterprise_owners( - user_data, user_org_data, - affiliated_owner_data, owner_org_data, - ) - load_organization_users(neo4j_session, processed_user_data, user_org_data, common_job_parameters['UPDATE_TAG']) - load_unaffiliated_owners( - neo4j_session, unaffiliated_owner_data, - owner_org_data, common_job_parameters['UPDATE_TAG'], - ) + affiliated_user_data, unaffiliated_user_data, org_data = get_users(github_api_key, github_url, organization) + load_users(neo4j_session, GitHubOrganizationUserSchema(), affiliated_user_data, org_data, 
common_job_parameters['UPDATE_TAG']) + load_users(neo4j_session, GitHubUnaffiliatedUserSchema(), unaffiliated_user_data, org_data, common_job_parameters['UPDATE_TAG']) + # no automated cleanup job because user has no sub_resource_relationship run_cleanup_job('github_users_cleanup.json', neo4j_session, common_job_parameters) merge_module_sync_metadata( neo4j_session, group_type='GitHubOrganization', - group_id=user_org_data['url'], + group_id=org_data['url'], synced_type='GitHubOrganization', update_tag=common_job_parameters['UPDATE_TAG'], stat_handler=stat_handler, diff --git a/cartography/models/github/users.py b/cartography/models/github/users.py new file mode 100644 index 0000000000..edf757de9f --- /dev/null +++ b/cartography/models/github/users.py @@ -0,0 +1,123 @@ +from dataclasses import dataclass +from typing import Optional + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + +""" +RE: Tenant relationship between GitHubUser and GitHubOrganization + +Note this relationship is implemented via 'other_relationships' and not via the 'sub_resource_relationship' +as might be expected. + +The 'sub_resource_relationship' typically describes the relationship of a node to its tenant (the org, project, or +other resource to which other nodes belong). An assumption of that relationship is that if the tenant goes +away, all nodes related to it should be cleaned up. 
+ +In GitHub, though the GitHubUser's tenant seems to be GitHubOrganization, users actually exist independently. There +is a concept of 'UNAFFILIATED' users, like Enterprise Owners who are related to an org even if they are not direct +members of it. You would not want them to be cleaned up, if an org goes away, and you could want them in your graph +even if they are not members of any org in the enterprise. + +To allow for this in the schema, this relationship is treated as any other node-to-node relationship, via +'other_relationships', instead of as the typical 'sub_resource_relationship'. + +RE: GitHubOrganizationUserSchema vs GitHubUnaffiliatedUserSchema + +As noted above, there are implicitly two types of users, those that are part of, or affiliated with, a target +GitHubOrganization, and those that are not part of it, or unaffiliated. Both are represented as GitHubUser nodes, +but there are two schemas below to allow for some differences between them, e.g., unaffiliated lack these properties: + * the 'role' property, because unaffiliated have no 'role' in the target org + * the 'has_2fa_enabled' property, because the GitHub api does not return it, for these users +The main importance of having two schemas is to allow the two sets of users to be loaded separately. If we are loading +an unaffiliated user, but the user already exists in the graph (perhaps they are members of another GitHub org, for +example), then loading the unaffiliated user will not blank out the 'role' and 'has_2fa_enabled' properties. 
+ +See: +* https://docs.github.com/en/graphql/reference/enums#roleinorganization +* https://docs.github.com/en/enterprise-cloud@latest/admin/managing-accounts-and-repositories/managing-users-in-your-enterprise/roles-in-an-enterprise#enterprise-owners +""" + + +@dataclass(frozen=True) +class GitHubOrganizationUserNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('url') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + fullname: PropertyRef = PropertyRef('name') + username: PropertyRef = PropertyRef('login', extra_index=True) + is_site_admin: PropertyRef = PropertyRef('isSiteAdmin') + is_enterprise_owner: PropertyRef = PropertyRef('isEnterpriseOwner') + email: PropertyRef = PropertyRef('email') + company: PropertyRef = PropertyRef('company') + has_2fa_enabled: PropertyRef = PropertyRef('hasTwoFactorEnabled') + role: PropertyRef = PropertyRef('role') + +@dataclass(frozen=True) +class GitHubUnaffiliatedUserNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('url') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + fullname: PropertyRef = PropertyRef('name') + username: PropertyRef = PropertyRef('login', extra_index=True) + is_site_admin: PropertyRef = PropertyRef('isSiteAdmin') + is_enterprise_owner: PropertyRef = PropertyRef('isEnterpriseOwner') + email: PropertyRef = PropertyRef('email') + company: PropertyRef = PropertyRef('company') + # 'has_2fa_enabled' not specified for unaffiliated; GitHub api does not return this property for them + # 'role' not specified for unaffiliated; they do not have a role in the target organization + +@dataclass(frozen=True) +class GitHubUserToOrganizationRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class GitHubUserMemberOfOrganizationRel(CartographyRelSchema): + target_node_label: str = 'GitHubOrganization' + target_node_matcher: 
TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('MEMBER_OF')}, + ) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "MEMBER_OF" + properties: GitHubUserToOrganizationRelProperties = GitHubUserToOrganizationRelProperties() + + +@dataclass(frozen=True) +class GitHubUserUnaffiliatedOrganizationRel(CartographyRelSchema): + target_node_label: str = 'GitHubOrganization' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('UNAFFILIATED')}, + ) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "UNAFFILIATED" + properties: GitHubUserToOrganizationRelProperties = GitHubUserToOrganizationRelProperties() + + +@dataclass(frozen=True) +class GitHubOrganizationUserSchema(CartographyNodeSchema): + label: str = 'GitHubUser' + properties: GitHubOrganizationUserNodeProperties = GitHubOrganizationUserNodeProperties() + other_relationships: OtherRelationships = OtherRelationships( + [ + GitHubUserMemberOfOrganizationRel(), + ], + ) + sub_resource_relationship = None + + +@dataclass(frozen=True) +class GitHubUnaffiliatedUserSchema(CartographyNodeSchema): + label: str = 'GitHubUser' + properties: GitHubUnaffiliatedUserNodeProperties = GitHubUnaffiliatedUserNodeProperties() + other_relationships: OtherRelationships = OtherRelationships( + [ + GitHubUserUnaffiliatedOrganizationRel(), + ], + ) + sub_resource_relationship = None \ No newline at end of file diff --git a/tests/integration/cartography/intel/github/test_users.py b/tests/integration/cartography/intel/github/test_users.py index 3239ac1a8b..7527fa345a 100644 --- a/tests/integration/cartography/intel/github/test_users.py +++ b/tests/integration/cartography/intel/github/test_users.py @@ -12,7 +12,7 @@ FAKE_API_KEY = 'asdf' -@patch.object(cartography.intel.github.users, 'get_users', return_value=GITHUB_USER_DATA) +@patch.object(cartography.intel.github.users, '_get_users_raw', return_value=GITHUB_USER_DATA) 
@patch.object(cartography.intel.github.users, '_get_enterprise_owners_raw', return_value=GITHUB_ENTERPRISE_OWNER_DATA) def test_sync(mock_owners, mock_users, neo4j_session): # Arrange