diff --git a/cartography/data/jobs/cleanup/github_users_cleanup.json b/cartography/data/jobs/cleanup/github_users_cleanup.json index 4419d8d650..7c7f6ff403 100644 --- a/cartography/data/jobs/cleanup/github_users_cleanup.json +++ b/cartography/data/jobs/cleanup/github_users_cleanup.json @@ -18,6 +18,11 @@ "query": "MATCH (:GitHubUser)-[r:MEMBER_OF]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", "iterative": true, "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:UNAFFILIATED]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 }], "name": "cleanup GitHub users data" } diff --git a/cartography/intel/github/users.py b/cartography/intel/github/users.py index 2107b26800..c17ed4be2f 100644 --- a/cartography/intel/github/users.py +++ b/cartography/intel/github/users.py @@ -1,4 +1,5 @@ import logging +from copy import deepcopy from typing import Any from typing import Dict from typing import List @@ -44,17 +45,46 @@ } """ +GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL = """ + query($login: String!, $cursor: String) { + organization(login: $login) + { + url + login + enterpriseOwners(first:100, after: $cursor){ + edges { + node { + url + login + name + isSiteAdmin + email + company + } + organizationRole + } + pageInfo{ + endCursor + hasNextPage + } + } + } + } + """ + @timeit -def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: +def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: """ Retrieve a list of users from the given GitHub organization as described in https://docs.github.com/en/graphql/reference/objects#organizationmemberedge. :param token: The Github API token as string. :param api_url: The Github v4 API endpoint as string. :param organization: The name of the target Github organization as string. - :return: A 2-tuple containing 1. a list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA - for shape, and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape. + :return: A 2-tuple containing + 1. a list of dicts representing users and + 2. data on the owning GitHub organization + see tests.data.github.users.GITHUB_ORG_DATA for shape of both """ users, org = fetch_all( token, @@ -66,6 +96,78 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: return users.edges, org +def _get_enterprise_owners_raw(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]: + """ + Function broken out for testing purposes. See 'get_enterprise_owners' for docs. + """ + owners, org = fetch_all( + token, + api_url, + organization, + GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL, + 'enterpriseOwners', + ) + return owners.edges, org + + +@timeit +def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], List[Dict], Dict]: + """ + Retrieve a list of enterprise owners from the given GitHub organization as described in + https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge. + :param token: The Github API token as string. + :param api_url: The Github v4 API endpoint as string. + :param organization: The name of the target Github organization as string. + :return: A 3-tuple containing + 1. a list of dicts representing enterprise owners who are also users in the organization + 2. a list of dicts representing enterprise owners who are not users in the organization + 3. data on the owning GitHub organization + see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape + """ + owners, org = _get_enterprise_owners_raw(token, api_url, organization) + unaffiliated_owners = [] + affiliated_owners = [] + for owner in owners: + if owner['organizationRole'] == 'UNAFFILIATED': + unaffiliated_owners.append(owner) + else: + affiliated_owners.append(owner) + return affiliated_owners, unaffiliated_owners, org + + +def _mark_users_as_enterprise_owners( + user_data: List[Dict], + user_org_data: Dict, + affiliated_owner_data: List[Dict], + owner_org_data: Dict, +) -> list[Dict]: + """ + :param user_data: A list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA for shape. + :param user_org_data: A dict representing the organization for the user_data + see tests.data.github.users.GITHUB_ORG_DATA for shape. + :param affiliated_owner_data: A list of dicts representing affiliated enterprise owners + (owners who are also users in the org) - see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape. + :param owner_org_data: A dict representing the organization for the owner data + see tests.data.github.users.GITHUB_ORG_DATA for shape. + :return: A new list of user_data dicts updated with a new property, isEnterpriseOwner + """ + + # Guarding against accidental mixing of data from different orgs. Since user data and owner data are queried + # separately, there is at least a possibility of callers attempting to join data from different orgs. + if user_org_data['url'] != owner_org_data['url']: + raise ValueError(f"Organization URLs do not match: {user_org_data['url']} != {owner_org_data['url']}") + if user_org_data['login'] != owner_org_data['login']: + raise ValueError(f"Organization logins do not match: {user_org_data['login']} != {owner_org_data['login']}") + + result = [] + owner_urls = {entry['node']['url'] for entry in affiliated_owner_data} + for user in user_data: + user_copy = deepcopy(user) + user_copy['node']['isEnterpriseOwner'] = user['node']['url'] in owner_urls + result.append(user_copy) + return result + + @timeit def load_organization_users( neo4j_session: neo4j.Session, user_data: List[Dict], org_data: Dict, @@ -87,6 +189,7 @@ def load_organization_users( u.has_2fa_enabled = user.hasTwoFactorEnabled, u.role = user.role, u.is_site_admin = user.node.isSiteAdmin, + u.is_enterprise_owner = user.node.isEnterpriseOwner, u.email = user.node.email, u.company = user.node.company, u.lastupdated = $UpdateTag @@ -104,6 +207,54 @@ def load_organization_users( ) +@timeit +def load_unaffiliated_owners( + neo4j_session: neo4j.Session, owner_data: List[Dict], org_data: Dict, + update_tag: int, +) -> None: + """ + The owner_data here represents users who are enterprise owners but are not in the target org. + Note the subtle differences between what is loaded here and in load_organization_users: + 1. The user-org relationship is set to UNAFFILIATED + 2. 'role' is not set: these users have no role in the organization (i.e. they are neither 'MEMBER' nor 'ADMIN'). + 3. 'has_2fa_enabled' is not set (it is unavailable from the GraphQL query for these owners) + 4. 'is_enterprise_owner' is set to TRUE + + If the user does already exist in the graph (perhaps they are members of other orgs) then this merge will + update the user's node but leave 'role' and 'has_2fa_enabled' untouched. + """ + query = """ + MERGE (org:GitHubOrganization{id: $OrgUrl}) + ON CREATE SET org.firstseen = timestamp() + SET org.username = $OrgLogin, + org.lastupdated = $UpdateTag + WITH org + + UNWIND $UserData as user + + MERGE (u:GitHubUser{id: user.node.url}) + ON CREATE SET u.firstseen = timestamp() + SET u.fullname = user.node.name, + u.username = user.node.login, + u.is_site_admin = user.node.isSiteAdmin, + u.is_enterprise_owner = TRUE, + u.email = user.node.email, + u.company = user.node.company, + u.lastupdated = $UpdateTag + + MERGE (u)-[r:UNAFFILIATED]->(org) + ON CREATE SET r.firstseen = timestamp() + SET r.lastupdated = $UpdateTag + """ + neo4j_session.run( + query, + OrgUrl=org_data['url'], + OrgLogin=org_data['login'], + UserData=owner_data, + UpdateTag=update_tag, + ) + + @timeit def sync( neo4j_session: neo4j.Session, @@ -113,13 +264,25 @@ def sync( organization: str, ) -> None: logger.info("Syncing GitHub users") - user_data, org_data = get(github_api_key, github_url, organization) - load_organization_users(neo4j_session, user_data, org_data, common_job_parameters['UPDATE_TAG']) + user_data, user_org_data = get_users(github_api_key, github_url, organization) + affiliated_owner_data, unaffiliated_owner_data, owner_org_data = get_enterprise_owners( + github_api_key, + github_url, organization, + ) + processed_user_data = _mark_users_as_enterprise_owners( + user_data, user_org_data, + affiliated_owner_data, owner_org_data, + ) + load_organization_users(neo4j_session, processed_user_data, user_org_data, common_job_parameters['UPDATE_TAG']) + load_unaffiliated_owners( + neo4j_session, unaffiliated_owner_data, + owner_org_data, common_job_parameters['UPDATE_TAG'], + ) run_cleanup_job('github_users_cleanup.json', neo4j_session, common_job_parameters) merge_module_sync_metadata( neo4j_session, group_type='GitHubOrganization', - group_id=org_data['url'], + group_id=user_org_data['url'], synced_type='GitHubOrganization', update_tag=common_job_parameters['UPDATE_TAG'], stat_handler=stat_handler, diff --git a/docs/root/modules/github/schema.md b/docs/root/modules/github/schema.md index 027ac514ce..fcbde3f81c 100644 --- a/docs/root/modules/github/schema.md +++ b/docs/root/modules/github/schema.md @@ -131,10 +131,10 @@ Representation of a single GitHubUser [user object](https://developer.github.com | has_2fa_enabled | Whether the user has 2-factor authentication enabled | | role | Either 'ADMIN' (denoting that the user is an owner of a Github organization) or 'MEMBER' | | is_site_admin | Whether the user is a site admin | -| permission | Only present if the user is an [outside collaborator](https://docs.github.com/en/graphql/reference/objects#repositorycollaboratorconnection) of this repo. -`permission` is either ADMIN, MAINTAIN, READ, TRIAGE, or WRITE ([ref](https://docs.github.com/en/graphql/reference/enums#repositorypermission)). -| email | The user's publicly visible profile email. -| company | The user's public profile company. +| is_enterprise_owner | Whether the user is an [enterprise owner](https://docs.github.com/en/enterprise-cloud@latest/admin/managing-accounts-and-repositories/managing-users-in-your-enterprise/roles-in-an-enterprise#enterprise-owners) | +| permission | Only present if the user is an [outside collaborator](https://docs.github.com/en/graphql/reference/objects#repositorycollaboratorconnection) of this repo. `permission` is either ADMIN, MAINTAIN, READ, TRIAGE, or WRITE ([ref](https://docs.github.com/en/graphql/reference/enums#repositorypermission)). | +| email | The user's publicly visible profile email. | +| company | The user's public profile company. | #### Relationships @@ -152,6 +152,12 @@ WRITE, MAINTAIN, TRIAGE, and READ ([Reference](https://docs.github.com/en/graphq (GitHubUser)-[:OUTSIDE_COLLAB_{ACTION}]->(GitHubRepository) ``` +- GitHubUsers are members of an organization. In some cases there may be a user who is "unaffiliated" with an org, for example if the user is an enterprise owner, but not member of, the org. + + ``` + (GitHubUser)-[MEMBER_OF|UNAFFILIATED]->(GitHubOrganization) + ``` + ### GitHubBranch Representation of a single GitHubBranch [ref object](https://developer.github.com/v4/object/ref). This node contains minimal data for a repository branch. diff --git a/tests/data/github/users.py b/tests/data/github/users.py index b421bd44e2..c1a1842932 100644 --- a/tests/data/github/users.py +++ b/tests/data/github/users.py @@ -1,30 +1,88 @@ -GITHUB_USER_DATA = [ - { - 'hasTwoFactorEnabled': None, - 'node': { - 'url': 'https://example.com/hjsimpson', - 'login': 'hjsimpson', - 'name': 'Homer Simpson', - 'isSiteAdmin': False, - 'email': 'hjsimpson@example.com', - 'company': 'Springfield Nuclear Power Plant', - }, - 'role': 'MEMBER', - }, { - 'hasTwoFactorEnabled': None, - 'node': { - 'url': 'https://example.com/mbsimpson', - 'login': 'mbsimpson', - 'name': 'Marge Simpson', - 'isSiteAdmin': False, - 'email': 'mbsimpson@example.com', - 'company': 'Simpson Residence', - }, - 'role': 'ADMIN', - }, -] - GITHUB_ORG_DATA = { 'url': 'https://example.com/my_org', 'login': 'my_org', } + + +GITHUB_USER_DATA = ( + [ + { + 'hasTwoFactorEnabled': None, + 'node': { + 'url': 'https://example.com/hjsimpson', + 'login': 'hjsimpson', + 'name': 'Homer Simpson', + 'isSiteAdmin': False, + 'email': 'hjsimpson@example.com', + 'company': 'Springfield Nuclear Power Plant', + }, + 'role': 'MEMBER', + }, { + 'hasTwoFactorEnabled': None, + 'node': { + 'url': 'https://example.com/lmsimpson', + 'login': 'lmsimpson', + 'name': 'Lisa Simpson', + 'isSiteAdmin': False, + 'email': 'lmsimpson@example.com', + 'company': 'Simpson Residence', + }, + 'role': 'MEMBER', + }, { + 'hasTwoFactorEnabled': True, + 'node': { + 'url': 'https://example.com/mbsimpson', + 'login': 'mbsimpson', + 'name': 'Marge Simpson', + 'isSiteAdmin': False, + 'email': 'mbsimpson@example.com', + 'company': 'Simpson Residence', + }, + 'role': 'ADMIN', + }, + ], + GITHUB_ORG_DATA, +) + +# Subtle differences between owner data and user data: +# 1. owner data does not include a `hasTwoFactorEnabled` field (it in unavailable in the GraphQL query for these owners) +# 2. an `organizationRole` field instead of a `role` field. In owner data, membership within an org is not assumed, so +# there is an 'UNAFFILIATED' value for owners of an org who are not also members of it. (Otherwise the 'OWNER' +# organizationRole matches the 'ADMIN' role in the user data, and the 'DIRECT_MEMBER' organizationRole matches +# the 'MEMBER' role.) +GITHUB_ENTERPRISE_OWNER_DATA = ( + [ + { + 'node': { + 'url': 'https://example.com/kbroflovski', + 'login': 'kbroflovski', + 'name': 'Kyle Broflovski', + 'isSiteAdmin': False, + 'email': 'kbroflovski@example.com', + 'company': 'South Park Elementary', + }, + 'organizationRole': 'UNAFFILIATED', + }, { + 'node': { + 'url': 'https://example.com/mbsimpson', + 'login': 'mbsimpson', + 'name': 'Marge Simpson', + 'isSiteAdmin': False, + 'email': 'mbsimpson@example.com', + 'company': 'Simpson Residence', + }, + 'organizationRole': 'OWNER', + }, { + 'node': { + 'url': 'https://example.com/lmsimpson', + 'login': 'lmsimpson', + 'name': 'Lisa Simpson', + 'isSiteAdmin': False, + 'email': 'lmsimpson@example.com', + 'company': 'Simpson Residence', + }, + 'organizationRole': 'DIRECT_MEMBER', + }, + ], + GITHUB_ORG_DATA, +) diff --git a/tests/integration/cartography/intel/github/test_users.py b/tests/integration/cartography/intel/github/test_users.py index 03766700e1..3239ac1a8b 100644 --- a/tests/integration/cartography/intel/github/test_users.py +++ b/tests/integration/cartography/intel/github/test_users.py @@ -1,17 +1,34 @@ +from unittest.mock import patch + import cartography.intel.github.users -import tests.data.github.users +from tests.data.github.users import GITHUB_ENTERPRISE_OWNER_DATA +from tests.data.github.users import GITHUB_ORG_DATA +from tests.data.github.users import GITHUB_USER_DATA TEST_UPDATE_TAG = 123456789 +TEST_JOB_PARAMS = {'UPDATE_TAG': TEST_UPDATE_TAG} +TEST_GITHUB_URL = GITHUB_ORG_DATA['url'] +TEST_GITHUB_ORG = GITHUB_ORG_DATA['login'] +FAKE_API_KEY = 'asdf' + +@patch.object(cartography.intel.github.users, 'get_users', return_value=GITHUB_USER_DATA) +@patch.object(cartography.intel.github.users, '_get_enterprise_owners_raw', return_value=GITHUB_ENTERPRISE_OWNER_DATA) +def test_sync(mock_owners, mock_users, neo4j_session): + # Arrange + # No need to 'arrange' data here. The patched functions return all the data needed. -def test_load_github_organization_users(neo4j_session): - cartography.intel.github.users.load_organization_users( + # Act + cartography.intel.github.users.sync( neo4j_session, - tests.data.github.users.GITHUB_USER_DATA, - tests.data.github.users.GITHUB_ORG_DATA, - TEST_UPDATE_TAG, + TEST_JOB_PARAMS, + FAKE_API_KEY, + TEST_GITHUB_URL, + TEST_GITHUB_ORG, ) + # Assert + # Ensure users got loaded nodes = neo4j_session.run( """ @@ -20,7 +37,9 @@ def test_load_github_organization_users(neo4j_session): ) expected_nodes = { ("https://example.com/hjsimpson", 'MEMBER'), + ("https://example.com/lmsimpson", 'MEMBER'), ("https://example.com/mbsimpson", 'ADMIN'), + ("https://example.com/kbroflovski", None), } actual_nodes = { ( @@ -33,23 +52,74 @@ def test_load_github_organization_users(neo4j_session): # Ensure users are connected to the expected organization nodes = neo4j_session.run( """ - MATCH(user:GitHubUser)-[:MEMBER_OF]->(org:GitHubOrganization) - RETURN user.id, org.id + MATCH(user:GitHubUser)-[r]->(org:GitHubOrganization) + RETURN user.id, type(r), org.id """, ) actual_nodes = { ( n['user.id'], + n['type(r)'], n['org.id'], ) for n in nodes } expected_nodes = { ( 'https://example.com/hjsimpson', + 'MEMBER_OF', + 'https://example.com/my_org', + ), ( + 'https://example.com/lmsimpson', + 'MEMBER_OF', 'https://example.com/my_org', ), ( 'https://example.com/mbsimpson', + 'MEMBER_OF', + 'https://example.com/my_org', + ), ( + 'https://example.com/kbroflovski', + 'UNAFFILIATED', 'https://example.com/my_org', ), } assert actual_nodes == expected_nodes + + # Ensure enterprise owners are identified + nodes = neo4j_session.run( + """ + MATCH (g:GitHubUser) RETURN g.id, g.is_enterprise_owner + """, + ) + expected_nodes = { + ("https://example.com/hjsimpson", False), + ("https://example.com/lmsimpson", True), + ("https://example.com/mbsimpson", True), + ("https://example.com/kbroflovski", True), + } + actual_nodes = { + ( + n['g.id'], + n['g.is_enterprise_owner'], + ) for n in nodes + } + assert actual_nodes == expected_nodes + + # Ensure hasTwoFactorEnabled has not been improperly overwritten for enterprise owners + nodes = neo4j_session.run( + """ + MATCH (g:GitHubUser) RETURN g.id, g.has_2fa_enabled + """, + ) + expected_nodes = { + ("https://example.com/hjsimpson", None), + ("https://example.com/lmsimpson", None), + ("https://example.com/mbsimpson", True), + ("https://example.com/kbroflovski", None), + } + actual_nodes = { + ( + n['g.id'], + n['g.has_2fa_enabled'], + ) for n in nodes + } + assert actual_nodes == expected_nodes