Skip to content

Commit

Permalink
Merge pull request #1 from etsy/gh_identify_enterprise_owners
Browse files Browse the repository at this point in the history
identifying enterprise owners in github
  • Loading branch information
danbrauer authored Oct 31, 2024
2 parents 5fcbe83 + 67a55bf commit f9c1254
Show file tree
Hide file tree
Showing 5 changed files with 346 additions and 44 deletions.
5 changes: 5 additions & 0 deletions cartography/data/jobs/cleanup/github_users_cleanup.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
"query": "MATCH (:GitHubUser)-[r:MEMBER_OF]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)",
"iterative": true,
"iterationsize": 100
},
{
"query": "MATCH (:GitHubUser)-[r:UNAFFILIATED]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)",
"iterative": true,
"iterationsize": 100
}],
"name": "cleanup GitHub users data"
}
175 changes: 169 additions & 6 deletions cartography/intel/github/users.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from copy import deepcopy
from typing import Any
from typing import Dict
from typing import List
Expand Down Expand Up @@ -44,17 +45,46 @@
}
"""

# Paginated GraphQL query for the enterprise owners of a single organization (looked up by `$login`).
# Each page requests up to 100 `enterpriseOwners` edges; `$cursor` is passed as the `after` argument and
# `pageInfo` (endCursor / hasNextPage) is returned so the caller can request the next page.
# Every edge carries the owner's user `node` plus an `organizationRole`; get_enterprise_owners compares that
# role against 'UNAFFILIATED' to separate owners who are not in the org from those who are.
GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL = """
query($login: String!, $cursor: String) {
organization(login: $login)
{
url
login
enterpriseOwners(first:100, after: $cursor){
edges {
node {
url
login
name
isSiteAdmin
email
company
}
organizationRole
}
pageInfo{
endCursor
hasNextPage
}
}
}
}
"""


@timeit
def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
"""
Retrieve a list of users from the given GitHub organization as described in
https://docs.github.com/en/graphql/reference/objects#organizationmemberedge.
:param token: The Github API token as string.
:param api_url: The Github v4 API endpoint as string.
:param organization: The name of the target Github organization as string.
:return: A 2-tuple containing 1. a list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA
for shape, and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape.
:return: A 2-tuple containing
1. a list of dicts representing users and
2. data on the owning GitHub organization
see tests.data.github.users.GITHUB_ORG_DATA for shape of both
"""
users, org = fetch_all(
token,
Expand All @@ -66,6 +96,78 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
return users.edges, org


def _get_enterprise_owners_raw(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
    """
    Run the enterprise-owners GraphQL query through fetch_all and hand back the owner edges together with
    the organization data, with no further processing.

    Kept separate from 'get_enterprise_owners' for testing purposes; see that function for parameter docs.
    """
    paginated_owners, org_data = fetch_all(
        token, api_url, organization,
        GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL, 'enterpriseOwners',
    )
    return paginated_owners.edges, org_data


@timeit
def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], List[Dict], Dict]:
    """
    Fetch the enterprise owners of a GitHub organization and split them by org membership. Edge shape is
    described in https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge.
    :param token: The Github API token as string.
    :param api_url: The Github v4 API endpoint as string.
    :param organization: The name of the target Github organization as string.
    :return: A 3-tuple containing
        1. a list of dicts representing enterprise owners who are also users in the organization
        2. a list of dicts representing enterprise owners who are not users in the organization
        3. data on the owning GitHub organization
        see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
    """
    owner_edges, org_data = _get_enterprise_owners_raw(token, api_url, organization)

    # Single pass: bucket each edge by whether its organizationRole is exactly 'UNAFFILIATED'.
    # Any other role value counts as affiliated.
    by_unaffiliated: Dict[bool, List[Dict]] = {True: [], False: []}
    for edge in owner_edges:
        by_unaffiliated[edge['organizationRole'] == 'UNAFFILIATED'].append(edge)

    return by_unaffiliated[False], by_unaffiliated[True], org_data


def _mark_users_as_enterprise_owners(
    user_data: List[Dict],
    user_org_data: Dict,
    affiliated_owner_data: List[Dict],
    owner_org_data: Dict,
) -> List[Dict]:
    """
    Build a copy of user_data in which every user's 'node' carries a boolean 'isEnterpriseOwner'.
    The input lists and dicts are not modified.

    :param user_data: A list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA for shape.
    :param user_org_data: A dict representing the organization for the user_data
        see tests.data.github.users.GITHUB_ORG_DATA for shape.
    :param affiliated_owner_data: A list of dicts representing affiliated enterprise owners
        (owners who are also users in the org) - see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape.
    :param owner_org_data: A dict representing the organization for the owner data
        see tests.data.github.users.GITHUB_ORG_DATA for shape.
    :return: A new list of user_data dicts updated with a new property, isEnterpriseOwner
    :raises ValueError: if the 'url' or 'login' of user_org_data and owner_org_data differ.
    """
    # Guarding against accidental mixing of data from different orgs. Since user data and owner data are queried
    # separately, there is at least a possibility of callers attempting to join data from different orgs.
    if user_org_data['url'] != owner_org_data['url']:
        raise ValueError(f"Organization URLs do not match: {user_org_data['url']} != {owner_org_data['url']}")
    if user_org_data['login'] != owner_org_data['login']:
        raise ValueError(f"Organization logins do not match: {user_org_data['login']} != {owner_org_data['login']}")

    # Users and owners are matched on their profile URL; a set keeps each membership test O(1).
    owner_urls = {entry['node']['url'] for entry in affiliated_owner_data}

    result = []
    for user in user_data:
        # Deep copy so that adding the flag to the nested 'node' dict never leaks into the caller's data.
        user_copy = deepcopy(user)
        user_copy['node']['isEnterpriseOwner'] = user['node']['url'] in owner_urls
        result.append(user_copy)
    return result


@timeit
def load_organization_users(
neo4j_session: neo4j.Session, user_data: List[Dict], org_data: Dict,
Expand All @@ -87,6 +189,7 @@ def load_organization_users(
u.has_2fa_enabled = user.hasTwoFactorEnabled,
u.role = user.role,
u.is_site_admin = user.node.isSiteAdmin,
u.is_enterprise_owner = user.node.isEnterpriseOwner,
u.email = user.node.email,
u.company = user.node.company,
u.lastupdated = $UpdateTag
Expand All @@ -104,6 +207,54 @@ def load_organization_users(
)


@timeit
def load_unaffiliated_owners(
    neo4j_session: neo4j.Session, owner_data: List[Dict], org_data: Dict,
    update_tag: int,
) -> None:
    """
    Merge enterprise owners who are NOT in the target org into the graph, along with the org node itself.
    How this differs from load_organization_users:
    1. The user-org relationship created is UNAFFILIATED
    2. 'role' is never written: these users hold no role in the organization (neither 'MEMBER' nor 'ADMIN').
    3. 'has_2fa_enabled' is never written (the GraphQL query does not expose it for these owners)
    4. 'is_enterprise_owner' is always written as TRUE
    When a user node already exists (for instance because the user is a member of another org), the merge
    refreshes the properties listed in the query and leaves 'role' and 'has_2fa_enabled' as they were.
    """
    ingest_unaffiliated_owners = """
MERGE (org:GitHubOrganization{id: $OrgUrl})
ON CREATE SET org.firstseen = timestamp()
SET org.username = $OrgLogin,
org.lastupdated = $UpdateTag
WITH org
UNWIND $UserData as user
MERGE (u:GitHubUser{id: user.node.url})
ON CREATE SET u.firstseen = timestamp()
SET u.fullname = user.node.name,
u.username = user.node.login,
u.is_site_admin = user.node.isSiteAdmin,
u.is_enterprise_owner = TRUE,
u.email = user.node.email,
u.company = user.node.company,
u.lastupdated = $UpdateTag
MERGE (u)-[r:UNAFFILIATED]->(org)
ON CREATE SET r.firstseen = timestamp()
SET r.lastupdated = $UpdateTag
"""
    # Keys must match the $-parameters referenced in the Cypher above.
    query_params = {
        'OrgUrl': org_data['url'],
        'OrgLogin': org_data['login'],
        'UserData': owner_data,
        'UpdateTag': update_tag,
    }
    neo4j_session.run(ingest_unaffiliated_owners, **query_params)


@timeit
def sync(
neo4j_session: neo4j.Session,
Expand All @@ -113,13 +264,25 @@ def sync(
organization: str,
) -> None:
logger.info("Syncing GitHub users")
user_data, org_data = get(github_api_key, github_url, organization)
load_organization_users(neo4j_session, user_data, org_data, common_job_parameters['UPDATE_TAG'])
user_data, user_org_data = get_users(github_api_key, github_url, organization)
affiliated_owner_data, unaffiliated_owner_data, owner_org_data = get_enterprise_owners(
github_api_key,
github_url, organization,
)
processed_user_data = _mark_users_as_enterprise_owners(
user_data, user_org_data,
affiliated_owner_data, owner_org_data,
)
load_organization_users(neo4j_session, processed_user_data, user_org_data, common_job_parameters['UPDATE_TAG'])
load_unaffiliated_owners(
neo4j_session, unaffiliated_owner_data,
owner_org_data, common_job_parameters['UPDATE_TAG'],
)
run_cleanup_job('github_users_cleanup.json', neo4j_session, common_job_parameters)
merge_module_sync_metadata(
neo4j_session,
group_type='GitHubOrganization',
group_id=org_data['url'],
group_id=user_org_data['url'],
synced_type='GitHubOrganization',
update_tag=common_job_parameters['UPDATE_TAG'],
stat_handler=stat_handler,
Expand Down
14 changes: 10 additions & 4 deletions docs/root/modules/github/schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,10 @@ Representation of a single GitHubUser [user object](https://developer.github.com
| has_2fa_enabled | Whether the user has 2-factor authentication enabled |
| role | Either 'ADMIN' (denoting that the user is an owner of a Github organization) or 'MEMBER' |
| is_site_admin | Whether the user is a site admin |
| permission | Only present if the user is an [outside collaborator](https://docs.github.com/en/graphql/reference/objects#repositorycollaboratorconnection) of this repo.
`permission` is either ADMIN, MAINTAIN, READ, TRIAGE, or WRITE ([ref](https://docs.github.com/en/graphql/reference/enums#repositorypermission)).
| email | The user's publicly visible profile email.
| company | The user's public profile company.
| is_enterprise_owner | Whether the user is an [enterprise owner](https://docs.github.com/en/enterprise-cloud@latest/admin/managing-accounts-and-repositories/managing-users-in-your-enterprise/roles-in-an-enterprise#enterprise-owners) |
| permission | Only present if the user is an [outside collaborator](https://docs.github.com/en/graphql/reference/objects#repositorycollaboratorconnection) of this repo. `permission` is either ADMIN, MAINTAIN, READ, TRIAGE, or WRITE ([ref](https://docs.github.com/en/graphql/reference/enums#repositorypermission)). |
| email | The user's publicly visible profile email. |
| company | The user's public profile company. |


#### Relationships
Expand All @@ -152,6 +152,12 @@ WRITE, MAINTAIN, TRIAGE, and READ ([Reference](https://docs.github.com/en/graphq
(GitHubUser)-[:OUTSIDE_COLLAB_{ACTION}]->(GitHubRepository)
```

- GitHubUsers are members of an organization. In some cases a user may be "unaffiliated" with an org, for example if the user is an enterprise owner of, but not a member of, the org.

```
(GitHubUser)-[:MEMBER_OF|UNAFFILIATED]->(GitHubOrganization)
```

### GitHubBranch

Representation of a single GitHubBranch [ref object](https://developer.github.com/v4/object/ref). This node contains minimal data for a repository branch.
Expand Down
110 changes: 84 additions & 26 deletions tests/data/github/users.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,88 @@
GITHUB_USER_DATA = [
{
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/hjsimpson',
'login': 'hjsimpson',
'name': 'Homer Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Springfield Nuclear Power Plant',
},
'role': 'MEMBER',
}, {
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/mbsimpson',
'login': 'mbsimpson',
'name': 'Marge Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Simpson Residence',
},
'role': 'ADMIN',
},
]

GITHUB_ORG_DATA = {
'url': 'https://example.com/my_org',
'login': 'my_org',
}


GITHUB_USER_DATA = (
[
{
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/hjsimpson',
'login': 'hjsimpson',
'name': 'Homer Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Springfield Nuclear Power Plant',
},
'role': 'MEMBER',
}, {
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/lmsimpson',
'login': 'lmsimpson',
'name': 'Lisa Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Simpson Residence',
},
'role': 'MEMBER',
}, {
'hasTwoFactorEnabled': True,
'node': {
'url': 'https://example.com/mbsimpson',
'login': 'mbsimpson',
'name': 'Marge Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Simpson Residence',
},
'role': 'ADMIN',
},
],
GITHUB_ORG_DATA,
)

# Subtle differences between owner data and user data:
# 1. owner data does not include a `hasTwoFactorEnabled` field (it is unavailable in the GraphQL query for these owners)
# 2. an `organizationRole` field instead of a `role` field. In owner data, membership within an org is not assumed, so
# there is an 'UNAFFILIATED' value for owners of an org who are not also members of it. (Otherwise the 'OWNER'
# organizationRole matches the 'ADMIN' role in the user data, and the 'DIRECT_MEMBER' organizationRole matches
# the 'MEMBER' role.)
GITHUB_ENTERPRISE_OWNER_DATA = (
[
{
'node': {
'url': 'https://example.com/kbroflovski',
'login': 'kbroflovski',
'name': 'Kyle Broflovski',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'South Park Elementary',
},
'organizationRole': 'UNAFFILIATED',
}, {
'node': {
'url': 'https://example.com/mbsimpson',
'login': 'mbsimpson',
'name': 'Marge Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Simpson Residence',
},
'organizationRole': 'OWNER',
}, {
'node': {
'url': 'https://example.com/lmsimpson',
'login': 'lmsimpson',
'name': 'Lisa Simpson',
'isSiteAdmin': False,
'email': '[email protected]',
'company': 'Simpson Residence',
},
'organizationRole': 'DIRECT_MEMBER',
},
],
GITHUB_ORG_DATA,
)
Loading

0 comments on commit f9c1254

Please sign in to comment.