From a6c24ba9d92a95fdedfb23a5e19c9b35cc086bb9 Mon Sep 17 00:00:00 2001 From: Mike Roberts Date: Fri, 26 Apr 2024 13:19:43 -0400 Subject: [PATCH] Use Step Functions for crawler orchestration Before this change all crawler orchestration happened in-Lambda. Moving to step functions is useful for a few reasons: 1 - Soonish I'd like to add the ability to add public repos, and updating those can also use this crawler logic 2 - Going to have to deal with GitHub rate limiting and retries at some point - Step Functions is a good use for that 3 - Async data loading is a good example of Step Functions anyway, and it's good to have Step Functions in Cicada somewhere from an example point of view --- docs/SettingUpCicada.md | 12 +- .../github/crawler/crawlConfiguration.ts | 10 - .../github/crawler/crawlInstallations.ts | 12 + ...hubRepositoryCrawler.ts => crawlPushes.ts} | 44 +--- .../github/crawler/crawlRepositories.ts | 21 ++ .../domain/github/crawler/crawlRunEvents.ts | 23 ++ src/app/domain/github/crawler/crawlUsers.ts | 23 ++ .../domain/github/crawler/githubAppCrawler.ts | 12 - .../crawler/githubInstallationCrawler.ts | 56 ----- src/app/domain/github/githubInstallation.ts | 27 +-- src/app/domain/github/githubRepository.ts | 36 +-- .../githubWebhookInstallationProcessor.ts | 15 +- .../githubCrawlTask/githubCrawlTaskEvents.ts | 76 +++++++ .../lambdaFunctions/githubCrawlTask/lambda.ts | 64 ++++++ .../lambdaFunctions/githubCrawler/lambda.ts | 35 --- src/app/outboundInterfaces/eventBridgeBus.ts | 7 +- src/app/util/collections.ts | 4 + src/cdk/stacks/main/MainStack.ts | 3 + src/cdk/stacks/main/githubCrawlers.ts | 208 ++++++++++++++++++ src/cdk/stacks/main/githubInteraction.ts | 30 +-- src/multipleContexts/eventBridge.ts | 3 +- src/multipleContexts/githubCrawler.ts | 13 ++ ...ler.test.ts => crawlInstallations.test.ts} | 8 +- .../domain/github/crawler/crawlPushes.test.ts | 118 ++++++++++ .../github/crawler/crawlRepositories.test.ts | 112 ++++++++++ ...Crawler.test.ts => 
crawlRunEvents.test.ts} | 98 +-------- ...tionCrawler.test.ts => crawlUsers.test.ts} | 67 +----- ...githubWebhookInstallationProcessor.test.ts | 53 +---- test/local/unit/util/collections.test.ts | 8 + 29 files changed, 745 insertions(+), 453 deletions(-) delete mode 100644 src/app/domain/github/crawler/crawlConfiguration.ts create mode 100644 src/app/domain/github/crawler/crawlInstallations.ts rename src/app/domain/github/crawler/{githubRepositoryCrawler.ts => crawlPushes.ts} (55%) create mode 100644 src/app/domain/github/crawler/crawlRepositories.ts create mode 100644 src/app/domain/github/crawler/crawlRunEvents.ts create mode 100644 src/app/domain/github/crawler/crawlUsers.ts delete mode 100644 src/app/domain/github/crawler/githubAppCrawler.ts delete mode 100644 src/app/domain/github/crawler/githubInstallationCrawler.ts create mode 100644 src/app/lambdaFunctions/githubCrawlTask/githubCrawlTaskEvents.ts create mode 100644 src/app/lambdaFunctions/githubCrawlTask/lambda.ts delete mode 100644 src/app/lambdaFunctions/githubCrawler/lambda.ts create mode 100644 src/cdk/stacks/main/githubCrawlers.ts create mode 100644 src/multipleContexts/githubCrawler.ts rename test/local/functional/domain/github/crawler/{githubAppCrawler.test.ts => crawlInstallations.test.ts} (84%) create mode 100644 test/local/functional/domain/github/crawler/crawlPushes.test.ts create mode 100644 test/local/functional/domain/github/crawler/crawlRepositories.test.ts rename test/local/functional/domain/github/crawler/{githubRepositoryCrawler.test.ts => crawlRunEvents.test.ts} (55%) rename test/local/functional/domain/github/crawler/{githubInstallationCrawler.test.ts => crawlUsers.test.ts} (67%) diff --git a/docs/SettingUpCicada.md b/docs/SettingUpCicada.md index dd6012f..eb536cf 100644 --- a/docs/SettingUpCicada.md +++ b/docs/SettingUpCicada.md @@ -183,14 +183,4 @@ Cicada has to be **registered** and **installed** as a "GitHub App" in your GitH Finally you can go back to the Cicada Home Page 
(`https://YOUR_WEB_HOST_NAME`) and login, at which point setup is complete. -## Load older data - -Depending on how recently you've had activity in your GitHub account you may already see some data from the logged-in home screen. If not then: - -* Go to the Lambda Console in your account -* Go to the _githubCrawler_ function for your installation -* Go to the "Test" tab -* Set the _Event JSON_ section to `{"lookbackDays": 90}` (adjust the value to more or few days) -* Click the "Test" button - -This will load more data from your GitHub account. NB: only GitHub Actions Runs data will be available more than a couple of weeks in the past - you won't see any older Push events than that because they're not available from GitHub. +On installation, Cicada loads the past 30 days of workflow run events, and however many pushes GitHub will provide - usually a few days. \ No newline at end of file diff --git a/src/app/domain/github/crawler/crawlConfiguration.ts b/src/app/domain/github/crawler/crawlConfiguration.ts deleted file mode 100644 index 39bb9fe..0000000 --- a/src/app/domain/github/crawler/crawlConfiguration.ts +++ /dev/null @@ -1,10 +0,0 @@ -export interface CrawlConfiguration { - crawlChildObjects: 'never' | 'ifChanged' | 'always' - lookbackDays: number -} - -export function calculateCrawlChildren(config: CrawlConfiguration, parentStateChanged: boolean) { - return ( - config.crawlChildObjects === 'always' || (config.crawlChildObjects === 'ifChanged' && parentStateChanged) - ) -} diff --git a/src/app/domain/github/crawler/crawlInstallations.ts b/src/app/domain/github/crawler/crawlInstallations.ts new file mode 100644 index 0000000..c433351 --- /dev/null +++ b/src/app/domain/github/crawler/crawlInstallations.ts @@ -0,0 +1,12 @@ +import { processRawInstallation } from '../githubInstallation' +import { AppState } from '../../../environment/AppState' +import { removeNullAndUndefined } from '../../../util/collections' +import { GithubInstallation } from 
'../../types/GithubInstallation' + +export async function crawlInstallations(appState: AppState): Promise<GithubInstallation[]> { + const installations = await appState.githubClient.listInstallations() + + return removeNullAndUndefined( + await Promise.all(installations.map(async (raw) => processRawInstallation(appState, raw))) + ) +} diff --git a/src/app/domain/github/crawler/githubRepositoryCrawler.ts b/src/app/domain/github/crawler/crawlPushes.ts similarity index 55% rename from src/app/domain/github/crawler/githubRepositoryCrawler.ts rename to src/app/domain/github/crawler/crawlPushes.ts index 83db922..1e2aa95 100644 --- a/src/app/domain/github/crawler/githubRepositoryCrawler.ts +++ b/src/app/domain/github/crawler/crawlPushes.ts @@ -1,53 +1,23 @@ import { AppState } from '../../../environment/AppState' -import { GithubInstallation } from '../../types/GithubInstallation' import { GithubRepository } from '../../types/GithubRepository' -import { processRawRunEvents } from '../githubWorkflowRunEvent' -import { processPushes } from '../githubPush' -import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient' -import { dateTimeAddDays } from '../../../util/dateAndTime' import { isRawGithubPushEventEvent } from '../../types/rawGithub/RawGithubAPIPushEventEvent' import { fromRawGithubPushEventEvent, GithubPush } from '../../types/GithubPush' -import { CrawlConfiguration } from './crawlConfiguration' - -export async function crawlRepository( - appState: AppState, - installation: GithubInstallation, - repo: GithubRepository, - crawlConfiguration: CrawlConfiguration -) { - const githubClient = appState.githubClient.clientForInstallation(installation.installationId) - await crawlRunEvents(appState, githubClient, repo, crawlConfiguration) - await crawlPushes(appState, githubClient, repo) -} - -export async function crawlRunEvents( - appState: AppState, - githubClient: GithubInstallationClient, - repo: GithubRepository, - crawlConfiguration: CrawlConfiguration 
-) { - const startTime = `${dateTimeAddDays( - appState.clock.now(), - -1 * crawlConfiguration.lookbackDays - ).toISOString()}` - - const recentRunEvents = await githubClient.listWorkflowRunsForRepo( - repo.ownerName, - repo.name, - `>${startTime}` - ) - await processRawRunEvents(appState, recentRunEvents, false) -} +import { processPushes } from '../githubPush' +import { GithubInstallation } from '../../types/GithubInstallation' // TOEventually - only get all pushes back to lookback in crawl configuration, however GitHub doesn't keep // them around for very long export async function crawlPushes( appState: AppState, - githubClient: GithubInstallationClient, + // the owner ID on repo isn't sufficient when we are crawling public repos from other accounts + installation: GithubInstallation, repo: GithubRepository ) { + const githubClient = appState.githubClient.clientForInstallation(installation.installationId) const allEventsForRepo = await githubClient.listMostRecentEventsForRepo(repo.ownerName, repo.name) const rawPushes = allEventsForRepo.filter(isRawGithubPushEventEvent) + // TODO - this comment was from pre-step-functions version. Is there something that can be improved now + // repo is in context? 
// For now do translation to internal pushes here since we need context of repo details, which aren't in the raw push // (this isn't required for webhook translation) const pushes = rawPushes diff --git a/src/app/domain/github/crawler/crawlRepositories.ts b/src/app/domain/github/crawler/crawlRepositories.ts new file mode 100644 index 0000000..b366985 --- /dev/null +++ b/src/app/domain/github/crawler/crawlRepositories.ts @@ -0,0 +1,21 @@ +import { AppState } from '../../../environment/AppState' +import { GithubInstallation } from '../../types/GithubInstallation' +import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient' +import { processRawRepositories } from '../githubRepository' +import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/githubCommonTypes' + +export async function crawlRepositories(appState: AppState, installation: GithubInstallation) { + const githubClient = appState.githubClient.clientForInstallation(installation.installationId) + const latestRawRepositories = await readRawRepositories(installation, githubClient) + return await processRawRepositories(appState, latestRawRepositories) +} + +async function readRawRepositories(installation: GithubInstallation, githubClient: GithubInstallationClient) { + if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) { + return await githubClient.listOrganizationRepositories(installation.accountLogin) + } else if (installation.accountType === USER_ACCOUNT_TYPE) { + return await githubClient.listInstallationRepositories() + } else { + throw new Error(`Unknown installation account type: ${installation.accountType}`) + } +} diff --git a/src/app/domain/github/crawler/crawlRunEvents.ts b/src/app/domain/github/crawler/crawlRunEvents.ts new file mode 100644 index 0000000..92c63be --- /dev/null +++ b/src/app/domain/github/crawler/crawlRunEvents.ts @@ -0,0 +1,23 @@ +import { AppState } from '../../../environment/AppState' +import { GithubRepository } from 
'../../types/GithubRepository' +import { dateTimeAddDays } from '../../../util/dateAndTime' +import { processRawRunEvents } from '../githubWorkflowRunEvent' +import { GithubInstallation } from '../../types/GithubInstallation' + +export async function crawlWorkflowRunEvents( + appState: AppState, + // the owner ID on repo isn't sufficient when we are crawling public repos from other accounts + installation: GithubInstallation, + repo: GithubRepository, + lookbackDays: number +) { + const githubClient = appState.githubClient.clientForInstallation(installation.installationId) + const startTime = `${dateTimeAddDays(appState.clock.now(), -1 * lookbackDays).toISOString()}` + + const recentRunEvents = await githubClient.listWorkflowRunsForRepo( + repo.ownerName, + repo.name, + `>${startTime}` + ) + await processRawRunEvents(appState, recentRunEvents, false) +} diff --git a/src/app/domain/github/crawler/crawlUsers.ts b/src/app/domain/github/crawler/crawlUsers.ts new file mode 100644 index 0000000..d7fe5a6 --- /dev/null +++ b/src/app/domain/github/crawler/crawlUsers.ts @@ -0,0 +1,23 @@ +import { AppState } from '../../../environment/AppState' +import { GithubInstallation } from '../../types/GithubInstallation' +import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient' +import { processRawUsers } from '../githubUser' +import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/githubCommonTypes' + +export async function crawlUsers(appState: AppState, installation: GithubInstallation) { + const latestRawUsers = await readRawUsers( + installation, + appState.githubClient.clientForInstallation(installation.installationId) + ) + await processRawUsers(appState, latestRawUsers, installation) +} + +async function readRawUsers(installation: GithubInstallation, githubClient: GithubInstallationClient) { + if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) { + return await 
githubClient.listOrganizationMembers(installation.accountLogin) + } else if (installation.accountType === USER_ACCOUNT_TYPE) { + return [await githubClient.getUser(installation.accountLogin)] + } else { + throw new Error(`Unknown installation account type: ${installation.accountType}`) + } +} diff --git a/src/app/domain/github/crawler/githubAppCrawler.ts b/src/app/domain/github/crawler/githubAppCrawler.ts deleted file mode 100644 index 24cfc17..0000000 --- a/src/app/domain/github/crawler/githubAppCrawler.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { AppState } from '../../../environment/AppState' -import { processRawInstallation } from '../githubInstallation' -import { CrawlConfiguration } from './crawlConfiguration' -import { logger } from '../../../util/logging' - -export async function crawlGithubApp(appState: AppState, crawlConfiguration: CrawlConfiguration) { - logger.info(`Crawling GitHub app`, { ...crawlConfiguration }) - - for (const rawInstallation of await appState.githubClient.listInstallations()) { - await processRawInstallation(appState, rawInstallation, crawlConfiguration) - } -} diff --git a/src/app/domain/github/crawler/githubInstallationCrawler.ts b/src/app/domain/github/crawler/githubInstallationCrawler.ts deleted file mode 100644 index c9c03d1..0000000 --- a/src/app/domain/github/crawler/githubInstallationCrawler.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { AppState } from '../../../environment/AppState' -import { GithubInstallation } from '../../types/GithubInstallation' -import { processRawUsers } from '../githubUser' -import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/githubCommonTypes' -import { processRawRepositories } from '../githubRepository' -import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient' -import { CrawlConfiguration } from './crawlConfiguration' - -export async function crawlInstallation( - appState: AppState, - installation: GithubInstallation, - crawlConfiguration: 
CrawlConfiguration -) { - const githubClient = appState.githubClient.clientForInstallation(installation.installationId) - await crawlUsers(appState, installation, githubClient) - await crawlRepositories(appState, installation, githubClient, crawlConfiguration) -} - -export async function crawlUsers( - appState: AppState, - installation: GithubInstallation, - githubClient: GithubInstallationClient -) { - const latestRawUsers = await readRawUsers(installation, githubClient) - await processRawUsers(appState, latestRawUsers, installation) -} - -async function readRawUsers(installation: GithubInstallation, githubClient: GithubInstallationClient) { - if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) { - return await githubClient.listOrganizationMembers(installation.accountLogin) - } else if (installation.accountType === USER_ACCOUNT_TYPE) { - return [await githubClient.getUser(installation.accountLogin)] - } else { - throw new Error(`Unknown installation account type: ${installation.accountType}`) - } -} - -export async function crawlRepositories( - appState: AppState, - installation: GithubInstallation, - githubClient: GithubInstallationClient, - crawlConfiguration: CrawlConfiguration -) { - const latestRawRepositories = await readRawRepositories(installation, githubClient) - await processRawRepositories(appState, installation, latestRawRepositories, crawlConfiguration) -} - -async function readRawRepositories(installation: GithubInstallation, githubClient: GithubInstallationClient) { - if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) { - return await githubClient.listOrganizationRepositories(installation.accountLogin) - } else if (installation.accountType === USER_ACCOUNT_TYPE) { - return await githubClient.listInstallationRepositories() - } else { - throw new Error(`Unknown installation account type: ${installation.accountType}`) - } -} diff --git a/src/app/domain/github/githubInstallation.ts b/src/app/domain/github/githubInstallation.ts index 
d951de4..4c71c5e 100644 --- a/src/app/domain/github/githubInstallation.ts +++ b/src/app/domain/github/githubInstallation.ts @@ -4,31 +4,18 @@ import { GithubInstallationEntity } from '../entityStore/entities/GithubInstalla import { logger } from '../../util/logging' import deepEqual from 'deep-equal' import { RawGithubInstallation } from '../types/rawGithub/RawGithubInstallation' -import { crawlInstallation } from './crawler/githubInstallationCrawler' -import { calculateCrawlChildren, CrawlConfiguration } from './crawler/crawlConfiguration' -export async function processRawInstallation( - appState: AppState, - rawInstallation: RawGithubInstallation, - crawlConfiguration: CrawlConfiguration -) { - await processInstallation(appState, fromRawGithubInstallation(rawInstallation), crawlConfiguration) +export async function processRawInstallation(appState: AppState, rawInstallation: RawGithubInstallation) { + return await processInstallation(appState, fromRawGithubInstallation(rawInstallation)) } -export async function processInstallation( - appState: AppState, - installation: GithubInstallation, - crawlConfiguration: CrawlConfiguration -) { +export async function processInstallation(appState: AppState, installation: GithubInstallation) { if (`${installation.appId}` !== (await appState.config.github()).appId) { logger.warn(`Not processing invalid installation - unexpected app ID`) - return + return null } - const { installationStateChanged } = await saveInstallation(appState, installation) - if (calculateCrawlChildren(crawlConfiguration, installationStateChanged)) { - await crawlInstallation(appState, installation, crawlConfiguration) - } + return await saveInstallation(appState, installation) } async function saveInstallation(appState: AppState, installation: GithubInstallation) { @@ -56,9 +43,7 @@ async function saveInstallation(appState: AppState, installation: GithubInstalla await installationsStore.put(installation) } - return { - installationStateChanged - } + 
return installation } export function installationsEqual(x: GithubInstallation, y: GithubInstallation) { diff --git a/src/app/domain/github/githubRepository.ts b/src/app/domain/github/githubRepository.ts index c4f02e6..e7a4a8a 100644 --- a/src/app/domain/github/githubRepository.ts +++ b/src/app/domain/github/githubRepository.ts @@ -2,43 +2,15 @@ import { AppState } from '../../environment/AppState' import { RawGithubRepository } from '../types/rawGithub/RawGithubRepository' import { fromRawGithubRepository, GithubRepository } from '../types/GithubRepository' import { GithubRepositoryEntity } from '../entityStore/entities/GithubRepositoryEntity' -import { GithubInstallation } from '../types/GithubInstallation' -import { crawlRepository } from './crawler/githubRepositoryCrawler' -import { CrawlConfiguration } from './crawler/crawlConfiguration' -export async function processRawRepositories( - appState: AppState, - installation: GithubInstallation, - rawRepos: RawGithubRepository[], - crawlChildResources: CrawlConfiguration -) { - await processRepositories( - appState, - installation, - rawRepos.map(fromRawGithubRepository), - crawlChildResources - ) +export async function processRawRepositories(appState: AppState, rawRepos: RawGithubRepository[]) { + return await saveRepositories(appState, rawRepos.map(fromRawGithubRepository)) } -export async function processRepositories( - appState: AppState, - installation: GithubInstallation, - repos: GithubRepository[], - crawlConfiguration: CrawlConfiguration -) { - await saveRepositories(appState, repos) - // TOEventually - delete repos that don't exist any more - // TOEventually - figure out what actually changed and just crawl them - const crawlChildResources = - crawlConfiguration.crawlChildObjects === 'always' || crawlConfiguration.crawlChildObjects == 'ifChanged' - for (const repo of crawlChildResources ? 
repos : []) { - await crawlRepository(appState, installation, repo, crawlConfiguration) - } -} - -export async function saveRepositories(appState: AppState, repos: GithubRepository[]) { +async function saveRepositories(appState: AppState, repos: GithubRepository[]) { // Just put all repos since there may have been updates to details await appState.entityStore.for(GithubRepositoryEntity).advancedOperations.batchPut(repos) + return repos } export async function getRepository(appState: AppState, accountId: number, repoId: number) { diff --git a/src/app/domain/github/webhookProcessor/processors/githubWebhookInstallationProcessor.ts b/src/app/domain/github/webhookProcessor/processors/githubWebhookInstallationProcessor.ts index 1977fbd..b142be8 100644 --- a/src/app/domain/github/webhookProcessor/processors/githubWebhookInstallationProcessor.ts +++ b/src/app/domain/github/webhookProcessor/processors/githubWebhookInstallationProcessor.ts @@ -1,8 +1,10 @@ import { fromRawGithubInstallation } from '../../../types/GithubInstallation' import { RawGithubInstallation } from '../../../types/rawGithub/RawGithubInstallation' -import { processInstallation } from '../../githubInstallation' import { AppState } from '../../../../environment/AppState' import { WebhookProcessor } from '../WebhookProcessor' +import { processInstallation } from '../../githubInstallation' +import { sendToEventBridge } from '../../../../outboundInterfaces/eventBridgeBus' +import { EVENTBRIDGE_DETAIL_TYPES } from '../../../../../multipleContexts/eventBridge' export const githubWebhookInstallationProcessor: WebhookProcessor = async ( appState: AppState, @@ -10,12 +12,11 @@ export const githubWebhookInstallationProcessor: WebhookProcessor = async ( ): Promise => { // TOEventually - need to differentiate sub-types of installation - e.g. deleted // TOEventually - type check, e.g. 
with AJV - const parsed = fromRawGithubInstallation(JSON.parse(body).installation as RawGithubInstallation) - if (!parsed) { + const installation = fromRawGithubInstallation(JSON.parse(body).installation as RawGithubInstallation) + if (!installation) { return } - await processInstallation(appState, parsed, { - crawlChildObjects: 'ifChanged', - lookbackDays: 90 - }) + + await processInstallation(appState, installation) + await sendToEventBridge(appState, EVENTBRIDGE_DETAIL_TYPES.INSTALLATION_UPDATED, installation) } diff --git a/src/app/lambdaFunctions/githubCrawlTask/githubCrawlTaskEvents.ts b/src/app/lambdaFunctions/githubCrawlTask/githubCrawlTaskEvents.ts new file mode 100644 index 0000000..7c8d558 --- /dev/null +++ b/src/app/lambdaFunctions/githubCrawlTask/githubCrawlTaskEvents.ts @@ -0,0 +1,76 @@ +import { GithubInstallation, isGithubInstallation } from '../../domain/types/GithubInstallation' +import { GithubRepository, isGithubRepository } from '../../domain/types/GithubRepository' +import { throwError } from '@symphoniacloud/dynamodb-entity-store' +import { + CRAWLABLE_RESOURCES, + CrawlableResource, + isCrawlableResource +} from '../../../multipleContexts/githubCrawler' + +// TOEventually - safer type checking here + +export type CrawlEvent = { resourceType: CrawlableResource } +type CrawlEventWithInstallation = CrawlEvent & { installation: GithubInstallation } +type CrawlEventWithRepository = CrawlEvent & { repository: GithubRepository } + +export function isCrawlEvent(x: unknown): x is CrawlEvent { + return x !== undefined && isCrawlableResource((x as CrawlEvent).resourceType) +} + +export function isCrawlEventWithInstallation(x: CrawlEvent): x is CrawlEventWithInstallation { + const candidate = x as CrawlEventWithInstallation + return candidate.installation && isGithubInstallation(candidate.installation) +} + +export function isCrawlEventWithRepository(x: CrawlEvent): x is CrawlEventWithRepository { + const candidate = x as CrawlEventWithRepository + 
return candidate.repository && isGithubRepository(candidate.repository) +} + +export type CrawlInstallationsEvent = { resourceType: 'installations' } +export type CrawlUsersEvent = { resourceType: 'users' } & CrawlEventWithInstallation +export type CrawlRepositoriesEvent = { resourceType: 'repositories' } & CrawlEventWithInstallation +export type CrawlPushesEvent = { resourceType: 'pushes' } & CrawlEventWithInstallation & + CrawlEventWithRepository +export type CrawlWorkflowRunEventsEvent = { + resourceType: typeof CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS + lookbackDays: number +} & CrawlEventWithInstallation & + CrawlEventWithRepository + +export function isCrawlInstallationsEvent(x: CrawlEvent): x is CrawlInstallationsEvent { + return x.resourceType === CRAWLABLE_RESOURCES.INSTALLATIONS +} + +export function isCrawlUsersEvent(x: CrawlEvent): x is CrawlUsersEvent { + if (x.resourceType !== CRAWLABLE_RESOURCES.USERS) return false + return ( + isCrawlEventWithInstallation(x) || + throwError(`Invalid object for ${CRAWLABLE_RESOURCES.USERS} : ${JSON.stringify(x)}`)() + ) +} + +export function isCrawlRepositoriesEvent(x: CrawlEvent): x is CrawlRepositoriesEvent { + if (x.resourceType !== CRAWLABLE_RESOURCES.REPOSITORIES) return false + return ( + isCrawlEventWithInstallation(x) || + throwError(`Invalid object for ${CRAWLABLE_RESOURCES.REPOSITORIES} : ${JSON.stringify(x)}`)() + ) +} + +export function isCrawlPushesEvent(x: CrawlEvent): x is CrawlPushesEvent { + if (x.resourceType !== CRAWLABLE_RESOURCES.PUSHES) return false + return ( + (isCrawlEventWithInstallation(x) && isCrawlEventWithRepository(x)) || + throwError(`Invalid object for ${CRAWLABLE_RESOURCES.PUSHES} : ${JSON.stringify(x)}`)() + ) +} + +export function isCrawlWorkflowRunEventsEvent(x: CrawlEvent): x is CrawlWorkflowRunEventsEvent { + if (x.resourceType !== CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS) return false + const hasLookBackDays = typeof (x as CrawlWorkflowRunEventsEvent).lookbackDays !== 'undefined' + return ( + (hasLookBackDays && 
isCrawlEventWithInstallation(x) && isCrawlEventWithRepository(x)) || + throwError(`Invalid object for ${CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS} : ${JSON.stringify(x)}`)() + ) +} diff --git a/src/app/lambdaFunctions/githubCrawlTask/lambda.ts b/src/app/lambdaFunctions/githubCrawlTask/lambda.ts new file mode 100644 index 0000000..28ef8af --- /dev/null +++ b/src/app/lambdaFunctions/githubCrawlTask/lambda.ts @@ -0,0 +1,64 @@ +import { Handler } from 'aws-lambda/handler' +import { AppState } from '../../environment/AppState' +import { lambdaStartup } from '../../environment/lambdaStartup' +import middy from '@middy/core' +import { powertoolsMiddlewares } from '../../middleware/standardMiddleware' +import { logger } from '../../util/logging' +import { isFailure } from '../../util/structuredResult' +import { crawlPushes } from '../../domain/github/crawler/crawlPushes' +import { crawlRepositories } from '../../domain/github/crawler/crawlRepositories' +import { crawlInstallations } from '../../domain/github/crawler/crawlInstallations' +import { crawlUsers } from '../../domain/github/crawler/crawlUsers' +import { + isCrawlEvent, + isCrawlInstallationsEvent, + isCrawlPushesEvent, + isCrawlRepositoriesEvent, + isCrawlUsersEvent, + isCrawlWorkflowRunEventsEvent +} from './githubCrawlTaskEvents' +import { crawlWorkflowRunEvents } from '../../domain/github/crawler/crawlRunEvents' + +let appState: AppState + +export const baseHandler: Handler = async (event) => { + if (!appState) { + const startup = await lambdaStartup() + if (isFailure(startup)) { + logger.info('Github App not ready, not crawling yet') + return + } + + appState = startup.result + } + + if (!isCrawlEvent(event)) { + throw new Error('No resourceType field') + } + + if (isCrawlInstallationsEvent(event)) { + return await crawlInstallations(appState) + } + + if (isCrawlUsersEvent(event)) { + return await crawlUsers(appState, event.installation) + } + + if (isCrawlRepositoriesEvent(event)) { + return await 
crawlRepositories(appState, event.installation) + } + + if (isCrawlPushesEvent(event)) { + return await crawlPushes(appState, event.installation, event.repository) + } + + if (isCrawlWorkflowRunEventsEvent(event)) { + return await crawlWorkflowRunEvents(appState, event.installation, event.repository, event.lookbackDays) + } + + throw new Error(`unknown event format: ${JSON.stringify(event)}`) +} + +// Entry point - usage is defined by CDK +// noinspection JSUnusedGlobalSymbols +export const handler = middy(baseHandler).use(powertoolsMiddlewares) diff --git a/src/app/lambdaFunctions/githubCrawler/lambda.ts b/src/app/lambdaFunctions/githubCrawler/lambda.ts deleted file mode 100644 index d4c288e..0000000 --- a/src/app/lambdaFunctions/githubCrawler/lambda.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { Handler } from 'aws-lambda/handler' -import { AppState } from '../../environment/AppState' -import { lambdaStartup } from '../../environment/lambdaStartup' -import middy from '@middy/core' -import { powertoolsMiddlewares } from '../../middleware/standardMiddleware' -import { crawlGithubApp } from '../../domain/github/crawler/githubAppCrawler' -import { logger } from '../../util/logging' -import { isFailure } from '../../util/structuredResult' - -let appState: AppState - -export const baseHandler: Handler = async (event) => { - if (!appState) { - const startup = await lambdaStartup() - if (isFailure(startup)) { - logger.info('Github App not ready, not crawling yet') - return - } - - appState = startup.result - } - - await crawlGithubApp(appState, { - crawlChildObjects: 'always', - lookbackDays: isEventWithLookbackDays(event) ? 
event.lookbackDays : 3 - }) -} - -function isEventWithLookbackDays(x: unknown): x is { lookbackDays: number } { - return x !== null && typeof x === 'object' && 'lookbackDays' in x && typeof x.lookbackDays === 'number' -} - -// Entry point - usage is defined by CDK -// noinspection JSUnusedGlobalSymbols -export const handler = middy(baseHandler).use(powertoolsMiddlewares) diff --git a/src/app/outboundInterfaces/eventBridgeBus.ts b/src/app/outboundInterfaces/eventBridgeBus.ts index d5e0034..d65f887 100644 --- a/src/app/outboundInterfaces/eventBridgeBus.ts +++ b/src/app/outboundInterfaces/eventBridgeBus.ts @@ -6,8 +6,13 @@ import { AppState } from '../environment/AppState' import { GithubPush } from '../domain/types/GithubPush' import { GithubWorkflowRunEvent } from '../domain/types/GithubWorkflowRunEvent' import { WebPushTestEvent } from '../domain/webPush/WebPushTestEvent' +import { GithubInstallation } from '../domain/types/GithubInstallation' -export type CicadaEventBridgeData = GithubPush | GithubWorkflowRunEvent | WebPushTestEvent +export type CicadaEventBridgeData = + | GithubPush + | GithubWorkflowRunEvent + | WebPushTestEvent + | GithubInstallation // This exists since eventually would be nice to add metadata (see https://community.aws/posts/eventbridge-schema-registry-best-practices) export interface CicadaEventBridgeDetail { diff --git a/src/app/util/collections.ts b/src/app/util/collections.ts index 1137eef..8cbf80f 100644 --- a/src/app/util/collections.ts +++ b/src/app/util/collections.ts @@ -39,3 +39,7 @@ export function arrayDifferenceDeep<T>(xs: T[], ys: T[]): T[] { } export type NonEmptyArray<T> = [T, ...T[]] + +export function removeNullAndUndefined<T>(xs: (T | undefined | null)[]) { + return xs.filter((x) => x !== null && x !== undefined) as T[] +} diff --git a/src/cdk/stacks/main/MainStack.ts b/src/cdk/stacks/main/MainStack.ts index 7e53083..4fb7477 100644 --- a/src/cdk/stacks/main/MainStack.ts +++ b/src/cdk/stacks/main/MainStack.ts @@ -7,6 +7,7 @@ import { 
defineWebInfrastructure } from './webInfrastructure' import { defineGithubInteraction } from './githubInteraction' import { saveInSSMViaCloudFormation } from '../../support/ssm' import { SSM_PARAM_NAMES, SsmParamName } from '../../../multipleContexts/ssmParams' +import { defineGithubCrawlers } from './githubCrawlers' export class MainStack extends Stack { constructor(scope: Construct, id: string, props: AllStacksProps) { @@ -26,6 +27,8 @@ export class MainStack extends Stack { restApi }) + defineGithubCrawlers(this, mainStackProps) + savePreGeneratedConfiguration(this, props) } } diff --git a/src/cdk/stacks/main/githubCrawlers.ts b/src/cdk/stacks/main/githubCrawlers.ts new file mode 100644 index 0000000..50cfee3 --- /dev/null +++ b/src/cdk/stacks/main/githubCrawlers.ts @@ -0,0 +1,208 @@ +import { Construct } from 'constructs' +import { MainStackProps } from './mainStackProps' +import { CicadaFunction, cicadaFunctionProps } from './constructs/CicadaFunction' +import { + DefinitionBody, + IntegrationPattern, + JsonPath, + Map, + StateMachine, + TaskInput +} from 'aws-cdk-lib/aws-stepfunctions' +import { LambdaInvoke, StepFunctionsStartExecution } from 'aws-cdk-lib/aws-stepfunctions-tasks' +import { CRAWLABLE_RESOURCES } from '../../../multipleContexts/githubCrawler' +import { Rule, Schedule } from 'aws-cdk-lib/aws-events' +import { SfnStateMachine } from 'aws-cdk-lib/aws-events-targets' +import { EVENTBRIDGE_DETAIL_TYPES } from '../../../multipleContexts/eventBridge' +import { Duration } from 'aws-cdk-lib' + +export function defineGithubCrawlers(scope: Construct, props: MainStackProps) { + const crawlerFunction = defineGithubCrawlerFunction(scope, props) + const installationCrawler = defineInstallationCrawler(scope, props, crawlerFunction) + const allInstallationsCrawler = defineAllInstallationsCrawler( + scope, + props, + crawlerFunction, + installationCrawler + ) + defineOnInstallationUpdatedProcessor(scope, props, installationCrawler) + defineSchedules(scope, 
allInstallationsCrawler) +} + +function defineGithubCrawlerFunction(scope: Construct, props: MainStackProps) { + return new CicadaFunction( + scope, + cicadaFunctionProps(props, 'githubCrawlTask', { + memorySize: 512, + timeoutSeconds: 600, + tablesReadWriteAccess: [ + 'github-installations', + 'github-users', + 'github-account-memberships', + 'github-repositories', + 'github-repo-activity', + 'github-latest-workflow-runs', + 'github-latest-pushes-per-ref' + ] + }) + ) +} + +function defineInstallationCrawler(scope: Construct, props: MainStackProps, crawlerFunction: CicadaFunction) { + const crawlUsers = new LambdaInvoke(scope, 'crawlUsers', { + lambdaFunction: crawlerFunction, + payload: TaskInput.fromObject({ + resourceType: CRAWLABLE_RESOURCES.USERS, + installation: JsonPath.objectAt('$.installation') + }), + // Pass through original input to next state + resultPath: JsonPath.DISCARD + }) + + const crawlRepositories = new LambdaInvoke(scope, 'crawlRepositories', { + lambdaFunction: crawlerFunction, + payload: TaskInput.fromObject({ + resourceType: CRAWLABLE_RESOURCES.REPOSITORIES, + installation: JsonPath.objectAt('$.installation') + }), + resultSelector: { + repositories: JsonPath.objectAt('$.Payload') + }, + resultPath: '$.repositoriesCrawler' + }) + + const crawlPushes = new LambdaInvoke(scope, 'crawlPushes', { + lambdaFunction: crawlerFunction, + payload: TaskInput.fromObject({ + resourceType: CRAWLABLE_RESOURCES.PUSHES, + installation: JsonPath.objectAt('$.installation'), + repository: JsonPath.objectAt('$.repository') + }), + // Pass through original input to next state + resultPath: JsonPath.DISCARD + }) + + const crawlWorkflowRunEvents = new LambdaInvoke(scope, 'crawlWorkflowRunEvents', { + lambdaFunction: crawlerFunction, + payload: TaskInput.fromObject({ + resourceType: CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS, + installation: JsonPath.objectAt('$.installation'), + repository: JsonPath.objectAt('$.repository'), + lookbackDays: 
JsonPath.numberAt('$$.Execution.Input.lookbackDays') + }), + outputPath: '$.Payload' + }) + + // TOEventually - put this into separate state machine for crawling public repos + // TOEventually - need to consider github app rate limiting (max 5000 requests / hour, etc.) + const forEachRepository = new Map(scope, 'forEachRepository', { + maxConcurrency: 10, + itemsPath: '$.repositoriesCrawler.repositories', + itemSelector: { + installation: JsonPath.objectAt('$.installation'), + repository: JsonPath.objectAt('$$.Map.Item.Value') + } + }) + forEachRepository.itemProcessor(crawlPushes.next(crawlWorkflowRunEvents)) + + const workflow = crawlUsers.next(crawlRepositories).next(forEachRepository) + + return new StateMachine(scope, 'installationCrawler', { + stateMachineName: `${props.appName}-installation`, + comment: 'Crawl a GitHub App Installation and child resources', + definitionBody: DefinitionBody.fromChainable(workflow), + tracingEnabled: true + }) +} + +// TOEventually - at some point need to find old installations and delete them +// TOEventually - add error handling +function defineAllInstallationsCrawler( + scope: Construct, + props: MainStackProps, + crawlerFunction: CicadaFunction, + installationCrawler: StateMachine +) { + const crawlInstallations = new LambdaInvoke(scope, 'crawlInstallations', { + lambdaFunction: crawlerFunction, + payload: TaskInput.fromObject({ + resourceType: CRAWLABLE_RESOURCES.INSTALLATIONS + }), + outputPath: '$.Payload' + }) + + const invokeInstallationCrawler = new StepFunctionsStartExecution( + scope, + 'allInstallationsInvokeInstallationCrawler', + { + stateMachine: installationCrawler, + // This is the configuration for "run and wait" - "REQUEST_RESPONSE" is *incorrect* for that + integrationPattern: IntegrationPattern.RUN_JOB, + // Sets up runtime link between the caller and callee workflows + associateWithParent: true, + // Have to explicitly include input because using `associateWithParent` + input: TaskInput.fromObject({ + 
installation: JsonPath.entirePayload, + // This crawler runs daily, so look back N + 1 days + lookbackDays: 2 + }) + } + ) + + const forEachInstallation = new Map(scope, 'forEachInstallation', {}) + forEachInstallation.itemProcessor(invokeInstallationCrawler) + + const workflow = crawlInstallations.next(forEachInstallation) + + return new StateMachine(scope, 'allInstallationsCrawler', { + stateMachineName: `${props.appName}-all-installations`, + comment: 'Crawl all GitHub App Installations and child resources', + definitionBody: DefinitionBody.fromChainable(workflow), + tracingEnabled: true + }) +} + +function defineOnInstallationUpdatedProcessor( + scope: Construct, + props: MainStackProps, + installationCrawler: StateMachine +) { + // In theory I think it should be possible just to use event manipulation in the event rule, + // rather than a whole new step function, but I couldn't figure out how to merge dynamic and static + // content in the event rule target properties + + const onInstallationUpdatedProcessor = new StateMachine(scope, 'onInstallationUpdatedProcessor', { + stateMachineName: `${props.appName}-on-installation-updated`, + comment: 'Crawl installation and child resources when installation updated', + definitionBody: DefinitionBody.fromChainable( + new StepFunctionsStartExecution(scope, 'onInstallationUpdatedInvokeInstallationCrawler', { + stateMachine: installationCrawler, + integrationPattern: IntegrationPattern.RUN_JOB, + associateWithParent: true, + input: TaskInput.fromObject({ + installation: JsonPath.objectAt('$.detail.data'), + // TOEventually - consider making this longer, at least for new installations + lookbackDays: 30 + }) + }) + ), + tracingEnabled: true + }) + + new Rule(scope, 'installationUpdatedStepFunctionRule', { + description: `Run Installation Crawler when installation updated`, + eventPattern: { + source: [props.appName], + detailType: [EVENTBRIDGE_DETAIL_TYPES.INSTALLATION_UPDATED] + }, + targets: [new
SfnStateMachine(onInstallationUpdatedProcessor)] + }) +} + +function defineSchedules(scope: Construct, allInstallationsCrawler: StateMachine) { + new Rule(scope, 'ScheduleRule', { + description: 'Scheduled All Installations Crawl', + schedule: Schedule.rate(Duration.days(1)), + targets: [new SfnStateMachine(allInstallationsCrawler)] + }) +} diff --git a/src/cdk/stacks/main/githubInteraction.ts b/src/cdk/stacks/main/githubInteraction.ts index f19cd86..4fab15c 100644 --- a/src/cdk/stacks/main/githubInteraction.ts +++ b/src/cdk/stacks/main/githubInteraction.ts @@ -10,10 +10,8 @@ import { } from 'aws-cdk-lib/aws-apigateway' import { grantLambdaFunctionPermissionToPutEvents } from '../../support/eventbridge' import { Effect, PolicyStatement, Role, ServicePrincipal } from 'aws-cdk-lib/aws-iam' -import { Rule, Schedule } from 'aws-cdk-lib/aws-events' +import { Rule } from 'aws-cdk-lib/aws-events' import * as targets from 'aws-cdk-lib/aws-events-targets' -import { LambdaFunction } from 'aws-cdk-lib/aws-events-targets' -import { Duration } from 'aws-cdk-lib' import { MainStackProps } from './mainStackProps' export interface GithubInteractionProps extends MainStackProps { @@ -25,7 +23,6 @@ export function defineGithubInteraction(scope: Construct, props: GithubInteracti defineSetup(scope, props, githubApiResource) defineAuth(scope, props, githubApiResource) - defineScheduledCrawler(scope, props) defineWebhook(scope, props, githubApiResource) defineWebhookFunction(scope, props) } @@ -59,31 +56,6 @@ function defineAuth(scope: Construct, props: GithubInteractionProps, githubApiRe .addMethod(HttpMethod.GET, new LambdaIntegration(lambdaFunction)) } -function defineScheduledCrawler(scope: Construct, props: GithubInteractionProps) { - const lambdaFunction = new CicadaFunction( - scope, - cicadaFunctionProps(props, 'githubCrawler', { - memorySize: 512, - timeoutSeconds: 600, - tablesReadWriteAccess: [ - 'github-installations', - 'github-users', - 'github-account-memberships', - 
'github-repositories', - 'github-repo-activity', - 'github-latest-workflow-runs', - 'github-latest-pushes-per-ref' - ] - }) - ) - grantLambdaFunctionPermissionToPutEvents(lambdaFunction, props) - new Rule(scope, 'ScheduleRule', { - description: 'Scheduled Github Crawler', - schedule: Schedule.rate(Duration.days(1)), - targets: [new LambdaFunction(lambdaFunction)] - }) -} - const EVENTS_BUCKET_GITHUB_WEBHOOK_KEY_PREFIX = 'githubWebhook/' function defineWebhook(scope: Construct, props: GithubInteractionProps, githubApiResource: Resource) { diff --git a/src/multipleContexts/eventBridge.ts b/src/multipleContexts/eventBridge.ts index 04273e0..f90ca13 100644 --- a/src/multipleContexts/eventBridge.ts +++ b/src/multipleContexts/eventBridge.ts @@ -2,7 +2,8 @@ export const EVENTBRIDGE_DETAIL_TYPES = { GITHUB_NEW_PUSH: 'GithubNewPush', GITHUB_NEW_WORKFLOW_RUN_EVENT: 'GithubNewWorkflowRunEvent', - WEB_PUSH_TEST: 'WebPushTest' + WEB_PUSH_TEST: 'WebPushTest', + INSTALLATION_UPDATED: 'InstallationUpdated' } as const export type EventBridgeDetailType = (typeof EVENTBRIDGE_DETAIL_TYPES)[keyof typeof EVENTBRIDGE_DETAIL_TYPES] diff --git a/src/multipleContexts/githubCrawler.ts b/src/multipleContexts/githubCrawler.ts new file mode 100644 index 0000000..b444242 --- /dev/null +++ b/src/multipleContexts/githubCrawler.ts @@ -0,0 +1,13 @@ +export const CRAWLABLE_RESOURCES = { + INSTALLATIONS: 'installations', + USERS: 'users', + REPOSITORIES: 'repositories', + PUSHES: 'pushes', + WORKFLOW_RUN_EVENTS: 'workflowRunEvents' +} as const + +export type CrawlableResource = (typeof CRAWLABLE_RESOURCES)[keyof typeof CRAWLABLE_RESOURCES] + +export function isCrawlableResource(x: unknown): x is CrawlableResource { + return typeof x === 'string' && Object.values(CRAWLABLE_RESOURCES).includes(x as CrawlableResource) +} diff --git a/test/local/functional/domain/github/crawler/githubAppCrawler.test.ts b/test/local/functional/domain/github/crawler/crawlInstallations.test.ts similarity index 84% rename 
from test/local/functional/domain/github/crawler/githubAppCrawler.test.ts rename to test/local/functional/domain/github/crawler/crawlInstallations.test.ts index d8f7b02..2b17327 100644 --- a/test/local/functional/domain/github/crawler/githubAppCrawler.test.ts +++ b/test/local/functional/domain/github/crawler/crawlInstallations.test.ts @@ -1,12 +1,12 @@ import { expect, test } from 'vitest' import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' -import { crawlGithubApp } from '../../../../../../src/app/domain/github/crawler/githubAppCrawler' import { testOrgInstallation, testPersonalInstallation } from '../../../../../examples/cicada/githubDomainObjects' import example_personal_account_installation from '../../../../../examples/github/personal-account/api/installation.json' import example_org_installation from '../../../../../examples/github/org/api/installation.json' +import { crawlInstallations } from '../../../../../../src/app/domain/github/crawler/crawlInstallations' test('app-crawler-for-personal-account-installation', async () => { // A @@ -18,7 +18,7 @@ test('app-crawler-for-personal-account-installation', async () => { appState.githubClient.stubInstallations = [example_personal_account_installation] // A - await crawlGithubApp(appState, { crawlChildObjects: 'never', lookbackDays: 7 }) + const result = await crawlInstallations(appState) // A expect(appState.dynamoDB.puts.length).toEqual(1) @@ -31,6 +31,7 @@ test('app-crawler-for-personal-account-installation', async () => { }, TableName: 'fakeGithubInstallationsTable' }) + expect(result).toEqual([testPersonalInstallation]) }) test('app-crawler-for-org-installation', async () => { @@ -43,7 +44,7 @@ test('app-crawler-for-org-installation', async () => { appState.githubClient.stubInstallations = [example_org_installation] // A - await crawlGithubApp(appState, { crawlChildObjects: 'never', lookbackDays: 7 }) + const result = await crawlInstallations(appState) // A 
expect(appState.dynamoDB.puts.length).toEqual(1) @@ -56,4 +57,5 @@ test('app-crawler-for-org-installation', async () => { }, TableName: 'fakeGithubInstallationsTable' }) + expect(result).toEqual([testOrgInstallation]) }) diff --git a/test/local/functional/domain/github/crawler/crawlPushes.test.ts b/test/local/functional/domain/github/crawler/crawlPushes.test.ts new file mode 100644 index 0000000..575459c --- /dev/null +++ b/test/local/functional/domain/github/crawler/crawlPushes.test.ts @@ -0,0 +1,118 @@ +import { expect, test } from 'vitest' +import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' +import { FakeGithubInstallationClient } from '../../../../../testSupport/fakes/fakeGithubInstallationClient' +import { + testOrgInstallation, + testOrgTestRepoOne, + testOrgTestRepoOnePush, + testPersonalInstallation, + testPersonalTestRepo, + testPersonalTestRepoPush +} from '../../../../../examples/cicada/githubDomainObjects' +import example_personal_repo_push from '../../../../../examples/github/personal-account/api/repoPush.json' +import example_org_repo_push from '../../../../../examples/github/org/api/repoPush.json' +import { crawlPushes } from '../../../../../../src/app/domain/github/crawler/crawlPushes' + +test('repo-crawler-for-personal-account-installation', async () => { + // A + const appState = new FakeAppState() + const githubInstallationClient = new FakeGithubInstallationClient() + appState.githubClient.fakeClientsForInstallation.addResponse(48093071, githubInstallationClient) + githubInstallationClient.stubMostRecentEventsForRepo.addResponse( + { + owner: 'cicada-test-user', + repo: 'personal-test-repo' + }, + [example_personal_repo_push] + ) + + // A + await crawlPushes(appState, testPersonalInstallation, testPersonalTestRepo) + + // A + expect(appState.dynamoDB.puts.length).toEqual(2) + expect(appState.dynamoDB.puts[0]).toEqual({ + ConditionExpression: 'attribute_not_exists(PK)', + Item: { + PK: 'ACCOUNT#162360409', + SK: 
'REPO#767679529#REF#refs/heads/main#PUSH#COMMIT#dfb5cb80ad3ce5a19a5020b4645696b2d6b4d94c', + GSI1PK: 'ACCOUNT#162360409', + GSI1SK: 'REPO#767679529#DATETIME#2024-03-05T18:01:12Z', + _et: 'githubPush', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testPersonalTestRepoPush + }, + TableName: 'fakeGithubRepoActivityTable' + }) + expect(appState.dynamoDB.puts[1]).toEqual({ + ConditionExpression: 'attribute_not_exists(PK) OR #dateTime < :newDateTime', + ExpressionAttributeNames: { + '#dateTime': 'dateTime' + }, + ExpressionAttributeValues: { + ':newDateTime': '2024-03-05T18:01:12Z' + }, + Item: { + PK: 'ACCOUNT#162360409', + SK: 'REPO#767679529#REF#refs/heads/main', + GSI1PK: 'ACCOUNT#162360409', + GSI1SK: 'DATETIME#2024-03-05T18:01:12Z', + _et: 'githubLatestPushPerRef', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testPersonalTestRepoPush + }, + TableName: 'fakeGithubLatestPushesPerRefTable' + }) +}) + +test('repo-crawler-for-org-installation', async () => { + // A + const appState = new FakeAppState() + const githubInstallationClient = new FakeGithubInstallationClient() + appState.githubClient.fakeClientsForInstallation.addResponse(48133709, githubInstallationClient) + githubInstallationClient.stubMostRecentEventsForRepo.addResponse( + { + owner: 'cicada-test-org', + repo: 'org-test-repo-one' + }, + [example_org_repo_push] + ) + + // A + await crawlPushes(appState, testOrgInstallation, testOrgTestRepoOne) + + // A + expect(appState.dynamoDB.puts.length).toEqual(2) + expect(appState.dynamoDB.puts[0]).toEqual({ + ConditionExpression: 'attribute_not_exists(PK)', + Item: { + PK: 'ACCOUNT#162483619', + SK: 'REPO#768206479#REF#refs/heads/main#PUSH#COMMIT#8c3aa1cb0316ea23abeb2612457edb80868f53c8', + GSI1PK: 'ACCOUNT#162483619', + GSI1SK: 'REPO#768206479#DATETIME#2024-03-06T17:00:40Z', + _et: 'githubPush', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testOrgTestRepoOnePush + }, + TableName: 'fakeGithubRepoActivityTable' + }) + 
expect(appState.dynamoDB.puts[1]).toEqual({ + ConditionExpression: 'attribute_not_exists(PK) OR #dateTime < :newDateTime', + ExpressionAttributeNames: { + '#dateTime': 'dateTime' + }, + ExpressionAttributeValues: { + ':newDateTime': '2024-03-06T17:00:40Z' + }, + Item: { + PK: 'ACCOUNT#162483619', + SK: 'REPO#768206479#REF#refs/heads/main', + GSI1PK: 'ACCOUNT#162483619', + GSI1SK: 'DATETIME#2024-03-06T17:00:40Z', + _et: 'githubLatestPushPerRef', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testOrgTestRepoOnePush + }, + TableName: 'fakeGithubLatestPushesPerRefTable' + }) +}) diff --git a/test/local/functional/domain/github/crawler/crawlRepositories.test.ts b/test/local/functional/domain/github/crawler/crawlRepositories.test.ts new file mode 100644 index 0000000..064fe0c --- /dev/null +++ b/test/local/functional/domain/github/crawler/crawlRepositories.test.ts @@ -0,0 +1,112 @@ +import { expect, test } from 'vitest' +import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' +import { FakeGithubInstallationClient } from '../../../../../testSupport/fakes/fakeGithubInstallationClient' +import { + testOrgInstallation, + testOrgTestRepoOne, + testOrgTestRepoTwo, + testPersonalInstallation, + testPersonalTestRepo, + testTestUserMembershipOfOrg +} from '../../../../../examples/cicada/githubDomainObjects' +import example_personal_account_repo from '../../../../../examples/github/personal-account/api/repo.json' +import example_org_repos from '../../../../../examples/github/org/api/repos.json' +import { crawlRepositories } from '../../../../../../src/app/domain/github/crawler/crawlRepositories' + +test('repository-crawler-for-personal-account-installation', async () => { + // A + const appState = new FakeAppState() + const githubInstallationClient = new FakeGithubInstallationClient() + appState.githubClient.fakeClientsForInstallation.addResponse(48093071, githubInstallationClient) + githubInstallationClient.stubInstallationRepositories = 
[example_personal_account_repo] + + // A + await crawlRepositories(appState, testPersonalInstallation) + + // A + expect(appState.dynamoDB.batchWrites.length).toEqual(1) + expect(appState.dynamoDB.batchWrites[0]).toEqual({ + RequestItems: { + fakeGithubRepositoriesTable: [ + { + PutRequest: { + Item: { + PK: 'OWNER#162360409', + SK: 'REPO#767679529', + _et: 'githubRepository', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testPersonalTestRepo + } + } + } + ] + } + }) +}) + +test('repository-crawler-for-org-installation', async () => { + // A + const appState = new FakeAppState() + const githubInstallationClient = new FakeGithubInstallationClient() + appState.githubClient.fakeClientsForInstallation.addResponse(48133709, githubInstallationClient) + githubInstallationClient.stubOrganizationRepositories.addResponse('cicada-test-org', example_org_repos) + appState.dynamoDB.stubAllPagesQueries.addResponse( + { + TableName: 'fakeGithubAccountMemberships', + KeyConditionExpression: 'PK = :pk', + ExpressionAttributeValues: { ':pk': 'ACCOUNT#162483619' } + }, + [ + { + $metadata: {}, + Items: [ + { + _et: 'githubAccountMembership', + ...testTestUserMembershipOfOrg + }, + // Old membership that will be deleted + { + _et: 'githubAccountMembership', + ...testTestUserMembershipOfOrg, + userId: 9786 + } + ] + } + ] + ) + + // A + await crawlRepositories(appState, testOrgInstallation) + + // A + expect(appState.dynamoDB.batchWrites.length).toEqual(1) + + expect(appState.dynamoDB.batchWrites[0]).toEqual({ + RequestItems: { + fakeGithubRepositoriesTable: [ + { + PutRequest: { + Item: { + PK: 'OWNER#162483619', + SK: 'REPO#768206479', + _et: 'githubRepository', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testOrgTestRepoOne + } + } + }, + { + PutRequest: { + Item: { + PK: 'OWNER#162483619', + SK: 'REPO#768207426', + _et: 'githubRepository', + _lastUpdated: '2024-02-02T19:00:00.000Z', + ...testOrgTestRepoTwo + } + } + } + ] + } + }) +}) diff --git 
a/test/local/functional/domain/github/crawler/githubRepositoryCrawler.test.ts b/test/local/functional/domain/github/crawler/crawlRunEvents.test.ts similarity index 55% rename from test/local/functional/domain/github/crawler/githubRepositoryCrawler.test.ts rename to test/local/functional/domain/github/crawler/crawlRunEvents.test.ts index 3aed13e..0b57027 100644 --- a/test/local/functional/domain/github/crawler/githubRepositoryCrawler.test.ts +++ b/test/local/functional/domain/github/crawler/crawlRunEvents.test.ts @@ -1,22 +1,18 @@ import { expect, test } from 'vitest' import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' import { FakeGithubInstallationClient } from '../../../../../testSupport/fakes/fakeGithubInstallationClient' -import { crawlRepository } from '../../../../../../src/app/domain/github/crawler/githubRepositoryCrawler' import { testOrgInstallation, testOrgTestRepoOne, - testOrgTestRepoOnePush, testOrgTestRepoOneWorkflowRunOne, testPersonalInstallation, testPersonalTestRepo, - testPersonalTestRepoPush, testPersonalTestRepoWorkflowRun } from '../../../../../examples/cicada/githubDomainObjects' import example_personal_workflow_run from '../../../../../examples/github/personal-account/api/workflowRunEvent.json' -import example_personal_repo_push from '../../../../../examples/github/personal-account/api/repoPush.json' import example_org_workflow_run from '../../../../../examples/github/org/api/workflowRunEvent.json' -import example_org_repo_push from '../../../../../examples/github/org/api/repoPush.json' +import { crawlWorkflowRunEvents } from '../../../../../../src/app/domain/github/crawler/crawlRunEvents' test('repo-crawler-for-personal-account-installation', async () => { // A @@ -31,22 +27,12 @@ test('repo-crawler-for-personal-account-installation', async () => { }, [example_personal_workflow_run] ) - githubInstallationClient.stubMostRecentEventsForRepo.addResponse( - { - owner: 'cicada-test-user', - repo: 'personal-test-repo' - 
}, - [example_personal_repo_push] - ) // A - await crawlRepository(appState, testPersonalInstallation, testPersonalTestRepo, { - crawlChildObjects: 'always', - lookbackDays: 10 - }) + await crawlWorkflowRunEvents(appState, testPersonalInstallation, testPersonalTestRepo, 10) // A - expect(appState.dynamoDB.puts.length).toEqual(4) + expect(appState.dynamoDB.puts.length).toEqual(2) expect(appState.dynamoDB.puts[0]).toEqual({ ConditionExpression: 'attribute_not_exists(PK)', Item: { @@ -79,38 +65,6 @@ test('repo-crawler-for-personal-account-installation', async () => { }, TableName: 'fakeGithubLatestWorkflowRunsTable' }) - expect(appState.dynamoDB.puts[2]).toEqual({ - ConditionExpression: 'attribute_not_exists(PK)', - Item: { - PK: 'ACCOUNT#162360409', - SK: 'REPO#767679529#REF#refs/heads/main#PUSH#COMMIT#dfb5cb80ad3ce5a19a5020b4645696b2d6b4d94c', - GSI1PK: 'ACCOUNT#162360409', - GSI1SK: 'REPO#767679529#DATETIME#2024-03-05T18:01:12Z', - _et: 'githubPush', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testPersonalTestRepoPush - }, - TableName: 'fakeGithubRepoActivityTable' - }) - expect(appState.dynamoDB.puts[3]).toEqual({ - ConditionExpression: 'attribute_not_exists(PK) OR #dateTime < :newDateTime', - ExpressionAttributeNames: { - '#dateTime': 'dateTime' - }, - ExpressionAttributeValues: { - ':newDateTime': '2024-03-05T18:01:12Z' - }, - Item: { - PK: 'ACCOUNT#162360409', - SK: 'REPO#767679529#REF#refs/heads/main', - GSI1PK: 'ACCOUNT#162360409', - GSI1SK: 'DATETIME#2024-03-05T18:01:12Z', - _et: 'githubLatestPushPerRef', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testPersonalTestRepoPush - }, - TableName: 'fakeGithubLatestPushesPerRefTable' - }) }) test('repo-crawler-for-org-installation', async () => { @@ -126,22 +80,12 @@ test('repo-crawler-for-org-installation', async () => { }, [example_org_workflow_run] ) - githubInstallationClient.stubMostRecentEventsForRepo.addResponse( - { - owner: 'cicada-test-org', - repo: 'org-test-repo-one' - }, - 
[example_org_repo_push] - ) // A - await crawlRepository(appState, testOrgInstallation, testOrgTestRepoOne, { - crawlChildObjects: 'always', - lookbackDays: 10 - }) + await crawlWorkflowRunEvents(appState, testOrgInstallation, testOrgTestRepoOne, 10) // A - expect(appState.dynamoDB.puts.length).toEqual(4) + expect(appState.dynamoDB.puts.length).toEqual(2) expect(appState.dynamoDB.puts[0]).toEqual({ ConditionExpression: 'attribute_not_exists(PK)', Item: { @@ -174,36 +118,4 @@ test('repo-crawler-for-org-installation', async () => { }, TableName: 'fakeGithubLatestWorkflowRunsTable' }) - expect(appState.dynamoDB.puts[2]).toEqual({ - ConditionExpression: 'attribute_not_exists(PK)', - Item: { - PK: 'ACCOUNT#162483619', - SK: 'REPO#768206479#REF#refs/heads/main#PUSH#COMMIT#8c3aa1cb0316ea23abeb2612457edb80868f53c8', - GSI1PK: 'ACCOUNT#162483619', - GSI1SK: 'REPO#768206479#DATETIME#2024-03-06T17:00:40Z', - _et: 'githubPush', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testOrgTestRepoOnePush - }, - TableName: 'fakeGithubRepoActivityTable' - }) - expect(appState.dynamoDB.puts[3]).toEqual({ - ConditionExpression: 'attribute_not_exists(PK) OR #dateTime < :newDateTime', - ExpressionAttributeNames: { - '#dateTime': 'dateTime' - }, - ExpressionAttributeValues: { - ':newDateTime': '2024-03-06T17:00:40Z' - }, - Item: { - PK: 'ACCOUNT#162483619', - SK: 'REPO#768206479#REF#refs/heads/main', - GSI1PK: 'ACCOUNT#162483619', - GSI1SK: 'DATETIME#2024-03-06T17:00:40Z', - _et: 'githubLatestPushPerRef', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testOrgTestRepoOnePush - }, - TableName: 'fakeGithubLatestPushesPerRefTable' - }) }) diff --git a/test/local/functional/domain/github/crawler/githubInstallationCrawler.test.ts b/test/local/functional/domain/github/crawler/crawlUsers.test.ts similarity index 67% rename from test/local/functional/domain/github/crawler/githubInstallationCrawler.test.ts rename to test/local/functional/domain/github/crawler/crawlUsers.test.ts index 
3167ae8..2d4d02b 100644 --- a/test/local/functional/domain/github/crawler/githubInstallationCrawler.test.ts +++ b/test/local/functional/domain/github/crawler/crawlUsers.test.ts @@ -1,38 +1,32 @@ import { expect, test } from 'vitest' import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' import { FakeGithubInstallationClient } from '../../../../../testSupport/fakes/fakeGithubInstallationClient' -import { crawlInstallation } from '../../../../../../src/app/domain/github/crawler/githubInstallationCrawler' import { testMikeRobertsUser, testMikeRobertsUserMembershipOfOrg, testOrgInstallation, - testOrgTestRepoOne, - testOrgTestRepoTwo, testPersonalInstallation, - testPersonalTestRepo, testTestUser, testTestUserMembershipOfOrg, testTestUserMembershipOfPersonalInstallation } from '../../../../../examples/cicada/githubDomainObjects' import example_personal_account_user from '../../../../../examples/github/personal-account/api/user.json' -import example_personal_account_repo from '../../../../../examples/github/personal-account/api/repo.json' import example_org_users from '../../../../../examples/github/org/api/users.json' -import example_org_repos from '../../../../../examples/github/org/api/repos.json' +import { crawlUsers } from '../../../../../../src/app/domain/github/crawler/crawlUsers' -test('installation-crawler-for-personal-account-installation', async () => { +test('user-crawler-for-personal-account-installation', async () => { // A const appState = new FakeAppState() const githubInstallationClient = new FakeGithubInstallationClient() appState.githubClient.fakeClientsForInstallation.addResponse(48093071, githubInstallationClient) githubInstallationClient.stubUsers.addResponse('cicada-test-user', example_personal_account_user) - githubInstallationClient.stubInstallationRepositories = [example_personal_account_repo] // A - await crawlInstallation(appState, testPersonalInstallation, { crawlChildObjects: 'never', lookbackDays: 7 }) + await 
crawlUsers(appState, testPersonalInstallation) // A - expect(appState.dynamoDB.batchWrites.length).toEqual(3) + expect(appState.dynamoDB.batchWrites.length).toEqual(2) expect(appState.dynamoDB.batchWrites[0]).toEqual({ RequestItems: { fakeGithubUsersTable: [ @@ -68,32 +62,15 @@ test('installation-crawler-for-personal-account-installation', async () => { ] } }) - expect(appState.dynamoDB.batchWrites[2]).toEqual({ - RequestItems: { - fakeGithubRepositoriesTable: [ - { - PutRequest: { - Item: { - PK: 'OWNER#162360409', - SK: 'REPO#767679529', - _et: 'githubRepository', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testPersonalTestRepo - } - } - } - ] - } - }) }) -test('installation-crawler-for-org-installation', async () => { +test('user-crawler-for-org-installation', async () => { // A const appState = new FakeAppState() const githubInstallationClient = new FakeGithubInstallationClient() appState.githubClient.fakeClientsForInstallation.addResponse(48133709, githubInstallationClient) githubInstallationClient.stubOrganizationMembers.addResponse('cicada-test-org', example_org_users) - githubInstallationClient.stubOrganizationRepositories.addResponse('cicada-test-org', example_org_repos) + appState.dynamoDB.stubAllPagesQueries.addResponse( { TableName: 'fakeGithubAccountMemberships', @@ -120,10 +97,10 @@ test('installation-crawler-for-org-installation', async () => { ) // A - await crawlInstallation(appState, testOrgInstallation, { crawlChildObjects: 'never', lookbackDays: 7 }) + await crawlUsers(appState, testOrgInstallation) // A - expect(appState.dynamoDB.batchWrites.length).toEqual(4) + expect(appState.dynamoDB.batchWrites.length).toEqual(3) expect(appState.dynamoDB.batchWrites[0]).toEqual({ RequestItems: { fakeGithubUsersTable: [ @@ -185,32 +162,4 @@ test('installation-crawler-for-org-installation', async () => { ] } }) - expect(appState.dynamoDB.batchWrites[3]).toEqual({ - RequestItems: { - fakeGithubRepositoriesTable: [ - { - PutRequest: { - Item: { - PK: 
'OWNER#162483619', - SK: 'REPO#768206479', - _et: 'githubRepository', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testOrgTestRepoOne - } - } - }, - { - PutRequest: { - Item: { - PK: 'OWNER#162483619', - SK: 'REPO#768207426', - _et: 'githubRepository', - _lastUpdated: '2024-02-02T19:00:00.000Z', - ...testOrgTestRepoTwo - } - } - } - ] - } - }) }) diff --git a/test/local/functional/domain/github/webhook/githubWebhookInstallationProcessor.test.ts b/test/local/functional/domain/github/webhook/githubWebhookInstallationProcessor.test.ts index 39ceaf5..469d439 100644 --- a/test/local/functional/domain/github/webhook/githubWebhookInstallationProcessor.test.ts +++ b/test/local/functional/domain/github/webhook/githubWebhookInstallationProcessor.test.ts @@ -1,13 +1,8 @@ import { expect, test } from 'vitest' import { FakeAppState } from '../../../../../testSupport/fakes/fakeAppState' import { githubWebhookInstallationProcessor } from '../../../../../../src/app/domain/github/webhookProcessor/processors/githubWebhookInstallationProcessor' -import { FakeGithubInstallationClient } from '../../../../../testSupport/fakes/fakeGithubInstallationClient' import example_installation_created from '../../../../../examples/github/org/webhook/installationCreated.json' -import example_org_users from '../../../../../examples/github/org/api/users.json' -import example_org_repos from '../../../../../examples/github/org/api/repos.json' -import example_org_workflow_run from '../../../../../examples/github/org/api/workflowRunEvent.json' -import example_org_repo_push from '../../../../../examples/github/org/api/repoPush.json' import { testOrgInstallation } from '../../../../../examples/cicada/githubDomainObjects' test('installation-webhook-for-org-account-installation', async () => { @@ -17,46 +12,12 @@ test('installation-webhook-for-org-account-installation', async () => { ...appState.config.fakeGithubConfig, appId: '850768' } - const githubInstallationClient = new 
FakeGithubInstallationClient() - appState.githubClient.fakeClientsForInstallation.addResponse(48133709, githubInstallationClient) - githubInstallationClient.stubOrganizationMembers.addResponse('cicada-test-org', example_org_users) - githubInstallationClient.stubOrganizationRepositories.addResponse('cicada-test-org', example_org_repos) - githubInstallationClient.stubWorkflowRunsForRepo.addResponse( - { - owner: 'cicada-test-org', - repo: 'org-test-repo-one', - created: '>2023-11-04T19:00:00.000Z' - }, - [example_org_workflow_run] - ) - githubInstallationClient.stubWorkflowRunsForRepo.addResponse( - { - owner: 'cicada-test-org', - repo: 'org-test-repo-two', - created: '>2023-11-04T19:00:00.000Z' - }, - [] - ) - githubInstallationClient.stubMostRecentEventsForRepo.addResponse( - { - owner: 'cicada-test-org', - repo: 'org-test-repo-one' - }, - [example_org_repo_push] - ) - githubInstallationClient.stubMostRecentEventsForRepo.addResponse( - { - owner: 'cicada-test-org', - repo: 'org-test-repo-two' - }, - [] - ) // A await githubWebhookInstallationProcessor(appState, JSON.stringify(example_installation_created)) // A - expect(appState.dynamoDB.puts.length).toEqual(5) + expect(appState.dynamoDB.puts.length).toEqual(1) expect(appState.dynamoDB.puts[0]).toEqual({ Item: { PK: 'ACCOUNT#162483619', @@ -66,10 +27,10 @@ test('installation-webhook-for-org-account-installation', async () => { }, TableName: 'fakeGithubInstallationsTable' }) - expect(appState.dynamoDB.puts[1].Item?.['_et']).toEqual('githubWorkflowRunEvent') - expect(appState.dynamoDB.puts[2].Item?.['_et']).toEqual('githubLatestWorkflowRunEvent') - expect(appState.dynamoDB.puts[3].Item?.['_et']).toEqual('githubPush') - expect(appState.dynamoDB.puts[4].Item?.['_et']).toEqual('githubLatestPushPerRef') - - expect(appState.dynamoDB.batchWrites.length).toEqual(3) + expect(appState.eventBridgeBus.sentEvents.length).toEqual(1) + expect(appState.eventBridgeBus.sentEvents[0]).toEqual({ + detailType: 'InstallationUpdated', + 
detail: + '{"data":{"installationId":48133709,"appId":850768,"appSlug":"cicada-test-org","accountLogin":"cicada-test-org","accountId":162483619,"accountType":"organization"}}' + }) }) diff --git a/test/local/unit/util/collections.test.ts b/test/local/unit/util/collections.test.ts index 1aa9cd2..49f1caf 100644 --- a/test/local/unit/util/collections.test.ts +++ b/test/local/unit/util/collections.test.ts @@ -3,6 +3,7 @@ import { arrayDifferenceDeep, excludeKeys, mergeOrderedLists, + removeNullAndUndefined, selectKeys } from '../../../../src/app/util/collections' @@ -51,3 +52,10 @@ test('array difference', () => { { x: 3 } ]) }) + +test('removeNullAndUndefined', () => { + expect(removeNullAndUndefined([])).toEqual([]) + expect(removeNullAndUndefined([null, undefined])).toEqual([]) + expect(removeNullAndUndefined([1, null, undefined, 1])).toEqual([1, 1]) + expect(removeNullAndUndefined([null, 1, undefined, 1])).toEqual([1, 1]) +})