Skip to content

Commit

Permalink
Use Step Functions for crawler orchestration
Browse files Browse the repository at this point in the history
Before this change all crawler orchestration happened in-Lambda.
Moving to step functions is useful for a few reasons:
1 - Soonish I'd like to add the ability to add public repos,
and updating those can also use this crawler logic
2 - Going to have to deal with GitHub rate limiting and retries at some point - Step Functions is a good use for that
3 - Async data loading is a good example of Step Functions anyway, and it's good to have Step Functions in Cicada somewhere from an example point of view
  • Loading branch information
mikebroberts committed Apr 26, 2024
1 parent 9b3bddf commit a6c24ba
Show file tree
Hide file tree
Showing 29 changed files with 745 additions and 453 deletions.
12 changes: 1 addition & 11 deletions docs/SettingUpCicada.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,4 @@ Cicada has to be **registered** and **installed** as a "GitHub App" in your GitH

Finally you can go back to the Cicada Home Page (`https://YOUR_WEB_HOST_NAME`) and login, at which point setup is complete.

## Load older data

Depending on how recently you've had activity in your GitHub account you may already see some data from the logged-in home screen. If not then:

* Go to the Lambda Console in your account
* Go to the _githubCrawler_ function for your installation
* Go to the "Test" tab
* Set the _Event JSON_ section to `{"lookbackDays": 90}` (adjust the value to more or few days)
* Click the "Test" button

This will load more data from your GitHub account. NB: only GitHub Actions Runs data will be available more than a couple of weeks in the past - you won't see any older Push events than that because they're not available from GitHub.
On installation, Cicada loads the past 30 days of workflow run events, and however many pushes GitHub will provide - usually a few days.
10 changes: 0 additions & 10 deletions src/app/domain/github/crawler/crawlConfiguration.ts

This file was deleted.

12 changes: 12 additions & 0 deletions src/app/domain/github/crawler/crawlInstallations.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { processRawInstallation } from '../githubInstallation'
import { AppState } from '../../../environment/AppState'
import { removeNullAndUndefined } from '../../../util/collections'
import { GithubInstallation } from '../../types/GithubInstallation'

export async function crawlInstallations(appState: AppState): Promise<GithubInstallation[]> {
const installations = await appState.githubClient.listInstallations()

return removeNullAndUndefined(
await Promise.all(installations.map(async (raw) => processRawInstallation(appState, raw)))
)
}
Original file line number Diff line number Diff line change
@@ -1,53 +1,23 @@
import { AppState } from '../../../environment/AppState'
import { GithubInstallation } from '../../types/GithubInstallation'
import { GithubRepository } from '../../types/GithubRepository'
import { processRawRunEvents } from '../githubWorkflowRunEvent'
import { processPushes } from '../githubPush'
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
import { dateTimeAddDays } from '../../../util/dateAndTime'
import { isRawGithubPushEventEvent } from '../../types/rawGithub/RawGithubAPIPushEventEvent'
import { fromRawGithubPushEventEvent, GithubPush } from '../../types/GithubPush'
import { CrawlConfiguration } from './crawlConfiguration'

export async function crawlRepository(
appState: AppState,
installation: GithubInstallation,
repo: GithubRepository,
crawlConfiguration: CrawlConfiguration
) {
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
await crawlRunEvents(appState, githubClient, repo, crawlConfiguration)
await crawlPushes(appState, githubClient, repo)
}

export async function crawlRunEvents(
appState: AppState,
githubClient: GithubInstallationClient,
repo: GithubRepository,
crawlConfiguration: CrawlConfiguration
) {
const startTime = `${dateTimeAddDays(
appState.clock.now(),
-1 * crawlConfiguration.lookbackDays
).toISOString()}`

const recentRunEvents = await githubClient.listWorkflowRunsForRepo(
repo.ownerName,
repo.name,
`>${startTime}`
)
await processRawRunEvents(appState, recentRunEvents, false)
}
import { processPushes } from '../githubPush'
import { GithubInstallation } from '../../types/GithubInstallation'

// TOEventually - only get all pushes back to lookback in crawl configuration, however GitHub doesn't keep
// them around for very long
export async function crawlPushes(
appState: AppState,
githubClient: GithubInstallationClient,
// the owner ID on repo isn't sufficient when we are crawling public repos from other accounts
installation: GithubInstallation,
repo: GithubRepository
) {
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const allEventsForRepo = await githubClient.listMostRecentEventsForRepo(repo.ownerName, repo.name)
const rawPushes = allEventsForRepo.filter(isRawGithubPushEventEvent)
// TODO - this comment was from pre-step-functions version. Is there something that can be improved now
// repo is in context?
// For now do translation to internal pushes here since we need context of repo details, which aren't in the raw push
// (this isn't required for webhook translation)
const pushes = rawPushes
Expand Down
21 changes: 21 additions & 0 deletions src/app/domain/github/crawler/crawlRepositories.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { AppState } from '../../../environment/AppState'
import { GithubInstallation } from '../../types/GithubInstallation'
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
import { processRawRepositories } from '../githubRepository'
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/githubCommonTypes'

export async function crawlRepositories(appState: AppState, installation: GithubInstallation) {
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const latestRawRepositories = await readRawRepositories(installation, githubClient)
return await processRawRepositories(appState, latestRawRepositories)
}

async function readRawRepositories(installation: GithubInstallation, githubClient: GithubInstallationClient) {
if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) {
return await githubClient.listOrganizationRepositories(installation.accountLogin)
} else if (installation.accountType === USER_ACCOUNT_TYPE) {
return await githubClient.listInstallationRepositories()
} else {
throw new Error(`Unknown installation account type: ${installation.accountType}`)
}
}
23 changes: 23 additions & 0 deletions src/app/domain/github/crawler/crawlRunEvents.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { AppState } from '../../../environment/AppState'
import { GithubRepository } from '../../types/GithubRepository'
import { dateTimeAddDays } from '../../../util/dateAndTime'
import { processRawRunEvents } from '../githubWorkflowRunEvent'
import { GithubInstallation } from '../../types/GithubInstallation'

export async function crawlWorkflowRunEvents(
appState: AppState,
// the owner ID on repo isn't sufficient when we are crawling public repos from other accounts
installation: GithubInstallation,
repo: GithubRepository,
lookbackDays: number
) {
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const startTime = `${dateTimeAddDays(appState.clock.now(), -1 * lookbackDays).toISOString()}`

const recentRunEvents = await githubClient.listWorkflowRunsForRepo(
repo.ownerName,
repo.name,
`>${startTime}`
)
await processRawRunEvents(appState, recentRunEvents, false)
}
23 changes: 23 additions & 0 deletions src/app/domain/github/crawler/crawlUsers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { AppState } from '../../../environment/AppState'
import { GithubInstallation } from '../../types/GithubInstallation'
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
import { processRawUsers } from '../githubUser'
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/githubCommonTypes'

export async function crawlUsers(appState: AppState, installation: GithubInstallation) {
const latestRawUsers = await readRawUsers(
installation,
appState.githubClient.clientForInstallation(installation.installationId)
)
await processRawUsers(appState, latestRawUsers, installation)
}

async function readRawUsers(installation: GithubInstallation, githubClient: GithubInstallationClient) {
if (installation.accountType === ORGANIZATION_ACCOUNT_TYPE) {
return await githubClient.listOrganizationMembers(installation.accountLogin)
} else if (installation.accountType === USER_ACCOUNT_TYPE) {
return [await githubClient.getUser(installation.accountLogin)]
} else {
throw new Error(`Unknown installation account type: ${installation.accountType}`)
}
}
12 changes: 0 additions & 12 deletions src/app/domain/github/crawler/githubAppCrawler.ts

This file was deleted.

56 changes: 0 additions & 56 deletions src/app/domain/github/crawler/githubInstallationCrawler.ts

This file was deleted.

27 changes: 6 additions & 21 deletions src/app/domain/github/githubInstallation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,18 @@ import { GithubInstallationEntity } from '../entityStore/entities/GithubInstalla
import { logger } from '../../util/logging'
import deepEqual from 'deep-equal'
import { RawGithubInstallation } from '../types/rawGithub/RawGithubInstallation'
import { crawlInstallation } from './crawler/githubInstallationCrawler'
import { calculateCrawlChildren, CrawlConfiguration } from './crawler/crawlConfiguration'

export async function processRawInstallation(
appState: AppState,
rawInstallation: RawGithubInstallation,
crawlConfiguration: CrawlConfiguration
) {
await processInstallation(appState, fromRawGithubInstallation(rawInstallation), crawlConfiguration)
export async function processRawInstallation(appState: AppState, rawInstallation: RawGithubInstallation) {
return await processInstallation(appState, fromRawGithubInstallation(rawInstallation))
}

export async function processInstallation(
appState: AppState,
installation: GithubInstallation,
crawlConfiguration: CrawlConfiguration
) {
export async function processInstallation(appState: AppState, installation: GithubInstallation) {
if (`${installation.appId}` !== (await appState.config.github()).appId) {
logger.warn(`Not processing invalid installation - unexpected app ID`)
return
return null
}

const { installationStateChanged } = await saveInstallation(appState, installation)
if (calculateCrawlChildren(crawlConfiguration, installationStateChanged)) {
await crawlInstallation(appState, installation, crawlConfiguration)
}
return await saveInstallation(appState, installation)
}

async function saveInstallation(appState: AppState, installation: GithubInstallation) {
Expand Down Expand Up @@ -56,9 +43,7 @@ async function saveInstallation(appState: AppState, installation: GithubInstalla
await installationsStore.put(installation)
}

return {
installationStateChanged
}
return installation
}

export function installationsEqual(x: GithubInstallation, y: GithubInstallation) {
Expand Down
36 changes: 4 additions & 32 deletions src/app/domain/github/githubRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,15 @@ import { AppState } from '../../environment/AppState'
import { RawGithubRepository } from '../types/rawGithub/RawGithubRepository'
import { fromRawGithubRepository, GithubRepository } from '../types/GithubRepository'
import { GithubRepositoryEntity } from '../entityStore/entities/GithubRepositoryEntity'
import { GithubInstallation } from '../types/GithubInstallation'
import { crawlRepository } from './crawler/githubRepositoryCrawler'
import { CrawlConfiguration } from './crawler/crawlConfiguration'

export async function processRawRepositories(
appState: AppState,
installation: GithubInstallation,
rawRepos: RawGithubRepository[],
crawlChildResources: CrawlConfiguration
) {
await processRepositories(
appState,
installation,
rawRepos.map(fromRawGithubRepository),
crawlChildResources
)
export async function processRawRepositories(appState: AppState, rawRepos: RawGithubRepository[]) {
return await saveRepositories(appState, rawRepos.map(fromRawGithubRepository))
}

export async function processRepositories(
appState: AppState,
installation: GithubInstallation,
repos: GithubRepository[],
crawlConfiguration: CrawlConfiguration
) {
await saveRepositories(appState, repos)
// TOEventually - delete repos that don't exist any more
// TOEventually - figure out what actually changed and just crawl them
const crawlChildResources =
crawlConfiguration.crawlChildObjects === 'always' || crawlConfiguration.crawlChildObjects == 'ifChanged'
for (const repo of crawlChildResources ? repos : []) {
await crawlRepository(appState, installation, repo, crawlConfiguration)
}
}

export async function saveRepositories(appState: AppState, repos: GithubRepository[]) {
async function saveRepositories(appState: AppState, repos: GithubRepository[]) {
// Just put all repos since there may have been updates to details
await appState.entityStore.for(GithubRepositoryEntity).advancedOperations.batchPut(repos)
return repos
}

export async function getRepository(appState: AppState, accountId: number, repoId: number) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import { fromRawGithubInstallation } from '../../../types/GithubInstallation'
import { RawGithubInstallation } from '../../../types/rawGithub/RawGithubInstallation'
import { processInstallation } from '../../githubInstallation'
import { AppState } from '../../../../environment/AppState'
import { WebhookProcessor } from '../WebhookProcessor'
import { processInstallation } from '../../githubInstallation'
import { sendToEventBridge } from '../../../../outboundInterfaces/eventBridgeBus'
import { EVENTBRIDGE_DETAIL_TYPES } from '../../../../../multipleContexts/eventBridge'

export const githubWebhookInstallationProcessor: WebhookProcessor = async (
appState: AppState,
body: string
): Promise<void> => {
// TOEventually - need to differentiate sub-types of installation - e.g. deleted
// TOEventually - type check, e.g. with AJV
const parsed = fromRawGithubInstallation(JSON.parse(body).installation as RawGithubInstallation)
if (!parsed) {
const installation = fromRawGithubInstallation(JSON.parse(body).installation as RawGithubInstallation)
if (!installation) {
return
}
await processInstallation(appState, parsed, {
crawlChildObjects: 'ifChanged',
lookbackDays: 90
})

await processInstallation(appState, installation)
await sendToEventBridge(appState, EVENTBRIDGE_DETAIL_TYPES.INSTALLATION_UPDATED, installation)
}
Loading

0 comments on commit a6c24ba

Please sign in to comment.