Move "crawl installation" to a single Lambda call
Adding public repos brings in more business logic, which is
always a pain with Step Functions. Also, since GitHub's "best
practices" recommend against parallel API calls, we probably
don't want to parallelize these tasks anyway.

There's some benefit to being able to do retries at a lower level,
but errors should be rare anyway.

I may roll this back in the future, but for now I think this is
easier.
mikebroberts committed Sep 18, 2024
1 parent c6f8ad2 commit 5fdebe6
Showing 10 changed files with 62 additions and 187 deletions.
25 changes: 25 additions & 0 deletions src/app/domain/github/crawler/crawlInstallation.ts
@@ -0,0 +1,25 @@
import { AppState } from '../../../environment/AppState'
import { GithubInstallation } from '../../types/GithubInstallation'
import { crawlUsers } from './crawlUsers'
import { crawlRepositories } from './crawlRepositories'
import { crawlPushes } from './crawlPushes'
import { crawlWorkflowRunEvents } from './crawlRunEvents'
import { logger } from '../../../util/logging'

export async function crawlInstallation(
appState: AppState,
installation: GithubInstallation,
lookbackDays: number
) {
logger.info(`Crawling Installation for ${installation.accountLogin}`)
await crawlUsers(appState, installation)
const repos = await crawlRepositories(appState, installation)
// Eventually consider doing some parallelization here (or move back to step function) but
// need to be careful since GitHub gets twitchy about concurrent requests to the API
// Their "best practice" doc says don't do it, but their rate limit doc says it's supported
// Only really need to care if things start getting slow
for (const repo of repos) {
await crawlPushes(appState, installation, repo)
await crawlWorkflowRunEvents(appState, installation, repo, lookbackDays)
}
}
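
(Aside, not part of this commit: the comment in crawlInstallation above keeps the per-repo loop strictly sequential. If it ever gets slow, a middle ground between this and an unbounded Promise.all is a small fixed concurrency limit. The sketch below is illustrative only; forEachWithConcurrency and the limit of 2 are assumptions, not code from this repo.)

// Illustrative sketch: crawl repos in small batches instead of one at a time.
// GitHub's guidance discourages heavy concurrency, so keep the limit low and
// watch rate-limit headers before going further.
async function forEachWithConcurrency<T>(
  items: T[],
  limit: number,
  worker: (item: T) => Promise<void>
): Promise<void> {
  for (let i = 0; i < items.length; i += limit) {
    // Each batch runs in parallel; the batches themselves run sequentially
    await Promise.all(items.slice(i, i + limit).map(worker))
  }
}

// Hypothetical usage inside crawlInstallation:
// await forEachWithConcurrency(repos, 2, async (repo) => {
//   await crawlPushes(appState, installation, repo)
//   await crawlWorkflowRunEvents(appState, installation, repo, lookbackDays)
// })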
2 changes: 2 additions & 0 deletions src/app/domain/github/crawler/crawlInstallations.ts
@@ -2,8 +2,10 @@ import { processRawInstallation } from '../githubInstallation'
import { AppState } from '../../../environment/AppState'
import { removeNullAndUndefined } from '../../../util/collections'
import { GithubInstallation } from '../../types/GithubInstallation'
import { logger } from '../../../util/logging'

export async function crawlInstallations(appState: AppState): Promise<GithubInstallation[]> {
logger.info(`Crawling Installations`)
const installations = await appState.githubClient.listInstallations()

return removeNullAndUndefined(
2 changes: 2 additions & 0 deletions src/app/domain/github/crawler/crawlPushes.ts
@@ -4,6 +4,7 @@ import { isRawGithubPushEventEvent } from '../../types/rawGithub/RawGithubAPIPus
import { fromRawGithubPushEventEvent, GithubPush } from '../../types/GithubPush'
import { processPushes } from '../githubPush'
import { GithubInstallation } from '../../types/GithubInstallation'
import { logger } from '../../../util/logging'

// TOEventually - only get all pushes back to lookback in crawl configuration, however GitHub doesn't keep
// them around for very long
@@ -13,6 +14,7 @@ export async function crawlPushes(
installation: GithubInstallation,
repo: GithubRepositorySummary
) {
logger.info(`Crawling Pushes for ${installation.accountLogin}/${repo.name}`)
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const allEventsForRepo = await githubClient.listMostRecentEventsForRepo(repo.ownerName, repo.name)
const rawPushes = allEventsForRepo.filter(isRawGithubPushEventEvent)
2 changes: 2 additions & 0 deletions src/app/domain/github/crawler/crawlRepositories.ts
@@ -3,8 +3,10 @@ import { GithubInstallation } from '../../types/GithubInstallation'
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
import { processRawRepositories, toRepositorySummary } from '../githubRepository'
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/GithubAccountType'
import { logger } from '../../../util/logging'

export async function crawlRepositories(appState: AppState, installation: GithubInstallation) {
logger.info(`Crawling Repositories for ${installation.accountLogin}`)
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const latestRawRepositories = await readRawRepositories(installation, githubClient)
const repos = await processRawRepositories(appState, latestRawRepositories)
2 changes: 2 additions & 0 deletions src/app/domain/github/crawler/crawlRunEvents.ts
@@ -3,6 +3,7 @@ import { GithubRepositorySummary } from '../../types/GithubRepository'
import { dateTimeAddDays } from '../../../util/dateAndTime'
import { processRawRunEvents } from '../githubWorkflowRunEvent'
import { GithubInstallation } from '../../types/GithubInstallation'
import { logger } from '../../../util/logging'

export async function crawlWorkflowRunEvents(
appState: AppState,
@@ -11,6 +12,7 @@ export async function crawlWorkflowRunEvents(
repo: GithubRepositorySummary,
lookbackDays: number
) {
logger.info(`Crawling Run Events for ${installation.accountLogin}/${repo.name}`)
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
const startTime = `${dateTimeAddDays(appState.clock.now(), -1 * lookbackDays).toISOString()}`

2 changes: 2 additions & 0 deletions src/app/domain/github/crawler/crawlUsers.ts
@@ -3,8 +3,10 @@ import { GithubInstallation } from '../../types/GithubInstallation'
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
import { processRawUsers } from '../githubUser'
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/GithubAccountType'
import { logger } from '../../../util/logging'

export async function crawlUsers(appState: AppState, installation: GithubInstallation) {
logger.info(`Crawling Users for ${installation.accountLogin}`)
const latestRawUsers = await readRawUsers(
installation,
appState.githubClient.clientForInstallation(installation.installationId)
69 changes: 15 additions & 54 deletions src/app/lambdaFunctions/githubCrawlTask/githubCrawlTaskEvents.ts
@@ -1,76 +1,37 @@
import { GithubInstallation, isGithubInstallation } from '../../domain/types/GithubInstallation'
import { GithubRepositorySummary, isGithubRepositorySummary } from '../../domain/types/GithubRepository'
import { throwError } from '@symphoniacloud/dynamodb-entity-store'
import {
CRAWLABLE_RESOURCES,
CrawlableResource,
isCrawlableResource
} from '../../../multipleContexts/githubCrawler' // TOEventually - safer type checking here

// TOEventually - safer type checking here
} from '../../../multipleContexts/githubCrawler'
import { isNotNullObject } from '../../util/types' // TOEventually - safer type checking here

export type CrawlEvent = { resourceType: CrawlableResource }
type CrawlEventWithInstallation = CrawlEvent & { installation: GithubInstallation }
type CrawlEventWithRepositorySummary = CrawlEvent & { repository: GithubRepositorySummary }

export function isCrawlEvent(x: unknown): x is CrawlEvent {
return x !== undefined && isCrawlableResource((x as CrawlEvent).resourceType)
}

export function isCrawlEventWithInstallation(x: CrawlEvent): x is CrawlEventWithInstallation {
const candidate = x as CrawlEventWithInstallation
return candidate.installation && isGithubInstallation(candidate.installation)
}

export function isCrawlEventWithRepositorySummary(x: CrawlEvent): x is CrawlEventWithRepositorySummary {
const candidate = x as CrawlEventWithRepositorySummary
return candidate.repository && isGithubRepositorySummary(candidate.repository)
return isNotNullObject(x) && 'resourceType' in x && isCrawlableResource(x.resourceType)
}

export type CrawlInstallationsEvent = { resourceType: 'installations' }
export type CrawlUsersEvent = { resourceType: 'users' } & CrawlEventWithInstallation
export type CrawlRepositoriesEvent = { resourceType: 'repositories' } & CrawlEventWithInstallation
export type CrawlPushesEvent = { resourceType: 'pushes' } & CrawlEventWithInstallation &
CrawlEventWithRepositorySummary
export type CrawlWorkflowRunEventsEvent = {
resourceType: 'pushes'

export type CrawlInstallationEvent = {
resourceType: 'installation'
installation: GithubInstallation
lookbackDays: number
} & CrawlEventWithInstallation &
CrawlEventWithRepositorySummary
}

export function isCrawlInstallationsEvent(x: CrawlEvent): x is CrawlInstallationsEvent {
return x.resourceType === CRAWLABLE_RESOURCES.INSTALLATIONS
}

export function isCrawlUsersEvent(x: CrawlEvent): x is CrawlUsersEvent {
if (x.resourceType !== CRAWLABLE_RESOURCES.USERS) return false
return (
isCrawlEventWithInstallation(x) ||
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.USERS} : ${JSON.stringify(x)}`)()
)
}

export function isCrawlRepositoriesEvent(x: CrawlEvent): x is CrawlRepositoriesEvent {
if (x.resourceType !== CRAWLABLE_RESOURCES.REPOSITORIES) return false
return (
isCrawlEventWithInstallation(x) ||
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.REPOSITORIES} : ${JSON.stringify(x)}`)()
)
}

export function isCrawlPushesEvent(x: CrawlEvent): x is CrawlPushesEvent {
if (x.resourceType !== CRAWLABLE_RESOURCES.PUSHES) return false
return (
(isCrawlEventWithInstallation(x) && isCrawlEventWithRepositorySummary(x)) ||
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.PUSHES} : ${JSON.stringify(x)}`)()
)
}

export function isCrawlWorkflowRunEventsEvent(x: CrawlEvent): x is CrawlWorkflowRunEventsEvent {
if (x.resourceType !== CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS) return false
const hasLookBackDays = typeof (x as CrawlWorkflowRunEventsEvent).lookbackDays !== undefined
export function isCrawlInstallationEvent(x: CrawlEvent): x is CrawlInstallationEvent {
if (x.resourceType !== CRAWLABLE_RESOURCES.INSTALLATION) return false
return (
(hasLookBackDays && isCrawlEventWithInstallation(x) && isCrawlEventWithRepositorySummary(x)) ||
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS} : ${JSON.stringify(x)}`)()
('installation' in x &&
isGithubInstallation(x.installation) &&
'lookbackDays' in x &&
typeof x.lookbackDays === 'number') ||
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.INSTALLATION} : ${JSON.stringify(x)}`)()
)
}
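
(Aside: for reference, a payload accepted by the new isCrawlInstallationEvent guard looks roughly like the object below. This is a sketch based on the types in this diff; the field values are invented, and GithubInstallation has more fields than shown.)

// Illustrative event payload for the single 'installation' crawl task.
// The guard checks resourceType, that 'installation' passes isGithubInstallation,
// and that lookbackDays is a number.
const exampleCrawlInstallationEvent = {
  resourceType: 'installation',
  installation: {
    installationId: 12345678, // hypothetical value
    accountLogin: 'example-org' // hypothetical value
    // ...remaining GithubInstallation fields
  },
  lookbackDays: 30
}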
30 changes: 4 additions & 26 deletions src/app/lambdaFunctions/githubCrawlTask/lambda.ts
@@ -5,19 +5,9 @@ import middy from '@middy/core'
import { powertoolsMiddlewares } from '../../middleware/standardMiddleware'
import { logger } from '../../util/logging'
import { isFailure } from '../../util/structuredResult'
import { crawlPushes } from '../../domain/github/crawler/crawlPushes'
import { crawlRepositories } from '../../domain/github/crawler/crawlRepositories'
import { crawlInstallations } from '../../domain/github/crawler/crawlInstallations'
import { crawlUsers } from '../../domain/github/crawler/crawlUsers'
import {
isCrawlEvent,
isCrawlInstallationsEvent,
isCrawlPushesEvent,
isCrawlRepositoriesEvent,
isCrawlUsersEvent,
isCrawlWorkflowRunEventsEvent
} from './githubCrawlTaskEvents'
import { crawlWorkflowRunEvents } from '../../domain/github/crawler/crawlRunEvents'
import { isCrawlEvent, isCrawlInstallationEvent, isCrawlInstallationsEvent } from './githubCrawlTaskEvents'
import { crawlInstallation } from '../../domain/github/crawler/crawlInstallation'

let appState: AppState

@@ -40,20 +30,8 @@ export const baseHandler: Handler<unknown, unknown> = async (event) => {
return await crawlInstallations(appState)
}

if (isCrawlUsersEvent(event)) {
return await crawlUsers(appState, event.installation)
}

if (isCrawlRepositoriesEvent(event)) {
return await crawlRepositories(appState, event.installation)
}

if (isCrawlPushesEvent(event)) {
return await crawlPushes(appState, event.installation, event.repository)
}

if (isCrawlWorkflowRunEventsEvent(event)) {
return await crawlWorkflowRunEvents(appState, event.installation, event.repository, event.lookbackDays)
if (isCrawlInstallationEvent(event)) {
return await crawlInstallation(appState, event.installation, event.lookbackDays)
}

throw new Error(`unknown event format: ${event}`)
110 changes: 7 additions & 103 deletions src/cdk/stacks/main/githubCrawlers.ts
@@ -5,24 +5,20 @@
DefinitionBody,
IntegrationPattern,
JsonPath,
LogLevel,
Map,
StateMachine,
StateMachineType,
TaskInput
} from 'aws-cdk-lib/aws-stepfunctions'
import { LambdaInvoke, StepFunctionsStartExecution } from 'aws-cdk-lib/aws-stepfunctions-tasks'
import { CRAWLABLE_RESOURCES } from '../../../multipleContexts/githubCrawler'
import { Rule, Schedule } from 'aws-cdk-lib/aws-events'
import { SfnStateMachine } from 'aws-cdk-lib/aws-events-targets'
import { EVENTBRIDGE_DETAIL_TYPES } from '../../../multipleContexts/eventBridge'
import { Duration, RemovalPolicy } from 'aws-cdk-lib'
import { LogGroup } from 'aws-cdk-lib/aws-logs'
import { Duration } from 'aws-cdk-lib'

export function defineGithubCrawlers(scope: Construct, props: MainStackProps) {
const crawlerFunction = defineGithubCrawlerFunction(scope, props)
const reposChildrenCrawler = defineReposChildrenCrawler(scope, props, crawlerFunction)
const installationCrawler = defineInstallationCrawler(scope, props, crawlerFunction, reposChildrenCrawler)
const installationCrawler = defineInstallationCrawler(scope, props, crawlerFunction)
const allInstallationsCrawler = defineAllInstallationsCrawler(
scope,
props,
@@ -52,114 +48,22 @@ function defineGithubCrawlerFunction(scope: Construct, props: MainStackProps) {
)
}

function defineReposChildrenCrawler(
scope: Construct,
props: MainStackProps,
crawlerFunction: CicadaFunction
) {
const crawlPushes = new LambdaInvoke(scope, 'crawlPushes', {
lambdaFunction: crawlerFunction,
payload: TaskInput.fromObject({
resourceType: CRAWLABLE_RESOURCES.PUSHES,
installation: JsonPath.objectAt('$.installation'),
repository: JsonPath.objectAt('$.repository')
}),
// Pass through original input to next state
resultPath: JsonPath.DISCARD
})

const crawlWorkflowRunEvents = new LambdaInvoke(scope, 'crawlWorkflowRunEvents', {
function defineInstallationCrawler(scope: Construct, props: MainStackProps, crawlerFunction: CicadaFunction) {
const crawlInstallation = new LambdaInvoke(scope, 'crawlInstallation', {
lambdaFunction: crawlerFunction,
payload: TaskInput.fromObject({
resourceType: CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS,
installation: JsonPath.objectAt('$.installation'),
repository: JsonPath.objectAt('$.repository'),
lookbackDays: JsonPath.numberAt('$$.Execution.Input.lookbackDays')
}),
resultPath: JsonPath.DISCARD
})

// TOEventually - need to consider github app rate limiting (max 5000 requests / hour, etc.)
// TOEventually - as part of rate limiting use conditional requests, look at returned quota data, etc.
const forEachRepository = new Map(scope, 'forEachRepository', {
maxConcurrency: 10,
itemsPath: '$.repositories',
itemSelector: {
resourceType: CRAWLABLE_RESOURCES.INSTALLATION,
installation: JsonPath.objectAt('$.installation'),
repository: JsonPath.objectAt('$$.Map.Item.Value')
}
})
forEachRepository.itemProcessor(crawlPushes.next(crawlWorkflowRunEvents))

return new StateMachine(scope, 'repoElementsCrawler', {
stateMachineName: `${props.appName}-repositories-elements-crawler`,
stateMachineType: StateMachineType.EXPRESS,
// Need to configure logs because Express Workflows don't have any diagnotics otherwise
logs: {
level: LogLevel.ALL,
destination: new LogGroup(scope, 'repoElementsCrawlerLogGroup', {
logGroupName: `${props.appName}-repositories-elements-crawler`,
removalPolicy: RemovalPolicy.DESTROY,
retention: props.logRetention
})
},
comment: 'Crawl child objects of a list of repositories (Express)',
definitionBody: DefinitionBody.fromChainable(forEachRepository),
tracingEnabled: true
})
}

function defineInstallationCrawler(
scope: Construct,
props: MainStackProps,
crawlerFunction: CicadaFunction,
reposChildrenCrawler: StateMachine
) {
const crawlUsers = new LambdaInvoke(scope, 'crawlUsers', {
lambdaFunction: crawlerFunction,
payload: TaskInput.fromObject({
resourceType: CRAWLABLE_RESOURCES.USERS,
installation: JsonPath.objectAt('$.installation')
lookbackDays: JsonPath.numberAt('$.lookbackDays')
}),
// Pass through original input to next state
resultPath: JsonPath.DISCARD
})

const crawlRepositories = new LambdaInvoke(scope, 'crawlRepositories', {
lambdaFunction: crawlerFunction,
payload: TaskInput.fromObject({
resourceType: CRAWLABLE_RESOURCES.REPOSITORIES,
installation: JsonPath.objectAt('$.installation')
}),
resultSelector: {
repositories: JsonPath.objectAt('$.Payload')
},
resultPath: '$.repositoriesCrawler'
})

// TOEventually - will need to partition set of repositories due to either max timeout of
// underlying express workflow OR because of size of request.
const invokeReposChildrenCrawler = new StepFunctionsStartExecution(
scope,
'installationInvokeReposChildrenCrawler',
{
stateMachine: reposChildrenCrawler,
integrationPattern: IntegrationPattern.RUN_JOB,
associateWithParent: true,
input: TaskInput.fromObject({
installation: JsonPath.objectAt('$.installation'),
repositories: JsonPath.objectAt('$.repositoriesCrawler.repositories'),
lookbackDays: JsonPath.numberAt('$.lookbackDays')
})
}
)

const workflow = crawlUsers.next(crawlRepositories).next(invokeReposChildrenCrawler)

return new StateMachine(scope, 'installationCrawler', {
stateMachineName: `${props.appName}-installation`,
comment: 'Crawl a GitHub App Installation and child resources',
definitionBody: DefinitionBody.fromChainable(workflow),
definitionBody: DefinitionBody.fromChainable(crawlInstallation),
tracingEnabled: true
})
}
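
(Aside: with the repos-children Express workflow gone, the installation state machine is a single LambdaInvoke, so an execution only needs an installation plus lookbackDays in its input. Below is a hedged sketch of starting one manually with the AWS SDK v3; the client and command are standard SDK, but the state machine ARN and input values are placeholders.)

import { SFNClient, StartExecutionCommand } from '@aws-sdk/client-sfn'

// Illustrative only: the ARN and input values are placeholders
const sfn = new SFNClient({})
await sfn.send(
  new StartExecutionCommand({
    stateMachineArn: 'arn:aws:states:us-east-1:123456789012:stateMachine:cicada-installation',
    input: JSON.stringify({
      installation: { /* a GithubInstallation object */ },
      lookbackDays: 30
    })
  })
)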
5 changes: 1 addition & 4 deletions src/multipleContexts/githubCrawler.ts
@@ -1,9 +1,6 @@
export const CRAWLABLE_RESOURCES = {
INSTALLATIONS: 'installations',
USERS: 'users',
REPOSITORIES: 'repositories',
PUSHES: 'pushes',
WORKFLOW_RUN_EVENTS: 'workflowRunEvents'
INSTALLATION: 'installation'
} as const

export type CrawlableResource = (typeof CRAWLABLE_RESOURCES)[keyof typeof CRAWLABLE_RESOURCES]