From e18e4ecae878f738f0824c71f3f035eb93d730e2 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Wed, 26 Jul 2023 17:31:26 -0500 Subject: [PATCH] refactor smoke-test dependency on actions pod --- .github/workflows/docker-ingestion-base.yml | 45 ------ .github/workflows/docker-ingestion.yml | 118 -------------- .github/workflows/docker-unified.yml | 152 +++++++++++++++++- datahub-frontend/build.gradle | 2 + datahub-upgrade/build.gradle | 2 + docker/build.gradle | 6 +- docker/datahub-ingestion-base/Dockerfile | 14 +- docker/datahub-ingestion-base/build.gradle | 4 +- docker/datahub-ingestion-base/entrypoint.sh | 13 ++ docker/datahub-ingestion-slim/Dockerfile | 20 ++- docker/datahub-ingestion-slim/build.gradle | 4 +- docker/datahub-ingestion/Dockerfile | 27 ++-- docker/datahub-ingestion/build.gradle | 4 +- docker/docker-compose-with-cassandra.yml | 5 +- docker/docker-compose-without-neo4j.yml | 5 +- docker/docker-compose.yml | 5 +- docker/elasticsearch-setup/build.gradle | 2 + docker/kafka-setup/build.gradle | 2 + docker/mysql-setup/build.gradle | 2 + docker/postgres-setup/build.gradle | 2 + .../docker-compose-m1.quickstart.yml | 4 +- ...er-compose-without-neo4j-m1.quickstart.yml | 4 +- ...ocker-compose-without-neo4j.quickstart.yml | 4 +- .../quickstart/docker-compose.quickstart.yml | 4 +- docs/cli.md | 7 +- metadata-jobs/mae-consumer-job/build.gradle | 2 + metadata-jobs/mce-consumer-job/build.gradle | 2 + metadata-service/war/build.gradle | 2 + 28 files changed, 270 insertions(+), 193 deletions(-) delete mode 100644 .github/workflows/docker-ingestion-base.yml delete mode 100644 .github/workflows/docker-ingestion.yml create mode 100644 docker/datahub-ingestion-base/entrypoint.sh diff --git a/.github/workflows/docker-ingestion-base.yml b/.github/workflows/docker-ingestion-base.yml deleted file mode 100644 index 64f94ef3225ce..0000000000000 --- a/.github/workflows/docker-ingestion-base.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: ingestion base -on: - release: - types: 
[published] - push: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - pull_request: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-base: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build and Push image - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-base - tags: latest - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ github.ref == 'refs/heads/master' }} - context: . - file: ./docker/datahub-ingestion-base/Dockerfile - platforms: linux/amd64,linux/arm64/v8 diff --git a/.github/workflows/docker-ingestion.yml b/.github/workflows/docker-ingestion.yml deleted file mode 100644 index 6716026851739..0000000000000 --- a/.github/workflows/docker-ingestion.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: datahub-ingestion docker -on: - push: - branches: - - master - paths-ignore: - - "docs/**" - - "**.md" - pull_request: - branches: - - master - paths: - - "metadata-ingestion/**" - - "metadata-models/**" - - "docker/datahub-ingestion/**" - - "docker/datahub-ingestion-slim/**" - - ".github/workflows/docker-ingestion.yml" - release: - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - tag: ${{ steps.tag.outputs.tag }} - publish: ${{ steps.publish.outputs.publish }} - python_release_version: ${{ steps.python_release_version.outputs.release_version }} 
- steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Compute Tag - id: tag - run: | - source .github/scripts/docker_helpers.sh - echo "tag=$(get_tag)" >> $GITHUB_OUTPUT - - name: Compute Python Release Version - id: python_release_version - run: | - source .github/scripts/docker_helpers.sh - echo "release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - - name: Check whether publishing enabled - id: publish - env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} - run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT - push_to_registries: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - needs: setup - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build and push - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - linkedin/datahub-ingestion - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . - file: ./docker/datahub-ingestion/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - build-args: | - RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} - - name: Build and Push image (slim) - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-slim - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . 
- file: ./docker/datahub-ingestion-slim/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - ingestion-slim_scan: - permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status - name: "[Monitoring] Scan datahub-ingestion-slim images for vulnerabilities" - if: ${{ github.ref == 'refs/heads/master' }} - runs-on: ubuntu-latest - needs: [push_to_registries] - steps: - - name: Checkout # adding checkout step just to make trivy upload happy - uses: actions/checkout@v3 - - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 - with: - image: acryldata/datahub-ingestion-slim:latest - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.8.0 - env: - TRIVY_OFFLINE_SCAN: true - with: - image-ref: acryldata/datahub-ingestion-slim:latest - format: "template" - template: "@/contrib/sarif.tpl" - output: "trivy-results.sarif" - severity: "CRITICAL,HIGH" - ignore-unfixed: true - vuln-type: "os,library" - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: "trivy-results.sarif" diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index dab3fbf0c9f36..5c875d634b0a1 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -31,6 +31,9 @@ env: DATAHUB_ELASTIC_SETUP_IMAGE: "linkedin/datahub-elasticsearch-setup" DATAHUB_MYSQL_SETUP_IMAGE: "acryldata/datahub-mysql-setup" DATAHUB_UPGRADE_IMAGE: "acryldata/datahub-upgrade" + DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base" + DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion" + DATAHUB_INGESTION_SLIM_IMAGE: "acryldata/datahub-ingestion-slim" jobs: setup: @@ -51,7 +54,7 @@ jobs: - name: Check whether publishing enabled id: 
publish env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} + ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' && 'true' || '' }} run: | echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT @@ -414,6 +417,118 @@ jobs: file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + datahub_ingestion_build: + name: Build and Push DataHub Ingestion Docker Images + runs-on: ubuntu-latest + needs: setup + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Build and push Base Image + uses: ./.github/actions/docker-custom-build-and-push + with: + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Build and push Full Image + uses: ./.github/actions/docker-custom-build-and-push + with: + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Build and push Slim Image + uses: ./.github/actions/docker-custom-build-and-push + with: + images: | + ${{ env.DATAHUB_INGESTION_SLIM_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion-slim/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + datahub_ingestion_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Download image Full Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Download image Slim Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_SLIM_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner Base Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Run Trivy vulnerability scanner Full Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: 
"@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Run Trivy vulnerability scanner Slim Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_SLIM_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + smoke_test: name: Run Smoke Tests runs-on: ubuntu-latest @@ -432,8 +547,11 @@ jobs: mae_consumer_build, mce_consumer_build, datahub_upgrade_build, + datahub_ingestion_build, ] steps: + - name: Disk Check + run: df -h . - name: Check out the repo uses: actions/checkout@v3 - name: Set up JDK 11 @@ -445,11 +563,19 @@ jobs: with: python-version: "3.7" cache: "pip" + - name: Disk Check + run: df -h . - name: Install dependencies run: ./metadata-ingestion/scripts/install_deps.sh + - name: Disk Check + run: df -h . - name: Build datahub cli run: | ./gradlew :metadata-ingestion:install + - name: Disk Check + run: df -h . + - name: Docker Image Check + run: docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -490,11 +616,23 @@ jobs: if: ${{ needs.setup.outputs.publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Download datahub-ingestion-slim image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_SLIM_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Disk Check + run: df -h . 
+ - name: Docker Image Check + run: docker images - name: run quickstart env: DATAHUB_TELEMETRY_ENABLED: false DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} - DATAHUB_ACTIONS_IMAGE: "acryldata/datahub-actions-slim" + DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_SLIM_IMAGE }} + ACTIONS_VERSION: ${{ needs.setup.outputs.unique_tag }} + ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions' + ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' run: | ./smoke-test/run-quickstart.sh - name: sleep 60s @@ -502,6 +640,14 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . + - name: Docker Image Check + run: docker images + - name: Remove Source Code + run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! -path "./gradle*" -delete + - name: Disk Check + run: df -h . - name: Smoke test env: RUN_QUICKSTART: false @@ -512,6 +658,8 @@ jobs: run: | echo "$DATAHUB_VERSION" ./smoke-test/smoke.sh + - name: Disk Check + run: df -h . 
- name: store logs if: failure() run: | diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index f21d10d8f3842..c3f361fe49cd0 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -79,6 +79,8 @@ docker { files fileTree(rootProject.projectDir) { include 'docker/monitoring/*' include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index ad2bf02bfdcc7..5367059bed283 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -89,6 +89,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/build.gradle b/docker/build.gradle index 5fea245838049..f6bc51f9d3557 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -6,6 +6,7 @@ apply from: "../gradle/versioning/versioning.gradle" ext { quickstart_modules = [ + ':docker:datahub-ingestion-slim', ':docker:elasticsearch-setup', ':docker:mysql-setup', ':docker:kafka-setup', @@ -54,7 +55,10 @@ task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-actions-slim" + environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion-slim" + environment "ACTIONS_VERSION", "v${version}" + environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions' + environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', diff --git a/docker/datahub-ingestion-base/Dockerfile 
b/docker/datahub-ingestion-base/Dockerfile index 15ac9314d35c4..d6448321c9f85 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -35,7 +35,8 @@ RUN apt-get update && apt-get install -y -qq \ && tar -xzf /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz -C /root \ && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ && ./configure --prefix /usr && make && make install && make clean && ./configure --clean \ - && apt-get remove -y make + && apt-get remove -y make \ + && rm -rf /var/cache/apk/* COPY --from=binary /go/bin/dockerize /usr/local/bin RUN if [ $(arch) = "x86_64" ]; then \ @@ -57,6 +58,15 @@ RUN if [ $(arch) = "x86_64" ]; then \ fi; COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh RUN pip install -r requirements.txt && \ - pip uninstall -y acryl-datahub + pip uninstall -y acryl-datahub && \ + chmod +x /entrypoint.sh && \ + addgroup --gid 1000 datahub && \ + adduser --uid 1000 --gid 1000 --home /datahub-ingestion datahub + +USER datahub +ENV PATH=/datahub-ingestion/.local/bin:/usr/local/bin:$PATH + +ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index fe3c12a59886f..af81e6564fc9a 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -17,9 +17,11 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } } -tasks.getByPath('docker').dependsOn('build') +tasks.getByPath(':docker:datahub-ingestion-base:docker').dependsOn('build') task mkdirBuildDocker { doFirst { diff --git a/docker/datahub-ingestion-base/entrypoint.sh b/docker/datahub-ingestion-base/entrypoint.sh new file mode 100644 index 0000000000000..e3f20c964306f --- /dev/null +++ 
b/docker/datahub-ingestion-base/entrypoint.sh @@ -0,0 +1,13 @@ +#!/usr/bin/bash + +if [ ! -z "$ACTIONS_EXTRA_PACKAGES" ]; then + pip install --user $ACTIONS_EXTRA_PACKAGES +fi + +if [[ ! -z "$ACTIONS_CONFIG" && ! -z "$ACTIONS_EXTRA_PACKAGES" ]]; then + curl -q "$ACTIONS_CONFIG" -o config.yaml + exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s \ + datahub actions --config config.yaml +else + exec datahub "$@" +fi diff --git a/docker/datahub-ingestion-slim/Dockerfile b/docker/datahub-ingestion-slim/Dockerfile index dd052f80e07e0..0d5198cf03cbd 100644 --- a/docker/datahub-ingestion-slim/Dockerfile +++ b/docker/datahub-ingestion-slim/Dockerfile @@ -2,5 +2,23 @@ ARG APP_ENV=prod ARG DOCKER_VERSION=latest -FROM acryldata/datahub-ingestion:$DOCKER_VERSION as base +FROM acryldata/datahub-ingestion-base:$DOCKER_VERSION as base +FROM acryldata/datahub-ingestion:$DOCKER_VERSION as prod-codegen + +FROM base as prod-install + +USER 0 +WORKDIR /datahub-ingestion +COPY --from=prod-codegen /datahub-ingestion/ . + +USER datahub +RUN pip install --user --no-deps ".[all]" + +FROM base as dev-install +# Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
+# See this excellent thread https://github.com/docker/cli/issues/1134 + +FROM ${APP_ENV}-install as final + +USER datahub diff --git a/docker/datahub-ingestion-slim/build.gradle b/docker/datahub-ingestion-slim/build.gradle index f21b66b576a0c..c52e8065f40a8 100644 --- a/docker/datahub-ingestion-slim/build.gradle +++ b/docker/datahub-ingestion-slim/build.gradle @@ -17,12 +17,14 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } buildArgs([DOCKER_VERSION: version]) buildx(false) } -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion:docker']) +tasks.getByPath(':docker:datahub-ingestion-slim:docker').dependsOn(['build', ':docker:datahub-ingestion:docker']) task mkdirBuildDocker { doFirst { diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 8a642904cb136..f3ea09a9082dc 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -21,26 +21,30 @@ RUN (for attempt in 1 2 3 4 5; do ./gradlew --version && break ; echo "Failed to ./gradlew :metadata-events:mxe-schemas:build FROM base as prod-codegen - -COPY ./docker/datahub-ingestion/requirements.txt requirements.txt -RUN pip install -r requirements.txt +USER 0 COPY --from=prod-build /datahub-src /datahub-src -RUN cd /datahub-src/metadata-ingestion && \ - pip install -e ".[base]" && \ +WORKDIR /datahub-src/metadata-ingestion +RUN pip install -e ".[base]" && \ ./scripts/codegen.sh FROM base as prod-install +USER 0 + COPY --from=prod-codegen /datahub-src/metadata-ingestion /datahub-ingestion -COPY --from=prod-codegen /root/.cache/pip /root/.cache/pip +COPY --from=prod-codegen /root/.cache/pip /datahub-ingestion/.cache/pip + +COPY ./docker/datahub-ingestion/requirements.txt requirements.txt +RUN pip install -r requirements.txt + ARG RELEASE_VERSION -RUN cd /datahub-ingestion && \ - 
sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ +WORKDIR /datahub-ingestion +RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ pip install ".[all]" && \ - pip freeze && \ # This is required to fix security vulnerability in htrace-core4 - rm -f /usr/local/lib/python3.10/site-packages/pyspark/jars/htrace-core4-4.1.0-incubating.jar + rm -f /usr/local/lib/python3.10/site-packages/pyspark/jars/htrace-core4-4.1.0-incubating.jar && \ + chown -R datahub /datahub-ingestion FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. @@ -48,7 +52,4 @@ FROM base as dev-install FROM ${APP_ENV}-install as final -RUN addgroup --system datahub && adduser --system datahub --ingroup datahub USER datahub - -ENTRYPOINT [ "datahub" ] diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 7a24d87794c0e..a9bb55d20d423 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -25,10 +25,12 @@ docker { include "gradle/**" include "buildSrc/**" include "*" + }.exclude { + i -> i.file.isHidden() } buildArgs([DOCKER_VERSION: version]) } -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker']) +tasks.getByPath(':docker:datahub-ingestion:docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker']) task mkdirBuildDocker { doFirst { diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 1f887f277c066..8926967957fe7 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -24,8 +24,11 @@ services: datahub-actions: container_name: datahub-actions hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: 
${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 32c11bf1739ad..5db604d142ef6 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -25,8 +25,11 @@ services: datahub-actions: container_name: datahub-actions hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 3bc15d077d8f0..3bf694f822abf 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -24,8 +24,11 @@ services: datahub-actions: container_name: datahub-actions hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index cc2fe1ec5c4db..51fadbd26c609 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -17,6 +17,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include 
"metadata-service/restli-servlet-impl/src/main/resources/index/**" + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index a5d33457e45f7..bf51c2763bd85 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -16,6 +16,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 48a28f15a581d..183d94c0b7f23 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index a5b0413ec4be8..85083805f47d3 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 8fdbfce542078..85a0308087fce 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - 
ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http @@ -45,7 +47,7 @@ services: - METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} datahub-frontend-react: container_name: datahub-frontend-react depends_on: diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index de9fe9887a779..6759aa0b893e2 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http @@ -45,7 +47,7 @@ services: - METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} datahub-frontend-react: container_name: datahub-frontend-react depends_on: diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 35b83a7d06081..97ace1a7fbe5e 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - 
ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http @@ -45,7 +47,7 @@ services: - METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} datahub-frontend-react: container_name: datahub-frontend-react depends_on: diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index f78f75f748deb..191c9b6269ea6 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http @@ -45,7 +47,7 @@ services: - METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} datahub-frontend-react: container_name: datahub-frontend-react depends_on: diff --git a/docs/cli.md b/docs/cli.md index 13208f381fab6..5f307d6e17b03 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -121,7 +121,12 @@ The environment variables listed below take precedence over the DataHub CLI conf - `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through `--debug` option of the CLI. - `DATAHUB_VERSION` (default `head`) - Set to a specific version to run quickstart with the particular version of docker images. 
- `ACTIONS_VERSION` (default `head`) - Set to a specific version to run quickstart with that image tag of `datahub-actions` container. -- `ACTIONS_BASE` (default ``) - Set to `-slim` to run a slimmer actions container without pyspark/deequ features. +- `DATAHUB_ACTIONS_IMAGE` (default `acryldata/datahub-actions`) - Set to `acryldata/datahub-actions-slim` to run a slimmer actions container without pyspark/deequ features. + +For images `acryldata/datahub-ingestion` or `acryldata/datahub-ingestion-slim`: + +- `ACTIONS_EXTRA_PACKAGES` (default ``) - Install the extra python packages prior to running the main executable. +- `ACTIONS_CONFIG` (default ``) - Use the specified actions configuration file URL (YAML format). ```shell DATAHUB_SKIP_CONFIG=false diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index e7941a04224e3..6155b3d892103 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -43,6 +43,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index 5981284e9da3f..8f3179e2e1a68 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,6 +56,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 7e9aa90664611..53c8f586187ed 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -72,6 +72,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 
'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() } tag("Debug", "${docker_registry}/${docker_repo}:debug")