diff --git a/.dockerignore b/.dockerignore index 29c6c45bb0653..701263f5fedde 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,7 @@ **/node_modules/ -datahub-frontend/build/ -metadata-ingestion/venv/ +*/build/ +*/*/build/ +*/venv/ out **/*.class # Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index fdf13bac0accc..eb81b31745536 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -77,10 +77,11 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 5d0edf3ee8427..81e6e79c2a5e5 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -88,10 +88,11 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index 64635671343ef..c4d8a962dcd32 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -10,18 +10,20 @@ ext { docker_repo = 'datahub-ingestion-base' docker_dir = 'datahub-ingestion-base' docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" revision = 2 // increment to trigger rebuild } docker { - name "${docker_registry}/${docker_repo}:v${version}-${docker_target}" - version "v${version}-${docker_target}" + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([APP_ENV: docker_target]) } diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 2abd4e2f33bef..1aee79a428a98 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh . RUN pip install --no-cache --user ".[base]" && \ pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \ - pip install --no-cache --user ".[all]" && \ - ./pyspark_jars.sh + pip install --no-cache --user ".[all]" +RUN ./pyspark_jars.sh FROM base as full-install diff --git a/docker/datahub-ingestion/README.md b/docker/datahub-ingestion/README.md index 6580199bcce21..ed856314c5cc0 100644 --- a/docker/datahub-ingestion/README.md +++ b/docker/datahub-ingestion/README.md @@ -2,3 +2,10 @@ [![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml) Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service. + +## Slim vs Full Image Build + +There are two versions of this image. One includes pyspark and Oracle dependencies and is larger due to the java dependencies. + +Running the standard build results in the `slim` image without pyspark being generated by default. In order to build the full +image with pyspark use the following project property `-PdockerTarget=full`. diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index fed33752a4b81..247b896d6955c 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -9,6 +9,8 @@ ext { docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry docker_repo = 'datahub-ingestion' docker_dir = 'datahub-ingestion' + docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" revision = 2 // increment to trigger rebuild } @@ -19,21 +21,19 @@ dependencies { } docker { - name "${docker_registry}/${docker_repo}:v${version}-slim" - version "v${version}-slim" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only") + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" + dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" include "metadata-ingestion/**" include "metadata-ingestion-modules/**" }.exclude { - i -> i.file.isHidden() || - i.file == buildDir || - i.file == project(':metadata-ingestion').buildDir || - i.file == project(':metadata-ingestion-modules').buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([DOCKER_VERSION: version, - RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')]) + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]) } tasks.getByName('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker', diff --git a/docker/datahub-ingestion/pyspark_jars.sh b/docker/datahub-ingestion/pyspark_jars.sh index ecd24e78c4105..ab4b223f0358a 100755 --- a/docker/datahub-ingestion/pyspark_jars.sh +++ b/docker/datahub-ingestion/pyspark_jars.sh @@ -2,21 +2,33 @@ set -ex -HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}" -ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars" -# Remove conflicting versions -echo "Removing version conflicts from $PYSPARK_JARS" -CONFLICTS="zookeeper hadoop- slf4j-" -for jar in $CONFLICTS; do - rm "$PYSPARK_JARS/$jar"*.jar -done +function replace_jar { + JAR_PREFIX=$1 + TRANSITIVE=$2 + DEPENDENCY=$3 -# Fetch dependencies -mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY" -mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY" + echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar" + ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm -r "$HOME/.m2" || true -# Move to pyspark location -echo "Moving jars to $PYSPARK_JARS" -find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \; + if [ ! -z "$DEPENDENCY" ]; then + echo "Resolving $DEPENDENCY" + mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null + + echo "Moving jars to $PYSPARK_JARS" + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \; + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \; + fi +} + +replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}" +replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}" +replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}" +replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}" +replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}" +replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}" diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index ffee3b9c65cf4..ac935ca42fd12 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -15,10 +15,11 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" - include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + include 'metadata-service/restli-servlet-impl/src/main/resources/index/**' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index 573ef21c88bf9..25f9847190de3 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -15,9 +15,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 0d8941cce4833..1598866914c0e 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index 8a026be09d2b4..e24e206c99145 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index 51c758f434328..5e735e118493c 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -45,11 +45,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index daf41a1e0303e..ef042188bc3d8 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,11 +56,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 54e95fdcfe579..35730ad6dfa9f 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -70,11 +70,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files war.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug")