diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/.env b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/.env new file mode 100644 index 00000000000..47c1881cbec --- /dev/null +++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/.env @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +HDDS_VERSION=1.4.0 +HADOOP_IMAGE=apache/hadoop +OZONE_RUNNER_VERSION=20230615-1 +OZONE_RUNNER_IMAGE=apache/ozone-runner +OZONE_OPTS= diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/README.md b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/README.md new file mode 100644 index 00000000000..8d9693eb3e7 --- /dev/null +++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/README.md @@ -0,0 +1,12 @@ +This demonstrates a complete warehouse cluster with dockerized Apache Spark, Apache Iceberg and Apache Ozone configured with s3:// storage protocol, with Spark also running as a Thriftserver and accepting JDBC connections. 
+
+- Download and unzip Apache Ozone from Apache Downloads folder https://ozone.apache.org/downloads/ into a folder called ozone
+- Place the contents of this folder into ozone/compose/spark-ozone-iceberg folder
+- Once in the directory, run the command `OZONE_DATANODES=3 ./run.sh -d`.
+- This will use Docker Compose to start the ozone cluster with 3 datanodes for testing purposes. In addition to that, it will also start a REST catalog server, a Spark master, a Spark worker and a Spark Thriftserver, all connected to the Ozone cluster
+- Download and install an S3 client, like awscli: `$ sudo apt install awscli`
+- Using awscli, create the warehouse bucket: `$ AWS_ACCESS_KEY_ID=any AWS_SECRET_ACCESS_KEY=any AWS_REGION=us-east-1 aws s3api --endpoint http://localhost:9878 --bucket=warehouse create-bucket`
+- Connect to spark-sql and create a database
+- `docker exec -it spark-ozone-iceberg-spark-master-1 /opt/bitnami/spark/bin/spark-sql`
+- `spark-sql ()> create database test;`
+- You can now create tables within the database; you can also connect to the Thriftserver using JDBC.
diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-compose.yaml b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-compose.yaml
new file mode 100644
index 00000000000..df44209278f
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-compose.yaml
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+
+# reusable fragments (see https://docs.docker.com/compose/compose-file/#extension-fields)
+x-common-config:
+  &common-config
+  image: ${OZONE_RUNNER_IMAGE}:${OZONE_RUNNER_VERSION}
+  volumes:
+    - ../..:/opt/hadoop
+  env_file:
+    - docker-config
+
+x-replication:
+  &replication
+  OZONE-SITE.XML_ozone.server.default.replication: ${OZONE_REPLICATION_FACTOR:-1}
+
+services:
+  datanode:
+    <<: *common-config
+    ports:
+      - 19864
+      - 9882
+    environment:
+      <<: *replication
+      OZONE_OPTS:
+    command: ["ozone","datanode"]
+  om:
+    <<: *common-config
+    environment:
+      ENSURE_OM_INITIALIZED: /data/metadata/om/current/VERSION
+      OZONE_OPTS:
+      <<: *replication
+    ports:
+      - 9874:9874
+      - 9862:9862
+    command: ["ozone","om"]
+  scm:
+    <<: *common-config
+    ports:
+      - 9876:9876
+      - 9860:9860
+    environment:
+      ENSURE_SCM_INITIALIZED: /data/metadata/scm/current/VERSION
+      OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1}
+      OZONE_OPTS:
+      <<: *replication
+    command: ["ozone","scm"]
+  httpfs:
+    <<: *common-config
+    environment:
+      OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1}
+      <<: *replication
+    ports:
+      - 14000:14000
+    command: [ "ozone","httpfs" ]
+  s3g:
+    <<: *common-config
+    environment:
+      OZONE_OPTS:
+      <<: *replication
+    ports:
+      - 9878:9878
+    command: ["ozone","s3g"]
+  recon:
+    <<: *common-config
+    ports:
+      - 9888:9888
+    environment:
+      OZONE_OPTS:
+      <<: *replication
+    command: ["ozone","recon"]
+  iceberg-rest:
+    image: tabulario/iceberg-rest:1.5.0
+    depends_on:
+      - catalog-pg
+    ports:
+      - 8181:8181
+    environment:
+      - AWS_ACCESS_KEY_ID=any
+      - AWS_SECRET_ACCESS_KEY=any
+      - AWS_REGION=us-east-1
+      - CATALOG_WAREHOUSE=s3://warehouse/
+      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+      - CATALOG_S3_ENDPOINT=http://s3g:9878/
+      - CATALOG_S3_PATH__STYLE__ACCESS=true
+      - CATALOG_JDBC_USER=iceberg
+      - CATALOG_JDBC_PASSWORD=iceberg
+      - CATALOG_URI=jdbc:postgresql://catalog-pg:5432/iceberg
+  catalog-pg:
+    image: postgres:15
+    environment:
+      POSTGRES_USER: iceberg
+      POSTGRES_PASSWORD: iceberg
+      POSTGRES_DB: iceberg
+    ports:
+      - 5432:5432
+
+  spark-master:
+    build: ./spark
+    #image: bitnami/spark:3.5.0
+    #command: >
+    #  /bin/sh
+    #/opt/bitnami/spark/bin/spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.apache.ozone:ozone-filesystem-hadoop3:1.4.0,org.apache.iceberg:iceberg-hive-runtime:1.5.0,org.apache.iceberg:iceberg-hive-metastore:1.5.0,org.apache.iceberg:iceberg-aws-bundle:1.5.0
+    #stdin_open: true
+    #tty: true
+    ports:
+      - 8080:8080
+      - 7077:7077
+    volumes:
+      # - ./ivy2:/opt/bitnami/spark/.ivy2/
+      - ./spark.conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
+      # - ./spark-event-logs:/opt/bitnami/spark/event-logs
+      # - ./extra-jars:/opt/bitnami/spark/extra-jars
+    environment:
+      - AWS_ACCESS_KEY_ID=any
+      - AWS_SECRET_ACCESS_KEY=any
+      - AWS_REGION=us-east-1
+      - SPARK_MODE=master
+    healthcheck:
+      test: bin/spark-shell || exit 1  # NOTE(review): heavyweight probe; a JVM starts every interval — confirm this is acceptable
+      interval: 30s
+      retries: 10
+      start_period: 10s
+      timeout: 60s
+  spark-thriftserver:
+    build: ./spark
+    container_name: spark-thriftserver
+    depends_on:
+      spark-master:
+        condition: service_healthy
+    command:
+      - sbin/start-thriftserver.sh
+      - --master
+      - spark://spark-master:7077
+    environment:
+      - AWS_ACCESS_KEY_ID=any
+      - AWS_SECRET_ACCESS_KEY=any
+      - AWS_REGION=us-east-1
+    ports:
+      - 10000:10000
+      - 4040:4040
+    volumes:  # replaces 'volumes_from' (removed in compose file format v3); same mount as spark-master
+      - ./spark.conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
+    healthcheck:
+      test: beeline help || exit 1
+      interval: 10s
+      retries: 10
+      start_period: 5s
+      timeout: 60s
+
+  spark-worker:
+    build: ./spark
+    container_name: spark-worker
+    depends_on:
+      spark-master:
+        condition: service_healthy
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_MEMORY=2G
+      - SPARK_WORKER_CORES=2
+      - AWS_ACCESS_KEY_ID=any
+      - AWS_SECRET_ACCESS_KEY=any
+      - AWS_REGION=us-east-1
+    volumes:  # replaces 'volumes_from' (removed in compose file format v3); same mount as spark-master
+      - ./spark.conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
\ No newline at end of file
diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-config b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-config
new file mode 100644
index 00000000000..729f036e15c
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/docker-config
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CORE-SITE.XML_fs.defaultFS=ofs://om
+CORE-SITE.XML_fs.trash.interval=1
+# For HttpFS service it is required to enable proxying users.
+CORE-SITE.XML_hadoop.proxyuser.hadoop.hosts=*
+CORE-SITE.XML_hadoop.proxyuser.hadoop.groups=*
+
+OZONE-SITE.XML_ozone.om.address=om
+OZONE-SITE.XML_ozone.om.http-address=om:9874
+OZONE-SITE.XML_ozone.scm.http-address=scm:9876
+OZONE-SITE.XML_ozone.scm.container.size=1GB
+OZONE-SITE.XML_ozone.scm.block.size=1MB
+OZONE-SITE.XML_ozone.scm.datanode.ratis.volume.free-space.min=10MB
+OZONE-SITE.XML_ozone.scm.pipeline.creation.interval=30s
+OZONE-SITE.XML_ozone.scm.pipeline.owner.container.count=1
+OZONE-SITE.XML_ozone.scm.names=scm
+OZONE-SITE.XML_ozone.scm.datanode.id.dir=/data
+OZONE-SITE.XML_ozone.scm.block.client.address=scm
+OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata
+OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon
+OZONE-SITE.XML_ozone.scm.client.address=scm
+OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_hdds.datanode.volume.min.free.space=100MB
+OZONE-SITE.XML_ozone.recon.address=recon:9891
+OZONE-SITE.XML_ozone.recon.http-address=0.0.0.0:9888
+OZONE-SITE.XML_ozone.recon.https-address=0.0.0.0:9889
+OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
+OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
+OZONE-SITE.XML_hdds.container.report.interval=60s
+OZONE-SITE.XML_ozone.scm.stale.node.interval=30s
+OZONE-SITE.XML_ozone.scm.dead.node.interval=45s
+OZONE-SITE.XML_hdds.heartbeat.interval=5s
+OZONE-SITE.XML_ozone.scm.close.container.wait.duration=5s
+OZONE-SITE.XML_hdds.scm.replication.thread.interval=15s
+OZONE-SITE.XML_hdds.scm.replication.under.replicated.interval=5s
+OZONE-SITE.XML_hdds.scm.replication.over.replicated.interval=5s
+OZONE-SITE.XML_hdds.scm.wait.time.after.safemode.exit=30s
+
+OZONE-SITE.XML_dfs.container.ratis.datastream.enabled=true
+
+OZONE_CONF_DIR=/etc/hadoop
+OZONE_LOG_DIR=/var/log/hadoop
+
+no_proxy=om,scm,s3g,recon,kdc,localhost,127.0.0.1
+
+# Explicitly enable the filesystem snapshot feature for this Docker Compose cluster.
+OZONE-SITE.XML_ozone.filesystem.snapshot.enabled=true
diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/run.sh b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/run.sh
new file mode 100755
index 00000000000..e3bdd21bd46
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/run.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+declare -i OZONE_DATANODES OZONE_REPLICATION_FACTOR OZONE_SAFEMODE_MIN_DATANODES
+
+ORIG_DATANODES="${OZONE_DATANODES:-}"
+ORIG_REPLICATION_FACTOR="${OZONE_REPLICATION_FACTOR:-}"
+
+# only support replication factor of 1 or 3
+if [[ -v OZONE_REPLICATION_FACTOR ]] && [[ ${OZONE_REPLICATION_FACTOR} -ne 1 ]] && [[ ${OZONE_REPLICATION_FACTOR} -ne 3 ]]; then
+  # assume invalid replication factor was intended as "number of datanodes"
+  if [[ -z ${ORIG_DATANODES} ]]; then
+    OZONE_DATANODES=${OZONE_REPLICATION_FACTOR}
+  fi
+  unset OZONE_REPLICATION_FACTOR
+fi
+
+# at least 1 datanode
+if [[ -v OZONE_DATANODES ]] && [[ ${OZONE_DATANODES} -lt 1 ]]; then
+  unset OZONE_DATANODES
+fi
+
+if [[ -v OZONE_DATANODES ]] && [[ -v OZONE_REPLICATION_FACTOR ]]; then
+  # ensure enough datanodes for replication factor
+  if [[ ${OZONE_DATANODES} -lt ${OZONE_REPLICATION_FACTOR} ]]; then
+    OZONE_DATANODES=${OZONE_REPLICATION_FACTOR}
+  fi
+elif [[ -v OZONE_DATANODES ]]; then
+  if [[ ${OZONE_DATANODES} -ge 3 ]]; then
+    OZONE_REPLICATION_FACTOR=3
+  else
+    OZONE_REPLICATION_FACTOR=1
+  fi
+elif [[ -v OZONE_REPLICATION_FACTOR ]]; then
+  OZONE_DATANODES=${OZONE_REPLICATION_FACTOR}
+else
+  OZONE_DATANODES=1
+  OZONE_REPLICATION_FACTOR=1
+fi
+
+: "${OZONE_SAFEMODE_MIN_DATANODES:=${OZONE_REPLICATION_FACTOR}}"
+
+export OZONE_DATANODES OZONE_REPLICATION_FACTOR OZONE_SAFEMODE_MIN_DATANODES
+
+docker compose up --scale datanode="${OZONE_DATANODES}" --no-recreate "$@"
diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark.conf/spark-defaults.conf b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark.conf/spark-defaults.conf
new file mode 100644
index 00000000000..09a8bfd9b22
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark.conf/spark-defaults.conf
@@ -0,0 +1,16 @@
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+spark.sql.catalog.demo org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.demo.catalog-impl org.apache.iceberg.rest.RESTCatalog
+spark.sql.catalog.demo.uri http://iceberg-rest:8181
+spark.sql.catalog.demo.io-impl org.apache.iceberg.aws.s3.S3FileIO
+# NOTE(review): the REST catalog advertises CATALOG_WAREHOUSE=s3://warehouse/ while this uses s3a:// — S3FileIO accepts both schemes, but confirm they resolve to the same bucket
+spark.sql.catalog.demo.warehouse s3a://warehouse/
+spark.sql.catalog.demo.s3.endpoint http://s3g:9878/
+spark.sql.catalog.demo.s3.path-style-access true
+spark.sql.defaultCatalog demo
+spark.eventLog.enabled true
+spark.eventLog.dir /opt/bitnami/spark/event-logs
+spark.history.fs.logDirectory /opt/bitnami/spark/history-logs
+spark.sql.catalogImplementation in-memory
+spark.driver.extraClassPath /opt/bitnami/spark/extra-jars/*
+spark.executor.extraClassPath /opt/bitnami/spark/extra-jars/*
\ No newline at end of file
diff --git a/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark/Dockerfile b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark/Dockerfile
new file mode 100644
index 00000000000..960f51aa0f0
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/spark-ozone-iceberg/spark/Dockerfile
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM bitnami/spark:3.5.0
+USER root
+RUN install_packages curl sudo net-tools
+USER 1001
+# One layer for the runtime dirs (relative paths — presumably WORKDIR is /opt/bitnami/spark in the base image, matching the absolute curl targets below; confirm)
+RUN mkdir extra-jars event-logs history-logs
+# -f makes curl fail on HTTP errors instead of silently saving an error page as a .jar; -L follows redirects
+RUN curl -fSL https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.5.0/iceberg-aws-bundle-1.5.0.jar --output /opt/bitnami/spark/extra-jars/iceberg-aws-bundle-1.5.0.jar
+RUN curl -fSL https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.5.0/iceberg-spark-runtime-3.5_2.12-1.5.0.jar --output /opt/bitnami/spark/extra-jars/iceberg-spark-runtime-3.5_2.12-1.5.0.jar
+RUN curl -fSL https://repo1.maven.org/maven2/org/slf4j/slf4j-api/2.0.7/slf4j-api-2.0.7.jar --output /opt/bitnami/spark/extra-jars/slf4j-api-2.0.7.jar