diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_docker.yml index 31796c15bdd5..d110d0a6d223 100644 --- a/.github/workflows/velox_docker.yml +++ b/.github/workflows/velox_docker.yml @@ -521,7 +521,7 @@ jobs: fail-fast: false matrix: spark: ["spark-3.2"] - celeborn: ["celeborn-0.4.0", "celeborn-0.3.2"] + celeborn: ["celeborn-0.4.1", "celeborn-0.3.2-incubating"] runs-on: ubuntu-20.04 container: ubuntu:22.04 steps: @@ -557,8 +557,8 @@ jobs: fi echo "EXTRA_PROFILE: ${EXTRA_PROFILE}" cd /opt && mkdir -p celeborn && \ - wget https://archive.apache.org/dist/incubator/celeborn/${{ matrix.celeborn }}-incubating/apache-${{ matrix.celeborn }}-incubating-bin.tgz && \ - tar xzf apache-${{ matrix.celeborn }}-incubating-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ + wget https://archive.apache.org/dist/celeborn/${{ matrix.celeborn }}/apache-${{ matrix.celeborn }}-bin.tgz && \ + tar xzf apache-${{ matrix.celeborn }}-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ mv ./conf/celeborn-env.sh.template ./conf/celeborn-env.sh && \ bash -c "echo -e 'CELEBORN_MASTER_MEMORY=4g\nCELEBORN_WORKER_MEMORY=4g\nCELEBORN_WORKER_OFFHEAP_MEMORY=8g' > ./conf/celeborn-env.sh" && \ bash -c "echo -e 'celeborn.worker.commitFiles.threads 128\nceleborn.worker.sortPartition.threads 64' > ./conf/celeborn-defaults.conf" && \ diff --git a/docs/get-started/ClickHouse.md b/docs/get-started/ClickHouse.md index 4352a99e55f9..ab24de7a4fd6 100644 --- a/docs/get-started/ClickHouse.md +++ b/docs/get-started/ClickHouse.md @@ -679,13 +679,13 @@ spark.shuffle.manager=org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleMa quickly start a celeborn cluster ```shell -wget https://archive.apache.org/dist/incubator/celeborn/celeborn-0.3.0-incubating/apache-celeborn-0.3.0-incubating-bin.tgz && \ -tar -zxvf apache-celeborn-0.3.0-incubating-bin.tgz && \ -mv apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf.template apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf && \ -mv apache-celeborn-0.3.0-incubating-bin/conf/log4j2.xml.template apache-celeborn-0.3.0-incubating-bin/conf/log4j2.xml && \ +wget https://archive.apache.org/dist/celeborn/celeborn-0.3.2-incubating/apache-celeborn-0.3.2-incubating-bin.tgz && \ +tar -zxvf apache-celeborn-0.3.2-incubating-bin.tgz && \ +mv apache-celeborn-0.3.2-incubating-bin/conf/celeborn-defaults.conf.template apache-celeborn-0.3.2-incubating-bin/conf/celeborn-defaults.conf && \ +mv apache-celeborn-0.3.2-incubating-bin/conf/log4j2.xml.template apache-celeborn-0.3.2-incubating-bin/conf/log4j2.xml && \ mkdir /opt/hadoop && chmod 777 /opt/hadoop && \ -echo -e "celeborn.worker.flusher.threads 4\nceleborn.worker.storage.dirs /tmp\nceleborn.worker.monitor.disk.enabled false" > apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf && \ -bash apache-celeborn-0.3.0-incubating-bin/sbin/start-master.sh && bash apache-celeborn-0.3.0-incubating-bin/sbin/start-worker.sh +echo -e "celeborn.worker.flusher.threads 4\nceleborn.worker.storage.dirs /tmp\nceleborn.worker.monitor.disk.enabled false" > apache-celeborn-0.3.2-incubating-bin/conf/celeborn-defaults.conf && \ +bash apache-celeborn-0.3.2-incubating-bin/sbin/start-master.sh && bash apache-celeborn-0.3.2-incubating-bin/sbin/start-worker.sh ``` ### Columnar shuffle mode diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java index f454cf00c656..d196691d1b14 100644 --- a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java +++ b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java @@ -217,7 +217,13 @@ public boolean unregisterShuffle(int shuffleId) { } } return CelebornUtils.unregisterShuffle( - lifecycleManager, shuffleClient, shuffleIdTracker, shuffleId, appUniqueId, isDriver()); + lifecycleManager, + shuffleClient, + shuffleIdTracker, + shuffleId, + appUniqueId, + throwsFetchFailure, + isDriver()); } @Override diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java index 9dd4e1d1191e..6b4229ad3037 100644 --- a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java +++ b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java @@ -49,11 +49,21 @@ public static boolean unregisterShuffle( Object shuffleIdTracker, int appShuffleId, String appUniqueId, + boolean throwsFetchFailure, boolean isDriver) { try { - // for Celeborn 0.4.0 try { - if (lifecycleManager != null) { + try { + // for Celeborn 0.4.1 + if (lifecycleManager != null) { + Method unregisterAppShuffle = + lifecycleManager + .getClass() + .getMethod("unregisterAppShuffle", int.class, boolean.class); + unregisterAppShuffle.invoke(lifecycleManager, appShuffleId, throwsFetchFailure); + } + } catch (NoSuchMethodException ex) { + // for Celeborn 0.4.0 Method unregisterAppShuffle = lifecycleManager.getClass().getMethod("unregisterAppShuffle", int.class); unregisterAppShuffle.invoke(lifecycleManager, appShuffleId); diff --git a/pom.xml b/pom.xml index 81ce0e5d462a..887839ce5fc0 100644 --- a/pom.xml +++ b/pom.xml @@ -53,7 +53,7 @@ delta-core 2.4.0 24 - 0.3.2-incubating + 0.4.1 0.8.0 15.0.0 15.0.0-gluten diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index 3f1760069792..71db637a8403 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -21,7 +21,7 @@ 3.4.2 2.12 3 - 0.3.0-incubating + 0.3.2-incubating 0.8.0 1.2.0-SNAPSHOT 32.0.1-jre @@ -167,7 +167,7 @@ celeborn-0.4 - 0.4.0-incubating + 0.4.1