Merge branch 'opt_nested_funcs' of https://github.com/KevinyhZou/gluten into opt_nested_funcs
KevinyhZou committed Feb 11, 2025
2 parents 2ec4cef + b76e3f9 commit 8a95008
Showing 158 changed files with 3,367 additions and 1,093 deletions.
144 changes: 130 additions & 14 deletions .github/workflows/velox_backend.yml
@@ -75,7 +75,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Get Ccache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: '${{ env.CCACHE_DIR }}'
key: ccache-centos7-release-default-${{github.sha}}
@@ -96,7 +96,7 @@ jobs:
"
- name: "Save ccache"
uses: actions/cache/save@v3
uses: actions/cache/save@v4
id: ccache
with:
path: '${{ env.CCACHE_DIR }}'
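
For context, here is a minimal sketch of how the restore/save pair bumped to v4 above fits together as a standalone snippet. The path and key mirror the values visible in this hunk; the restore-keys fallback is an illustrative addition and may not match what the workflow actually uses.

  # Sketch only: pairing actions/cache restore and save at v4.
  - name: Get Ccache
    uses: actions/cache/restore@v4
    with:
      path: '${{ env.CCACHE_DIR }}'
      key: ccache-centos7-release-default-${{ github.sha }}
      restore-keys: |
        ccache-centos7-release-default
  # ... build steps that populate or update the ccache directory ...
  - name: "Save ccache"
    uses: actions/cache/save@v4
    id: ccache
    with:
      path: '${{ env.CCACHE_DIR }}'
      key: ccache-centos7-release-default-${{ github.sha }}
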
@@ -507,7 +507,7 @@ jobs:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
uniffle: [ "0.9.1" ]
uniffle: [ "0.9.2" ]
hadoop: [ "2.8.5" ]
runs-on: ubuntu-20.04
container: apache/gluten:centos-8
@@ -527,7 +527,9 @@
run: |
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \
cd /opt && \
${WGET_CMD} https://archive.apache.org/dist/incubator/uniffle/${{ matrix.uniffle }}/apache-uniffle-${{ matrix.uniffle }}-incubating-bin.tar.gz && \
mkdir /opt/uniffle && tar xzf apache-uniffle-${{ matrix.uniffle }}-incubating-bin.tar.gz -C /opt/uniffle --strip-components=1 && \
${WGET_CMD} https://archive.apache.org/dist/hadoop/common/hadoop-${{ matrix.hadoop }}/hadoop-${{ matrix.hadoop }}.tar.gz && \
tar xzf hadoop-${{ matrix.hadoop }}.tar.gz -C /opt/ && \
cd /opt/uniffle && mkdir shuffle_data && \
bash -c "echo -e 'XMX_SIZE=16g\nHADOOP_HOME=/opt/hadoop-${{ matrix.hadoop }}' > ./bin/rss-env.sh" && \
@@ -757,8 +759,6 @@ jobs:
path: /root/.m2/repository/org/apache/arrow/
- name: Prepare spark.test.home for Spark 3.4.4 (other tests)
run: |
rm -rf /opt/shims/spark34
bash .github/workflows/util/install_spark_resources.sh 3.4
dnf module -y install python39 && \
alternatives --set python3 /usr/bin/python3.9 && \
pip3 install setuptools && \
@@ -778,7 +778,7 @@
if: always()
uses: actions/upload-artifact@v4
with:
name: test-report-spark34
name: test-report-spark34-jdk17
path: '**/surefire-reports/TEST-*.xml'
- name: Upload golden files
if: failure()
@@ -787,6 +787,51 @@
name: golden-files-spark34
path: /tmp/tpch-approved-plan/**
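
A note on the renamed test-report artifacts: actions/upload-artifact@v4 treats artifact names as unique and immutable within a workflow run, which is presumably why the jdk17 and jdk8 job pairs in this file upload under distinct names. A minimal sketch of the pattern (names illustrative):

  # Each job variant uploads under its own artifact name; reusing one name
  # from two jobs in the same run would fail with upload-artifact v4.
  - name: Upload test report
    if: always()
    uses: actions/upload-artifact@v4
    with:
      name: test-report-spark34-jdk17   # the jdk8 twin uses a different name
      path: '**/surefire-reports/TEST-*.xml'
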

run-spark-test-spark34-jdk8:
needs: build-native-lib-centos-7
runs-on: ubuntu-20.04
container: apache/gluten:centos-8
steps:
- uses: actions/checkout@v2
- name: Download All Artifacts
uses: actions/download-artifact@v4
with:
name: velox-native-lib-centos-7-${{github.sha}}
path: ./cpp/build/releases
- name: Download Arrow Jars
uses: actions/download-artifact@v4
with:
name: arrow-jars-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
- name: Prepare spark.test.home for Spark 3.4.4 (other tests)
run: |
dnf module -y install python39 && \
alternatives --set python3 /usr/bin/python3.9 && \
pip3 install setuptools && \
pip3 install pyspark==3.4.4 cython && \
pip3 install pandas pyarrow
- name: Build and Run unit test for Spark 3.4.4 (other tests)
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.12
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export SPARK_HOME=/opt/shims/spark34/spark_home/
ls -l /opt/shims/spark34/spark_home/
$MVN_CMD clean test -Pspark-3.4 -Pjava-8 -Pbackends-velox -Pceleborn -Pdelta -Phudi -Pspark-ut \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags \
-DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/"
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-report-spark34
path: '**/surefire-reports/TEST-*.xml'
- name: Upload golden files
if: failure()
uses: actions/upload-artifact@v4
with:
name: golden-files-spark34-jdk8
path: /tmp/tpch-approved-plan/**

run-spark-test-spark34-slow:
needs: build-native-lib-centos-7
@@ -804,14 +849,9 @@ jobs:
with:
name: arrow-jars-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
- name: Prepare
run: |
rm -rf /opt/shims/spark34
bash .github/workflows/util/install_spark_resources.sh 3.4
- name: Build and Run unit test for Spark 3.4.4 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
export SPARK_HOME=/opt/shims/spark34/spark_home/
ls -l /opt/shims/spark34/spark_home/
$MVN_CMD clean test -Pspark-3.4 -Pjava-17 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -Phudi \
@@ -821,7 +861,39 @@
if: always()
uses: actions/upload-artifact@v4
with:
name: test-report-spark34-slow
name: test-report-spark34-slow-jdk17
path: '**/surefire-reports/TEST-*.xml'

run-spark-test-spark34-slow-jdk8:
needs: build-native-lib-centos-7
runs-on: ubuntu-20.04
container: apache/gluten:centos-8
steps:
- uses: actions/checkout@v2
- name: Download All Artifacts
uses: actions/download-artifact@v4
with:
name: velox-native-lib-centos-7-${{github.sha}}
path: ./cpp/build/releases
- name: Download Arrow Jars
uses: actions/download-artifact@v4
with:
name: arrow-jars-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
- name: Build and Run unit test for Spark 3.4.4 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export SPARK_HOME=/opt/shims/spark34/spark_home/
ls -l /opt/shims/spark34/spark_home/
$MVN_CMD clean test -Pspark-3.4 -Pjava-8 -Pbackends-velox -Pceleborn -Pdelta -Pspark-ut -Phudi \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \
-DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/"
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-report-spark34-slow-jdk8
path: '**/surefire-reports/TEST-*.xml'
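
Comparing the two slow-test jobs above: the jdk8 variant differs from the jdk17 one in the JAVA_HOME export, the -Pjava-8 versus -Pjava-17 Maven profile, and the absence of -Piceberg. Condensed from the run steps shown in this diff (the continuation of the jdk17 command is not visible here):

  # jdk17 slow job (first lines of its run step)
  export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
  $MVN_CMD clean test -Pspark-3.4 -Pjava-17 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -Phudi \
    ...
  # jdk8 slow job (note -Pjava-8 and no -Piceberg)
  export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
  $MVN_CMD clean test -Pspark-3.4 -Pjava-8 -Pbackends-velox -Pceleborn -Pdelta -Pspark-ut -Phudi \
    -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \
    -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/"
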

run-spark-test-spark35:
@@ -867,6 +939,50 @@ jobs:
name: golden-files-spark35
path: /tmp/tpch-approved-plan/**

run-spark-test-spark35-jdk17:
needs: build-native-lib-centos-7
runs-on: ubuntu-20.04
container: apache/gluten:centos-8-jdk17
steps:
- uses: actions/checkout@v2
- name: Download All Artifacts
uses: actions/download-artifact@v4
with:
name: velox-native-lib-centos-7-${{github.sha}}
path: ./cpp/build/releases
- name: Download Arrow Jars
uses: actions/download-artifact@v4
with:
name: arrow-jars-centos-7-${{github.sha}}
path: /root/.m2/repository/org/apache/arrow/
- name: Prepare
run: |
dnf module -y install python39 && \
alternatives --set python3 /usr/bin/python3.9 && \
pip3 install setuptools && \
pip3 install pyspark==3.5.2 cython && \
pip3 install pandas pyarrow
- name: Build and Run unit test for Spark 3.5.2 (other tests)
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.12
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
$MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Phudi -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/ ${EXTRA_FLAGS}" \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-report-spark35-jdk17
path: '**/surefire-reports/TEST-*.xml'
- name: Upload golden files
if: failure()
uses: actions/upload-artifact@v4
with:
name: golden-files-spark35
path: /tmp/tpch-approved-plan/**

run-spark-test-spark35-scala213:
needs: build-native-lib-centos-7
runs-on: ubuntu-20.04
@@ -1067,7 +1183,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Get Ccache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: '${{ env.CCACHE_DIR }}'
key: ccache-centos8-release-default-${{github.sha}}
@@ -1079,7 +1195,7 @@
bash dev/ci-velox-buildshared-centos-8.sh
ccache -s
# - name: "Save ccache"
# uses: actions/cache/save@v3
# uses: actions/cache/save@v4
# id: ccache
# with:
# path: '${{ env.CCACHE_DIR }}'
6 changes: 0 additions & 6 deletions backends-clickhouse/pom.xml
@@ -490,12 +490,6 @@
<goals>
<goal>test</goal>
</goals>
<configuration>
<systemProperties>
<clickhouse.lib.path>${clickhouse.lib.path}</clickhouse.lib.path>
<tpcds.data.path>${tpcds.data.path}</tpcds.data.path>
</systemProperties>
</configuration>
</execution>
</executions>
</plugin>
@@ -204,6 +204,7 @@ object CHExpressionUtil
FROM_UTC_TIMESTAMP -> UtcTimestampValidator(),
STACK -> DefaultValidator(),
RAISE_ERROR -> DefaultValidator(),
WIDTH_BUCKET -> DefaultValidator()
WIDTH_BUCKET -> DefaultValidator(),
MAKE_DATE -> DefaultValidator()
)
}
@@ -17,8 +17,6 @@
package org.apache.gluten.execution

import org.apache.gluten.backendsapi.clickhouse.CHConf
import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.SparkConf
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
@@ -43,7 +41,6 @@ class GlutenClickHouseJoinSuite extends GlutenClickHouseWholeStageTransformerSui
.set("spark.sql.adaptive.enabled", "false")
.set("spark.sql.files.minPartitionNum", "1")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
.set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
@@ -16,18 +16,10 @@
*/
package org.apache.gluten.execution

import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.{TestExceptionUtils, UTSystemParameters}

import org.apache.spark.SparkConf
import org.apache.gluten.utils.TestExceptionUtils

class GlutenClickHouseNativeExceptionSuite extends GlutenClickHouseWholeStageTransformerSuite {

override protected def sparkConf: SparkConf = {
super.sparkConf
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
}

test("native exception caught by jvm") {
try {
TestExceptionUtils.generateNativeException()
@@ -18,7 +18,6 @@ package org.apache.gluten.execution

import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.exception.GlutenException
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
@@ -65,7 +64,6 @@ class GlutenClickHouseNativeLibSuite extends PlanTest {
.builder()
.master("local[1]")
.config(baseSparkConf)
.config(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.config(GlutenConfig.GLUTEN_EXECUTOR_LIB_PATH.key, "/path/not/exist/libch.so")
.getOrCreate()
spark.sql("select 1").show()
@@ -17,7 +17,6 @@
package org.apache.gluten.execution

import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
@@ -52,7 +51,6 @@ class GlutenClickHouseSyntheticDataSuite
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
.set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
@@ -132,7 +132,6 @@ abstract class GlutenClickHouseTPCDSAbstractSuite
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
.set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
@@ -17,7 +17,6 @@
package org.apache.gluten.execution

import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
@@ -568,7 +567,6 @@ abstract class GlutenClickHouseTPCHAbstractSuite
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
.set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
@@ -17,7 +17,6 @@
package org.apache.gluten.execution

import org.apache.gluten.backendsapi.clickhouse.RuntimeConfig
import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.{SPARK_VERSION_SHORT, SparkConf}
@@ -79,7 +78,6 @@ class GlutenClickHouseWholeStageTransformerSuite extends WholeStageTransformerSu
import org.apache.gluten.backendsapi.clickhouse.CHConf._

val conf = super.sparkConf
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.enable.native.validation", "false")
.set("spark.sql.warehouse.dir", warehouse)
.setCHConfig("user_defined_path", "/tmp/user_defined")
@@ -16,9 +16,6 @@
*/
package org.apache.gluten.execution

import org.apache.gluten.config.GlutenConfig
import org.apache.gluten.utils.UTSystemParameters

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, TestUtils}
import org.apache.spark.sql.catalyst.expressions.{Expression, GetJsonObject, Literal}
@@ -57,7 +54,6 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH.key, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
.set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
(remaining changed files in this commit are not shown on this page)
