Merge remote-tracking branch 'upstream/master' into HEAD

juanvisoler · Aug 8, 2023 · 548b93a · 548b93a
2 parents 36124cb + b4b9121
commit 548b93a
Show file tree

Hide file tree

Showing 3,494 changed files with 206,536 additions and 67,116 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -76,7 +76,7 @@ CORE:
   - "common/kvstore/**/*"
   - "common/network-common/**/*"
   - "common/network-shuffle/**/*"
-  - "python/pyspark/**/*.py"
+  - "python/pyspark/*.py"
   - "python/pyspark/tests/**/*.py"
 SPARK SUBMIT:
   - "bin/spark-submit*"
@@ -108,8 +108,8 @@ AVRO:
 DSTREAM:
   - "streaming/**/*"
   - "data/streaming/**/*"
-  - "external/kinesis*"
-  - "external/kafka*"
+  - "connector/kinesis*"
+  - "connector/kafka*"
   - "python/pyspark/streaming/**/*"
 GRAPHX:
   - "graphx/**/*"
@@ -155,6 +155,7 @@ CONNECT:
   - "connector/connect/**/*"
   - "**/sql/sparkconnect/**/*"
   - "python/pyspark/sql/**/connect/**/*"
+  - "python/pyspark/ml/**/connect/**/*"
 PROTOBUF:
   - "connector/protobuf/**/*"
   - "python/pyspark/sql/protobuf/**/*"
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -27,7 +27,7 @@ on:
         required: true
         default: '*'
       jdk:
-        description: 'JDK version: 8, 11 or 17'
+        description: 'JDK version: 8, 11, 17 or 21-ea'
         required: true
         default: '8'
       scala:

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
diff --git a/.github/workflows/build_branch32.yml → .github/workflows/build_branch35.yml b/.github/workflows/build_branch32.yml → .github/workflows/build_branch35.yml
@@ -17,11 +17,11 @@
 # under the License.
 #
 
-name: "Build (branch-3.2, Scala 2.13, Hadoop 3, JDK 8)"
+name: "Build (branch-3.5, Scala 2.13, Hadoop 3, JDK 8)"
 
 on:
   schedule:
-    - cron: '0 4 * * *'
+    - cron: '0 11 * * *'
 
 jobs:
   run-build:
@@ -32,18 +32,18 @@ jobs:
     if: github.repository == 'apache/spark'
     with:
       java: 8
-      branch: branch-3.2
-      hadoop: hadoop3.2
+      branch: branch-3.5
+      hadoop: hadoop3
       envs: >-
         {
           "SCALA_PROFILE": "scala2.13"
         }
-      # TODO(SPARK-39712): Reenable "sparkr": "true"
-      # TODO(SPARK-39685): Reenable "lint": "true"
-      # TODO(SPARK-39681): Reenable "pyspark": "true"
-      # TODO(SPARK-39682): Reenable "docker-integration-tests": "true"
       jobs: >-
         {
           "build": "true",
-          "tpcds-1g": "true"
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true",
+          "lint" : "true"
         }
diff --git a/.github/workflows/build_java21.yml b/.github/workflows/build_java21.yml
@@ -0,0 +1,49 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build (master, Scala 2.12, Hadoop 3, JDK 21-ea)"
+
+on:
+  schedule:
+    - cron: '0 4 * * *'
+
+jobs:
+  run-build:
+    permissions:
+      packages: write
+    name: Run
+    uses: ./.github/workflows/build_and_test.yml
+    if: github.repository == 'apache/spark'
+    with:
+      java: 21-ea
+      branch: master
+      hadoop: hadoop3
+      envs: >-
+        {
+          "SKIP_MIMA": "true",
+          "SKIP_UNIDOC": "true",
+        }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "false",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true"
+        }
diff --git a/.github/workflows/build_maven.yml b/.github/workflows/build_maven.yml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build using Maven (master, Scala 2.12, Hadoop 3, JDK 8)"
+
+on:
+  schedule:
+    - cron: '0 13 * * *'
+
+jobs:
+  run-build:
+    permissions:
+      packages: write
+    name: Run
+    uses: ./.github/workflows/maven_test.yml
+    if: github.repository == 'apache/spark'
diff --git a/.github/workflows/maven_test.yml b/.github/workflows/maven_test.yml
@@ -0,0 +1,210 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and test using Maven
+
+on:
+  workflow_call:
+    inputs:
+      java:
+        required: false
+        type: string
+        default: 8
+      branch:
+        description: Branch to run the build against
+        required: false
+        type: string
+        default: master
+      hadoop:
+        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
+        required: false
+        type: string
+        default: hadoop3
+      envs:
+        description: Additional environment variables to set when running the tests. Should be in JSON format.
+        required: false
+        type: string
+        default: '{}'
+jobs:
+  # Build: build Spark and run the tests for specified modules using maven
+  build:
+    name: "Build modules using Maven: ${{ matrix.modules }} ${{ matrix.comment }}"
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        java:
+          - ${{ inputs.java }}
+        hadoop:
+          - ${{ inputs.hadoop }}
+        hive:
+          - hive2.3
+        modules:
+          - >-
+            core,launcher,common#unsafe,common#kvstore,common#network-common,common#network-shuffle,common#sketch
+          - >-
+            graphx,streaming,mllib-local,mllib,hadoop-cloud
+          - >-
+            repl,sql#hive-thriftserver
+          - >-
+            connector#kafka-0-10,connector#kafka-0-10-sql,connector#kafka-0-10-token-provider,connector#spark-ganglia-lgpl,connector#protobuf,connector#avro
+          - >-
+            sql#catalyst,resource-managers#yarn,resource-managers#mesos,resource-managers#kubernetes#core
+          - >-
+            connect
+        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
+        included-tags: [ "" ]
+        excluded-tags: [ "" ]
+        comment: [ "" ]
+        include:
+          # Hive tests
+          - modules: sql#hive
+            java: ${{ inputs.java }}
+            hadoop: ${{ inputs.hadoop }}
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- slow tests"
+          - modules: sql#hive
+            java: ${{ inputs.java }}
+            hadoop: ${{ inputs.hadoop }}
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- other tests"
+          # SQL tests
+          - modules: sql#core
+            java: ${{ inputs.java }}
+            hadoop: ${{ inputs.hadoop }}
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- extended tests"
+          - modules: sql#core
+            java: ${{ inputs.java }}
+            hadoop: ${{ inputs.hadoop }}
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowSQLTest
+            comment: "- slow tests"
+          - modules: sql#core
+            java: ${{ inputs.java }}
+            hadoop: ${{ inputs.hadoop }}
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
+            comment: "- other tests"
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      SPARK_LOCAL_IP: localhost
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+    steps:
+      - name: Checkout Spark repository
+        uses: actions/checkout@v3
+        # In order to fetch changed files
+        with:
+          fetch-depth: 0
+          repository: apache/spark
+          ref: ${{ inputs.branch }}
+      - name: Sync the current branch with the latest in Apache Spark
+        if: github.repository != 'apache/spark'
+        run: |
+          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+          git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
+          git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
+      # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+      - name: Cache Scala, SBT and Maven
+        uses: actions/cache@v3
+        with:
+          path: |
+            build/apache-maven-*
+            build/scala-*
+            build/*.jar
+            ~/.sbt
+          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+          restore-keys: |
+            build-
+      - name: Cache Coursier local repository
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/coursier
+          key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+          restore-keys: |
+            ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
+      - name: Install Java ${{ matrix.java }}
+        uses: actions/setup-java@v3
+        with:
+          distribution: temurin
+          java-version: ${{ matrix.java }}
+      - name: Install Python 3.8
+        uses: actions/setup-python@v4
+        # We should install one Python that is higher than 3+ for SQL and Yarn because:
+        # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+        # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+        if: contains(matrix.modules, 'resource-managers#yarn') || (contains(matrix.modules, 'sql#core'))
+        with:
+          python-version: 3.8
+          architecture: x64
+      - name: Install Python packages (Python 3.8)
+        if: (contains(matrix.modules, 'sql#core'))
+        run: |
+          python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.56.0' 'protobuf==3.20.3'
+          python3.8 -m pip list
+      # Run the tests.
+      - name: Run tests
+        env: ${{ fromJSON(inputs.envs) }}
+        shell: 'script -q -e -c "bash {0}"'
+        run: |
+          # Fix for TTY related issues when launching the Ammonite REPL in tests.
+          export TERM=vt100 && script -qfc 'echo exit | amm -s' && rm typescript
+          # `set -e` to make the exit status as expected due to use `script -q -e -c` to run the commands
+          set -e
+          export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+          export MAVEN_CLI_OPTS="--no-transfer-progress"
+          export JAVA_VERSION=${{ matrix.java }}
+          # Replace with the real module name, for example, connector#kafka-0-10 -> connector/kafka-0-10
+          export TEST_MODULES=`echo "$MODULES_TO_TEST" | sed -e "s%#%/%g"`
+          ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} clean install
+          if [[ "$INCLUDED_TAGS" != "" ]]; then
+            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
+          elif [[ "$EXCLUDED_TAGS" != "" ]]; then
+            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
+          elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
+            ./build/mvn $MAVEN_CLI_OPTS -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connector/connect/common,connector/connect/server test -fae
+          elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then
+            # To avoid a compilation loop, for the `sql/hive-thriftserver` module, run `clean install` instead
+            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} clean install -fae
+          else
+            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Pspark-ganglia-lgpl -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} test -fae
+          fi
+      - name: Clean up local Maven repository
+        run: |  
+          rm -rf ~/.m2/repository/org/apache/spark
+      - name: Upload test results to report
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+          path: "**/target/test-reports/*.xml"
+      - name: Upload unit tests log files
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+          path: "**/target/unit-tests.log"
diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml
@@ -32,9 +32,9 @@ jobs:
       matrix:
         branch:
           - master
+          - branch-3.5
           - branch-3.4
           - branch-3.3
-          - branch-3.2
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v3

diff --git a/.gitignore b/.gitignore
@@ -117,6 +117,6 @@ spark-warehouse/
 node_modules
 
 # For Antlr
-sql/catalyst/gen/
-sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.tokens
-sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/gen/
+sql/api/gen/
+sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.tokens
+sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/gen/
diff --git a/LICENSE b/LICENSE
@@ -213,7 +213,6 @@ Apache Software Foundation License 2.0
 common/network-common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java
 core/src/main/java/org/apache/spark/util/collection/TimSort.java
 core/src/main/resources/org/apache/spark/ui/static/bootstrap*
-core/src/main/resources/org/apache/spark/ui/static/jsonFormatter*
 core/src/main/resources/org/apache/spark/ui/static/vis*
 docs/js/vendor/bootstrap.js
 connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java