Merge pull request #690 from NVIDIA/branch-24.06

release 24.06 [skip ci]
NVIDIA · Jul 12, 2024 · c7becc2 · c7becc2
2 parents df01b39 + 20fc7b9
commit c7becc2
Show file tree

Hide file tree

Showing 43 changed files with 1,021 additions and 249 deletions.
diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-    - branch-24.04
+    - branch-24.06
     types: [closed]
 
 jobs:
@@ -29,14 +29,14 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          ref: branch-24.04 # force to fetch from latest upstream instead of PR ref
+          ref: branch-24.06 # force to fetch from latest upstream instead of PR ref
 
       - name: auto-merge job
         uses: ./.github/workflows/auto-merge
         env:
           OWNER: NVIDIA
           REPO_NAME: spark-rapids-ml
-          HEAD: branch-24.04
-          BASE: branch-24.06
+          HEAD: branch-24.06
+          BASE: branch-24.08
           AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
 
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,17 +34,20 @@ jobs:
       args: ${{ env.args }}
 
     # This job only runs for pull request comments
-    if: contains( '\
-      lijinf2,\
-      eordentlich,\
-      wbo4958,\
-      leewyang,\
-      rongou,\
-      wjxiz1992,\
-      GaryShen2008,\
-      NvTimLiu,\
-      YanxuanLiu,\
-      ', format('{0},', github.actor)) && github.event.comment.body == 'build'
+    if: |
+      github.event.comment.body == 'build' &&
+      (
+        github.actor == 'lijinf2' ||
+        github.actor == 'eordentlich' ||
+        github.actor == 'wbo4958' ||
+        github.actor == 'leewyang' ||
+        github.actor == 'rongou' ||
+        github.actor == 'wjxiz1992' ||
+        github.actor == 'GaryShen2008' ||
+        github.actor == 'NvTimLiu' ||
+        github.actor == 'YanxuanLiu' ||
+        github.actor == 'pxLi'
+      )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
@@ -59,15 +62,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
           ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
           lfs: 'true'
 
       # repo specific steps
       - name: Setup java
-        uses: actions/setup-java@v3
+        uses: actions/setup-java@v4
         with:
           distribution: adopt
           java-version: 8

diff --git a/ci/Dockerfile b/ci/Dockerfile
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
     && conda config --set solver libmamba
 
 # install cuML
-ARG CUML_VER=24.04
+ARG CUML_VER=24.06
 RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER python=3.9 cuda-version=11.8 \
     && conda clean --all -f -y
diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 ARG PYSPARK_VERSION=3.3.1
-ARG RAPIDS_VERSION=24.4.0
+ARG RAPIDS_VERSION=24.6.0
 ARG ARCH=amd64
 #ARG ARCH=arm64
 # Install packages to build spark-rapids-ml

diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python
@@ -17,7 +17,7 @@
 ARG CUDA_VERSION=11.8.0
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 
-ARG CUML_VERSION=24.04
+ARG CUML_VERSION=24.06
 
 # Install packages to build spark-rapids-ml
 RUN apt update -y \

diff --git a/docs/site/compatibility.md b/docs/site/compatibility.md
@@ -34,4 +34,4 @@ Note: Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it
 ## Single vs Double precision inputs
 The underlying cuML implementations all accept single precision (e.g. Float or float32) input types and offer the best performance in this case.  As a result, by default, Spark RAPIDS ML converts Spark DataFrames supplied to `fit` and `transform` methods having double precision data types (i.e. `VectorUDT`, `ArrayType(DoubleType())`, `DoubleType()` columns) to single precision before passing them down to the cuML layer.  Most of the cuML algorithm implementations also support double precision inputs.   The Estimator (for all algorithms) constructor parameter `float32_inputs` can be used to control this behavior.  The default value is `True` which forces the conversion to single precision for all algorithms, but it can be set to `False` in which case double precision input data is passed to those cuML algorithms which support it.
 
-Currently all algorithms *except* the following support double precision:  LogisticRegression, k-NN, UMAP.
+Currently all algorithms *except* the following support double precision:  k-NN, UMAP.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -9,7 +9,7 @@
 project = 'spark-rapids-ml'
 copyright = '2024, NVIDIA'
 author = 'NVIDIA'
-release = '24.04.0'
+release = '24.06.0'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

diff --git a/jvm/README.md b/jvm/README.md
@@ -74,7 +74,7 @@ the _project root path_ with:
 cd jvm
 mvn clean package
 ```
-Then `rapids-4-spark-ml_2.12-24.02.0-SNAPSHOT.jar` will be generated under `target` folder.
+Then `rapids-4-spark-ml_2.12-24.04.1-SNAPSHOT.jar` will be generated under `target` folder.
 
 Users can also use the _release_ version spark-rapids plugin as the dependency if it's already been
 released in public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark)
@@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`.
 
 Add the artifact jar to the Spark, for example:
 ```bash
-ML_JAR="target/rapids-4-spark-ml_2.12-24.02.0-SNAPSHOT.jar"
-PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar"
+ML_JAR="target/rapids-4-spark-ml_2.12-24.04.1-SNAPSHOT.jar"
+PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1.jar"
 
 $SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
  --driver-memory 20G \

diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
 sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
 sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"
 
-RAPIDS_VERSION=24.4.0
+RAPIDS_VERSION=24.6.0
 
 # install scikit-learn 
 sudo /usr/local/bin/pip3.9 install scikit-learn

diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
@@ -51,7 +51,7 @@ If you already have a Databricks account, you can run the example notebooks on a
       spark.task.resource.gpu.amount 1
       spark.databricks.delta.preview.enabled true
       spark.python.worker.reuse true
-      spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.02.0.jar:/databricks/spark/python
+      spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.04.1.jar:/databricks/spark/python
       spark.sql.execution.arrow.maxRecordsPerBatch 100000
       spark.rapids.memory.gpu.minAllocFraction 0.0001
       spark.plugins com.nvidia.spark.SQLPlugin

diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh
@@ -4,8 +4,8 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
 # IMPORTANT: specify RAPIDS_VERSION in full (e.g. 23.10.0, not 23.10).
 # In general, RAPIDS_VERSION (python) should omit any leading 0 in the month/minor field (e.g. 23.8.0, not 23.08.0),
 # while SPARK_RAPIDS_VERSION (jar) should keep the leading 0 in the month/minor field (e.g. 23.08.2, not 23.8.2).
-RAPIDS_VERSION=24.4.0
-SPARK_RAPIDS_VERSION=24.02.0
+RAPIDS_VERSION=24.6.0
+SPARK_RAPIDS_VERSION=24.04.1
 
 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
 

diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md
@@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
 - Create a cluster with at least two single-gpu workers.  **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
   ```
   export CUDA_VERSION=11.8
-  export RAPIDS_VERSION=24.4.0
+  export RAPIDS_VERSION=24.6.0
 
   gcloud dataproc clusters create $USER-spark-rapids-ml \
   --image-version=2.1-ubuntu \

diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-RAPIDS_VERSION=24.4.0
+RAPIDS_VERSION=24.6.0
 
 # patch existing packages
 mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"

diff --git a/python/README.md b/python/README.md
@@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a
 
 First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html).   Example for CUDA Toolkit 11.8:
 ```bash
-conda create -n rapids-24.04 \
+conda create -n rapids-24.06 \
     -c rapidsai -c conda-forge -c nvidia \
-    cuml=24.04 python=3.9 cuda-version=11.8
+    cuml=24.06 python=3.9 cuda-version=11.8
 ```
 
 **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting.  Once you have a working environment, you can then try installing directly, if necessary.
@@ -19,7 +19,7 @@ conda create -n rapids-24.04 \
 
 Once you have the conda environment, activate it and install the required packages.
 ```bash
-conda activate rapids-24.04
+conda activate rapids-24.06
 
 ## for development access to notebooks, tests, and benchmarks
 git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git