From 2361af9740782f4057f4ed3abfde382a555f45b6 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Wed, 8 Nov 2023 15:33:43 +0800
Subject: [PATCH 1/4] [Doc] add performance tuning page (#504)

Signed-off-by: Bobby Wang
---
 docs/site/performance.md | 44 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 docs/site/performance.md

diff --git a/docs/site/performance.md b/docs/site/performance.md
new file mode 100644
index 00000000..4804f8c6
--- /dev/null
+++ b/docs/site/performance.md
@@ -0,0 +1,44 @@
+---
+title: Performance Tuning
+nav_order: 6
+---
+# Performance Tuning
+
+* TOC
+{:toc}
+
+## Stage-level scheduling
+
+Starting from spark-rapids-ml `23.10.0`, stage-level scheduling is automatically enabled.
+Therefore, if you are using a Spark **standalone** cluster running version **`3.4.0`** or higher, we strongly recommend
+setting `"spark.task.resource.gpu.amount"` to a fractional value. This allows
+multiple tasks to run in parallel during the ETL phase, which improves performance. A good starting point
+is `"spark.task.resource.gpu.amount" = 1/spark.executor.cores`. For example,

``` bash
spark-submit \
  --master spark://:7077 \
  --conf spark.executor.cores=12 \
  --conf spark.task.cpus=1 \
  --conf spark.executor.resource.gpu.amount=1 \
  --conf spark.task.resource.gpu.amount=0.08 \
  ...
```

+The above spark-submit command requests 1 GPU and 12 CPUs per executor, so
+a total of 12 tasks per executor can run concurrently during the ETL phase. Stage-level scheduling
+is then used internally by the library to automatically carry out the ML training phases with the required 1 GPU per task.
+
+However, if you are using a spark-rapids-ml version earlier than 23.10.0 or a Spark
+standalone cluster version below 3.4.0, you need to make sure that only 1 task runs per executor at any time.
+To do this, set `spark.task.cpus` equal to `spark.executor.cores`, or set `"spark.task.resource.gpu.amount"=1`. For example,

``` bash
spark-submit \
  --master spark://:7077 \
  --conf spark.executor.cores=12 \
  --conf spark.task.cpus=1 \
  --conf spark.executor.resource.gpu.amount=1 \
  --conf spark.task.resource.gpu.amount=1 \
  ...
```

From f9b71093e89f1304beacf1abb59f0e933c78f0ec Mon Sep 17 00:00:00 2001
From: eordentlich
Date: Wed, 8 Nov 2023 15:45:44 -0800
Subject: [PATCH 2/4] azure db kernel driver has been updated (#516)

Signed-off-by: Erik Ordentlich
---
 notebooks/databricks/README.md             |  2 --
 notebooks/databricks/init-pip-cuda-11.8.sh | 12 ------------
 2 files changed, 14 deletions(-)

diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
index cba043b4..c1681228 100644
--- a/notebooks/databricks/README.md
+++ b/notebooks/databricks/README.md
@@ -29,8 +29,6 @@ If you already have a Databricks account, you can run the example notebooks on a
- downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL.
- installs various `cuXX` dependencies via pip.
- **Note**: as of the last update of this README, Azure Databricks requires a CUDA driver forward compatibility package. Uncomment the designated lines for this in the init script. AWS Databricks does not need this and leave the lines commented in that case.
-
- Copy the modified `init-pip-cuda-11.8.sh` init script to your *workspace* (not DBFS) (ex. workspace directory: /Users/< databricks-user-name >/init_scripts).
```bash export WS_SAVE_DIR="/path/to/directory/in/workspace" diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index 5d5c7ce4..d20e3a5c 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -13,18 +13,6 @@ curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RA wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit -# install forward compatibility package due to old driver -# uncomment below lines on Azure Databricks -# distro=ubuntu2004 -# arch=x86_64 -# apt-key del 7fa2af80 -# wget https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.0-1_all.deb -# dpkg -i cuda-keyring_1.0-1_all.deb -# apt-get update -# apt-get install -y cuda-compat-11-8 -# export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64 -# ldconfig - # reset symlink and update library loading paths rm /usr/local/cuda ln -s /usr/local/cuda-11.8 /usr/local/cuda From 84d344ec56cab11926259525ec067bdec8e6c8ba Mon Sep 17 00:00:00 2001 From: eordentlich Date: Wed, 8 Nov 2023 15:51:20 -0800 Subject: [PATCH 3/4] update remaining 23.8/23.08 in docs and also etl plugin version in scripts (#514) Signed-off-by: Erik Ordentlich --- jvm/README.md | 6 +++--- jvm/pom.xml | 4 ++-- notebooks/databricks/README.md | 2 +- notebooks/databricks/init-pip-cuda-11.8.sh | 4 ++-- python/README.md | 6 +++--- python/benchmark/databricks/gpu_cluster_spec.sh | 2 +- python/benchmark/databricks/init-pip-cuda-11.8.sh | 4 ++-- python/run_benchmark.sh | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/jvm/README.md b/jvm/README.md index 49186b03..859a0477 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -74,7 +74,7 @@ the _project root path_ with: cd jvm mvn clean package ``` -Then `rapids-4-spark-ml_2.12-23.08.0-SNAPSHOT.jar` will be generated under `target` folder. +Then `rapids-4-spark-ml_2.12-23.10.0-SNAPSHOT.jar` will be generated under `target` folder. Users can also use the _release_ version spark-rapids plugin as the dependency if it's already been released in public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark) @@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`. 
Add the artifact jar to the Spark, for example: ```bash -ML_JAR="target/rapids-4-spark-ml_2.12-23.08.0-SNAPSHOT.jar" -PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.08.2-SNAPSHOT/rapids-4-spark_2.12-23.08.2-SNAPSHOT.jar" +ML_JAR="target/rapids-4-spark-ml_2.12-23.10.0-SNAPSHOT.jar" +PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar" $SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \ --driver-memory 20G \ diff --git a/jvm/pom.xml b/jvm/pom.xml index f978ea39..fadf180f 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -20,7 +20,7 @@ 4.0.0 com.nvidia rapids-4-spark-ml_2.12 - 23.08.0-SNAPSHOT + 23.10.0-SNAPSHOT RAPIDS Accelerator for Apache Spark ML The RAPIDS cuML library for Apache Spark 2021 @@ -93,7 +93,7 @@ com.nvidia rapids-4-spark_2.12 - 23.08.0 + 23.10.0 diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index c1681228..9c685e4b 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -44,7 +44,7 @@ If you already have a Databricks account, you can run the example notebooks on a spark.task.resource.gpu.amount 1 spark.databricks.delta.preview.enabled true spark.python.worker.reuse true - spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.08.2.jar:/databricks/spark/python + spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.10.0.jar:/databricks/spark/python spark.sql.execution.arrow.maxRecordsPerBatch 100000 spark.rapids.memory.gpu.minAllocFraction 0.0001 spark.plugins com.nvidia.spark.SQLPlugin diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index d20e3a5c..b7b2169e 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -2,10 +2,10 @@ # set portion of path below after /dbfs/ to dbfs zip file location SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 -# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) +# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) RAPIDS_VERSION=23.10.0 -SPARK_RAPIDS_VERSION=23.08.2 +SPARK_RAPIDS_VERSION=23.10.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar diff --git a/python/README.md b/python/README.md index c77d6134..dac24b9d 100644 --- a/python/README.md +++ b/python/README.md @@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 11.8: ```bash -conda create -n rapids-23.08 \ +conda create -n rapids-23.10 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=23.08 python=3.9 cuda-version=11.8 + cuml=23.10 python=3.9 cuda-version=11.8 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -19,7 +19,7 @@ conda create -n rapids-23.08 \ Once you have the conda environment, activate it and install the required packages. 
```bash -conda activate rapids-23.08 +conda activate rapids-23.10 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh index ee1fa7a6..f8bbc47f 100644 --- a/python/benchmark/databricks/gpu_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_cluster_spec.sh @@ -9,7 +9,7 @@ cat < Date: Wed, 8 Nov 2023 17:02:43 -0800 Subject: [PATCH 4/4] updated version of PR 440 to add spark parameter validation (#515) * Validate parameters by converting them to spark java companion objects Signed-off-by: Bobby Wang * add validate_parameters * fix test failure * add kmeans * fix test issue * address comments, rebase on latest commit Signed-off-by: Erik Ordentlich * fix linear model test and better error message Signed-off-by: Erik Ordentlich * formatting Signed-off-by: Erik Ordentlich * increase cv tolerance for intermittent failure Signed-off-by: Erik Ordentlich * fix tolerance arg Signed-off-by: Erik Ordentlich --------- Signed-off-by: Bobby Wang Signed-off-by: Erik Ordentlich Co-authored-by: Bobby Wang --- python/src/spark_rapids_ml/classification.py | 10 ++++- python/src/spark_rapids_ml/clustering.py | 5 +++ python/src/spark_rapids_ml/core.py | 39 +++++++++++++++++++- python/src/spark_rapids_ml/feature.py | 5 +++ python/src/spark_rapids_ml/knn.py | 4 ++ python/src/spark_rapids_ml/params.py | 11 +++++- python/src/spark_rapids_ml/regression.py | 8 ++++ python/src/spark_rapids_ml/umap.py | 4 ++ python/tests/test_common_estimator.py | 4 ++ python/tests/test_kmeans.py | 21 +++++++++++ python/tests/test_linear_model.py | 30 +++++++++++++-- python/tests/test_logistic_regression.py | 27 +++++++++++++- python/tests/test_pca.py | 19 ++++++++++ python/tests/test_random_forest.py | 23 ++++++++++++ 14 files changed, 202 insertions(+), 8 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 2aab2bce..878d6256 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -27,6 +28,7 @@ cast, ) +import pyspark from pyspark.ml.common import _py2java from pyspark.ml.evaluation import Evaluator, MulticlassClassificationEvaluator @@ -298,6 +300,9 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: mapping = super()._param_mapping() return mapping + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.classification.RandomForestClassifier + class RandomForestClassifier( _RandomForestClassifierClass, @@ -664,7 +669,7 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: def _param_value_mapping( cls, ) -> Dict[str, Callable[[Any], Union[None, str, float, int]]]: - return {"C": lambda x: 1 / x if x != 0.0 else 0.0} + return {"C": lambda x: 1 / x if x > 0.0 else (0.0 if x == 0.0 else None)} def _get_cuml_params_default(self) -> Dict[str, Any]: return { @@ -704,6 +709,9 @@ def _reg_params_value_mapping( return (penalty, C, l1_ratio) + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.classification.LogisticRegression + class _LogisticRegressionCumlParams( _CumlParams, diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 868f0130..c4a30e4f 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -14,10 +14,12 @@ # limitations under the License. # +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast import numpy as np import pandas as pd +import pyspark from pyspark import keyword_only from pyspark.ml.clustering import KMeansModel as SparkKMeansModel from pyspark.ml.clustering import _KMeansParams @@ -92,6 +94,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "max_samples_per_batch": 32768, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.clustering.KMeans + class _KMeansCumlParams(_CumlParams, _KMeansParams, HasFeaturesCols): """ diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index 7751144a..7c242358 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -16,7 +16,7 @@ import json import os import threading -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from collections import namedtuple from typing import ( TYPE_CHECKING, @@ -56,6 +56,7 @@ MLWritable, MLWriter, ) +from pyspark.ml.wrapper import JavaParams from pyspark.sql import Column, DataFrame from pyspark.sql.functions import col, struct from pyspark.sql.pandas.functions import pandas_udf @@ -301,6 +302,16 @@ def _initialize_cuml_logging(verbose: Optional[Union[bool, int]]) -> None: cuml_logger.set_level(log_level) + def _pyspark_class(self) -> Optional[ABCMeta]: + """ + Subclass should override to return corresponding pyspark.ml class + Ex. logistic regression should return pyspark.ml.classification.LogisticRegression + Return None if no corresponding class in pyspark, e.g. knn + """ + raise NotImplementedError( + "pyspark.ml class corresponding to estimator not specified." + ) + class _CumlCaller(_CumlParams, _CumlCommon): """ @@ -419,6 +430,31 @@ def _require_nccl_ucx(self) -> Tuple[bool, bool]: """ return (True, False) + def _validate_parameters(self) -> None: + cls_name = self._pyspark_class() + + if cls_name is not None: + pyspark_est = cls_name() + # Both pyspark and cuml may have a parameter with the same name, + # but cuml might have additional optional values that can be set. 
+ # If we transfer these cuml-specific values to the Spark JVM, + # it would result in an exception. + # To avoid this issue, we skip transferring these parameters + # since the mapped parameters have been validated in _get_cuml_mapping_value. + cuml_est = self.copy() + cuml_params = cuml_est._param_value_mapping().keys() + param_mapping = cuml_est._param_mapping() + pyspark_params = [k for k, v in param_mapping.items() if v in cuml_params] + for p in pyspark_params: + cuml_est.clear(cuml_est.getParam(p)) + + cuml_est._copyValues(pyspark_est) + # validate the parameters + pyspark_est._transfer_params_to_java() + + del pyspark_est + del cuml_est + @abstractmethod def _get_cuml_fit_func( self, @@ -468,6 +504,7 @@ def _call_cuml_fit_func( :class:`Transformer` fitted model """ + self._validate_parameters() cls = self.__class__ diff --git a/python/src/spark_rapids_ml/feature.py b/python/src/spark_rapids_ml/feature.py index 0a5a93e8..6afcc8f0 100644 --- a/python/src/spark_rapids_ml/feature.py +++ b/python/src/spark_rapids_ml/feature.py @@ -15,10 +15,12 @@ # import itertools +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd +import pyspark from pyspark import keyword_only from pyspark.ml.common import _py2java from pyspark.ml.feature import PCAModel as SparkPCAModel @@ -69,6 +71,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "whiten": False, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.feature.PCA + class _PCACumlParams(_CumlParams, _PCAParams, HasInputCols): """ diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index a54859dc..a55d667b 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -15,6 +15,7 @@ # import asyncio +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy as np @@ -68,6 +69,9 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: def _get_cuml_params_default(self) -> Dict[str, Any]: return {"n_neighbors": 5, "verbose": False, "batch_size": 2000000} + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class _NearestNeighborsCumlParams(_CumlParams, HasInputCol, HasLabelCol, HasInputCols): """ diff --git a/python/src/spark_rapids_ml/params.py b/python/src/spark_rapids_ml/params.py index 45e03a7f..1f4835fd 100644 --- a/python/src/spark_rapids_ml/params.py +++ b/python/src/spark_rapids_ml/params.py @@ -429,7 +429,16 @@ def _set_cuml_param( if cuml_param is not None: # if Spark Param is mapped to cuML parameter, set cuml_params - self._set_cuml_value(cuml_param, spark_value) + try: + self._set_cuml_value(cuml_param, spark_value) + except ValueError: + # create more informative message + param_ref_str = ( + cuml_param + " or " + spark_param + if cuml_param != spark_param + else spark_param + ) + raise ValueError(f"{param_ref_str} given invalid value {spark_value}") def _get_cuml_mapping_value(self, k: str, v: Any) -> Any: value_map = self._param_value_mapping() diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 6998a4fc..d0b393e1 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -28,6 +29,7 @@ import numpy as np import pandas as pd +import pyspark from pyspark import Row, TaskContext, keyword_only from pyspark.ml.common import _py2java from pyspark.ml.evaluation import Evaluator, RegressionEvaluator @@ -218,6 +220,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "shuffle": True, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.regression.LinearRegression + class _LinearRegressionCumlParams( _CumlParams, _LinearRegressionParams, HasFeaturesCols @@ -783,6 +788,9 @@ def _param_value_mapping( ) return mapping + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.regression.RandomForestRegressor + class RandomForestRegressor( _RandomForestRegressorClass, diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 062e8e75..46430b60 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -16,6 +16,7 @@ import json import os +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -113,6 +114,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "verbose": False, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class _UMAPCumlParams( _CumlParams, HasFeaturesCol, HasFeaturesCols, HasLabelCol, HasOutputCol diff --git a/python/tests/test_common_estimator.py b/python/tests/test_common_estimator.py index 2dc0108c..99197a3a 100644 --- a/python/tests/test_common_estimator.py +++ b/python/tests/test_common_estimator.py @@ -14,6 +14,7 @@ # limitations under the License. # +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -234,6 +235,9 @@ def _create_pyspark_model(self, result: Row) -> "SparkRapidsMLDummyModel": assert result.model_attribute_b == "hello dummy" return SparkRapidsMLDummyModel._from_row(result) + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class SparkRapidsMLDummyModel( SparkRapidsMLDummyClass, diff --git a/python/tests/test_kmeans.py b/python/tests/test_kmeans.py index a84ad035..17d217f7 100644 --- a/python/tests/test_kmeans.py +++ b/python/tests/test_kmeans.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException from pyspark.ml.clustering import KMeans as SparkKMeans from pyspark.ml.clustering import KMeansModel as SparkKMeansModel from pyspark.ml.functions import array_to_vector @@ -407,3 +408,23 @@ def test_kmeans_spark_compat( assert model.transform(df).take(1) == model2.transform(df).take(1) # True + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + KMeans(k=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + KMeans().setMaxIter(-1).fit(df) diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index a50d44bb..b8db60de 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException 
from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.feature import VectorAssembler from pyspark.ml.functions import array_to_vector @@ -166,9 +167,7 @@ def test_linear_regression_params( # Unsupported value spark_params = {"solver": "l-bfgs"} - with pytest.raises( - ValueError, match="Value 'l-bfgs' for 'solver' param is unsupported" - ): + with pytest.raises(ValueError, match="solver given invalid value l-bfgs"): unsupported_lr = LinearRegression(**spark_params) # make sure no warning when enabling float64 inputs @@ -647,3 +646,28 @@ def test_crossvalidator_linear_regression( spark_cv_model = spark_cv.fit(df) assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + LinearRegression(maxIter=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="regParam given invalid value -1.0" + ): + LinearRegression().setRegParam(-1.0).fit(df) + + # shouldn't throw an exception for setting cuml values + LinearRegression(loss="squared_loss")._validate_parameters() diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index 52c42e1d..3b7a95bd 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -5,6 +5,7 @@ import pytest from _pytest.logging import LogCaptureFixture from packaging import version +from pyspark.errors import IllegalArgumentException from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression from pyspark.ml.classification import ( LogisticRegressionModel as SparkLogisticRegressionModel, @@ -657,7 +658,7 @@ def test_compat_multinomial( regParam=0.1, elasticNetParam=0.2, fitIntercept=fit_intercept, - family="multimonial", + family="multinomial", ) assert mlor.getRegParam() == 0.1 @@ -1046,4 +1047,26 @@ def test_crossvalidator_logistic_regression( ) spark_cv_model = spark_cv.fit(df) - assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics, 0.0005) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + LogisticRegression(maxIter=-1).fit(df) + + # regParam is mapped to different value in LogisticRegression which should be in + # charge of validating it. 
+ with pytest.raises(ValueError, match="C or regParam given invalid value -1.0"): + LogisticRegression().setRegParam(-1.0).fit(df) diff --git a/python/tests/test_pca.py b/python/tests/test_pca.py index eab7592a..4925e4c6 100644 --- a/python/tests/test_pca.py +++ b/python/tests/test_pca.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException from pyspark.ml.feature import PCA as SparkPCA from pyspark.ml.feature import PCAModel as SparkPCAModel from pyspark.ml.functions import array_to_vector @@ -410,3 +411,21 @@ def test_pca_spark_compat( assert loadedModel.pc == model.pc assert loadedModel.explainedVariance == model.explainedVariance assert loadedModel.transform(df).take(1) == model.transform(df).take(1) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + PCA(k=-1).fit(df) + + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + PCA().setK(-1).fit(df) diff --git a/python/tests/test_random_forest.py b/python/tests/test_random_forest.py index edfcc614..1f984809 100644 --- a/python/tests/test_random_forest.py +++ b/python/tests/test_random_forest.py @@ -21,6 +21,7 @@ import pytest from _pytest.logging import LogCaptureFixture from cuml import accuracy_score +from pyspark.errors import IllegalArgumentException from pyspark.ml.classification import ( RandomForestClassificationModel as SparkRFClassificationModel, ) @@ -904,3 +905,25 @@ def test_crossvalidator_random_forest( spark_cv_model = spark_cv.fit(df) assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxDepth given invalid value -1" + ): + RandomForestClassifier(maxDepth=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="maxBins given invalid value -1" + ): + RandomForestRegressor().setMaxBins(-1).fit(df)
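
Taken together, the parameter validation introduced in PATCH 4/4 surfaces to users as eager errors at `fit()` time: Spark-visible parameters are checked by transferring them to the PySpark companion class on the JVM, while parameters that are value-mapped to cuML (such as `regParam` -> `C`) are checked by `_param_value_mapping` and reported under both names. Below is a minimal sketch that mirrors the new `test_parameters_validation` tests; it assumes pyspark >= 3.4 (for `pyspark.errors`) and a spark-rapids-ml build containing these patches, and the two-row DataFrame is purely illustrative.

```python
# Minimal sketch mirroring the new test_parameters_validation tests.
# Assumes pyspark >= 3.4 and a spark-rapids-ml build containing these patches;
# the tiny DataFrame below is illustrative only.
from pyspark.sql import SparkSession
from pyspark.errors import IllegalArgumentException

from spark_rapids_ml.classification import LogisticRegression
from spark_rapids_ml.clustering import KMeans

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [([1.0, 2.0], 1.0), ([3.0, 1.0], 0.0)],
    schema="features array<float>, label float",
)

# A Spark-side parameter: the invalid value is caught when the params are
# round-tripped through the JVM companion class before training starts.
try:
    KMeans(k=-1).fit(df)
except IllegalArgumentException as e:
    print(e)  # message mentions: k given invalid value -1

# A cuML-mapped parameter (regParam -> C): rejected by _param_value_mapping,
# with both parameter names in the error message.
try:
    LogisticRegression().setRegParam(-1.0).fit(df)
except ValueError as e:
    print(e)  # message mentions: C or regParam given invalid value -1.0

spark.stop()
```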