From 2361af9740782f4057f4ed3abfde382a555f45b6 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Wed, 8 Nov 2023 15:33:43 +0800
Subject: [PATCH 1/4] [Doc] add performance tuning page (#504)

Signed-off-by: Bobby Wang
---
 docs/site/performance.md | 44 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 docs/site/performance.md

diff --git a/docs/site/performance.md b/docs/site/performance.md
new file mode 100644
index 00000000..4804f8c6
--- /dev/null
+++ b/docs/site/performance.md
@@ -0,0 +1,44 @@
+---
+title: Performance Tuning
+nav_order: 6
+---
+# Performance Tuning
+
+* TOC
+{:toc}
+
+## Stage-level scheduling
+
+Starting from spark-rapids-ml `23.10.0`, stage-level scheduling is automatically enabled.
+Therefore, if you are using a Spark **standalone** cluster running version **`3.4.0`** or higher, we strongly recommend
+setting `"spark.task.resource.gpu.amount"` to a fractional value. This allows
+multiple tasks to run in parallel during the ETL phase, which improves performance. A good starting point
+is `"spark.task.resource.gpu.amount" = 1/spark.executor.cores`. For example,

``` bash
spark-submit \
  --master spark://:7077 \
  --conf spark.executor.cores=12 \
  --conf spark.task.cpus=1 \
  --conf spark.executor.resource.gpu.amount=1 \
  --conf spark.task.resource.gpu.amount=0.08 \
  ...
```

+The above spark-submit command requests 1 GPU and 12 CPUs per executor, so
+a total of 12 tasks per executor can run concurrently during the ETL phase. Stage-level scheduling
+is then used internally by the library to automatically carry out the ML training phases with the required 1 GPU per task.
+
+However, if you are using a spark-rapids-ml version earlier than 23.10.0 or a Spark
+standalone cluster version below 3.4.0, you need to make sure that only 1 task runs per executor at any time.
+To do this, set `spark.task.cpus` equal to `spark.executor.cores`, or set `"spark.task.resource.gpu.amount"=1`. For example,

``` bash
spark-submit \
  --master spark://:7077 \
  --conf spark.executor.cores=12 \
  --conf spark.task.cpus=1 \
  --conf spark.executor.resource.gpu.amount=1 \
  --conf spark.task.resource.gpu.amount=1 \
  ...
```

From f9b71093e89f1304beacf1abb59f0e933c78f0ec Mon Sep 17 00:00:00 2001
From: eordentlich
Date: Wed, 8 Nov 2023 15:45:44 -0800
Subject: [PATCH 2/4] azure db kernel driver has been updated (#516)

Signed-off-by: Erik Ordentlich
---
 notebooks/databricks/README.md             |  2 --
 notebooks/databricks/init-pip-cuda-11.8.sh | 12 ------------
 2 files changed, 14 deletions(-)

diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
index cba043b4..c1681228 100644
--- a/notebooks/databricks/README.md
+++ b/notebooks/databricks/README.md
@@ -29,8 +29,6 @@ If you already have a Databricks account, you can run the example notebooks on a
- downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL.
- installs various `cuXX` dependencies via pip.
- **Note**: as of the last update of this README, Azure Databricks requires a CUDA driver forward compatibility package. Uncomment the designated lines for this in the init script. AWS Databricks does not need this and leave the lines commented in that case.
-
- Copy the modified `init-pip-cuda-11.8.sh` init script to your *workspace* (not DBFS) (ex. workspace directory: /Users/< databricks-user-name >/init_scripts).
```bash export WS_SAVE_DIR="/path/to/directory/in/workspace" diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index 5d5c7ce4..d20e3a5c 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -13,18 +13,6 @@ curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RA wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit -# install forward compatibility package due to old driver -# uncomment below lines on Azure Databricks -# distro=ubuntu2004 -# arch=x86_64 -# apt-key del 7fa2af80 -# wget https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.0-1_all.deb -# dpkg -i cuda-keyring_1.0-1_all.deb -# apt-get update -# apt-get install -y cuda-compat-11-8 -# export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/local/cuda/lib64 -# ldconfig - # reset symlink and update library loading paths rm /usr/local/cuda ln -s /usr/local/cuda-11.8 /usr/local/cuda From 84d344ec56cab11926259525ec067bdec8e6c8ba Mon Sep 17 00:00:00 2001 From: eordentlich Date: Wed, 8 Nov 2023 15:51:20 -0800 Subject: [PATCH 3/4] update remaining 23.8/23.08 in docs and also etl plugin version in scripts (#514) Signed-off-by: Erik Ordentlich --- jvm/README.md | 6 +++--- jvm/pom.xml | 4 ++-- notebooks/databricks/README.md | 2 +- notebooks/databricks/init-pip-cuda-11.8.sh | 4 ++-- python/README.md | 6 +++--- python/benchmark/databricks/gpu_cluster_spec.sh | 2 +- python/benchmark/databricks/init-pip-cuda-11.8.sh | 4 ++-- python/run_benchmark.sh | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/jvm/README.md b/jvm/README.md index 49186b03..859a0477 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -74,7 +74,7 @@ the _project root path_ with: cd jvm mvn clean package ``` -Then `rapids-4-spark-ml_2.12-23.08.0-SNAPSHOT.jar` will be generated under `target` folder. +Then `rapids-4-spark-ml_2.12-23.10.0-SNAPSHOT.jar` will be generated under `target` folder. Users can also use the _release_ version spark-rapids plugin as the dependency if it's already been released in public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark) @@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`. 
Add the artifact jar to the Spark, for example: ```bash -ML_JAR="target/rapids-4-spark-ml_2.12-23.08.0-SNAPSHOT.jar" -PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.08.2-SNAPSHOT/rapids-4-spark_2.12-23.08.2-SNAPSHOT.jar" +ML_JAR="target/rapids-4-spark-ml_2.12-23.10.0-SNAPSHOT.jar" +PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.10.0/rapids-4-spark_2.12-23.10.0.jar" $SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \ --driver-memory 20G \ diff --git a/jvm/pom.xml b/jvm/pom.xml index f978ea39..fadf180f 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -20,7 +20,7 @@ 4.0.0 com.nvidia rapids-4-spark-ml_2.12 - 23.08.0-SNAPSHOT + 23.10.0-SNAPSHOT RAPIDS Accelerator for Apache Spark ML The RAPIDS cuML library for Apache Spark 2021 @@ -93,7 +93,7 @@ com.nvidia rapids-4-spark_2.12 - 23.08.0 + 23.10.0 diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index c1681228..9c685e4b 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -44,7 +44,7 @@ If you already have a Databricks account, you can run the example notebooks on a spark.task.resource.gpu.amount 1 spark.databricks.delta.preview.enabled true spark.python.worker.reuse true - spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.08.2.jar:/databricks/spark/python + spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.10.0.jar:/databricks/spark/python spark.sql.execution.arrow.maxRecordsPerBatch 100000 spark.rapids.memory.gpu.minAllocFraction 0.0001 spark.plugins com.nvidia.spark.SQLPlugin diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh index d20e3a5c..b7b2169e 100644 --- a/notebooks/databricks/init-pip-cuda-11.8.sh +++ b/notebooks/databricks/init-pip-cuda-11.8.sh @@ -2,10 +2,10 @@ # set portion of path below after /dbfs/ to dbfs zip file location SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 -# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) +# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) RAPIDS_VERSION=23.10.0 -SPARK_RAPIDS_VERSION=23.08.2 +SPARK_RAPIDS_VERSION=23.10.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar diff --git a/python/README.md b/python/README.md index c77d6134..dac24b9d 100644 --- a/python/README.md +++ b/python/README.md @@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 11.8: ```bash -conda create -n rapids-23.08 \ +conda create -n rapids-23.10 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=23.08 python=3.9 cuda-version=11.8 + cuml=23.10 python=3.9 cuda-version=11.8 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -19,7 +19,7 @@ conda create -n rapids-23.08 \ Once you have the conda environment, activate it and install the required packages. 
```bash -conda activate rapids-23.08 +conda activate rapids-23.10 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh index ee1fa7a6..f8bbc47f 100644 --- a/python/benchmark/databricks/gpu_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_cluster_spec.sh @@ -9,7 +9,7 @@ cat < Date: Wed, 8 Nov 2023 17:02:43 -0800 Subject: [PATCH 4/4] updated version of PR 440 to add spark parameter validation (#515) * Validate parameters by converting them to spark java companion objects Signed-off-by: Bobby Wang * add validate_parameters * fix test failure * add kmeans * fix test issue * address comments, rebase on latest commit Signed-off-by: Erik Ordentlich * fix linear model test and better error message Signed-off-by: Erik Ordentlich * formatting Signed-off-by: Erik Ordentlich * increase cv tolerance for intermittent failure Signed-off-by: Erik Ordentlich * fix tolerance arg Signed-off-by: Erik Ordentlich --------- Signed-off-by: Bobby Wang Signed-off-by: Erik Ordentlich Co-authored-by: Bobby Wang --- python/src/spark_rapids_ml/classification.py | 10 ++++- python/src/spark_rapids_ml/clustering.py | 5 +++ python/src/spark_rapids_ml/core.py | 39 +++++++++++++++++++- python/src/spark_rapids_ml/feature.py | 5 +++ python/src/spark_rapids_ml/knn.py | 4 ++ python/src/spark_rapids_ml/params.py | 11 +++++- python/src/spark_rapids_ml/regression.py | 8 ++++ python/src/spark_rapids_ml/umap.py | 4 ++ python/tests/test_common_estimator.py | 4 ++ python/tests/test_kmeans.py | 21 +++++++++++ python/tests/test_linear_model.py | 30 +++++++++++++-- python/tests/test_logistic_regression.py | 27 +++++++++++++- python/tests/test_pca.py | 19 ++++++++++ python/tests/test_random_forest.py | 23 ++++++++++++ 14 files changed, 202 insertions(+), 8 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 2aab2bce..878d6256 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -27,6 +28,7 @@ cast, ) +import pyspark from pyspark.ml.common import _py2java from pyspark.ml.evaluation import Evaluator, MulticlassClassificationEvaluator @@ -298,6 +300,9 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: mapping = super()._param_mapping() return mapping + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.classification.RandomForestClassifier + class RandomForestClassifier( _RandomForestClassifierClass, @@ -664,7 +669,7 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: def _param_value_mapping( cls, ) -> Dict[str, Callable[[Any], Union[None, str, float, int]]]: - return {"C": lambda x: 1 / x if x != 0.0 else 0.0} + return {"C": lambda x: 1 / x if x > 0.0 else (0.0 if x == 0.0 else None)} def _get_cuml_params_default(self) -> Dict[str, Any]: return { @@ -704,6 +709,9 @@ def _reg_params_value_mapping( return (penalty, C, l1_ratio) + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.classification.LogisticRegression + class _LogisticRegressionCumlParams( _CumlParams, diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 868f0130..c4a30e4f 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -14,10 +14,12 @@ # limitations under the License. # +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast import numpy as np import pandas as pd +import pyspark from pyspark import keyword_only from pyspark.ml.clustering import KMeansModel as SparkKMeansModel from pyspark.ml.clustering import _KMeansParams @@ -92,6 +94,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "max_samples_per_batch": 32768, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.clustering.KMeans + class _KMeansCumlParams(_CumlParams, _KMeansParams, HasFeaturesCols): """ diff --git a/python/src/spark_rapids_ml/core.py b/python/src/spark_rapids_ml/core.py index 7751144a..7c242358 100644 --- a/python/src/spark_rapids_ml/core.py +++ b/python/src/spark_rapids_ml/core.py @@ -16,7 +16,7 @@ import json import os import threading -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from collections import namedtuple from typing import ( TYPE_CHECKING, @@ -56,6 +56,7 @@ MLWritable, MLWriter, ) +from pyspark.ml.wrapper import JavaParams from pyspark.sql import Column, DataFrame from pyspark.sql.functions import col, struct from pyspark.sql.pandas.functions import pandas_udf @@ -301,6 +302,16 @@ def _initialize_cuml_logging(verbose: Optional[Union[bool, int]]) -> None: cuml_logger.set_level(log_level) + def _pyspark_class(self) -> Optional[ABCMeta]: + """ + Subclass should override to return corresponding pyspark.ml class + Ex. logistic regression should return pyspark.ml.classification.LogisticRegression + Return None if no corresponding class in pyspark, e.g. knn + """ + raise NotImplementedError( + "pyspark.ml class corresponding to estimator not specified." + ) + class _CumlCaller(_CumlParams, _CumlCommon): """ @@ -419,6 +430,31 @@ def _require_nccl_ucx(self) -> Tuple[bool, bool]: """ return (True, False) + def _validate_parameters(self) -> None: + cls_name = self._pyspark_class() + + if cls_name is not None: + pyspark_est = cls_name() + # Both pyspark and cuml may have a parameter with the same name, + # but cuml might have additional optional values that can be set. 
+ # If we transfer these cuml-specific values to the Spark JVM, + # it would result in an exception. + # To avoid this issue, we skip transferring these parameters + # since the mapped parameters have been validated in _get_cuml_mapping_value. + cuml_est = self.copy() + cuml_params = cuml_est._param_value_mapping().keys() + param_mapping = cuml_est._param_mapping() + pyspark_params = [k for k, v in param_mapping.items() if v in cuml_params] + for p in pyspark_params: + cuml_est.clear(cuml_est.getParam(p)) + + cuml_est._copyValues(pyspark_est) + # validate the parameters + pyspark_est._transfer_params_to_java() + + del pyspark_est + del cuml_est + @abstractmethod def _get_cuml_fit_func( self, @@ -468,6 +504,7 @@ def _call_cuml_fit_func( :class:`Transformer` fitted model """ + self._validate_parameters() cls = self.__class__ diff --git a/python/src/spark_rapids_ml/feature.py b/python/src/spark_rapids_ml/feature.py index 0a5a93e8..6afcc8f0 100644 --- a/python/src/spark_rapids_ml/feature.py +++ b/python/src/spark_rapids_ml/feature.py @@ -15,10 +15,12 @@ # import itertools +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd +import pyspark from pyspark import keyword_only from pyspark.ml.common import _py2java from pyspark.ml.feature import PCAModel as SparkPCAModel @@ -69,6 +71,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "whiten": False, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.feature.PCA + class _PCACumlParams(_CumlParams, _PCAParams, HasInputCols): """ diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index a54859dc..a55d667b 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -15,6 +15,7 @@ # import asyncio +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy as np @@ -68,6 +69,9 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: def _get_cuml_params_default(self) -> Dict[str, Any]: return {"n_neighbors": 5, "verbose": False, "batch_size": 2000000} + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class _NearestNeighborsCumlParams(_CumlParams, HasInputCol, HasLabelCol, HasInputCols): """ diff --git a/python/src/spark_rapids_ml/params.py b/python/src/spark_rapids_ml/params.py index 45e03a7f..1f4835fd 100644 --- a/python/src/spark_rapids_ml/params.py +++ b/python/src/spark_rapids_ml/params.py @@ -429,7 +429,16 @@ def _set_cuml_param( if cuml_param is not None: # if Spark Param is mapped to cuML parameter, set cuml_params - self._set_cuml_value(cuml_param, spark_value) + try: + self._set_cuml_value(cuml_param, spark_value) + except ValueError: + # create more informative message + param_ref_str = ( + cuml_param + " or " + spark_param + if cuml_param != spark_param + else spark_param + ) + raise ValueError(f"{param_ref_str} given invalid value {spark_value}") def _get_cuml_mapping_value(self, k: str, v: Any) -> Any: value_map = self._param_value_mapping() diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 6998a4fc..d0b393e1 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -28,6 +29,7 @@ import numpy as np import pandas as pd +import pyspark from pyspark import Row, TaskContext, keyword_only from pyspark.ml.common import _py2java from pyspark.ml.evaluation import Evaluator, RegressionEvaluator @@ -218,6 +220,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "shuffle": True, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.regression.LinearRegression + class _LinearRegressionCumlParams( _CumlParams, _LinearRegressionParams, HasFeaturesCols @@ -783,6 +788,9 @@ def _param_value_mapping( ) return mapping + def _pyspark_class(self) -> Optional[ABCMeta]: + return pyspark.ml.regression.RandomForestRegressor + class RandomForestRegressor( _RandomForestRegressorClass, diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 062e8e75..46430b60 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -16,6 +16,7 @@ import json import os +from abc import ABCMeta from typing import ( TYPE_CHECKING, Any, @@ -113,6 +114,9 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "verbose": False, } + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class _UMAPCumlParams( _CumlParams, HasFeaturesCol, HasFeaturesCols, HasLabelCol, HasOutputCol diff --git a/python/tests/test_common_estimator.py b/python/tests/test_common_estimator.py index 2dc0108c..99197a3a 100644 --- a/python/tests/test_common_estimator.py +++ b/python/tests/test_common_estimator.py @@ -14,6 +14,7 @@ # limitations under the License. # +from abc import ABCMeta from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -234,6 +235,9 @@ def _create_pyspark_model(self, result: Row) -> "SparkRapidsMLDummyModel": assert result.model_attribute_b == "hello dummy" return SparkRapidsMLDummyModel._from_row(result) + def _pyspark_class(self) -> Optional[ABCMeta]: + return None + class SparkRapidsMLDummyModel( SparkRapidsMLDummyClass, diff --git a/python/tests/test_kmeans.py b/python/tests/test_kmeans.py index a84ad035..17d217f7 100644 --- a/python/tests/test_kmeans.py +++ b/python/tests/test_kmeans.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException from pyspark.ml.clustering import KMeans as SparkKMeans from pyspark.ml.clustering import KMeansModel as SparkKMeansModel from pyspark.ml.functions import array_to_vector @@ -407,3 +408,23 @@ def test_kmeans_spark_compat( assert model.transform(df).take(1) == model2.transform(df).take(1) # True + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + KMeans(k=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + KMeans().setMaxIter(-1).fit(df) diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index a50d44bb..b8db60de 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException 
from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.feature import VectorAssembler from pyspark.ml.functions import array_to_vector @@ -166,9 +167,7 @@ def test_linear_regression_params( # Unsupported value spark_params = {"solver": "l-bfgs"} - with pytest.raises( - ValueError, match="Value 'l-bfgs' for 'solver' param is unsupported" - ): + with pytest.raises(ValueError, match="solver given invalid value l-bfgs"): unsupported_lr = LinearRegression(**spark_params) # make sure no warning when enabling float64 inputs @@ -647,3 +646,28 @@ def test_crossvalidator_linear_regression( spark_cv_model = spark_cv.fit(df) assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + LinearRegression(maxIter=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="regParam given invalid value -1.0" + ): + LinearRegression().setRegParam(-1.0).fit(df) + + # shouldn't throw an exception for setting cuml values + LinearRegression(loss="squared_loss")._validate_parameters() diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index 52c42e1d..3b7a95bd 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -5,6 +5,7 @@ import pytest from _pytest.logging import LogCaptureFixture from packaging import version +from pyspark.errors import IllegalArgumentException from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression from pyspark.ml.classification import ( LogisticRegressionModel as SparkLogisticRegressionModel, @@ -657,7 +658,7 @@ def test_compat_multinomial( regParam=0.1, elasticNetParam=0.2, fitIntercept=fit_intercept, - family="multimonial", + family="multinomial", ) assert mlor.getRegParam() == 0.1 @@ -1046,4 +1047,26 @@ def test_crossvalidator_logistic_regression( ) spark_cv_model = spark_cv.fit(df) - assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics, 0.0005) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxIter given invalid value -1" + ): + LogisticRegression(maxIter=-1).fit(df) + + # regParam is mapped to different value in LogisticRegression which should be in + # charge of validating it. 
+ with pytest.raises(ValueError, match="C or regParam given invalid value -1.0"): + LogisticRegression().setRegParam(-1.0).fit(df) diff --git a/python/tests/test_pca.py b/python/tests/test_pca.py index eab7592a..4925e4c6 100644 --- a/python/tests/test_pca.py +++ b/python/tests/test_pca.py @@ -19,6 +19,7 @@ import numpy as np import pytest from _pytest.logging import LogCaptureFixture +from pyspark.errors import IllegalArgumentException from pyspark.ml.feature import PCA as SparkPCA from pyspark.ml.feature import PCAModel as SparkPCAModel from pyspark.ml.functions import array_to_vector @@ -410,3 +411,21 @@ def test_pca_spark_compat( assert loadedModel.pc == model.pc assert loadedModel.explainedVariance == model.explainedVariance assert loadedModel.transform(df).take(1) == model.transform(df).take(1) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + PCA(k=-1).fit(df) + + with pytest.raises(IllegalArgumentException, match="k given invalid value -1"): + PCA().setK(-1).fit(df) diff --git a/python/tests/test_random_forest.py b/python/tests/test_random_forest.py index edfcc614..1f984809 100644 --- a/python/tests/test_random_forest.py +++ b/python/tests/test_random_forest.py @@ -21,6 +21,7 @@ import pytest from _pytest.logging import LogCaptureFixture from cuml import accuracy_score +from pyspark.errors import IllegalArgumentException from pyspark.ml.classification import ( RandomForestClassificationModel as SparkRFClassificationModel, ) @@ -904,3 +905,25 @@ def test_crossvalidator_random_forest( spark_cv_model = spark_cv.fit(df) assert array_equal(model.avgMetrics, spark_cv_model.avgMetrics) + + +def test_parameters_validation() -> None: + data = [ + ([1.0, 2.0], 1.0), + ([3.0, 1.0], 0.0), + ] + + with CleanSparkSession() as spark: + features_col = "features" + label_col = "label" + schema = features_col + " array, " + label_col + " float" + df = spark.createDataFrame(data, schema=schema) + with pytest.raises( + IllegalArgumentException, match="maxDepth given invalid value -1" + ): + RandomForestClassifier(maxDepth=-1).fit(df) + + with pytest.raises( + IllegalArgumentException, match="maxBins given invalid value -1" + ): + RandomForestRegressor().setMaxBins(-1).fit(df)
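
Taken together, the parameter validation introduced in PATCH 4/4 surfaces to users as eager errors at `fit()` time: Spark-visible parameters are checked by transferring them to the PySpark companion class on the JVM, while parameters that are value-mapped to cuML (such as `regParam` -> `C`) are checked by `_param_value_mapping` and reported under both names. Below is a minimal sketch that mirrors the new `test_parameters_validation` tests; it assumes pyspark >= 3.4 (for `pyspark.errors`) and a spark-rapids-ml build containing these patches, and the two-row DataFrame is purely illustrative.

```python
# Minimal sketch mirroring the new test_parameters_validation tests.
# Assumes pyspark >= 3.4 and a spark-rapids-ml build containing these patches;
# the tiny DataFrame below is illustrative only.
from pyspark.sql import SparkSession
from pyspark.errors import IllegalArgumentException

from spark_rapids_ml.classification import LogisticRegression
from spark_rapids_ml.clustering import KMeans

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [([1.0, 2.0], 1.0), ([3.0, 1.0], 0.0)],
    schema="features array<float>, label float",
)

# A Spark-side parameter: the invalid value is caught when the params are
# round-tripped through the JVM companion class before training starts.
try:
    KMeans(k=-1).fit(df)
except IllegalArgumentException as e:
    print(e)  # message mentions: k given invalid value -1

# A cuML-mapped parameter (regParam -> C): rejected by _param_value_mapping,
# with both parameter names in the error message.
try:
    LogisticRegression().setRegParam(-1.0).fit(df)
except ValueError as e:
    print(e)  # message mentions: C or regParam given invalid value -1.0

spark.stop()
```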