diff --git a/build.sbt b/build.sbt index 198201954c..f19e9c49ff 100644 --- a/build.sbt +++ b/build.sbt @@ -7,10 +7,10 @@ import scala.xml.transform.{RewriteRule, RuleTransformer} import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} val condaEnvName = "synapseml" -val sparkVersion = "3.4.1" +val sparkVersion = "3.3.3" name := "synapseml" ThisBuild / organization := "com.microsoft.azure" -ThisBuild / scalaVersion := "2.12.17" +ThisBuild / scalaVersion := "2.12.15" val scalaMajorVersion = 2.12 @@ -34,7 +34,7 @@ val extraDependencies = Seq( "com.jcraft" % "jsch" % "0.1.54", "org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3", "org.apache.httpcomponents" % "httpmime" % "4.5.13", - "com.linkedin.isolation-forest" %% "isolation-forest_3.4.1" % "3.0.3" + "com.linkedin.isolation-forest" %% "isolation-forest_3.3.3" % "3.0.3" exclude("com.google.protobuf", "protobuf-java") exclude("org.apache.spark", "spark-mllib_2.12") exclude("org.apache.spark", "spark-core_2.12") exclude("org.apache.spark", "spark-avro_2.12") exclude("org.apache.spark", "spark-sql_2.12"), diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala index 425d7314f6..6c50f43ea7 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala @@ -70,7 +70,7 @@ object PyCodegen { // There's `Already borrowed` error found in transformers 4.16.2 when using tokenizers s"""extras_require={"extras": [ | "cmake", - | "horovod==0.28.1", + | "horovod==0.27.0", | "pytorch_lightning>=1.5.0,<1.5.10", | "torch==1.13.1", | "torchvision>=0.14.1", diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/PackageUtils.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/PackageUtils.scala index 6041b9b307..b2642dd4a2 100644 --- 
a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/PackageUtils.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/PackageUtils.scala @@ -18,7 +18,7 @@ object PackageUtils { val PackageName = s"synapseml_$ScalaVersionSuffix" val PackageMavenCoordinate = s"$PackageGroup:$PackageName:${BuildInfo.version}" - private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.4.1" + private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.3.3" val PackageRepository: String = SparkMLRepository // If testing onnx package with snapshots repo, make sure to switch to using diff --git a/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala index 68169552f7..90c9814cab 100644 --- a/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala +++ b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala @@ -199,20 +199,17 @@ object SparkHelpers { def flatten(ratings: Dataset[_], num: Int, dstOutputColumn: String, srcOutputColumn: String): DataFrame = { import ratings.sparkSession.implicits._ - import org.apache.spark.sql.functions.{collect_top_k, struct} + + val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2)) + val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn) + .toDF("id", "recommendations") val arrayType = ArrayType( new StructType() .add(dstOutputColumn, IntegerType) .add(Constants.RatingCol, FloatType) ) - - ratings.toDF(srcOutputColumn, dstOutputColumn, Constants.RatingCol).groupBy(srcOutputColumn) - .agg(collect_top_k(struct(Constants.RatingCol, dstOutputColumn), num, false)) - .as[(Int, Seq[(Float, Int)])] - .map(t => (t._1, t._2.map(p => (p._2, p._1)))) - .toDF(srcOutputColumn, Constants.Recommendations) - .withColumn(Constants.Recommendations, col(Constants.Recommendations).cast(arrayType)) + 
recs.select(col("id").as(srcOutputColumn), col("recommendations").cast(arrayType)) } } diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala index 2a86894bc2..be47791e69 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala @@ -101,7 +101,7 @@ object RTestGen { | "spark.sql.shuffle.partitions=10", | "spark.sql.crossJoin.enabled=true") | - |sc <- spark_connect(master = "local", version = "3.4.1", config = conf) + |sc <- spark_connect(master = "local", version = "3.3.3", config = conf) | |""".stripMargin, StandardOpenOption.CREATE) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala index 8936116365..8f9ddf32ae 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala @@ -30,11 +30,11 @@ object DatabricksUtilities { // ADB Info val Region = "eastus" - val PoolName = "synapseml-build-13.3" - val GpuPoolName = "synapseml-build-13.3-gpu" - val AdbRuntime = "13.3.x-scala2.12" - // https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html - val AdbGpuRuntime = "13.3.x-gpu-ml-scala2.12" + val PoolName = "synapseml-build-12.2" + val GpuPoolName = "synapseml-build-12.2-gpu" + val AdbRuntime = "12.2.x-scala2.12" + // https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/ + val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12" val NumWorkers = 5 val AutoTerminationMinutes = 15 @@ -82,9 +82,9 @@ object DatabricksUtilities { Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)), Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")), Map("pypi" 
-> Map("package" -> "torchvision==0.14.1")), - Map("pypi" -> Map("package" -> "transformers==4.32.1")), - Map("pypi" -> Map("package" -> "petastorm==0.12.0")), - Map("pypi" -> Map("package" -> "protobuf==3.20.3")) + Map("pypi" -> Map("package" -> "transformers==4.25.1")), + Map("pypi" -> Map("package" -> "petastorm==0.12.1")), + Map("pypi" -> Map("package" -> "protobuf==3.19.4")) ).toJson.compactPrint val GPUInitScripts: String = List( diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseExtension/SynapseExtensionUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseExtension/SynapseExtensionUtilities.scala index 48683cc413..8307b637eb 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseExtension/SynapseExtensionUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseExtension/SynapseExtensionUtilities.scala @@ -83,7 +83,7 @@ object SynapseExtensionUtilities { |"{ | 'Default${store}ArtifactId': '$storeId', | 'ExecutableFile': '$path', - | 'SparkVersion':'3.4', + | 'SparkVersion':'3.3', | 'SparkSettings': { | 'spark.jars.packages' : '$SparkMavenPackageList', | 'spark.jars.repositories' : '$SparkMavenRepositoryList', diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index c86b615206..1d5bf572b5 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -254,7 +254,7 @@ object SynapseUtilities { | "nodeSizeFamily": "MemoryOptimized", | "provisioningState": "Succeeded", | "sessionLevelPackagesEnabled": "true", - | "sparkVersion": "3.4" + | "sparkVersion": "3.3" | } |} |""".stripMargin diff --git a/deep-learning/src/main/python/horovod_installation.sh b/deep-learning/src/main/python/horovod_installation.sh index 
22124422ff..8bd5f19c02 100644 --- a/deep-learning/src/main/python/horovod_installation.sh +++ b/deep-learning/src/main/python/horovod_installation.sh @@ -8,9 +8,9 @@ set -eu # Install prerequisite libraries that horovod depends on pip install pytorch-lightning==1.5.0 pip install torchvision==0.14.1 -pip install transformers==4.32.1 +pip install transformers==4.25.1 pip install petastorm>=0.12.0 -pip install protobuf==3.20.3 +pip install protobuf==3.19.1 # Remove Outdated Signing Key: sudo apt-key del 7fa2af80 @@ -35,8 +35,8 @@ libcusparse-dev-11-0=11.1.1.245-1 git clone --recursive https://github.com/horovod/horovod.git cd horovod -# git fetch origin refs/tags/v0.28.1:tags/v0.28.1 -git checkout 1d217b59949986d025f6db93c49943fb6b6cc78f +# git fetch origin refs/tags/v0.27.0:tags/v0.27.0 +git checkout bfaca90d5cf66780a97d8799d4e1573855b64560 git checkout -b tmp-branch rm -rf build/ dist/ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \ @@ -44,4 +44,4 @@ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PY readlink -f dist/horovod-*.whl -pip install --no-cache-dir dist/horovod-0.28.1-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps +pip install --no-cache-dir dist/horovod-0.27.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps diff --git a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py index 8e001f3be6..fcd6ff59cb 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py +++ b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py @@ -11,12 +11,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1" - if _TRANSFORMERS_EQUAL_4_32_1: + _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1" + if _TRANSFORMERS_EQUAL_4_25_1: from transformers import AutoTokenizer else: raise 
RuntimeError( - "transformers should be == 4.32.1, found: {}".format( + "transformers should be == 4.25.1, found: {}".format( transformers.__version__ ) ) diff --git a/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py b/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py index f8b624e6c7..4723da1016 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py +++ b/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py @@ -19,10 +19,10 @@ if _HOROVOD_AVAILABLE: import horovod - _HOROVOD_EQUAL_0_28_1 = horovod.__version__ == "0.28.1" - if not _HOROVOD_EQUAL_0_28_1: + _HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0" + if not _HOROVOD_EQUAL_0_27_0: raise RuntimeError( - "horovod should be of version 0.28.1, found: {}".format(horovod.__version__) + "horovod should be of version 0.27.0, found: {}".format(horovod.__version__) ) else: raise ModuleNotFoundError("module not found: horovod") diff --git a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py index 134bc5f135..b17b9f5f18 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py +++ b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py @@ -13,12 +13,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1" - if _TRANSFORMERS_EQUAL_4_32_1: + _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1" + if _TRANSFORMERS_EQUAL_4_25_1: from transformers import AutoModelForSequenceClassification else: raise RuntimeError( - "transformers should be == 4.32.1, found: {}".format( + "transformers should be == 4.25.1, found: {}".format( transformers.__version__ ) ) diff --git a/environment.yml b/environment.yml index 257c657630..d901e32a31 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: - r-devtools=2.4.2 - pip: - pyarrow>=0.15.0 - - 
pyspark==3.4.1 + - pyspark==3.3.3 - pandas==1.2.5 - wheel - sphinx==4.2.0 @@ -34,13 +34,13 @@ dependencies: - numpy - torch==1.13.1 - torchvision==0.14.1 - - horovod==0.28.1 + - horovod==0.27.0 - petastorm>=0.11.0 - pytorch_lightning==1.5.0 - onnxmltools==1.7.0 - matplotlib - Pillow - - transformers==4.32.1 + - transformers==4.25.1 - huggingface-hub>=0.8.1 - langchain==0.0.151 - openai==0.27.5 diff --git a/pipeline.yaml b/pipeline.yaml index 071a5fa7dc..68b54efd83 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -527,7 +527,7 @@ jobs: fi sbt publishM2 - SPARK_VERSION=3.4.1 + SPARK_VERSION=3.3.3 HADOOP_VERSION=3 wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz (timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) diff --git a/start b/start index 75219e9459..7e2595dda0 100644 --- a/start +++ b/start @@ -1,10 +1,9 @@ #!/bin/bash export OPENMPI_VERSION="3.1.2" - -export SPARK_VERSION="3.4.1" +export SPARK_VERSION="3.3.3" export HADOOP_VERSION="3.3" -export SYNAPSEML_VERSION="1.0.2" # Binder compatibility version +export SYNAPSEML_VERSION="0.11.4" # Binder compatibility version echo "Beginning Spark Session..." 
exec "$@" diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile index 7def1662d8..2a0c1268b3 100644 --- a/tools/docker/demo/Dockerfile +++ b/tools/docker/demo/Dockerfile @@ -1,9 +1,9 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04 -ARG SYNAPSEML_VERSION=1.0.2 +ARG SYNAPSEML_VERSION=0.11.4 ARG DEBIAN_FRONTEND=noninteractive -ENV SPARK_VERSION=3.4.1 +ENV SPARK_VERSION=3.3.3 ENV HADOOP_VERSION=3 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION} ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64 diff --git a/tools/docker/minimal/Dockerfile b/tools/docker/minimal/Dockerfile index 9b6b3522e8..0497926dc4 100644 --- a/tools/docker/minimal/Dockerfile +++ b/tools/docker/minimal/Dockerfile @@ -1,9 +1,9 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04 -ARG SYNAPSEML_VERSION=1.0.2 +ARG SYNAPSEML_VERSION=0.11.4 ARG DEBIAN_FRONTEND=noninteractive -ENV SPARK_VERSION=3.4.1 +ENV SPARK_VERSION=3.3.3 ENV HADOOP_VERSION=3 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION} ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64 diff --git a/tools/dotnet/dotnetSetup.sh b/tools/dotnet/dotnetSetup.sh index c378cfa524..f8299ddace 100644 --- a/tools/dotnet/dotnetSetup.sh +++ b/tools/dotnet/dotnetSetup.sh @@ -20,11 +20,11 @@ echo "##vso[task.setvariable variable=DOTNET_WORKER_DIR]$DOTNET_WORKER_DIR" # Install Sleet dotnet tool install -g sleet -# Install Apache Spark-3.4.1 -curl https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz -o spark-3.4.1-bin-hadoop3.tgz +# Install Apache Spark-3.3.3 +curl https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz -o spark-3.3.3-bin-hadoop3.tgz mkdir ~/bin -tar -xzvf spark-3.4.1-bin-hadoop3.tgz -C ~/bin -export SPARK_HOME=~/bin/spark-3.4.1-bin-hadoop3/ +tar -xzvf spark-3.3.3-bin-hadoop3.tgz -C ~/bin +export SPARK_HOME=~/bin/spark-3.3.3-bin-hadoop3/ export PATH=$SPARK_HOME/bin:$PATH echo "##vso[task.setvariable variable=SPARK_HOME]$SPARK_HOME" echo 
"##vso[task.setvariable variable=PATH]$SPARK_HOME/bin:$PATH" diff --git a/tools/tests/run_r_tests.R b/tools/tests/run_r_tests.R index a5a61260f2..e9e684bbf1 100644 --- a/tools/tests/run_r_tests.R +++ b/tools/tests/run_r_tests.R @@ -3,7 +3,7 @@ if (!require("sparklyr")) { library("sparklyr") } -spark_install_tar(paste(getwd(), "/../../../../../../spark-3.4.1-bin-hadoop3.tgz", sep = "")) +spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.3-bin-hadoop3.tgz", sep = "")) options("testthat.output_file" = "../../../../r-test-results.xml") devtools::test(reporter = JunitReporter$new())