From 415e37b13b12bcec36cdb31d5d4b9b8d31a26694 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 24 Jan 2023 19:18:28 +0100 Subject: [PATCH 1/4] Add multiplicity parameter for artificial increase of data --- build.sbt | 2 +- .../scala/com/dataintuitive/luciusapi/Common.scala | 5 +++++ .../scala/com/dataintuitive/luciusapi/initialize.scala | 10 +++++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index fae3e38..5351c19 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "LuciusAPI" import aether.AetherKeys._ -ThisBuild / version := "5.1.3" +ThisBuild / version := "5.1.4-alpha" scalaVersion := "2.11.12" diff --git a/src/main/scala/com/dataintuitive/luciusapi/Common.scala b/src/main/scala/com/dataintuitive/luciusapi/Common.scala index dab79e4..7324b25 100644 --- a/src/main/scala/com/dataintuitive/luciusapi/Common.scala +++ b/src/main/scala/com/dataintuitive/luciusapi/Common.scala @@ -312,6 +312,11 @@ object Common extends Serializable { .getOrElse(Map.empty) } + def paramMultiplicity(config: Config):Int = { + Try(config.getString("multiplicity").toInt) + .getOrElse(1) + } + } } diff --git a/src/main/scala/com/dataintuitive/luciusapi/initialize.scala b/src/main/scala/com/dataintuitive/luciusapi/initialize.scala index b8ea0ed..174b92c 100644 --- a/src/main/scala/com/dataintuitive/luciusapi/initialize.scala +++ b/src/main/scala/com/dataintuitive/luciusapi/initialize.scala @@ -36,7 +36,8 @@ object initialize extends SparkSessionJob with NamedObjectSupport { dbVersion: String, partitions: Int, storageLevel: StorageLevel, - geneDataTypes: Map[String, String]) + geneDataTypes: Map[String, String], + multiplicity: Int) type JobOutput = collection.Map[String, Any] override def validate(sparkSession: SparkSession, @@ -49,8 +50,9 @@ object initialize extends SparkSessionJob with NamedObjectSupport { val partitions = paramPartitions(config) val storageLevel = paramStorageLevel(config) val geneDataTypes = 
paramGeneDataTypes(config) + val multiplicity = paramMultiplicity(config) - withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel, geneDataTypes) } + withGood(db, genes) { JobData(_, _, dbVersion, partitions, storageLevel, geneDataTypes, multiplicity) } } @@ -111,7 +113,9 @@ object initialize extends SparkSessionJob with NamedObjectSupport { case (parquet, _) => parquet.as[Perturbation] } } - val db = dbRaws.reduce(_ union _).repartition(data.partitions) + val db_single = dbRaws.reduce(_ union _) + + val db = (1 to data.multiplicity).map{ i => db_single}.reduce(_ union _).repartition(data.partitions) val dbNamedDataset = NamedDataSet[Perturbation](db, forceComputation = true, storageLevel = data.storageLevel) From d7f803fd0188a92e7970601031f3a6e5a3a5b815 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 24 Jan 2023 19:25:30 +0100 Subject: [PATCH 2/4] Add comments --- src/main/scala/com/dataintuitive/luciusapi/Common.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/scala/com/dataintuitive/luciusapi/Common.scala b/src/main/scala/com/dataintuitive/luciusapi/Common.scala index 7324b25..4b68273 100644 --- a/src/main/scala/com/dataintuitive/luciusapi/Common.scala +++ b/src/main/scala/com/dataintuitive/luciusapi/Common.scala @@ -312,6 +312,11 @@ object Common extends Serializable { .getOrElse(Map.empty) } + /** + * In order to make sure Lucius handles heavy load and slow response times, + * it's important to dev/test with larger datasets than a small dev dataset allows. + * This parameter multiplies input data this many times so the apparent dataset is larger. 
+ */ def paramMultiplicity(config: Config):Int = { Try(config.getString("multiplicity").toInt) .getOrElse(1) From 625ed908185ce1b9d95a8b1a5c36ab805a532ad5 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 24 Jan 2023 19:25:40 +0100 Subject: [PATCH 3/4] Remove println's --- src/main/scala/com/dataintuitive/luciusapi/initialize.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/scala/com/dataintuitive/luciusapi/initialize.scala b/src/main/scala/com/dataintuitive/luciusapi/initialize.scala index 174b92c..15c3db4 100644 --- a/src/main/scala/com/dataintuitive/luciusapi/initialize.scala +++ b/src/main/scala/com/dataintuitive/luciusapi/initialize.scala @@ -96,10 +96,6 @@ object initialize extends SparkSessionJob with NamedObjectSupport { val thisVersion = state.state.filter(_.version.major.toString == majorVersion) - println(outputs) - println(state) - println(thisVersion) - val parquets = thisVersion.map(_.obj.toString).map( sparkSession.read .schema(Encoders.product[Perturbation].schema) // This assists parquet file reading so that it is more independent of our current Perturbation format. 
From 3d4b53249d2e1336192906efaa64906e21b224d3 Mon Sep 17 00:00:00 2001 From: Hendrik Cannoodt Date: Fri, 10 Feb 2023 09:57:15 +0100 Subject: [PATCH 4/4] Bump LuciusCore dependency to 4.1.2 LuciusCore 4.1.2 improves the performance of treatmentToPerturbation --- README.md | 1 + build.sbt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 08a80c6..5bbab81 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ There's still a lot of work to be done on this (version numbers don't reflect ev | 5.1.1 | 4.1.1 | 0.11.1 | 2.4.7 | | 5.1.2 | 4.1.1 | 0.11.1 | 2.4.7 | | 5.1.3 | 4.1.1 | 0.11.1 | 2.4.7 | +| 5.1.4 | 4.1.2 | 0.11.1 | 2.4.7 | # API Documentation diff --git a/build.sbt b/build.sbt index fae3e38..5feaee2 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "LuciusAPI" import aether.AetherKeys._ -ThisBuild / version := "5.1.3" +ThisBuild / version := "5.1.4" scalaVersion := "2.11.12" @@ -10,7 +10,7 @@ resolvers += Resolver.githubPackages("data-intuitive") resolvers += "Artifactory" at "https://sparkjobserver.jfrog.io/artifactory/jobserver/" libraryDependencies ++= Seq( - "com.data-intuitive" %% "luciuscore" % "4.1.1", + "com.data-intuitive" %% "luciuscore" % "4.1.2", "spark.jobserver" %% "job-server-api" % "0.11.1" % "provided", "spark.jobserver" %% "job-server-extras" % "0.11.1" % "provided", "org.scalactic" %% "scalactic" % "3.0.7" % "test" ,