From bb7f35076f48a89e9ba4fbda2d6074b97b5bbb78 Mon Sep 17 00:00:00 2001 From: rdsharma26 <65777064+rdsharma26@users.noreply.github.com> Date: Wed, 1 Mar 2023 11:55:15 -0500 Subject: [PATCH] Fix style issues causing mvn install to fail. (#453) - Style issues were caused by spacing problems and by the presence of the Unicode chi character. - The workflow step was also updated to include the execution of the style check. Previously, the style check was not being executed, which led to the style errors being committed to master. --- .github/workflows/maven.yml | 2 +- .../com/amazon/deequ/analyzers/Distance.scala | 195 ++++++++++-------- .../amazon/deequ/KLL/KLLDistanceTest.scala | 96 +++++---- 3 files changed, 162 insertions(+), 131 deletions(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 88b1da8f5..bffe71cf8 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -20,5 +20,5 @@ jobs: distribution: 'corretto' cache: maven - name: Build with Maven - run: mvn clean test + run: mvn clean verify diff --git a/src/main/scala/com/amazon/deequ/analyzers/Distance.scala b/src/main/scala/com/amazon/deequ/analyzers/Distance.scala index d1d9b742c..17847d040 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Distance.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Distance.scala @@ -15,48 +15,46 @@ */ package com.amazon.deequ.analyzers -import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.mllib.stat.Statistics._ import org.apache.spark.mllib.stat.test.ChiSqTestResult - - - +import scala.annotation.tailrec object Distance { - // Chi-square constants // at least two distinct categories are required to run the chi-square test for a categorical variable private val chisquareMinDimension: Int = 2 - //for tables larger than 2 x 2: "No more than 20% of the expected counts are 
less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734) + // for tables larger than 2 x 2: + // "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" + // - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734 private val defaultAbsThresholdYates: Integer = 5 private val defaultPercThresholdYates: Double = 0.2 - // for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.) + // for 2x2 tables: + // all expected counts should be 10 or greater + // - Cochran, William G. "The (chi)**2 test of goodness of fit." + // The Annals of mathematical statistics (1952): 315-345. private val defaultAbsThresholdCochran: Integer = 10 - // Default c(alpha) value corresponding to an alpha value of 0.003, Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), 3rd Edition, Addison Wesley, Reading Mass, 1998. + // Default c(alpha) value corresponding to an alpha value of 0.003, + // Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), + // 3rd Edition, Addison Wesley, Reading Mass, 1998. 
private val defaultCAlpha : Double = 1.8 trait CategoricalDistanceMethod case class LInfinityMethod(alpha: Option[Double] = None) extends CategoricalDistanceMethod - case class ChisquareMethod( - absThresholdYates: Integer = defaultAbsThresholdYates, - percThresholdYates: Double = defaultPercThresholdYates, - absThresholdCochran: Integer = defaultAbsThresholdCochran) + case class ChisquareMethod(absThresholdYates: Integer = defaultAbsThresholdYates, + percThresholdYates: Double = defaultPercThresholdYates, + absThresholdCochran: Integer = defaultAbsThresholdCochran) extends CategoricalDistanceMethod - /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */ - def numericalDistance( - sample1: QuantileNonSample[Double], - sample2: QuantileNonSample[Double], - correctForLowNumberOfSamples: Boolean = false, - alpha: Option[Double] = None) - : Double = { + /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */ + def numericalDistance(sample1: QuantileNonSample[Double], + sample2: QuantileNonSample[Double], + correctForLowNumberOfSamples: Boolean = false, + alpha: Option[Double] = None): Double = { val rankMap1 = sample1.getRankMap() val rankMap2 = sample2.getRankMap() val combinedKeys = rankMap1.keySet.union(rankMap2.keySet) @@ -76,24 +74,27 @@ object Distance { /** Calculate distance of categorical profiles based on different distance methods * * Thresholds for chi-square method: - * - for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.) - * - for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734) + * - for 2x2 tables: + * all expected counts should be 10 or greater + * - Cochran, William G. 
"The (chi)**2 test of goodness of fit." + * The Annals of mathematical statistics (1952): 315-345. + * - for tables larger than 2 x 2: + * "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" + * - (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734) * - * @param sample1 the mapping between categories(keys) and counts(values) of the observed sample - * @param sample2 the mapping between categories(keys) and counts(values) of the expected baseline + * @param sample1 the mapping between categories(keys) and + * counts(values) of the observed sample + * @param sample2 the mapping between categories(keys) and + * counts(values) of the expected baseline * @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value * @param method Method to use: LInfinity or Chisquare - * @param absThresholdYates Yates absolute threshold for tables larger than 2x2 - * @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2 - * @param absThresholdCochran Cochran absolute threshold for 2x2 tables - * @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument + * @return distance can be an absolute distance or + * a p-value based on the correctForLowNumberOfSamples argument */ - def categoricalDistance( - sample1: scala.collection.mutable.Map[String, Long], - sample2: scala.collection.mutable.Map[String, Long], - correctForLowNumberOfSamples: Boolean = false, - method: CategoricalDistanceMethod = LInfinityMethod()) - : Double = { + def categoricalDistance(sample1: scala.collection.mutable.Map[String, Long], + sample2: scala.collection.mutable.Map[String, Long], + correctForLowNumberOfSamples: Boolean = false, + method: CategoricalDistanceMethod = LInfinityMethod()): Double = { method match { case LInfinityMethod(alpha) => categoricalLInfinityDistance(sample1, sample2, 
correctForLowNumberOfSamples, alpha) case ChisquareMethod(absThresholdYates, percThresholdYates, absThresholdCochran) @@ -109,38 +110,47 @@ object Distance { /** Calculate distance of categorical profiles based on Chisquare test or stats * - * for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.) - * for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734) + * for 2x2 tables: + * all expected counts should be 10 or greater + * - Cochran, William G. "The (chi)**2 test of goodness of fit." + * The Annals of mathematical statistics (1952): 315-345. + * for tables larger than 2 x 2: + * "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" + * - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 
734 * - * @param sample the mapping between categories(keys) and counts(values) of the observed sample - * @param expected the mapping between categories(keys) and counts(values) of the expected baseline + * @param sample the mapping between categories(keys) and + * counts(values) of the observed sample + * @param expected the mapping between categories(keys) and + * counts(values) of the expected baseline * @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value * @param absThresholdYates Yates absolute threshold for tables larger than 2x2 - * @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2 + * @param percThresholdYates Yates percentage of categories that can be + * below threshold for tables larger than 2x2 * @param absThresholdCochran Cochran absolute threshold for 2x2 tables - * @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument + * @return distance can be an absolute distance or + * a p-value based on the correctForLowNumberOfSamples argument * */ - private[this] def categoricalChiSquareTest( - sample: scala.collection.mutable.Map[String, Long], - expected: scala.collection.mutable.Map[String, Long], - correctForLowNumberOfSamples: Boolean = false, - absThresholdYates : Integer = defaultAbsThresholdYates , - percThresholdYates : Double = defaultPercThresholdYates, - absThresholdCochran : Integer = defaultAbsThresholdCochran, - normalizeExpected : Boolean = true) - : Double = { - - val sampleSum: Double = sample.filter(e => expected.contains(e._1)).map((e => e._2)).sum - val expectedSum: Double = expected.map(e => e._2).sum + private[this] def categoricalChiSquareTest(sample: scala.collection.mutable.Map[String, Long], + expected: scala.collection.mutable.Map[String, Long], + correctForLowNumberOfSamples: Boolean = false, + absThresholdYates : Integer = defaultAbsThresholdYates, + percThresholdYates : 
Double = defaultPercThresholdYates, + absThresholdCochran : Integer = defaultAbsThresholdCochran): Double = { + + val sampleSum: Double = sample.filter(e => expected.contains(e._1)).values.sum + val expectedSum: Double = expected.values.sum // Normalize the expected input, normalization is required to conduct the chi-square test - // While normalization is already included in the mllib chi-square test, we perform normalization manually to execute proper regrouping - // https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest:org.apache.spark.mllib.stat.test.ChiSqTestResult - val expectedNorm: scala.collection.mutable.Map[String, Double] = expected.map(e => (e._1, (e._2 / expectedSum * sampleSum))) + // While normalization is already included in the mllib chi-square test, + // we perform normalization manually to execute proper regrouping + // https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest + val expectedNorm: scala.collection.mutable.Map[String, Double] = + expected.map(e => (e._1, e._2 / expectedSum * sampleSum)) // Call the function that regroups categories if necessary depending on thresholds - val (regroupedSample, regroupedExpected) = regroupCategories(sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran) + val (regroupedSample, regroupedExpected) = regroupCategories( + sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran) // If less than 2 categories remain we cannot conduct the test if (regroupedSample.keySet.size < chisquareMinDimension) { @@ -158,30 +168,39 @@ object Distance { /** Regroup categories with elements below threshold, required for chi-square test * - * for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.) 
- * for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734) + * for 2x2 tables: + * all expected counts should be 10 or greater + * - Cochran, William G. "The (chi)**2 test of goodness of fit." + * The Annals of mathematical statistics (1952): 315-345. + * for tables larger than 2 x 2: + * "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" + * - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734 * - * @param sample the mapping between categories(keys) and counts(values) of the observed sample - * @param expected the mapping between categories(keys) and counts(values) of the expected baseline + * @param sample the mapping between categories(keys) and + * counts(values) of the observed sample + * @param expected the mapping between categories(keys) and + * counts(values) of the expected baseline * @param absThresholdYates Yates absolute threshold for tables larger than 2x2 - * @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2 + * @param percThresholdYates Yates percentage of categories that can be + * below threshold for tables larger than 2x2 * @param absThresholdCochran Cochran absolute threshold for 2x2 tables * @return (sample, expected) returns the two regrouped mappings * */ - private[this] def regroupCategories( - sample: scala.collection.mutable.Map[String, Double], - expected: scala.collection.mutable.Map[String, Double], - absThresholdYates: Integer = defaultAbsThresholdYates, - percThresholdYates: Double = defaultPercThresholdYates, - absThresholdCochran: Integer = defaultAbsThresholdCochran) + @tailrec + private[this] def regroupCategories(sample: scala.collection.mutable.Map[String, Double], + expected: scala.collection.mutable.Map[String, Double], + absThresholdYates: 
Integer = defaultAbsThresholdYates, + percThresholdYates: Double = defaultPercThresholdYates, + absThresholdCochran: Integer = defaultAbsThresholdCochran) : (scala.collection.mutable.Map[String, Double], scala.collection.mutable.Map[String, Double]) = { // If number of categories is below the minimum return original mappings if (expected.keySet.size < chisquareMinDimension) { (sample, expected) } else { - // Determine thresholds depending on dimensions of mapping (2x2 tables use Cochran, all other tables Yates thresholds) + // Determine thresholds depending on dimensions of mapping + // 2x2 tables use Cochran, all other tables Yates thresholds var absThresholdPerColumn : Integer = absThresholdCochran var maxNbColumnsBelowThreshold: Integer = 0 if (expected.keySet.size > chisquareMinDimension) { @@ -191,8 +210,9 @@ object Distance { // Count number of categories below threshold val nbExpectedColumnsBelowThreshold = expected.filter(e => e._2 < absThresholdPerColumn).keySet.size - // If the number of categories below threshold exceeds the authorized maximum, small categories are regrouped until valid - if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold){ + // If the number of categories below threshold exceeds + // the authorized maximum, small categories are regrouped until valid + if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold) { // Identified key that holds minimum value val expectedMin: (String, Double) = expected.minBy(e => e._2) @@ -226,10 +246,8 @@ object Distance { * @return ChiSqTestResult returns the chi-square test result object (contains both statistics and p-value) * */ - private[this] def chiSquareTest( - sample: scala.collection.mutable.Map[String, Double], - expected: scala.collection.mutable.Map[String, Double]) - : ChiSqTestResult = { + private[this] def chiSquareTest(sample: scala.collection.mutable.Map[String, Double], + expected: scala.collection.mutable.Map[String, Double]): ChiSqTestResult = { var sampleArray = 
Array[Double]() var expectedArray = Array[Double]() @@ -248,12 +266,10 @@ object Distance { } /** Calculate distance of categorical profiles based on L-Infinity Distance */ - private[this] def categoricalLInfinityDistance( - sample1: scala.collection.mutable.Map[String, Long], - sample2: scala.collection.mutable.Map[String, Long], - correctForLowNumberOfSamples: Boolean = false, - alpha: Option[Double]) - : Double = { + private[this] def categoricalLInfinityDistance(sample1: scala.collection.mutable.Map[String, Long], + sample2: scala.collection.mutable.Map[String, Long], + correctForLowNumberOfSamples: Boolean = false, + alpha: Option[Double]): Double = { var n = 0.0 var m = 0.0 sample1.keySet.foreach { key => @@ -276,21 +292,19 @@ object Distance { /** Select which metrics to compute (linf_simple or linf_robust) * based on whether samples are enough */ - private[this] def selectMetrics( - linfSimple: Double, - n: Double, - m: Double, - correctForLowNumberOfSamples: Boolean = false, - alpha: Option[Double]) - : Double = { + private[this] def selectMetrics(linfSimple: Double, + n: Double, + m: Double, + correctForLowNumberOfSamples: Boolean = false, + alpha: Option[Double]): Double = { if (correctForLowNumberOfSamples) { linfSimple } else { // This formula is based on “Two-sample Kolmogorov–Smirnov test" // Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test - val cAlpha : Double = alpha match { - case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2) + val cAlpha: Double = alpha match { + case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2) case None => defaultCAlpha } val linfRobust = Math.max(0.0, linfSimple - cAlpha * Math.sqrt((n + m) / (n * m))) @@ -298,4 +312,3 @@ object Distance { } } } - diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala index 58e5fc6f3..56cb77522 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala +++ 
b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala @@ -20,25 +20,27 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.analyzers.Distance.{ChisquareMethod, LInfinityMethod} import com.amazon.deequ.analyzers.{Distance, QuantileNonSample} import com.amazon.deequ.utils.FixtureSupport -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.WordSpec -class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec +class KLLDistanceTest extends WordSpec with SparkContextSpec with FixtureSupport{ "KLL distance calculator should compute correct linf_simple" in { - var sample1 = new QuantileNonSample[Double](4, 0.64) - var sample2 = new QuantileNonSample[Double](4, 0.64) + val sample1 = new QuantileNonSample[Double](4, 0.64) + val sample2 = new QuantileNonSample[Double](4, 0.64) sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) - assert(Distance.numericalDistance(sample1, sample2, true) == 0.25) + val distance = Distance.numericalDistance(sample1, sample2, correctForLowNumberOfSamples = true) + assert(distance == 0.25) } "KLL distance calculator should compute correct linf_robust" in { - var sample1 = new QuantileNonSample[Double](4, 0.64) - var sample2 = new QuantileNonSample[Double](4, 0.64) + val sample1 = new QuantileNonSample[Double](4, 0.64) + val sample2 = new QuantileNonSample[Double](4, 0.64) sample1.reconstruct(4, 0.64, Array(Array(1, 2, 3, 4))) sample2.reconstruct(4, 0.64, Array(Array(2, 3, 4, 5))) - assert(Distance.numericalDistance(sample1, sample2) == 0.0) + val distance = Distance.numericalDistance(sample1, sample2) + assert(distance == 0.0) } "Categorial distance should compute correct linf_simple" in { @@ -46,8 +48,8 @@ class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) val sample2 = scala.collection.mutable.Map( "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) 
- assert(Distance.categoricalDistance(sample1, - sample2, true) == 0.06015037593984962) + val distance = Distance.categoricalDistance(sample1, sample2, correctForLowNumberOfSamples = true) + assert(distance == 0.06015037593984962) } "Categorial distance should compute correct linf_robust" in { @@ -55,7 +57,8 @@ class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) val sample2 = scala.collection.mutable.Map( "a" -> 11L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) - assert(Distance.categoricalDistance(sample1, sample2) == 0.0) + val distance = Distance.categoricalDistance(sample1, sample2) + assert(distance == 0.0) } "Categorial distance should compute correct linf_simple with different bin value" in { @@ -63,8 +66,8 @@ class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) val sample2 = scala.collection.mutable.Map( "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) - assert(Distance.categoricalDistance(sample1, - sample2, true) == 0.2857142857142857) + val distance = Distance.categoricalDistance(sample1, sample2, correctForLowNumberOfSamples = true) + assert(distance == 0.2857142857142857) } "Categorial distance should compute correct linf_robust with different bin value" in { @@ -72,16 +75,17 @@ class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec "a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L) val sample2 = scala.collection.mutable.Map( "f" -> 11L, "a" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 10L) - assert(Distance.categoricalDistance(sample1, sample2) == 0.0) + val distance = Distance.categoricalDistance(sample1, sample2) + assert(distance == 0.0) } - "Categorial distance should compute correct linf_robust with different alpha value .003" in { val sample1 = scala.collection.mutable.Map( "a" -> 207L, "b" -> 20L, "c" -> 25L, "d" -> 14L, "e" -> 25L, "g" -> 13L) val 
sample2 = scala.collection.mutable.Map( "a" -> 22L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L) - assert(Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.003))) == 0.2726338046550349) + val distance = Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.003))) + assert(distance == 0.2726338046550349) } "Categorial distance should compute correct linf_robust with different alpha value .1" in { @@ -89,77 +93,91 @@ class KLLDistanceTest extends WordSpec with Matchers with SparkContextSpec "a" -> 207L, "b" -> 20L, "c" -> 25L, "d" -> 14L, "e" -> 25L, "g" -> 13L) val sample2 = scala.collection.mutable.Map( "a" -> 22L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L) - assert(Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.1))) == 0.33774199396969184) + val distance = Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.1))) + assert(distance == 0.33774199396969184) } - // Tests using chi-square method for categorical variables "Categorical distance should compute correct chisquare stats with missing bin values" in { val sample1 = scala.collection.mutable.Map( "a" -> 207L, "b" -> 20L, "c" -> 25L, "d" -> 14L, "e" -> 25L, "g" -> 13L) val sample2 = scala.collection.mutable.Map( "a" -> 223L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L) - - assert(Distance.categoricalDistance(sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod()) == 28.175042782458068) + val distance = Distance.categoricalDistance( + sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance == 28.175042782458068) } - - "Categorical distance should compute correct chisquare test with missing bin values" in { val sample1 = scala.collection.mutable.Map( "a" -> 207L, "b" -> 20L, "c" -> 25L, "d" -> 14L, "e" -> 25L, "g" -> 13L) val sample2 = 
scala.collection.mutable.Map( "a" -> 223L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L) - assert(Distance.categoricalDistance(sample1, sample2, method = ChisquareMethod()) == 3.3640191298478506E-5) + val distance = Distance.categoricalDistance(sample1, sample2, method = ChisquareMethod()) + assert(distance == 3.3640191298478506E-5) } - "Categorical distance should compute correct chisquare test" in { val sample1 = scala.collection.mutable.Map( "a" -> 207L, "b" -> 20L, "c" -> 25L, "d" -> 14L, "e" -> 25L) val sample2 = scala.collection.mutable.Map( "a" -> 223L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L) - assert(Distance.categoricalDistance(sample1, sample2, method = ChisquareMethod()) == 0.013227994814265176) + val distance = Distance.categoricalDistance(sample1, sample2, method = ChisquareMethod()) + assert(distance == 0.013227994814265176) } - "Categorical distance should compute correct chisquare distance (low samples) with regrouping 2 categories (yates) after normalizing" in { + "Categorical distance should compute correct chisquare distance (low samples) " + + "with regrouping 2 categories (yates) after normalizing" in { val sample1 = scala.collection.mutable.Map( "a" -> 100L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L, "f" -> 2L) val sample2 = scala.collection.mutable.Map( "a" -> 100L, "b" -> 22L, "c" -> 25L, "d" -> 5L, "e" -> 13L, "f" -> 2L) - assert(Distance.categoricalDistance(sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod()) == 8.789790456457125) + val distance = Distance.categoricalDistance( + sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance == 8.789790456457125) } "Categorical distance should compute correct chisquare distance (low samples) with regrouping (yates)" in { val baseline = scala.collection.mutable.Map( - "a" -> 100L, "b" -> 40L, "c" -> 30L,"e" -> 4L) + "a" -> 100L, "b" -> 40L, "c" -> 30L, "e" -> 4L) val sample = 
scala.collection.mutable.Map( - "a" -> 100L, "b" -> 40L, "c" -> 30L,"d" -> 10L) - assert(Distance.categoricalDistance(sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) == 0.38754325259515626) + "a" -> 100L, "b" -> 40L, "c" -> 30L, "d" -> 10L) + val distance = Distance.categoricalDistance( + sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance == 0.38754325259515626) } - "Categorical distance should compute correct chisquare distance (low samples) with regrouping 2 categories (yates)" in { + "Categorical distance should compute correct chisquare distance (low samples) " + + "with regrouping 2 categories (yates)" in { val baseline = scala.collection.mutable.Map( "a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 34L) val sample = scala.collection.mutable.Map( "a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 27L) - assert(Distance.categoricalDistance(sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) == 1.1507901668129925) + val distance = Distance.categoricalDistance( + sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance == 1.1507901668129925) } - "Categorical distance should compute correct chisquare distance (low samples) with regrouping ( sum of 2 grouped categories is below threshold, but small categories represent less than 20%) (yates)" in { + "Categorical distance should compute correct chisquare distance (low samples) " + + "with regrouping " + + "(sum of 2 grouped categories is below threshold, but small categories represent less than 20%) (yates)" in { val baseline = scala.collection.mutable.Map( - "a" -> 100L, "b" -> 2L, "c" -> 1L, "d" -> 34L,"e" -> 20L,"f" -> 20L,"g" -> 20L,"h" -> 20L) + "a" -> 100L, "b" -> 2L, "c" -> 1L, "d" -> 34L, "e" -> 20L, "f" -> 20L, "g" -> 20L, "h" -> 20L) val sample = scala.collection.mutable.Map( - "a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 27L,"e" -> 20L,"f" -> 20L,"g" -> 20L,"h" -> 
20L) - assert(Distance.categoricalDistance(sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) == 6.827423492761593) + "a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 27L, "e" -> 20L, "f" -> 20L, "g" -> 20L, "h" -> 20L) + val distance = Distance.categoricalDistance( + sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance == 6.827423492761593) } - "Categorical distance should compute correct chisquare distance (low samples) with regrouping ( dimensions after regrouping are too small)" in { + "Categorical distance should compute correct chisquare distance (low samples) " + + "with regrouping ( dimensions after regrouping are too small)" in { val baseline = scala.collection.mutable.Map( "a" -> 100L, "b" -> 4L, "c" -> 3L) val sample = scala.collection.mutable.Map( "a" -> 100L, "b" -> 4L, "c" -> 3L) - assert(Distance.categoricalDistance(sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()).isNaN) + val distance = Distance.categoricalDistance( + sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod()) + assert(distance.isNaN) } - }