Fix style issues causing mvn install to fail. (#453)

- The style issues were caused by spacing problems and by the presence of the Unicode chi character (χ) in comments. - The workflow step was also updated to run the style check (`mvn clean verify` instead of `mvn clean test`). Previously, the style check was not executed, which allowed style errors to be committed to master.
awslabs · Mar 1, 2023 · bb7f350 · bb7f350
1 parent d2551bc
commit bb7f350
Show file tree

Hide file tree

Showing 3 changed files with 162 additions and 131 deletions.
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
@@ -20,5 +20,5 @@ jobs:
         distribution: 'corretto'
         cache: maven
     - name: Build with Maven
-      run: mvn clean test
+      run: mvn clean verify
 
diff --git a/src/main/scala/com/amazon/deequ/analyzers/Distance.scala b/src/main/scala/com/amazon/deequ/analyzers/Distance.scala
@@ -15,48 +15,46 @@
  */
 
 package com.amazon.deequ.analyzers
-import org.apache.spark.SparkContext
 import org.apache.spark.mllib.linalg._
-import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.Statistics
-import org.apache.spark.mllib.stat.Statistics._
 import org.apache.spark.mllib.stat.test.ChiSqTestResult
 
-
-
-
+import scala.annotation.tailrec
 
 object Distance {
-
     // Chi-square constants
     // at least two distinct categories are required to run the chi-square test for a categorical variable
     private val chisquareMinDimension: Int = 2
 
-    //for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
+    // for tables larger than 2 x 2:
+    //   "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
+    //     - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
     private val defaultAbsThresholdYates: Integer = 5
     private val defaultPercThresholdYates: Double = 0.2
 
-    // for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
+    // for 2x2 tables:
+    //   all expected counts should be 10 or greater
+    //     - Cochran, William G. "The (chi)**2 test of goodness of fit."
+    //       The Annals of mathematical statistics (1952): 315-345.
     private val defaultAbsThresholdCochran: Integer = 10
 
-    // Default c(alpha) value corresponding to an alpha value of 0.003, Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), 3rd Edition, Addison Wesley, Reading Mass, 1998.
+    // Default c(alpha) value corresponding to an alpha value of 0.003,
+    // Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms),
+    // 3rd Edition, Addison Wesley, Reading Mass, 1998.
     private val defaultCAlpha : Double = 1.8
 
     trait CategoricalDistanceMethod
     case class LInfinityMethod(alpha: Option[Double] = None) extends CategoricalDistanceMethod
-    case class ChisquareMethod(
-                          absThresholdYates: Integer = defaultAbsThresholdYates,
-                          percThresholdYates: Double = defaultPercThresholdYates,
-                          absThresholdCochran: Integer = defaultAbsThresholdCochran)
+    case class ChisquareMethod(absThresholdYates: Integer = defaultAbsThresholdYates,
+                               percThresholdYates: Double = defaultPercThresholdYates,
+                               absThresholdCochran: Integer = defaultAbsThresholdCochran)
       extends CategoricalDistanceMethod
 
-  /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
-    def numericalDistance(
-      sample1: QuantileNonSample[Double],
-      sample2: QuantileNonSample[Double],
-      correctForLowNumberOfSamples: Boolean = false,
-      alpha: Option[Double] = None)
-    : Double = {
+    /** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
+    def numericalDistance(sample1: QuantileNonSample[Double],
+                          sample2: QuantileNonSample[Double],
+                          correctForLowNumberOfSamples: Boolean = false,
+                          alpha: Option[Double] = None): Double = {
       val rankMap1 = sample1.getRankMap()
       val rankMap2 = sample2.getRankMap()
       val combinedKeys = rankMap1.keySet.union(rankMap2.keySet)
@@ -76,24 +74,27 @@ object Distance {
   /** Calculate distance of categorical profiles based on different distance methods
    *
    * Thresholds for chi-square method:
-   *    - for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
-   *    - for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
+   *  - for 2x2 tables:
+   *      all expected counts should be 10 or greater
+   *        - Cochran, William G. "The (chi)**2 test of goodness of fit."
+   *          The Annals of mathematical statistics (1952): 315-345.
+   *  - for tables larger than 2 x 2:
+   *      "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
+   *        - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
    *
-   * @param sample1                      the mapping between categories(keys) and counts(values) of the observed sample
-   * @param sample2                      the mapping between categories(keys) and counts(values) of the expected baseline
+   * @param sample1                      the mapping between categories(keys) and
+   *                                     counts(values) of the observed sample
+   * @param sample2                      the mapping between categories(keys) and
+   *                                     counts(values) of the expected baseline
    * @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
    * @param method                       Method to use: LInfinity or Chisquare
-   * @param absThresholdYates            Yates absolute threshold for tables larger than 2x2
-   * @param percThresholdYates           Yates percentage of categories that can be below threshold for tables larger than 2x2
-   * @param absThresholdCochran          Cochran absolute threshold for 2x2 tables
-   * @return distance                    can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
+   * @return distance                    can be an absolute distance or
+   *                                     a p-value based on the correctForLowNumberOfSamples argument
    */
-  def categoricalDistance(
-    sample1: scala.collection.mutable.Map[String, Long],
-    sample2: scala.collection.mutable.Map[String, Long],
-    correctForLowNumberOfSamples: Boolean = false,
-    method: CategoricalDistanceMethod = LInfinityMethod())
-  : Double = {
+  def categoricalDistance(sample1: scala.collection.mutable.Map[String, Long],
+                          sample2: scala.collection.mutable.Map[String, Long],
+                          correctForLowNumberOfSamples: Boolean = false,
+                          method: CategoricalDistanceMethod = LInfinityMethod()): Double = {
     method match {
       case LInfinityMethod(alpha) => categoricalLInfinityDistance(sample1, sample2, correctForLowNumberOfSamples, alpha)
       case ChisquareMethod(absThresholdYates, percThresholdYates, absThresholdCochran)
@@ -109,38 +110,47 @@ object Distance {
 
   /** Calculate distance of categorical profiles based on Chisquare test or stats
    *
-   *  for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
-   *  for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
+   *  for 2x2 tables:
+   *    all expected counts should be 10 or greater
+   *      - Cochran, William G. "The (chi)**2 test of goodness of fit."
+   *        The Annals of mathematical statistics (1952): 315-345.
+   *  for tables larger than 2 x 2:
+   *    "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
+   *      - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
    *
-   *  @param sample                       the mapping between categories(keys) and counts(values) of the observed sample
-   *  @param expected                     the mapping between categories(keys) and counts(values) of the expected baseline
+   *  @param sample                       the mapping between categories(keys) and
+   *                                      counts(values) of the observed sample
+   *  @param expected                     the mapping between categories(keys) and
+   *                                      counts(values) of the expected baseline
    *  @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
    *  @param absThresholdYates            Yates absolute threshold for tables larger than 2x2
-   *  @param percThresholdYates           Yates percentage of categories that can be below threshold for tables larger than 2x2
+   *  @param percThresholdYates           Yates percentage of categories that can be
+   *                                      below threshold for tables larger than 2x2
    *  @param absThresholdCochran          Cochran absolute threshold for 2x2 tables
-   *  @return distance                    can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
+   *  @return distance                    can be an absolute distance or
+   *                                      a p-value based on the correctForLowNumberOfSamples argument
    *
    */
-  private[this] def categoricalChiSquareTest(
-    sample: scala.collection.mutable.Map[String, Long],
-    expected: scala.collection.mutable.Map[String, Long],
-    correctForLowNumberOfSamples: Boolean = false,
-    absThresholdYates : Integer = defaultAbsThresholdYates ,
-    percThresholdYates : Double = defaultPercThresholdYates,
-    absThresholdCochran : Integer = defaultAbsThresholdCochran,
-    normalizeExpected : Boolean = true)
-  : Double = {
-
-    val sampleSum: Double = sample.filter(e => expected.contains(e._1)).map((e => e._2)).sum
-    val expectedSum: Double = expected.map(e => e._2).sum
+  private[this] def categoricalChiSquareTest(sample: scala.collection.mutable.Map[String, Long],
+                                             expected: scala.collection.mutable.Map[String, Long],
+                                             correctForLowNumberOfSamples: Boolean = false,
+                                             absThresholdYates : Integer = defaultAbsThresholdYates,
+                                             percThresholdYates : Double = defaultPercThresholdYates,
+                                             absThresholdCochran : Integer = defaultAbsThresholdCochran): Double = {
+
+    val sampleSum: Double = sample.filter(e => expected.contains(e._1)).values.sum
+    val expectedSum: Double = expected.values.sum
 
     // Normalize the expected input, normalization is required to conduct the chi-square test
-    // While normalization is already included in the mllib chi-square test, we perform normalization manually to execute proper regrouping
-    // https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest:org.apache.spark.mllib.stat.test.ChiSqTestResult
-    val expectedNorm: scala.collection.mutable.Map[String, Double] = expected.map(e => (e._1, (e._2 / expectedSum * sampleSum)))
+    // While normalization is already included in the mllib chi-square test,
+    // we perform normalization manually to execute proper regrouping
+    // https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest
+    val expectedNorm: scala.collection.mutable.Map[String, Double] =
+      expected.map(e => (e._1, e._2 / expectedSum * sampleSum))
 
     // Call the function that regroups categories if necessary depending on thresholds
-    val (regroupedSample, regroupedExpected) = regroupCategories(sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
+    val (regroupedSample, regroupedExpected) = regroupCategories(
+      sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
 
     // If less than 2 categories remain we cannot conduct the test
     if (regroupedSample.keySet.size < chisquareMinDimension) {
@@ -158,30 +168,39 @@ object Distance {
 
   /** Regroup categories with elements below threshold, required for chi-square test
    *
-   * for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
-   * for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
+   * for 2x2 tables:
+   *   all expected counts should be 10 or greater
+   *     - Cochran, William G. "The (chi)**2 test of goodness of fit."
+   *       The Annals of mathematical statistics (1952): 315-345.
+   * for tables larger than 2 x 2:
+   *   "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
+   *     - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
    *
-   * @param sample                       the mapping between categories(keys) and counts(values) of the observed sample
-   * @param expected                     the mapping between categories(keys) and counts(values) of the expected baseline
+   * @param sample                       the mapping between categories(keys) and
+   *                                     counts(values) of the observed sample
+   * @param expected                     the mapping between categories(keys) and
+   *                                     counts(values) of the expected baseline
    * @param absThresholdYates            Yates absolute threshold for tables larger than 2x2
-   * @param percThresholdYates           Yates percentage of categories that can be below threshold for tables larger than 2x2
+   * @param percThresholdYates           Yates percentage of categories that can be
+   *                                     below threshold for tables larger than 2x2
    * @param absThresholdCochran          Cochran absolute threshold for 2x2 tables
    * @return (sample, expected)          returns the two regrouped mappings
    *
    */
-  private[this] def regroupCategories(
-    sample: scala.collection.mutable.Map[String, Double],
-    expected: scala.collection.mutable.Map[String, Double],
-    absThresholdYates: Integer = defaultAbsThresholdYates,
-    percThresholdYates: Double = defaultPercThresholdYates,
-    absThresholdCochran: Integer = defaultAbsThresholdCochran)
+  @tailrec
+  private[this] def regroupCategories(sample: scala.collection.mutable.Map[String, Double],
+                                      expected: scala.collection.mutable.Map[String, Double],
+                                      absThresholdYates: Integer = defaultAbsThresholdYates,
+                                      percThresholdYates: Double = defaultPercThresholdYates,
+                                      absThresholdCochran: Integer = defaultAbsThresholdCochran)
     : (scala.collection.mutable.Map[String, Double], scala.collection.mutable.Map[String, Double]) = {
 
     // If number of categories is below the minimum return original mappings
     if (expected.keySet.size < chisquareMinDimension) {
       (sample, expected)
     } else {
-      // Determine thresholds depending on dimensions of mapping (2x2 tables use Cochran, all other tables Yates thresholds)
+      // Determine thresholds depending on dimensions of mapping
+      // 2x2 tables use Cochran, all other tables Yates thresholds
       var absThresholdPerColumn : Integer = absThresholdCochran
       var maxNbColumnsBelowThreshold: Integer = 0
       if (expected.keySet.size > chisquareMinDimension) {
@@ -191,8 +210,9 @@ object Distance {
       // Count number of categories below threshold
       val nbExpectedColumnsBelowThreshold = expected.filter(e => e._2 < absThresholdPerColumn).keySet.size
 
-      // If the number of categories below threshold exceeds the authorized maximum, small categories are regrouped until valid
-      if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold){
+      // If the number of categories below threshold exceeds
+      // the authorized maximum, small categories are regrouped until valid
+      if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold) {
 
         // Identified key that holds minimum value
         val expectedMin: (String, Double) = expected.minBy(e => e._2)
@@ -226,10 +246,8 @@ object Distance {
    * @return ChiSqTestResult    returns the chi-square test result object (contains both statistics and p-value)
    *
    */
-  private[this] def chiSquareTest(
-                                   sample: scala.collection.mutable.Map[String, Double],
-                                   expected: scala.collection.mutable.Map[String, Double])
-  : ChiSqTestResult = {
+  private[this] def chiSquareTest(sample: scala.collection.mutable.Map[String, Double],
+                                  expected: scala.collection.mutable.Map[String, Double]): ChiSqTestResult = {
 
     var sampleArray = Array[Double]()
     var expectedArray = Array[Double]()
@@ -248,12 +266,10 @@ object Distance {
   }
 
   /** Calculate distance of categorical profiles based on L-Infinity Distance */
-  private[this] def categoricalLInfinityDistance(
-    sample1: scala.collection.mutable.Map[String, Long],
-    sample2: scala.collection.mutable.Map[String, Long],
-    correctForLowNumberOfSamples: Boolean = false,
-    alpha: Option[Double])
-  : Double = {
+  private[this] def categoricalLInfinityDistance(sample1: scala.collection.mutable.Map[String, Long],
+                                                 sample2: scala.collection.mutable.Map[String, Long],
+                                                 correctForLowNumberOfSamples: Boolean = false,
+                                                 alpha: Option[Double]): Double = {
     var n = 0.0
     var m = 0.0
     sample1.keySet.foreach { key =>
@@ -276,26 +292,23 @@ object Distance {
 
   /** Select which metrics to compute (linf_simple or linf_robust)
    *  based on whether samples are enough */
-   private[this] def selectMetrics(
-     linfSimple: Double,
-     n: Double,
-     m: Double,
-     correctForLowNumberOfSamples: Boolean = false,
-     alpha: Option[Double])
-   : Double = {
+   private[this] def selectMetrics(linfSimple: Double,
+                                   n: Double,
+                                   m: Double,
+                                   correctForLowNumberOfSamples: Boolean = false,
+                                   alpha: Option[Double]): Double = {
      if (correctForLowNumberOfSamples) {
        linfSimple
      } else {
        // This formula is based on the "Two-sample Kolmogorov–Smirnov test"
        // Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
 
-       val cAlpha : Double =  alpha match {
-         case Some(a)  => Math.sqrt(-Math.log(a/2) * 1/2)
+       val cAlpha: Double = alpha match {
+         case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
          case None => defaultCAlpha
        }
        val linfRobust = Math.max(0.0, linfSimple - cAlpha * Math.sqrt((n + m) / (n * m)))
        linfRobust
      }
    }
 }
-