Skip to content

Commit

Permalink
Fix style issues causing mvn install to fail. (#453)
Browse files Browse the repository at this point in the history
- Style issues were caused by inconsistent spacing and by the presence of the Unicode chi character.
- The workflow step was also updated to include the execution of the style check. Previously, the style check was not being executed, which led to the style errors being committed to master.
  • Loading branch information
rdsharma26 authored Mar 1, 2023
1 parent d2551bc commit bb7f350
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 131 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ jobs:
distribution: 'corretto'
cache: maven
- name: Build with Maven
run: mvn clean test
run: mvn clean verify

195 changes: 104 additions & 91 deletions src/main/scala/com/amazon/deequ/analyzers/Distance.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,48 +15,46 @@
*/

package com.amazon.deequ.analyzers
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.Statistics._
import org.apache.spark.mllib.stat.test.ChiSqTestResult




import scala.annotation.tailrec

object Distance {

// Chi-square constants
// at least two distinct categories are required to run the chi-square test for a categorical variable
private val chisquareMinDimension: Int = 2

//for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
// for tables larger than 2 x 2:
// "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
// - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
private val defaultAbsThresholdYates: Integer = 5
private val defaultPercThresholdYates: Double = 0.2

// for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
// for 2x2 tables:
// all expected counts should be 10 or greater
// - Cochran, William G. "The (chi)**2 test of goodness of fit."
// The Annals of mathematical statistics (1952): 315-345.
private val defaultAbsThresholdCochran: Integer = 10

// Default c(alpha) value corresponding to an alpha value of 0.003, Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), 3rd Edition, Addison Wesley, Reading Mass, 1998.
// Default c(alpha) value corresponding to an alpha value of 0.003,
// Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms),
// 3rd Edition, Addison Wesley, Reading Mass, 1998.
private val defaultCAlpha : Double = 1.8

trait CategoricalDistanceMethod
case class LInfinityMethod(alpha: Option[Double] = None) extends CategoricalDistanceMethod
case class ChisquareMethod(
absThresholdYates: Integer = defaultAbsThresholdYates,
percThresholdYates: Double = defaultPercThresholdYates,
absThresholdCochran: Integer = defaultAbsThresholdCochran)
case class ChisquareMethod(absThresholdYates: Integer = defaultAbsThresholdYates,
percThresholdYates: Double = defaultPercThresholdYates,
absThresholdCochran: Integer = defaultAbsThresholdCochran)
extends CategoricalDistanceMethod

/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
def numericalDistance(
sample1: QuantileNonSample[Double],
sample2: QuantileNonSample[Double],
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double] = None)
: Double = {
/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
def numericalDistance(sample1: QuantileNonSample[Double],
sample2: QuantileNonSample[Double],
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double] = None): Double = {
val rankMap1 = sample1.getRankMap()
val rankMap2 = sample2.getRankMap()
val combinedKeys = rankMap1.keySet.union(rankMap2.keySet)
Expand All @@ -76,24 +74,27 @@ object Distance {
/** Calculate distance of categorical profiles based on different distance methods
*
* Thresholds for chi-square method:
* - for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
* - for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
* - for 2x2 tables:
* all expected counts should be 10 or greater
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
* The Annals of mathematical statistics (1952): 315-345.
* - for tables larger than 2 x 2:
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
* - (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
*
* @param sample1 the mapping between categories(keys) and counts(values) of the observed sample
* @param sample2 the mapping between categories(keys) and counts(values) of the expected baseline
* @param sample1 the mapping between categories(keys) and
* counts(values) of the observed sample
* @param sample2 the mapping between categories(keys) and
* counts(values) of the expected baseline
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
* @param method Method to use: LInfinity or Chisquare. The Chisquare thresholds are fields of ChisquareMethod:
* - absThresholdYates: Yates absolute threshold for tables larger than 2x2
* - percThresholdYates: Yates percentage of categories that can be below threshold for tables larger than 2x2
* - absThresholdCochran: Cochran absolute threshold for 2x2 tables
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
* @return distance can be an absolute distance or
* a p-value based on the correctForLowNumberOfSamples argument
*/
def categoricalDistance(
sample1: scala.collection.mutable.Map[String, Long],
sample2: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
method: CategoricalDistanceMethod = LInfinityMethod())
: Double = {
def categoricalDistance(sample1: scala.collection.mutable.Map[String, Long],
sample2: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
method: CategoricalDistanceMethod = LInfinityMethod()): Double = {
method match {
case LInfinityMethod(alpha) => categoricalLInfinityDistance(sample1, sample2, correctForLowNumberOfSamples, alpha)
case ChisquareMethod(absThresholdYates, percThresholdYates, absThresholdCochran)
Expand All @@ -109,38 +110,47 @@ object Distance {

/** Calculate distance of categorical profiles based on Chisquare test or stats
*
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
* for 2x2 tables:
* all expected counts should be 10 or greater
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
* The Annals of mathematical statistics (1952): 315-345.
* for tables larger than 2 x 2:
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
* - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
*
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
* @param sample the mapping between categories(keys) and
* counts(values) of the observed sample
* @param expected the mapping between categories(keys) and
* counts(values) of the expected baseline
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
* @param percThresholdYates Yates percentage of categories that can be
* below threshold for tables larger than 2x2
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
* @return distance can be an absolute distance or
* a p-value based on the correctForLowNumberOfSamples argument
*
*/
private[this] def categoricalChiSquareTest(
sample: scala.collection.mutable.Map[String, Long],
expected: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
absThresholdYates : Integer = defaultAbsThresholdYates ,
percThresholdYates : Double = defaultPercThresholdYates,
absThresholdCochran : Integer = defaultAbsThresholdCochran,
normalizeExpected : Boolean = true)
: Double = {

val sampleSum: Double = sample.filter(e => expected.contains(e._1)).map((e => e._2)).sum
val expectedSum: Double = expected.map(e => e._2).sum
private[this] def categoricalChiSquareTest(sample: scala.collection.mutable.Map[String, Long],
expected: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
absThresholdYates : Integer = defaultAbsThresholdYates,
percThresholdYates : Double = defaultPercThresholdYates,
absThresholdCochran : Integer = defaultAbsThresholdCochran): Double = {

val sampleSum: Double = sample.filter(e => expected.contains(e._1)).values.sum
val expectedSum: Double = expected.values.sum

// Normalize the expected input, normalization is required to conduct the chi-square test
// While normalization is already included in the mllib chi-square test, we perform normalization manually to execute proper regrouping
// https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest:org.apache.spark.mllib.stat.test.ChiSqTestResult
val expectedNorm: scala.collection.mutable.Map[String, Double] = expected.map(e => (e._1, (e._2 / expectedSum * sampleSum)))
// While normalization is already included in the mllib chi-square test,
// we perform normalization manually to execute proper regrouping
// https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest
val expectedNorm: scala.collection.mutable.Map[String, Double] =
expected.map(e => (e._1, e._2 / expectedSum * sampleSum))

// Call the function that regroups categories if necessary depending on thresholds
val (regroupedSample, regroupedExpected) = regroupCategories(sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
val (regroupedSample, regroupedExpected) = regroupCategories(
sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)

// If less than 2 categories remain we cannot conduct the test
if (regroupedSample.keySet.size < chisquareMinDimension) {
Expand All @@ -158,30 +168,39 @@ object Distance {

/** Regroup categories with elements below threshold, required for chi-square test
*
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
* for 2x2 tables:
* all expected counts should be 10 or greater
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
* The Annals of mathematical statistics (1952): 315-345.
* for tables larger than 2 x 2:
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
* - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
*
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
* @param sample the mapping between categories(keys) and
* counts(values) of the observed sample
* @param expected the mapping between categories(keys) and
* counts(values) of the expected baseline
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
* @param percThresholdYates Yates percentage of categories that can be
* below threshold for tables larger than 2x2
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
* @return (sample, expected) returns the two regrouped mappings
*
*/
private[this] def regroupCategories(
sample: scala.collection.mutable.Map[String, Double],
expected: scala.collection.mutable.Map[String, Double],
absThresholdYates: Integer = defaultAbsThresholdYates,
percThresholdYates: Double = defaultPercThresholdYates,
absThresholdCochran: Integer = defaultAbsThresholdCochran)
@tailrec
private[this] def regroupCategories(sample: scala.collection.mutable.Map[String, Double],
expected: scala.collection.mutable.Map[String, Double],
absThresholdYates: Integer = defaultAbsThresholdYates,
percThresholdYates: Double = defaultPercThresholdYates,
absThresholdCochran: Integer = defaultAbsThresholdCochran)
: (scala.collection.mutable.Map[String, Double], scala.collection.mutable.Map[String, Double]) = {

// If number of categories is below the minimum return original mappings
if (expected.keySet.size < chisquareMinDimension) {
(sample, expected)
} else {
// Determine thresholds depending on dimensions of mapping (2x2 tables use Cochran, all other tables Yates thresholds)
// Determine thresholds depending on dimensions of mapping
// 2x2 tables use Cochran, all other tables Yates thresholds
var absThresholdPerColumn : Integer = absThresholdCochran
var maxNbColumnsBelowThreshold: Integer = 0
if (expected.keySet.size > chisquareMinDimension) {
Expand All @@ -191,8 +210,9 @@ object Distance {
// Count number of categories below threshold
val nbExpectedColumnsBelowThreshold = expected.filter(e => e._2 < absThresholdPerColumn).keySet.size

// If the number of categories below threshold exceeds the authorized maximum, small categories are regrouped until valid
if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold){
// If the number of categories below threshold exceeds
// the authorized maximum, small categories are regrouped until valid
if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold) {

// Identify the key that holds the minimum value
val expectedMin: (String, Double) = expected.minBy(e => e._2)
Expand Down Expand Up @@ -226,10 +246,8 @@ object Distance {
* @return ChiSqTestResult returns the chi-square test result object (contains both statistics and p-value)
*
*/
private[this] def chiSquareTest(
sample: scala.collection.mutable.Map[String, Double],
expected: scala.collection.mutable.Map[String, Double])
: ChiSqTestResult = {
private[this] def chiSquareTest(sample: scala.collection.mutable.Map[String, Double],
expected: scala.collection.mutable.Map[String, Double]): ChiSqTestResult = {

var sampleArray = Array[Double]()
var expectedArray = Array[Double]()
Expand All @@ -248,12 +266,10 @@ object Distance {
}

/** Calculate distance of categorical profiles based on L-Infinity Distance */
private[this] def categoricalLInfinityDistance(
sample1: scala.collection.mutable.Map[String, Long],
sample2: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double])
: Double = {
private[this] def categoricalLInfinityDistance(sample1: scala.collection.mutable.Map[String, Long],
sample2: scala.collection.mutable.Map[String, Long],
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double]): Double = {
var n = 0.0
var m = 0.0
sample1.keySet.foreach { key =>
Expand All @@ -276,26 +292,23 @@ object Distance {

/** Select which metrics to compute (linf_simple or linf_robust)
* based on whether samples are enough */
private[this] def selectMetrics(
linfSimple: Double,
n: Double,
m: Double,
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double])
: Double = {
private[this] def selectMetrics(linfSimple: Double,
n: Double,
m: Double,
correctForLowNumberOfSamples: Boolean = false,
alpha: Option[Double]): Double = {
if (correctForLowNumberOfSamples) {
linfSimple
} else {
// This formula is based on the "Two-sample Kolmogorov-Smirnov test"
// Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test

val cAlpha : Double = alpha match {
case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
val cAlpha: Double = alpha match {
case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
case None => defaultCAlpha
}
val linfRobust = Math.max(0.0, linfSimple - cAlpha * Math.sqrt((n + m) / (n * m)))
linfRobust
}
}
}

Loading

0 comments on commit bb7f350

Please sign in to comment.