From 0824c97db2fd5f1db89413d42fe66a4eec9831e8 Mon Sep 17 00:00:00 2001 From: Shuhei Kadowaki <40514306+aviatesk@users.noreply.github.com> Date: Thu, 2 Jul 2020 18:04:29 +0900 Subject: [PATCH] update documentation, examples: (#258) - rename `productName` to `manufacturerName` - fix typo - fix KLL example --- .../amazon/deequ/VerificationRunBuilder.scala | 4 +-- .../amazon/deequ/examples/KLLExample.scala | 11 +++---- .../examples/algebraic_states_example.md | 32 +++++++++---------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala b/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala index ac8fdb3ca..aae3b66aa 100644 --- a/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala @@ -114,7 +114,7 @@ class VerificationRunBuilder(val data: DataFrame) { } /** - * Can be used to enforce the calculation of some some metric regardless of if there is a + * Can be used to enforce the calculation of some metric regardless of if there is a * constraint on it (optional) * * @param requiredAnalyzer The analyzer to be used to calculate the metric during the run @@ -125,7 +125,7 @@ class VerificationRunBuilder(val data: DataFrame) { } /** - * Can be used to enforce the calculation of some some metrics regardless of if there are + * Can be used to enforce the calculation of some metrics regardless of if there are * constraints on them (optional) * * @param requiredAnalyzers The analyzers to be used to calculate the metrics during the run diff --git a/src/main/scala/com/amazon/deequ/examples/KLLExample.scala b/src/main/scala/com/amazon/deequ/examples/KLLExample.scala index 3a2b897eb..01665e8b6 100644 --- a/src/main/scala/com/amazon/deequ/examples/KLLExample.scala +++ b/src/main/scala/com/amazon/deequ/examples/KLLExample.scala @@ -19,7 +19,7 @@ package com.amazon.deequ.examples import com.amazon.deequ.analyzers.KLLParameters import com.amazon.deequ.examples.ExampleUtils.{itemsAsDataframe, withSpark} import com.amazon.deequ.profiles.NumericColumnProfile -import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules} +import com.amazon.deequ.profiles.ColumnProfilerRunner private[examples] object KLLExample extends App { @@ -32,13 +32,13 @@ private[examples] object KLLExample extends App { Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), Item(5, "Thingy E", null, "high", 12)) - val suggestionResult = ConstraintSuggestionRunner() + val profileResult = ColumnProfilerRunner() .onData(df) - .addConstraintRules(Rules.DEFAULT) - .setKLLParameters(KLLParameters(2, 0.64, 2)) + .withKLLProfiling() + .setKLLParameters(Some(KLLParameters(2, 0.64, 2))) .run() - val columnProfiles = suggestionResult.columnProfiles + val columnProfiles = profileResult.profiles println("Observed statistics:") columnProfiles.foreach { case (name, profile) => @@ -107,4 +107,3 @@ private[examples] object KLLExample extends App { } } } - diff --git a/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md b/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md index 460d74a2c..e250f8a5c 100644 --- a/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md +++ b/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md @@ -30,7 +30,7 @@ val stateStore = InMemoryStateProvider() val metricsForData = AnalysisRunner.run( data = data, analysis = analysis, - saveStatesWith = Some(stateStore)) + saveStatesWith = Some(stateStore)) ``` We can now inspect the metrics for the current version of the data: @@ -51,7 +51,7 @@ Completeness(productName,None): 1.0 Completeness(description,None): 0.6666666666666666 ``` -Now lets assume we somehow gathered more data that we want to add to our dataset. We would now like to know the updated metrics for the whole dataset, but we do not want to read the old data again. Fortunately, **deequ** allows us to continue from the internal state of the computation and update the metrics from the stored states without having to access the previous data! +Now lets assume we somehow gathered more data that we want to add to our dataset. We would now like to know the updated metrics for the whole dataset, but we do not want to read the old data again. Fortunately, **deequ** allows us to continue from the internal state of the computation and update the metrics from the stored states without having to access the previous data! ```scala val moreData = ExampleUtils.itemsAsDataframe(spark, @@ -61,7 +61,7 @@ val moreData = ExampleUtils.itemsAsDataframe(spark, val metricsAfterAddingMoreData = AnalysisRunner.run( data = moreData, analysis = analysis, - aggregateWith = Some(stateStore) + aggregateWith = Some(stateStore) ) println("\nMetrics after adding 2 more records:\n") @@ -102,8 +102,8 @@ And we have the following metrics (defined by a check) that we're interested in ```scala val check = Check(CheckLevel.Warning, "a check") - .isComplete("productName") - .containsURL("productName", _ == 0.0) + .isComplete("manufacturerName") + .containsURL("manufacturerName", _ == 0.0) .isContainedIn("countryCode", Array("DE", "US", "CN")) ``` @@ -121,7 +121,7 @@ AnalysisRunner.run(usManufacturers, analysis, saveStatesWith = Some(usStates)) AnalysisRunner.run(cnManufacturers, analysis, saveStatesWith = Some(cnStates)) val tableMetrics = AnalysisRunner.runOnAggregatedStates( - deManufacturers.schema, + deManufacturers.schema, analysis, Seq(deStates, usStates, cnStates) ) @@ -133,9 +133,9 @@ tableMetrics.metricMap.foreach { case (analyzer, metric) => ``` ``` -Completeness(productName,None): 1.0 -PatternMatch(productName,(https?|ftp)://[^\s/$.?#].[^\s]*,None): 0.0 -Compliance("countryCode contained in DE,US,CN", +Completeness(manufacturerName,None): 1.0 +PatternMatch(manufacturerName,(https?|ftp)://[^\s/$.?#].[^\s]*,None): 0.0 +Compliance("countryCode contained in DE,US,CN", countryCode IS NULL OR countryCode IN ('DE','US','CN'),None): 1.0 ``` @@ -146,17 +146,17 @@ val updatedUsManufacturers = ExampleUtils.manufacturersAsDataframe(spark, Manufacturer(3, "ManufacturerDNew", "US"), Manufacturer(4, null, "US"), Manufacturer(5, "ManufacturerFNew http://clickme.com", "US")) - + val updatedUsStates = InMemoryStateProvider() AnalysisRunner.run( - updatedUsManufacturers, - analysis, + updatedUsManufacturers, + analysis, saveStatesWith = Some(updatedUsStates) ) val updatedTableMetrics = AnalysisRunner.runOnAggregatedStates( - deManufacturers.schema, + deManufacturers.schema, analysis, Seq(deStates, updatedUsStates, cnStates) ) @@ -170,9 +170,9 @@ updatedTableMetrics.metricMap.foreach { case (analyzer, metric) => This code will only operate on the updated partition and the states, but will still return the correct metrics for the table as a whole: ``` -Completeness(productName,None): 0.8571428571428571 -PatternMatch(productName,(https?|ftp)://[^\s/$.?#].[^\s]*,None): 0.14285714285714285 -Compliance("countryCode contained in DE,US,CN", +Completeness(manufacturerName,None): 0.8571428571428571 +PatternMatch(manufacturerName,(https?|ftp)://[^\s/$.?#].[^\s]*,None): 0.14285714285714285 +Compliance("countryCode contained in DE,US,CN", countryCode IS NULL OR countryCode IN ('DE','US','CN'),None): 1.0 ```