diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..a6475ad2 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Contributing + +## Overview + +Thanks for visiting and considering contributing to Data Caterer 🎉. Any contributions, whether they are suggestions, +bug fixes, features and general improvements or discussion points, are welcome. To make things simple for everyone, +please follow the guidelines below when contributing. + +## Feature Request + +1. [Check if it already exists on the roadmap here](https://data.catering/use-case/roadmap/) +2. [For further discussion, join the Slack channel](https://join.slack.com/t/data-catering/shared_invite/zt-2664ylbpi-w3n7lWAO~PHeOG9Ujpm~~w) +3. Once accepted, it will be added and tracked via the GitHub Project +4. Sub-tasks will be created and implemented when contributors have time +5. When all sub-tasks are completed, it can be promoted to the next release once all documentation is completed + in [data-caterer-docs](https://github.com/data-catering/data-caterer-docs) + +## Bug Fixes + +1. Search [Issues](https://github.com/data-catering/data-caterer/issues) in the project to see if the bug has already + been raised + 1. If it already exists, you can upvote or add to the discussion if you feel it is warranted (e.g. same bug, but you + get different output to others) + 2. If none exist, please create a new issue with the following format: + ```shell + version: 0.5.3 (Version of Data Caterer) + description: When generating data to Postgres, error inserting due to record with same primary key. (High level description of bug) + reproduction: (Code that could be used to reproduce the bug) + expected_behaviour: (What you are expecting to occur) + ``` +2. 
Depending on the severity of the issue, the team will be able to pick up the issue when contributors have time + +## General Improvements/Discussion + +[Come join the Slack channel to discuss anything you want!](https://join.slack.com/t/data-catering/shared_invite/zt-2664ylbpi-w3n7lWAO~PHeOG9Ujpm~~w) diff --git a/README.md b/README.md index 97211b66..571b8ade 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,10 @@ Sponsors have access to the following features: This is inspired by the [mkdocs-material project](https://github.com/squidfunk/mkdocs-material) which [follows the same model](https://squidfunk.github.io/mkdocs-material/insiders/). +## Contributing + +[View details here about how you can contribute to the project.](CONTRIBUTING.md) + ## Additional Details ### Design diff --git a/api/src/main/scala/com/github/pflooky/datacaterer/api/ValidationBuilder.scala b/api/src/main/scala/com/github/pflooky/datacaterer/api/ValidationBuilder.scala index cf9e6e4a..bf5bfdcf 100644 --- a/api/src/main/scala/com/github/pflooky/datacaterer/api/ValidationBuilder.scala +++ b/api/src/main/scala/com/github/pflooky/datacaterer/api/ValidationBuilder.scala @@ -116,7 +116,7 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) { * required to be boolean. Can use any columns in the validation logic. 
* * For example, - * {{{validation.expr("CASE WHEN status == 'open' THEN balance > 0 ELSE balance == 0 END}}} + * {{{validation.expr("CASE WHEN status == 'open' THEN balance > 0 ELSE balance == 0 END")}}} * * @param expr SQL expression which returns a boolean * @return ValidationBuilder @@ -126,14 +126,28 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) { validation match { case GroupByValidation(grpCols, aggCol, aggType, _) => val grpWithExpr = GroupByValidation(grpCols, aggCol, aggType, expr) - grpWithExpr.description = this.validation.description - grpWithExpr.errorThreshold = this.validation.errorThreshold - this.modify(_.validation).setTo(grpWithExpr) + copyWithDescAndThreshold(grpWithExpr) case expressionValidation: ExpressionValidation => - val withExpr = expressionValidation.modify(_.expr).setTo(expr) - withExpr.description = this.validation.description - withExpr.errorThreshold = this.validation.errorThreshold - this.modify(_.validation).setTo(withExpr) + val withExpr = expressionValidation.modify(_.whereExpr).setTo(expr) + copyWithDescAndThreshold(withExpr) + } + } + + /** + * SQL expression used to apply to columns before running validations. 
+ * + * For example, + * {{{validation.selectExpr("PERCENTILE(amount, 0.5) AS median_amount")}}} + * + * @param expr Single SQL select expression + * @return ValidationBuilder + * @see SQL expressions + */ + def selectExpr(expr: String): ValidationBuilder = { + validation match { + case expressionValidation: ExpressionValidation => + val withExpr = expressionValidation.modify(_.selectExpr).setTo(expr) + copyWithDescAndThreshold(withExpr) } } @@ -195,6 +209,12 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) { def columnNames: ColumnNamesValidationBuilder = { ColumnNamesValidationBuilder() } + + private def copyWithDescAndThreshold(newValidation: Validation): ValidationBuilder = { + newValidation.description = this.validation.description + newValidation.errorThreshold = this.validation.errorThreshold + this.modify(_.validation).setTo(newValidation) + } } case class ColumnValidationBuilder(validationBuilder: ValidationBuilder = ValidationBuilder(), column: String = "") { diff --git a/api/src/main/scala/com/github/pflooky/datacaterer/api/model/ValidationModels.scala b/api/src/main/scala/com/github/pflooky/datacaterer/api/model/ValidationModels.scala index b260c66e..0f0366fb 100644 --- a/api/src/main/scala/com/github/pflooky/datacaterer/api/model/ValidationModels.scala +++ b/api/src/main/scala/com/github/pflooky/datacaterer/api/model/ValidationModels.scala @@ -18,7 +18,8 @@ trait Validation { } case class ExpressionValidation( - expr: String = "true" + whereExpr: String = "true", + selectExpr: String = "*" ) extends Validation case class GroupByValidation( diff --git a/api/src/main/scala/com/github/pflooky/datacaterer/api/parser/ValidationIdResolver.scala b/api/src/main/scala/com/github/pflooky/datacaterer/api/parser/ValidationIdResolver.scala index e9e0978e..a40081a2 100644 --- a/api/src/main/scala/com/github/pflooky/datacaterer/api/parser/ValidationIdResolver.scala +++ 
b/api/src/main/scala/com/github/pflooky/datacaterer/api/parser/ValidationIdResolver.scala @@ -31,8 +31,9 @@ class ValidationBuilderSerializer extends JsonSerializer[ValidationBuilder] { val validation = value.validation gen.writeStartObject() validation match { - case ExpressionValidation(expr) => - gen.writeStringField("expr", expr) + case ExpressionValidation(expr, selectExpr) => + gen.writeStringField("whereExpr", expr) + gen.writeStringField("selectExpr", selectExpr) case GroupByValidation(groupByCols, aggCol, aggType, expr) => gen.writeArrayFieldStart("groupByCols") groupByCols.foreach(gen.writeObject) diff --git a/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanBuilderTest.scala b/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanBuilderTest.scala index 131a29b0..428fe1cb 100644 --- a/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanBuilderTest.scala +++ b/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanBuilderTest.scala @@ -123,7 +123,7 @@ class PlanBuilderTest extends AnyFunSuite { assert(validationHead.description.contains("name is equal to Peter")) assert(validationHead.errorThreshold.contains(0.1)) assert(validationHead.isInstanceOf[ExpressionValidation]) - assert(validationHead.asInstanceOf[ExpressionValidation].expr == "name == 'Peter'") + assert(validationHead.asInstanceOf[ExpressionValidation].whereExpr == "name == 'Peter'") assert(dataSourceHead._2.head.options == Map("path" -> "test/path/json")) assert(dataSourceHead._2.head.waitCondition == PauseWaitCondition()) } diff --git a/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanRunTest.scala b/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanRunTest.scala index cf61d6ba..af6eb118 100644 --- a/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanRunTest.scala +++ b/api/src/test/scala/com/github/pflooky/datacaterer/api/PlanRunTest.scala @@ -84,7 +84,7 @@ class PlanRunTest extends AnyFunSuite { assert(dsValidation._2.head.validations.size == 1) 
assert(dsValidation._2.head.validations.head.validation.isInstanceOf[ExpressionValidation]) val expressionValidation = dsValidation._2.head.validations.head.validation.asInstanceOf[ExpressionValidation] - assert(expressionValidation.expr == "account_id != ''") + assert(expressionValidation.whereExpr == "account_id != ''") } test("Can create plan with multiple validations for one data source") { @@ -105,10 +105,10 @@ class PlanRunTest extends AnyFunSuite { assert(dsValidation._1 == "my_postgres") val accountValid = dsValidation._2.filter(_.options.get(JDBC_TABLE).contains("account.accounts")).head assert(accountValid.validations.size == 1) - assert(accountValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].expr == "account_id != ''")) + assert(accountValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].whereExpr == "account_id != ''")) val txnValid = dsValidation._2.filter(_.options.get(JDBC_TABLE).contains("account.transactions")).head assert(txnValid.validations.size == 1) - assert(txnValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].expr == "txn_id IS NOT NULL")) + assert(txnValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].whereExpr == "txn_id IS NOT NULL")) } test("Can create plan with validations only defined") { @@ -124,7 +124,7 @@ class PlanRunTest extends AnyFunSuite { assert(result._validations.head.dataSources.contains("my_csv")) val validRes = result._validations.head.dataSources("my_csv").head assert(validRes.validations.size == 1) - assert(validRes.validations.head.validation.asInstanceOf[ExpressionValidation].expr == "account_id != 'acc123'") + assert(validRes.validations.head.validation.asInstanceOf[ExpressionValidation].whereExpr == "account_id != 'acc123'") assert(validRes.options.nonEmpty) assert(validRes.options == Map(FORMAT -> "csv", PATH -> "/my/csv")) } diff --git 
a/api/src/test/scala/com/github/pflooky/datacaterer/api/ValidationConfigurationBuilderTest.scala b/api/src/test/scala/com/github/pflooky/datacaterer/api/ValidationConfigurationBuilderTest.scala index fc38067a..ddd92a41 100644 --- a/api/src/test/scala/com/github/pflooky/datacaterer/api/ValidationConfigurationBuilderTest.scala +++ b/api/src/test/scala/com/github/pflooky/datacaterer/api/ValidationConfigurationBuilderTest.scala @@ -38,276 +38,276 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { val result = ValidationBuilder().col("my_col").greaterThan(10) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col > 10") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col > 10") } test("Can create column equal to validation") { val result = ValidationBuilder().col("my_col").isEqual(10) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col == 10") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col == 10") val resultStr = ValidationBuilder().col("my_col").isEqual("created") assert(resultStr.validation.isInstanceOf[ExpressionValidation]) - assert(resultStr.validation.asInstanceOf[ExpressionValidation].expr == "my_col == 'created'") + assert(resultStr.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col == 'created'") } test("Can create column equal to another column validation") { val result = ValidationBuilder().col("my_col").isEqualCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col == other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col == other_col") } test("Can create column not equal to validation") { val result = ValidationBuilder().col("my_col").isNotEqual(10) 
assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col != 10") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col != 10") val resultStr = ValidationBuilder().col("my_col").isNotEqual("created") assert(resultStr.validation.isInstanceOf[ExpressionValidation]) - assert(resultStr.validation.asInstanceOf[ExpressionValidation].expr == "my_col != 'created'") + assert(resultStr.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col != 'created'") } test("Can create column not equal to another column validation") { val result = ValidationBuilder().col("my_col").isNotEqualCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col != other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col != other_col") } test("Can create column is null validation") { val result = ValidationBuilder().col("my_col").isNull assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "ISNULL(my_col)") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "ISNULL(my_col)") } test("Can create column is not null validation") { val result = ValidationBuilder().col("my_col").isNotNull assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "ISNOTNULL(my_col)") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "ISNOTNULL(my_col)") } test("Can create column contains validation") { val result = ValidationBuilder().col("my_col").contains("apple") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "CONTAINS(my_col, 'apple')") + 
assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "CONTAINS(my_col, 'apple')") } test("Can create column not contains validation") { val result = ValidationBuilder().col("my_col").notContains("apple") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "!CONTAINS(my_col, 'apple')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "!CONTAINS(my_col, 'apple')") } test("Can create column less than validation") { val result = ValidationBuilder().col("my_col").lessThan(Date.valueOf("2023-01-01")) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col < DATE('2023-01-01')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col < DATE('2023-01-01')") } test("Can create column less than other column validation") { val result = ValidationBuilder().col("my_col").lessThanCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col < other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col < other_col") } test("Can create column less than or equal validation") { val result = ValidationBuilder().col("my_col").lessThanOrEqual(Timestamp.valueOf("2023-01-01 00:00:00.0")) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col <= TIMESTAMP('2023-01-01 00:00:00.0')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col <= TIMESTAMP('2023-01-01 00:00:00.0')") } test("Can create column less than or equal other column validation") { val result = ValidationBuilder().col("my_col").lessThanOrEqualCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - 
assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col <= other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col <= other_col") } test("Can create column greater than validation") { val result = ValidationBuilder().col("my_col").greaterThan(10) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col > 10") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col > 10") } test("Can create column greater than other column validation") { val result = ValidationBuilder().col("my_col").greaterThanCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col > other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col > other_col") } test("Can create column greater than or equal validation") { val result = ValidationBuilder().col("my_col").greaterThanOrEqual(10) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col >= 10") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col >= 10") } test("Can create column greater than or equal other column validation") { val result = ValidationBuilder().col("my_col").greaterThanOrEqualCol("other_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col >= other_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col >= other_col") } test("Can create column between validation") { val result = ValidationBuilder().col("my_col").between(10, 20) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col BETWEEN 10 AND 20") + 
assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col BETWEEN 10 AND 20") } test("Can create column between other col validation") { val result = ValidationBuilder().col("my_col").betweenCol("other_col", "another_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col BETWEEN other_col AND another_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col BETWEEN other_col AND another_col") } test("Can create column not between validation") { val result = ValidationBuilder().col("my_col").notBetween(10, 20) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col NOT BETWEEN 10 AND 20") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col NOT BETWEEN 10 AND 20") } test("Can create column not between other col validation") { val result = ValidationBuilder().col("my_col").notBetweenCol("other_col", "another_col") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col NOT BETWEEN other_col AND another_col") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col NOT BETWEEN other_col AND another_col") } test("Can create column in validation") { val result = ValidationBuilder().col("my_col").in("open", "closed") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col IN ('open','closed')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col IN ('open','closed')") } test("Can create column not in validation") { val result = ValidationBuilder().col("my_col").notIn("open", "closed") assert(result.validation.isInstanceOf[ExpressionValidation]) - 
assert(result.validation.asInstanceOf[ExpressionValidation].expr == "NOT my_col IN ('open','closed')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "NOT my_col IN ('open','closed')") } test("Can create column matches validation") { val result = ValidationBuilder().col("my_col").matches("ACC[0-9]{8}") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "REGEXP(my_col, 'ACC[0-9]{8}')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "REGEXP(my_col, 'ACC[0-9]{8}')") } test("Can create column not matches validation") { val result = ValidationBuilder().col("my_col").notMatches("ACC[0-9]{8}") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "!REGEXP(my_col, 'ACC[0-9]{8}')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "!REGEXP(my_col, 'ACC[0-9]{8}')") } test("Can create column starts with validation") { val result = ValidationBuilder().col("my_col").startsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "STARTSWITH(my_col, 'ACC')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "STARTSWITH(my_col, 'ACC')") } test("Can create column not starts with validation") { val result = ValidationBuilder().col("my_col").notStartsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "!STARTSWITH(my_col, 'ACC')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "!STARTSWITH(my_col, 'ACC')") } test("Can create column ends with validation") { val result = ValidationBuilder().col("my_col").endsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) - 
assert(result.validation.asInstanceOf[ExpressionValidation].expr == "ENDSWITH(my_col, 'ACC')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "ENDSWITH(my_col, 'ACC')") } test("Can create column not ends with validation") { val result = ValidationBuilder().col("my_col").notEndsWith("ACC") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "!ENDSWITH(my_col, 'ACC')") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "!ENDSWITH(my_col, 'ACC')") } test("Can create column size validation") { val result = ValidationBuilder().col("my_col").size(2) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) == 2") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) == 2") } test("Can create column not size validation") { val result = ValidationBuilder().col("my_col").notSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) != 5") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) != 5") } test("Can create column less than size validation") { val result = ValidationBuilder().col("my_col").lessThanSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) < 5") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) < 5") } test("Can create column less than or equal size validation") { val result = ValidationBuilder().col("my_col").lessThanOrEqualSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) <= 5") + 
assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) <= 5") } test("Can create column greater than size validation") { val result = ValidationBuilder().col("my_col").greaterThanSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) > 5") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) > 5") } test("Can create column greater than or equal size validation") { val result = ValidationBuilder().col("my_col").greaterThanOrEqualSize(5) assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "SIZE(my_col) >= 5") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "SIZE(my_col) >= 5") } test("Can create column greater luhn check validation") { val result = ValidationBuilder().col("my_col").luhnCheck assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "LUHN_CHECK(my_col)") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "LUHN_CHECK(my_col)") } test("Can create column type validation") { val result = ValidationBuilder().col("my_col").hasType("double") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "TYPEOF(my_col) == 'double'") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "TYPEOF(my_col) == 'double'") } test("Can create column generic expression validation") { val result = ValidationBuilder().col("my_col").expr("my_col * 2 < other_col / 4") assert(result.validation.isInstanceOf[ExpressionValidation]) - assert(result.validation.asInstanceOf[ExpressionValidation].expr == "my_col * 2 < other_col / 4") + assert(result.validation.asInstanceOf[ExpressionValidation].whereExpr == "my_col * 2 < other_col 
/ 4") } test("Can create group by column validation") { @@ -446,7 +446,7 @@ class ValidationConfigurationBuilderTest extends AnyFunSuite { assert(validation.joinType == DEFAULT_VALIDATION_JOIN_TYPE) assert(validation.joinCols == List("account_id")) assert(validation.validationBuilder.validation.isInstanceOf[ExpressionValidation]) - assert(validation.validationBuilder.validation.asInstanceOf[ExpressionValidation].expr == "amount <= other_data_source_balance") + assert(validation.validationBuilder.validation.asInstanceOf[ExpressionValidation].whereExpr == "amount <= other_data_source_balance") } test("Can create validation based on data from another data source as an anti-join") { diff --git a/app/src/main/scala/com/github/pflooky/datagen/core/generator/result/ResultHtmlWriter.scala b/app/src/main/scala/com/github/pflooky/datagen/core/generator/result/ResultHtmlWriter.scala index c2307268..01b18cfe 100644 --- a/app/src/main/scala/com/github/pflooky/datagen/core/generator/result/ResultHtmlWriter.scala +++ b/app/src/main/scala/com/github/pflooky/datagen/core/generator/result/ResultHtmlWriter.scala @@ -677,9 +677,10 @@ class ResultHtmlWriter { private def getValidationOptions(validation: Validation): List[List[String]] = { val options = validation match { - case ExpressionValidation(expr) => + case ExpressionValidation(expr, selectExpr) => List( - List("expr", expr), + List("selectExpr", selectExpr), + List("whereExpr", expr), List("errorThreshold", validation.errorThreshold.getOrElse(0.0).toString) ) case GroupByValidation(groupByCols, aggCol, aggType, expr) => diff --git a/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationOperations.scala b/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationOperations.scala index ee771a5d..ec5d22ac 100644 --- a/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationOperations.scala +++ b/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationOperations.scala @@ 
-47,7 +47,8 @@ object ValidationHelper { class ExpressionValidationOps(expressionValidation: ExpressionValidation) extends ValidationOps(expressionValidation) { override def validate(df: DataFrame, dfCount: Long): ValidationResult = { - validateWithExpression(df, dfCount, expressionValidation.expr) + val dfWithSelectExpr = df.selectExpr(expressionValidation.selectExpr) + validateWithExpression(dfWithSelectExpr, dfCount, expressionValidation.whereExpr) } } diff --git a/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationProcessor.scala b/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationProcessor.scala index 47bdfd61..5fd32bf8 100644 --- a/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationProcessor.scala +++ b/app/src/main/scala/com/github/pflooky/datagen/core/validator/ValidationProcessor.scala @@ -119,7 +119,7 @@ class ValidationProcessor( } else { failedValidations.foreach(validationRes => { val (validationType, validationCheck) = validationRes.validation match { - case ExpressionValidation(expr) => ("expression", expr) + case ExpressionValidation(expr, selectExpr) => ("expression", expr) case GroupByValidation(_, _, _, expr) => ("groupByAggregate", expr) //TODO get validationCheck from validationBuilder -> make this a recursive method to get validationCheck case UpstreamDataSourceValidation(validationBuilder, upstreamDataSource, _, _, _) => ("upstreamDataSource", "") diff --git a/app/src/test/scala/com/github/pflooky/datagen/core/model/ValidationOperationsTest.scala b/app/src/test/scala/com/github/pflooky/datagen/core/model/ValidationOperationsTest.scala index b1bd07a6..d1cdc64e 100644 --- a/app/src/test/scala/com/github/pflooky/datagen/core/model/ValidationOperationsTest.scala +++ b/app/src/test/scala/com/github/pflooky/datagen/core/model/ValidationOperationsTest.scala @@ -28,6 +28,14 @@ class ValidationOperationsTest extends SparkSuite { assert(result.sampleErrorValues.isEmpty) } + test("Can define 
select expression to run before where expression") { + val validation = ExpressionValidation("median_amount < 1000", "PERCENTILE(amount, 0.5) AS median_amount") + val result = new ExpressionValidationOps(validation).validate(df, 4) + + assert(result.isSuccess) + assert(result.sampleErrorValues.isEmpty) + } + test("Can return empty sample rows when validation is successful from error threshold") { val validation = new ValidationBuilder().expr("amount < 400").errorThreshold(1).validation.asInstanceOf[ExpressionValidation] val result = new ExpressionValidationOps(validation).validate(df, 4) diff --git a/gradle.properties b/gradle.properties index c53e0885..1f49f881 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ groupId=io.github.data-catering -version=0.5.3 +version=0.5.4 scalaVersion=2.12 scalaSpecificVersion=2.12.15