From 195148c9e36b9f66d1c088771b202246fe706023 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Thu, 7 Mar 2024 16:07:23 +0100 Subject: [PATCH 1/9] add unit test that successfully checks the bug --- soda/core/tests/data_source/test_samples.py | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index 168d9d1ca..e906cdee1 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -479,3 +479,25 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix failed_ids = [sample[0] for sample in scan._configuration.sampler.samples[0].rows] assert sorted(failed_ids) == sorted(["ID5", "ID7"]) + +def test_missing_filtered_with_dataset_filter(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + # Row count is 10 + scan = data_source_fixture.create_test_scan() + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + filter {table_name} [not_null_id]: + where: id IS NOT NULL + + checks for {table_name} [not_null_id]: + - missing_count(pct) = 1: + missing values: [No value, N/A, error] + filter: cst_size IS NOT NULL or cst_size_txt IS NOT NULL + """ + ) + scan.execute() + + scan.assert_all_checks_pass() + assert mock_soda_cloud.find_failed_rows_line_count(0) == 1 \ No newline at end of file From e98c88f672a77a1cfeb252f99cc3531f16356436 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Thu, 7 Mar 2024 16:12:25 +0100 Subject: [PATCH 2/9] add fix for missing check --- soda/core/soda/execution/metric/numeric_query_metric.py | 2 +- soda/core/tests/data_source/test_samples.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/soda/core/soda/execution/metric/numeric_query_metric.py b/soda/core/soda/execution/metric/numeric_query_metric.py index acc51b1f0..7b3fd8e5e 100644 --- a/soda/core/soda/execution/metric/numeric_query_metric.py +++ b/soda/core/soda/execution/metric/numeric_query_metric.py @@ -323,7 +323,7 @@ def create_failed_rows_sample_query(self) -> SampleQuery | None: where_clauses.append(invalid_condition) if self.filter: - where_clauses.append(self.filter) + where_clauses.append(f"({self.filter})") passing_where_clauses.append(self.filter) where_sql = " AND ".join(where_clauses) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index e906cdee1..bd8cb70b5 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -480,7 +480,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix failed_ids = [sample[0] for sample in scan._configuration.sampler.samples[0].rows] assert sorted(failed_ids) == sorted(["ID5", "ID7"]) -def test_missing_filtered_with_dataset_filter(data_source_fixture: DataSourceFixture): +def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) # Row count is 10 scan = data_source_fixture.create_test_scan() From 70469f0acbef086f083b54ba5b1536347d6f437c Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Thu, 7 Mar 2024 16:55:45 +0100 Subject: [PATCH 3/9] add invalid_count example --- soda/core/tests/data_source/test_samples.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index bd8cb70b5..bccf6bfde 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -495,9 +495,14 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi - missing_count(pct) = 1: missing values: [No value, N/A, error] filter: cst_size IS NOT NULL or cst_size_txt IS NOT NULL + - invalid_count(cat) > 0: + valid values: ['HIGH'] + filter: country = 'BE' OR country = 'NL' """ ) scan.execute() scan.assert_all_checks_pass() - assert mock_soda_cloud.find_failed_rows_line_count(0) == 1 \ No newline at end of file + + assert mock_soda_cloud.find_failed_rows_line_count(0) == 1 + assert mock_soda_cloud.find_failed_rows_line_count(1) == 2 \ No newline at end of file From 0e13c02f093ec60b0d810b3aa459f24a2a5b4f35 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Mon, 11 Mar 2024 12:05:18 +0100 Subject: [PATCH 4/9] add parentheses to passing_sql filter as well --- soda/core/soda/execution/metric/numeric_query_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soda/core/soda/execution/metric/numeric_query_metric.py b/soda/core/soda/execution/metric/numeric_query_metric.py index 7b3fd8e5e..837147b85 100644 --- a/soda/core/soda/execution/metric/numeric_query_metric.py +++ b/soda/core/soda/execution/metric/numeric_query_metric.py @@ -324,7 +324,7 @@ def create_failed_rows_sample_query(self) -> SampleQuery | None: if self.filter: where_clauses.append(f"({self.filter})") - passing_where_clauses.append(self.filter) + passing_where_clauses.append(f"({self.filter})") where_sql = " AND ".join(where_clauses) passing_where_sql = " AND ".join(passing_where_clauses) From 2437054b1e55cedde78406cb8d6ce888d8ccf25e Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Mon, 11 Mar 2024 16:36:02 +0100 Subject: [PATCH 5/9] add fix for duplicate queries --- .../soda/execution/query/duplicates_query.py | 2 +- .../core/tests/data_source/test_duplicates.py | 2 +- soda/core/tests/data_source/test_samples.py | 24 +++++++++++++++---- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/soda/core/soda/execution/query/duplicates_query.py b/soda/core/soda/execution/query/duplicates_query.py index 175d2f3e0..19e7fccd3 100644 --- a/soda/core/soda/execution/query/duplicates_query.py +++ b/soda/core/soda/execution/query/duplicates_query.py @@ -23,7 +23,7 @@ def __init__(self, partition: "Partition", metric: "Metric"): values_filter_clauses.append(resolved_partition_filter) if metric.filter: - values_filter_clauses.append(metric.filter) + values_filter_clauses.append(f"({metric.filter})") values_filter = " \n AND ".join(values_filter_clauses) diff --git a/soda/core/tests/data_source/test_duplicates.py b/soda/core/tests/data_source/test_duplicates.py index dedc279c3..a203ea40c 100644 --- a/soda/core/tests/data_source/test_duplicates.py +++ b/soda/core/tests/data_source/test_duplicates.py @@ -69,4 +69,4 @@ def test_duplicates_with_filter(data_source_fixture: DataSourceFixture): scan.execute() scan.assert_all_checks_pass() - scan.assert_log("AND country = 'NL'") + scan.assert_log("AND (country = 'NL')") diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index bccf6bfde..b75e97aca 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -482,7 +482,6 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) - # Row count is 10 scan = data_source_fixture.create_test_scan() mock_soda_cloud = scan.enable_mock_soda_cloud() scan.enable_mock_sampler() @@ -494,15 +493,30 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi checks for {table_name} [not_null_id]: - missing_count(pct) = 1: missing values: [No value, N/A, error] - filter: cst_size IS NOT NULL or cst_size_txt IS NOT NULL - - invalid_count(cat) > 0: + filter: cst_size IS NOT NULL OR + cst_size_txt IS NOT NULL + - missing_percent(pct) < 20: + missing values: [No value, N/A, error] + filter: | + "cst_size" IS NOT NULL or "cst_size_txt" IS NOT NULL + - invalid_count(cat) < 3: + valid values: ['HIGH'] + filter: country = 'BE' OR country = 'NL' + - invalid_percent(cat) < 30: valid values: ['HIGH'] filter: country = 'BE' OR country = 'NL' + - duplicate_count(cat) = 1: + filter: country = 'BE' OR country = 'NL' + - duplicate_percent(cat) < 20: + filter: country = 'BE' OR country = 'NL' """ ) scan.execute() - scan.assert_all_checks_pass() assert mock_soda_cloud.find_failed_rows_line_count(0) == 1 - assert mock_soda_cloud.find_failed_rows_line_count(1) == 2 \ No newline at end of file + assert mock_soda_cloud.find_failed_rows_line_count(1) == 1 + assert mock_soda_cloud.find_failed_rows_line_count(2) == 2 + assert mock_soda_cloud.find_failed_rows_line_count(3) == 2 + assert mock_soda_cloud.find_failed_rows_line_count(4) == 3 + assert mock_soda_cloud.find_failed_rows_line_count(5) == 3 \ No newline at end of file From 348ecdaae314401c6e6867288f7e69a23a1b73b5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:14:19 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- soda/core/tests/data_source/test_samples.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index b75e97aca..67b07b164 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -480,6 +480,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix failed_ids = [sample[0] for sample in scan._configuration.sampler.samples[0].rows] assert sorted(failed_ids) == sorted(["ID5", "ID7"]) + def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) scan = data_source_fixture.create_test_scan() @@ -493,7 +494,7 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi checks for {table_name} [not_null_id]: - missing_count(pct) = 1: missing values: [No value, N/A, error] - filter: cst_size IS NOT NULL OR + filter: cst_size IS NOT NULL OR cst_size_txt IS NOT NULL - missing_percent(pct) < 20: missing values: [No value, N/A, error] @@ -519,4 +520,4 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi assert mock_soda_cloud.find_failed_rows_line_count(2) == 2 assert mock_soda_cloud.find_failed_rows_line_count(3) == 2 assert mock_soda_cloud.find_failed_rows_line_count(4) == 3 - assert mock_soda_cloud.find_failed_rows_line_count(5) == 3 \ No newline at end of file + assert mock_soda_cloud.find_failed_rows_line_count(5) == 3 From da502239b86e064a77b4b609817446d0fbe79d33 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Tue, 12 Mar 2024 11:15:03 +0100 Subject: [PATCH 7/9] move to pytest.mark.parametrize --- soda/core/tests/data_source/test_samples.py | 92 +++++++++++++++------ 1 file changed, 65 insertions(+), 27 deletions(-) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index 67b07b164..5daacc85b 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -481,7 +481,65 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix assert sorted(failed_ids) == sorted(["ID5", "ID7"]) -def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture): +@pytest.mark.parametrize( + "check,sample_count,samples", + [ + pytest.param( + """ + - missing_count(pct) = 1: + missing values: [No value, N/A, error] + filter: cst_size IS NOT NULL OR cst_size_txt IS NOT NULL + """, + 1, + ['ID7'] + ), + pytest.param( + """ + - missing_percent(pct) < 20: + missing values: [No value, N/A, error] + filter: | + "cst_size" IS NOT NULL or "cst_size_txt" IS NOT NULL + """, + 1, + ['ID7'] + ), + pytest.param( + """ + - invalid_count(cat) < 3: + valid values: ['HIGH'] + filter: country = 'BE' OR country = 'NL' + """, + 2, + ['ID3', 'ID4'] + ), + pytest.param( + """ + - invalid_percent(cat) < 30: + valid values: ['HIGH'] + filter: country = 'BE' OR country = 'NL' + """, + 2, + ['ID3', 'ID4'] + ), + pytest.param( + """ + - duplicate_count(cat) = 1: + filter: country = 'BE' OR country = 'NL' + """, + 3, + ['ID1', 'ID2', None] + ), + pytest.param( + """ + - duplicate_percent(cat) < 20: + filter: country = 'BE' OR country = 'NL' + """, + 3, + ['ID1', 'ID2', None] + ) + ] +) +def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture, check, sample_count, samples): table_name = data_source_fixture.ensure_test_table(customers_test_table) scan = data_source_fixture.create_test_scan() mock_soda_cloud = scan.enable_mock_soda_cloud() @@ -492,32 +550,12 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi where: id IS NOT NULL checks for {table_name} [not_null_id]: - - missing_count(pct) = 1: - missing values: [No value, N/A, error] - filter: cst_size IS NOT NULL OR - cst_size_txt IS NOT NULL - - missing_percent(pct) < 20: - missing values: [No value, N/A, error] - filter: | - "cst_size" IS NOT NULL or "cst_size_txt" IS NOT NULL - - invalid_count(cat) < 3: - valid values: ['HIGH'] - filter: country = 'BE' OR country = 'NL' - - invalid_percent(cat) < 30: - valid values: ['HIGH'] - filter: country = 'BE' OR country = 'NL' - - duplicate_count(cat) = 1: - filter: country = 'BE' OR country = 'NL' - - duplicate_percent(cat) < 20: - filter: country = 'BE' OR country = 'NL' - """ + {check} + """ ) + scan.execute() scan.assert_all_checks_pass() - - assert mock_soda_cloud.find_failed_rows_line_count(0) == 1 - assert mock_soda_cloud.find_failed_rows_line_count(1) == 1 - assert mock_soda_cloud.find_failed_rows_line_count(2) == 2 - assert mock_soda_cloud.find_failed_rows_line_count(3) == 2 - assert mock_soda_cloud.find_failed_rows_line_count(4) == 3 - assert mock_soda_cloud.find_failed_rows_line_count(5) == 3 + assert mock_soda_cloud.find_failed_rows_line_count(0) == sample_count + failed_ids = [sample[0] for sample in scan._configuration.sampler.samples[0].rows] + assert set(failed_ids) == set(samples) \ No newline at end of file From 07c129832bcb2afc490764eb9dfb6f4d609260be Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:19:41 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- soda/core/tests/data_source/test_samples.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index 5daacc85b..48c5f8d3e 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -491,7 +491,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix filter: cst_size IS NOT NULL OR cst_size_txt IS NOT NULL """, 1, - ['ID7'] + ["ID7"], ), pytest.param( """ @@ -501,7 +501,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix "cst_size" IS NOT NULL or "cst_size_txt" IS NOT NULL """, 1, - ['ID7'] + ["ID7"], ), pytest.param( """ @@ -510,7 +510,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix filter: country = 'BE' OR country = 'NL' """, 2, - ['ID3', 'ID4'] + ["ID3", "ID4"], ), pytest.param( """ @@ -519,7 +519,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix filter: country = 'BE' OR country = 'NL' """, 2, - ['ID3', 'ID4'] + ["ID3", "ID4"], ), pytest.param( """ @@ -527,7 +527,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix filter: country = 'BE' OR country = 'NL' """, 3, - ['ID1', 'ID2', None] + ["ID1", "ID2", None], ), pytest.param( """ @@ -535,9 +535,9 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix filter: country = 'BE' OR country = 'NL' """, 3, - ['ID1', 'ID2', None] - ) - ] + ["ID1", "ID2", None], + ), + ], ) def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFixture, check, sample_count, samples): table_name = data_source_fixture.ensure_test_table(customers_test_table) @@ -558,4 +558,4 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi scan.assert_all_checks_pass() assert mock_soda_cloud.find_failed_rows_line_count(0) == sample_count failed_ids = [sample[0] for sample in scan._configuration.sampler.samples[0].rows] - assert set(failed_ids) == set(samples) \ No newline at end of file + assert set(failed_ids) == set(samples) From dda387ff6c5a5eaab575d7902225f742701e9649 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Tue, 12 Mar 2024 13:52:11 +0100 Subject: [PATCH 9/9] remove the test formatting of filter --- soda/core/tests/data_source/test_samples.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/soda/core/tests/data_source/test_samples.py b/soda/core/tests/data_source/test_samples.py index 48c5f8d3e..0dddf15ae 100644 --- a/soda/core/tests/data_source/test_samples.py +++ b/soda/core/tests/data_source/test_samples.py @@ -497,8 +497,7 @@ def test_sample_with_multiple_value_condition(data_source_fixture: DataSourceFix """ - missing_percent(pct) < 20: missing values: [No value, N/A, error] - filter: | - "cst_size" IS NOT NULL or "cst_size_txt" IS NOT NULL + filter: cst_size IS NOT NULL or cst_size_txt IS NOT NULL """, 1, ["ID7"], @@ -553,7 +552,6 @@ def test_missing_with_check_and_dataset_filter(data_source_fixture: DataSourceFi {check} """ ) - scan.execute() scan.assert_all_checks_pass() assert mock_soda_cloud.find_failed_rows_line_count(0) == sample_count