From 93bbdcb1198162a4ab4f6a839c5511b6dfd209a2 Mon Sep 17 00:00:00 2001 From: kumar saurav Date: Thu, 5 Sep 2024 00:58:22 +0530 Subject: [PATCH] add: Null keyword completeness validation add: Null keyword completeness validation add: Null keyword completeness validation add: Null keyword completeness validation add: Null keyword completeness validation add: yaml file --- dcs_core/core/common/models/validation.py | 2 ++ dcs_core/core/datasource/manager.py | 4 +-- dcs_core/core/datasource/sql_datasource.py | 25 +++++++++++++++ .../validation/completeness_validation.py | 26 ++++++++++++++++ dcs_core/core/validation/manager.py | 4 +++ docs/validations/validity.md | 24 ++++++++++++++ .../configuration/test_configuration_v1.py | 31 +++++++++++++++++++ .../datasource/test_sql_datasource.py | 23 +++++++++----- 8 files changed, 130 insertions(+), 9 deletions(-) diff --git a/dcs_core/core/common/models/validation.py b/dcs_core/core/common/models/validation.py index 8583abf..774eafa 100644 --- a/dcs_core/core/common/models/validation.py +++ b/dcs_core/core/common/models/validation.py @@ -103,6 +103,8 @@ class ValidationFunction(str, Enum): PERCENT_EMPTY_STRING = "percent_empty_string" COUNT_NAN = "count_nan" PERCENT_NAN = "percent_nan" + COUNT_NULL_KEYWORD = "count_null_keyword" + PERCENT_NULL_KEYWORD = "percent_null_keyboard" # Custom SQL CUSTOM_SQL = "custom_sql" diff --git a/dcs_core/core/datasource/manager.py b/dcs_core/core/datasource/manager.py index e6f9014..44d580c 100644 --- a/dcs_core/core/datasource/manager.py +++ b/dcs_core/core/datasource/manager.py @@ -78,7 +78,7 @@ def _create_data_source( data_source_name = data_source_config.name data_source_type = data_source_config.type if data_source_type == "spark_df": - from datachecks.integrations.databases.spark_df import SparkDFDataSource + from dcs_core.integrations.databases.spark_df import SparkDFDataSource return SparkDFDataSource( data_source_name, @@ -86,7 +86,7 @@ def _create_data_source( ) try: module_name = ( 
-                f"datachecks.integrations.databases.{data_source_config.type.value}"
+                f"dcs_core.integrations.databases.{data_source_config.type.value}"
             )
             module = importlib.import_module(module_name)
             data_source_class = self.DATA_SOURCE_CLASS_NAME_MAPPER[
diff --git a/dcs_core/core/datasource/sql_datasource.py b/dcs_core/core/datasource/sql_datasource.py
index 25e70fa..be18783 100644
--- a/dcs_core/core/datasource/sql_datasource.py
+++ b/dcs_core/core/datasource/sql_datasource.py
@@ -674,3 +674,28 @@ def query_negative_metric(
 
         result = self.fetchone(query)[0]
         return round(result, 2) if operation == "percent" else result
+
+    def query_get_null_keyword_count(
+        self, table: str, field: str, operation: str, filters: str = None
+    ) -> Union[int, float]:
+        """
+        Count, or compute the percentage of, NULL-like keyword values in a column.
+        :param table: table name
+        :param field: column name
+        :param operation: "percent" for a percentage of rows, anything else for a count
+        :param filters: optional SQL condition restricting the rows considered
+        :return: keyword count, or percentage rounded to 2 decimals (0 if no rows match)
+        """
+        qualified_table_name = self.qualified_table_name(table)
+        query = f""" SELECT SUM(CASE WHEN LOWER({field}) IN ('nothing', 'nil', 'null', 'none', 'n/a') THEN 1 ELSE 0 END) AS null_count,COUNT(*) AS total_count
+                    FROM {qualified_table_name}"""
+        # The base query has no WHERE clause, so the filter has to introduce one.
+        if filters:
+            query += f" WHERE {filters}"
+
+        result = self.fetchone(query)
+        if not result or not result[1]:
+            return 0  # no rows matched: SUM() is NULL and a percentage is undefined
+        if operation == "percent":
+            return round((result[0] / result[1]) * 100, 2)
+        return result[0]
diff --git a/dcs_core/core/validation/completeness_validation.py b/dcs_core/core/validation/completeness_validation.py
index 81c6485..9a00210 100644
--- a/dcs_core/core/validation/completeness_validation.py
+++ b/dcs_core/core/validation/completeness_validation.py
@@ -89,3 +89,29 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
             )
         else:
             raise ValueError("Invalid data source type")
+
+
+class CountNullKeywordValidation(Validation):
+    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
+        if isinstance(self.data_source,
SQLDataSource): + return self.data_source.query_get_null_keyword_count( + table=self.dataset_name, + field=self.field_name, + operation="count", + filters=self.where_filter if self.where_filter is not None else None, + ) + else: + raise ValueError("Invalid data source type") + + +class PercentageNullKeywordValidation(Validation): + def _generate_metric_value(self, **kwargs) -> Union[float, int]: + if isinstance(self.data_source, SQLDataSource): + return self.data_source.query_get_null_keyword_count( + table=self.dataset_name, + field=self.field_name, + operation="percent", + filters=self.where_filter if self.where_filter is not None else None, + ) + else: + raise ValueError("Invalid data source type") diff --git a/dcs_core/core/validation/manager.py b/dcs_core/core/validation/manager.py index 2bfe40f..ab3159a 100644 --- a/dcs_core/core/validation/manager.py +++ b/dcs_core/core/validation/manager.py @@ -23,8 +23,10 @@ from dcs_core.core.validation.base import Validation from dcs_core.core.validation.completeness_validation import ( # noqa F401 this is used in globals CountEmptyStringValidation, + CountNullKeywordValidation, CountNullValidation, PercentageEmptyStringValidation, + PercentageNullKeywordValidation, PercentageNullValidation, ) from dcs_core.core.validation.custom_query_validation import ( # noqa F401 this is used in globals @@ -165,6 +167,8 @@ class ValidationManager: ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation", ValidationFunction.COUNT_NEGATIVE.value: "CountNegativeValidation", ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation", + ValidationFunction.COUNT_NULL_KEYWORD.value: "CountNullKeywordValidation", + ValidationFunction.PERCENT_NULL_KEYWORD.value: "PercentageNullKeywordValidation", } def __init__( diff --git a/docs/validations/validity.md b/docs/validations/validity.md index 4ab33da..44c7620 100644 --- a/docs/validations/validity.md +++ b/docs/validations/validity.md @@ -505,3 +505,27 @@ validations for 
product_db.products: on: percent_negative(price) threshold: "< 40" ``` +## COUNT_NULL_KEYWORD + +The count null keyword validation counts the values in a column that are null-like keywords (`null`, `nil`, `none`, `nothing`, `n/a`, compared case-insensitively). + +**Example** + +```yaml title="dcs_config.yaml" +validations for product_db.products: + - count_null_keyword: + on: count_null_keyword(keyword) + threshold: <=10 +``` + +## PERCENT_NULL_KEYWORD + +The percent null keyword validation computes the percentage of values in a column that are null-like keywords (`null`, `nil`, `none`, `nothing`, `n/a`, compared case-insensitively). + +**Example** + +```yaml title="dcs_config.yaml" +validations for product_db.products: + - percent_null_keyword: + on: percent_null_keyboard(keyword) +``` diff --git a/tests/core/configuration/test_configuration_v1.py b/tests/core/configuration/test_configuration_v1.py index aa756b3..a0a4f7a 100644 --- a/tests/core/configuration/test_configuration_v1.py +++ b/tests/core/configuration/test_configuration_v1.py @@ -1025,3 +1025,34 @@ def test_should_parse_percent_negative_validation(): .get_validation_function == ValidationFunction.PERCENT_NEGATIVE ) + + +def test_should_parse_count_null_keyword(): + yaml_string = """ + validations for source.table: + - test: + on: count_null_keyword(keyword) + threshold: <=10 + """ + configuration = load_configuration_from_yaml_str(yaml_string) + assert ( + configuration.validations["source.table"] + .validations["test"] + .get_validation_function + == ValidationFunction.COUNT_NULL_KEYWORD + ) + + +def test_should_parse_percent_null_keyword(): + yaml_string = """ + validations for source.table: + - test: + on: percent_null_keyboard(keyword) + """ + configuration = load_configuration_from_yaml_str(yaml_string) + assert ( + configuration.validations["source.table"] + .validations["test"] + .get_validation_function + == ValidationFunction.PERCENT_NULL_KEYWORD + ) diff --git a/tests/integration/datasource/test_sql_datasource.py b/tests/integration/datasource/test_sql_datasource.py index d5f414e..8d8d7a6 100644 --- a/tests/integration/datasource/test_sql_datasource.py
+++ b/tests/integration/datasource/test_sql_datasource.py @@ -132,7 +132,8 @@ def setup_tables( isin VARCHAR(12), perm_id VARCHAR(50), salary INTEGER, - price FLOAT + price FLOAT, + null_keyword VARCHAR(50) ) """ ) @@ -144,27 +145,27 @@ def setup_tables( ('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}', 1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec', '123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340', - 'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id + 'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0,'null'), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id ('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}', 90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890', 'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06', - 'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn + 'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0,'Alvin'), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn ('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}', 50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890', 'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586', - 'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id + 
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0,'nil'), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id ('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}', 40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890', 'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345', - 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol + 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0,'Simon'), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol ('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890', 'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7', - '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id + '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0,'None'), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id ('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890', 'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY', - '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0) -- invalid isin -- invalid sedol + '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0,'Ram') 
-- invalid isin -- invalid sedol """ postgresql_connection.execute(text(insert_query)) @@ -641,3 +642,11 @@ def test_should_return_percent_negative( table=self.TABLE_NAME, field="price", operation="percent" ) assert round(percent_negative, 2) == 50.0 + + def test_should_return_row_count_for_null_keyword( + self, postgres_datasource: PostgresDataSource + ): + valid_count = postgres_datasource.query_get_null_keyword_count( + table=self.TABLE_NAME, field="null_keyword", operation="count" + ) + assert valid_count == 3