Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [DCS-248] add count_null_keyword and percent_null_keyword functions #253

Merged
merged 1 commit into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class ValidationFunction(str, Enum):
PERCENT_NAN = "percent_nan"
COUNT_ALL_SPACE = "count_all_space"
PERCENT_ALL_SPACE = "percent_all_space"
COUNT_NULL_KEYWORD = "count_null_keyword"
# NOTE(review): the value below is spelled "percent_null_keyboard" — this looks
# like a typo for "percent_null_keyword". The documentation example and the
# configuration parsing test use the same "keyboard" spelling, so any rename
# has to be applied to all of them together.
PERCENT_NULL_KEYWORD = "percent_null_keyboard"

# Custom SQL
CUSTOM_SQL = "custom_sql"
Expand Down
25 changes: 25 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,3 +699,28 @@ def query_get_all_space_count(
return round((result[0] / result[1]) * 100) if result[1] > 0 else 0

return result[0] if result else 0

def query_get_null_keyword_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count or percentage of NULL-like values (specific keywords) in the specified column.

    A value is NULL-like when its lower-cased form is one of
    'nothing', 'nil', 'null', 'none' or 'n/a'.

    :param table: table name
    :param field: column name
    :param operation: "percent" returns the percentage of matching rows
        (rounded to two decimals); any other value returns the raw count
    :param filters: optional SQL condition restricting the rows considered
    :return: count of NULL-like keyword values, or their percentage of the
        selected rows; 0 when no rows are selected
    """
    qualified_table_name = self.qualified_table_name(table)

    query = (
        f"SELECT SUM(CASE WHEN LOWER({field}) IN "
        "('nothing', 'nil', 'null', 'none', 'n/a') THEN 1 ELSE 0 END) AS null_count, "
        "COUNT(*) AS total_count "
        f"FROM {qualified_table_name}"
    )

    if filters:
        # The base query has no WHERE clause, so the filter has to start one;
        # appending it with AND would produce invalid SQL.
        query += f" WHERE {filters}"

    result = self.fetchone(query)
    if not result:
        return 0

    # SUM() yields NULL (None) when no rows are selected; normalise to 0.
    null_count = result[0] or 0
    total_count = result[1] or 0

    if operation == "percent":
        return round((null_count / total_count) * 100, 2) if total_count > 0 else 0

    return null_count
26 changes: 26 additions & 0 deletions dcs_core/core/validation/completeness_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,29 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
)
else:
raise ValueError("Invalid data source type")


class CountNullKeywordValidation(Validation):
    """Validation whose metric is the number of NULL-like keyword values in a field."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Ask the SQL data source for the NULL-like keyword count of the field.

        :return: value returned by ``query_get_null_keyword_count`` with operation "count"
        :raises ValueError: if the data source is not a SQLDataSource
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )


class PercentageNullKeywordValidation(Validation):
    """Validation whose metric is the percentage of NULL-like keyword values in a field."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Ask the SQL data source for the NULL-like keyword percentage of the field.

        :return: value returned by ``query_get_null_keyword_count`` with operation "percent"
        :raises ValueError: if the data source is not a SQLDataSource
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@
from dcs_core.core.validation.completeness_validation import ( # noqa F401 this is used in globals
CountAllSpaceValidation,
CountEmptyStringValidation,
CountNullKeywordValidation,
CountNullValidation,
PercentageAllSpaceValidation,
PercentageEmptyStringValidation,
PercentageNullKeywordValidation,
PercentageNullValidation,
)
from dcs_core.core.validation.custom_query_validation import ( # noqa F401 this is used in globals
Expand Down Expand Up @@ -169,6 +171,8 @@ class ValidationManager:
ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation",
ValidationFunction.COUNT_ALL_SPACE.value: "CountAllSpaceValidation",
ValidationFunction.PERCENT_ALL_SPACE.value: "PercentageAllSpaceValidation",
ValidationFunction.COUNT_NULL_KEYWORD.value: "CountNullKeywordValidation",
ValidationFunction.PERCENT_NULL_KEYWORD.value: "PercentageNullKeywordValidation",
}

def __init__(
Expand Down
25 changes: 25 additions & 0 deletions docs/validations/validity.md
Original file line number Diff line number Diff line change
Expand Up @@ -529,3 +529,28 @@ validations for product_db.products:
- percent_all_space:
on: percent_all_space(space)
```

## COUNT_NULL_KEYWORD

The count null keyword validation counts the values in a column that match a null-like keyword (`nothing`, `nil`, `null`, `none`, or `n/a`, compared case-insensitively).

**Example**

```yaml title="dcs_config.yaml"
validations for product_db.products:
- count_null_keyword:
on: count_null_keyword(keyword)
threshold: <=10
```

## PERCENT_NULL_KEYWORD

The percent null keyword validation calculates the percentage of values in a column that match a null-like keyword (`nothing`, `nil`, `null`, `none`, or `n/a`, compared case-insensitively).

**Example**

```yaml title="dcs_config.yaml"
validations for product_db.products:
- percent_null_keyword:
on: percent_null_keyboard(keyword)
```
31 changes: 31 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,3 +1056,34 @@ def test_should_parse_percent_all_space():
.get_validation_function
== ValidationFunction.PERCENT_ALL_SPACE
)


# Checks that a validation whose `on:` expression is `count_null_keyword(...)`
# is parsed to ValidationFunction.COUNT_NULL_KEYWORD.
def test_should_parse_count_null_keyword():
# Minimal configuration: one validation named "test" on source.table.
yaml_string = """
validations for source.table:
- test:
on: count_null_keyword(keyword)
threshold: <=10
"""
configuration = load_configuration_from_yaml_str(yaml_string)
# Look the validation up by dataset key and name, then compare its function.
assert (
configuration.validations["source.table"]
.validations["test"]
.get_validation_function
== ValidationFunction.COUNT_NULL_KEYWORD
)


# Checks that the percent variant of the null-keyword validation is parsed to
# ValidationFunction.PERCENT_NULL_KEYWORD.
# NOTE(review): the expression below is spelled `percent_null_keyboard`, which
# matches the string value currently assigned to PERCENT_NULL_KEYWORD — likely
# a typo for `percent_null_keyword`; confirm and rename everywhere together.
def test_should_parse_percent_null_keyword():
# Minimal configuration: one validation named "test" on source.table, no threshold.
yaml_string = """
validations for source.table:
- test:
on: percent_null_keyboard(keyword)
"""
configuration = load_configuration_from_yaml_str(yaml_string)
# Look the validation up by dataset key and name, then compare its function.
assert (
configuration.validations["source.table"]
.validations["test"]
.get_validation_function
== ValidationFunction.PERCENT_NULL_KEYWORD
)
23 changes: 16 additions & 7 deletions tests/integration/datasource/test_sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ def setup_tables(
perm_id VARCHAR(50),
salary INTEGER,
price FLOAT,
all_space VARCHAR(50)
all_space VARCHAR(50),
null_keyword VARCHAR(50)
)
"""
)
Expand All @@ -145,27 +146,27 @@ def setup_tables(
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0,'Allen'), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0,'Allen','null'), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'[email protected] ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0,' '), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0,' ','Alvin'), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'[email protected]', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0,'Ram'), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0,'Ram','nil'), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0,' '), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0,' ','Simon'), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'[email protected]', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0,'Simon'), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0,'Simon','None'), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'[email protected]', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0,'Sam') -- invalid isin -- invalid sedol
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0,'Sam','Ram') -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
Expand Down Expand Up @@ -650,3 +651,11 @@ def test_should_return_row_count_for_all_space(
table=self.TABLE_NAME, field="all_space", operation="count"
)
assert valid_count == 2

def test_should_return_row_count_for_null_keyword(
    self, postgres_datasource: PostgresDataSource
):
    """The `null_keyword` column should report three NULL-like keyword rows."""
    null_keyword_rows = postgres_datasource.query_get_null_keyword_count(
        table=self.TABLE_NAME,
        field="null_keyword",
        operation="count",
    )
    # The fixture rows holding 'null', 'nil' and 'None' are the ones that match.
    assert null_keyword_rows == 3
Loading