From dfb2f7e5967a0914c92f7caea96c5e83b3c916c7 Mon Sep 17 00:00:00 2001 From: ethan-cartwright Date: Tue, 16 Jan 2024 22:12:23 -0500 Subject: [PATCH] feat(classifier): Add support for excluding list of exact column names (#9472) Co-authored-by: Ethan Cartwright --- .../base-requirements.txt | 2 +- .../docs/dev_guides/classification.md | 2 + metadata-ingestion/setup.py | 2 +- .../ingestion/glossary/datahub_classifier.py | 6 ++ .../tests/unit/test_classification.py | 69 +++++++++++++++++++ 5 files changed, 79 insertions(+), 2 deletions(-) diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 9092875902794..ddd8e55fa5c68 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -1,5 +1,5 @@ # Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate. -acryl-datahub-classify==0.0.8 +acryl-datahub-classify==0.0.9 acryl-PyHive==0.6.16 acryl-sqlglot==20.4.1.dev14 aenum==3.1.15 diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md index 8adbcee234c4a..04318d06bca71 100644 --- a/metadata-ingestion/docs/dev_guides/classification.md +++ b/metadata-ingestion/docs/dev_guides/classification.md @@ -31,9 +31,11 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d | Field | Required | Type | Description | Default | | ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | confidence_level_threshold | | number | | 0.68 | +| strip_exclusion_formatting | | bool | A flag that determines whether the exclusion list uses exact matching or format stripping (case-insensitivity, punctuation removal, and special character removal). | True | | info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None | | info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. | | info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | | +| info_types_config.`key`.exclude_name | | list[string] | Optional list of names to exclude from classification. | None | | info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | | | info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] | | info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | | diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index cad5e750b492c..34e8167a997f6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -189,7 +189,7 @@ "pandas", "cryptography", "msal", - "acryl-datahub-classify==0.0.8", + "acryl-datahub-classify==0.0.9", # spacy version restricted to reduce backtracking, used by acryl-datahub-classify, "spacy==3.4.3", } diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index 42eb930c80f9d..94a65d887efbc 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -73,6 +73,11 @@ class Config: description="Factors and their weights to consider when predicting info types", alias="prediction_factors_and_weights", ) + ExcludeName: Optional[List[str]] = Field( + default=None, + alias="exclude_name", + description="List of exact column names to exclude from classification for this info type", + ) Name: Optional[NameFactorConfig] = Field(default=None, alias="name") Description: Optional[DescriptionFactorConfig] = Field( @@ -95,6 +100,7 @@ class DataHubClassifierConfig(ConfigModel): default=0.68, description="The confidence threshold above which the prediction is considered as a proposal", ) + strip_exclusion_formatting: bool = Field(default=True) info_types: Optional[List[str]] = Field( default=None, description="List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`.", diff --git a/metadata-ingestion/tests/unit/test_classification.py b/metadata-ingestion/tests/unit/test_classification.py index 45ee7e1dc3633..c79ae5808b2a6 100644 --- a/metadata-ingestion/tests/unit/test_classification.py +++ b/metadata-ingestion/tests/unit/test_classification.py @@ -157,3 +157,72 @@ def test_incorrect_custom_info_type_config(): }, } ) + + +def test_exclude_name_config(): + config = DataHubClassifier.create( + config_dict={ + "confidence_level_threshold": 0.7, + "info_types_config": { + "Email_Address": { + "Prediction_Factors_and_Weights": { + "Name": 1, + "Description": 0, + "Datatype": 0, + "Values": 0, + }, + "ExcludeName": ["email_sent", "email_received"], + "Name": { + "regex": [ + "^.*mail.*id.*$", + "^.*id.*mail.*$", + "^.*mail.*add.*$", + "^.*add.*mail.*$", + "email", + "mail", + ] + }, + "Description": {"regex": []}, + "Datatype": {"type": ["str"]}, + "Values": {"prediction_type": "regex", "regex": [], "library": []}, + } + }, + } + ).config + assert config.info_types_config["Email_Address"].ExcludeName is not None + assert config.info_types_config["Email_Address"].ExcludeName == [ + "email_sent", + "email_received", + ] + + +def test_no_exclude_name_config(): + config = DataHubClassifier.create( + config_dict={ + "confidence_level_threshold": 0.7, + "info_types_config": { + "Email_Address": { + "Prediction_Factors_and_Weights": { + "Name": 1, + "Description": 0, + "Datatype": 0, + "Values": 0, + }, + "Name": { + "regex": [ + "^.*mail.*id.*$", + "^.*id.*mail.*$", + "^.*mail.*add.*$", + "^.*add.*mail.*$", + "email", + "mail", + ] + }, + "Description": {"regex": []}, + "Datatype": {"type": ["str"]}, + "Values": {"prediction_type": "regex", "regex": [], "library": []}, + } + }, + } + ).config + assert config.info_types_config["Email_Address"].ExcludeName is None