Skip to content

Commit

Permalink
feat(classifier): Add support for excluding list of exact column names (#9472)
Browse files Browse the repository at this point in the history

Co-authored-by: Ethan Cartwright <[email protected]>
  • Loading branch information
ethan-cartwright and ethan-cartwright authored Jan 17, 2024
1 parent acfc315 commit dfb2f7e
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docker/datahub-ingestion-base/base-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
acryl-datahub-classify==0.0.8
acryl-datahub-classify==0.0.9
acryl-PyHive==0.6.16
acryl-sqlglot==20.4.1.dev14
aenum==3.1.15
Expand Down
2 changes: 2 additions & 0 deletions metadata-ingestion/docs/dev_guides/classification.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
| Field | Required | Type | Description | Default |
| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| confidence_level_threshold | | number | | 0.68 |
| strip_exclusion_formatting | | bool | Whether names in the exclusion list are matched after format stripping (case-insensitive, with punctuation and special characters removed) instead of by exact match. | True |
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None |
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
| info_types_config.`key`.exclude_name | | list[string] | Optional list of column names to exclude from classification for this info type. | None |
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | |
Expand Down
2 changes: 1 addition & 1 deletion metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@
"pandas",
"cryptography",
"msal",
"acryl-datahub-classify==0.0.8",
"acryl-datahub-classify==0.0.9",
# spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
"spacy==3.4.3",
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ class Config:
description="Factors and their weights to consider when predicting info types",
alias="prediction_factors_and_weights",
)
ExcludeName: Optional[List[str]] = Field(
default=None,
alias="exclude_name",
description="List of exact column names to exclude from classification for this info type",
)
Name: Optional[NameFactorConfig] = Field(default=None, alias="name")

Description: Optional[DescriptionFactorConfig] = Field(
Expand All @@ -95,6 +100,7 @@ class DataHubClassifierConfig(ConfigModel):
default=0.68,
description="The confidence threshold above which the prediction is considered as a proposal",
)
strip_exclusion_formatting: bool = Field(default=True)
info_types: Optional[List[str]] = Field(
default=None,
description="List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`.",
Expand Down
69 changes: 69 additions & 0 deletions metadata-ingestion/tests/unit/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,72 @@ def test_incorrect_custom_info_type_config():
},
}
)


def test_exclude_name_config():
    """An info type configured with ``ExcludeName`` keeps that list on the parsed config."""
    # Column-name patterns for the email info type; only the name factor is weighted.
    name_patterns = [
        "^.*mail.*id.*$",
        "^.*id.*mail.*$",
        "^.*mail.*add.*$",
        "^.*add.*mail.*$",
        "email",
        "mail",
    ]
    email_address_settings = {
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        "ExcludeName": ["email_sent", "email_received"],
        "Name": {"regex": name_patterns},
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {"prediction_type": "regex", "regex": [], "library": []},
    }

    classifier = DataHubClassifier.create(
        config_dict={
            "confidence_level_threshold": 0.7,
            "info_types_config": {"Email_Address": email_address_settings},
        }
    )

    email_config = classifier.config.info_types_config["Email_Address"]
    assert email_config.ExcludeName is not None
    assert email_config.ExcludeName == ["email_sent", "email_received"]


def test_no_exclude_name_config():
    """An info type configured without ``ExcludeName`` leaves that field as ``None``."""
    # Column-name patterns for the email info type; only the name factor is weighted.
    name_patterns = [
        "^.*mail.*id.*$",
        "^.*id.*mail.*$",
        "^.*mail.*add.*$",
        "^.*add.*mail.*$",
        "email",
        "mail",
    ]
    email_address_settings = {
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        "Name": {"regex": name_patterns},
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {"prediction_type": "regex", "regex": [], "library": []},
    }

    classifier = DataHubClassifier.create(
        config_dict={
            "confidence_level_threshold": 0.7,
            "info_types_config": {"Email_Address": email_address_settings},
        }
    )

    email_config = classifier.config.info_types_config["Email_Address"]
    assert email_config.ExcludeName is None

0 comments on commit dfb2f7e

Please sign in to comment.