-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feature: First version of spacy plugin for piicatcher
A piicatcher plugin that uses spaCy to scan column data. By default it downloads and uses the en_US_core_news_lg model.
- Loading branch information
Showing
11 changed files
with
3,083 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# This workflow will install Python dependencies, run tests and lint with a single version of Python | ||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions | ||
|
||
name: piicatcher_spacy | ||
on: | ||
push: | ||
branches: [ master ] | ||
pull_request: | ||
branches: [ master ] | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
python-version: ['3.6', '3.7', '3.8'] | ||
name: Python ${{ matrix.python-version }} | ||
steps: | ||
- uses: actions/checkout@v2 | ||
- uses: actions/setup-python@v2 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
- name: Install Python Poetry | ||
uses: abatilo/[email protected] | ||
- name: View poetry version | ||
run: poetry --version | ||
- name: Install dependencies | ||
run: | | ||
python -m poetry install | ||
- name: Test with pytest | ||
run: | | ||
python -m poetry run pytest --junitxml=junit/test-results.xml --cov=piicatcher --cov-report=xml --cov-report=html tests/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
name: Upload Python Package | ||
|
||
on: | ||
push: | ||
# Sequence of patterns matched against refs/tags | ||
tags: | ||
- 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10 | ||
|
||
jobs: | ||
release: | ||
name: Create Release | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout code | ||
uses: actions/checkout@master | ||
- name: Build Changelog | ||
id: github_release | ||
uses: mikepenz/release-changelog-builder-action@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
- name: Create Release | ||
id: create_release | ||
uses: softprops/action-gh-release@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token | ||
with: | ||
body: ${{steps.github_release.outputs.changelog}} | ||
draft: false | ||
prerelease: false | ||
deploy: | ||
needs: release | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
- uses: actions/setup-python@v2 | ||
with: | ||
python-version: "3.8" | ||
- name: Install Python Poetry | ||
uses: abatilo/[email protected] | ||
- name: View poetry version | ||
run: poetry --version | ||
- name: Install dependencies | ||
run: | | ||
python -m poetry install | ||
- name: Build and publish | ||
env: | ||
PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} | ||
PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} | ||
run: | | ||
poetry publish --build --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,3 +127,5 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
repos: | ||
- repo: local | ||
hooks: | ||
- id: isort | ||
name: isort | ||
stages: [commit] | ||
language: system | ||
entry: poetry run isort | ||
types: [python] | ||
|
||
- id: black | ||
name: black | ||
stages: [commit] | ||
language: system | ||
entry: poetry run black | ||
types: [python] | ||
|
||
- id: flake8 | ||
name: flake8 | ||
stages: [commit] | ||
language: system | ||
entry: poetry run flake8 | ||
types: [python] | ||
exclude: setup.py | ||
|
||
- id: mypy | ||
name: mypy | ||
stages: [commit] | ||
language: system | ||
entry: poetry run mypy | ||
types: [python] | ||
pass_filenames: false |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .spacy import SpacyDetector |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import importlib | ||
import logging | ||
from typing import Optional | ||
|
||
import spacy | ||
from dbcat.catalog import CatColumn | ||
from dbcat.catalog.pii_types import PiiType | ||
from piicatcher import Address, Person, BirthDate | ||
from piicatcher.detectors import register_detector, DatumDetector | ||
|
||
|
||
LOGGER = logging.getLogger(__name__) | ||
|
||
|
||
@register_detector
class SpacyDetector(DatumDetector):
    """Datum detector that classifies values using spaCy named entity recognition.

    On construction the detector verifies that spaCy v3 is installed, makes
    sure the requested model is available (downloading it when it is not),
    loads it, and checks that the loaded pipeline contains an ``ner`` step.
    """

    # spaCy entity label -> piicatcher PII class.
    # NOTE(review): detect() only acts on PERSON, GPE and DATE; the remaining
    # entries (FAC, LOC, PER) are not consulted anywhere in this class.
    pii_cls_map = {
        'FAC': Address,  # Buildings, airports, highways, bridges, etc.
        'GPE': Address,  # Countries, cities, states.
        'LOC': Address,  # Non-GPE locations, mountain ranges, bodies of water.
        'PERSON': Person,  # People, including fictional.
        'PER': Person,  # Bug in french model
        # Intended for dates within the period 18 to 100 years ago; no range
        # check is applied in this class.
        'DATE': BirthDate,
    }
    name = 'DatumSpacyDetector'

    # NOTE(review): confirm that "en_US_core_news_lg" is a resolvable spaCy
    # model name; the default is kept unchanged for backward compatibility.
    def __init__(self, model: str = "en_US_core_news_lg"):
        """Load the spaCy model used for detection.

        Args:
            model: Name of the spaCy model to load. It is downloaded first if
                spaCy does not report it as installed.

        Raises:
            ImportError: If the installed spaCy is not version 3.
            ValueError: If the model cannot be found or its pipeline has no
                named entity recognition step.
        """
        super(SpacyDetector, self).__init__()

        # Fixes a warning message from transformers that is pulled in via spacy
        import os
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
        self.check_spacy_version()

        # Remember the model name; the error messages below used to read
        # ``self.model`` without it ever being assigned.
        self.model = model

        if not self.check_spacy_model(model):
            raise ValueError("Unable to find spacy model '{}'. Is your language supported? "
                             "Check the list of models available here: "
                             "https://github.com/explosion/spacy-models ".format(model))

        self.nlp = spacy.load(model)

        # If the model doesn't support named entity recognition
        if 'ner' not in [step[0] for step in self.nlp.pipeline]:
            raise ValueError(
                "The spacy model '{}' doesn't support named entity recognition, "
                "please choose another model.".format(model)
            )

    @staticmethod
    def check_spacy_version() -> bool:
        """Ensure that the version of spaCy is v3.

        Returns:
            True when the installed spaCy has major version 3.

        Raises:
            ImportError: If the version is missing, unparsable, or not 3.
        """
        spacy_version = spacy.__version__

        if spacy_version is None:
            raise ImportError('Spacy v3 needs to be installed. Unable to detect spacy version.')
        try:
            spacy_major = int(spacy_version.split('.')[0])
        except (AttributeError, ValueError) as err:
            # Keep the original parse failure attached for easier debugging.
            raise ImportError(
                'Spacy v3 needs to be installed. Spacy version {} is unknown.'.format(spacy_version)
            ) from err
        if spacy_major != 3:
            raise ImportError('Spacy v3 needs to be installed. Detected version {}.'.format(spacy_version))

        return True

    @staticmethod
    def check_spacy_model(model) -> bool:
        """Ensure that the spaCy model is installed, downloading it if needed.

        Args:
            model: Name of the spaCy model to look for.

        Returns:
            True when the model is listed by spaCy or was just downloaded.

        Raises:
            ValueError: If the installed models cannot be determined from
                ``spacy.info()``.
        """
        spacy_info = spacy.info()
        if isinstance(spacy_info, str):
            raise ValueError('Unable to detect spacy models.')

        # Check for a missing entry *before* touching it; calling ``.keys()``
        # on None would raise AttributeError instead of the intended error.
        installed = spacy_info.get('pipelines', spacy_info.get('models', None))
        if installed is None:
            raise ValueError('Unable to detect spacy models.')
        models = list(installed.keys())

        if model not in models:
            LOGGER.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)
            importlib.import_module(model)
            # spacy.info() doesnt update after a spacy.cli.download, so theres no point checking it
            models.append(model)

        # Always returns true, if it fails to download, spacy sys.exit()s
        return model in models

    def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
        """Return the PII type of the first recognised entity in ``datum``.

        Args:
            column: Column the datum belongs to; not used by this method.
            datum: Text value to run through the spaCy pipeline.

        Returns:
            ``Person()``, ``Address()`` or ``BirthDate()`` for the first entity
            labelled PERSON, GPE or DATE respectively, or None when no such
            entity is found.
        """
        doc = self.nlp(datum)
        for ent in doc.ents:
            LOGGER.debug("Found %s", ent.label_)
            if ent.label_ == "PERSON":
                return Person()

            if ent.label_ == "GPE":
                return Address()

            if ent.label_ == "DATE":
                return BirthDate()

        return None
Oops, something went wrong.