From 8dc8b3748306ce6e6ae9628de027dab051d8fb7d Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 19:47:54 +0530
Subject: [PATCH 1/9] added iqama

---
 definitions.json | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/definitions.json b/definitions.json
index 6728686..7cf82b2 100644
--- a/definitions.json
+++ b/definitions.json
@@ -151,6 +151,19 @@
         "<<<<"
     ]
   },
+  "Resident Identity (Iqama)": {
+    "regex":null,
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Kingdom",
+        "Saudi",
+        "Arabia",
+        "Permit",
+        "Iqama",
+        "Residen",
+        "Identity"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 8dd45a08d24e3fd92884e1bc835494de8604ab6d Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 19:55:03 +0530
Subject: [PATCH 2/9] added saudi driver's license

---
 definitions.json | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/definitions.json b/definitions.json
index 7cf82b2..39e6158 100644
--- a/definitions.json
+++ b/definitions.json
@@ -158,12 +158,27 @@
         "Kingdom",
         "Saudi",
         "Arabia",
+        "Ministry",
+        "Interior",
         "Permit",
         "Iqama",
         "Residen",
         "Identity"
     ]
   },
+  "Saudi Driver's License": {
+    "regex":"\\b[0-9]{10}\\b",
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Kingdom",
+        "Saudi",
+        "Arabia",
+        "Ministry",
+        "Interior",
+        "Driving",
+        "License"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 4652426859b4a5358cd249476a8f27719f805bdb Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:00:35 +0530
Subject: [PATCH 3/9] added saudi visa

---
 definitions.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/definitions.json b/definitions.json
index 39e6158..47ad2df 100644
--- a/definitions.json
+++ b/definitions.json
@@ -179,6 +179,20 @@
         "License"
     ]
   },
+  "Saudi Arabian Visa": {
+    "regex":"(?:V

From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:10:32 +0530
Subject: [PATCH 4/9] added tawuniya health insurance

---
 definitions.json | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/definitions.json b/definitions.json
index 47ad2df..0bc6c9f 100644
--- a/definitions.json
+++ b/definitions.json
@@ -193,6 +193,18 @@
         "Validity"
     ]
   },
+  "Tawuniya Health Insurance": {
+    "regex":"\\b[0-9]{5}\\b",
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Tawuniya",
+        "Policy",
+        "Holder",
+        "Number",
+        "Deductible",
+        "Approval"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 4afe9ad2d8e7a3194d3622f0b82369c4e7bcb9a9 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:14:16 +0530
Subject: [PATCH 5/9] added spacy to requirements.txt

---
 .github/workflows/{github-actions-demo.yml => github-action.yml} | 0
 requirements.txt                                                  | 1 +
 2 files changed, 1 insertion(+)
 rename .github/workflows/{github-actions-demo.yml => github-action.yml} (100%)

diff --git a/.github/workflows/github-actions-demo.yml b/.github/workflows/github-action.yml
similarity index 100%
rename from .github/workflows/github-actions-demo.yml
rename to .github/workflows/github-action.yml
diff --git a/requirements.txt b/requirements.txt
index 5b08efc..d73775c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
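The definitions added in patches 1 through 4 pair an optional regex with a keyword list. Below is a minimal sketch of how such an entry might be evaluated against text extracted from a document; the loader, the three-keyword threshold, and the match_definitions name are illustrative assumptions, not the repository's actual matching code. Note that regex escapes such as \b must be written as \\b inside JSON strings, as in the entries above.

# Illustrative sketch only: assumes definitions.json is in the working
# directory; the scoring threshold is an assumption, not the repo's logic.
import json
import re

def match_definitions(text, path="definitions.json"):
    with open(path) as f:
        definitions = json.load(f)
    matches = []
    for name, rule in definitions.items():
        # Count how many of the entry's keywords appear, case-insensitively.
        hits = sum(1 for kw in rule.get("keywords", []) if kw.lower() in text.lower())
        # A null regex means the keywords alone decide the match.
        pattern = rule.get("regex")
        regex_hit = bool(pattern and re.search(pattern, text))
        if regex_hit or hits >= 3:
            matches.append((name, rule.get("region"), hits))
    return matches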
From 328c7fc9693e09cd74d233b0b6d6efd75f10c0f7 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:17:58 +0530
Subject: [PATCH 6/9] fixed a runner exception

---
 text_utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 9d1fe42..6da4580 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -94,9 +94,15 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    try:
+        if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
+        if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
+        if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('maxent_ne_chunker')
+        nltk.download('words')
+
     stop_words = set(stopwords.words('english'))
 
     words = word_tokenize(text)

From 3edf1403a4e2a969601a907b38f5c961b1c8779b Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:21:26 +0530
Subject: [PATCH 7/9] fixed a runner exception

---
 text_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 6da4580..e9d4672 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -95,12 +95,13 @@ def regional_pii(text):
     from nltk.corpus import stopwords
 
     try:
-        if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-        if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-        if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+        nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
+        for resource in nltk_resources:
+            if not nltk.data.find(resource): raise LookupError()
     except LookupError:
         nltk.download('punkt')
         nltk.download('maxent_ne_chunker')
+        nltk.download('stopwords')
         nltk.download('words')
 
     stop_words = set(stopwords.words('english'))

From a8ed80c038e3280dca592a20253e38fb35c66b52 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:25:30 +0530
Subject: [PATCH 8/9] fixed a runner exception #2

---
 text_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/text_utils.py b/text_utils.py
index e9d4672..3a584e2 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -103,6 +103,7 @@ def regional_pii(text):
         nltk.download('maxent_ne_chunker')
         nltk.download('stopwords')
         nltk.download('words')
+        nltk.download('averaged_perceptron_tagger')
 
     stop_words = set(stopwords.words('english'))
 
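Patches 6 through 8 all circle the same root cause: nltk.data.find() raises LookupError for a missing resource rather than returning a falsy value, so the original `if not nltk.data.find(...)` guards could never reach their download() calls. The working shape of the fix, reduced to a single resource:

import nltk

try:
    nltk.data.find("tokenizers/punkt")   # raises LookupError if absent
except LookupError:
    nltk.download("punkt")               # fetch the missing package once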
From 0ec2088e815294e324bf6c97bc8de315574d6866 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:30:43 +0530
Subject: [PATCH 9/9] simplified exception handling code

---
 text_utils.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 3a584e2..52fd666 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -94,16 +94,15 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
     try:
         nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
         for resource in nltk_resources:
             if not nltk.data.find(resource): raise LookupError()
     except LookupError:
-        nltk.download('punkt')
-        nltk.download('maxent_ne_chunker')
-        nltk.download('stopwords')
-        nltk.download('words')
-        nltk.download('averaged_perceptron_tagger')
+        for resource in resources:
+            nltk.download(resource)
 
     stop_words = set(stopwords.words('english'))
 
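The final patch still checks only three data paths while downloading five packages, so a missing tagger or stopword list is never detected on its own, and it relies on a bare raise LookupError() to reach the download branch. A per-resource variant, sketched below under the assumption that each package installs to the standard NLTK data path, would fetch only what is actually missing:

import nltk

# Assumed mapping from nltk.download() package IDs to their data paths.
RESOURCES = {
    "punkt": "tokenizers/punkt",
    "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
    "stopwords": "corpora/stopwords",
    "words": "corpora/words",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
}

def ensure_nltk_resources():
    for package, path in RESOURCES.items():
        try:
            nltk.data.find(path)       # raises LookupError when absent
        except LookupError:
            nltk.download(package)     # fetch only the missing package

This keeps the check and the download keyed to the same resource, so one missing package does not trigger a re-download of all five.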