From 8dc8b3748306ce6e6ae9628de027dab051d8fb7d Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 19:47:54 +0530
Subject: [PATCH 1/9] added iqama

---
 definitions.json | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/definitions.json b/definitions.json
index 6728686..7cf82b2 100644
--- a/definitions.json
+++ b/definitions.json
@@ -151,6 +151,19 @@
         "<<<<"
     ]
   },
+  "Resident Identity (Iqama)": {
+    "regex":null,
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Kingdom",
+        "Saudi",
+        "Arabia",
+        "Permit",
+        "Iqama",
+        "Residen",
+        "Identity"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 8dd45a08d24e3fd92884e1bc835494de8604ab6d Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 19:55:03 +0530
Subject: [PATCH 2/9] added saudi driver's license

---
 definitions.json | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/definitions.json b/definitions.json
index 7cf82b2..39e6158 100644
--- a/definitions.json
+++ b/definitions.json
@@ -158,12 +158,27 @@
         "Kingdom",
         "Saudi",
         "Arabia",
+        "Ministry",
+        "Interior",
         "Permit",
         "Iqama",
         "Residen",
         "Identity"
     ]
   },
+  "Saudi Driver's License": {
+    "regex":"\\b[0-9]{10}\\b",
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Kingdom",
+        "Saudi",
+        "Arabia",
+        "Ministry",
+        "Interior",
+        "Driving",
+        "License"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 4652426859b4a5358cd249476a8f27719f805bdb Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:00:35 +0530
Subject: [PATCH 3/9] added saudi visa

---
 definitions.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/definitions.json b/definitions.json
index 39e6158..47ad2df 100644
--- a/definitions.json
+++ b/definitions.json
@@ -179,6 +179,20 @@
         "License"
     ]
   },
+  "Saudi Arabian Visa": {
+    "regex":"(?:V

From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:10:32 +0530
Subject: [PATCH 4/9] added tawuniya health insurance

---
 definitions.json | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/definitions.json b/definitions.json
index 47ad2df..0bc6c9f 100644
--- a/definitions.json
+++ b/definitions.json
@@ -193,6 +193,18 @@
         "Validity"
     ]
   },
+  "Tawuniya Health Insurance": {
+    "regex":"\\b[0-9]{5}\\b",
+    "region":"Saudi Arabia",
+    "keywords":[
+        "Tawuniya",
+        "Policy",
+        "Holder",
+        "Number",
+        "Deductible",
+        "Approval"
+    ]
+  },
   "Nebraska Driver's License": {
     "regex":"[A-Z]{1}[0-9]{9,11}",
     "region":"United States",

From 4afe9ad2d8e7a3194d3622f0b82369c4e7bcb9a9 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:14:16 +0530
Subject: [PATCH 5/9] added spacy to requirements.txt

---
 .github/workflows/{github-actions-demo.yml => github-action.yml} | 0
 requirements.txt                                                  | 1 +
 2 files changed, 1 insertion(+)
 rename .github/workflows/{github-actions-demo.yml => github-action.yml} (100%)

diff --git a/.github/workflows/github-actions-demo.yml b/.github/workflows/github-action.yml
similarity index 100%
rename from .github/workflows/github-actions-demo.yml
rename to .github/workflows/github-action.yml
diff --git a/requirements.txt b/requirements.txt
index 5b08efc..d73775c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
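The definitions added in patches 1 through 4 pair an optional regex with a keyword list. Below is a minimal sketch of how such an entry might be evaluated against text extracted from a document; the loader, the three-keyword threshold, and the match_definitions name are illustrative assumptions, not the repository's actual matching code. Note that regex escapes such as \b must be written as \\b inside JSON strings, as in the entries above.

# Illustrative sketch only: assumes definitions.json is in the working
# directory; the scoring threshold is an assumption, not the repo's logic.
import json
import re

def match_definitions(text, path="definitions.json"):
    with open(path) as f:
        definitions = json.load(f)
    matches = []
    for name, rule in definitions.items():
        # Count how many of the entry's keywords appear, case-insensitively.
        hits = sum(1 for kw in rule.get("keywords", []) if kw.lower() in text.lower())
        # A null regex means the keywords alone decide the match.
        pattern = rule.get("regex")
        regex_hit = bool(pattern and re.search(pattern, text))
        if regex_hit or hits >= 3:
            matches.append((name, rule.get("region"), hits))
    return matches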
From 328c7fc9693e09cd74d233b0b6d6efd75f10c0f7 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:17:58 +0530
Subject: [PATCH 6/9] fixed a runner exception

---
 text_utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 9d1fe42..6da4580 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -94,9 +94,15 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    try:
+        if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
+        if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
+        if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('maxent_ne_chunker')
+        nltk.download('words')
+
     stop_words = set(stopwords.words('english'))
 
     words = word_tokenize(text)

From 3edf1403a4e2a969601a907b38f5c961b1c8779b Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:21:26 +0530
Subject: [PATCH 7/9] fixed a runner exception

---
 text_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 6da4580..e9d4672 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -95,12 +95,13 @@ def regional_pii(text):
     from nltk.corpus import stopwords
 
     try:
-        if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-        if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-        if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+        nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
+        for resource in nltk_resources:
+            if not nltk.data.find(resource): raise LookupError()
     except LookupError:
         nltk.download('punkt')
         nltk.download('maxent_ne_chunker')
+        nltk.download('stopwords')
         nltk.download('words')
 
     stop_words = set(stopwords.words('english'))

From a8ed80c038e3280dca592a20253e38fb35c66b52 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:25:30 +0530
Subject: [PATCH 8/9] fixed a runner exception #2

---
 text_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/text_utils.py b/text_utils.py
index e9d4672..3a584e2 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -103,6 +103,7 @@ def regional_pii(text):
         nltk.download('maxent_ne_chunker')
         nltk.download('stopwords')
         nltk.download('words')
+        nltk.download('averaged_perceptron_tagger')
 
     stop_words = set(stopwords.words('english'))
 
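Patches 6 through 8 all circle the same root cause: nltk.data.find() raises LookupError for a missing resource rather than returning a falsy value, so the original `if not nltk.data.find(...)` guards could never reach their download() calls. The working shape of the fix, reduced to a single resource:

import nltk

try:
    nltk.data.find("tokenizers/punkt")   # raises LookupError if absent
except LookupError:
    nltk.download("punkt")               # fetch the missing package once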
From 0ec2088e815294e324bf6c97bc8de315574d6866 Mon Sep 17 00:00:00 2001
From: 0x4f53 <71916237+0x4f53@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:30:43 +0530
Subject: [PATCH 9/9] simplified exception handling code

---
 text_utils.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/text_utils.py b/text_utils.py
index 3a584e2..52fd666 100644
--- a/text_utils.py
+++ b/text_utils.py
@@ -94,16 +94,15 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
     try:
         nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
         for resource in nltk_resources:
             if not nltk.data.find(resource): raise LookupError()
     except LookupError:
-        nltk.download('punkt')
-        nltk.download('maxent_ne_chunker')
-        nltk.download('stopwords')
-        nltk.download('words')
-        nltk.download('averaged_perceptron_tagger')
+        for resource in resources:
+            nltk.download(resource)
 
     stop_words = set(stopwords.words('english'))
 
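The final patch still checks only three data paths while downloading five packages, so a missing tagger or stopword list is never detected on its own, and it relies on a bare raise LookupError() to reach the download branch. A per-resource variant, sketched below under the assumption that each package installs to the standard NLTK data path, would fetch only what is actually missing:

import nltk

# Assumed mapping from nltk.download() package IDs to their data paths.
RESOURCES = {
    "punkt": "tokenizers/punkt",
    "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
    "stopwords": "corpora/stopwords",
    "words": "corpora/words",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
}

def ensure_nltk_resources():
    for package, path in RESOURCES.items():
        try:
            nltk.data.find(path)       # raises LookupError when absent
        except LookupError:
            nltk.download(package)     # fetch only the missing package

This keeps the check and the download keyed to the same resource, so one missing package does not trigger a re-download of all five.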