From 624760d23cc11d76e836cb2f0c22b9b10ab42abd Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sat, 12 Oct 2024 16:44:10 +0300
Subject: [PATCH 01/42] Simplified language metadata JSON by removing
 unnecessary nesting and keys. - Removed 'description', 'entry', and
 'languages' keys. - Flattened structure so each language name is a top-level
 key holding only its 'iso' and 'qid'.

---
 .../resources/language_metadata.json          | 98 ++++++-------------
 1 file changed, 31 insertions(+), 67 deletions(-)

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index e6d7de8a6..b5400c697 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -1,70 +1,34 @@
 {
-  "used by": "Scribe-Data/src/scribe_data/utils.py",
-  "description": {
-    "entry": {
-      "language": "the supported language. All lowercase",
-      "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase",
-      "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390",
-      "remove-words": "words that should not be included as autosuggestions for the given language.",
-      "ignore-words": "words that should be removed from the autosuggestion generation process."
-    }
+  "english": {
+    "iso": "en",
+    "qid": "Q1860"
   },
-  "languages": [
-    {
-      "language": "english",
-      "iso": "en",
-      "qid": "Q1860",
-      "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
-    },
-    {
-      "language": "french",
-      "iso": "fr",
-      "qid": "Q150",
-      "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": ["XXe"]
-    },
-    {
-      "language": "german",
-      "iso": "de",
-      "qid": "Q188",
-      "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"],
-      "ignore-words": ["Gemeinde", "Familienname"]
-    },
-    {
-      "language": "italian",
-      "iso": "it",
-      "qid": "Q652",
-      "remove-words": ["of", "the", "The", "and", "text", "from"],
-      "ignore-words": ["The", "ATP"]
-    },
-    {
-      "language": "portuguese",
-      "iso": "pt",
-      "qid": "Q5146",
-      "remove-words": ["of", "the", "The", "and", "jbutadptflora"],
-      "ignore-words": []
-    },
-    {
-      "language": "russian",
-      "iso": "ru",
-      "qid": "Q7737",
-      "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
-    },
-    {
-      "language": "spanish",
-      "iso": "es",
-      "qid": "Q1321",
-      "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
-    },
-    {
-      "language": "swedish",
-      "iso": "sv",
-      "qid": "Q9027",
-      "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"],
-      "ignore-words": ["databasdump"]
-    }
-  ]
+  "french": {
+    "iso": "fr",
+    "qid": "Q150"
+  },
+  "german": {
+    "iso": "de",
+    "qid": "Q188"
+  },
+  "italian": {
+    "iso": "it",
+    "qid": "Q652"
+  },
+  "portuguese": {
+    "iso": "pt",
+    "qid": "Q5146"
+  },
+  "russian": {
+    "iso": "ru",
+    "qid": "Q7737"
+  },
+  "spanish": {
+    "iso": "es",
+    "qid": "Q1321"
+  },
+  "swedish": {
+    "iso": "sv",
+    "qid": "Q9027"
+  }
 }

From 05ba79d41a08148c5e29d32b335b9524fab84d27 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sat, 12 Oct 2024 16:50:34 +0300
Subject: [PATCH 02/42] Refactored _load_json function to handle simplified
 JSON structure. - Removed 'root' parameter since the JSON is now flat. -
 Updated function to return the entire contents of the JSON directly.

---
 src/scribe_data/utils.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index dbd477946..4c3a78e3c 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -36,7 +36,7 @@
 DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export"
 
 
-def _load_json(package_path: str, file_name: str, root: str) -> Any:
+def _load_json(package_path: str, file_name: str) -> Any:
     """
     Loads a JSON resource from a package into a python entity.
 
@@ -48,25 +48,19 @@ def _load_json(package_path: str, file_name: str, root: str) -> Any:
         file_name : str
             The name of the file (resource) that contains the JSON data.
 
-        root : str
-            The root node of the JSON document.
-
     Returns
     -------
-        A python entity starting at 'root'.
+        A python entity representing the JSON content.
     """
-
     with resources.files(package_path).joinpath(file_name).open(
         encoding="utf-8"
     ) as in_stream:
         contents = json.load(in_stream)
-        return contents[root]
+        return contents  # No need for 'root'
 
 
 _languages = _load_json(
-    package_path="scribe_data.resources",
-    file_name="language_metadata.json",
-    root="languages",
+    package_path="scribe_data.resources", file_name="language_metadata.json"
 )
 
 

From 7be7005789bd92791dc5d0952d3919d2b590f1db Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sat, 12 Oct 2024 18:25:15 +0300
Subject: [PATCH 03/42] =?UTF-8?q?Refactor=20language=20metadata=20structur?=
 =?UTF-8?q?e:=20Include=20all=20languages=20with=20Norwegian=20having=20su?=
 =?UTF-8?q?b-languages=20-=20Removed=20unnecessary=20top-level=20keys=20-?=
 =?UTF-8?q?=20Organized=20Norwegian=20with=20its=20sub-languages=20(Nynors?=
 =?UTF-8?q?k=20and=20Bokm=C3=A5l)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../resources/language_metadata.json          | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index b5400c697..dd85cdc91 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -1,8 +1,40 @@
 {
+  "arabic": {
+    "iso": "ar",
+    "qid": "Q13955"
+  },
+  "basque": {
+    "iso": "eu",
+    "qid": "Q8752"
+  },
+  "bengali": {
+    "iso": "bn",
+    "qid": "Q9610"
+  },
+  "czech": {
+    "iso": "cs",
+    "qid": "Q9056"
+  },
+  "danish": {
+    "iso": "da",
+    "qid": "Q9035"
+  },
   "english": {
     "iso": "en",
     "qid": "Q1860"
   },
+  "esperanto": {
+    "iso": "eo",
+    "qid": "Q143"
+  },
+  "estonian": {
+    "iso": "et",
+    "qid": "Q9072"
+  },
+  "finnish": {
+    "iso": "fi",
+    "qid": "Q1412"
+  },
   "french": {
     "iso": "fr",
     "qid": "Q150"
@@ -11,24 +43,116 @@
     "iso": "de",
     "qid": "Q188"
   },
+  "greek": {
+    "iso": "el",
+    "qid": "Q36510"
+  },
+  "hausa": {
+    "iso": "ha",
+    "qid": "Q56475"
+  },
+  "hebrew": {
+    "iso": "he",
+    "qid": "Q9288"
+  },
+  "hindustani": {
+    "iso": "hi",
+    "qid": "Q11051"
+  },
+  "indonesian": {
+    "iso": "id",
+    "qid": "Q9240"
+  },
   "italian": {
     "iso": "it",
     "qid": "Q652"
   },
+  "japanese": {
+    "iso": "ja",
+    "qid": "Q5287"
+  },
+  "kurmanji": {
+    "iso": "kmr",
+    "qid": "Q36163"
+  },
+  "latin": {
+    "iso": "la",
+    "qid": "Q397"
+  },
+  "malay": {
+    "iso": "ms",
+    "qid": "Q9237"
+  },
+  "malayalam": {
+    "iso": "ml",
+    "qid": "Q36236"
+  },
+  "mandarin": {
+    "iso": "zh",
+    "qid": "Q727694"
+  },
+  "norwegian": {
+    "sub_languages": {
+      "nynorsk": {
+        "iso": "nn",
+        "qid": "Q25164"
+      },
+      "bokmål": {
+        "iso": "nb",
+        "qid": "Q9043"
+      }
+    }
+  },
+  "pidgin": {
+    "iso": "pi",
+    "qid": "Q33655"
+  },
+  "polish": {
+    "iso": "pl",
+    "qid": "Q809"
+  },
   "portuguese": {
     "iso": "pt",
     "qid": "Q5146"
   },
+  "punjabi": {
+    "iso": "pa",
+    "qid": "Q58635"
+  },
   "russian": {
     "iso": "ru",
     "qid": "Q7737"
   },
+  "slovak": {
+    "iso": "sk",
+    "qid": "Q9058"
+  },
   "spanish": {
     "iso": "es",
     "qid": "Q1321"
   },
+  "swahili": {
+    "iso": "sw",
+    "qid": "Q7838"
+  },
   "swedish": {
     "iso": "sv",
     "qid": "Q9027"
+  },
+  "tajik": {
+    "iso": "tg",
+    "qid": "Q9260"
+  },
+  "tamil": {
+    "iso": "ta",
+    "qid": "Q5885"
+  },
+  "ukrainian": {
+    "iso": "ua",
+    "qid": "Q8798"
+  },
+  "yoruba": {
+    "iso": "yo",
+    "qid": "Q34311"
   }
 }

From e1ce1d8a6d2ea72003bb61f4aac3678aec648270 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sat, 12 Oct 2024 20:43:17 +0300
Subject: [PATCH 04/42] Refactor _find function to handle languages with
 sub-languages - Enhanced the function to check for both regular languages and
 their sub-languages. - Added error handling for cases where a language has
 only sub-languages, providing informative messages. - Updated the function's
 docstring to reflect changes in behavior and usage.

---
 src/scribe_data/utils.py | 48 ++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 4c3a78e3c..45434b783 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -66,28 +66,20 @@ def _load_json(package_path: str, file_name: str) -> Any:
 
 def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -> Any:
     """
-    Each 'language', (english, german,..., etc) is a dictionary of key/value pairs:
+    Finds a target value based on a source key/value pair from the language metadata.
 
-        entry = {
-            "language": "english",
-            "iso": "en",
-            "qid": "Q1860",
-            "remove-words": [...],
-            "ignore-words": [...]
-        }
-
-    Given a key/value pair, the 'source' and the 'target' key get the 'target' value.
+    This version handles both regular languages and those with sub-languages (e.g., Norwegian).
 
     Parameters
     ----------
         source_value : str
-            The source value to find equivalents for (e.g. 'english').
+            The source value to find equivalents for (e.g., 'english', 'nynorsk').
 
         source_key : str
-            The source key to reference (e.g. 'language').
+            The source key to reference (e.g., 'language').
 
         target_key : str
-            The key to target (e.g. 'iso').
+            The key to target (e.g., 'qid').
 
         error_msg : str
             The message displayed when a value cannot be found.
@@ -98,18 +90,30 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
 
     Raises
     ------
-        ValueError : when a source_value is not supported.
+        ValueError : when a source_value is not supported or the language only has sub-languages.
     """
     norm_source_value = source_value.lower()
 
-    if target_value := [
-        entry[target_key]
-        for entry in _languages
-        if entry[source_key] == norm_source_value
-    ]:
-        assert len(target_value) == 1, f"More than one entry for '{norm_source_value}'"
-        return target_value[0]
-
+    # Check if we're searching by language name
+    if source_key == "language":
+        # First, check the main language entries (e.g., mandarin, french, etc.)
+        for language, entry in _languages.items():
+            # If the language name matches the top-level key, return the target value
+            if language.lower() == norm_source_value:
+                if "sub_languages" in entry:
+                    sub_languages = ", ".join(entry["sub_languages"].keys())
+                    raise ValueError(
+                        f"'{language}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages}"
+                    )
+                return entry.get(target_key)
+
+            # If there are sub-languages, check them too
+            if "sub_languages" in entry:
+                for sub_language, sub_entry in entry["sub_languages"].items():
+                    if sub_language.lower() == norm_source_value:
+                        return sub_entry.get(target_key)
+
+    # If no match was found, raise an error
     raise ValueError(error_msg)
 
 

From 046c78d94cf85acea433e6fd4e19093a03593cf1 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sat, 12 Oct 2024 20:46:10 +0300
Subject: [PATCH 05/42] Update get_scribe_languages to handle sub-languages in
 JSON structure - Adjusted the function to return both main languages and
 their sub-languages. - Ensured that languages like Norwegian are represented
 by their sub-languages only. - Enhanced compatibility with the new JSON
 format.

---
 src/scribe_data/utils.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 45434b783..bb9c7a399 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -120,8 +120,22 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
 def get_scribe_languages() -> list[str]:
     """
     Returns the list of currently implemented Scribe languages.
+    This version handles both regular languages and those with sub-languages (e.g., Norwegian).
     """
-    return sorted(entry["language"].capitalize() for entry in _languages)
+    languages = []
+
+    for language, entry in _languages.items():
+        # Add the main language (if it's directly queryable)
+        if "sub_languages" not in entry:
+            languages.append(language.capitalize())
+
+        # If there are sub-languages, add them instead
+        if "sub_languages" in entry:
+            languages.extend(
+                sub_language.capitalize() for sub_language in entry["sub_languages"]
+            )
+
+    return sorted(languages)
 
 
 def get_language_qid(language: str) -> str:

From 8f737cd0a21e37e2eff6766c8be6f016bf6de647 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sun, 13 Oct 2024 18:00:29 +0300
Subject: [PATCH 06/42] Remove get_language_words_to_remove and
 get_language_words_to_ignore due to new language_metadata.json structure

---
 src/scribe_data/utils.py | 44 ----------------------------------------
 1 file changed, 44 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 494a2d1bf..03e356870 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -206,50 +206,6 @@ def get_language_from_iso(iso: str) -> str:
     return language_name
 
 
-def get_language_words_to_remove(language: str) -> list[str]:
-    """
-    Returns the words that should be removed during the data cleaning process for the given language.
-
-    Parameters
-    ----------
-        language : str
-            The language the words should be returned for.
-
-    Returns
-    -------
-        list[str]
-            The words that that be removed during the data cleaning process for the given language.
-    """
-    return _find(
-        "language",
-        language,
-        "remove-words",
-        f"{language.capitalize()} is currently not a supported language.",
-    )
-
-
-def get_language_words_to_ignore(language: str) -> list[str]:
-    """
-    Returns the words that should not be included as autosuggestions for the given language.
-
-    Parameters
-    ----------
-        language : str
-            The language the words should be returned for.
-
-    Returns
-    -------
-        list[str]
-            The words that should not be included as autosuggestions for the given language.
-    """
-    return _find(
-        "language",
-        language,
-        "ignore-words",
-        f"{language.capitalize()} is currently not a supported language.",
-    )
-
-
 def load_queried_data(
     file_path: str, language: str, data_type: str
 ) -> tuple[Any, bool, str]:

From 9f75f5426cfa87bc51976ce28c95a6a065f4bc5e Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Sun, 13 Oct 2024 23:59:31 +0300
Subject: [PATCH 07/42] Refactor language_map and language_to_qid generation to
 handle new JSON structure

- Updated the logic for building language_map and language_to_qid to handle languages with sub-languages.
- Both main languages and sub-languages are now processed in a single pass, ensuring that:
  - language_map includes all metadata for main and sub-languages.
  - language_to_qid correctly maps both main and sub-languages to their QIDs.
---
 src/scribe_data/cli/cli_utils.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index ecf8b6213..f5b72f663 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -42,14 +42,23 @@
 with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file:
     data_type_metadata = json.load(file)
 
-language_map = {
-    lang["language"].lower(): lang for lang in language_metadata["languages"]
-}
-
-# Create language_to_qid dictionary.
-language_to_qid = {
-    lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"]
-}
+language_map = {}
+language_to_qid = {}
+
+# Process each language and its potential sub-languages in one pass
+for lang_key, lang_data in language_metadata.items():
+    lang_key_lower = lang_key.lower()
+
+    # Handle sub-languages if they exist
+    if "sub_languages" in lang_data:
+        for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
+            sub_lang_key_lower = sub_lang_key.lower()
+            language_map[sub_lang_key_lower] = sub_lang_data
+            language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
+    else:
+        # Handle the main language directly
+        language_map[lang_key_lower] = lang_data
+        language_to_qid[lang_key_lower] = lang_data["qid"]
 
 
 def correct_data_type(data_type: str) -> str:

From 6186be979c28b52acc9cc36bc0b8bf2536dbc31c Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 00:40:16 +0300
Subject: [PATCH 08/42] Fix: Update language extraction to match new JSON
 structure by removing the 'languages' key reference

---
 src/scribe_data/cli/interactive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py
index 4e95f34b0..cefaa6bbe 100644
--- a/src/scribe_data/cli/interactive.py
+++ b/src/scribe_data/cli/interactive.py
@@ -52,7 +52,7 @@
 class ScribeDataConfig:
     def __init__(self):
         self.languages = [
-            lang["language"].capitalize() for lang in language_metadata["languages"]
+            [lang_key.capitalize() for lang_key in language_metadata.keys()]
         ]
         self.data_types = list(data_type_metadata.keys())
         self.selected_languages: List[str] = []

From 1c959ec5d89f4d24e1f9f33f70b9e9a3289e86a8 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 00:48:56 +0300
Subject: [PATCH 09/42] Refactor language extraction to use direct keys from
 language_metadata. Removed dependency on the 'languages' key in JSON
 structure.

---
 src/scribe_data/wikidata/query_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index 70c0fbf00..ffdc3bfba 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -115,7 +115,7 @@ def query_data(
         SCRIBE_DATA_SRC_PATH / "language_data_extraction"
     )
     languages = [lang.capitalize() for lang in languages]
-    current_languages = list(language_metadata["languages"])
+    current_languages = list(language_metadata.keys())
     current_data_type = ["nouns", "verbs", "prepositions"]
 
     # Assign current_languages and current_data_type if no arguments have been passed.

From 458328ef5086d8b190e66ae2e3aae5c5e37cdf19 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 14:13:54 +0300
Subject: [PATCH 10/42] Added format_sublanguage_name function to format
 sub-language names as 'mainlang/sublang'

- Implemented the function to check if a language is a sub-language and format its name as 'mainlang/sublang' for easier searching in language_data_extraction.
- Returns the original language name if it's not a sub-language.
- Added detailed docstring for clarity and usage examples.
---
 src/scribe_data/utils.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 03e356870..33fc3763e 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -487,3 +487,39 @@ def order_annotations(annotation: str) -> str:
     annotation_split = sorted(list(set(filter(None, annotation.split("/")))))
 
     return "/".join(annotation_split)
+
+
+def format_sublanguage_name(lang, language_metadata):
+    """
+    Formats the name of a sub-language by appending its main language
+    in the format 'mainlang/sublang'. If the language is not a sub-language,
+    the original language name is returned as-is.
+
+    Args:
+        lang (str): The name of the language or sub-language to format.
+        language_metadata (dict): The metadata containing information about
+                                  main languages and their sub-languages.
+
+    Returns:
+        str: The formatted language name if it's a sub-language
+             (e.g., 'norwegian/nynorsk'), otherwise the original name.
+
+    Example:
+        format_sublanguage_name("nynorsk", language_metadata)
+        'norwegian/nynorsk'
+
+        format_sublanguage_name("english", language_metadata)
+        'english'
+    """
+    # Iterate through the main languages in the metadata
+    for main_lang, lang_data in language_metadata.items():
+        # Check if the main language has sub-languages
+        if "sub_languages" in lang_data:
+            # Check if the provided language is a sub-language
+            for sub_lang in lang_data["sub_languages"]:
+                if lang.lower() == sub_lang.lower():
+                    # Return the formatted name mainlang/sublang
+                    return f"{main_lang}/{sub_lang}"
+
+    # If it's not a sub-language, return the original name
+    return lang

From e0177607afb489a34f882ba7db78649c5899cacf Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 14:22:11 +0300
Subject: [PATCH 11/42] Refactor: Apply format_sublanguage_name to handle
 sub-language - Wrapped 'lang' variable with format_sublanguage_name to ensure
 sub-languages are formatted as 'mainlang/sublang' during data extraction. -
 This ensures proper directory creation and querying for sub-languages,
 aligning with the new language metadata structure.

---
 src/scribe_data/wikidata/query_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index ffdc3bfba..9c8e04d1e 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -33,6 +33,7 @@
 from scribe_data.cli.cli_utils import (
     language_metadata,
 )
+from scribe_data.utils import format_sublanguage_name
 from scribe_data.wikidata.wikidata_utils import sparql
 
 
@@ -159,7 +160,7 @@ def query_data(
         disable=interactive,
         colour="MAGENTA",
     ):
-        lang = q.parent.parent.name
+        lang = format_sublanguage_name(q.parent.parent.name, language_metadata)
         target_type = q.parent.name
 
         updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir

From 470541444c09dea57cb18dd1dcff894e505d89e3 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 14:31:45 +0300
Subject: [PATCH 12/42] Removed dependency on the 'languages' key based on the
 old json structure in cli/total.py file

---
 src/scribe_data/cli/total.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index e94d33d40..735d74051 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -71,8 +71,8 @@ def get_datatype_list(language):
         data_types : list[str] or None
             A list of the corresponding data types.
     """
-    languages = list(language_metadata["languages"])
-    language_list = [lang["language"] for lang in languages]
+    languages = list(language_metadata.keys())
+    language_list = [lang for lang in languages]
 
     if language.lower() in language_list:
         language_data = language_map.get(language.lower())

From ab7b6cf5be0b5ba0db2c965aee8f6b56acddcbb9 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 15:12:19 +0300
Subject: [PATCH 13/42] Add function to list all languages from language
 metadata loaded json

- Created list_all_languages function to extract both main languages and sub-languages
- The function checks for sub-languages and compiles a complete list for easier access.
- Updated example usage to demonstrate the new functionality.
---
 src/scribe_data/utils.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 33fc3763e..1df502ad6 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -523,3 +523,20 @@ def format_sublanguage_name(lang, language_metadata):
 
     # If it's not a sub-language, return the original name
     return lang
+
+
+def list_all_languages(language_metadata):
+    """List all languages from the provided metadata dictionary, including sub-languages."""
+    current_languages = []
+
+    # Iterate through the language metadata
+    for lang_key, lang_data in language_metadata.items():
+        # Check if there are sub-languages
+        if "sub_languages" in lang_data:
+            # Add the sub-languages to current_languages
+            current_languages.extend(lang_data["sub_languages"].keys())
+        else:
+            # If no sub-languages, add the main language
+            current_languages.append(lang_key)
+
+    return current_languages

From 8d8f8f59ea8e1bda8783d552381c4c578b05f38d Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 15:14:37 +0300
Subject: [PATCH 14/42] Refactor to use list_all_languages function for
 language extraction

- Replaced old extraction method with a centralized function.
---
 src/scribe_data/load/data_to_sqlite.py | 4 ++--
 src/scribe_data/wikidata/query_data.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py
index 79d19e39b..aec1f9560 100644
--- a/src/scribe_data/load/data_to_sqlite.py
+++ b/src/scribe_data/load/data_to_sqlite.py
@@ -35,6 +35,7 @@
     DEFAULT_SQLITE_EXPORT_DIR,
     get_language_iso,
 )
+from scribe_data.utils import list_all_languages
 
 
 def data_to_sqlite(
@@ -52,8 +53,7 @@ def data_to_sqlite(
         current_language_data = json.load(f_languages)
         data_types = json.load(f_data_types).keys()
 
-    current_languages = [d["language"] for d in current_language_data["languages"]]
-
+    current_languages = list_all_languages(current_language_data)
     if not languages:
         languages = current_languages
 
diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index 9c8e04d1e..c075663a6 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -33,7 +33,7 @@
 from scribe_data.cli.cli_utils import (
     language_metadata,
 )
-from scribe_data.utils import format_sublanguage_name
+from scribe_data.utils import format_sublanguage_name, list_all_languages
 from scribe_data.wikidata.wikidata_utils import sparql
 
 
@@ -116,7 +116,7 @@ def query_data(
         SCRIBE_DATA_SRC_PATH / "language_data_extraction"
     )
     languages = [lang.capitalize() for lang in languages]
-    current_languages = list(language_metadata.keys())
+    current_languages = list_all_languages(language_metadata)
     current_data_type = ["nouns", "verbs", "prepositions"]
 
     # Assign current_languages and current_data_type if no arguments have been passed.

From d9a649b2681378475b19ab745031f607d6ca5616 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 16:39:14 +0300
Subject: [PATCH 15/42] Enhance language handling by importing utility
 functions

- Imported list_all_languages and format_sublanguage_name from scribe_data.utils.
- Updated get_datatype_list and print_total_lexemes to improve language name retrieval and formatting.
---
 src/scribe_data/cli/total.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 735d74051..990aef733 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -30,6 +30,7 @@
     language_to_qid,
 )
 from scribe_data.wikidata.wikidata_utils import sparql
+from scribe_data.utils import list_all_languages, format_sublanguage_name
 
 
 def get_qid_by_input(input_str):
@@ -71,12 +72,14 @@ def get_datatype_list(language):
         data_types : list[str] or None
             A list of the corresponding data types.
     """
-    languages = list(language_metadata.keys())
+    languages = list_all_languages(language_metadata)
     language_list = [lang for lang in languages]
 
     if language.lower() in language_list:
         language_data = language_map.get(language.lower())
-        language_capitalized = language.capitalize()
+        language_capitalized = format_sublanguage_name(
+            language, language_metadata
+        ).capitalize()
         language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized
 
         if not language_data:
@@ -131,9 +134,11 @@ def print_total_lexemes(language: str = None):
     print("=" * 64)
 
     if language is None:  # all languages
-        languages = list(language_metadata["languages"])
-        languages.sort(key=lambda x: x["language"])
-        language_list = [lang["language"] for lang in languages]
+        languages = list_all_languages(
+            language_metadata
+        )  # this returns a list of language names
+        language_list = languages  # sorts the list in place
+        language_list.sort()
 
         for lang in language_list:
             data_types = get_datatype_list(lang)

From 30f97e96883460261dd83e9fdfb4d6b6da8ba121 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 19:35:34 +0300
Subject: [PATCH 16/42] Update get_language_iso function:

- Refactored to use the user-defined _find function.
- Removed the try-except block as error handling is already implemented in _find.
- Removed the InvalidLanguageValue module as it was imported but unused.
---
 src/scribe_data/utils.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 1df502ad6..9898f2449 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -27,7 +27,7 @@
 from typing import Any, Optional
 
 from iso639 import Lang
-from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue
+from iso639.exceptions import DeprecatedLanguageValue
 
 PROJECT_ROOT = "Scribe-Data"
 DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export"
@@ -174,12 +174,13 @@ def get_language_iso(language: str) -> str:
         str
             The ISO code for the language.
     """
-    try:
-        iso_code = str(Lang(language.capitalize()).pt1)
-    except InvalidLanguageValue:
-        raise ValueError(
-            f"{language.capitalize()} is currently not a supported language for ISO conversion."
-        ) from None
+
+    iso_code = _find(
+        "language",
+        language,
+        "iso",
+        f"{language.upper()} is currently not a supported language for ISO conversion.",
+    )
     return iso_code
 
 

From ceec18768f2897c45e166cdc68fb462958944fd4 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 19:55:09 +0300
Subject: [PATCH 17/42] Handle sub-languages in language table generation

- Utilized already built helper functions to support sub-languages when retrieving ISO and QID values.
- Updated table printing to correctly format and display both main languages and sub-languages.
---
 src/scribe_data/cli/list.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 5d16b4413..6f8f2358e 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -26,18 +26,19 @@
     language_map,
     LANGUAGE_DATA_EXTRACTION_DIR,
 )
+from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid
 
 
 def list_languages() -> None:
     """
     Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
     """
-    languages = list(language_metadata["languages"])
-    languages.sort(key=lambda x: x["language"])
+    languages = list_all_languages(language_metadata)
+    languages.sort()
 
-    language_col_width = max(len(lang["language"]) for lang in languages) + 2
-    iso_col_width = max(len(lang["iso"]) for lang in languages) + 2
-    qid_col_width = max(len(lang["qid"]) for lang in languages) + 2
+    language_col_width = max(len(lang) for lang in languages) + 2
+    iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
+    qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2
 
     table_line_length = language_col_width + iso_col_width + qid_col_width
 
@@ -49,7 +50,7 @@ def list_languages() -> None:
 
     for lang in languages:
         print(
-            f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
+            f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
         )
 
     print("-" * table_line_length)

From 540e9d2c4e322a943c5c8b111453080415acfda7 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 21:27:17 +0300
Subject: [PATCH 18/42] adding new languages and their dialects to the
 language_metadata.json file

---
 .../resources/language_metadata.json          | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index dd85cdc91..d7d8100cd 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -56,8 +56,16 @@
     "qid": "Q9288"
   },
   "hindustani": {
-    "iso": "hi",
-    "qid": "Q11051"
+    "sub_languages": {
+      "hindi": {
+        "iso": "hi",
+        "qid": "Q11051"
+      },
+      "urdu": {
+        "iso": "ur",
+        "qid": "Q11051"
+      }
+    }
   },
   "indonesian": {
     "iso": "id",
@@ -104,8 +112,12 @@
     }
   },
   "pidgin": {
-    "iso": "pi",
-    "qid": "Q33655"
+    "sub_languages": {
+      "nigerian": {
+        "iso": "pi",
+        "qid": "Q33655"
+      }
+    }
   },
   "polish": {
     "iso": "pl",
@@ -116,8 +128,16 @@
     "qid": "Q5146"
   },
   "punjabi": {
-    "iso": "pa",
-    "qid": "Q58635"
+    "sub_languages": {
+      "gurmukhi": {
+        "iso": "pan",
+        "qid": "Q58635"
+      },
+      "shahmukhi": {
+        "iso": "pnp",
+        "qid": "Q58635"
+      }
+    }
   },
   "russian": {
     "iso": "ru",

From f389ab5b833b5255c9bd3e6c2e92aca64f10ec5b Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 21:52:40 +0300
Subject: [PATCH 19/42] Modified the loop that searches languages in the
 list_data_types function to reflect the new JSON structure, ensuring only
 data types are printed and no sub-languages unlike before.

---
 src/scribe_data/cli/list.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 6f8f2358e..6b9ec295c 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -26,7 +26,12 @@
     language_map,
     LANGUAGE_DATA_EXTRACTION_DIR,
 )
-from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid
+from scribe_data.utils import (
+    list_all_languages,
+    get_language_iso,
+    get_language_qid,
+    format_sublanguage_name,
+)
 
 
 def list_languages() -> None:
@@ -66,6 +71,7 @@ def list_data_types(language: str = None) -> None:
         language : str
             The language to potentially list data types for.
     """
+    languages = list_all_languages(language_metadata)
     if language:
         language_data = language_map.get(language.lower())
         language_capitalized = language.capitalize()
@@ -84,8 +90,11 @@ def list_data_types(language: str = None) -> None:
 
     else:
         data_types = set()
-        for lang in language_metadata["languages"]:
-            language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
+        for lang in languages:
+            language_dir = (
+                LANGUAGE_DATA_EXTRACTION_DIR
+                / format_sublanguage_name(lang, language_metadata).capitalize()
+            )
             if language_dir.is_dir():
                 data_types.update(f.name for f in language_dir.iterdir() if f.is_dir())
 

From 09944edab9f064ad39a414b2775cc78c62578e49 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 22:24:19 +0300
Subject: [PATCH 20/42] Capitalize the languages returned by the function
 'format_sublanguage_name' to align with the directory structure in the
 language_data_extraction directory.

---
 src/scribe_data/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 9898f2449..b4da68647 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -520,10 +520,10 @@ def format_sublanguage_name(lang, language_metadata):
             for sub_lang in lang_data["sub_languages"]:
                 if lang.lower() == sub_lang.lower():
                     # Return the formatted name mainlang/sublang
-                    return f"{main_lang}/{sub_lang}"
+                    return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
 
     # If it's not a sub-language, return the original name
-    return lang
+    return lang.capitalize()
 
 
 def list_all_languages(language_metadata):

From f602f170335ee6833a6c322206885ecf22c081ad Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Mon, 14 Oct 2024 22:29:02 +0300
Subject: [PATCH 21/42] Implemented minor fixes by utilizing the
 format_sublanguage_name function to handle sub_language folders.

---
 src/scribe_data/cli/list.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 6b9ec295c..447d59060 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -73,6 +73,7 @@ def list_data_types(language: str = None) -> None:
     """
     languages = list_all_languages(language_metadata)
     if language:
+        language = format_sublanguage_name(language, language_metadata)
         language_data = language_map.get(language.lower())
         language_capitalized = language.capitalize()
         language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized
@@ -132,9 +133,11 @@ def list_languages_for_data_type(data_type: str) -> None:
             The data type to check for.
     """
     data_type = correct_data_type(data_type=data_type)
+    all_languages = list_all_languages(language_metadata)
     available_languages = []
-    for lang in language_metadata["languages"]:
-        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
+    for lang in all_languages:
+        lang = format_sublanguage_name(lang, language_metadata)
+        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize()
         if language_dir.is_dir():
             dt_path = language_dir / data_type
             if dt_path.exists():

From ba0ed9a7c8ba2c042b9b98a4e574858c015de63c Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Tue, 15 Oct 2024 19:26:18 +0300
Subject: [PATCH 22/42] Updated the instance variable self.languages in
 ScribeDataConfig to use list_all_languages, assigning a complete list of all
 languages.

---
 src/scribe_data/cli/interactive.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py
index cefaa6bbe..6ba7a1f55 100644
--- a/src/scribe_data/cli/interactive.py
+++ b/src/scribe_data/cli/interactive.py
@@ -35,7 +35,7 @@
 from scribe_data.cli.cli_utils import data_type_metadata, language_metadata
 from scribe_data.cli.get import get_data
 from scribe_data.cli.version import get_version_message
-from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
+from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages
 
 # MARK: Config Setup
 
@@ -51,9 +51,7 @@
 
 class ScribeDataConfig:
     def __init__(self):
-        self.languages = [
-            [lang_key.capitalize() for lang_key in language_metadata.keys()]
-        ]
+        self.languages = list_all_languages(language_metadata)
         self.data_types = list(data_type_metadata.keys())
         self.selected_languages: List[str] = []
         self.selected_data_types: List[str] = []

From c77cb1fdf1fbe38aa1381f3071ef308d47875581 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Wed, 16 Oct 2024 17:22:25 +0300
Subject: [PATCH 23/42] adding mandarin as a sub language under chinese and
 updating some qids

---
 .../resources/language_metadata.json          | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index d7d8100cd..00a8d405c 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -95,9 +95,13 @@
     "iso": "ml",
     "qid": "Q36236"
   },
-  "mandarin": {
-    "iso": "zh",
-    "qid": "Q727694"
+  "chinese": {
+    "sub_languages": {
+      "mandarin": {
+        "iso": "zh",
+        "qid": "Q727694"
+      }
+    }
   },
   "norwegian": {
     "sub_languages": {
@@ -107,7 +111,7 @@
       },
       "bokmål": {
         "iso": "nb",
-        "qid": "Q9043"
+        "qid": "Q25167"
       }
     }
   },
@@ -129,12 +133,12 @@
   },
   "punjabi": {
     "sub_languages": {
-      "gurmukhi": {
-        "iso": "pan",
+      "shahmukhi": {
+        "iso": "pnb",
         "qid": "Q58635"
       },
-      "shahmukhi": {
-        "iso": "pnp",
+      "gurmukhi": {
+        "iso": "pa",
         "qid": "Q58635"
       }
     }

From 87ec3b03747e921e0b2d7c6c5801ae82d5baa06d Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Wed, 16 Oct 2024 17:46:53 +0300
Subject: [PATCH 24/42] Update test_list_languages to match updated output
 format

---
 tests/cli/test_list.py | 55 +++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py
index 1ec2ec1e4..3933082f6 100644
--- a/tests/cli/test_list.py
+++ b/tests/cli/test_list.py
@@ -39,17 +39,49 @@ def test_list_languages(self, mock_print):
         list_languages()
         expected_calls = [
             call(),
-            call("Language     ISO  QID    "),
-            call("-----------------------"),
-            call("English      en   Q1860  "),
-            call("French       fr   Q150   "),
-            call("German       de   Q188   "),
-            call("Italian      it   Q652   "),
-            call("Portuguese   pt   Q5146  "),
-            call("Russian      ru   Q7737  "),
-            call("Spanish      es   Q1321  "),
-            call("Swedish      sv   Q9027  "),
-            call("-----------------------"),
+            call("Language     ISO   QID      "),
+            call("--------------------------"),
+            call("Arabic       ar    Q13955   "),
+            call("Basque       eu    Q8752    "),
+            call("Bengali      bn    Q9610    "),
+            call("Bokmål       nb    Q25167   "),
+            call("Czech        cs    Q9056    "),
+            call("Danish       da    Q9035    "),
+            call("English      en    Q1860    "),
+            call("Esperanto    eo    Q143     "),
+            call("Estonian     et    Q9072    "),
+            call("Finnish      fi    Q1412    "),
+            call("French       fr    Q150     "),
+            call("German       de    Q188     "),
+            call("Greek        el    Q36510   "),
+            call("Gurmukhi     pa    Q58635   "),
+            call("Hausa        ha    Q56475   "),
+            call("Hebrew       he    Q9288    "),
+            call("Hindi        hi    Q11051   "),
+            call("Indonesian   id    Q9240    "),
+            call("Italian      it    Q652     "),
+            call("Japanese     ja    Q5287    "),
+            call("Kurmanji     kmr   Q36163   "),
+            call("Latin        la    Q397     "),
+            call("Malay        ms    Q9237    "),
+            call("Malayalam    ml    Q36236   "),
+            call("Mandarin     zh    Q727694  "),
+            call("Nigerian     pi    Q33655   "),
+            call("Nynorsk      nn    Q25164   "),
+            call("Polish       pl    Q809     "),
+            call("Portuguese   pt    Q5146    "),
+            call("Russian      ru    Q7737    "),
+            call("Shahmukhi    pnb   Q58635   "),
+            call("Slovak       sk    Q9058    "),
+            call("Spanish      es    Q1321    "),
+            call("Swahili      sw    Q7838    "),
+            call("Swedish      sv    Q9027    "),
+            call("Tajik        tg    Q9260    "),
+            call("Tamil        ta    Q5885    "),
+            call("Ukrainian    ua    Q8798    "),
+            call("Urdu         ur    Q11051   "),
+            call("Yoruba       yo    Q34311   "),
+            call("--------------------------"),
             call(),
         ]
         mock_print.assert_has_calls(expected_calls)
@@ -80,6 +112,7 @@ def test_list_data_types_specific_language(self, mock_print):
             call("Available data types: English"),
             call("-----------------------------"),
             call("adjectives"),
+            call("adverbs"),
             call("emoji-keywords"),
             call("nouns"),
             call("verbs"),

From 881c0553ece0246a7910cf2285f1d80b1013b1a4 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Wed, 16 Oct 2024 20:28:44 +0300
Subject: [PATCH 25/42] removing .capitalize method since it's already
 implemented inside languages listing functions

---
 src/scribe_data/cli/list.py |  6 ++---
 tests/cli/test_list.py      | 52 ++++++++++++++++++++++++++++++-------
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 447d59060..ee3311ede 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -137,11 +137,11 @@ def list_languages_for_data_type(data_type: str) -> None:
     available_languages = []
     for lang in all_languages:
         lang = format_sublanguage_name(lang, language_metadata)
-        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize()
+        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
         if language_dir.is_dir():
             dt_path = language_dir / data_type
             if dt_path.exists():
-                available_languages.append(lang["language"])
+                available_languages.append(lang)
 
     available_languages.sort()
     table_header = f"Available languages: {data_type}"
@@ -154,7 +154,7 @@ def list_languages_for_data_type(data_type: str) -> None:
     print("-" * table_line_length)
 
     for lang in available_languages:
-        print(f"{lang.capitalize()}")
+        print(f"{lang}")
 
     print("-" * table_line_length)
     print()
diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py
index 3933082f6..cad0fa549 100644
--- a/tests/cli/test_list.py
+++ b/tests/cli/test_list.py
@@ -97,6 +97,8 @@ def test_list_data_types_all_languages(self, mock_print):
             call("adverbs"),
             call("emoji-keywords"),
             call("nouns"),
+            call("personal-pronouns"),
+            call("postpositions"),
             call("prepositions"),
             call("verbs"),
             call("-----------------------------------"),
@@ -175,16 +177,48 @@ def test_list_languages_for_data_type_valid(self, mock_print):
         list_languages_for_data_type("nouns")
         expected_calls = [
             call(),
-            call("Available languages: nouns"),
+            call("Language     ISO   QID      "),
             call("--------------------------"),
-            call("English"),
-            call("French"),
-            call("German"),
-            call("Italian"),
-            call("Portuguese"),
-            call("Russian"),
-            call("Spanish"),
-            call("Swedish"),
+            call("Arabic       ar    Q13955   "),
+            call("Basque       eu    Q8752    "),
+            call("Bengali      bn    Q9610    "),
+            call("Bokmål       nb    Q25167   "),
+            call("Czech        cs    Q9056    "),
+            call("Danish       da    Q9035    "),
+            call("English      en    Q1860    "),
+            call("Esperanto    eo    Q143     "),
+            call("Estonian     et    Q9072    "),
+            call("Finnish      fi    Q1412    "),
+            call("French       fr    Q150     "),
+            call("German       de    Q188     "),
+            call("Greek        el    Q36510   "),
+            call("Gurmukhi     pa    Q58635   "),
+            call("Hausa        ha    Q56475   "),
+            call("Hebrew       he    Q9288    "),
+            call("Hindi        hi    Q11051   "),
+            call("Indonesian   id    Q9240    "),
+            call("Italian      it    Q652     "),
+            call("Japanese     ja    Q5287    "),
+            call("Kurmanji     kmr   Q36163   "),
+            call("Latin        la    Q397     "),
+            call("Malay        ms    Q9237    "),
+            call("Malayalam    ml    Q36236   "),
+            call("Mandarin     zh    Q727694  "),
+            call("Nigerian     pi    Q33655   "),
+            call("Nynorsk      nn    Q25164   "),
+            call("Polish       pl    Q809     "),
+            call("Portuguese   pt    Q5146    "),
+            call("Russian      ru    Q7737    "),
+            call("Shahmukhi    pnb   Q58635   "),
+            call("Slovak       sk    Q9058    "),
+            call("Spanish      es    Q1321    "),
+            call("Swahili      sw    Q7838    "),
+            call("Swedish      sv    Q9027    "),
+            call("Tajik        tg    Q9260    "),
+            call("Tamil        ta    Q5885    "),
+            call("Ukrainian    ua    Q8798    "),
+            call("Urdu         ur    Q11051   "),
+            call("Yoruba       yo    Q34311   "),
             call("--------------------------"),
             call(),
         ]

From fed80b391b073fa8adc7657020236ab118cdc84a Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Wed, 16 Oct 2024 21:35:09 +0300
Subject: [PATCH 26/42] Updating test cases in test_list.py file to match newly
 added languages

---
 tests/cli/test_list.py | 82 +++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py
index cad0fa549..bc31f38f2 100644
--- a/tests/cli/test_list.py
+++ b/tests/cli/test_list.py
@@ -177,48 +177,48 @@ def test_list_languages_for_data_type_valid(self, mock_print):
         list_languages_for_data_type("nouns")
         expected_calls = [
             call(),
-            call("Language     ISO   QID      "),
+            call("Available languages: nouns"),
             call("--------------------------"),
-            call("Arabic       ar    Q13955   "),
-            call("Basque       eu    Q8752    "),
-            call("Bengali      bn    Q9610    "),
-            call("Bokmål       nb    Q25167   "),
-            call("Czech        cs    Q9056    "),
-            call("Danish       da    Q9035    "),
-            call("English      en    Q1860    "),
-            call("Esperanto    eo    Q143     "),
-            call("Estonian     et    Q9072    "),
-            call("Finnish      fi    Q1412    "),
-            call("French       fr    Q150     "),
-            call("German       de    Q188     "),
-            call("Greek        el    Q36510   "),
-            call("Gurmukhi     pa    Q58635   "),
-            call("Hausa        ha    Q56475   "),
-            call("Hebrew       he    Q9288    "),
-            call("Hindi        hi    Q11051   "),
-            call("Indonesian   id    Q9240    "),
-            call("Italian      it    Q652     "),
-            call("Japanese     ja    Q5287    "),
-            call("Kurmanji     kmr   Q36163   "),
-            call("Latin        la    Q397     "),
-            call("Malay        ms    Q9237    "),
-            call("Malayalam    ml    Q36236   "),
-            call("Mandarin     zh    Q727694  "),
-            call("Nigerian     pi    Q33655   "),
-            call("Nynorsk      nn    Q25164   "),
-            call("Polish       pl    Q809     "),
-            call("Portuguese   pt    Q5146    "),
-            call("Russian      ru    Q7737    "),
-            call("Shahmukhi    pnb   Q58635   "),
-            call("Slovak       sk    Q9058    "),
-            call("Spanish      es    Q1321    "),
-            call("Swahili      sw    Q7838    "),
-            call("Swedish      sv    Q9027    "),
-            call("Tajik        tg    Q9260    "),
-            call("Tamil        ta    Q5885    "),
-            call("Ukrainian    ua    Q8798    "),
-            call("Urdu         ur    Q11051   "),
-            call("Yoruba       yo    Q34311   "),
+            call("Arabic"),
+            call("Basque"),
+            call("Bengali"),
+            call("Chinese/Mandarin"),
+            call("Czech"),
+            call("Danish"),
+            call("English"),
+            call("Esperanto"),
+            call("Estonian"),
+            call("Finnish"),
+            call("French"),
+            call("German"),
+            call("Greek"),
+            call("Hausa"),
+            call("Hebrew"),
+            call("Hindustani/Hindi"),
+            call("Hindustani/Urdu"),
+            call("Indonesian"),
+            call("Italian"),
+            call("Japanese"),
+            call("Kurmanji"),
+            call("Latin"),
+            call("Malay"),
+            call("Malayalam"),
+            call("Norwegian/Bokmål"),
+            call("Norwegian/Nynorsk"),
+            call("Pidgin/Nigerian"),
+            call("Polish"),
+            call("Portuguese"),
+            call("Punjabi/Gurmukhi"),
+            call("Punjabi/Shahmukhi"),
+            call("Russian"),
+            call("Slovak"),
+            call("Spanish"),
+            call("Swahili"),
+            call("Swedish"),
+            call("Tajik"),
+            call("Tamil"),
+            call("Ukrainian"),
+            call("Yoruba"),
             call("--------------------------"),
             call(),
         ]

From e6140e5052d2994bd6ff5da78a11e63448d144c7 Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Thu, 17 Oct 2024 00:31:59 +0300
Subject: [PATCH 27/42] Update test cases to include sub-languages

- Updated all test cases to account for sub-languages.
- Removed tests test_get_language_words_to_remove and test_get_language_words_to_ignore, as these functions were deleted from utils.py and the languages metadata files
---
 tests/load/test_update_utils.py | 123 ++++++++++----------------------
 1 file changed, 36 insertions(+), 87 deletions(-)

diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index 638ee09dd..489abc4b8 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -38,14 +38,46 @@ def test_get_scribe_languages():
     test_case.assertCountEqual(
         utils.get_scribe_languages(),
         [
+            "Arabic",
+            "Basque",
+            "Bengali",
+            "Bokmål",
+            "Czech",
+            "Danish",
             "English",
+            "Esperanto",
+            "Estonian",
+            "Finnish",
             "French",
             "German",
+            "Greek",
+            "Gurmukhi",
+            "Hausa",
+            "Hebrew",
+            "Hindi",
+            "Indonesian",
             "Italian",
+            "Japanese",
+            "Kurmanji",
+            "Latin",
+            "Malay",
+            "Malayalam",
+            "Mandarin",
+            "Nigerian",
+            "Nynorsk",
+            "Polish",
             "Portuguese",
             "Russian",
+            "Shahmukhi",
+            "Slovak",
             "Spanish",
+            "Swahili",
             "Swedish",
+            "Tajik",
+            "Tamil",
+            "Ukrainian",
+            "Urdu",
+            "Yoruba",
         ],
     )
 
@@ -61,6 +93,7 @@ def test_get_scribe_languages():
         ("russian", "Q7737"),
         ("spanish", "Q1321"),
         ("swedish", "Q9027"),
+        ("bokmål", "Q25167"),
     ],
 )
 def test_get_language_qid_positive(language, qid_code):
@@ -88,6 +121,7 @@ def test_get_language_qid_negative():
         ("russian", "ru"),
         ("spanish", "es"),
         ("SwedisH", "sv"),
+        ("bokmål", "nb"),
     ],
 )
 def test_get_language_iso_positive(language, iso_code):
@@ -100,7 +134,7 @@ def test_get_language_iso_negative():
 
     assert (
         str(excp.value)
-        == "Gibberish is currently not a supported language for ISO conversion."
+        == "GIBBERISH is currently not a supported language for ISO conversion."
     )
 
 
@@ -115,6 +149,7 @@ def test_get_language_iso_negative():
         ("ru", "Russian"),
         ("es", "Spanish"),
         ("sv", "Swedish"),
+        ("nb", "Bokmål"),
     ],
 )
 def test_get_language_from_iso_positive(iso_code, language):
@@ -128,92 +163,6 @@ def test_get_language_from_iso_negative():
     assert str(excp.value) == "IXI is currently not a supported ISO language."
 
 
-@pytest.mark.parametrize(
-    "language, remove_words",
-    [
-        (
-            "english",
-            [
-                "of",
-                "the",
-                "The",
-                "and",
-            ],
-        ),
-        (
-            "french",
-            [
-                "of",
-                "the",
-                "The",
-                "and",
-            ],
-        ),
-        ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]),
-        ("italian", ["of", "the", "The", "and", "text", "from"]),
-        ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]),
-        (
-            "russian",
-            [
-                "of",
-                "the",
-                "The",
-                "and",
-            ],
-        ),
-        ("spanish", ["of", "the", "The", "and"]),
-        ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]),
-    ],
-)
-def test_get_language_words_to_remove(language, remove_words):
-    test_case = unittest.TestCase()
-
-    # ignore order, only content matters
-    test_case.assertCountEqual(
-        utils.get_language_words_to_remove(language), remove_words
-    )
-
-
-def test_get_language_words_to_remove_negative():
-    with pytest.raises(ValueError) as excp:
-        _ = utils.get_language_words_to_remove("python")
-
-    assert str(excp.value) == "Python is currently not a supported language."
-
-
-@pytest.mark.parametrize(
-    "language, ignore_words",
-    [
-        (
-            "french",
-            [
-                "XXe",
-            ],
-        ),
-        ("german", ["Gemeinde", "Familienname"]),
-        ("italian", ["The", "ATP"]),
-        ("portuguese", []),
-        ("russian", []),
-        ("spanish", []),
-        ("swedish", ["databasdump"]),
-    ],
-)
-def test_get_language_words_to_ignore(language, ignore_words):
-    test_case = unittest.TestCase()
-
-    # ignore order, only content matters
-    test_case.assertCountEqual(
-        utils.get_language_words_to_ignore(language), ignore_words
-    )
-
-
-def test_get_language_words_to_ignore_negative():
-    with pytest.raises(ValueError) as excp:
-        _ = utils.get_language_words_to_ignore("JAVA")
-
-    assert str(excp.value) == "Java is currently not a supported language."
-
-
 def test_get_ios_data_path():
     assert (
         utils.get_ios_data_path("suomi")

From 22791cec7696ff87b086d772f1b4d6ed07eff3ad Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Thu, 17 Oct 2024 01:37:28 +0300
Subject: [PATCH 28/42] Updated the get_language_from_iso function to depend on
 the JSON file. Made the language_metadata parameter optional in two
 functions. Added a ValueError exception when a language is not found.

---
 src/scribe_data/utils.py | 47 +++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index b4da68647..df22a9a9a 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -26,8 +26,6 @@
 from pathlib import Path
 from typing import Any, Optional
 
-from iso639 import Lang
-from iso639.exceptions import DeprecatedLanguageValue
 
 PROJECT_ROOT = "Scribe-Data"
 DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export"
@@ -198,13 +196,20 @@ def get_language_from_iso(iso: str) -> str:
         str
             The name for the language which has an ISO value of iso.
     """
-    try:
-        language_name = str(Lang(iso.lower()).name)
-    except DeprecatedLanguageValue as e:
-        raise ValueError(
-            f"{iso.upper()} is currently not a supported ISO language."
-        ) from e
-    return language_name
+    # Iterate over the languages and their properties
+    for language, properties in _languages.items():
+        # Check if the current language's ISO matches the provided ISO
+        if properties.get("iso") == iso:
+            return language.capitalize()
+
+        # If there are sub-languages, check those as well
+        if "sub_languages" in properties:
+            for sub_lang, sub_properties in properties["sub_languages"].items():
+                if sub_properties.get("iso") == iso:
+                    return sub_lang.capitalize()
+
+    # If no match is found, raise a ValueError
+    raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")
 
 
 def load_queried_data(
@@ -490,10 +495,10 @@ def order_annotations(annotation: str) -> str:
     return "/".join(annotation_split)
 
 
-def format_sublanguage_name(lang, language_metadata):
+def format_sublanguage_name(lang, language_metadata=_languages):
     """
     Formats the name of a sub-language by appending its main language
-    in the format 'mainlang/sublang'. If the language is not a sub-language,
+    in the format 'Mainlang/Sublang'. If the language is not a sub-language,
     the original language name is returned as-is.
 
     Args:
@@ -503,30 +508,36 @@ def format_sublanguage_name(lang, language_metadata):
 
     Returns:
         str: The formatted language name if it's a sub-language
-             (e.g., 'norwegian/nynorsk'), otherwise the original name.
+             (e.g., 'Norwegian/Nynorsk'), otherwise the original name.
+
+    Raises:
+        ValueError: If the provided language or sub-language is not found.
 
     Example:
         format_sublanguage_name("nynorsk", language_metadata)
-        'norwegian/nynorsk'
+        'Norwegian/Nynorsk'
 
         format_sublanguage_name("english", language_metadata)
-        'english'
+        'English'
     """
     # Iterate through the main languages in the metadata
     for main_lang, lang_data in language_metadata.items():
+        # If it's not a sub-language, return the original name
+        if main_lang == lang.lower():
+            return lang.capitalize()
         # Check if the main language has sub-languages
         if "sub_languages" in lang_data:
             # Check if the provided language is a sub-language
             for sub_lang in lang_data["sub_languages"]:
                 if lang.lower() == sub_lang.lower():
-                    # Return the formatted name mainlang/sublang
+                    # Return the formatted name Mainlang/Sublang
                     return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
 
-    # If it's not a sub-language, return the original name
-    return lang.capitalize()
+    # Raise ValueError if no match is found
+    raise ValueError(f"{lang.upper()} is not a valid language or sub-language.")
 
 
-def list_all_languages(language_metadata):
+def list_all_languages(language_metadata=_languages):
     """List all languages from the provided metadata dictionary, including sub-languages."""
     current_languages = []
 

From 1416134a84c99227998212fb13bc5fa83d29c66b Mon Sep 17 00:00:00 2001
From: Omar Agiez <omaragiez3@gmail.com>
Date: Thu, 17 Oct 2024 01:39:25 +0300
Subject: [PATCH 29/42] Add unit tests for language formatting and listing: -
 Positive and negative tests for format_sublanguage_name - Test to validate
 the output of list_all_languages

---
 tests/load/test_update_utils.py | 66 +++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index 489abc4b8..df37317a3 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -163,6 +163,72 @@ def test_get_language_from_iso_negative():
     assert str(excp.value) == "IXI is currently not a supported ISO language."
 
 
+@pytest.mark.parametrize(
+    "lang, expected_output",
+    [
+        ("nynorsk", "Norwegian/Nynorsk"),
+        ("bokmål", "Norwegian/Bokmål"),
+        ("english", "English"),
+    ],
+)
+def test_format_sublanguage_name_positive(lang, expected_output):
+    assert utils.format_sublanguage_name(lang) == expected_output
+
+
+def test_format_sublanguage_name_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.format_sublanguage_name("soccer")
+
+    assert str(excp.value) == "SOCCER is not a valid language or sub-language."
+
+
+def test_list_all_languages():
+    expected_languages = [
+        "arabic",
+        "basque",
+        "bengali",
+        "czech",
+        "danish",
+        "english",
+        "esperanto",
+        "estonian",
+        "finnish",
+        "french",
+        "german",
+        "greek",
+        "hausa",
+        "hebrew",
+        "hindi",
+        "urdu",
+        "indonesian",
+        "italian",
+        "japanese",
+        "kurmanji",
+        "latin",
+        "malay",
+        "malayalam",
+        "mandarin",
+        "nynorsk",
+        "bokmål",
+        "nigerian",
+        "polish",
+        "portuguese",
+        "shahmukhi",
+        "gurmukhi",
+        "russian",
+        "slovak",
+        "spanish",
+        "swahili",
+        "swedish",
+        "tajik",
+        "tamil",
+        "ukrainian",
+        "yoruba",
+    ]
+
+    assert utils.list_all_languages() == expected_languages
+
+
 def test_get_ios_data_path():
     assert (
         utils.get_ios_data_path("suomi")

From fff64278b731ed860cf7507320194359944ee706 Mon Sep 17 00:00:00 2001
From: Ebeleokolo <ebele.okolo@decagon.dev>
Date: Wed, 16 Oct 2024 23:35:55 -0400
Subject: [PATCH 30/42] Add Finnish verbs query

---
 .../Finnish/verbs/query_verbs.sparql          | 133 +++++++++++++++++-
 1 file changed, 132 insertions(+), 1 deletion(-)

diff --git a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
index 949500ea2..b1a44c354 100644
--- a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
+++ b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
@@ -1,13 +1,144 @@
+PREFIX wd: <http://www.wikidata.org/entity/>
+PREFIX wikibase: <http://wikiba.se/ontology#>
+PREFIX dct: <http://purl.org/dc/terms/>
+PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
 # tool: scribe-data
-# All Finnish (Q1412) verbs and the given forms.
+# All Finnish (Q1412) verbs and their forms.
 # Enter this query at https://query.wikidata.org/.
 
 SELECT
   (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
   ?verb
+  ?infinitiveI
+  ?presIndSg1
+  ?imperativeSg2
+  ?passivePresent
 
 WHERE {
   ?lexeme dct:language wd:Q1412 ;
     wikibase:lexicalCategory wd:Q24905 ;
     wikibase:lemma ?verb .
+
+  # Infinitives
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?infinitiveIForm .
+    ?infinitiveIForm ontolex:representation ?infinitiveI ;
+      wikibase:grammaticalFeature wd:Q179230 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?infinitiveIIForm .
+    ?infinitiveIIForm ontolex:representation ?infinitiveII ;
+      wikibase:grammaticalFeature wd:Q179230 ;
+      wikibase:grammaticalFeature wd:Q66596723 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?infinitiveIIIForm .
+    ?infinitiveIIIForm ontolex:representation ?infinitiveIII ;
+      wikibase:grammaticalFeature wd:Q179230 ;
+      wikibase:grammaticalFeature wd:Q66596786 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?infinitiveIVForm .
+    ?infinitiveIVForm ontolex:representation ?infinitiveIV ;
+      wikibase:grammaticalFeature wd:Q179230 ;
+      wikibase:grammaticalFeature wd:Q66596828 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?infinitiveVForm .
+    ?infinitiveVForm ontolex:representation ?infinitiveV ;
+      wikibase:grammaticalFeature wd:Q179230 ;
+      wikibase:grammaticalFeature wd:Q66596870 .
+  }
+
+  # Present Indicative
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?presIndSg1Form .
+    ?presIndSg1Form ontolex:representation ?presIndSg1 ;
+      wikibase:grammaticalFeature wd:Q192613 ;
+      wikibase:grammaticalFeature wd:Q21714344 ;
+      wikibase:grammaticalFeature wd:Q110786 .
+  }
+
+  # Past Indicative
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?pastIndSg1Form .
+    ?pastIndSg1Form ontolex:representation ?pastIndSg1 ;
+      wikibase:grammaticalFeature wd:Q1240211 ;
+      wikibase:grammaticalFeature wd:Q21714344 ;
+      wikibase:grammaticalFeature wd:Q110786 .
+  }
+
+  # Conditional
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?conditionalSg1Form .
+    ?conditionalSg1Form ontolex:representation ?conditionalSg1 ;
+      wikibase:grammaticalFeature wd:Q52824793 ;
+      wikibase:grammaticalFeature wd:Q21714344 ;
+      wikibase:grammaticalFeature wd:Q110786 .
+  }
+
+  # Potential
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?potentialSg1Form .
+    ?potentialSg1Form ontolex:representation ?potentialSg1 ;
+      wikibase:grammaticalFeature wd:Q696092 ;
+      wikibase:grammaticalFeature wd:Q21714344 ;
+      wikibase:grammaticalFeature wd:Q110786 .
+  }
+
+  # Imperative
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?imperativeSg2Form .
+    ?imperativeSg2Form ontolex:representation ?imperativeSg2 ;
+      wikibase:grammaticalFeature wd:Q22716 ;
+      wikibase:grammaticalFeature wd:Q51929049 ;
+      wikibase:grammaticalFeature wd:Q110786 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?imperativePl2Form .
+    ?imperativePl2Form ontolex:representation ?imperativePl2 ;
+      wikibase:grammaticalFeature wd:Q22716 ;
+      wikibase:grammaticalFeature wd:Q51929049 ;
+      wikibase:grammaticalFeature wd:Q146786 .
+  }
+
+  # Participles
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?activePresParticipleForm .
+    ?activePresParticipleForm ontolex:representation ?activePresParticiple ;
+      wikibase:grammaticalFeature wd:Q814722 ;
+      wikibase:grammaticalFeature wd:Q1317831 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?activePastParticipleForm .
+    ?activePastParticipleForm ontolex:representation ?activePastParticiple ;
+      wikibase:grammaticalFeature wd:Q12612262 ;
+      wikibase:grammaticalFeature wd:Q1317831 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?passivePresParticipleForm .
+    ?passivePresParticipleForm ontolex:representation ?passivePresParticiple ;
+      wikibase:grammaticalFeature wd:Q814722 ;
+      wikibase:grammaticalFeature wd:Q1194697 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?passivePastParticipleForm .
+    ?passivePastParticipleForm ontolex:representation ?passivePastParticiple ;
+      wikibase:grammaticalFeature wd:Q12612262 ;
+      wikibase:grammaticalFeature wd:Q1194697 .
+  }
+
+  # Passive forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?passivePresentForm .
+    ?passivePresentForm ontolex:representation ?passivePresent ;
+      wikibase:grammaticalFeature wd:Q192613 ;
+      wikibase:grammaticalFeature wd:Q1194697 .
+  }
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?passivePastForm .
+    ?passivePastForm ontolex:representation ?passivePast ;
+      wikibase:grammaticalFeature wd:Q1240211 ;
+      wikibase:grammaticalFeature wd:Q1194697 .
+  }
 }

From 25c6bf7759b64a8b42e7da17e96340a7f11d5418 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Thu, 17 Oct 2024 09:34:19 +0200
Subject: [PATCH 31/42] Updates to Finnish verbs query

---
 .../Finnish/verbs/query_verbs.sparql          | 72 +++++++------------
 1 file changed, 26 insertions(+), 46 deletions(-)

diff --git a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
index b1a44c354..3af067d84 100644
--- a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
+++ b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql
@@ -1,18 +1,11 @@
-PREFIX wd: <http://www.wikidata.org/entity/>
-PREFIX wikibase: <http://wikiba.se/ontology#>
-PREFIX dct: <http://purl.org/dc/terms/>
-PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
 # tool: scribe-data
-# All Finnish (Q1412) verbs and their forms.
+# All Finnish (Q1412) verbs and the given forms.
 # Enter this query at https://query.wikidata.org/.
 
 SELECT
   (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
   ?verb
   ?infinitiveI
-  ?presIndSg1
-  ?imperativeSg2
-  ?passivePresent
 
 WHERE {
   ?lexeme dct:language wd:Q1412 ;
@@ -25,120 +18,107 @@ WHERE {
     ?infinitiveIForm ontolex:representation ?infinitiveI ;
       wikibase:grammaticalFeature wd:Q179230 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?infinitiveIIForm .
     ?infinitiveIIForm ontolex:representation ?infinitiveII ;
-      wikibase:grammaticalFeature wd:Q179230 ;
-      wikibase:grammaticalFeature wd:Q66596723 .
+      wikibase:grammaticalFeature wd:Q179230, wd:Q66596723 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?infinitiveIIIForm .
     ?infinitiveIIIForm ontolex:representation ?infinitiveIII ;
-      wikibase:grammaticalFeature wd:Q179230 ;
-      wikibase:grammaticalFeature wd:Q66596786 .
+      wikibase:grammaticalFeature wd:Q179230, wd:Q66596786 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?infinitiveIVForm .
     ?infinitiveIVForm ontolex:representation ?infinitiveIV ;
-      wikibase:grammaticalFeature wd:Q179230 ;
-      wikibase:grammaticalFeature wd:Q66596828 .
+      wikibase:grammaticalFeature wd:Q179230, wd:Q66596828 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?infinitiveVForm .
     ?infinitiveVForm ontolex:representation ?infinitiveV ;
-      wikibase:grammaticalFeature wd:Q179230 ;
-      wikibase:grammaticalFeature wd:Q66596870 .
+      wikibase:grammaticalFeature wd:Q179230, wd:Q66596870 .
   }
 
   # Present Indicative
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?presIndSg1Form .
     ?presIndSg1Form ontolex:representation ?presIndSg1 ;
-      wikibase:grammaticalFeature wd:Q192613 ;
-      wikibase:grammaticalFeature wd:Q21714344 ;
-      wikibase:grammaticalFeature wd:Q110786 .
+      wikibase:grammaticalFeature wd:Q192613, wd:Q21714344, wd:Q110786 .
   }
 
   # Past Indicative
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?pastIndSg1Form .
     ?pastIndSg1Form ontolex:representation ?pastIndSg1 ;
-      wikibase:grammaticalFeature wd:Q1240211 ;
-      wikibase:grammaticalFeature wd:Q21714344 ;
-      wikibase:grammaticalFeature wd:Q110786 .
+      wikibase:grammaticalFeature wd:Q1240211, wd:Q21714344, wd:Q110786 .
   }
 
   # Conditional
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?conditionalSg1Form .
     ?conditionalSg1Form ontolex:representation ?conditionalSg1 ;
-      wikibase:grammaticalFeature wd:Q52824793 ;
-      wikibase:grammaticalFeature wd:Q21714344 ;
-      wikibase:grammaticalFeature wd:Q110786 .
+      wikibase:grammaticalFeature wd:Q52824793, wd:Q21714344, wd:Q110786 .
   }
 
   # Potential
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?potentialSg1Form .
     ?potentialSg1Form ontolex:representation ?potentialSg1 ;
-      wikibase:grammaticalFeature wd:Q696092 ;
-      wikibase:grammaticalFeature wd:Q21714344 ;
-      wikibase:grammaticalFeature wd:Q110786 .
+      wikibase:grammaticalFeature wd:Q696092, wd:Q21714344, wd:Q110786 .
   }
 
   # Imperative
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?imperativeSg2Form .
     ?imperativeSg2Form ontolex:representation ?imperativeSg2 ;
-      wikibase:grammaticalFeature wd:Q22716 ;
-      wikibase:grammaticalFeature wd:Q51929049 ;
-      wikibase:grammaticalFeature wd:Q110786 .
+      wikibase:grammaticalFeature wd:Q22716, wd:Q51929049, wd:Q110786 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?imperativePl2Form .
     ?imperativePl2Form ontolex:representation ?imperativePl2 ;
-      wikibase:grammaticalFeature wd:Q22716 ;
-      wikibase:grammaticalFeature wd:Q51929049 ;
-      wikibase:grammaticalFeature wd:Q146786 .
+      wikibase:grammaticalFeature wd:Q22716, wd:Q51929049, wd:Q146786 .
   }
 
   # Participles
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?activePresParticipleForm .
     ?activePresParticipleForm ontolex:representation ?activePresParticiple ;
-      wikibase:grammaticalFeature wd:Q814722 ;
-      wikibase:grammaticalFeature wd:Q1317831 .
+      wikibase:grammaticalFeature wd:Q814722, wd:Q1317831 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?activePastParticipleForm .
     ?activePastParticipleForm ontolex:representation ?activePastParticiple ;
-      wikibase:grammaticalFeature wd:Q12612262 ;
-      wikibase:grammaticalFeature wd:Q1317831 .
+      wikibase:grammaticalFeature wd:Q12612262, wd:Q1317831 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?passivePresParticipleForm .
     ?passivePresParticipleForm ontolex:representation ?passivePresParticiple ;
-      wikibase:grammaticalFeature wd:Q814722 ;
-      wikibase:grammaticalFeature wd:Q1194697 .
+      wikibase:grammaticalFeature wd:Q814722, wd:Q1194697 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?passivePastParticipleForm .
     ?passivePastParticipleForm ontolex:representation ?passivePastParticiple ;
-      wikibase:grammaticalFeature wd:Q12612262 ;
-      wikibase:grammaticalFeature wd:Q1194697 .
+      wikibase:grammaticalFeature wd:Q12612262, wd:Q1194697 .
   }
 
   # Passive forms
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?passivePresentForm .
     ?passivePresentForm ontolex:representation ?passivePresent ;
-      wikibase:grammaticalFeature wd:Q192613 ;
-      wikibase:grammaticalFeature wd:Q1194697 .
+      wikibase:grammaticalFeature wd:Q192613, wd:Q1194697 .
   }
+
   OPTIONAL {
     ?lexeme ontolex:lexicalForm ?passivePastForm .
     ?passivePastForm ontolex:representation ?passivePast ;
-      wikibase:grammaticalFeature wd:Q1240211 ;
-      wikibase:grammaticalFeature wd:Q1194697 .
+      wikibase:grammaticalFeature wd:Q1240211, wd:Q1194697 .
   }
 }

From 13f4728f84acad890404656c6dab13df1d2f246b Mon Sep 17 00:00:00 2001
From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com>
Date: Thu, 17 Oct 2024 12:33:41 +0100
Subject: [PATCH 32/42] Update query_adverbs.sparql

added comparative
---
 .../Spanish/adverbs/query_adverbs.sparql                 | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
index 2abb5033f..8188fc5e8 100644
--- a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
+++ b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
@@ -7,6 +7,7 @@ SELECT
   ?adverb
   ?diminutive
   ?superlative
+  ?comparative
 
 WHERE {
   ?lexeme dct:language wd:Q1321 ;
@@ -28,4 +29,12 @@ WHERE {
     ?superlativeForm ontolex:representation ?superlative ;
       wikibase:grammaticalFeature wd:Q1817208 .
   }
+
+  # MARK: Comparative
+
+  OPTIONAL {
+     ?lexeme ontolex:lexicalForm ?comparativeForm .
+     ?comparativeForm ontolex:representation ?comparative ;
+      wikibase:grammaticalFeature wd:Q14169499 .
+  }
 }

From 93c254c3ef8ac861bc01b180cab93e8d93fd9045 Mon Sep 17 00:00:00 2001
From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com>
Date: Thu, 17 Oct 2024 14:44:07 +0100
Subject: [PATCH 33/42] Create query_verbs.sparql

I noticed that there was no folder for Igbo.
---
 .../Igbo/verbs/query_verbs.sparql                   | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql

diff --git a/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql
new file mode 100644
index 000000000..6b59644f3
--- /dev/null
+++ b/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql
@@ -0,0 +1,13 @@
+# tool: scribe-data
+# All Igbo (Q33578) verbs and the given forms.
+# Enter this query at https://query.wikidata.org/.
+
+SELECT
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?verb
+
+WHERE {
+  ?lexeme dct:language wd:Q33578 ;
+    wikibase:lexicalCategory wd:Q24905 ;
+    wikibase:lemma ?verb .
+ }

From 7eab5dabaea411323d5bd2c84398d2fdacb6acc0 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Thu, 17 Oct 2024 21:07:11 +0200
Subject: [PATCH 34/42] Add Igbo to the languages check

---
 src/scribe_data/check/check_project_structure.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py
index 4c58478a8..3313d0350 100644
--- a/src/scribe_data/check/check_project_structure.py
+++ b/src/scribe_data/check/check_project_structure.py
@@ -40,6 +40,7 @@
     "Malay",
     "Punjabi",
     "Tajik",
+    "Igbo",
 }
 
 DATA_TYPES = {

From ac99582c2c6074a64a28162d003a330689949a74 Mon Sep 17 00:00:00 2001
From: gicharuelvis <gicharuelvis@gmail.com>
Date: Fri, 18 Oct 2024 00:17:39 +0300
Subject: [PATCH 35/42]  Added Swedish Adverbs

---
 .../Swedish/adverbs/query_adverbs.sparql           | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql

diff --git a/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql
new file mode 100644
index 000000000..11edd90ee
--- /dev/null
+++ b/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql
@@ -0,0 +1,14 @@
+#                                                          Adverb
+# tool: scribe-data
+# All Swedish (Q9027) adverbs and the given forms.
+# Enter this query at https://query.wikidata.org/.
+
+SELECT DISTINCT
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?adverb
+
+WHERE {
+  ?lexeme dct:language wd:Q9027 ;
+    wikibase:lexicalCategory wd:Q380057 ;
+    wikibase:lemma ?adverb .
+}
\ No newline at end of file

From dd56c2d50a746dbd5e1b63315ca67364e17813db Mon Sep 17 00:00:00 2001
From: gicharuelvis <gicharuelvis@gmail.com>
Date: Fri, 18 Oct 2024 00:37:26 +0300
Subject: [PATCH 36/42] Added Swedish Adverbs

---
 .../Swedish/adverbs/query_adverbs.sparql                         | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql
index 11edd90ee..302af2bfc 100644
--- a/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql
+++ b/src/scribe_data/language_data_extraction/Swedish/adverbs/query_adverbs.sparql
@@ -1,4 +1,3 @@
-#                                                          Adverb
 # tool: scribe-data
 # All Swedish (Q9027) adverbs and the given forms.
 # Enter this query at https://query.wikidata.org/.

From 4fd4f0fd9e899c0de22f6f9be4a204c6f561f7f1 Mon Sep 17 00:00:00 2001
From: gicharuelvis <gicharuelvis@gmail.com>
Date: Fri, 18 Oct 2024 01:08:30 +0300
Subject: [PATCH 37/42] Added Swedish Adjectives

---
 .../Swedish/adjectives/query_adjectives.sparql | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql

diff --git a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql
new file mode 100644
index 000000000..0949450ba
--- /dev/null
+++ b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql
@@ -0,0 +1,18 @@
+# tool: scribe-data
+# All Swedish (Q9027) adjectives and the given forms.
+# Enter this query at https://query.wikidata.org/.
+
+SELECT
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?adjective
+
+WHERE {
+  ?lexeme dct:language wd:Q9027 ;
+    wikibase:lexicalCategory wd:Q34698 ;
+    wikibase:lemma ?lemma .
+
+  SERVICE wikibase:label {
+    bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
+    ?lemma rdfs:label ?adjective .
+  }
+}

From 9284cfe8a04fbf4440aecea8aee571ca9517152d Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Fri, 18 Oct 2024 00:33:25 +0200
Subject: [PATCH 38/42] Remove label service from adjectives query

---
 .../Swedish/adjectives/query_adjectives.sparql             | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql
index 0949450ba..0bef8ebab 100644
--- a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql
+++ b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql
@@ -9,10 +9,5 @@ SELECT
 WHERE {
   ?lexeme dct:language wd:Q9027 ;
     wikibase:lexicalCategory wd:Q34698 ;
-    wikibase:lemma ?lemma .
-
-  SERVICE wikibase:label {
-    bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
-    ?lemma rdfs:label ?adjective .
-  }
+    wikibase:lemma ?adjective .
 }

From 7201596da68b6b5252c6980f45e95b7547780f78 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Fri, 18 Oct 2024 00:45:43 +0200
Subject: [PATCH 39/42] Remove forms that were accidentally added

---
 .../Spanish/adverbs/query_adverbs.sparql      | 27 -------------------
 1 file changed, 27 deletions(-)

diff --git a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
index 8188fc5e8..084da843f 100644
--- a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
+++ b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql
@@ -5,36 +5,9 @@
 SELECT
   (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
   ?adverb
-  ?diminutive
-  ?superlative
-  ?comparative
 
 WHERE {
   ?lexeme dct:language wd:Q1321 ;
     wikibase:lexicalCategory wd:Q380057 ;
     wikibase:lemma ?adverb .
-
-  # MARK: Diminutive
-
-  OPTIONAL {
-    ?lexeme ontolex:lexicalForm ?diminutiveForm .
-    ?diminutiveForm ontolex:representation ?diminutive ;
-      wikibase:grammaticalFeature wd:Q108709 .
-  }
-
-  # MARK: Superlative
-
-  OPTIONAL {
-    ?lexeme ontolex:lexicalForm ?superlativeForm .
-    ?superlativeForm ontolex:representation ?superlative ;
-      wikibase:grammaticalFeature wd:Q1817208 .
-  }
-
-  # MARK: Comparative
-
-  OPTIONAL {
-     ?lexeme ontolex:lexicalForm ?comparativeForm .
-     ?comparativeForm ontolex:representation ?comparative ;
-      wikibase:grammaticalFeature wd:Q14169499 .
-  }
 }

From 7502f49c2efe4b742a0369d18f41897b4aa12d4c Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Fri, 18 Oct 2024 00:49:42 +0200
Subject: [PATCH 40/42] Minor changes to unicode setup docs

---
 src/scribe_data/unicode/UNICODE_INSTALLTION.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/scribe_data/unicode/UNICODE_INSTALLTION.md b/src/scribe_data/unicode/UNICODE_INSTALLTION.md
index dfb4e1e4f..67d4ffb83 100644
--- a/src/scribe_data/unicode/UNICODE_INSTALLTION.md
+++ b/src/scribe_data/unicode/UNICODE_INSTALLTION.md
@@ -4,7 +4,9 @@ The Scribe-Data Unicode process is powered by [cldr-json](https://github.com/uni
 
 Please see the [installation guide for PyICU](https://gitlab.pyicu.org/main/pyicu#installing-pyicu) as the extension must be linked to ICU on your machine to work properly.
 
-Note that some of the commands may be incorrect. On macOS you may need to do the following:
+## macOS Support
+
+Note that some of the commands in the installation guide may be incorrect. On macOS you may need to do the following:
 
 ```bash
 # Instead of:
@@ -16,7 +18,7 @@ echo "/opt/homebrew/opt/icu4c/bin:/opt/homebrew/opt/icu4c/sbin:$PATH"
 echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/opt/homebrew/opt/icu4c/lib/pkgconfig"
 ```
 
-# Installing PyICU for Emoji Support on Windows
+## Windows Support
 
 This guide provides step-by-step instructions on how to install the PyICU library, which is essential for proper emoji support on Windows.
 
@@ -25,7 +27,7 @@ This guide provides step-by-step instructions on how to install the PyICU librar
 1. Visit the [PyICU Release Page](https://github.com/cgohlke/pyicu-build/releases).
 2. Locate and download the wheel (`.whl`) file that matches your Python version. Make sure to select the correct architecture (e.g., `win_amd64` for 64-bit Python).
 
-## Set Up a Virtual Environment
+### Set Up a Virtual Environment
 
 If you haven't already, You can do this with the following command:
 
@@ -37,7 +39,7 @@ python -m venv venv
 venv\Scripts\activate
 ```
 
-## Install PyICU
+### Install PyICU
 
 ```bash
 # Replace 'PyICU-2.13-cp312-cp312-win_amd64.whl' with the actual filename you downloaded

From eec462236b62418473472c35378a9971657b65ed Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Fri, 18 Oct 2024 00:50:18 +0200
Subject: [PATCH 41/42] Minor header change to unicode docs headers

---
 src/scribe_data/unicode/UNICODE_INSTALLTION.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scribe_data/unicode/UNICODE_INSTALLTION.md b/src/scribe_data/unicode/UNICODE_INSTALLTION.md
index 67d4ffb83..2dbe323be 100644
--- a/src/scribe_data/unicode/UNICODE_INSTALLTION.md
+++ b/src/scribe_data/unicode/UNICODE_INSTALLTION.md
@@ -22,7 +22,7 @@ echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/opt/homebrew/opt/icu4c/lib/pkgconfig"
 
 This guide provides step-by-step instructions on how to install the PyICU library, which is essential for proper emoji support on Windows.
 
-## Download the PyICU Wheel File
+### Download the PyICU Wheel File
 
 1. Visit the [PyICU Release Page](https://github.com/cgohlke/pyicu-build/releases).
 2. Locate and download the wheel (`.whl`) file that matches your Python version. Make sure to select the correct architecture (e.g., `win_amd64` for 64-bit Python).

From 661b131cff45f947d3d33eac705363bd8c0944f9 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Fri, 18 Oct 2024 03:05:02 +0200
Subject: [PATCH 42/42] Edits to language metadata and supporting functions +
 pr checklist

---
 .github/PULL_REQUEST_TEMPLATE.md              |   1 +
 CONTRIBUTING.md                               |  11 ++
 src/scribe_data/cli/cli_utils.py              |  81 +++++-----
 src/scribe_data/cli/list.py                   |   9 +-
 src/scribe_data/cli/total.py                  |  13 +-
 .../resources/language_metadata.json          |  32 ++--
 src/scribe_data/utils.py                      | 150 +++++++++---------
 tests/cli/test_utils.py                       |  10 +-
 tests/load/test_update_utils.py               |  62 +-------
 9 files changed, 158 insertions(+), 211 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index bab97a1a8..17c07e1c1 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
 <!-- Please replace the empty checkboxes [] below with checked ones [x] accordingly. -->
 
 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
+- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)
 
 ---
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 376a954a7..2e44c618e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
 - [First steps as a contributor](#first-steps)
 - [Learning the tech stack](#learning-the-tech)
 - [Development environment](#dev-env)
+- [Testing](#testing)
 - [Issues and projects](#issues-projects)
 - [Bug reports](#bug-reports)
 - [Feature requests](#feature-requests)
@@ -171,6 +172,16 @@ pip install -e .
 > [!NOTE]
 > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup!
 
+<a id="testing"></a>
+
+## Testing [`⇧`](#contents)
+
+In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up in the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:
+
+```bash
+pytest
+```
+
 <a id="issues-projects"></a>
 
 ## Issues and projects [`⇧`](#contents)
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index be2fa0f79..e39e1621d 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -27,6 +27,8 @@
 
 from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
 
+# MARK: CLI Variables
+
 LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
 
 LANGUAGE_METADATA_FILE = (
@@ -56,20 +58,21 @@
 language_map = {}
 language_to_qid = {}
 
-# Process each language and its potential sub-languages in one pass
-for lang_key, lang_data in language_metadata.items():
-    lang_key_lower = lang_key.lower()
+# Process each language and its potential sub-languages in one pass.
+for lang, lang_data in language_metadata.items():
+    lang_lower = lang.lower()
 
-    # Handle sub-languages if they exist
+    # Handle sub-languages if they exist.
     if "sub_languages" in lang_data:
-        for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
-            sub_lang_key_lower = sub_lang_key.lower()
-            language_map[sub_lang_key_lower] = sub_lang_data
-            language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
+        for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
+            sub_lang_lower = sub_lang.lower()
+            language_map[sub_lang_lower] = sub_lang_data
+            language_to_qid[sub_lang_lower] = sub_lang_data["qid"]
+
     else:
-        # Handle the main language directly
-        language_map[lang_key_lower] = lang_data
-        language_to_qid[lang_key_lower] = lang_data["qid"]
+        # Handle the main language directly.
+        language_map[lang_lower] = lang_data
+        language_to_qid[lang_lower] = lang_data["qid"]
 
 
 # MARK: Correct Inputs
@@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
     if isinstance(data, dict):
         max_key_length = max((len(key) for key in data.keys()), default=0)
 
-        if data_type == "autosuggestions":
-            for key, value in data.items():
+        for key, value in data.items():
+            if data_type == "autosuggestions":
                 print(f"{key:<{max_key_length}} : {', '.join(value)}")
 
-        elif data_type == "emoji_keywords":
-            for key, value in data.items():
+            elif data_type == "emoji_keywords":
                 emojis = [item["emoji"] for item in value]
                 print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
 
-        elif data_type in {"prepositions"}:
-            for key, value in data.items():
+            elif data_type in {"prepositions"}:
                 print(f"{key:<{max_key_length}} : {value}")
 
-        else:
-            for key, value in data.items():
-                if isinstance(value, dict):
-                    print(f"{key:<{max_key_length}} : ")
-                    max_sub_key_length = max(
-                        (len(sub_key) for sub_key in value.keys()), default=0
-                    )
-                    for sub_key, sub_value in value.items():
-                        print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
-
-                elif isinstance(value, list):
-                    print(f"{key:<{max_key_length}} : ")
-                    for item in value:
-                        if isinstance(item, dict):
-                            for sub_key, sub_value in item.items():
-                                print(f"  {sub_key:<{max_key_length}} : {sub_value}")
-
-                        else:
-                            print(f"  {item}")
-
-                else:
-                    print(f"{key:<{max_key_length}} : {value}")
+            elif isinstance(value, dict):
+                print(f"{key:<{max_key_length}} : ")
+                max_sub_key_length = max(
+                    (len(sub_key) for sub_key in value.keys()), default=0
+                )
+                for sub_key, sub_value in value.items():
+                    print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+            elif isinstance(value, list):
+                print(f"{key:<{max_key_length}} : ")
+                for item in value:
+                    if isinstance(item, dict):
+                        for sub_key, sub_value in item.items():
+                            print(f"  {sub_key:<{max_key_length}} : {sub_value}")
+
+                    else:
+                        print(f"  {item}")
+
+            else:
+                print(f"{key:<{max_key_length}} : {value}")
 
     elif isinstance(data, list):
         for item in data:
@@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
         ):
             closest_match = difflib.get_close_matches(item, valid_options, n=1)
             closest_match_str = (
-                f" The closest matching {item_type} is {closest_match[0]}."
+                f" The closest matching {item_type} is '{closest_match[0]}'."
                 if closest_match
                 else ""
             )
 
-            return f"Invalid {item_type} {item}.{closest_match_str}"
+            return f"Invalid {item_type} '{item}'.{closest_match_str}"
 
         return None
 
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index ee3311ede..762d3bfca 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -21,16 +21,16 @@
 """
 
 from scribe_data.cli.cli_utils import (
+    LANGUAGE_DATA_EXTRACTION_DIR,
     correct_data_type,
-    language_metadata,
     language_map,
-    LANGUAGE_DATA_EXTRACTION_DIR,
+    language_metadata,
 )
 from scribe_data.utils import (
-    list_all_languages,
+    format_sublanguage_name,
     get_language_iso,
     get_language_qid,
-    format_sublanguage_name,
+    list_all_languages,
 )
 
 
@@ -39,7 +39,6 @@ def list_languages() -> None:
     Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
     """
     languages = list_all_languages(language_metadata)
-    languages.sort()
 
     language_col_width = max(len(lang) for lang in languages) + 2
     iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 5530ef5db..885d9b3e9 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -29,8 +29,8 @@
     language_metadata,
     language_to_qid,
 )
+from scribe_data.utils import format_sublanguage_name, list_all_languages
 from scribe_data.wikidata.wikidata_utils import sparql
-from scribe_data.utils import list_all_languages, format_sublanguage_name
 
 
 def get_qid_by_input(input_str):
@@ -73,9 +73,8 @@ def get_datatype_list(language):
             A list of the corresponding data types.
     """
     languages = list_all_languages(language_metadata)
-    language_list = [lang for lang in languages]
 
-    if language.lower() in language_list:
+    if language.lower() in languages:
         language_data = language_map.get(language.lower())
         language_capitalized = format_sublanguage_name(
             language, language_metadata
@@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None):
     print("=" * 64)
 
     if language is None:  # all languages
-        languages = list_all_languages(
-            language_metadata
-        )  # this returns a list of language names
-        language_list = languages  # sorts the list in place
-        language_list.sort()
+        languages = list_all_languages(language_metadata)
 
-        for lang in language_list:
+        for lang in languages:
             data_types = get_datatype_list(lang)
 
             first_row = True
diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index 00a8d405c..7ab2145bf 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -11,6 +11,14 @@
     "iso": "bn",
     "qid": "Q9610"
   },
+  "chinese": {
+    "sub_languages": {
+      "mandarin": {
+        "iso": "zh",
+        "qid": "Q727694"
+      }
+    }
+  },
   "czech": {
     "iso": "cs",
     "qid": "Q9056"
@@ -95,23 +103,15 @@
     "iso": "ml",
     "qid": "Q36236"
   },
-  "chinese": {
-    "sub_languages": {
-      "mandarin": {
-        "iso": "zh",
-        "qid": "Q727694"
-      }
-    }
-  },
   "norwegian": {
     "sub_languages": {
-      "nynorsk": {
-        "iso": "nn",
-        "qid": "Q25164"
-      },
       "bokmål": {
         "iso": "nb",
         "qid": "Q25167"
+      },
+      "nynorsk": {
+        "iso": "nn",
+        "qid": "Q25164"
       }
     }
   },
@@ -133,13 +133,13 @@
   },
   "punjabi": {
     "sub_languages": {
-      "shahmukhi": {
-        "iso": "pnb",
-        "qid": "Q58635"
-      },
       "gurmukhi": {
         "iso": "pa",
         "qid": "Q58635"
+      },
+      "shahmukhi": {
+        "iso": "pnb",
+        "qid": "Q58635"
       }
     }
   },
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index df22a9a9a..3c2007640 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -26,7 +26,6 @@
 from pathlib import Path
 from typing import Any, Optional
 
-
 PROJECT_ROOT = "Scribe-Data"
 DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export"
 DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export"
@@ -53,8 +52,7 @@ def _load_json(package_path: str, file_name: str) -> Any:
     with resources.files(package_path).joinpath(file_name).open(
         encoding="utf-8"
     ) as in_stream:
-        contents = json.load(in_stream)
-        return contents  # No need for 'root'
+        return json.load(in_stream)
 
 
 _languages = _load_json(
@@ -90,13 +88,13 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
     ------
         ValueError : when a source_value is not supported or the language only has sub-languages.
     """
-    norm_source_value = source_value.lower()
-
-    # Check if we're searching by language name
+    # Check if we're searching by language name.
     if source_key == "language":
-        # First, check the main language entries (e.g., mandarin, french, etc.)
+        norm_source_value = source_value.lower()
+
+        # First, check the main language entries (e.g., mandarin, french, etc.).
         for language, entry in _languages.items():
-            # If the language name matches the top-level key, return the target value
+            # If the language name matches the top-level key, return the target value.
             if language.lower() == norm_source_value:
                 if "sub_languages" in entry:
                     sub_languages = ", ".join(entry["sub_languages"].keys())
@@ -105,37 +103,16 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
                     )
                 return entry.get(target_key)
 
-            # If there are sub-languages, check them too
+            # If there are sub-languages, check them too.
             if "sub_languages" in entry:
                 for sub_language, sub_entry in entry["sub_languages"].items():
                     if sub_language.lower() == norm_source_value:
                         return sub_entry.get(target_key)
 
-    # If no match was found, raise an error
+    # If no match was found, raise an error.
     raise ValueError(error_msg)
 
 
-def get_scribe_languages() -> list[str]:
-    """
-    Returns the list of currently implemented Scribe languages.
-    This version handles both regular languages and those with sub-languages (e.g., Norwegian).
-    """
-    languages = []
-
-    for language, entry in _languages.items():
-        # Add the main language (if it's directly queryable)
-        if "sub_languages" not in entry:
-            languages.append(language.capitalize())
-
-        # If there are sub-languages, add them instead
-        if "sub_languages" in entry:
-            languages.extend(
-                sub_language.capitalize() for sub_language in entry["sub_languages"]
-            )
-
-    return sorted(languages)
-
-
 def get_language_qid(language: str) -> str:
     """
     Returns the QID of the given language.
@@ -173,13 +150,12 @@ def get_language_iso(language: str) -> str:
             The ISO code for the language.
     """
 
-    iso_code = _find(
+    return _find(
         "language",
         language,
         "iso",
         f"{language.upper()} is currently not a supported language for ISO conversion.",
     )
-    return iso_code
 
 
 def get_language_from_iso(iso: str) -> str:
@@ -433,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str:
     ----------
         wikidata_gender : str
             The gender of the noun that was queried from WikiData.
+
+    Returns
+    -------
+        The gender value, corrected to the gender name if the Wikidata ID was queried.
     """
     gender_map = {
-        "masculine": "M",
-        "Q499327": "M",
-        "feminine": "F",
-        "Q1775415": "F",
-        "common gender": "C",
-        "Q1305037": "C",
-        "neuter": "N",
-        "Q1775461": "N",
+        "masculine": "masculine",
+        "Q499327": "masculine",
+        "feminine": "feminine",
+        "Q1775415": "feminine",
+        "common": "common",
+        "common gender": "common",
+        "Q1305037": "common",
+        "neuter": "neuter",
+        "Q1775461": "neuter",
     }
 
     return gender_map.get(
-        wikidata_gender, ""
+        wikidata_gender.lower(), ""
     )  # nouns could have a gender that is not a valid attribute
 
 
@@ -458,20 +439,24 @@ def map_cases(wikidata_case: str) -> str:
     ----------
         wikidata_case : str
             The case of the noun that was queried from WikiData.
+
+    Returns
+    -------
+        The grammatical case, corrected to the case name if the Wikidata ID was queried.
     """
     case_map = {
-        "accusative": "Acc",
-        "Q146078": "Acc",
-        "dative": "Dat",
-        "Q145599": "Dat",
-        "genitive": "Gen",
-        "Q146233": "Gen",
-        "instrumental": "Ins",
-        "Q192997": "Ins",
-        "prepositional": "Pre",
-        "Q2114906": "Pre",
-        "locative": "Loc",
-        "Q202142": "Loc",
+        "accusative": "accusative",
+        "Q146078": "accusative",
+        "dative": "dative",
+        "Q145599": "dative",
+        "genitive": "genitive",
+        "Q146233": "genitive",
+        "instrumental": "instrumental",
+        "Q192997": "instrumental",
+        "prepositional": "prepositional",
+        "Q2114906": "prepositional",
+        "locative": "locative",
+        "Q202142": "locative",
     }
     case = wikidata_case.split(" case")[0]
     return case_map.get(case, "")
@@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str:
 def format_sublanguage_name(lang, language_metadata=_languages):
     """
     Formats the name of a sub-language by appending its main language
-    in the format 'Mainlang/Sublang'. If the language is not a sub-language,
+    in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
     the original language name is returned as-is.
 
-    Args:
-        lang (str): The name of the language or sub-language to format.
-        language_metadata (dict): The metadata containing information about
-                                  main languages and their sub-languages.
+    Parameters
+    ----------
+        lang : str
+            The name of the language or sub-language to format.
 
-    Returns:
-        str: The formatted language name if it's a sub-language
-             (e.g., 'Norwegian/Nynorsk'), otherwise the original name.
+        language_metadata : dict
+            The metadata containing information about main languages and their sub-languages.
 
-    Raises:
+    Returns
+    -------
+        str
+            The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
+            Otherwise the original name.
+
+    Raises
+    ------
         ValueError: If the provided language or sub-language is not found.
 
-    Example:
-        format_sublanguage_name("nynorsk", language_metadata)
+    Example
+    -------
+        > format_sublanguage_name("nynorsk", language_metadata)
         'Norwegian/Nynorsk'
 
-        format_sublanguage_name("english", language_metadata)
+        > format_sublanguage_name("english", language_metadata)
         'English'
     """
-    # Iterate through the main languages in the metadata
     for main_lang, lang_data in language_metadata.items():
-        # If it's not a sub-language, return the original name
+        # If it's not a sub-language, return the original name.
         if main_lang == lang.lower():
             return lang.capitalize()
-        # Check if the main language has sub-languages
+
+        # Check if the main language has sub-languages.
         if "sub_languages" in lang_data:
-            # Check if the provided language is a sub-language
+            # Check if the provided language is a sub-language.
             for sub_lang in lang_data["sub_languages"]:
                 if lang.lower() == sub_lang.lower():
-                    # Return the formatted name Mainlang/Sublang
+                    # Return the formatted name MAIN_LANG/SUB_LANG.
                     return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
 
-    # Raise ValueError if no match is found
+    # Raise ValueError if no match is found.
     raise ValueError(f"{lang.upper()} is not a valid language or sub-language.")
 
 
 def list_all_languages(language_metadata=_languages):
-    """List all languages from the provided metadata dictionary, including sub-languages."""
+    """
+    Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages.
+    """
     current_languages = []
 
-    # Iterate through the language metadata
+    # Iterate through the language metadata.
     for lang_key, lang_data in language_metadata.items():
-        # Check if there are sub-languages
+        # Check if there are sub-languages.
         if "sub_languages" in lang_data:
-            # Add the sub-languages to current_languages
+            # Add the sub-languages to current_languages.
             current_languages.extend(lang_data["sub_languages"].keys())
         else:
-            # If no sub-languages, add the main language
+            # If no sub-languages, add the main language.
             current_languages.append(lang_key)
 
-    return current_languages
+    return sorted(current_languages)
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
index a827666a2..333c3b7d7 100644
--- a/tests/cli/test_utils.py
+++ b/tests/cli/test_utils.py
@@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid):
                 language=language_qid, data_type=data_type_qid
             )
 
-        self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.")
+        self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.")
 
     @patch("scribe_data.cli.total.get_qid_by_input")
     def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
@@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
                 language=language_qid, data_type=data_type_qid
             )
 
-        self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.")
+        self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.")
 
     @patch("scribe_data.cli.total.get_qid_by_input")
     def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
@@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
 
         self.assertEqual(
             str(context.exception),
-            "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.",
+            "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.",
         )
 
     def test_validate_language_and_data_type_with_list(self):
@@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self):
         data_types = ["nouns", "InvalidDataType"]
         with self.assertRaises(ValueError) as context:
             validate_language_and_data_type(languages, data_types)
-        self.assertIn("Invalid language InvalidLanguage", str(context.exception))
-        self.assertIn("Invalid data-type InvalidDataType", str(context.exception))
+        self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception))
+        self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception))
diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index df37317a3..43eaa2038 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -21,7 +21,6 @@
 """
 
 import sys
-import unittest
 from pathlib import Path
 
 import pytest
@@ -31,57 +30,6 @@
 from scribe_data import utils
 
 
-def test_get_scribe_languages():
-    test_case = unittest.TestCase()
-
-    # test for content, not order
-    test_case.assertCountEqual(
-        utils.get_scribe_languages(),
-        [
-            "Arabic",
-            "Basque",
-            "Bengali",
-            "Bokmål",
-            "Czech",
-            "Danish",
-            "English",
-            "Esperanto",
-            "Estonian",
-            "Finnish",
-            "French",
-            "German",
-            "Greek",
-            "Gurmukhi",
-            "Hausa",
-            "Hebrew",
-            "Hindi",
-            "Indonesian",
-            "Italian",
-            "Japanese",
-            "Kurmanji",
-            "Latin",
-            "Malay",
-            "Malayalam",
-            "Mandarin",
-            "Nigerian",
-            "Nynorsk",
-            "Polish",
-            "Portuguese",
-            "Russian",
-            "Shahmukhi",
-            "Slovak",
-            "Spanish",
-            "Swahili",
-            "Swedish",
-            "Tajik",
-            "Tamil",
-            "Ukrainian",
-            "Urdu",
-            "Yoruba",
-        ],
-    )
-
-
 @pytest.mark.parametrize(
     "language, qid_code",
     [
@@ -187,6 +135,7 @@ def test_list_all_languages():
         "arabic",
         "basque",
         "bengali",
+        "bokmål",
         "czech",
         "danish",
         "english",
@@ -196,10 +145,10 @@ def test_list_all_languages():
         "french",
         "german",
         "greek",
+        "gurmukhi",
         "hausa",
         "hebrew",
         "hindi",
-        "urdu",
         "indonesian",
         "italian",
         "japanese",
@@ -208,14 +157,12 @@ def test_list_all_languages():
         "malay",
         "malayalam",
         "mandarin",
-        "nynorsk",
-        "bokmål",
         "nigerian",
+        "nynorsk",
         "polish",
         "portuguese",
-        "shahmukhi",
-        "gurmukhi",
         "russian",
+        "shahmukhi",
         "slovak",
         "spanish",
         "swahili",
@@ -223,6 +170,7 @@ def test_list_all_languages():
         "tajik",
         "tamil",
         "ukrainian",
+        "urdu",
         "yoruba",
     ]