Merge branch 'main' into Danish_nouns

mhmohona · Oct 29, 2024 · f50aa50 · f50aa50
2 parents 39de829 + 135f32c
commit f50aa50
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 146 deletions.
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
@@ -23,6 +23,7 @@
 from http.client import IncompleteRead
 from urllib.error import HTTPError
 
+import requests
 from SPARQLWrapper import JSON
 
 from scribe_data.utils import (
@@ -101,10 +102,42 @@ def get_datatype_list(language):
         return data_types
 
     else:  # return all data types
-        print("Language is not present in Scribe-Data. Checking all data types.")
         return data_type_metadata
 
 
+def check_qid_is_language(qid: str):
+    """
+    Checks via the Wikidata REST API whether a QID is an instance of a language.
+
+    Parameters
+    ----------
+        qid : str
+            The Wikidata QID to check (e.g. "Q1860").
+
+    Returns
+    -------
+        str
+            The English label of the Wikidata language entity, or the QID itself
+            if the entity has no English label.
+
+    Raises
+    ------
+        ValueError
+            An invalid QID that's not a language has been passed.
+    """
+    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
+    request_string = f"{api_endpoint}/entities/items/{qid}"
+
+    response = requests.get(request_string, timeout=5)
+    request_result = response.json()
+
+    # Error payloads (e.g. for an unknown QID) and items without any
+    # "instance of" (P31) statements lack these keys, so look them up
+    # defensively and let the ValueError below report the problem.
+    instance_of_values = request_result.get("statements", {}).get("P31", [])
+
+    for val in instance_of_values:
+        # Statements with an unknown or empty value carry no "content" key.
+        # Q34770 is the Wikidata item for "language".
+        if val.get("value", {}).get("content") == "Q34770":
+            label = request_result.get("labels", {}).get("en", qid)
+            print(f"{label} ({qid}) is a language.\n")
+            return label
+
+    raise ValueError("The passed Wikidata QID is not a language.")
+
+
 # MARK: Print
 
 
@@ -125,14 +158,28 @@ def print_total_lexemes(language: str = None):
     if language is None:
         print("Returning total counts for all languages and data types...\n")
 
-    elif language.startswith("Q") and language[1:].isdigit():
-        print(f"Wikidata QID {language} passed. Checking all data types.\n")
+    elif (
+        isinstance(language, str)
+        and language.startswith("Q")
+        and language[1:].isdigit()
+    ):
+        print(
+            f"Wikidata QID {language} passed. Checking validity and then all data types."
+        )
+        language = check_qid_is_language(qid=language)
 
     else:
         print(f"Returning total counts for {language} data types...\n")
 
-    print(f"{'Language':<15} {'Data Type':<25} {'Total Wikidata Lexemes':<25}")
-    print("=" * 64)
+    def print_total_header():
+        """
+        Prints the header of the total command output followed by the first row.
+
+        The row values (dt, total_lexemes and the language name) are read from
+        the enclosing function at call time.
+        """
+        # language is None when totals for all languages were requested; the
+        # language of the current row is then the enclosing loop variable.
+        # NOTE(review): relies on that loop variable being named lang, as used
+        # by the all-languages branch - confirm against the full function.
+        row_language = language if language is not None else lang
+
+        print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}")
+        print("=" * 70)
+        print(
+            f"{row_language.capitalize():<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
+        )
 
     if language is None:  # all languages
         languages = list_all_languages(language_metadata)
@@ -145,13 +192,11 @@ def print_total_lexemes(language: str = None):
                 total_lexemes = get_total_lexemes(lang, dt, False)
                 total_lexemes = f"{total_lexemes:,}"
                 if first_row:
-                    print(
-                        f"{lang.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
-                    )
+                    print_total_header()
                     first_row = False
 
                 else:
-                    print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
+                    print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
 
             print()
 
@@ -170,13 +215,11 @@ def print_total_lexemes(language: str = None):
             total_lexemes = get_total_lexemes(language, dt, False)
             total_lexemes = f"{total_lexemes:,}"
             if first_row:
-                print(
-                    f"{language.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
-                )
+                print_total_header()
                 first_row = False
 
             else:
-                print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
+                print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
 
         print()
 

diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py
@@ -24,7 +24,7 @@
 import unittest
 from io import StringIO
 from pathlib import Path
-from unittest.mock import MagicMock, Mock, mock_open, patch
+from unittest.mock import MagicMock, mock_open, patch
 
 from scribe_data.cli.convert import (
     convert_to_csv_or_tsv,
@@ -35,34 +35,7 @@
 
 
 class TestConvert(unittest.TestCase):
-    # MARK: Helper Functions
-
-    def setup_language_map(self, mock_language_map: Mock) -> None:
-        """
-        Set up the mock language map for testing.
-
-        Parameters
-        ---------
-            mock_language_map: Mock
-                Mock object representing the language map
-                to be configured.
-
-        Returns
-        -------
-            None
-        """
-        mock_language_map.get.side_effect = lambda lang: {
-            "english": {
-                "language": "english",
-                "iso": "en",
-                "qid": "Q1860",
-            },
-            "french": {
-                "language": "french",
-                "iso": "fr",
-                "qid": "Q150",
-            },
-        }.get(lang.lower())
+    # MARK: Helper Function
 
     def normalize_line_endings(self, data: str) -> str:
         """
@@ -83,44 +56,27 @@ def normalize_line_endings(self, data: str) -> str:
 
     # MARK: JSON
 
-    # @patch("scribe_data.cli.convert.Path", autospec=True)
-    # def test_convert_to_json_normalized_language(self, mock_path):
-    #
-
-    #     mock_path_obj = MagicMock(spec=Path)
-    #     mock_path.return_value = mock_path_obj
-
-    #     mock_path_obj.suffix = ".csv"
-    #     mock_path_obj.exists.return_value = True
-
-    #     convert_to_json(
-    #         language="French",
-    #         data_type="nouns",
-    #         output_type="json",
-    #         input_file="input.csv",
-    #         output_dir="/output_dir",
-    #         overwrite=True,
-    #     )
-
-    # @patch("scribe_data.cli.convert.Path", autospec=True)
-    # def test_convert_to_json_unknown_language(self, mock_path):
-    #     mock_input_file_path = MagicMock(spec=Path)
-    #     mock_input_file_path.exists.return_value = True
-    #     mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)]
-
-    #     with self.assertRaises(ValueError) as context:
-    #         convert_to_json(
-    #             language="FakeLanguage",
-    #             data_type="nouns",
-    #             output_type="json",
-    #             input_file="test.csv",
-    #             output_dir="/output_dir",
-    #             overwrite=True,
-    #         )
-
-    #     self.assertEqual(
-    #         str(context.exception), "Language 'FakeLanguage' is not recognized."
-    #     )
+    @patch("scribe_data.cli.convert.Path", autospec=True)
+    def test_convert_to_json_empty_language(self, mock_path):
+        # An empty language string must be rejected with a ValueError.
+        fake_path = MagicMock(spec=Path)
+        mock_path.return_value = fake_path
+        fake_path.suffix = ".csv"
+        fake_path.exists.return_value = True
+        # Opening the mocked path yields a small in-memory CSV file.
+        fake_path.open.return_value.__enter__.return_value = StringIO(
+            "key,value\na,1\nb,2"
+        )
+
+        conversion_kwargs = {
+            "language": "",
+            "data_type": "nouns",
+            "output_type": "json",
+            "input_file": "input.csv",
+            "output_dir": "/output_dir",
+            "overwrite": True,
+        }
+
+        with self.assertRaises(ValueError) as raised:
+            convert_to_json(**conversion_kwargs)
+
+        self.assertIn("Language '' is not recognized.", str(raised.exception))
 
     @patch("scribe_data.cli.convert.Path", autospec=True)
     def test_convert_to_json_with_input_file(self, mock_path):
@@ -146,7 +102,7 @@ def test_convert_to_json_with_input_file(self, mock_path):
 
         mock_path_obj.open.assert_called_once_with("r", encoding="utf-8")
 
-    @patch("scribe_data.cli.convert.Path")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
     def test_convert_to_json_supported_file_extension_csv(self, mock_path_class):
         mock_path_instance = MagicMock(spec=Path)
 
@@ -164,7 +120,7 @@ def test_convert_to_json_supported_file_extension_csv(self, mock_path_class):
             overwrite=True,
         )
 
-    @patch("scribe_data.cli.convert.Path")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
     def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class):
         mock_path_instance = MagicMock(spec=Path)
 
@@ -182,7 +138,7 @@ def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class):
             overwrite=True,
         )
 
-    @patch("scribe_data.cli.convert.Path")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
     def test_convert_to_json_unsupported_file_extension(self, mock_path):
         mock_path_obj = MagicMock(spec=Path)
         mock_path.return_value = mock_path_obj
@@ -322,62 +278,29 @@ def test_convert_to_json_with_complex_structure(self, mock_path_class):
 
     # MARK: CSV or TSV
 
-    # @patch("scribe_data.cli.convert.Path", autospec=True)
-    # def test_convert_to_csv_or_json_normalized_language(
-    #     self, mock_path
-    # ):
-    #
-
-    #     mock_path_obj = MagicMock(spec=Path)
-    #     mock_path.return_value = mock_path_obj
-
-    #     mock_path_obj.suffix = ".json"
-    #     mock_path_obj.exists.return_value = True
-
-    #     mock_json_data = json.dumps({"key1": "value1", "key2": "value2"})
-    #     mock_open_function = mock_open(read_data=mock_json_data)
-    #     mock_path_obj.open = mock_open_function
-
-    #     convert_to_csv_or_tsv(
-    #         language="English",
-    #         data_type="nouns",
-    #         output_type="csv",
-    #         input_file="input.json",
-    #         output_dir="/output_dir",
-    #         overwrite=True,
-    #     )
-
-    #     mock_open_function.assert_called_once_with("r", encoding="utf-8")
-
-    # @patch("scribe_data.cli.convert.Path", autospec=True)
-    # def test_convert_to_csv_or_json_unknown_language(
-    #     self, mock_path
-    # ):
-    #
-
-    #     mock_path_obj = MagicMock(spec=Path)
-    #     mock_path.return_value = mock_path_obj
-
-    #     mock_path_obj.suffix = ".json"
-    #     mock_path_obj.exists.return_value = True
-
-    #     mock_json_data = json.dumps({"key1": "value1", "key2": "value2"})
-    #     mock_open_function = mock_open(read_data=mock_json_data)
-    #     mock_path_obj.open = mock_open_function
-
-    #     with self.assertRaises(ValueError) as context:
-    #         convert_to_csv_or_tsv(
-    #             language="FakeLanguage",
-    #             data_type="nouns",
-    #             output_type="csv",
-    #             input_file="input.json",
-    #             output_dir="/output_dir",
-    #             overwrite=True,
-    #         )
-
-    #     self.assertEqual(
-    #         str(context.exception), "Language 'FakeLanguage' is not recognized."
-    #     )
+    @patch("scribe_data.cli.convert.Path", autospec=True)
+    def test_convert_to_csv_or_json_empty_language(self, mock_path):
+        # An empty language string must be rejected with a ValueError.
+        fake_path = MagicMock(spec=Path)
+        fake_path.suffix = ".json"
+        fake_path.exists.return_value = True
+        # Reading the mocked path returns a small JSON document.
+        fake_path.open = mock_open(
+            read_data=json.dumps({"key1": "value1", "key2": "value2"})
+        )
+        mock_path.return_value = fake_path
+
+        conversion_kwargs = {
+            "language": "",
+            "data_type": "nouns",
+            "output_type": "csv",
+            "input_file": "input.json",
+            "output_dir": "/output_dir",
+            "overwrite": True,
+        }
+
+        with self.assertRaises(ValueError) as raised:
+            convert_to_csv_or_tsv(**conversion_kwargs)
+
+        self.assertEqual(str(raised.exception), "Language '' is not recognized.")
 
     @patch("scribe_data.cli.convert.Path", autospec=True)
     def test_convert_to_csv_or_tsv_standarddict_to_csv(self, mock_path_class):
@@ -710,8 +633,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv(self, mock_path_class):
 
     # MARK: SQLITE
 
-    @patch("scribe_data.cli.convert.Path")
-    @patch("scribe_data.cli.convert.data_to_sqlite")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
+    @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
     @patch("shutil.copy")
     def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_path):
         mock_path.return_value.exists.return_value = True
@@ -728,8 +651,8 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat
         mock_data_to_sqlite.assert_called_with(["english"], ["nouns"])
         mock_shutil_copy.assert_called()
 
-    @patch("scribe_data.cli.convert.Path")
-    @patch("scribe_data.cli.convert.data_to_sqlite")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
+    @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
     def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path):
         mock_input_file = MagicMock()
         mock_input_file.exists.return_value = True
@@ -751,9 +674,9 @@ def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path):
 
         mock_data_to_sqlite.assert_called_with(["english"], ["nouns"])
 
-    @patch("scribe_data.cli.convert.Path")
-    @patch("scribe_data.cli.convert.data_to_sqlite")
-    @patch("scribe_data.cli.convert.get_language_iso")
+    @patch("scribe_data.cli.convert.Path", autospec=True)
+    @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
+    @patch("scribe_data.cli.convert.get_language_iso", autospec=True)
     @patch("shutil.copy")
     def test_convert_to_sqlite_with_language_iso(
         self, mock_copy, mock_get_language_iso, mock_data_to_sqlite, mock_path