diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 6d7881ef..466f0c73 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -23,6 +23,7 @@ from http.client import IncompleteRead from urllib.error import HTTPError +import requests from SPARQLWrapper import JSON from scribe_data.utils import ( @@ -101,10 +102,42 @@ def get_datatype_list(language): return data_types else: # return all data types - print("Language is not present in Scribe-Data. Checking all data types.") return data_type_metadata +def check_qid_is_language(qid: str): + """ + Parameters + ---------- + qid : str + The QID to check Wikidata to see if it's a language and return its English label. + + Outputs + ------- + str + The English label of the Wikidata language entity. + + Raises + ------ + ValueError + An invalid QID that's not a language has been passed. + """ + api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0" + request_string = f"{api_endpoint}/entities/items/{qid}" + + request = requests.get(request_string, timeout=5) + request_result = request.json() + + if request_result["statements"]["P31"]: + instance_of_values = request_result["statements"]["P31"] + for val in instance_of_values: + if val["value"]["content"] == "Q34770": + print(f"{request_result['labels']['en']} ({qid}) is a language.\n") + return request_result["labels"]["en"] + + raise ValueError("The passed Wikidata QID is not a language.") + + # MARK: Print @@ -125,14 +158,28 @@ def print_total_lexemes(language: str = None): if language is None: print("Returning total counts for all languages and data types...\n") - elif language.startswith("Q") and language[1:].isdigit(): - print(f"Wikidata QID {language} passed. Checking all data types.\n") + elif ( + isinstance(language, str) + and language.startswith("Q") + and language[1:].isdigit() + ): + print( + f"Wikidata QID {language} passed. Checking validity and then all data types." + ) + language = check_qid_is_language(qid=language) else: print(f"Returning total counts for {language} data types...\n") - print(f"{'Language':<15} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") - print("=" * 64) + def print_total_header(): + """ + Prints the header of the total command output. + """ + print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") + print("=" * 70) + print( + f"{language.capitalize():<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}" + ) if language is None: # all languages languages = list_all_languages(language_metadata) @@ -145,13 +192,11 @@ def print_total_lexemes(language: str = None): total_lexemes = get_total_lexemes(lang, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print( - f"{lang.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}" - ) + print_total_header() first_row = False else: - print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}") + print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") print() @@ -170,13 +215,11 @@ def print_total_lexemes(language: str = None): total_lexemes = get_total_lexemes(language, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print( - f"{language.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}" - ) + print_total_header() first_row = False else: - print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}") + print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}") print() diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 5927f3c4..fb43851c 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -24,7 +24,7 @@ import unittest from io import StringIO from pathlib import Path -from unittest.mock import MagicMock, Mock, mock_open, patch +from unittest.mock import MagicMock, mock_open, patch from scribe_data.cli.convert import ( convert_to_csv_or_tsv, @@ -35,34 +35,7 @@ class TestConvert(unittest.TestCase): - # MARK: Helper Functions - - def setup_language_map(self, mock_language_map: Mock) -> None: - """ - Set up the mock language map for testing. - - Parameters - --------- - mock_language_map: Mock - Mock object representing the language map - to be configured. - - Returns - ------- - None - """ - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - }, - }.get(lang.lower()) + # MARK: Helper Function def normalize_line_endings(self, data: str) -> str: """ @@ -83,44 +56,27 @@ def normalize_line_endings(self, data: str) -> str: # MARK: JSON - # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_json_normalized_language(self, mock_path): - # - - # mock_path_obj = MagicMock(spec=Path) - # mock_path.return_value = mock_path_obj - - # mock_path_obj.suffix = ".csv" - # mock_path_obj.exists.return_value = True - - # convert_to_json( - # language="French", - # data_type="nouns", - # output_type="json", - # input_file="input.csv", - # output_dir="/output_dir", - # overwrite=True, - # ) - - # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_json_unknown_language(self, mock_path): - # mock_input_file_path = MagicMock(spec=Path) - # mock_input_file_path.exists.return_value = True - # mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] - - # with self.assertRaises(ValueError) as context: - # convert_to_json( - # language="FakeLanguage", - # data_type="nouns", - # output_type="json", - # input_file="test.csv", - # output_dir="/output_dir", - # overwrite=True, - # ) - - # self.assertEqual( - # str(context.exception), "Language 'FakeLanguage' is not recognized." - # ) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_empty_language(self, mock_path): + csv_data = "key,value\na,1\nb,2" + mock_file = StringIO(csv_data) + + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + mock_path_obj.suffix = ".csv" + mock_path_obj.exists.return_value = True + mock_path_obj.open.return_value.__enter__.return_value = mock_file + + with self.assertRaises(ValueError) as context: + convert_to_json( + language="", + data_type="nouns", + output_type="json", + input_file="input.csv", + output_dir="/output_dir", + overwrite=True, + ) + self.assertIn("Language '' is not recognized.", str(context.exception)) @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_with_input_file(self, mock_path): @@ -146,7 +102,7 @@ def test_convert_to_json_with_input_file(self, mock_path): mock_path_obj.open.assert_called_once_with("r", encoding="utf-8") - @patch("scribe_data.cli.convert.Path") + @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_supported_file_extension_csv(self, mock_path_class): mock_path_instance = MagicMock(spec=Path) @@ -164,7 +120,7 @@ def test_convert_to_json_supported_file_extension_csv(self, mock_path_class): overwrite=True, ) - @patch("scribe_data.cli.convert.Path") + @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class): mock_path_instance = MagicMock(spec=Path) @@ -182,7 +138,7 @@ def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class): overwrite=True, ) - @patch("scribe_data.cli.convert.Path") + @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_unsupported_file_extension(self, mock_path): mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj @@ -322,62 +278,29 @@ def test_convert_to_json_with_complex_structure(self, mock_path_class): # MARK: CSV or TSV - # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_csv_or_json_normalized_language( - # self, mock_path - # ): - # - - # mock_path_obj = MagicMock(spec=Path) - # mock_path.return_value = mock_path_obj - - # mock_path_obj.suffix = ".json" - # mock_path_obj.exists.return_value = True - - # mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) - # mock_open_function = mock_open(read_data=mock_json_data) - # mock_path_obj.open = mock_open_function - - # convert_to_csv_or_tsv( - # language="English", - # data_type="nouns", - # output_type="csv", - # input_file="input.json", - # output_dir="/output_dir", - # overwrite=True, - # ) - - # mock_open_function.assert_called_once_with("r", encoding="utf-8") - - # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_csv_or_json_unknown_language( - # self, mock_path - # ): - # - - # mock_path_obj = MagicMock(spec=Path) - # mock_path.return_value = mock_path_obj - - # mock_path_obj.suffix = ".json" - # mock_path_obj.exists.return_value = True - - # mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) - # mock_open_function = mock_open(read_data=mock_json_data) - # mock_path_obj.open = mock_open_function - - # with self.assertRaises(ValueError) as context: - # convert_to_csv_or_tsv( - # language="FakeLanguage", - # data_type="nouns", - # output_type="csv", - # input_file="input.json", - # output_dir="/output_dir", - # overwrite=True, - # ) - - # self.assertEqual( - # str(context.exception), "Language 'FakeLanguage' is not recognized." - # ) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_json_empty_language(self, mock_path): + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".json" + mock_path_obj.exists.return_value = True + + mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) + mock_open_function = mock_open(read_data=mock_json_data) + mock_path_obj.open = mock_open_function + + with self.assertRaises(ValueError) as context: + convert_to_csv_or_tsv( + language="", + data_type="nouns", + output_type="csv", + input_file="input.json", + output_dir="/output_dir", + overwrite=True, + ) + + self.assertEqual(str(context.exception), "Language '' is not recognized.") @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_csv_or_tsv_standarddict_to_csv(self, mock_path_class): @@ -710,8 +633,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv(self, mock_path_class): # MARK: SQLITE - @patch("scribe_data.cli.convert.Path") - @patch("scribe_data.cli.convert.data_to_sqlite") + @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) @patch("shutil.copy") def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_path): mock_path.return_value.exists.return_value = True @@ -728,8 +651,8 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat mock_data_to_sqlite.assert_called_with(["english"], ["nouns"]) mock_shutil_copy.assert_called() - @patch("scribe_data.cli.convert.Path") - @patch("scribe_data.cli.convert.data_to_sqlite") + @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path): mock_input_file = MagicMock() mock_input_file.exists.return_value = True @@ -751,9 +674,9 @@ def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path): mock_data_to_sqlite.assert_called_with(["english"], ["nouns"]) - @patch("scribe_data.cli.convert.Path") - @patch("scribe_data.cli.convert.data_to_sqlite") - @patch("scribe_data.cli.convert.get_language_iso") + @patch("scribe_data.cli.convert.Path", autospec=True) + @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) + @patch("scribe_data.cli.convert.get_language_iso", autospec=True) @patch("shutil.copy") def test_convert_to_sqlite_with_language_iso( self, mock_copy, mock_get_language_iso, mock_data_to_sqlite, mock_path