Skip to content

Commit

Permalink
Merge branch 'main' into Danish_nouns
Browse files Browse the repository at this point in the history
  • Loading branch information
OmarAI2003 authored Oct 29, 2024
2 parents 39de829 + 135f32c commit f50aa50
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 146 deletions.
69 changes: 56 additions & 13 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from http.client import IncompleteRead
from urllib.error import HTTPError

import requests
from SPARQLWrapper import JSON

from scribe_data.utils import (
Expand Down Expand Up @@ -101,10 +102,42 @@ def get_datatype_list(language):
return data_types

else: # return all data types
print("Language is not present in Scribe-Data. Checking all data types.")
return data_type_metadata


def check_qid_is_language(qid: str):
    """
    Check Wikidata to see if a QID is a language and return its English label.

    Parameters
    ----------
        qid : str
            The QID to check Wikidata to see if it's a language and return its English label.

    Returns
    -------
        str
            The English label of the Wikidata language entity.

    Raises
    ------
        ValueError
            An invalid QID that's not a language has been passed, or the
            language entity has no English label to return.
    """
    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
    request_string = f"{api_endpoint}/entities/items/{qid}"

    request = requests.get(request_string, timeout=5)
    request_result = request.json()

    # Items without any "instance of" (P31) statement, as well as error
    # payloads returned for unknown QIDs, have no ["statements"]["P31"] entry.
    # Treat those as "not a language" rather than leaking a KeyError.
    instance_of_values = request_result.get("statements", {}).get("P31", [])

    # Statements whose value carries no "content" (nothing to compare) are
    # skipped by the .get() lookups instead of raising.
    is_language = any(
        val.get("value", {}).get("content") == "Q34770" for val in instance_of_values
    )
    if not is_language:
        raise ValueError("The passed Wikidata QID is not a language.")

    english_label = request_result.get("labels", {}).get("en")
    if english_label is None:
        raise ValueError(
            f"The passed Wikidata QID {qid} is a language but has no English label."
        )

    print(f"{english_label} ({qid}) is a language.\n")
    return english_label


# MARK: Print


Expand All @@ -125,14 +158,28 @@ def print_total_lexemes(language: str = None):
if language is None:
print("Returning total counts for all languages and data types...\n")

elif language.startswith("Q") and language[1:].isdigit():
print(f"Wikidata QID {language} passed. Checking all data types.\n")
elif (
isinstance(language, str)
and language.startswith("Q")
and language[1:].isdigit()
):
print(
f"Wikidata QID {language} passed. Checking validity and then all data types."
)
language = check_qid_is_language(qid=language)

else:
print(f"Returning total counts for {language} data types...\n")

print(f"{'Language':<15} {'Data Type':<25} {'Total Wikidata Lexemes':<25}")
print("=" * 64)
def print_total_header():
    """
    Prints the header of the total command output followed by its first row.

    The values for the first row (``language``, ``lang``, ``dt`` and
    ``total_lexemes``) are read from the enclosing scope at call time.
    """
    # When totals for all languages are requested `language` is None, so the
    # row must be labelled with the language currently being iterated
    # (`lang`). The conditional is evaluated lazily: `lang` is only looked up
    # in the all-languages case, where it is bound.
    row_language = lang if language is None else language

    print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}")
    print("=" * 70)
    print(
        f"{row_language.capitalize():<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
    )

if language is None: # all languages
languages = list_all_languages(language_metadata)
Expand All @@ -145,13 +192,11 @@ def print_total_lexemes(language: str = None):
total_lexemes = get_total_lexemes(lang, dt, False)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print(
f"{lang.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
)
print_total_header()
first_row = False

else:
print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}")

print()

Expand All @@ -170,13 +215,11 @@ def print_total_lexemes(language: str = None):
total_lexemes = get_total_lexemes(language, dt, False)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print(
f"{language.capitalize():<15} {dt.replace('_', '-'): <25} {total_lexemes:<25}"
)
print_total_header()
first_row = False

else:
print(f"{'':<15} {dt.replace('_', ' '): <25} {total_lexemes:<25}")
print(f"{'':<20} {dt.replace('_', ' '): <25} {total_lexemes:<25}")

print()

Expand Down
189 changes: 56 additions & 133 deletions tests/cli/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import unittest
from io import StringIO
from pathlib import Path
from unittest.mock import MagicMock, Mock, mock_open, patch
from unittest.mock import MagicMock, mock_open, patch

from scribe_data.cli.convert import (
convert_to_csv_or_tsv,
Expand All @@ -35,34 +35,7 @@


class TestConvert(unittest.TestCase):
# MARK: Helper Functions

def setup_language_map(self, mock_language_map: Mock) -> None:
    """
    Configure the mock language map used by the tests.

    Parameters
    ----------
        mock_language_map : Mock
            Mock object representing the language map to be configured.
            Its ``get`` is given a side effect that resolves language names
            case-insensitively.

    Returns
    -------
        None
    """

    def lookup(lang):
        # Only English and French are known; anything else resolves to None.
        known_languages = {
            "english": {
                "language": "english",
                "iso": "en",
                "qid": "Q1860",
            },
            "french": {
                "language": "french",
                "iso": "fr",
                "qid": "Q150",
            },
        }
        return known_languages.get(lang.lower())

    mock_language_map.get.side_effect = lookup
# MARK: Helper Function

def normalize_line_endings(self, data: str) -> str:
"""
Expand All @@ -83,44 +56,27 @@ def normalize_line_endings(self, data: str) -> str:

# MARK: JSON

# @patch("scribe_data.cli.convert.Path", autospec=True)
# def test_convert_to_json_normalized_language(self, mock_path):
#

# mock_path_obj = MagicMock(spec=Path)
# mock_path.return_value = mock_path_obj

# mock_path_obj.suffix = ".csv"
# mock_path_obj.exists.return_value = True

# convert_to_json(
# language="French",
# data_type="nouns",
# output_type="json",
# input_file="input.csv",
# output_dir="/output_dir",
# overwrite=True,
# )

# @patch("scribe_data.cli.convert.Path", autospec=True)
# def test_convert_to_json_unknown_language(self, mock_path):
# mock_input_file_path = MagicMock(spec=Path)
# mock_input_file_path.exists.return_value = True
# mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)]

# with self.assertRaises(ValueError) as context:
# convert_to_json(
# language="FakeLanguage",
# data_type="nouns",
# output_type="json",
# input_file="test.csv",
# output_dir="/output_dir",
# overwrite=True,
# )

# self.assertEqual(
# str(context.exception), "Language 'FakeLanguage' is not recognized."
# )
@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_json_empty_language(self, mock_path):
    """
    An empty language string makes convert_to_json raise a ValueError.
    """
    # Stand-in for the input file: an existing .csv whose open() yields
    # a small two-row CSV document.
    fake_input = MagicMock(spec=Path)
    mock_path.return_value = fake_input
    fake_input.suffix = ".csv"
    fake_input.exists.return_value = True
    fake_input.open.return_value.__enter__.return_value = StringIO(
        "key,value\na,1\nb,2"
    )

    conversion_args = {
        "language": "",
        "data_type": "nouns",
        "output_type": "json",
        "input_file": "input.csv",
        "output_dir": "/output_dir",
        "overwrite": True,
    }
    with self.assertRaises(ValueError) as raised:
        convert_to_json(**conversion_args)

    self.assertIn("Language '' is not recognized.", str(raised.exception))

@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_json_with_input_file(self, mock_path):
Expand All @@ -146,7 +102,7 @@ def test_convert_to_json_with_input_file(self, mock_path):

mock_path_obj.open.assert_called_once_with("r", encoding="utf-8")

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_json_supported_file_extension_csv(self, mock_path_class):
mock_path_instance = MagicMock(spec=Path)

Expand All @@ -164,7 +120,7 @@ def test_convert_to_json_supported_file_extension_csv(self, mock_path_class):
overwrite=True,
)

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class):
mock_path_instance = MagicMock(spec=Path)

Expand All @@ -182,7 +138,7 @@ def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class):
overwrite=True,
)

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_json_unsupported_file_extension(self, mock_path):
mock_path_obj = MagicMock(spec=Path)
mock_path.return_value = mock_path_obj
Expand Down Expand Up @@ -322,62 +278,29 @@ def test_convert_to_json_with_complex_structure(self, mock_path_class):

# MARK: CSV or TSV

# @patch("scribe_data.cli.convert.Path", autospec=True)
# def test_convert_to_csv_or_json_normalized_language(
# self, mock_path
# ):
#

# mock_path_obj = MagicMock(spec=Path)
# mock_path.return_value = mock_path_obj

# mock_path_obj.suffix = ".json"
# mock_path_obj.exists.return_value = True

# mock_json_data = json.dumps({"key1": "value1", "key2": "value2"})
# mock_open_function = mock_open(read_data=mock_json_data)
# mock_path_obj.open = mock_open_function

# convert_to_csv_or_tsv(
# language="English",
# data_type="nouns",
# output_type="csv",
# input_file="input.json",
# output_dir="/output_dir",
# overwrite=True,
# )

# mock_open_function.assert_called_once_with("r", encoding="utf-8")

# @patch("scribe_data.cli.convert.Path", autospec=True)
# def test_convert_to_csv_or_json_unknown_language(
# self, mock_path
# ):
#

# mock_path_obj = MagicMock(spec=Path)
# mock_path.return_value = mock_path_obj

# mock_path_obj.suffix = ".json"
# mock_path_obj.exists.return_value = True

# mock_json_data = json.dumps({"key1": "value1", "key2": "value2"})
# mock_open_function = mock_open(read_data=mock_json_data)
# mock_path_obj.open = mock_open_function

# with self.assertRaises(ValueError) as context:
# convert_to_csv_or_tsv(
# language="FakeLanguage",
# data_type="nouns",
# output_type="csv",
# input_file="input.json",
# output_dir="/output_dir",
# overwrite=True,
# )

# self.assertEqual(
# str(context.exception), "Language 'FakeLanguage' is not recognized."
# )
@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_csv_or_json_empty_language(self, mock_path):
    """
    An empty language string makes convert_to_csv_or_tsv raise a ValueError.
    """
    # Stand-in for the input file: an existing .json file whose open()
    # returns a flat two-key JSON object.
    fake_input = MagicMock(spec=Path)
    fake_input.suffix = ".json"
    fake_input.exists.return_value = True
    fake_input.open = mock_open(
        read_data=json.dumps({"key1": "value1", "key2": "value2"})
    )
    mock_path.return_value = fake_input

    conversion_args = {
        "language": "",
        "data_type": "nouns",
        "output_type": "csv",
        "input_file": "input.json",
        "output_dir": "/output_dir",
        "overwrite": True,
    }
    with self.assertRaises(ValueError) as raised:
        convert_to_csv_or_tsv(**conversion_args)

    self.assertEqual(str(raised.exception), "Language '' is not recognized.")

@patch("scribe_data.cli.convert.Path", autospec=True)
def test_convert_to_csv_or_tsv_standarddict_to_csv(self, mock_path_class):
Expand Down Expand Up @@ -710,8 +633,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv(self, mock_path_class):

# MARK: SQLITE

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.data_to_sqlite")
@patch("scribe_data.cli.convert.Path", autospec=True)
@patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
@patch("shutil.copy")
def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_path):
mock_path.return_value.exists.return_value = True
Expand All @@ -728,8 +651,8 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat
mock_data_to_sqlite.assert_called_with(["english"], ["nouns"])
mock_shutil_copy.assert_called()

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.data_to_sqlite")
@patch("scribe_data.cli.convert.Path", autospec=True)
@patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path):
mock_input_file = MagicMock()
mock_input_file.exists.return_value = True
Expand All @@ -751,9 +674,9 @@ def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path):

mock_data_to_sqlite.assert_called_with(["english"], ["nouns"])

@patch("scribe_data.cli.convert.Path")
@patch("scribe_data.cli.convert.data_to_sqlite")
@patch("scribe_data.cli.convert.get_language_iso")
@patch("scribe_data.cli.convert.Path", autospec=True)
@patch("scribe_data.cli.convert.data_to_sqlite", autospec=True)
@patch("scribe_data.cli.convert.get_language_iso", autospec=True)
@patch("shutil.copy")
def test_convert_to_sqlite_with_language_iso(
self, mock_copy, mock_get_language_iso, mock_data_to_sqlite, mock_path
Expand Down

0 comments on commit f50aa50

Please sign in to comment.