From 1f066e09d38ee10966bb472125b6fc3ff8f58e0e Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Tue, 24 Sep 2024 07:15:26 +0600 Subject: [PATCH 1/7] fix more bug --- src/scribe_data/cli/convert.py | 6 +-- src/scribe_data/cli/get.py | 14 +++---- src/scribe_data/utils.py | 4 +- src/scribe_data/wikidata/query_data.py | 52 +++++++++++++------------- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 2a6f9183c..4cd126670 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -64,11 +64,7 @@ def export_json( print(f"Error reading '{data_file}': {e}") return - json_output_dir = ( - output_dir - / DEFAULT_JSON_EXPORT_DIR - / normalized_language["language"].capitalize() - ) + json_output_dir = output_dir / normalized_language["language"].capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) output_file = json_output_dir / f"{data_type}.json" diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 3d438be2c..6dcb6c37a 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -43,17 +43,17 @@ def get_data( """ Function for controlling the data get process for the CLI. """ - if not outputs_per_entry and (data_type in ["emoji-keywords", "emoji_keywords"]): - print( - "\nNo value set for 'outputs-per-entry'. Setting a default value of 3 outputs per entry.\n" - ) - outputs_per_entry = 3 + # if not outputs_per_entry and (data_type in ["emoji-keywords", "emoji_keywords"]): + # print( + # "\nNo value set for 'outputs-per-entry'. Setting a default value of 3 outputs per entry.\n" + # ) + # outputs_per_entry = 3 languages = [language] if language else None if all: print("Updating all languages and data types ...") - query_data() + query_data(None, None, overwrite) elif data_type in ["emoji-keywords", "emoji_keywords"]: for lang in languages: @@ -82,7 +82,7 @@ def get_data( elif language or data_type: data_type = [data_type] if data_type else None print(f"Updating data for language: {language}, data type: {data_type}") - query_data(languages, data_type) + query_data(languages, data_type, overwrite) else: raise ValueError( diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 6c84b97ff..73d83a55e 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -264,7 +264,9 @@ def load_queried_data(language: str, data_type: str) -> tuple[Any, bool, str]: return json.load(f), data_path -def export_formatted_data(formatted_data: dict, language: str, data_type: str) -> None: +def export_formatted_data( + formatted_data: dict, language: str, data_type: str, query_data_in_use: bool = False +) -> None: """ Exports formatted data to a JSON file for a specific language and data type. diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 09c9036be..cd1dadf8c 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -31,7 +31,7 @@ from scribe_data.wikidata.wikidata_utils import sparql -def query_data(languages=None, word_types=None): +def query_data(languages=None, word_types=None, overwrite=None): SCRIBE_DATA_SRC_PATH = Path(__file__).parent.parent PATH_TO_LANGUAGE_EXTRACTION_FILES = ( SCRIBE_DATA_SRC_PATH / "language_data_extraction" @@ -87,33 +87,35 @@ def query_data(languages=None, word_types=None): file_name = f"{target_type}.json" if existing_files := list(export_dir.glob(f"{target_type}*.json")): - print( - f"Existing file(s) found for {lang} {target_type} in the outputs directory:\n" - ) - for i, file in enumerate(existing_files, 1): - print(f"{i}. {file.name}") - - # choice = input( - # "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: " - # ) - choice = input( - "\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: " - ) - - print(f"You entered: {choice}") - - if choice in ["o", "O"]: - print("Removing existing files...") + if overwrite: + print("Overwrite is enabled. Removing existing files...") for file in existing_files: file.unlink() - - # elif choice in ["k", "K"]: - # timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") - # file_name = f"{target_type}_{timestamp}.json" - else: - print(f"Skipping update for {lang} {target_type}.") - continue + print( + f"Existing file(s) found for {lang} {target_type} in the outputs directory:\n" + ) + for i, file in enumerate(existing_files, 1): + print(f"{i}. {file.name}") + # choice = input( + # "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: " + # ) + choice = input( + "\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: " + ) + + print(f"You entered: {choice}") + + if choice.lower() == "o": + print("Removing existing files...") + for file in existing_files: + file.unlink() + # elif choice in ["k", "K"]: + # timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + # file_name = f"{target_type}_{timestamp}.json" + else: + print(f"Skipping update for {lang} {target_type}.") + continue file_path = export_dir / file_name print(f"Querying and formatting {lang} {target_type}") From 65e287bb210faf06b2e11f5a4dadb0e94b0451d0 Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Tue, 24 Sep 2024 16:12:42 +0600 Subject: [PATCH 2/7] updating cli test --- tests/cli/test_get.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 0f751cce8..af914ab22 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -34,10 +34,11 @@ class TestCLIGetCommand(unittest.TestCase): def test_get_command( self, mock_system, mock_convert, mock_export_json, mock_query_data ): + # Updated expected_calls to include the 'overwrite' parameter expected_calls = [ - call(["English"], ["nouns"]), - call(["English"], ["nouns"]), - call(), + call(["English"], ["nouns"], False), + call(["English"], ["nouns"], False), + call(None, None, False), ] # Execute the test From 15873b5e769fa227f90147ff7ab4af7c794330d4 Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Tue, 24 Sep 2024 16:17:20 +0600 Subject: [PATCH 3/7] remove unnecessary comment --- src/scribe_data/cli/get.py | 6 ------ tests/cli/test_get.py | 1 - 2 files changed, 7 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 6dcb6c37a..9d42402cc 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -43,12 +43,6 @@ def get_data( """ Function for controlling the data get process for the CLI. """ - # if not outputs_per_entry and (data_type in ["emoji-keywords", "emoji_keywords"]): - # print( - # "\nNo value set for 'outputs-per-entry'. Setting a default value of 3 outputs per entry.\n" - # ) - # outputs_per_entry = 3 - languages = [language] if language else None if all: diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index af914ab22..69a168963 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -34,7 +34,6 @@ class TestCLIGetCommand(unittest.TestCase): def test_get_command( self, mock_system, mock_convert, mock_export_json, mock_query_data ): - # Updated expected_calls to include the 'overwrite' parameter expected_calls = [ call(["English"], ["nouns"], False), call(["English"], ["nouns"], False), From 4cbc502d0828ea6226ce636add0821e756bad470 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Wed, 25 Sep 2024 22:10:07 +0200 Subject: [PATCH 4/7] Minor fixes to interactive mode and removing print statements --- src/scribe_data/cli/convert.py | 16 +++++++--------- src/scribe_data/cli/get.py | 4 ++-- src/scribe_data/cli/interactive.py | 10 ---------- src/scribe_data/wikidata/query_data.py | 2 -- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 4cd126670..9b07cf823 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -29,25 +29,23 @@ from scribe_data.cli.cli_utils import language_map from scribe_data.load.data_to_sqlite import data_to_sqlite from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, get_language_iso, ) -DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR) - def export_json( language: str, data_type: str, output_dir: Path, overwrite: bool ) -> None: normalized_language = language_map.get(language.lower()) - language_capitalized = language.capitalize() if not normalized_language: - raise ValueError(f"Language '{language_capitalized}' is not recognized.") + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") data_file = ( - DATA_DIR / normalized_language["language"].capitalize() / f"{data_type}.json" + output_dir + / normalized_language["language"].capitalize() + / f"{data_type[0]}.json" ) if not data_file.exists(): @@ -67,7 +65,7 @@ def export_json( json_output_dir = output_dir / normalized_language["language"].capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) - output_file = json_output_dir / f"{data_type}.json" + output_file = json_output_dir / f"{data_type[0]}.json" if output_file.exists() and not overwrite: user_input = input(f"File '{output_file}' already exists. Overwrite? (y/n): ") if user_input.lower() != "y": @@ -81,7 +79,7 @@ def export_json( raise IOError(f"Error writing to '{output_file}': {e}") from e print( - f"Data for language '{normalized_language['language']}' and data type '{data_type}' written to '{output_file}'" + f"Data for {normalized_language['language'].capitalize()} {data_type[0]} written to {output_file}" ) @@ -95,7 +93,7 @@ def convert_to_csv_or_tsv( for dtype in data_type: file_path = ( - DATA_DIR / normalized_language["language"].capitalize() / f"{dtype}.json" + output_dir / normalized_language["language"].capitalize() / f"{dtype}.json" ) if not file_path.exists(): print(f"No data found for {dtype} conversion at '{file_path}'.") diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 9d42402cc..63f978b07 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -75,7 +75,7 @@ def get_data( elif language or data_type: data_type = [data_type] if data_type else None - print(f"Updating data for language: {language}, data type: {data_type}") + print(f"Updating data for language: {language}, data type: {data_type[0]}") query_data(languages, data_type, overwrite) else: @@ -84,7 +84,7 @@ def get_data( ) if output_dir: - output_dir = Path(output_dir) + output_dir = Path(output_dir).resolve() if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index dedc31b16..edafa0d36 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -164,16 +164,6 @@ def run_interactive_mode(): selected_data_types = select_data_types() output_options = get_output_options() - if len(selected_languages) == 1: - print( - f"\nGetting {', '.join(selected_data_types)} for {', '.join(selected_languages)}." - ) - - else: - print( - f"\nQuerying {', '.join(selected_data_types)} for {', '.join(selected_languages)} languages." - ) - print( f"Data will be exported as {output_options['type'].upper()} files to '{output_options['dir']}'." ) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index cd1dadf8c..d1560ce87 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -69,8 +69,6 @@ def query_data(languages=None, word_types=None, overwrite=None): ] queries_to_run = sorted(queries_to_run) - print(queries_to_run) - # Run queries and format data. for q in tqdm( queries_to_run, From cf8924ccd385a082e937bbb4ed86a9c75a2290d5 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 26 Sep 2024 22:40:26 +0200 Subject: [PATCH 5/7] Fix for conversion path for csv/tsv --- src/scribe_data/cli/convert.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 9b07cf823..03ed51897 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -92,12 +92,20 @@ def convert_to_csv_or_tsv( return for dtype in data_type: + # Replace non-JSON default paths with JSON path for where exported data is. file_path = ( - output_dir / normalized_language["language"].capitalize() / f"{dtype}.json" + Path( + str(output_dir) + .replace("scribe_data_csv_export", "scribe_data_json_export") + .replace("scribe_data_tsv_export", "scribe_data_json_export") + ) + / normalized_language["language"].capitalize() + / f"{dtype}.json" ) if not file_path.exists(): - print(f"No data found for {dtype} conversion at '{file_path}'.") - continue + raise FileNotFoundError( + f"No data found for {dtype} conversion at '{file_path}'." + ) try: with file_path.open("r") as f: From 46da58cf10c0c4c683fda69c2bd289fbbba85007 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 26 Sep 2024 22:45:12 +0200 Subject: [PATCH 6/7] Format message that's being cut off by tqdm with new line --- src/scribe_data/wikidata/query_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index d1560ce87..4c91e0ca2 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -91,7 +91,7 @@ def query_data(languages=None, word_types=None, overwrite=None): file.unlink() else: print( - f"Existing file(s) found for {lang} {target_type} in the outputs directory:\n" + f"\nExisting file(s) found for {lang} {target_type} in the outputs directory:\n" ) for i, file in enumerate(existing_files, 1): print(f"{i}. {file.name}") From c1cbd0cb426b0ea03e6672c53af79c3ace0011a7 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 26 Sep 2024 23:59:18 +0200 Subject: [PATCH 7/7] Fix data_type declarations for lists and strings --- src/scribe_data/cli/convert.py | 14 ++++++++------ src/scribe_data/cli/get.py | 4 +++- src/scribe_data/translation/translation_utils.py | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 03ed51897..c350e679e 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -42,12 +42,13 @@ def export_json( if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") + data_type = data_type[0] if isinstance(data_type, list) else data_type data_file = ( - output_dir - / normalized_language["language"].capitalize() - / f"{data_type[0]}.json" + output_dir / normalized_language["language"].capitalize() / f"{data_type}.json" ) + print(data_file) + if not data_file.exists(): print( f"No data found for language '{normalized_language['language']}' and data type '{data_type}'." @@ -65,7 +66,7 @@ def export_json( json_output_dir = output_dir / normalized_language["language"].capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) - output_file = json_output_dir / f"{data_type[0]}.json" + output_file = json_output_dir / f"{data_type}.json" if output_file.exists() and not overwrite: user_input = input(f"File '{output_file}' already exists. Overwrite? (y/n): ") if user_input.lower() != "y": @@ -74,12 +75,13 @@ def export_json( try: with output_file.open("w") as file: - json.dump(data, file, indent=2) + json.dump(data, file, indent=0) + except IOError as e: raise IOError(f"Error writing to '{output_file}': {e}") from e print( - f"Data for {normalized_language['language'].capitalize()} {data_type[0]} written to {output_file}" + f"Data for {normalized_language['language'].capitalize()} {data_type} written to {output_file}" ) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 63f978b07..aa77214b6 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -74,8 +74,10 @@ def get_data( os.system(f"python3 {translation_generation_script}") elif language or data_type: + data_type = data_type[0] if isinstance(data_type, list) else data_type + data_type = [data_type] if data_type else None - print(f"Updating data for language: {language}, data type: {data_type[0]}") + print(f"Updating data for language: {language}, data type: {data_type}") query_data(languages, data_type, overwrite) else: diff --git a/src/scribe_data/translation/translation_utils.py b/src/scribe_data/translation/translation_utils.py index dd61198df..1b1d58670 100644 --- a/src/scribe_data/translation/translation_utils.py +++ b/src/scribe_data/translation/translation_utils.py @@ -144,7 +144,7 @@ def translation_interrupt_handler(source_language, translations): "w", encoding="utf-8", ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) + json.dump(translations, file, ensure_ascii=False, indent=0) print("The current progress is saved to the translations.json file.") exit() @@ -238,7 +238,7 @@ def translate_to_other_languages( "w", encoding="utf-8", ) as file: - file.write(json.dumps(translations, ensure_ascii=False, indent=2)) + file.write(json.dumps(translations, ensure_ascii=False, indent=0)) file.write("\n") print(