From 99a80215c183f8c8467403f0e6ceffa628b2a756 Mon Sep 17 00:00:00 2001
From: Shashank Mittal
Date: Wed, 28 Feb 2024 00:39:15 +0530
Subject: [PATCH] added common funcs to utils.py

Move the loading of queried data and the export of formatted data out of
the English nouns formatting script and into shared functions in utils.py.
The export path used outside of update_data.py and the removal of the
queried JSON file are kept the same as in the inline code being replaced.

Signed-off-by: Shashank Mittal
---
 .../languages/English/nouns/format_nouns.py   |  45 +------
 src/scribe_data/utils.py                      | 112 ++++++++++++++++++
 2 files changed, 121 insertions(+), 36 deletions(-)

diff --git a/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py b/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py
index 49b8ac07f..0e31f4970 100644
--- a/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py
+++ b/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py
@@ -6,31 +6,20 @@
 """
 
 import collections
-import json
-import os
 import sys
 
+from scribe_data.utils import (
+    export_formatted_data,
+    load_queried_data,
+    remove_queried_data,
+)
+
 LANGUAGE = "English"
 QUERIED_DATA_TYPE = "nouns"
-QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
-PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
-LANGUAGES_DIR_PATH = (
-    f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
-)
 
 file_path = sys.argv[0]
 
-update_data_in_use = False  # check if update_data.py is being used
-if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
-    data_path = QUERIED_DATA_FILE
-else:
-    update_data_in_use = True
-    data_path = (
-        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
-    )
-
-with open(data_path, encoding="utf-8") as f:
-    nouns_list = json.load(f)
+nouns_list, update_data_in_use = load_queried_data(LANGUAGE, QUERIED_DATA_TYPE, file_path)
 
 nouns_formatted = {}
 
@@ -94,21 +83,5 @@
 
 nouns_formatted = collections.OrderedDict(sorted(nouns_formatted.items()))
 
-export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
-if update_data_in_use:
-    export_path = (
-        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
-    )
-
-with open(
-    export_path,
-    "w",
-    encoding="utf-8",
-) as file:
-    json.dump(nouns_formatted, file, ensure_ascii=False, indent=0)
-
-print(
-    f"Wrote file {QUERIED_DATA_TYPE}.json with {len(nouns_formatted):,} {QUERIED_DATA_TYPE}."
-)
-
-os.remove(data_path)
+export_formatted_data(LANGUAGE, QUERIED_DATA_TYPE, nouns_formatted, update_data_in_use)
+remove_queried_data(LANGUAGE, QUERIED_DATA_TYPE, update_data_in_use)
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index c6b894303..fbaa98e4c 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -11,6 +11,10 @@
     get_language_words_to_remove,
     get_language_words_to_ignore,
     get_path_from_format_file,
+    get_language_dir_path,
+    load_queried_data,
+    export_formatted_data,
+    remove_queried_data,
     get_path_from_load_dir,
     get_path_from_et_dir,
     get_ios_data_path,
@@ -22,6 +26,7 @@
 
 import ast
 import json
+import os
 import sys
 from importlib import resources
 from pathlib import Path
@@ -240,6 +245,113 @@ def get_language_words_to_ignore(language: str) -> list[str]:
     )
 
 
+def get_language_dir_path(language: str) -> str:
+    """
+    Returns the path to the extract_transform directory of a language.
+
+    Parameters
+    ----------
+        language : str
+            The language to return the directory path for.
+
+    Returns
+    -------
+        The path to the language's directory within Scribe-Data.
+    """
+    # Keep what precedes the first "Scribe-Data" in the parent of sys.path[0].
+    path_to_scribe_org = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
+    return f"{path_to_scribe_org}/Scribe-Data/src/scribe_data/extract_transform/languages/{language}"
+
+
+def _get_queried_data_path(language: str, data_type: str, update_data_in_use: bool) -> str:
+    """
+    Returns the path to the queried JSON file of a data type for a language.
+    """
+    queried_data_file = f"{data_type}_queried.json"
+    if update_data_in_use:
+        return f"{get_language_dir_path(language)}/{data_type}/{queried_data_file}"
+
+    # Outside of update_data.py the file is looked up in the working directory.
+    return queried_data_file
+
+
+def load_queried_data(language: str, data_type: str, file_path: str) -> tuple:
+    """
+    Loads the queried JSON data of a data type for a language.
+
+    Parameters
+    ----------
+        language : str
+            The language to load data for.
+
+        data_type : str
+            The type of data to load (e.g. nouns).
+
+        file_path : str
+            The path with which the formatting script was invoked.
+
+    Returns
+    -------
+        A tuple of the loaded data and whether update_data.py is being used.
+    """
+    # update_data.py is in use if the script path includes the language directories.
+    update_data_in_use = f"languages/{language}/{data_type}/" in file_path
+    data_path = _get_queried_data_path(language, data_type, update_data_in_use)
+
+    with open(data_path, encoding="utf-8") as f:
+        return json.load(f), update_data_in_use
+
+
+def export_formatted_data(
+    language: str, data_type: str, formatted_data: dict, update_data_in_use: bool
+) -> None:
+    """
+    Exports formatted data as JSON to the formatted_data directory of a language.
+
+    Parameters
+    ----------
+        language : str
+            The language the data was formatted for.
+
+        data_type : str
+            The type of data being exported (e.g. nouns).
+
+        formatted_data : dict
+            The formatted data to write.
+
+        update_data_in_use : bool
+            Whether the formatting script is being run by update_data.py.
+    """
+    if update_data_in_use:
+        export_path = f"{get_language_dir_path(language)}/formatted_data/{data_type}.json"
+    else:
+        # Relative to the working directory, as in the inline code this replaces.
+        export_path = f"../formatted_data/{data_type}.json"
+
+    with open(export_path, "w", encoding="utf-8") as file:
+        json.dump(formatted_data, file, ensure_ascii=False, indent=0)
+
+    print(f"Wrote file {data_type}.json with {len(formatted_data):,} {data_type}.")
+
+
+def remove_queried_data(language: str, data_type: str, update_data_in_use: bool) -> None:
+    """
+    Removes the queried JSON file of a data type once it has been formatted.
+
+    Parameters
+    ----------
+        language : str
+            The language the data was queried for.
+
+        data_type : str
+            The type of data that was queried (e.g. nouns).
+
+        update_data_in_use : bool
+            Whether the formatting script is being run by update_data.py.
+    """
+    os.remove(_get_queried_data_path(language, data_type, update_data_in_use))
+
+
 def get_path_from_format_file() -> str:
     """
     Returns the directory path from a data formatting file to scribe-org.