Skip to content

Commit

Permalink
added common funcs to utils.py
Browse files Browse the repository at this point in the history
Signed-off-by: Shashank Mittal <[email protected]>
  • Loading branch information
shashank-iitbhu committed Feb 27, 2024
1 parent 851410d commit 99a8021
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,16 @@
"""

import collections
import json
import os
import sys

from scribe_data.utils import export_formatted_data, load_queried_data

LANGUAGE = "English"
QUERIED_DATA_TYPE = "nouns"
QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
LANGUAGES_DIR_PATH = (
f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
)

file_path = sys.argv[0]

update_data_in_use = False # check if update_data.py is being used
if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
data_path = QUERIED_DATA_FILE
else:
update_data_in_use = True
data_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
)

with open(data_path, encoding="utf-8") as f:
nouns_list = json.load(f)
nouns_list, update_data_in_use = load_queried_data(LANGUAGE, QUERIED_DATA_TYPE, file_path)

nouns_formatted = {}

Expand Down Expand Up @@ -94,21 +79,4 @@

nouns_formatted = collections.OrderedDict(sorted(nouns_formatted.items()))

export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
if update_data_in_use:
export_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
)

with open(
export_path,
"w",
encoding="utf-8",
) as file:
json.dump(nouns_formatted, file, ensure_ascii=False, indent=0)

print(
f"Wrote file {QUERIED_DATA_TYPE}.json with {len(nouns_formatted):,} {QUERIED_DATA_TYPE}."
)

os.remove(data_path)
export_formatted_data(LANGUAGE, QUERIED_DATA_TYPE, nouns_formatted, update_data_in_use)
34 changes: 34 additions & 0 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
get_language_words_to_remove,
get_language_words_to_ignore,
get_path_from_format_file,
get_language_dir_path,
load_queried_data,
export_formatted_data,
get_path_from_load_dir,
get_path_from_et_dir,
get_ios_data_path,
Expand All @@ -22,6 +25,7 @@

import ast
import json
import os
import sys
from importlib import resources
from pathlib import Path
Expand Down Expand Up @@ -240,6 +244,36 @@ def get_language_words_to_ignore(language: str) -> list[str]:
)


def get_language_dir_path(language: str) -> str:
    """
    Returns the path to the extract_transform directory of the given language.

    Parameters
    ----------
        language : str
            The language the directory path should be returned for.

    Returns
    -------
        The path to the language's directory within Scribe-Data as a string.
    """
    # Everything that precedes the first "Scribe-Data" in the parent directory of
    # sys.path[0] is used as the location the repository lives in.
    # NOTE: that prefix is joined with a literal "/", so when it already ends in a
    # separator the result contains a doubled one (kept as is for compatibility).
    path_to_scribe_org = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]

    return f"{path_to_scribe_org}/Scribe-Data/src/scribe_data/extract_transform/languages/{language}"


def load_queried_data(language: str, data_type: str, file_path: str) -> tuple:
    """
    Loads the queried JSON data for the given language and data type.

    Parameters
    ----------
        language : str
            The language the data was queried for.

        data_type : str
            The type of data (e.g. "nouns"). The file read is "<data_type>_queried.json".

        file_path : str
            The path of the running formatting script, used to decide where the queried file is.

    Returns
    -------
        A tuple of the parsed JSON content and a boolean that is True when
        file_path contains "languages/<language>/<data_type>/".
    """
    queried_data_file = f"{data_type}_queried.json"

    # When the script path lies inside the language's data type directory the queried
    # file is read from the package's languages directory (presumably the update_data.py
    # flow); otherwise it is read relative to the current working directory.
    update_data_in_use = f"languages/{language}/{data_type}/" in file_path

    if update_data_in_use:
        data_path = f"{get_language_dir_path(language)}/{data_type}/{queried_data_file}"
    else:
        data_path = queried_data_file

    with open(data_path, encoding="utf-8") as f:
        return json.load(f), update_data_in_use


def export_formatted_data(
    language: str, data_type: str, formatted_data: dict, update_data_in_use: bool
) -> None:
    """
    Writes formatted data to a JSON file and prints a summary of what was written.

    Parameters
    ----------
        language : str
            The language the data belongs to.

        data_type : str
            The type of data (e.g. "nouns"). The file written is "<data_type>.json".

        formatted_data : dict
            The data to export. It is passed to json.dump and its length is reported.

        update_data_in_use : bool
            If True the file is written to the language's formatted_data directory,
            otherwise to "<data_type>.json" relative to the current working directory.

    Returns
    -------
        None
    """
    if update_data_in_use:
        export_path = f"{get_language_dir_path(language)}/formatted_data/{data_type}.json"
    else:
        export_path = f"{data_type}.json"

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open(export_path, "w", encoding="utf-8") as file:
        json.dump(formatted_data, file, ensure_ascii=False, indent=0)

    # Report only after the file has been closed.
    print(f"Wrote file {data_type}.json with {len(formatted_data):,} {data_type}.")


def get_path_from_format_file() -> str:
"""
Returns the directory path from a data formatting file to scribe-org.
Expand Down

0 comments on commit 99a8021

Please sign in to comment.