Skip to content

Commit

Permalink
Merge pull request #204 from mhmohona/Interactive
Browse files (browse the repository at this point in the history)
Fix more bugs in the CLI - Interactive Learning module
  • Loading branch information
andrewtavis authored Sep 26, 2024
2 parents 53688c2 + c1cbd0c commit baab052
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 68 deletions.
36 changes: 20 additions & 16 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,27 +29,26 @@
from scribe_data.cli.cli_utils import language_map
from scribe_data.load.data_to_sqlite import data_to_sqlite
from scribe_data.utils import (
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_SQLITE_EXPORT_DIR,
get_language_iso,
)

DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)


def export_json(
language: str, data_type: str, output_dir: Path, overwrite: bool
) -> None:
normalized_language = language_map.get(language.lower())
language_capitalized = language.capitalize()

if not normalized_language:
raise ValueError(f"Language '{language_capitalized}' is not recognized.")
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

data_type = data_type[0] if isinstance(data_type, list) else data_type
data_file = (
DATA_DIR / normalized_language["language"].capitalize() / f"{data_type}.json"
output_dir / normalized_language["language"].capitalize() / f"{data_type}.json"
)

print(data_file)

if not data_file.exists():
print(
f"No data found for language '{normalized_language['language']}' and data type '{data_type}'."
Expand All @@ -64,11 +63,7 @@ def export_json(
print(f"Error reading '{data_file}': {e}")
return

json_output_dir = (
output_dir
/ DEFAULT_JSON_EXPORT_DIR
/ normalized_language["language"].capitalize()
)
json_output_dir = output_dir / normalized_language["language"].capitalize()
json_output_dir.mkdir(parents=True, exist_ok=True)

output_file = json_output_dir / f"{data_type}.json"
Expand All @@ -80,12 +75,13 @@ def export_json(

try:
with output_file.open("w") as file:
json.dump(data, file, indent=2)
json.dump(data, file, indent=0)

except IOError as e:
raise IOError(f"Error writing to '{output_file}': {e}") from e

print(
f"Data for language '{normalized_language['language']}' and data type '{data_type}' written to '{output_file}'"
f"Data for {normalized_language['language'].capitalize()} {data_type} written to {output_file}"
)


Expand All @@ -98,12 +94,20 @@ def convert_to_csv_or_tsv(
return

for dtype in data_type:
# Replace non-JSON default paths with JSON path for where exported data is.
file_path = (
DATA_DIR / normalized_language["language"].capitalize() / f"{dtype}.json"
Path(
str(output_dir)
.replace("scribe_data_csv_export", "scribe_data_json_export")
.replace("scribe_data_tsv_export", "scribe_data_json_export")
)
/ normalized_language["language"].capitalize()
/ f"{dtype}.json"
)
if not file_path.exists():
print(f"No data found for {dtype} conversion at '{file_path}'.")
continue
raise FileNotFoundError(
f"No data found for {dtype} conversion at '{file_path}'."
)

try:
with file_path.open("r") as f:
Expand Down
14 changes: 5 additions & 9 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,11 @@ def get_data(
"""
Function for controlling the data get process for the CLI.
"""
if not outputs_per_entry and (data_type in ["emoji-keywords", "emoji_keywords"]):
print(
"\nNo value set for 'outputs-per-entry'. Setting a default value of 3 outputs per entry.\n"
)
outputs_per_entry = 3

languages = [language] if language else None

if all:
print("Updating all languages and data types ...")
query_data()
query_data(None, None, overwrite)

elif data_type in ["emoji-keywords", "emoji_keywords"]:
for lang in languages:
Expand All @@ -80,17 +74,19 @@ def get_data(
os.system(f"python3 {translation_generation_script}")

elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type

data_type = [data_type] if data_type else None
print(f"Updating data for language: {language}, data type: {data_type}")
query_data(languages, data_type)
query_data(languages, data_type, overwrite)

else:
raise ValueError(
"You must provide either at least one of the --language (-l) or --data-type (-dt) options, or use --all (-a)."
)

if output_dir:
output_dir = Path(output_dir)
output_dir = Path(output_dir).resolve()
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)

Expand Down
10 changes: 0 additions & 10 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,6 @@ def run_interactive_mode():
selected_data_types = select_data_types()
output_options = get_output_options()

if len(selected_languages) == 1:
print(
f"\nGetting {', '.join(selected_data_types)} for {', '.join(selected_languages)}."
)

else:
print(
f"\nQuerying {', '.join(selected_data_types)} for {', '.join(selected_languages)} languages."
)

print(
f"Data will be exported as {output_options['type'].upper()} files to '{output_options['dir']}'."
)
Expand Down
4 changes: 2 additions & 2 deletions src/scribe_data/translation/translation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def translation_interrupt_handler(source_language, translations):
"w",
encoding="utf-8",
) as file:
json.dump(translations, file, ensure_ascii=False, indent=4)
json.dump(translations, file, ensure_ascii=False, indent=0)

print("The current progress is saved to the translations.json file.")
exit()
Expand Down Expand Up @@ -238,7 +238,7 @@ def translate_to_other_languages(
"w",
encoding="utf-8",
) as file:
file.write(json.dumps(translations, ensure_ascii=False, indent=2))
file.write(json.dumps(translations, ensure_ascii=False, indent=0))
file.write("\n")

print(
Expand Down
4 changes: 3 additions & 1 deletion src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def load_queried_data(language: str, data_type: str) -> tuple[Any, bool, str]:
return json.load(f), data_path


def export_formatted_data(formatted_data: dict, language: str, data_type: str) -> None:
def export_formatted_data(
formatted_data: dict, language: str, data_type: str, query_data_in_use: bool = False
) -> None:
"""
Exports formatted data to a JSON file for a specific language and data type.
Expand Down
54 changes: 27 additions & 27 deletions src/scribe_data/wikidata/query_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from scribe_data.wikidata.wikidata_utils import sparql


def query_data(languages=None, word_types=None):
def query_data(languages=None, word_types=None, overwrite=None):
SCRIBE_DATA_SRC_PATH = Path(__file__).parent.parent
PATH_TO_LANGUAGE_EXTRACTION_FILES = (
SCRIBE_DATA_SRC_PATH / "language_data_extraction"
Expand Down Expand Up @@ -69,8 +69,6 @@ def query_data(languages=None, word_types=None):
]
queries_to_run = sorted(queries_to_run)

print(queries_to_run)

# Run queries and format data.
for q in tqdm(
queries_to_run,
Expand All @@ -87,33 +85,35 @@ def query_data(languages=None, word_types=None):
file_name = f"{target_type}.json"

if existing_files := list(export_dir.glob(f"{target_type}*.json")):
print(
f"Existing file(s) found for {lang} {target_type} in the outputs directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")

# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )
choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

print(f"You entered: {choice}")

if choice in ["o", "O"]:
print("Removing existing files...")
if overwrite:
print("Overwrite is enabled. Removing existing files...")
for file in existing_files:
file.unlink()

# elif choice in ["k", "K"]:
# timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# file_name = f"{target_type}_{timestamp}.json"

else:
print(f"Skipping update for {lang} {target_type}.")
continue
print(
f"\nExisting file(s) found for {lang} {target_type} in the outputs directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")
# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )
choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

print(f"You entered: {choice}")

if choice.lower() == "o":
print("Removing existing files...")
for file in existing_files:
file.unlink()
# elif choice in ["k", "K"]:
# timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# file_name = f"{target_type}_{timestamp}.json"
else:
print(f"Skipping update for {lang} {target_type}.")
continue

file_path = export_dir / file_name
print(f"Querying and formatting {lang} {target_type}")
Expand Down
6 changes: 3 additions & 3 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def test_get_command(
self, mock_system, mock_convert, mock_export_json, mock_query_data
):
expected_calls = [
call(["English"], ["nouns"]),
call(["English"], ["nouns"]),
call(),
call(["English"], ["nouns"], False),
call(["English"], ["nouns"], False),
call(None, None, False),
]

# Execute the test
Expand Down

0 comments on commit baab052

Please sign in to comment.