From c01327aeec0623f51e0de0a884b7ad31d40e0099 Mon Sep 17 00:00:00 2001 From: henrikth93 Date: Fri, 19 Jul 2024 22:06:50 +0200 Subject: [PATCH 1/2] Changed tqdm loop so that we are using "with" Changed the tqdm loop so that it is used together with "with", it enables us to prevent printing the progress bar on a failure/error. --- src/scribe_data/wikidata/update_data.py | 257 ++++++++++++------------ 1 file changed, 133 insertions(+), 124 deletions(-) diff --git a/src/scribe_data/wikidata/update_data.py b/src/scribe_data/wikidata/update_data.py index 365d14c46..a06e401c1 100644 --- a/src/scribe_data/wikidata/update_data.py +++ b/src/scribe_data/wikidata/update_data.py @@ -108,132 +108,141 @@ def update_data(languages=None, word_types=None): # Run queries and format data. data_added_dict = {} - for q in tqdm( - queries_to_run, - desc="Data updated", - unit="process", - ): - lang = q.split("/")[-2] - target_type = q.split("/")[-1] - query_name = f"query_{target_type}.sparql" - query_path = f"{q}/{query_name}" - - if not os.path.exists(query_path): - # There are multiple queries for a given target_type, so start by running the first. - query_path = query_path[: -len(".sparql")] + "_1" + ".sparql" - - print(f"Querying and formatting {lang} {target_type}") - # First format the lines into a multi-line string and then pass this to SPARQLWrapper. - with open(query_path, encoding="utf-8") as file: - query_lines = file.readlines() - sparql.setQuery("".join(query_lines)) - - results = None - try: - results = sparql.query().convert() - except HTTPError as err: - print(f"HTTPError with {query_path}: {err}") - - if results is None: - print(f"Nothing returned by the WDQS server for {query_path}") - - # Allow for a query to be reran up to two times. - if queries_to_run.count(q) < 3: - queries_to_run.append(q) - - else: - # Subset the returned JSON and the individual results before saving. - query_results = results["results"]["bindings"] - - results_formatted = [] - for r in query_results: # query_results is also a list - r_dict = {k: r[k]["value"] for k in r.keys()} - - results_formatted.append(r_dict) - - with open( - f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json", - "w", - encoding="utf-8", - ) as f: - json.dump(results_formatted, f, ensure_ascii=False, indent=0) - - if "_1" in query_path: - # Note: Only the first query was ran, so we need to run the second and append the json. - for suffix in ["_2", "_3"]: - query_path = query_path.replace("_1", suffix).replace("_2", suffix) - - if os.path.exists(query_path): - with open(query_path, encoding="utf-8") as file: - query_lines = file.readlines() - sparql.setQuery("".join(query_lines)) - - results = None - try: - results = sparql.query().convert() - except HTTPError as err: - print(f"HTTPError with {query_path}: {err}") - - if results is None: - print( - f"Nothing returned by the WDQS server for {query_path}" - ) - - # Allow for a query to be reran up to two times. - if queries_to_run.count(q) < 3: - queries_to_run.append(q) - - else: - # Subset the returned JSON and the individual results before saving. - query_results = results["results"]["bindings"] - - # Note: Don't rewrite results_formatted as we want to extend the json and combine in formatting. - for r in query_results: # query_results is also a list - r_dict = {k: r[k]["value"] for k in r.keys()} - - # Note: The following is so we have a breakdown of queries for German later. - # Note: We need auxiliary verbs to be present as we loop to get both sein and haben forms. - if lang == "German": - r_dict_keys = list(r_dict.keys()) - if "auxiliaryVerb" not in r_dict_keys: - r_dict["auxiliaryVerb"] = "" - - results_formatted.append(r_dict) - - with open( - f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json", - "w", - encoding="utf-8", - ) as f: - json.dump( - results_formatted, - f, - ensure_ascii=False, - indent=0, + with tqdm(total=len(queries_to_run), desc="Data updated", unit="process") as pbar: + for q in queries_to_run: + lang = q.split("/")[-2] + target_type = q.split("/")[-1] + query_name = f"query_{target_type}.sparql" + query_path = f"{q}/{query_name}" + pbar.update(0) + + if not os.path.exists(query_path): + # There are multiple queries for a given target_type, so start by running the first. + query_path = query_path[: -len(".sparql")] + "_1" + ".sparql" + + print(f"Querying and formatting {lang} {target_type}") + # First format the lines into a multi-line string and then pass this to SPARQLWrapper. + with open(query_path, encoding="utf-8") as file: + query_lines = file.readlines() + sparql.setQuery("".join(query_lines)) + + results = None + try: + results = sparql.query().convert() + + except HTTPError as err: + print(f"HTTPError with {query_path}: {err}") + tqdm.write(f"Exception: {err}") + + if results is None: + print(f"Nothing returned by the WDQS server for {query_path}") + + # Allow for a query to be reran up to two times. + if queries_to_run.count(q) < 3: + queries_to_run.append(q) + ## tqdm.write("results is none") + + else: + # Subset the returned JSON and the individual results before saving. + query_results = results["results"]["bindings"] + + results_formatted = [] + for r in query_results: # query_results is also a list + r_dict = {k: r[k]["value"] for k in r.keys()} + + results_formatted.append(r_dict) + + with open( + f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json", + "w", + encoding="utf-8", + ) as f: + json.dump(results_formatted, f, ensure_ascii=False, indent=0) + + if "_1" in query_path: + # Note: Only the first query was ran, so we need to run the second and append the json. + for suffix in ["_2", "_3"]: + query_path = query_path.replace("_1", suffix).replace( + "_2", suffix + ) + + if os.path.exists(query_path): + with open(query_path, encoding="utf-8") as file: + query_lines = file.readlines() + sparql.setQuery("".join(query_lines)) + + results = None + try: + results = sparql.query().convert() + except HTTPError as err: + print(f"HTTPError with {query_path}: {err}") + + if results is None: + print( + f"Nothing returned by the WDQS server for {query_path}" ) - # Call the corresponding formatting file and update data changes. - os.system( - f"python3 {PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/format_{target_type}.py" - ) - - with open( - f"scribe_data_json_export/{lang.capitalize()}/{target_type}.json", - encoding="utf-8", - ) as json_file: - new_keyboard_data = json.load(json_file) - - if lang not in data_added_dict: - data_added_dict[lang] = {} - data_added_dict[lang][target_type] = ( - len(new_keyboard_data) - current_data[lang][target_type] - ) - - current_data[lang][target_type] = len(new_keyboard_data) - - # Update total_data.json. - with open(f"{PATH_TO_UPDATE_FILES}/total_data.json", "w", encoding="utf-8") as f: - json.dump(current_data, f, ensure_ascii=False, indent=0) + # Allow for a query to be reran up to two times. + if queries_to_run.count(q) < 3: + queries_to_run.append(q) + + else: + # Subset the returned JSON and the individual results before saving. + query_results = results["results"]["bindings"] + + # Note: Don't rewrite results_formatted as we want to extend the json and combine in formatting. + for ( + r + ) in query_results: # query_results is also a list + r_dict = {k: r[k]["value"] for k in r.keys()} + + # Note: The following is so we have a breakdown of queries for German later. + # Note: We need auxiliary verbs to be present as we loop to get both sein and haben forms. + if lang == "German": + r_dict_keys = list(r_dict.keys()) + if "auxiliaryVerb" not in r_dict_keys: + r_dict["auxiliaryVerb"] = "" + + results_formatted.append(r_dict) + + with open( + f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json", + "w", + encoding="utf-8", + ) as f: + json.dump( + results_formatted, + f, + ensure_ascii=False, + indent=0, + ) + + # Call the corresponding formatting file and update data changes. + os.system( + f"python3 {PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/format_{target_type}.py" + ) + + with open( + f"scribe_data_json_export/{lang.capitalize()}/{target_type}.json", + encoding="utf-8", + ) as json_file: + new_keyboard_data = json.load(json_file) + + if lang not in data_added_dict: + data_added_dict[lang] = {} + data_added_dict[lang][target_type] = ( + len(new_keyboard_data) - current_data[lang][target_type] + ) + + current_data[lang][target_type] = len(new_keyboard_data) + + pbar.update(1) + + # Update total_data.json. + with open( + f"{PATH_TO_UPDATE_FILES}/total_data.json", "w", encoding="utf-8" + ) as f: + json.dump(current_data, f, ensure_ascii=False, indent=0) if __name__ == "__main__": From f9f62aaaefd791c6544a0f65fa1bf12f3009f3b7 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Wed, 24 Jul 2024 13:51:39 -0600 Subject: [PATCH 2/2] Update changelog given update to tqdm stage update criteria --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0b8c6742..3297dbca9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). ## [Upcoming] Scribe-Data 4.0.0 +### 🐞 Bug Fixes + +- Wikidata query process stages no longer trigger the tqdm progress bar when they're unsuccessful ([#155](https://github.com/scribe-org/Scribe-Data/issues/155)). + ### ♻️ Code Refactoring - `word_type` has been switched to `data_type` throughout the codebase ([#160](https://github.com/scribe-org/Scribe-Data/issues/160)).