Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changed tqdm loop so that we are using "with" #176

Merged
merged 2 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).

## [Upcoming] Scribe-Data 4.0.0

### 🐞 Bug Fixes

- Wikidata query process stages no longer trigger the tqdm progress bar when they're unsuccessful ([#155](https://github.com/scribe-org/Scribe-Data/issues/155)).

### ♻️ Code Refactoring

- `word_type` has been switched to `data_type` throughout the codebase ([#160](https://github.com/scribe-org/Scribe-Data/issues/160)).
Expand Down
257 changes: 133 additions & 124 deletions src/scribe_data/wikidata/update_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,132 +108,141 @@ def update_data(languages=None, word_types=None):

# Run queries and format data.
data_added_dict = {}
for q in tqdm(
queries_to_run,
desc="Data updated",
unit="process",
):
lang = q.split("/")[-2]
target_type = q.split("/")[-1]
query_name = f"query_{target_type}.sparql"
query_path = f"{q}/{query_name}"

if not os.path.exists(query_path):
# There are multiple queries for a given target_type, so start by running the first.
query_path = query_path[: -len(".sparql")] + "_1" + ".sparql"

print(f"Querying and formatting {lang} {target_type}")
# First format the lines into a multi-line string and then pass this to SPARQLWrapper.
with open(query_path, encoding="utf-8") as file:
query_lines = file.readlines()
sparql.setQuery("".join(query_lines))

results = None
try:
results = sparql.query().convert()
except HTTPError as err:
print(f"HTTPError with {query_path}: {err}")

if results is None:
print(f"Nothing returned by the WDQS server for {query_path}")

# Allow for a query to be rerun up to two times.
if queries_to_run.count(q) < 3:
queries_to_run.append(q)

else:
# Subset the returned JSON and the individual results before saving.
query_results = results["results"]["bindings"]

results_formatted = []
for r in query_results: # query_results is also a list
r_dict = {k: r[k]["value"] for k in r.keys()}

results_formatted.append(r_dict)

with open(
f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json",
"w",
encoding="utf-8",
) as f:
json.dump(results_formatted, f, ensure_ascii=False, indent=0)

if "_1" in query_path:
# Note: Only the first query was run, so we need to run the second and append the json.
for suffix in ["_2", "_3"]:
query_path = query_path.replace("_1", suffix).replace("_2", suffix)

if os.path.exists(query_path):
with open(query_path, encoding="utf-8") as file:
query_lines = file.readlines()
sparql.setQuery("".join(query_lines))

results = None
try:
results = sparql.query().convert()
except HTTPError as err:
print(f"HTTPError with {query_path}: {err}")

if results is None:
print(
f"Nothing returned by the WDQS server for {query_path}"
)

# Allow for a query to be rerun up to two times.
if queries_to_run.count(q) < 3:
queries_to_run.append(q)

else:
# Subset the returned JSON and the individual results before saving.
query_results = results["results"]["bindings"]

# Note: Don't rewrite results_formatted as we want to extend the json and combine in formatting.
for r in query_results: # query_results is also a list
r_dict = {k: r[k]["value"] for k in r.keys()}

# Note: The following is so we have a breakdown of queries for German later.
# Note: We need auxiliary verbs to be present as we loop to get both sein and haben forms.
if lang == "German":
r_dict_keys = list(r_dict.keys())
if "auxiliaryVerb" not in r_dict_keys:
r_dict["auxiliaryVerb"] = ""

results_formatted.append(r_dict)

with open(
f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json",
"w",
encoding="utf-8",
) as f:
json.dump(
results_formatted,
f,
ensure_ascii=False,
indent=0,
with tqdm(total=len(queries_to_run), desc="Data updated", unit="process") as pbar:
for q in queries_to_run:
lang = q.split("/")[-2]
target_type = q.split("/")[-1]
query_name = f"query_{target_type}.sparql"
query_path = f"{q}/{query_name}"
pbar.update(0)

if not os.path.exists(query_path):
# There are multiple queries for a given target_type, so start by running the first.
query_path = query_path[: -len(".sparql")] + "_1" + ".sparql"

print(f"Querying and formatting {lang} {target_type}")
# First format the lines into a multi-line string and then pass this to SPARQLWrapper.
with open(query_path, encoding="utf-8") as file:
query_lines = file.readlines()
sparql.setQuery("".join(query_lines))

results = None
try:
results = sparql.query().convert()

except HTTPError as err:
print(f"HTTPError with {query_path}: {err}")
tqdm.write(f"Exception: {err}")

if results is None:
print(f"Nothing returned by the WDQS server for {query_path}")

# Allow for a query to be rerun up to two times.
if queries_to_run.count(q) < 3:
queries_to_run.append(q)
## tqdm.write("results is none")

else:
# Subset the returned JSON and the individual results before saving.
query_results = results["results"]["bindings"]

results_formatted = []
for r in query_results: # query_results is also a list
r_dict = {k: r[k]["value"] for k in r.keys()}

results_formatted.append(r_dict)

with open(
f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json",
"w",
encoding="utf-8",
) as f:
json.dump(results_formatted, f, ensure_ascii=False, indent=0)

if "_1" in query_path:
# Note: Only the first query was run, so we need to run the second and append the json.
for suffix in ["_2", "_3"]:
query_path = query_path.replace("_1", suffix).replace(
"_2", suffix
)

if os.path.exists(query_path):
with open(query_path, encoding="utf-8") as file:
query_lines = file.readlines()
sparql.setQuery("".join(query_lines))

results = None
try:
results = sparql.query().convert()
except HTTPError as err:
print(f"HTTPError with {query_path}: {err}")

if results is None:
print(
f"Nothing returned by the WDQS server for {query_path}"
)

# Call the corresponding formatting file and update data changes.
os.system(
f"python3 {PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/format_{target_type}.py"
)

with open(
f"scribe_data_json_export/{lang.capitalize()}/{target_type}.json",
encoding="utf-8",
) as json_file:
new_keyboard_data = json.load(json_file)

if lang not in data_added_dict:
data_added_dict[lang] = {}
data_added_dict[lang][target_type] = (
len(new_keyboard_data) - current_data[lang][target_type]
)

current_data[lang][target_type] = len(new_keyboard_data)

# Update total_data.json.
with open(f"{PATH_TO_UPDATE_FILES}/total_data.json", "w", encoding="utf-8") as f:
json.dump(current_data, f, ensure_ascii=False, indent=0)
# Allow for a query to be rerun up to two times.
if queries_to_run.count(q) < 3:
queries_to_run.append(q)

else:
# Subset the returned JSON and the individual results before saving.
query_results = results["results"]["bindings"]

# Note: Don't rewrite results_formatted as we want to extend the json and combine in formatting.
for (
r
) in query_results: # query_results is also a list
r_dict = {k: r[k]["value"] for k in r.keys()}

# Note: The following is so we have a breakdown of queries for German later.
# Note: We need auxiliary verbs to be present as we loop to get both sein and haben forms.
if lang == "German":
r_dict_keys = list(r_dict.keys())
if "auxiliaryVerb" not in r_dict_keys:
r_dict["auxiliaryVerb"] = ""

results_formatted.append(r_dict)

with open(
f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/{target_type}_queried.json",
"w",
encoding="utf-8",
) as f:
json.dump(
results_formatted,
f,
ensure_ascii=False,
indent=0,
)

# Call the corresponding formatting file and update data changes.
os.system(
f"python3 {PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/format_{target_type}.py"
)

with open(
f"scribe_data_json_export/{lang.capitalize()}/{target_type}.json",
encoding="utf-8",
) as json_file:
new_keyboard_data = json.load(json_file)

if lang not in data_added_dict:
data_added_dict[lang] = {}
data_added_dict[lang][target_type] = (
len(new_keyboard_data) - current_data[lang][target_type]
)

current_data[lang][target_type] = len(new_keyboard_data)

pbar.update(1)

# Update total_data.json.
with open(
f"{PATH_TO_UPDATE_FILES}/total_data.json", "w", encoding="utf-8"
) as f:
json.dump(current_data, f, ensure_ascii=False, indent=0)


if __name__ == "__main__":
Expand Down
Loading