Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added check for sparql and for json in workflow #392

Merged
merged 5 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 61 additions & 17 deletions src/scribe_data/check/check_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,65 @@
BASE_DIR = "../language_data_extraction"


def check_data_type_folders(path, language, subdir, errors):
    """
    Validate the contents of data type folders within a language directory.

    Every entry directly inside ``path`` is inspected: loose files (other than
    '__init__.py') and directories that are not known data types are reported.
    For each known data type directory, the files it contains are checked
    against the allowed naming patterns and anything else is reported.

    Args:
        path (str): The path to the directory containing data type folders.
        language (str): The name of the language being processed.
        subdir (str or None): The name of the sub-directory (for languages with sub-dialects), or None.
        errors (list): A list to which error messages will be appended (mutated in place).

    The following entries are accepted inside a data type folder:
        - Files starting with 'query_{data_type}' and ending with '.sparql'
        - A 'format_{data_type}.py' file
        - A '{data_type}_queried.json' file
        - '__init__.py'

    Validation of folder contents is skipped for the 'emoji_keywords' data type.

    Returns:
        None. Problems are reported only through ``errors``.
    """
    # Only add the "<subdir>/" path segment when a sub-directory is actually
    # in play; formatting None directly would leak "None/" into the messages.
    error_subdir = f"{subdir}/" if subdir else ""

    for item in os.listdir(path):
        item_path = os.path.join(path, item)

        if os.path.isfile(item_path):
            if item != "__init__.py":
                errors.append(
                    f"Unexpected file found in {language}/{subdir or ''}: {item}"
                )
            continue

        if not os.path.isdir(item_path):
            continue

        if item not in DATA_TYPES:
            errors.append(
                f"Unexpected directory found in {language}/{subdir or ''}: {item}"
            )
            continue

        # Skip validation for emoji_keywords
        if item == "emoji_keywords":
            continue

        # Exact file names that are always allowed in this data type folder.
        allowed_names = (f"format_{item}.py", f"{item}_queried.json", "__init__.py")
        query_prefix = f"query_{item}"

        # A single directory listing is enough: classify each entry as it is seen.
        for file in os.listdir(item_path):
            is_query_file = file.startswith(query_prefix) and file.endswith(".sparql")
            if is_query_file or file in allowed_names:
                continue

            errors.append(
                f"Unexpected file in {language}/{error_subdir}{item}: {file}"
            )


def validate_project_structure():
"""
Validate that all directories follow the expected project structure and check for unexpected files and directories."""
Validate that all directories follow the expected project structure and check for unexpected files and directories.
Also validate SPARQL query file names in data_type folders and SUBDIRECTORIES.
"""
errors = []

if not os.path.exists(BASE_DIR):
Expand Down Expand Up @@ -129,22 +185,10 @@ def validate_project_structure():
for subdir in expected_subdirs:
subdir_path = os.path.join(language_path, subdir)
if os.path.exists(subdir_path):
for item in os.listdir(subdir_path):
item_path = os.path.join(subdir_path, item)
if os.path.isfile(item_path) and item != "__init__.py":
errors.append(
f"Unexpected file found in {language}/{subdir}: {item}"
)

elif os.path.isdir(item_path) and item not in DATA_TYPES:
errors.append(
f"Unexpected directory found in {language}/{subdir}: {item}"
)

elif unexpected_data_types := found_subdirs - DATA_TYPES:
errors.append(
f"Unexpected subdirectories in '{language}': {unexpected_data_types}"
)
check_data_type_folders(subdir_path, language, subdir, errors)

else:
check_data_type_folders(language_path, language, None, errors)

if errors:
print("Errors found:")
Expand Down
Loading