def check_data_type_folders(path, language, subdir, errors):
    """
    Validate the contents of data type folders within a language directory.

    Every entry directly inside ``path`` must be either an ``__init__.py``
    file or a directory named after a known data type (a member of
    ``DATA_TYPES``). Each data type directory is then checked for unexpected
    entries. Problems are reported by appending messages to ``errors``;
    nothing is returned or raised by the checks themselves.

    Args:
        path (str): The path to the directory containing data type folders.
        language (str): The name of the language being processed.
        subdir (str or None): The name of the sub-directory (for languages
            with sub-dialects), or None when the data type folders sit
            directly under the language directory.
        errors (list): A list to which error messages will be appended.

    The following entries are valid inside a data type folder:
        - Files starting with 'query_{data_type}' and ending with '.sparql'
          (this allows several query files per data type)
        - A 'format_{data_type}.py' file
        - A '{data_type}_queried.json' file
        - An '__init__.py' file

    Validation of the folder contents is skipped for 'emoji_keywords'.

    Directory listings are processed in sorted order so that the reported
    errors are deterministic across platforms.
    """
    # Location shown for problems found directly inside ``path``.
    parent_label = f"{language}/{subdir or ''}"
    # Location prefix for problems found inside a data type folder. The
    # prefix must be chosen with a conditional: f"{subdir}/" is never empty,
    # so `f"{subdir}/" or ""` would render a missing subdir as "None/".
    nested_prefix = f"{language}/{subdir}/" if subdir else f"{language}/"

    for item in sorted(os.listdir(path)):
        item_path = os.path.join(path, item)

        if os.path.isfile(item_path):
            if item != "__init__.py":
                errors.append(f"Unexpected file found in {parent_label}: {item}")
            continue

        if not os.path.isdir(item_path):
            # Neither a regular file nor a directory (e.g. a broken symlink).
            continue

        if item not in DATA_TYPES:
            errors.append(f"Unexpected directory found in {parent_label}: {item}")
            continue

        # Emoji keywords are generated differently, so their folder is exempt.
        if item == "emoji_keywords":
            continue

        exact_names = {"__init__.py", f"format_{item}.py", f"{item}_queried.json"}
        query_prefix = f"query_{item}"

        # List the folder once and test each entry against the allowed names.
        for file in sorted(os.listdir(item_path)):
            is_query = file.startswith(query_prefix) and file.endswith(".sparql")
            if not is_query and file not in exact_names:
                errors.append(f"Unexpected file in {nested_prefix}{item}: {file}")
+ """ errors = [] if not os.path.exists(BASE_DIR): @@ -129,22 +185,10 @@ def validate_project_structure(): for subdir in expected_subdirs: subdir_path = os.path.join(language_path, subdir) if os.path.exists(subdir_path): - for item in os.listdir(subdir_path): - item_path = os.path.join(subdir_path, item) - if os.path.isfile(item_path) and item != "__init__.py": - errors.append( - f"Unexpected file found in {language}/{subdir}: {item}" - ) - - elif os.path.isdir(item_path) and item not in DATA_TYPES: - errors.append( - f"Unexpected directory found in {language}/{subdir}: {item}" - ) - - elif unexpected_data_types := found_subdirs - DATA_TYPES: - errors.append( - f"Unexpected subdirectories in '{language}': {unexpected_data_types}" - ) + check_data_type_folders(subdir_path, language, subdir, errors) + + else: + check_data_type_folders(language_path, language, None, errors) if errors: print("Errors found:") diff --git a/src/scribe_data/language_data_extraction/Esperanto/proper_nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Esperanto/proper_nouns/query_proper_nouns.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Esperanto/proper_nouns/query_nouns.sparql rename to src/scribe_data/language_data_extraction/Esperanto/proper_nouns/query_proper_nouns.sparql diff --git a/src/scribe_data/language_data_extraction/Finnish/prepositions/query_preposition.sparql b/src/scribe_data/language_data_extraction/Finnish/prepositions/query_prepositions.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Finnish/prepositions/query_preposition.sparql rename to src/scribe_data/language_data_extraction/Finnish/prepositions/query_prepositions.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjective.sparql b/src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjectives.sparql similarity index 100% rename from 
src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjective.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjectives.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverb.sparql b/src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverbs.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverb.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_preposition.sparql b/src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_prepositions.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_preposition.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_prepositions.sparql diff --git a/src/scribe_data/language_data_extraction/Swahili/adverbs/query_adverb.sparql b/src/scribe_data/language_data_extraction/Swahili/adverbs/query_adverbs.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Swahili/adverbs/query_adverb.sparql rename to src/scribe_data/language_data_extraction/Swahili/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjective.sparql b/src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjectives.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjective.sparql rename to src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjectives.sparql diff --git a/src/scribe_data/language_data_extraction/Yoruba/verbs/query_verb.sparql b/src/scribe_data/language_data_extraction/Yoruba/verbs/query_verbs.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Yoruba/verbs/query_verb.sparql rename to 
src/scribe_data/language_data_extraction/Yoruba/verbs/query_verbs.sparql