# QIDs of every lexeme form that declares one, collected in the iteration
# order of the lexeme form metadata.
lexeme_form_qid_order = []
for value in lexeme_form_metadata.values():
    lexeme_form_qid_order.extend(
        sub_value["qid"] for sub_value in value.values() if "qid" in sub_value
    )
def check_form_label(form_text: str):
    """
    Checks that the label of the form matches the representation label.

    Parameters
    ----------
    form_text : str
        The text that defines the form within the query.

    Returns
    -------
    bool
        Whether the form and its current representation label match (repForm and rep).
        False is also returned when either label cannot be extracted from the text.
    """
    # Find the line that binds the form variable to the lexeme.
    form_line_match = re.search(
        pattern=r"\?lexeme ontolex:lexicalForm .* \.", string=form_text
    )
    if not form_line_match:
        return False

    # The form label is whatever follows the last "?" up to the closing period.
    form_label_match = re.search(pattern=r".*\?(.*)\.", string=form_line_match[0])
    if not form_label_match:
        return False

    form_label = form_label_match[1].strip()
    expected_rep_label = form_label.split("Form")[0]

    # Escape the label so that it is matched literally and can never produce an
    # invalid regular expression.
    onto_rep_pattern = re.escape(form_label) + r" ontolex:representation .* ;"
    rep_line_match = re.search(pattern=onto_rep_pattern, string=form_text)
    if not rep_line_match:
        return False

    # A representation that isn't bound to a variable has no label to compare,
    # so this is reported as a mismatch instead of raising an error.
    rep_label_match = re.search(pattern=r".*\?(.*);", string=rep_line_match[0])
    if not rep_label_match:
        return False

    return rep_label_match[1].strip() == expected_rep_label
in part: - form = part.split("?")[-1] - if "AS" in form: - form = form.split("AS")[0].strip() - return_forms.append(form) - - unique_forms = set(return_forms) - if len(return_forms) != len(unique_forms): - error_output += f"\nDuplicate forms found: {', '.join([form for form in return_forms if return_forms.count(form) > 1])}" - return error_output - - return True - - return True - - -# MARK: Unreturned Forms - - -def check_unreturned_optional_forms(query_text: str) -> str: - """ - Checks if there are any optional forms in the query that aren't returned in the SELECT statement. - - Parameters - ---------- - query_text : str - The full text of the SPARQL query. - - Returns - ------- - str - Error message listing any unreturned forms, or empty string if all forms are returned. - """ - # Extract forms from SELECT statement. - select_pattern = r"SELECT\s*(.*?)\s*WHERE" - select_forms = set() - if select_match := re.search( - pattern=select_pattern, string=query_text, flags=re.DOTALL - ): - for part in select_match[1].split(): - if "?" in part: - form = part.split("?")[-1] - if "AS" in form: - form = form.split("AS")[0].strip() - select_forms.add(form) - - # Extract forms from OPTIONAL blocks - optional_forms = set() - optional_pattern = r"OPTIONAL\s*\{([^}]*)\}" - for match in re.finditer(optional_pattern, query_text): - form_text = match.group(1) - rep_pattern = r"ontolex:representation\s+\?([\w]+)\s*;" - if rep_match := re.search(rep_pattern, form_text): - optional_forms.add(rep_match[1]) - - # Find forms that appear in OPTIONAL blocks but not in SELECT. - unreturned_forms = optional_forms - select_forms - - if unreturned_forms: - return f"Unreturned optional forms: {', '.join(sorted(unreturned_forms))}" - - return "" - - -# MARK: Undefined Forms - - -def check_undefined_return_forms(query_text: str) -> str: - """ - Checks if the query is trying to return forms that aren't defined in the WHERE clause - when there are no OPTIONAL blocks. 
- - Parameters - ---------- - query_text : str - The full text of the SPARQL query. - - Returns - ------- - str - Error message listing any undefined forms being returned, or empty string if all - returned forms are properly defined. - """ - - # Check if query has any OPTIONAL blocks. - optional_pattern = r"OPTIONAL\s*\{" - has_optional_blocks = bool(re.search(optional_pattern, query_text)) - - if has_optional_blocks: - return "" # skip check for queries with OPTIONAL blocks - - # Extract forms from SELECT statement and track aliases. - select_pattern = r"SELECT\s*(.*?)\s*WHERE" - select_forms = set() - aliases = set() - - if select_match := re.search( - pattern=select_pattern, string=query_text, flags=re.DOTALL - ): - select_clause = select_match[1] - - # Process each SELECT item. - items = select_clause.split("\n") - for item in items: - item = item.strip() - if not item: - continue - - # Handle REPLACE...AS statements. - if "AS ?" in item: - if alias_match := re.search(r"AS \?(\w+)", item): - aliases.add(alias_match[1]) - - if var_match := re.findall(r"\?(\w+)", item): - select_forms.update(v for v in var_match if v not in aliases) - - elif "?" in item: - var_match = re.findall(r"\?(\w+)", item) - select_forms.update(var_match) - - # Extract defined variables from WHERE clause. 
- where_pattern = r"WHERE\s*\{(.*?)\}(?:\s*ORDER BY|\s*$)" - defined_vars = set() - if where_match := re.search( - pattern=where_pattern, string=query_text, flags=re.DOTALL - ): - where_clause = where_match[1] - var_pattern = r"\?(\w+)" - defined_vars = set(re.findall(var_pattern, where_clause)) - - if undefined_forms := { - form for form in select_forms - defined_vars if form not in aliases - }: - return f"Undefined forms in SELECT: {', '.join(sorted(undefined_forms))}" +# MARK: Validate Forms - return "" - -# MARK: Defined Return Forms - - -def check_defined_return_forms(query_text: str) -> str: - """ - Ensures that all variables defined in the WHERE clause are returned in the SELECT clause. - - Parameters - ---------- - query_text : str - The full text of the SPARQL query. - - Returns - ------- - str - Error message listing any defined but unreturned forms, or empty string if all forms are returned. +def validate_forms(query_text: str) -> str: """ - # Check if query has any OPTIONAL blocks. - optional_pattern = r"OPTIONAL\s*\{" - has_optional_blocks = bool(re.search(optional_pattern, query_text)) - - if has_optional_blocks: - return "" # skip check for queries with OPTIONAL blocks - - # Extract forms from WHERE clause. - where_pattern = r"WHERE\s*\{(.*?)\}" - where_forms = set() - if where_match := re.search( - pattern=where_pattern, string=query_text, flags=re.DOTALL - ): - where_clause = where_match[1] - where_forms = set(re.findall(r"\?(\w+)", where_clause)) - - # Extract forms from SELECT statement. - select_pattern = r"SELECT\s*(.*?)\s*WHERE" - select_forms = set() - if select_match := re.search( - pattern=select_pattern, string=query_text, flags=re.DOTALL - ): - select_clause = select_match[1] - select_forms = set(re.findall(r"\?(\w+)", select_clause)) - - # Find forms that are defined but not returned, excluding allowed unreturned variables. 
- unreturned_forms = where_forms - select_forms - - if unreturned_forms: - return f"Defined but unreturned forms: {', '.join(sorted(unreturned_forms))}" - return "" - - -# MARK: Forms Order - - -def check_forms_order(query_text: str) -> bool: - """ - Checks that the order of variables in the SELECT statement (excluding lexeme and lexemeID) - matches the order of the same variables in the WHERE clause in the given SPARQL query file. + Validates the SPARQL query by checking: + 1. Order of variables in SELECT and WHERE clauses + 2. Presence and correct definition of forms + 3. Form labels and representations + 4. Query formatting Parameters ---------- @@ -453,8 +247,9 @@ def check_forms_order(query_text: str) -> bool: Returns ------- - bool - True if the order of the matches, False otherwise. + str + Error message if there are any issues with the order of variables or forms, + otherwise an empty string. """ select_pattern = r"SELECT\s+(.*?)\s+WHERE" @@ -463,8 +258,9 @@ def check_forms_order(query_text: str) -> bool: select_vars = re.findall(r"\?(\w+)", select_match[1]) else: - return False # invalid query format if no SELECT match + return "Invalid query format: no SELECT match" + error_messages = [] # Exclude the first two variables from select_vars. select_vars = select_vars[2:] # Regex pattern to capture the variables in the WHERE clause. @@ -489,8 +285,38 @@ def check_forms_order(query_text: str) -> bool: index = select_vars.index(var) where_vars.insert(index, var) - # Check if the order of variables matches. - return select_vars == where_vars + uniqueness_forms_check = len(select_vars) != len(set(select_vars)) + undefined_forms = set(select_vars) - set(where_vars) + unreturned_forms = set(where_vars) - set(select_vars) + select_vars = [var for var in select_vars if var not in ["lexeme", "lexemeID"]] + where_vars = [var for var in where_vars if var not in ["lexeme", "lexemeID"]] + + # Check for uniqueness of forms in SELECT. 
+ if uniqueness_forms_check: + duplicates = [var for var in select_vars if select_vars.count(var) > 1] + error_messages.append( + f"Duplicate forms found in SELECT: {', '.join(set(duplicates))}" + ) + + # Check for undefined forms in SELECT. + elif undefined_forms: + error_messages.append( + f"Undefined forms found in SELECT: {', '.join(sorted(undefined_forms))}" + ) + + # Check for unreturned forms in WHERE. + elif unreturned_forms: + error_messages.append( + f"Defined but unreturned forms found: {', '.join(sorted(unreturned_forms))}" + ) + + # Check if the order of variables matches, excluding lexeme and lexemeID. + elif select_vars != where_vars: + error_messages.append( + "The order of variables in the SELECT statement does not match their order in the WHERE clause." + ) + + return "\n".join(error_messages) if error_messages else "" # MARK: Docstring Format @@ -535,12 +361,13 @@ def check_docstring(query_text: str) -> bool: ) -# MARK: Main Query Forms Validation +# MARK: Main Validation + + def check_query_forms() -> None: """ - Validates SPARQL queries in the language data directory to check for correct form QIDs. + Validates SPARQL queries in the language data directory to check for correct form QIDs and formatting. """ - error_output = "" index = 0 for query_file in LANGUAGE_DATA_EXTRACTION_DIR.glob("**/*.sparql"): @@ -556,30 +383,9 @@ def check_query_forms() -> None: ) index += 1 - # Check for unique return forms and handle the error message. - unique_check_result = check_unique_return_forms(query_text) - if unique_check_result is not True: - error_output += f"\n{index}. {query_file_str}: {unique_check_result}\n" - index += 1 - - if undefined_forms := check_undefined_return_forms(query_text): - error_output += f"\n{index}. {query_file_str}: {undefined_forms}\n" - index += 1 - - if unreturned_optional_forms := check_unreturned_optional_forms(query_text): - error_output += ( - f"\n{index}. 
{query_file_str}: {unreturned_optional_forms}\n" - ) - index += 1 - - if defined_unreturned_forms := check_defined_return_forms(query_text): - error_output += f"\n{index}. {query_file_str}: {defined_unreturned_forms}\n" - index += 1 - - # Check the order of variables in the WHERE and SELECT clauses. - select_where_labels_matching = check_forms_order(query_text) - if not select_where_labels_matching: - error_output += f"\n{index}. {query_file_str}:\n - The order of variables in the SELECT statement does not match their order in the query.\n" + # Check that all variables in the WHERE and SELECT clauses are ordered, defined and returned. + if forms_order_and_definition_check := validate_forms(query_text): + error_output += f"\n{index}. {query_file_str}:\n - {forms_order_and_definition_check}\n" index += 1 if extract_forms_from_sparql(query_file): @@ -612,10 +418,12 @@ def check_query_forms() -> None: "Invalid query formatting found - please put spaces before all periods and semicolons and also remove spaces before commas.", ) ) + elif k != query_form_check_dict[k]["correct_form_rep_label"]: incorrect_query_labels.append( (k, query_form_check_dict[k]["correct_form_rep_label"]) ) + elif query_form_check_dict[k]["form_rep_match"] is False: incorrect_query_labels.append( (k, "Form and representation labels don't match")