Skip to content

Commit

Permalink
feat(validate-data): ensure only known resources are used (DEV-4268) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Nora-Olivia-Ammann authored Nov 11, 2024
1 parent e745e2a commit dbbaefe
Show file tree
Hide file tree
Showing 7 changed files with 333 additions and 91 deletions.
35 changes: 35 additions & 0 deletions src/dsp_tools/commands/validate_data/models/input_problems.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,41 @@
GRAND_SEPARATOR = "\n\n----------------------------\n"


@dataclass
class UnknownClassesInData:
    """
    Problem report for classes that are used in the data but are not defined in the ontologies.

    Attributes:
        unknown_classes: names of the classes used in the data that are not known
        classes_onto: names of the classes that are known from the ontologies

    Both sets hold names of the form `prefix:ClassName`; the part before the first ":"
    is treated as the ontology name. A name without ":" is treated as its own prefix.
    """

    unknown_classes: set[str]
    classes_onto: set[str]

    def get_msg(self) -> str:
        """
        Build the user-facing message for this problem.

        Returns:
            The message about unknown ontologies if at least one ontology prefix used by the
            unknown classes is not among the known prefixes, otherwise the message about unknown classes.
        """
        if unknown := self._get_unknown_ontos_msg():
            return unknown
        return self._get_unknown_classes_msg()

    def _get_unknown_ontos_msg(self) -> str:
        """
        Build the message about ontology prefixes that are used in the data but are not known.

        Returns:
            The message, or an empty string if every used prefix is known.
        """

        def split_prefix(relative_iri: str) -> str:
            return relative_iri.split(":")[0]

        used_ontos = {split_prefix(x) for x in self.unknown_classes}
        existing_ontos = {split_prefix(x) for x in self.classes_onto}
        unknown_ontos = used_ontos - existing_ontos
        if not unknown_ontos:
            return ""
        # sorted() makes the output deterministic, sets have no stable iteration order
        return (
            "Your data uses ontologies that don't exist in the database.\n"
            f"The following ontologies that are used in the data are unknown: {', '.join(sorted(unknown_ontos))}\n"
            f"The following ontologies are uploaded: {', '.join(sorted(existing_ontos))}\n"
        )

    def _get_unknown_classes_msg(self) -> str:
        """
        Build the message listing the unknown classes next to the classes that are known.

        Returns:
            The message, with both lists sorted alphabetically and comma-separated.
        """
        unknown_classes = sorted(self.unknown_classes)
        known_classes = sorted(self.classes_onto)
        return (
            "Your data uses resource classes that do not exist in the ontologies in the database.\n"
            f"The following classes that are used in the data are unknown: {', '.join(unknown_classes)}\n"
            f"The following classes exist in the uploaded ontologies: {', '.join(known_classes)}\n"
        )


@dataclass
class UnexpectedResults:
components: list[UnexpectedComponent]
Expand Down
33 changes: 10 additions & 23 deletions src/dsp_tools/commands/validate_data/reformat_validaton_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from dsp_tools.commands.validate_data.models.validation import ValidationReportGraphs
from dsp_tools.commands.validate_data.models.validation import ValidationResult
from dsp_tools.commands.validate_data.models.validation import ValidationResultBaseInfo
from dsp_tools.commands.validate_data.utils import reformat_data_iri
from dsp_tools.commands.validate_data.utils import reformat_onto_iri
from dsp_tools.models.exceptions import BaseError

DASH = Namespace("http://datashapes.org/dash#")
Expand Down Expand Up @@ -361,7 +363,7 @@ def _reformat_one_validation_result(validation_result: ValidationResult) -> Inpu

def _reformat_value_type_violation_result(result: ResultValueTypeViolation) -> ValueTypeProblem:
iris = _reformat_main_iris(result)
actual_type = _reformat_onto_iri(result.actual_value_type)
actual_type = reformat_onto_iri(result.actual_value_type)
return ValueTypeProblem(
res_id=iris.res_id,
res_type=iris.res_type,
Expand All @@ -387,16 +389,16 @@ def _reformat_pattern_violation_result(result: ResultPatternViolation) -> Conten

def _reformat_link_target_violation_result(result: ResultLinkTargetViolation) -> InputProblem:
iris = _reformat_main_iris(result)
target_id = _reformat_data_iri(result.target_iri)
target_id = reformat_data_iri(result.target_iri)
if not result.target_resource_type:
return LinkedResourceDoesNotExistProblem(
res_id=iris.res_id,
res_type=iris.res_type,
prop_name=iris.prop_name,
link_target_id=target_id,
)
actual_type = _reformat_onto_iri(result.target_resource_type)
expected_type = _reformat_onto_iri(result.expected_type)
actual_type = reformat_onto_iri(result.target_resource_type)
expected_type = reformat_onto_iri(result.expected_type)
return LinkTargetTypeMismatchProblem(
res_id=iris.res_id,
res_type=iris.res_type,
Expand All @@ -412,7 +414,7 @@ def _reformat_unique_value_violation_result(result: ResultUniqueValueViolation)
if isinstance(result.actual_value, Literal):
actual_value = str(result.actual_value)
else:
actual_value = _reformat_data_iri(result.actual_value)
actual_value = reformat_data_iri(result.actual_value)
return DuplicateValueProblem(
res_id=iris.res_id,
res_type=iris.res_type,
Expand All @@ -422,22 +424,7 @@ def _reformat_unique_value_violation_result(result: ResultUniqueValueViolation)


def _reformat_main_iris(result: ValidationResult) -> ReformattedIRI:
    """
    Shorten the resource IRI, property IRI and resource class IRI of a validation result.

    Args:
        result: validation result providing `res_iri`, `property` and `res_class`

    Returns:
        The three shortened identifiers bundled in a `ReformattedIRI`.
    """
    # Each value is computed exactly once with the shared helpers;
    # the previous duplicate assignments through the private helpers were dead code.
    subject_id = reformat_data_iri(result.res_iri)
    prop_name = reformat_onto_iri(result.property)
    res_type = reformat_onto_iri(result.res_class)
    return ReformattedIRI(res_id=subject_id, res_type=res_type, prop_name=prop_name)


def _reformat_onto_iri(iri: Node) -> str:
    """
    Turn the string form of an ontology IRI into a short, prefixed name.

    - A string containing the RDFS namespace becomes `rdfs:<name>`.
    - If the second-to-last "/"-separated segment is `knora-api`, only `<name>` is returned.
    - Otherwise `<onto>:<name>` is returned, `<onto>` being that second-to-last segment.

    `<name>` is always the part after the last "#".

    Args:
        iri: node whose string form is the IRI

    Returns:
        The short name, or the unchanged string if it contains no "/"
        (so no ontology segment can be extracted).
    """
    iri_str = str(iri)
    if "http://www.w3.org/2000/01/rdf-schema#" in iri_str:
        return f'rdfs:{iri_str.split("#")[-1]}'
    segments = iri_str.split("/")
    if len(segments) < 2:
        # without a "/" there is no second-to-last segment; indexing [-2] would raise an IndexError
        return iri_str
    onto = segments[-2]
    ending = iri_str.split("#")[-1]
    if onto == "knora-api":
        return ending
    return f"{onto}:{ending}"


def _reformat_data_iri(iri: Node) -> str:
    """Return the string form of the node with every occurrence of `http://data/` removed."""
    data_namespace = "http://data/"
    iri_str = str(iri)
    return iri_str.replace(data_namespace, "")
18 changes: 18 additions & 0 deletions src/dsp_tools/commands/validate_data/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from rdflib.term import Node


def reformat_onto_iri(iri: Node) -> str:
    """
    Takes a rdflib Node and returns a prefixed IRI in string form.

    - A string containing the RDFS namespace becomes `rdfs:<name>`.
    - If the second-to-last "/"-separated segment is `knora-api`, only `<name>` is returned.
    - Otherwise `<onto>:<name>` is returned, `<onto>` being that second-to-last segment.

    `<name>` is always the part after the last "#".

    Args:
        iri: node whose string form is the IRI

    Returns:
        The short name, or the unchanged string if it contains no "/"
        (so no ontology segment can be extracted).
    """
    iri_str = str(iri)
    if "http://www.w3.org/2000/01/rdf-schema#" in iri_str:
        return f'rdfs:{iri_str.split("#")[-1]}'
    segments = iri_str.split("/")
    if len(segments) < 2:
        # without a "/" there is no second-to-last segment; indexing [-2] would raise an IndexError
        return iri_str
    onto = segments[-2]
    ending = iri_str.split("#")[-1]
    if onto == "knora-api":
        return ending
    return f"{onto}:{ending}"


def reformat_data_iri(iri: Node) -> str:
    """Takes a rdflib Node within the data namespace and returns it as string without `http://data/`."""
    data_namespace = "http://data/"
    iri_str = str(iri)
    return iri_str.replace(data_namespace, "")
69 changes: 55 additions & 14 deletions src/dsp_tools/commands/validate_data/validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
from pathlib import Path

from lxml import etree
from rdflib import RDF
from rdflib import Graph
from rdflib import Literal
from rdflib import URIRef
from termcolor import cprint

from dsp_tools.commands.validate_data.api_clients import ListClient
Expand All @@ -14,10 +17,12 @@
from dsp_tools.commands.validate_data.models.data_deserialised import ProjectDeserialised
from dsp_tools.commands.validate_data.models.data_deserialised import XMLProject
from dsp_tools.commands.validate_data.models.data_rdf import DataRDF
from dsp_tools.commands.validate_data.models.input_problems import UnknownClassesInData
from dsp_tools.commands.validate_data.models.validation import RDFGraphs
from dsp_tools.commands.validate_data.models.validation import ValidationReportGraphs
from dsp_tools.commands.validate_data.reformat_validaton_result import reformat_validation_graph
from dsp_tools.commands.validate_data.sparql.construct_shacl import construct_shapes_graphs
from dsp_tools.commands.validate_data.utils import reformat_onto_iri
from dsp_tools.models.exceptions import InputError
from dsp_tools.utils.xml_utils import parse_xml_file
from dsp_tools.utils.xml_utils import remove_comments_from_element_tree
Expand All @@ -42,7 +47,14 @@ def validate_data(filepath: Path, api_url: str, dev_route: bool, save_graphs: bo
true unless it crashed
"""
_inform_about_experimental_feature()
report = _get_validation_result(api_url, filepath, save_graphs)
api_con = ApiConnection(api_url)
graphs = _get_parsed_graphs(api_con, filepath)
if unknown_classes := _check_for_unknown_resource_classes(graphs):
msg = unknown_classes.get_msg()
cprint("\n Validation errors found! ", color="light_red", attrs=["bold", "reverse"])
print(msg)
return True
report = _get_validation_result(graphs, api_con, filepath, save_graphs)
if report.conforms:
cprint("\n Validation passed! ", color="green", attrs=["bold", "reverse"])
else:
Expand All @@ -66,12 +78,51 @@ def validate_data(filepath: Path, api_url: str, dev_route: bool, save_graphs: bo
return True


def _get_validation_result(api_url: str, filepath: Path, save_graphs: bool) -> ValidationReportGraphs:
data_rdf, shortcode = _get_data_info_from_file(filepath, api_url)
api_con = ApiConnection(api_url)
def _inform_about_experimental_feature() -> None:
    """Print a bold magenta notice that the feature is experimental, followed by what is validated."""
    header = (
        "This is an experimental feature, it will change and be extended continuously. "
        "The following information of your data is being validated:"
    )
    validated_aspects = [
        "Cardinalities",
        "If the value type used matches the ontology",
    ]
    notice = LIST_SEPARATOR.join([header, *validated_aspects])
    cprint(notice, color="magenta", attrs=["bold"])


def _get_parsed_graphs(api_con: ApiConnection, filepath: Path) -> RDFGraphs:
    """
    Read the data file and assemble the RDF graphs for it.

    Args:
        api_con: connection to the API; its `api_url` is also passed on when reading the file
        filepath: path of the data file

    Returns:
        The graphs created by `_create_graphs` from the ontology client, the list client and the data.
    """
    data_rdf, shortcode = _get_data_info_from_file(filepath, api_con.api_url)
    # both clients are bound to the shortcode that was read from the file
    return _create_graphs(
        OntologyClient(api_con, shortcode),
        ListClient(api_con, shortcode),
        data_rdf,
    )


def _check_for_unknown_resource_classes(rdf_graphs: RDFGraphs) -> UnknownClassesInData | None:
    """
    Check whether the data uses classes that neither the project ontologies nor knora-api define.

    Args:
        rdf_graphs: graphs providing `data`, `ontos` and `knora_api`

    Returns:
        An `UnknownClassesInData` holding the unknown classes and the known resource classes,
        or None if every class used in the data is a known resource or value class.
    """
    classes_in_data = _get_all_used_classes(rdf_graphs.data)
    resource_classes, value_classes = _get_all_onto_classes(rdf_graphs.ontos + rdf_graphs.knora_api)
    known_classes = resource_classes | value_classes
    not_found = classes_in_data - known_classes
    if not not_found:
        return None
    # only the resource classes are reported back as known, the value classes are left out
    return UnknownClassesInData(unknown_classes=not_found, classes_onto=resource_classes)


def _get_all_used_classes(data_graph: Graph) -> set[str]:
    """
    Collect the short names of all objects of `rdf:type` triples in the data graph.

    Args:
        data_graph: graph containing the data

    Returns:
        The set of names produced by `reformat_onto_iri` for each distinct type object.
    """
    used_classes: set[str] = set()
    # de-duplicate the nodes first so that each distinct type is reformatted only once
    for type_node in set(data_graph.objects(predicate=RDF.type)):
        used_classes.add(reformat_onto_iri(type_node))
    return used_classes


def _get_all_onto_classes(ontos: Graph) -> tuple[set[str], set[str]]:
    """
    Collect the short names of all resource classes and all value classes in the graph.

    A subject counts as resource class if it has `isResourceClass` (in the `KNORA_API` namespace)
    with the literal `True`, and as value class if it has `isValueClass` with the literal `True`.

    Args:
        ontos: graph containing the ontologies

    Returns:
        A tuple of (resource class names, value class names), each produced by `reformat_onto_iri`.
    """

    def short_names_of_subjects_flagged(flag_name: str) -> set[str]:
        flag_iri = URIRef(KNORA_API + flag_name)
        flagged = set(ontos.subjects(flag_iri, Literal(True)))
        return {reformat_onto_iri(x) for x in flagged}

    res_cls = short_names_of_subjects_flagged("isResourceClass")
    value_cls = short_names_of_subjects_flagged("isValueClass")
    return res_cls, value_cls


def _get_validation_result(
rdf_graphs: RDFGraphs, api_con: ApiConnection, filepath: Path, save_graphs: bool
) -> ValidationReportGraphs:
generic_filepath = Path()
if save_graphs:
generic_filepath = _save_graphs(filepath, rdf_graphs)
Expand All @@ -82,16 +133,6 @@ def _get_validation_result(api_url: str, filepath: Path, save_graphs: bool) -> V
return report


def _inform_about_experimental_feature() -> None:
    """Print a bold magenta notice that the feature is experimental, followed by what is validated."""
    header = (
        "This is an experimental feature, it will change and be extended continuously. "
        "The following information of your data is being validated:"
    )
    validated_aspects = [
        "Cardinalities",
        "If the value type used matches the ontology",
    ]
    notice = LIST_SEPARATOR.join([header, *validated_aspects])
    cprint(notice, color="magenta", attrs=["bold"])


def _create_graphs(onto_client: OntologyClient, list_client: ListClient, data_rdf: DataRDF) -> RDFGraphs:
ontologies = _get_project_ontos(onto_client)
all_lists = list_client.get_lists()
Expand Down
Loading

0 comments on commit dbbaefe

Please sign in to comment.