Merge pull request #291 from ckan/shacl-validation-range

Add support for defining ranges/classes when generating a graph
ckan · Aug 13, 2024 · 7c6339d · 7c6339d
2 parents 51d6513 + 75c5451
commit 7c6339d
Show file tree

Hide file tree

Showing 8 changed files with 790 additions and 52 deletions.
diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py
@@ -858,8 +858,13 @@ def _add_list_triples_from_dict(self, _dict, subject, items):
     def _add_triples_from_dict(
         self, _dict, subject, items, list_value=False, date_value=False
     ):
+
         for item in items:
-            key, predicate, fallbacks, _type = item
+            try:
+                key, predicate, fallbacks, _type, _class = item
+            except ValueError:
+                key, predicate, fallbacks, _type = item
+                _class = None
             self._add_triple_from_dict(
                 _dict,
                 subject,
@@ -869,6 +874,7 @@ def _add_triples_from_dict(
                 list_value=list_value,
                 date_value=date_value,
                 _type=_type,
+                _class=_class,
             )
 
     def _add_triple_from_dict(
@@ -882,6 +888,7 @@ def _add_triple_from_dict(
         date_value=False,
         _type=Literal,
         _datatype=None,
+        _class=None,
         value_modifier=None,
     ):
         """
@@ -896,6 +903,8 @@ def _add_triple_from_dict(
         returning a modified value can be passed.
         If a value was found, the modifier is applied before adding the value.
 
+        `_class` is the optional RDF class of the entity being added.
+
         If `list_value` or `date_value` are True, then the value is treated as
         a list or a date respectively (see `_add_list_triple` and
         `_add_date_triple` for details).
@@ -912,7 +921,7 @@ def _add_triple_from_dict(
             value = value_modifier(value)
 
         if value and list_value:
-            self._add_list_triple(subject, predicate, value, _type, _datatype)
+            self._add_list_triple(subject, predicate, value, _type, _datatype, _class)
         elif value and date_value:
             self._add_date_triple(subject, predicate, value, _type)
         elif value:
@@ -926,8 +935,11 @@ def _add_triple_from_dict(
                 object = _type(value)
             self.g.add((subject, predicate, object))
 
+            if _class and isinstance(object, URIRef):
+                self.g.add((object, RDF.type, _class))
+
     def _add_list_triple(
-        self, subject, predicate, value, _type=Literal, _datatype=None
+        self, subject, predicate, value, _type=Literal, _datatype=None, _class=None
     ):
         """
         Adds as many triples to the graph as values
@@ -948,6 +960,9 @@ def _add_list_triple(
                 object = _type(item)
             self.g.add((subject, predicate, object))
 
+            if _class and isinstance(object, URIRef):
+                self.g.add((object, RDF.type, _class))
+
     def _add_date_triple(self, subject, predicate, value, _type=Literal):
         """
         Adds a new triple with a date object

diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py
@@ -275,14 +275,14 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
         items = [
             ("title", DCT.title, None, Literal),
             ("notes", DCT.description, None, Literal),
-            ("url", DCAT.landingPage, None, URIRef),
+            ("url", DCAT.landingPage, None, URIRef, FOAF.Document),
             ("identifier", DCT.identifier, ["guid", "id"], URIRefOrLiteral),
             ("version", OWL.versionInfo, ["dcat_version"], Literal),
             ("version_notes", ADMS.versionNotes, None, Literal),
-            ("frequency", DCT.accrualPeriodicity, None, URIRefOrLiteral),
-            ("access_rights", DCT.accessRights, None, URIRefOrLiteral),
-            ("dcat_type", DCT.type, None, Literal),
-            ("provenance", DCT.provenance, None, Literal),
+            ("frequency", DCT.accrualPeriodicity, None, URIRefOrLiteral, DCT.Frequency),
+            ("access_rights", DCT.accessRights, None, URIRefOrLiteral, DCT.AccessRights),
+            ("dcat_type", DCT.type, None, URIRefOrLiteral),
+            ("provenance", DCT.provenance, None, URIRefOrLiteral, DCT.ProvenanceStatement),
         ]
         self._add_triples_from_dict(dataset_dict, dataset_ref, items)
 
@@ -299,16 +299,16 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
 
         #  Lists
         items = [
-            ("language", DCT.language, None, URIRefOrLiteral),
+            ("language", DCT.language, None, URIRefOrLiteral, DCT.LinguisticSystem),
             ("theme", DCAT.theme, None, URIRef),
-            ("conforms_to", DCT.conformsTo, None, Literal),
-            ("alternate_identifier", ADMS.identifier, None, URIRefOrLiteral),
-            ("documentation", FOAF.page, None, URIRefOrLiteral),
-            ("related_resource", DCT.relation, None, URIRefOrLiteral),
+            ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral, DCT.Standard),
+            ("alternate_identifier", ADMS.identifier, None, URIRefOrLiteral, ADMS.Identifier),
+            ("documentation", FOAF.page, None, URIRefOrLiteral, FOAF.Document),
+            ("related_resource", DCT.relation, None, URIRefOrLiteral, RDFS.Resource),
             ("has_version", DCT.hasVersion, None, URIRefOrLiteral),
             ("is_version_of", DCT.isVersionOf, None, URIRefOrLiteral),
             ("source", DCT.source, None, URIRefOrLiteral),
-            ("sample", ADMS.sample, None, URIRefOrLiteral),
+            ("sample", ADMS.sample, None, URIRefOrLiteral, DCAT.Distribution),
         ]
         self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
 
@@ -404,7 +404,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                 }
         # Add to graph
         if publisher_ref:
-            g.add((publisher_ref, RDF.type, FOAF.Organization))
+            g.add((publisher_ref, RDF.type, FOAF.Agent))
             g.add((dataset_ref, DCT.publisher, publisher_ref))
             items = [
                 ("name", FOAF.name, None, Literal),
@@ -468,23 +468,24 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                 ("name", DCT.title, None, Literal),
                 ("description", DCT.description, None, Literal),
                 ("status", ADMS.status, None, URIRefOrLiteral),
-                ("rights", DCT.rights, None, URIRefOrLiteral),
-                ("license", DCT.license, None, URIRefOrLiteral),
-                ("access_url", DCAT.accessURL, None, URIRef),
-                ("download_url", DCAT.downloadURL, None, URIRef),
+                ("rights", DCT.rights, None, URIRefOrLiteral, DCT.RightsStatement),
+                ("license", DCT.license, None, URIRefOrLiteral, DCT.LicenseDocument),
+                ("access_url", DCAT.accessURL, None, URIRef, RDFS.Resource),
+                ("download_url", DCAT.downloadURL, None, URIRef, RDFS.Resource),
             ]
 
             self._add_triples_from_dict(resource_dict, distribution, items)
 
             #  Lists
             items = [
-                ("documentation", FOAF.page, None, URIRefOrLiteral),
-                ("language", DCT.language, None, URIRefOrLiteral),
-                ("conforms_to", DCT.conformsTo, None, Literal),
+                ("documentation", FOAF.page, None, URIRefOrLiteral, FOAF.Document),
+                ("language", DCT.language, None, URIRefOrLiteral, DCT.LinguisticSystem),
+                ("conforms_to", DCT.conformsTo, None, URIRefOrLiteral, DCT.Standard),
             ]
             self._add_list_triples_from_dict(resource_dict, distribution, items)
 
             # Set default license for distribution if needed and available
+
             if resource_license_fallback and not (distribution, DCT.license, None) in g:
                 g.add(
                     (
@@ -493,6 +494,15 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                         URIRefOrLiteral(resource_license_fallback),
                     )
                 )
+            # TODO: add an actual field to manage this
+            if (distribution, DCT.license, None) in g:
+                g.add(
+                    (
+                        list(g.objects(distribution, DCT.license))[0],
+                        DCT.type,
+                        URIRef("http://purl.org/adms/licencetype/UnknownIPR")
+                    )
+                )
 
             # Format
             mimetype = resource_dict.get("mimetype")
@@ -515,10 +525,16 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                     mimetype = None
 
             if mimetype:
-                g.add((distribution, DCAT.mediaType, URIRefOrLiteral(mimetype)))
+                mimetype = URIRefOrLiteral(mimetype)
+                g.add((distribution, DCAT.mediaType, mimetype))
+                if isinstance(mimetype, URIRef):
+                    g.add((mimetype, RDF.type, DCT.MediaType))
 
             if fmt:
-                g.add((distribution, DCT["format"], URIRefOrLiteral(fmt)))
+                fmt = URIRefOrLiteral(fmt)
+                g.add((distribution, DCT["format"], fmt))
+                if isinstance(fmt, URIRef):
+                    g.add((fmt, RDF.type, DCT.MediaTypeOrExtent))
 
             # URL fallback and old behavior
             url = resource_dict.get("url")

diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py
@@ -1,7 +1,7 @@
 import json
 from decimal import Decimal, DecimalException
 
-from rdflib import URIRef, BNode, Literal
+from rdflib import URIRef, BNode, Literal, Namespace
 from ckanext.dcat.utils import resource_uri
 
 from .base import URIRefOrLiteral, CleanedURIRef
@@ -13,11 +13,15 @@
     DCT,
     XSD,
     SCHEMA,
+    RDFS,
 )
 
 from .euro_dcat_ap import EuropeanDCATAPProfile
 
 
+ELI = Namespace("http://data.europa.eu/eli/ontology#")
+
+
 class EuropeanDCATAP2Profile(EuropeanDCATAPProfile):
     """
     An RDF profile based on the DCAT-AP 2 for data portals in Europe
@@ -36,7 +40,9 @@ def parse_dataset(self, dataset_dict, dataset_ref):
         # Standard values
         value = self._object_value(dataset_ref, DCAT.temporalResolution)
         if value:
-            dataset_dict["extras"].append({"key": "temporal_resolution", "value": value})
+            dataset_dict["extras"].append(
+                {"key": "temporal_resolution", "value": value}
+            )
 
         # Lists
         for key, predicate in (
@@ -67,7 +73,8 @@ def parse_dataset(self, dataset_dict, dataset_ref):
             # For some reason we incorrectly allowed lists in this property at some point
             # keep support for it but default to single value
             value = (
-                spatial_resolution[0] if len(spatial_resolution) == 1
+                spatial_resolution[0]
+                if len(spatial_resolution) == 1
                 else json.dumps(spatial_resolution)
             )
             dataset_dict["extras"].append(
@@ -169,16 +176,24 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
         )
 
         # Lists
-        for key, predicate, fallbacks, type, datatype in (
-            ("is_referenced_by", DCT.isReferencedBy, None, URIRefOrLiteral, None),
+        for key, predicate, fallbacks, type, datatype, _class in (
+            (
+                "is_referenced_by",
+                DCT.isReferencedBy,
+                None,
+                URIRefOrLiteral,
+                None,
+                RDFS.Resource,
+            ),
             (
                 "applicable_legislation",
                 DCATAP.applicableLegislation,
                 None,
                 URIRefOrLiteral,
                 None,
+                ELI.LegalResource,
             ),
-            ("hvd_category", DCATAP.hvdCategory, None, URIRefOrLiteral, None),
+            ("hvd_category", DCATAP.hvdCategory, None, URIRefOrLiteral, None, None),
         ):
             self._add_triple_from_dict(
                 dataset_dict,
@@ -254,8 +269,20 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             #  Simple values
             items = [
                 ("availability", DCATAP.availability, None, URIRefOrLiteral),
-                ("compress_format", DCAT.compressFormat, None, URIRefOrLiteral),
-                ("package_format", DCAT.packageFormat, None, URIRefOrLiteral),
+                (
+                    "compress_format",
+                    DCAT.compressFormat,
+                    None,
+                    URIRefOrLiteral,
+                    DCT.MediaType,
+                ),
+                (
+                    "package_format",
+                    DCAT.packageFormat,
+                    None,
+                    URIRefOrLiteral,
+                    DCT.MediaType,
+                ),
             ]
 
             self._add_triples_from_dict(resource_dict, distribution, items)
@@ -267,6 +294,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                     DCATAP.applicableLegislation,
                     None,
                     URIRefOrLiteral,
+                    ELI.LegalResource,
                 ),
             ]
             self._add_list_triples_from_dict(resource_dict, distribution, items)
@@ -300,7 +328,12 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                     ("license", DCT.license, None, URIRefOrLiteral),
                     ("access_rights", DCT.accessRights, None, URIRefOrLiteral),
                     ("title", DCT.title, None, Literal),
-                    ("endpoint_description", DCAT.endpointDescription, None, URIRefOrLiteral),
+                    (
+                        "endpoint_description",
+                        DCAT.endpointDescription,
+                        None,
+                        URIRefOrLiteral,
+                    ),
                     ("description", DCT.description, None, Literal),
                 ]
 
@@ -310,7 +343,13 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
 
                 #  Lists
                 items = [
-                    ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral),
+                    (
+                        "endpoint_url",
+                        DCAT.endpointURL,
+                        None,
+                        URIRefOrLiteral,
+                        RDFS.Resource,
+                    ),
                     ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral),
                 ]
                 self._add_list_triples_from_dict(

diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
@@ -71,9 +71,7 @@ def _parse_list_value(data_dict, field_name):
                 _parse_list_value(resource_dict, field_name)
 
         # Repeating subfields
-        new_fields_mapping = {
-            "temporal_coverage": "temporal"
-        }
+        new_fields_mapping = {"temporal_coverage": "temporal"}
         for schema_field in self._dataset_schema["dataset_fields"]:
             if "repeating_subfields" in schema_field:
                 # Check if existing extras need to be migrated
@@ -132,7 +130,7 @@ def _not_empty_dict(data_dict):
                 else:
                     contact_details = BNode()
 
-                self.g.add((contact_details, RDF.type, VCARD.Organization))
+                self.g.add((contact_details, RDF.type, VCARD.Kind))
                 self.g.add((dataset_ref, DCAT.contactPoint, contact_details))
 
                 self._add_triple_from_dict(item, contact_details, VCARD.fn, "name")
@@ -147,23 +145,32 @@ def _not_empty_dict(data_dict):
                 )
 
         publisher = dataset_dict.get("publisher")
-        if isinstance(publisher, list) and len(publisher) and _not_empty_dict(publisher[0]):
+        if (
+            isinstance(publisher, list)
+            and len(publisher)
+            and _not_empty_dict(publisher[0])
+        ):
             publisher = publisher[0]
             publisher_uri = publisher.get("uri")
             if publisher_uri:
                 publisher_ref = CleanedURIRef(publisher_uri)
             else:
                 publisher_ref = BNode()
 
-            self.g.add((publisher_ref, RDF.type, FOAF.Organization))
+            self.g.add((publisher_ref, RDF.type, FOAF.Agent))
             self.g.add((dataset_ref, DCT.publisher, publisher_ref))
 
             self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name")
             self._add_triple_from_dict(
                 publisher, publisher_ref, FOAF.homepage, "url", _type=URIRef
             )
             self._add_triple_from_dict(
-                publisher, publisher_ref, DCT.type, "type", _type=URIRefOrLiteral
+                publisher,
+                publisher_ref,
+                DCT.type,
+                "type",
+                _type=URIRefOrLiteral,
+                _class=SKOS.Concept,
             )
             self._add_triple_from_dict(
                 publisher,
@@ -175,7 +182,11 @@ def _not_empty_dict(data_dict):
             )
 
         temporal = dataset_dict.get("temporal_coverage")
-        if isinstance(temporal, list) and len(temporal) and _not_empty_dict(temporal[0]):
+        if (
+            isinstance(temporal, list)
+            and len(temporal)
+            and _not_empty_dict(temporal[0])
+        ):
             for item in temporal:
                 temporal_ref = BNode()
                 self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime))