From 8239e1bca7dea0b78d3438c78f16df32945b905a Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 7 Oct 2024 14:07:15 +0200 Subject: [PATCH] Add identifier property to Distributions --- ckanext/dcat/profiles/dcat_us_3.py | 50 +++++++------ ckanext/dcat/schemas/dcat_us_full.yaml | 6 +- .../dcat_us_3/test_dcat_us_3_profile_parse.py | 1 + .../test_dcat_us_3_profile_serialize.py | 74 ++++++++++++++++--- examples/dcat/dataset.rdf | 1 + 5 files changed, 96 insertions(+), 36 deletions(-) diff --git a/ckanext/dcat/profiles/dcat_us_3.py b/ckanext/dcat/profiles/dcat_us_3.py index 9a2dd0be..3ea5be1f 100644 --- a/ckanext/dcat/profiles/dcat_us_3.py +++ b/ckanext/dcat/profiles/dcat_us_3.py @@ -49,27 +49,29 @@ def graph_from_catalog(self, catalog_dict, catalog_ref): self._graph_from_catalog_base(catalog_dict, catalog_ref) - def _graph_from_dataset_v3(self, dataset_dict, dataset_ref): - - # byteSize decimal -> nonNegativeInteger - for subject, predicate, object in self.g.triples((None, DCAT.byteSize, None)): - if object and object.datatype == XSD.decimal: - self.g.remove((subject, predicate, object)) - - self.g.add( - ( - subject, - predicate, - Literal(int(object), datatype=XSD.nonNegativeInteger), - ) - ) - - # Other identifiers - value = self._get_dict_value(dataset_dict, "alternate_identifier") - if value: - items = self._read_list_value(value) - for item in items: - identifier = BNode() - self.g.add((dataset_ref, ADMS.identifier, identifier)) - self.g.add((identifier, RDF.type, ADMS.Identifier)) - self.g.add((identifier, SKOS.notation, Literal(item))) + def _parse_dataset_v3_us(self, dataset_dict, dataset_ref): + + for distribution_ref in self._distributions(dataset_ref): + + # Distribution identifier + value = self._object_value(distribution_ref, DCT.identifier) + if value: + for resource_dict in dataset_dict.get("resources", []): + if resource_dict["distribution_ref"] == str(distribution_ref): + resource_dict["identifier"] = value + + def _graph_from_dataset_v3_us(self, 
dataset_dict, dataset_ref): + + for resource_dict in dataset_dict.get("resources", []): + + distribution_ref = CleanedURIRef(resource_uri(resource_dict)) + + # Distribution identifier + self._add_triple_from_dict( + resource_dict, + distribution_ref, + DCT.identifier, + "identifier", + fallbacks=["guid", "id"], + _type=URIRefOrLiteral, + ) diff --git a/ckanext/dcat/schemas/dcat_us_full.yaml b/ckanext/dcat/schemas/dcat_us_full.yaml index 8d81835f..9d1f0206 100644 --- a/ckanext/dcat/schemas/dcat_us_full.yaml +++ b/ckanext/dcat/schemas/dcat_us_full.yaml @@ -147,7 +147,7 @@ dataset_fields: # Note: CKAN will generate a unique identifier for each dataset - field_name: identifier label: Identifier - help_text: A unique identifier of the dataset. + help_text: A unique identifier of the dataset, if not provided it will fall back to CKAN's internal id. - field_name: frequency label: Frequency @@ -354,6 +354,10 @@ resource_fields: display_snippet: link.html help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). +- field_name: identifier + label: Identifier + help_text: A unique identifier of the resource, if not provided it will fall back to CKAN's internal id. 
+ - field_name: issued label: Release date preset: dcat_date diff --git a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py index da5f4842..24414c7a 100644 --- a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py @@ -112,6 +112,7 @@ def test_e2e_dcat_to_ckan(self): # Resources: standard fields assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" + assert resource["identifier"] == "https://example.org/distributions/1" assert resource["rights"] == "Some statement about rights" assert resource["issued"] == "2012-05-11" assert resource["modified"] == "2012-05-01T00:04:06" diff --git a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py index 2b1a3363..2284c558 100644 --- a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py @@ -29,17 +29,17 @@ DCAT_AP_PROFILES = ["dcat_us_3"] +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_us_full.yaml" +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "dcat_us_3") class TestDCATUS3ProfileSerializeDataset(BaseSerializeTest): - @pytest.mark.usefixtures("with_plugins", "clean_db") - @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") - @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_us_full.yaml" - ) - @pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json 
ckanext.dcat.schemas:presets.yaml", - ) - @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "dcat_us_3") def test_e2e_ckan_to_dcat(self): """ Create a dataset using the scheming schema, check that fields @@ -82,7 +82,9 @@ def test_e2e_ckan_to_dcat(self): ) assert self._triple(g, dataset_ref, DCT.type, URIRef(dataset["dcat_type"])) assert self._triple(g, dataset_ref, ADMS.versionNotes, dataset["version_notes"]) - assert self._triple(g, dataset_ref, DCT.accessRights, URIRef(dataset["access_rights"])) + assert self._triple( + g, dataset_ref, DCT.accessRights, URIRef(dataset["access_rights"]) + ) assert self._triple( g, dataset_ref, @@ -332,3 +334,53 @@ def test_e2e_ckan_to_dcat(self): ] assert endpoint_urls == resource["access_services"][0]["endpoint_url"] + def test_distribution_identifier(self): + + dataset_dict = { + "name": "test-dcat-us", + "description": "Test", + "resources": [ + { + "id": "89b67e5b-d0e1-4bc3-a75a-59f21c66ebc0", + "name": "some data", + "identifier": "https://example.org/distributions/1", + } + ], + } + + s = RDFSerializer(profiles=DCAT_AP_PROFILES) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset_dict) + + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + resource = dataset_dict["resources"][0] + + assert self._triple( + g, distribution_ref, DCT.identifier, URIRef(resource["identifier"]) + ) + + def test_distribution_identifier_falls_back_to_id(self): + + dataset_dict = { + "name": "test-dcat-us", + "description": "Test", + "resources": [ + { + "id": "89b67e5b-d0e1-4bc3-a75a-59f21c66ebc0", + "name": "some data", + } + ], + } + + s = RDFSerializer(profiles=DCAT_AP_PROFILES) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset_dict) + + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + resource = dataset_dict["resources"][0] + + assert self._triple( + g, distribution_ref, DCT.identifier, resource["id"] + ) diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf 
index 8cd9619f..26212d4b 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -91,6 +91,7 @@ Some website A longer description + https://example.org/distributions/1 2012-05-11 2012-05-01T00:04:06