diff --git a/tests/conftest.py b/tests/conftest.py index 8616c13..6d924a6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import transmogrifier.models as timdex from transmogrifier.config import SOURCES, load_external_config -from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer +from transmogrifier.sources.transformer import JSONTransformer, XMLTransformer from transmogrifier.sources.xml.datacite import Datacite @@ -48,23 +48,21 @@ def runner(): @pytest.fixture def aardvark_record_all_fields(): - return next( - JsonTransformer.parse_source_file( - "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl" - ) + return JSONTransformer.parse_source_file( + "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl" ) @pytest.fixture() def datacite_records(): - return XmlTransformer.parse_source_file( + return XMLTransformer.parse_source_file( "tests/fixtures/datacite/datacite_records.xml" ) @pytest.fixture() def datacite_record_all_fields(): - source_records = XmlTransformer.parse_source_file( + source_records = XMLTransformer.parse_source_file( "tests/fixtures/datacite/datacite_record_all_fields.xml" ) return Datacite("cool-repo", source_records) @@ -72,7 +70,7 @@ def datacite_record_all_fields(): @pytest.fixture() def aardvark_records(): - return JsonTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") + return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") @pytest.fixture() @@ -87,7 +85,7 @@ def marc_content_type_crosswalk(): @pytest.fixture() def oai_pmh_records(): - return XmlTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") + return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") @pytest.fixture() diff --git a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl index af39020..89eca27 100644 --- a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl +++ b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl @@ -1 +1 @@ -{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"} \ No newline at end of file +{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"} \ No newline at end of file diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py index a87637e..99c8939 100644 --- a/tests/sources/json/test_aardvark.py +++ b/tests/sources/json/test_aardvark.py @@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records): } -def test_jsontransformer_transform_returns_timdex_record(aardvark_records): +def test_aardvark_transform_returns_timdex_record(aardvark_records): transformer = MITAardvark("cool-repo", aardvark_records) assert next(transformer) == timdex.TimdexRecord( source="A Cool Repository", @@ -24,16 +24,92 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records): ) +def test_aardvark_get_optional_fields_non_field_method_values_success( + aardvark_record_all_fields, +): + transformer = MITAardvark("cool-repo", aardvark_record_all_fields) + record = next(transformer) + assert record.format == "Shapefile" + assert record.languages == ["eng"] + assert record.summary == ["A description"] + + def test_aardvark_get_main_titles_success(aardvark_record_all_fields): - assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"] + assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [ + "Test title 1" + ] def test_aardvark_get_source_record_id_success(aardvark_record_all_fields): - assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123" + assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123" + + +def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields): + assert MITAardvark.get_alternate_titles(next(aardvark_record_all_fields)) == [ + timdex.AlternateTitle(value="Alternate title") + ] + + +def test_aardvark_get_contributors_success(aardvark_record_all_fields): + assert MITAardvark.get_contributors(next(aardvark_record_all_fields)) == [ + timdex.Contributor( + value="Smith, Jane", + kind="Creator", + ), + timdex.Contributor( + value="Smith, John", + kind="Creator", + ), + ] + + +def test_aardvark_get_notes_success(aardvark_record_all_fields): + assert MITAardvark.get_notes(next(aardvark_record_all_fields)) == [ + timdex.Note( + value=["Danger: This text will be displayed in a red box"], + kind="Display note", + ), + timdex.Note( + value=["Info: This text will be displayed in a blue box"], + kind="Display note", + ), + timdex.Note( + value=["Tip: This text will be displayed in a green box"], + kind="Display note", + ), + timdex.Note( + value=["Warning: This text will be displayed in a yellow box"], + kind="Display note", + ), + timdex.Note( + value=[ + "This is text without a tag and it will be assigned default 'note' style" + ], + kind="Display note", + ), + ] + + +def test_aardvark_get_publication_information_success(aardvark_record_all_fields): + assert MITAardvark.get_publication_information( + next(aardvark_record_all_fields) + ) == ["ML InfoMap (Firm)", "MIT"] + + +def test_aardvark_get_rights_success(aardvark_record_all_fields): + assert MITAardvark.get_rights(next(aardvark_record_all_fields)) == [ + timdex.Rights(description="Access note", kind="Access"), + timdex.Rights(uri="http://license.license"), + timdex.Rights(uri="http://another_license.another_license"), + timdex.Rights(description="Some person has the rights"), + timdex.Rights( + description="The person with the rights. Another person with the rights" + ), + ] def test_aardvark_get_subjects_success(aardvark_record_all_fields): - assert MITAardvark.get_subjects(aardvark_record_all_fields) == [ + assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [ timdex.Subject(value=["Country"], kind="DCAT Keyword"), timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"), timdex.Subject(value=["Geography"], kind="Dublin Core Subject"), diff --git a/tests/sources/test_transformer.py b/tests/sources/test_transformer.py index 33ecb84..cda6d22 100644 --- a/tests/sources/test_transformer.py +++ b/tests/sources/test_transformer.py @@ -4,7 +4,7 @@ import pytest from transmogrifier.models import TimdexRecord -from transmogrifier.sources.transformer import Transformer, XmlTransformer +from transmogrifier.sources.transformer import Transformer, XMLTransformer from transmogrifier.sources.xml.datacite import Datacite @@ -28,7 +28,7 @@ def test_transformer_get_transformer_source_wrong_module_path_raises_error(bad_c def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records): - transformer = XmlTransformer("cool-repo", oai_pmh_records) + transformer = XMLTransformer("cool-repo", oai_pmh_records) assert transformer.source == "cool-repo" assert transformer.source_base_url == "https://example.com/" assert transformer.source_name == "A Cool Repository" @@ -36,7 +36,7 @@ def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records): def test_xmltransformer_iterates_through_all_records(oai_pmh_records): - output_records = XmlTransformer("cool-repo", oai_pmh_records) + output_records = XMLTransformer("cool-repo", oai_pmh_records) assert len(list(output_records)) == 2 assert output_records.processed_record_count == 3 assert output_records.transformed_record_count == 2 @@ -47,10 +47,10 @@ def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_non oai_pmh_records, ): with patch( - "transmogrifier.sources.transformer.XmlTransformer.get_optional_fields" + "transmogrifier.sources.transformer.XMLTransformer.get_optional_fields" ) as m: m.return_value = None - output_records = XmlTransformer("cool-repo", oai_pmh_records) + output_records = XMLTransformer("cool-repo", oai_pmh_records) assert len(list(output_records)) == 0 assert output_records.processed_record_count == 3 assert output_records.skipped_record_count == 2 @@ -62,7 +62,7 @@ def test_xmltransformer_transform_and_write_output_files_writes_output_files( tmp_path, oai_pmh_records ): output_file = str(tmp_path / "output_file.json") - transformer = XmlTransformer("cool-repo", oai_pmh_records) + transformer = XMLTransformer("cool-repo", oai_pmh_records) assert not Path(tmp_path / "output_file.json").exists() assert not Path(tmp_path / "output_file.txt").exists() transformer.transform_and_write_output_files(output_file) @@ -74,31 +74,31 @@ def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_need tmp_path, ): output_file = str(tmp_path / "output_file.json") - datacite_records = XmlTransformer.parse_source_file( + datacite_records = XMLTransformer.parse_source_file( "tests/fixtures/datacite/datacite_records.xml" ) - transformer = XmlTransformer("cool-repo", datacite_records) + transformer = XMLTransformer("cool-repo", datacite_records) transformer.transform_and_write_output_files(output_file) assert len(list(tmp_path.iterdir())) == 1 assert next(tmp_path.iterdir()).name == "output_file.json" def test_xmltransformer_parse_source_file_returns_record_iterator(): - records = XmlTransformer.parse_source_file( + records = XMLTransformer.parse_source_file( "tests/fixtures/datacite/datacite_records.xml" ) assert len(list(records)) == 38 def test_xmltransformer_record_is_deleted_returns_true_if_deleted(caplog): - source_records = XmlTransformer.parse_source_file( + source_records = XMLTransformer.parse_source_file( "tests/fixtures/record_deleted.xml" ) - assert XmlTransformer.record_is_deleted(next(source_records)) is True + assert XMLTransformer.record_is_deleted(next(source_records)) is True def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_records): - transformer = XmlTransformer("cool-repo", oai_pmh_records) + transformer = XMLTransformer("cool-repo", oai_pmh_records) assert transformer.get_required_fields(next(oai_pmh_records)) == { "source": "A Cool Repository", "source_link": "https://example.com/12345", @@ -108,7 +108,7 @@ def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_reco def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records): - transformer = XmlTransformer("cool-repo", oai_pmh_records) + transformer = XMLTransformer("cool-repo", oai_pmh_records) assert next(transformer) == TimdexRecord( source="A Cool Repository", source_link="https://example.com/12345", @@ -120,7 +120,7 @@ def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records): def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(caplog): - source_records = XmlTransformer.parse_source_file( + source_records = XMLTransformer.parse_source_file( "tests/fixtures/record_title_field_blank.xml" ) output_records = Datacite("cool-repo", source_records) @@ -132,7 +132,7 @@ def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(capl def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(caplog): - source_records = XmlTransformer.parse_source_file( + source_records = XMLTransformer.parse_source_file( "tests/fixtures/record_title_field_missing.xml" ) output_records = Datacite("cool-repo", source_records) @@ -144,7 +144,7 @@ def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(ca def test_xmltransformer_get_valid_title_with_title_field_multiple_logs_warning(caplog): - source_records = XmlTransformer.parse_source_file( + source_records = XMLTransformer.parse_source_file( "tests/fixtures/record_title_field_multiple.xml" ) output_records = Datacite("cool-repo", source_records) diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py index 25429cf..47324a8 100644 --- a/transmogrifier/sources/json/aardvark.py +++ b/transmogrifier/sources/json/aardvark.py @@ -1,12 +1,12 @@ import logging import transmogrifier.models as timdex -from transmogrifier.sources.transformer import JsonTransformer +from transmogrifier.sources.transformer import JSONTransformer logger = logging.getLogger(__name__) -class MITAardvark(JsonTransformer): +class MITAardvark(JSONTransformer): """MITAardvark transformer. MIT Aardvark records have more required fields than standard Aardvark records @@ -51,30 +51,32 @@ def record_is_deleted(cls, source_record: dict) -> bool: def get_optional_fields(self, source_record: dict) -> dict | None: """ - Retrieve optional TIMDEX fields from a Aardvar JSON record. + Retrieve optional TIMDEX fields from an Aardvark JSON record. Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single Datacite record in - oai_datacite XML. + source_record: A JSON object representing a source record. """ fields: dict = {} # alternate_titles + fields["alternate_titles"] = self.get_alternate_titles(source_record) or None # content_type fields["content_type"] = ["Geospatial data"] # contributors + fields["contributors"] = self.get_contributors(source_record) or None # dates - # edition + # edition not used in MITAardvark # format + fields["format"] = source_record.get("dct_format_s") - # funding_information + # funding_information not used in MITAardvark # identifiers @@ -86,19 +88,92 @@ def get_optional_fields(self, source_record: dict) -> dict | None: # locations # notes + fields["notes"] = self.get_notes(source_record) or None # publication_information + fields["publication_information"] = ( + self.get_publication_information(source_record) or None + ) - # related_items + # related_items not used in MITAardvark # rights + fields["rights"] = self.get_rights(source_record) or None # subjects fields["subjects"] = self.get_subjects(source_record) or None # summary field + fields["summary"] = source_record.get("dct_description_sm") + return fields + @staticmethod + def get_alternate_titles(source_record: dict) -> list[timdex.AlternateTitle]: + """Get values from source record for TIMDEX alternate_titles field.""" + return [ + timdex.AlternateTitle(value=title_value) + for title_value in source_record.get("dct_alternative_sm", []) + ] + + @staticmethod + def get_contributors(source_record: dict) -> list[timdex.Contributor]: + """Get values from source record for TIMDEX contributors field.""" + return [ + timdex.Contributor(value=contributor_value, kind="Creator") + for contributor_value in source_record.get("dct_creator_sm", []) + ] + + @staticmethod + def get_notes(source_record: dict) -> list[timdex.Note]: + """Get values from source record for TIMDEX notes field.""" + return [ + timdex.Note(value=[note_value], kind="Display note") + for note_value in source_record.get("gbl_displayNote_sm", []) + ] + + @staticmethod + def get_publication_information(source_record: dict) -> list[str]: + """Get values from source record for TIMDEX publication_information field.""" + publication_information = [] + + if "dct_publisher_sm" in source_record: + publication_information.extend(source_record["dct_publisher_sm"]) + + if "schema_provider_s" in source_record: + publication_information.append(source_record["schema_provider_s"]) + + return publication_information + + @staticmethod + def get_rights(source_record: dict) -> list[timdex.Rights]: + """Get values from source record for TIMDEX rights field.""" + rights = [] + + if "dct_accessRights_s" in source_record: + rights.append( + timdex.Rights( + description=source_record["dct_accessRights_s"], kind="Access" + ) + ) + + rights.extend( + [ + timdex.Rights(uri=rights_uri_value) + for rights_uri_value in source_record.get("dct_license_sm", []) + ] + ) + + for aardvark_rights_field in ["dct_rights_sm", "dct_rightsHolder_sm"]: + if aardvark_rights_field in source_record: + rights.append( + timdex.Rights( + description=". ".join(source_record[aardvark_rights_field]) + ) + ) + + return rights + @staticmethod def get_subjects(source_record: dict) -> list[timdex.Subject]: """Get values from source record for TIMDEX subjects field. @@ -115,6 +190,7 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: source_record: A JSON object representing a source record. """ subjects = [] + aardvark_subject_fields = { "dcat_keyword_sm": "DCAT Keyword", "dcat_theme_sm": "DCAT Theme", @@ -122,6 +198,7 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: "gbl_resourceClass_sm": "Subject scheme not provided", "gbl_resourceType_sm": "Subject scheme not provided", } + for aardvark_subject_field, kind_value in { key: value for key, value in aardvark_subject_fields.items() @@ -129,4 +206,5 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: }.items(): for subject in source_record[aardvark_subject_field]: subjects.append(timdex.Subject(value=[subject], kind=kind_value)) + return subjects diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index d45e8ab..1a0acc8 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -374,7 +374,7 @@ def get_optional_fields( return {} -class JsonTransformer(Transformer): +class JSONTransformer(Transformer): """JSON transformer class.""" @final @@ -525,7 +525,7 @@ def get_optional_fields(self, source_record: dict[str, JSON]) -> Optional[dict]: return {} -class XmlTransformer(Transformer): +class XMLTransformer(Transformer): """XML transformer class.""" @final diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index 7741b04..b46c232 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) -class Datacite(XmlTransformer): +class Datacite(XMLTransformer): """Datacite transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py index 433a936..9c3b150 100644 --- a/transmogrifier/sources/xml/dspace_dim.py +++ b/transmogrifier/sources/xml/dspace_dim.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) -class DspaceDim(XmlTransformer): +class DspaceDim(XMLTransformer): """DSpace DIM transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/xml/dspace_mets.py b/transmogrifier/sources/xml/dspace_mets.py index 7db882f..6b72868 100644 --- a/transmogrifier/sources/xml/dspace_mets.py +++ b/transmogrifier/sources/xml/dspace_mets.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) -class DspaceMets(XmlTransformer): +class DspaceMets(XMLTransformer): """DSpace METS transformer.""" def get_optional_fields(self, xml: Tag) -> dict: diff --git a/transmogrifier/sources/xml/ead.py b/transmogrifier/sources/xml/ead.py index 82cdbad..83e2015 100644 --- a/transmogrifier/sources/xml/ead.py +++ b/transmogrifier/sources/xml/ead.py @@ -6,7 +6,7 @@ import transmogrifier.models as timdex from transmogrifier.config import load_external_config from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) @@ -16,7 +16,7 @@ ) -class Ead(XmlTransformer): +class Ead(XMLTransformer): """EAD transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/xml/marc.py b/transmogrifier/sources/xml/marc.py index 0077cc4..4ceda70 100644 --- a/transmogrifier/sources/xml/marc.py +++ b/transmogrifier/sources/xml/marc.py @@ -6,7 +6,7 @@ import transmogrifier.models as timdex from transmogrifier.config import load_external_config from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ ) -class Marc(XmlTransformer): +class Marc(XMLTransformer): """Marc transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/xml/oaidc.py b/transmogrifier/sources/xml/oaidc.py index 7e672c7..2cedbc4 100644 --- a/transmogrifier/sources/xml/oaidc.py +++ b/transmogrifier/sources/xml/oaidc.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import XmlTransformer +from transmogrifier.sources.transformer import XMLTransformer logger = logging.getLogger(__name__) -class OaiDc(XmlTransformer): +class OaiDc(XMLTransformer): """ Generic OAI DC transformer.