Additional MITAardvark methods (#109)

* Additional MITAardvark methods Why these changes are being introduced: * Additional methods are needed for the MITAardvark class How this addresses that need: * Update get_optional_fields method to include format and summary values and add corresponding unit test * Add get_alternate_titles, get_contributors, get_notes, get_publication_information, and get_rights field methods along with calls in get_optional_fields and corresponding unit tests * Update aardvark_record_all_fields fixture to include new fields Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-54 * Updates based on discussion in PR #109 * Update dct_license_sm field in aardvark fixture to properly reflect expected value * Refactor get_alternate_titles, get_contributors, and get_notes methods for more efficient processing * Update get_rights method to properly process expected value of dct_license_sm * Update class name capitalization
MITLibraries · Dec 20, 2023 · 97b8bc1 · 97b8bc1
1 parent 1aba987
commit 97b8bc1
Show file tree

Hide file tree

Showing 12 changed files with 204 additions and 52 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,7 +5,7 @@
 
 import transmogrifier.models as timdex
 from transmogrifier.config import SOURCES, load_external_config
-from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
+from transmogrifier.sources.transformer import JSONTransformer, XMLTransformer
 from transmogrifier.sources.xml.datacite import Datacite
 
 
@@ -48,31 +48,29 @@ def runner():
 
 @pytest.fixture
 def aardvark_record_all_fields():
-    return next(
-        JsonTransformer.parse_source_file(
-            "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
-        )
+    return JSONTransformer.parse_source_file(
+        "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
     )
 
 
 @pytest.fixture()
 def datacite_records():
-    return XmlTransformer.parse_source_file(
+    return XMLTransformer.parse_source_file(
         "tests/fixtures/datacite/datacite_records.xml"
     )
 
 
 @pytest.fixture()
 def datacite_record_all_fields():
-    source_records = XmlTransformer.parse_source_file(
+    source_records = XMLTransformer.parse_source_file(
         "tests/fixtures/datacite/datacite_record_all_fields.xml"
     )
     return Datacite("cool-repo", source_records)
 
 
 @pytest.fixture()
 def aardvark_records():
-    return JsonTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")
+    return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")
 
 
 @pytest.fixture()
@@ -87,7 +85,7 @@ def marc_content_type_crosswalk():
 
 @pytest.fixture()
 def oai_pmh_records():
-    return XmlTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")
+    return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")
 
 
 @pytest.fixture()

diff --git a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
@@ -1 +1 @@
-{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
+{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}
diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py
@@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
     }
 
 
-def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
+def test_aardvark_transform_returns_timdex_record(aardvark_records):
     transformer = MITAardvark("cool-repo", aardvark_records)
     assert next(transformer) == timdex.TimdexRecord(
         source="A Cool Repository",
@@ -24,16 +24,92 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
     )
 
 
+def test_aardvark_get_optional_fields_non_field_method_values_success(
+    aardvark_record_all_fields,
+):
+    transformer = MITAardvark("cool-repo", aardvark_record_all_fields)
+    record = next(transformer)
+    assert record.format == "Shapefile"
+    assert record.languages == ["eng"]
+    assert record.summary == ["A description"]
+
+
 def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
-    assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
+    assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [
+        "Test title 1"
+    ]
 
 
 def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
-    assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"
+    assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"
+
+
+def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields):
+    assert MITAardvark.get_alternate_titles(next(aardvark_record_all_fields)) == [
+        timdex.AlternateTitle(value="Alternate title")
+    ]
+
+
+def test_aardvark_get_contributors_success(aardvark_record_all_fields):
+    assert MITAardvark.get_contributors(next(aardvark_record_all_fields)) == [
+        timdex.Contributor(
+            value="Smith, Jane",
+            kind="Creator",
+        ),
+        timdex.Contributor(
+            value="Smith, John",
+            kind="Creator",
+        ),
+    ]
+
+
+def test_aardvark_get_notes_success(aardvark_record_all_fields):
+    assert MITAardvark.get_notes(next(aardvark_record_all_fields)) == [
+        timdex.Note(
+            value=["Danger: This text will be displayed in a red box"],
+            kind="Display note",
+        ),
+        timdex.Note(
+            value=["Info: This text will be displayed in a blue box"],
+            kind="Display note",
+        ),
+        timdex.Note(
+            value=["Tip: This text will be displayed in a green box"],
+            kind="Display note",
+        ),
+        timdex.Note(
+            value=["Warning: This text will be displayed in a yellow box"],
+            kind="Display note",
+        ),
+        timdex.Note(
+            value=[
+                "This is text without a tag and it will be assigned default 'note' style"
+            ],
+            kind="Display note",
+        ),
+    ]
+
+
+def test_aardvark_get_publication_information_success(aardvark_record_all_fields):
+    assert MITAardvark.get_publication_information(
+        next(aardvark_record_all_fields)
+    ) == ["ML InfoMap (Firm)", "MIT"]
+
+
+def test_aardvark_get_rights_success(aardvark_record_all_fields):
+    assert MITAardvark.get_rights(next(aardvark_record_all_fields)) == [
+        timdex.Rights(description="Access note", kind="Access"),
+        timdex.Rights(uri="http://license.license"),
+        timdex.Rights(uri="http://another_license.another_license"),
+        timdex.Rights(description="Some person has the rights"),
+        timdex.Rights(
+            description="The person with the rights. Another person with the rights"
+        ),
+    ]
 
 
 def test_aardvark_get_subjects_success(aardvark_record_all_fields):
-    assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
+    assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
         timdex.Subject(value=["Country"], kind="DCAT Keyword"),
         timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
         timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),

diff --git a/tests/sources/test_transformer.py b/tests/sources/test_transformer.py
@@ -4,7 +4,7 @@
 import pytest
 
 from transmogrifier.models import TimdexRecord
-from transmogrifier.sources.transformer import Transformer, XmlTransformer
+from transmogrifier.sources.transformer import Transformer, XMLTransformer
 from transmogrifier.sources.xml.datacite import Datacite
 
 
@@ -28,15 +28,15 @@ def test_transformer_get_transformer_source_wrong_module_path_raises_error(bad_c
 
 
 def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records):
-    transformer = XmlTransformer("cool-repo", oai_pmh_records)
+    transformer = XMLTransformer("cool-repo", oai_pmh_records)
     assert transformer.source == "cool-repo"
     assert transformer.source_base_url == "https://example.com/"
     assert transformer.source_name == "A Cool Repository"
     assert transformer.source_records == oai_pmh_records
 
 
 def test_xmltransformer_iterates_through_all_records(oai_pmh_records):
-    output_records = XmlTransformer("cool-repo", oai_pmh_records)
+    output_records = XMLTransformer("cool-repo", oai_pmh_records)
     assert len(list(output_records)) == 2
     assert output_records.processed_record_count == 3
     assert output_records.transformed_record_count == 2
@@ -47,10 +47,10 @@ def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_non
     oai_pmh_records,
 ):
     with patch(
-        "transmogrifier.sources.transformer.XmlTransformer.get_optional_fields"
+        "transmogrifier.sources.transformer.XMLTransformer.get_optional_fields"
     ) as m:
         m.return_value = None
-        output_records = XmlTransformer("cool-repo", oai_pmh_records)
+        output_records = XMLTransformer("cool-repo", oai_pmh_records)
         assert len(list(output_records)) == 0
         assert output_records.processed_record_count == 3
         assert output_records.skipped_record_count == 2
@@ -62,7 +62,7 @@ def test_xmltransformer_transform_and_write_output_files_writes_output_files(
     tmp_path, oai_pmh_records
 ):
     output_file = str(tmp_path / "output_file.json")
-    transformer = XmlTransformer("cool-repo", oai_pmh_records)
+    transformer = XMLTransformer("cool-repo", oai_pmh_records)
     assert not Path(tmp_path / "output_file.json").exists()
     assert not Path(tmp_path / "output_file.txt").exists()
     transformer.transform_and_write_output_files(output_file)
@@ -74,31 +74,31 @@ def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_need
     tmp_path,
 ):
     output_file = str(tmp_path / "output_file.json")
-    datacite_records = XmlTransformer.parse_source_file(
+    datacite_records = XMLTransformer.parse_source_file(
         "tests/fixtures/datacite/datacite_records.xml"
     )
-    transformer = XmlTransformer("cool-repo", datacite_records)
+    transformer = XMLTransformer("cool-repo", datacite_records)
     transformer.transform_and_write_output_files(output_file)
     assert len(list(tmp_path.iterdir())) == 1
     assert next(tmp_path.iterdir()).name == "output_file.json"
 
 
 def test_xmltransformer_parse_source_file_returns_record_iterator():
-    records = XmlTransformer.parse_source_file(
+    records = XMLTransformer.parse_source_file(
         "tests/fixtures/datacite/datacite_records.xml"
     )
     assert len(list(records)) == 38
 
 
 def test_xmltransformer_record_is_deleted_returns_true_if_deleted(caplog):
-    source_records = XmlTransformer.parse_source_file(
+    source_records = XMLTransformer.parse_source_file(
         "tests/fixtures/record_deleted.xml"
     )
-    assert XmlTransformer.record_is_deleted(next(source_records)) is True
+    assert XMLTransformer.record_is_deleted(next(source_records)) is True
 
 
 def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_records):
-    transformer = XmlTransformer("cool-repo", oai_pmh_records)
+    transformer = XMLTransformer("cool-repo", oai_pmh_records)
     assert transformer.get_required_fields(next(oai_pmh_records)) == {
         "source": "A Cool Repository",
         "source_link": "https://example.com/12345",
@@ -108,7 +108,7 @@ def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_reco
 
 
 def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records):
-    transformer = XmlTransformer("cool-repo", oai_pmh_records)
+    transformer = XMLTransformer("cool-repo", oai_pmh_records)
     assert next(transformer) == TimdexRecord(
         source="A Cool Repository",
         source_link="https://example.com/12345",
@@ -120,7 +120,7 @@ def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records):
 
 
 def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(caplog):
-    source_records = XmlTransformer.parse_source_file(
+    source_records = XMLTransformer.parse_source_file(
         "tests/fixtures/record_title_field_blank.xml"
     )
     output_records = Datacite("cool-repo", source_records)
@@ -132,7 +132,7 @@ def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(capl
 
 
 def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(caplog):
-    source_records = XmlTransformer.parse_source_file(
+    source_records = XMLTransformer.parse_source_file(
         "tests/fixtures/record_title_field_missing.xml"
     )
     output_records = Datacite("cool-repo", source_records)
@@ -144,7 +144,7 @@ def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(ca
 
 
 def test_xmltransformer_get_valid_title_with_title_field_multiple_logs_warning(caplog):
-    source_records = XmlTransformer.parse_source_file(
+    source_records = XMLTransformer.parse_source_file(
         "tests/fixtures/record_title_field_multiple.xml"
     )
     output_records = Datacite("cool-repo", source_records)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
		{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}