Skip to content

Commit

Permalink
Additional MITAardvark methods (#109)
Browse files Browse the repository at this point in the history
* Additional MITAardvark methods

Why these changes are being introduced:
* Additional methods are needed for the MITAardvark class

How this addresses that need:
* Update get_optional_fields method to include format and summary values and add corresponding unit test
* Add get_alternate_titles, get_contributors, get_notes, get_publication_information, and get_rights field methods along with calls in get_optional_fields and corresponding unit tests
* Update aardvark_record_all_fields fixture to include new fields

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-54

* Updates based on discussion in PR #109

* Update dct_license_sm field in aardvark fixture to properly reflect expected value
* Refactor get_alternate_titles, get_contributors, and get_notes methods for more efficient processing
* Update get_rights method to properly process expected value of dct_license_sm

* Update class name capitalization
  • Loading branch information
ehanson8 authored Dec 20, 2023
1 parent 1aba987 commit 97b8bc1
Show file tree
Hide file tree
Showing 12 changed files with 204 additions and 52 deletions.
16 changes: 7 additions & 9 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
from transmogrifier.sources.transformer import JSONTransformer, XMLTransformer
from transmogrifier.sources.xml.datacite import Datacite


Expand Down Expand Up @@ -48,31 +48,29 @@ def runner():

@pytest.fixture
def aardvark_record_all_fields():
return next(
JsonTransformer.parse_source_file(
"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)
return JSONTransformer.parse_source_file(
"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)


@pytest.fixture()
def datacite_records():
return XmlTransformer.parse_source_file(
return XMLTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_records.xml"
)


@pytest.fixture()
def datacite_record_all_fields():
source_records = XmlTransformer.parse_source_file(
source_records = XMLTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_record_all_fields.xml"
)
return Datacite("cool-repo", source_records)


@pytest.fixture()
def aardvark_records():
return JsonTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")
return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")


@pytest.fixture()
Expand All @@ -87,7 +85,7 @@ def marc_content_type_crosswalk():

@pytest.fixture()
def oai_pmh_records():
return XmlTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")
return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")


@pytest.fixture()
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}
84 changes: 80 additions & 4 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
}


def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
Expand All @@ -24,16 +24,92 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
)


def test_aardvark_get_optional_fields_non_field_method_values_success(
    aardvark_record_all_fields,
):
    """Check format, languages, and summary on the transformed all-fields record.

    The fixture's source records are run through MITAardvark and the first
    transformed record is inspected.
    """
    timdex_record = next(MITAardvark("cool-repo", aardvark_record_all_fields))
    assert timdex_record.format == "Shapefile"
    assert timdex_record.languages == ["eng"]
    assert timdex_record.summary == ["A description"]


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [
"Test title 1"
]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"
assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"


def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields):
    """Confirm get_alternate_titles returns the single expected AlternateTitle.

    The source record is the first one yielded by the all-fields fixture.
    """
    source_record = next(aardvark_record_all_fields)
    expected_titles = [timdex.AlternateTitle(value="Alternate title")]
    assert MITAardvark.get_alternate_titles(source_record) == expected_titles


def test_aardvark_get_contributors_success(aardvark_record_all_fields):
    """Confirm get_contributors returns both names as "Creator" contributors.

    The expected list preserves the order "Smith, Jane" then "Smith, John".
    """
    creator_names = ("Smith, Jane", "Smith, John")
    expected_contributors = [
        timdex.Contributor(value=name, kind="Creator") for name in creator_names
    ]
    source_record = next(aardvark_record_all_fields)
    assert MITAardvark.get_contributors(source_record) == expected_contributors


def test_aardvark_get_notes_success(aardvark_record_all_fields):
    """Confirm get_notes returns one "Display note" Note per expected string.

    Each expected Note wraps a single string in a one-item ``value`` list,
    in the order listed below.
    """
    display_note_texts = (
        "Danger: This text will be displayed in a red box",
        "Info: This text will be displayed in a blue box",
        "Tip: This text will be displayed in a green box",
        "Warning: This text will be displayed in a yellow box",
        "This is text without a tag and it will be assigned default 'note' style",
    )
    expected_notes = [
        timdex.Note(value=[text], kind="Display note") for text in display_note_texts
    ]
    source_record = next(aardvark_record_all_fields)
    assert MITAardvark.get_notes(source_record) == expected_notes


def test_aardvark_get_publication_information_success(aardvark_record_all_fields):
    """Confirm get_publication_information returns the two expected strings.

    The source record is the first one yielded by the all-fields fixture.
    """
    source_record = next(aardvark_record_all_fields)
    expected_publication_info = ["ML InfoMap (Firm)", "MIT"]
    actual = MITAardvark.get_publication_information(source_record)
    assert actual == expected_publication_info


def test_aardvark_get_rights_success(aardvark_record_all_fields):
    """Confirm get_rights returns the expected Rights objects in order.

    Expected order: the access note (kind "Access"), one Rights per license
    URI, the rights statement, then a single Rights whose description joins
    both rights-holder strings with ". ".
    """
    access_right = timdex.Rights(description="Access note", kind="Access")
    license_rights = [
        timdex.Rights(uri=license_uri)
        for license_uri in (
            "http://license.license",
            "http://another_license.another_license",
        )
    ]
    rights_statement = timdex.Rights(description="Some person has the rights")
    rights_holders = timdex.Rights(
        description="The person with the rights. Another person with the rights"
    )
    expected_rights = [
        access_right,
        *license_rights,
        rights_statement,
        rights_holders,
    ]
    source_record = next(aardvark_record_all_fields)
    assert MITAardvark.get_rights(source_record) == expected_rights


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
Expand Down
32 changes: 16 additions & 16 deletions tests/sources/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from transmogrifier.models import TimdexRecord
from transmogrifier.sources.transformer import Transformer, XmlTransformer
from transmogrifier.sources.transformer import Transformer, XMLTransformer
from transmogrifier.sources.xml.datacite import Datacite


Expand All @@ -28,15 +28,15 @@ def test_transformer_get_transformer_source_wrong_module_path_raises_error(bad_c


def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records):
transformer = XmlTransformer("cool-repo", oai_pmh_records)
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert transformer.source == "cool-repo"
assert transformer.source_base_url == "https://example.com/"
assert transformer.source_name == "A Cool Repository"
assert transformer.source_records == oai_pmh_records


def test_xmltransformer_iterates_through_all_records(oai_pmh_records):
output_records = XmlTransformer("cool-repo", oai_pmh_records)
output_records = XMLTransformer("cool-repo", oai_pmh_records)
assert len(list(output_records)) == 2
assert output_records.processed_record_count == 3
assert output_records.transformed_record_count == 2
Expand All @@ -47,10 +47,10 @@ def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_non
oai_pmh_records,
):
with patch(
"transmogrifier.sources.transformer.XmlTransformer.get_optional_fields"
"transmogrifier.sources.transformer.XMLTransformer.get_optional_fields"
) as m:
m.return_value = None
output_records = XmlTransformer("cool-repo", oai_pmh_records)
output_records = XMLTransformer("cool-repo", oai_pmh_records)
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 3
assert output_records.skipped_record_count == 2
Expand All @@ -62,7 +62,7 @@ def test_xmltransformer_transform_and_write_output_files_writes_output_files(
tmp_path, oai_pmh_records
):
output_file = str(tmp_path / "output_file.json")
transformer = XmlTransformer("cool-repo", oai_pmh_records)
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
Expand All @@ -74,31 +74,31 @@ def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_need
tmp_path,
):
output_file = str(tmp_path / "output_file.json")
datacite_records = XmlTransformer.parse_source_file(
datacite_records = XMLTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_records.xml"
)
transformer = XmlTransformer("cool-repo", datacite_records)
transformer = XMLTransformer("cool-repo", datacite_records)
transformer.transform_and_write_output_files(output_file)
assert len(list(tmp_path.iterdir())) == 1
assert next(tmp_path.iterdir()).name == "output_file.json"


def test_xmltransformer_parse_source_file_returns_record_iterator():
records = XmlTransformer.parse_source_file(
records = XMLTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_records.xml"
)
assert len(list(records)) == 38


def test_xmltransformer_record_is_deleted_returns_true_if_deleted(caplog):
source_records = XmlTransformer.parse_source_file(
source_records = XMLTransformer.parse_source_file(
"tests/fixtures/record_deleted.xml"
)
assert XmlTransformer.record_is_deleted(next(source_records)) is True
assert XMLTransformer.record_is_deleted(next(source_records)) is True


def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_records):
transformer = XmlTransformer("cool-repo", oai_pmh_records)
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert transformer.get_required_fields(next(oai_pmh_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/12345",
Expand All @@ -108,7 +108,7 @@ def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_reco


def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records):
transformer = XmlTransformer("cool-repo", oai_pmh_records)
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert next(transformer) == TimdexRecord(
source="A Cool Repository",
source_link="https://example.com/12345",
Expand All @@ -120,7 +120,7 @@ def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records):


def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(caplog):
source_records = XmlTransformer.parse_source_file(
source_records = XMLTransformer.parse_source_file(
"tests/fixtures/record_title_field_blank.xml"
)
output_records = Datacite("cool-repo", source_records)
Expand All @@ -132,7 +132,7 @@ def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(capl


def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(caplog):
source_records = XmlTransformer.parse_source_file(
source_records = XMLTransformer.parse_source_file(
"tests/fixtures/record_title_field_missing.xml"
)
output_records = Datacite("cool-repo", source_records)
Expand All @@ -144,7 +144,7 @@ def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(ca


def test_xmltransformer_get_valid_title_with_title_field_multiple_logs_warning(caplog):
source_records = XmlTransformer.parse_source_file(
source_records = XMLTransformer.parse_source_file(
"tests/fixtures/record_title_field_multiple.xml"
)
output_records = Datacite("cool-repo", source_records)
Expand Down
Loading

0 comments on commit 97b8bc1

Please sign in to comment.