Skip to content

Commit

Permalink
Updates based on discussion in PR #108
Browse files Browse the repository at this point in the history
* Update json_records fixture to aardvark_records for more accurate unit tests
* Rename Aardvark > MITAardvark to unify terminology across repos
* Update get_main_titles method to reflect it is a required field
* Update Aardvark method docstrings to provide greater context
* Add Transformer._transform method to minimize code duplication between JsonTransformer and XmlTransformer methods
  • Loading branch information
ehanson8 committed Dec 15, 2023
1 parent 8edb2cf commit 71e9ec1
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 73 deletions.
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def datacite_record_all_fields():


@pytest.fixture()
def json_records():
return JsonTransformer.parse_source_file("tests/fixtures/json_records.jsonl")
def aardvark_records():
return JsonTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")


@pytest.fixture()
Expand Down
2 changes: 2 additions & 0 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
2 changes: 0 additions & 2 deletions tests/fixtures/json_records.jsonl

This file was deleted.

24 changes: 12 additions & 12 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,39 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import Aardvark
from transmogrifier.sources.json.aardvark import MITAardvark


def test_aardvark_get_required_fields_returns_expected_values(json_records):
transformer = Aardvark("cool-repo", json_records)
assert transformer.get_required_fields(next(json_records)) == {
def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/123",
"timdex_record_id": "cool-repo:123",
"title": "Title not provided",
"title": "Test title 1",
}


def test_jsontransformer_transform_returns_timdex_record(json_records):
transformer = Aardvark("cool-repo", json_records)
def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
source_link="https://example.com/123",
timdex_record_id="cool-repo:123",
title="Title not provided",
citation="Title not provided. Geospatial data. https://example.com/123",
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/123",
content_type=["Geospatial data"],
)


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
assert Aardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert Aardvark.get_source_record_id(aardvark_record_all_fields) == "123"
assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert Aardvark.get_subjects(aardvark_record_all_fields) == [
assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
Expand Down
31 changes: 21 additions & 10 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,26 @@
logger = logging.getLogger(__name__)


class Aardvark(JsonTransformer):
"""Aardvark transformer."""
class MITAardvark(JsonTransformer):
"""MITAardvark transformer.
MIT Aardvark records have more required fields than standard Aardvark records
as detailed here in the geo-harvester's MITAardvark class:
https://github.com/MITLibraries/geo-harvester/blob/main/harvester/records/record.py
"""

@classmethod
def get_main_titles(cls, source_record: dict) -> list[str]:
"""
Retrieve main title(s) from an Aardvark JSON record.
Retrieve main title(s) from an MITAardvark JSON record.
Overrides metaclass get_main_titles() method.
Args:
source_record: A JSON object representing a source record.
"""
titles = []
if title := "dct_title_s" in source_record and source_record["dct_title_s"]:
titles.append(title)
return titles
return [source_record["dct_title_s"]]

@classmethod
def get_source_record_id(cls, source_record: dict) -> str:
Expand All @@ -39,7 +42,7 @@ def record_is_deleted(cls, source_record: dict) -> bool:
"""
Determine whether record has a status of deleted.
## WIP - defining to enable instantiation of Aardvark instance.
## WIP - defining to enable instantiation of MITAardvark instance.
Args:
source_record: A JSON object representing a source record.
Expand All @@ -58,7 +61,7 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
"""
fields: dict = {}

# alternate_titles field not used in Aardvark
# alternate_titles

# content_type
fields["content_type"] = ["Geospatial data"]
Expand All @@ -76,7 +79,7 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
# identifiers

# languages
fields["languages"] = source_record.get("dct_langauge_sm")
fields["languages"] = source_record.get("dct_language_sm")

# links

Expand All @@ -100,6 +103,14 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
def get_subjects(source_record: dict) -> list[timdex.Subject]:
"""Get values from source record for TIMDEX subjects field.
Unlike other TIMDEX sources, the subject scheme is not known
for each term. The kind here represents the uncontrolled field
in which the term was found.
DCAT Keyword: https://www.w3.org/TR/vocab-dcat-2/#Property:resource_keyword
DCAT Theme: https://www.w3.org/TR/vocab-dcat-2/#Property:resource_theme
Dublin Core Subject: http://purl.org/dc/terms/subject
Args:
source_record: A JSON object representing a source record.
"""
Expand Down
89 changes: 42 additions & 47 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,10 +224,45 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON] | Tag]:
"""
pass

@final
def _transform(
    self, source_record: dict[str, JSON] | Tag
) -> Optional[TimdexRecord]:
    """
    Build a TIMDEX record from a single source record.

    Shared implementation that the format-specific transform() methods
    delegate to.

    May not be overridden.

    Args:
        source_record: A single source record.

    Returns:
        The resulting TIMDEX record, or None when get_optional_fields()
        returns None for the source record.

    Raises:
        DeletedRecord: When record_is_deleted() is true for the source
            record; raised with the record's TIMDEX record ID.
    """
    # Deleted records never become TIMDEX records; signal them by ID.
    if self.record_is_deleted(source_record):
        record_id = self.get_source_record_id(source_record)
        raise DeletedRecord(
            self.get_timdex_record_id(self.source, record_id, source_record)
        )

    # Guard clause: without optional fields there is no record to build.
    optional = self.get_optional_fields(source_record)
    if optional is None:
        return None

    # Optional fields are unpacked last so they win on any key collision.
    merged = {**self.get_required_fields(source_record), **optional}

    # If citation field was not present, generate citation from other fields
    if merged.get("citation") is None:
        merged["citation"] = generate_citation(merged)
    # Fall back to a placeholder when no content type was supplied.
    if merged.get("content_type") is None:
        merged["content_type"] = ["Not specified"]

    return TimdexRecord(**merged)

@abstractmethod
def transform(self, source_record: dict[str, JSON] | Tag) -> Optional[TimdexRecord]:
"""
Transform a source record into a TIMDEX record.
Call Transformer._transform method to transform source record to TIMDEX record.
Must be overridden by format subclasses.
Expand Down Expand Up @@ -349,6 +384,8 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
May not be overridden.
Validates that records in the file are dicts for proper processing.
Args:
source_file: A file containing source records to be transformed.
"""
Expand All @@ -359,35 +396,14 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
@final
def transform(self, source_record: dict[str, JSON]) -> Optional[TimdexRecord]:
"""
Transform a JSON record into a TIMDEX record.
Call Transformer._transform method to transform JSON record to TIMDEX record.
May not be overridden.
Args:
source_record: A JSON object representing a source record.
"""
if self.record_is_deleted(source_record):
source_record_id = self.get_source_record_id(source_record)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
raise DeletedRecord(timdex_record_id)
optional_fields = self.get_optional_fields(source_record)
if optional_fields is None:
return None
else:
fields = {
**self.get_required_fields(source_record),
**optional_fields,
}

# If citation field was not present, generate citation from other fields
if fields.get("citation") is None:
fields["citation"] = generate_citation(fields)
if fields.get("content_type") is None:
fields["content_type"] = ["Not specified"]

return TimdexRecord(**fields)
return self._transform(source_record)

@final
def get_required_fields(self, source_record: dict[str, JSON]) -> dict:
Expand Down Expand Up @@ -537,35 +553,14 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
@final
def transform(self, source_record: Tag) -> Optional[TimdexRecord]:
"""
Transform an XML record into a TIMDEX record.
Call Transformer._transform method to transform XML record to TIMDEX record.
May not be overridden.
Args:
source_record: A BeautifulSoup Tag representing a single XML record.
"""
if self.record_is_deleted(source_record):
source_record_id = self.get_source_record_id(source_record)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
raise DeletedRecord(timdex_record_id)
optional_fields = self.get_optional_fields(source_record)
if optional_fields is None:
return None
else:
fields = {
**self.get_required_fields(source_record),
**optional_fields,
}

# If citation field was not present, generate citation from other fields
if fields.get("citation") is None:
fields["citation"] = generate_citation(fields)
if fields.get("content_type") is None:
fields["content_type"] = ["Not specified"]

return TimdexRecord(**fields)
return self._transform(source_record)

@final
def get_required_fields(self, source_record: Tag) -> dict:
Expand Down

0 comments on commit 71e9ec1

Please sign in to comment.