Skip to content

Commit

Permalink
Merge pull request #210 from MITLibraries/TIMX-403-inputs-support-par…
Browse files Browse the repository at this point in the history
…quet-writing

TIMX 403 - Add "run_id" CLI argument and Transformer attribute
  • Loading branch information
ghukill authored Nov 20, 2024
2 parents 65dab54 + 12c2c6b commit 68bef31
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 5 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ Options:
-s, --source [alma|aspace|dspace|jpal|libguides|gismit|gisogm|researchdatabases|whoas|zenodo]
Source records were harvested from, must
choose from list of options [required]
-r, --run-id TEXT Identifier for Transmogrifier run. This can
be used to group transformed records
produced by Transmogrifier, even if they
span multiple CLI invocations. If a value
is not provided a UUID will be minted and
used.
-v, --verbose Pass to log at debug level instead of info
--help Show this message and exit.
```
29 changes: 29 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.jsontransformer import JSONTransformer
from transmogrifier.sources.transformer import Transformer
from transmogrifier.sources.xml.datacite import Datacite
from transmogrifier.sources.xmltransformer import XMLTransformer

Expand Down Expand Up @@ -43,6 +44,34 @@ def runner():
return CliRunner()


# transformers ##########################


@pytest.fixture
def generic_transformer():

class GenericTransformer(Transformer):
def parse_source_file(self):
pass

def record_is_deleted(self):
pass

def get_main_titles(self):
pass

def get_source_link(self):
pass

def get_source_record_id(self):
pass

def get_timdex_record_id(self):
pass

return GenericTransformer


# aardvark ##########################


Expand Down
14 changes: 14 additions & 0 deletions tests/sources/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ruff: noqa: PLR2004

import uuid

import pytest

import transmogrifier.models as timdex
Expand Down Expand Up @@ -75,3 +77,15 @@ def test_create_locations_from_spatial_subjects_success(timdex_record_required_f
timdex.Location(value="City 1", kind="Place Name"),
timdex.Location(value="City 2", kind="Place Name"),
]


def test_transformer_run_id_explicitly_passed(generic_transformer):
run_id = "abc123"
transformer = generic_transformer("alma", [], run_id=run_id)
assert transformer.run_id == run_id


def test_transformer_run_id_none_passed_generates_uuid(generic_transformer):
transformer = generic_transformer("alma", [], run_id=None)
assert transformer.run_id is not None
assert uuid.UUID(transformer.run_id)
50 changes: 50 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# ruff: noqa: S108

from unittest import mock

from transmogrifier.cli import main


Expand Down Expand Up @@ -90,3 +94,49 @@ def test_transform_deleted_records(caplog, runner, tmp_path):
"Completed transform, total records processed: 1, transformed records: 0"
", skipped records: 0, deleted records: 1"
) in caplog.text


def test_transform_run_id_argument_passed_and_used(caplog, runner, tmp_path):
caplog.set_level("INFO")
run_id = "abc123"
with mock.patch(
"transmogrifier.sources.transformer.Transformer.transform_and_write_output_files"
) as mocked_transform_and_write:
mocked_transform_and_write.side_effect = Exception("stopping transformation")
runner.invoke(
main,
[
"--verbose",
"-s",
"alma",
"-r",
run_id,
"-i",
"tests/fixtures/datacite/datacite_records.xml",
"-o",
"/tmp/records.json",
],
)
assert f"run_id set: '{run_id}'" in caplog.text


def test_transform_run_id_argument_not_passed_and_uuid_minted(caplog, runner, tmp_path):
caplog.set_level("INFO")
with mock.patch(
"transmogrifier.sources.transformer.Transformer.transform_and_write_output_files"
) as mocked_transform_and_write:
mocked_transform_and_write.side_effect = Exception("stopping transformation")
runner.invoke(
main,
[
"--verbose",
"-s",
"alma",
"-i",
"tests/fixtures/datacite/datacite_records.xml",
"-o",
"/tmp/records.json",
],
)
assert "explicit run_id not passed, minting new UUID" in caplog.text
assert "run_id set:" in caplog.text
16 changes: 14 additions & 2 deletions transmogrifier/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,31 @@
type=click.Choice(list(SOURCES.keys()), case_sensitive=False),
help="Source records were harvested from, must choose from list of options",
)
@click.option(
"-r",
"--run-id",
required=False,
help="Identifier for Transmogrifier run. This can be used to group transformed "
"records produced by Transmogrifier, even if they span multiple CLI invocations. "
"If a value is not provided a UUID will be minted and used.",
)
@click.option(
"-v", "--verbose", is_flag=True, help="Pass to log at debug level instead of info"
)
def main(
source: str, input_file: str, output_file: str, verbose: bool # noqa: FBT001
source: str,
input_file: str,
output_file: str,
run_id: str,
verbose: bool, # noqa: FBT001
) -> None:
start_time = perf_counter()
root_logger = logging.getLogger()
logger.info(configure_logger(root_logger, verbose))
logger.info(configure_sentry())
logger.info("Running transform for source %s", source)

transformer = Transformer.load(source, input_file)
transformer = Transformer.load(source, input_file, run_id=run_id)
transformer.transform_and_write_output_files(output_file)
logger.info(
(
Expand Down
24 changes: 21 additions & 3 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import logging
import os
import uuid
from abc import ABC, abstractmethod
from importlib import import_module
from typing import TYPE_CHECKING, final
Expand Down Expand Up @@ -32,14 +33,18 @@ class Transformer(ABC):

@final
def __init__(
self, source: str, source_records: Iterator[dict[str, JSON] | Tag]
self,
source: str,
source_records: Iterator[dict[str, JSON] | Tag],
run_id: str | None = None,
) -> None:
"""
Initialize Transformer instance.
Args:
source: Source repository label. Must match a source key from config.SOURCES.
source_records: A set of source records to be processed.
run_id: A unique identifier for this invocation of Transmogrifier.
"""
self.source: str = source
self.source_base_url: str = SOURCES[source]["base-url"]
Expand All @@ -49,6 +54,7 @@ def __init__(
self.transformed_record_count: int = 0
self.skipped_record_count: int = 0
self.deleted_records: list[str] = []
self.run_id = self.set_run_id(run_id)

@final
def __iter__(self) -> Iterator[timdex.TimdexRecord]:
Expand All @@ -72,6 +78,15 @@ def __next__(self) -> timdex.TimdexRecord:
self.transformed_record_count += 1
return record

def set_run_id(self, run_id: str | None) -> str:
"""Method to set run_id for Transmogrifier run."""
if not run_id:
logger.info("explicit run_id not passed, minting new UUID")
run_id = str(uuid.uuid4())
message = f"run_id set: '{run_id}'"
logger.info(message)
return run_id

@final
@classmethod
def get_transformer(cls, source: str) -> type[Transformer]:
Expand All @@ -90,17 +105,20 @@ def get_transformer(cls, source: str) -> type[Transformer]:

@final
@classmethod
def load(cls, source: str, source_file: str) -> Transformer:
def load(
cls, source: str, source_file: str, run_id: str | None = None
) -> Transformer:
"""
Instantiate specified transformer class and populate with source records.
Args:
source: Source repository label. Must match a source key from config.SOURCES.
source_file: A file containing source records to be transformed.
run_id: A unique identifier for this invocation of Transmogrifier.
"""
transformer_class = cls.get_transformer(source)
source_records = transformer_class.parse_source_file(source_file)
return transformer_class(source, source_records)
return transformer_class(source, source_records, run_id=run_id)

@final
def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord:
Expand Down

0 comments on commit 68bef31

Please sign in to comment.