Skip to content

Commit

Permalink
Add run_id CLI argument and Transformer attribute
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

As we move into Transmogrifier writing to a parquet dataset, one
important bit of information it will need is the concept of a
"run id".  This correlates directly to an "Execution UUID" that
every StepFunction invocation produces.  This identifier is then
used when writing the records to the parquet dataset, allowing
for quick and easy access to records associated with that identifier.

There is a small many-to-one relationship that makes naming a bit
awkward: each StepFunction invocation may run Transmogrifier multiple
times (e.g. multiple input files).  Each time it invokes Transmogrifier,
the same "run_id" would be passed.  This effectively groups the outputs
of all Transmogrifier invocations in the same location in the parquet
dataset.  The language of this new "run_id" in Transmogrifier is
intentionally somewhat high level, indicating it's just an identifier
to associate with that invocation of the run.

How this addresses that need:
* Adds new CLI argument -r / --run-id
* Transformer gets new attribute 'run_id'
* Transformer mints a UUID of a run id is not passed, making
the change backwards compatible and inconsequential if a run
id is not passed

Side effects of this change:
* Going forward, invocations of Transmogrifier can use the run id
as part of the parquet record writing.  Until then, it has no effect.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-403
  • Loading branch information
ghukill committed Nov 19, 2024
1 parent 65dab54 commit 12c2c6b
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 5 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ Options:
-s, --source [alma|aspace|dspace|jpal|libguides|gismit|gisogm|researchdatabases|whoas|zenodo]
Source records were harvested from, must
choose from list of options [required]
-r, --run-id TEXT Identifier for Transmogrifier run. This can
be used to group transformed records
produced by Transmogrifier, even if they
span multiple CLI invocations. If a value
is not provided a UUID will be minted and
used.
-v, --verbose Pass to log at debug level instead of info
--help Show this message and exit.
```
29 changes: 29 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.jsontransformer import JSONTransformer
from transmogrifier.sources.transformer import Transformer
from transmogrifier.sources.xml.datacite import Datacite
from transmogrifier.sources.xmltransformer import XMLTransformer

Expand Down Expand Up @@ -43,6 +44,34 @@ def runner():
return CliRunner()


# transformers ##########################


@pytest.fixture
def generic_transformer():

class GenericTransformer(Transformer):
def parse_source_file(self):
pass

def record_is_deleted(self):
pass

def get_main_titles(self):
pass

def get_source_link(self):
pass

def get_source_record_id(self):
pass

def get_timdex_record_id(self):
pass

return GenericTransformer


# aardvark ##########################


Expand Down
14 changes: 14 additions & 0 deletions tests/sources/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ruff: noqa: PLR2004

import uuid

import pytest

import transmogrifier.models as timdex
Expand Down Expand Up @@ -75,3 +77,15 @@ def test_create_locations_from_spatial_subjects_success(timdex_record_required_f
timdex.Location(value="City 1", kind="Place Name"),
timdex.Location(value="City 2", kind="Place Name"),
]


def test_transformer_run_id_explicitly_passed(generic_transformer):
run_id = "abc123"
transformer = generic_transformer("alma", [], run_id=run_id)
assert transformer.run_id == run_id


def test_transformer_run_id_none_passed_generates_uuid(generic_transformer):
transformer = generic_transformer("alma", [], run_id=None)
assert transformer.run_id is not None
assert uuid.UUID(transformer.run_id)
50 changes: 50 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# ruff: noqa: S108

from unittest import mock

from transmogrifier.cli import main


Expand Down Expand Up @@ -90,3 +94,49 @@ def test_transform_deleted_records(caplog, runner, tmp_path):
"Completed transform, total records processed: 1, transformed records: 0"
", skipped records: 0, deleted records: 1"
) in caplog.text


def test_transform_run_id_argument_passed_and_used(caplog, runner, tmp_path):
caplog.set_level("INFO")
run_id = "abc123"
with mock.patch(
"transmogrifier.sources.transformer.Transformer.transform_and_write_output_files"
) as mocked_transform_and_write:
mocked_transform_and_write.side_effect = Exception("stopping transformation")
runner.invoke(
main,
[
"--verbose",
"-s",
"alma",
"-r",
run_id,
"-i",
"tests/fixtures/datacite/datacite_records.xml",
"-o",
"/tmp/records.json",
],
)
assert f"run_id set: '{run_id}'" in caplog.text


def test_transform_run_id_argument_not_passed_and_uuid_minted(caplog, runner, tmp_path):
caplog.set_level("INFO")
with mock.patch(
"transmogrifier.sources.transformer.Transformer.transform_and_write_output_files"
) as mocked_transform_and_write:
mocked_transform_and_write.side_effect = Exception("stopping transformation")
runner.invoke(
main,
[
"--verbose",
"-s",
"alma",
"-i",
"tests/fixtures/datacite/datacite_records.xml",
"-o",
"/tmp/records.json",
],
)
assert "explicit run_id not passed, minting new UUID" in caplog.text
assert "run_id set:" in caplog.text
16 changes: 14 additions & 2 deletions transmogrifier/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,31 @@
type=click.Choice(list(SOURCES.keys()), case_sensitive=False),
help="Source records were harvested from, must choose from list of options",
)
@click.option(
"-r",
"--run-id",
required=False,
help="Identifier for Transmogrifier run. This can be used to group transformed "
"records produced by Transmogrifier, even if they span multiple CLI invocations. "
"If a value is not provided a UUID will be minted and used.",
)
@click.option(
"-v", "--verbose", is_flag=True, help="Pass to log at debug level instead of info"
)
def main(
source: str, input_file: str, output_file: str, verbose: bool # noqa: FBT001
source: str,
input_file: str,
output_file: str,
run_id: str,
verbose: bool, # noqa: FBT001
) -> None:
start_time = perf_counter()
root_logger = logging.getLogger()
logger.info(configure_logger(root_logger, verbose))
logger.info(configure_sentry())
logger.info("Running transform for source %s", source)

transformer = Transformer.load(source, input_file)
transformer = Transformer.load(source, input_file, run_id=run_id)
transformer.transform_and_write_output_files(output_file)
logger.info(
(
Expand Down
24 changes: 21 additions & 3 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import logging
import os
import uuid
from abc import ABC, abstractmethod
from importlib import import_module
from typing import TYPE_CHECKING, final
Expand Down Expand Up @@ -32,14 +33,18 @@ class Transformer(ABC):

@final
def __init__(
self, source: str, source_records: Iterator[dict[str, JSON] | Tag]
self,
source: str,
source_records: Iterator[dict[str, JSON] | Tag],
run_id: str | None = None,
) -> None:
"""
Initialize Transformer instance.
Args:
source: Source repository label. Must match a source key from config.SOURCES.
source_records: A set of source records to be processed.
run_id: A unique identifier for this invocation of Transmogrifier.
"""
self.source: str = source
self.source_base_url: str = SOURCES[source]["base-url"]
Expand All @@ -49,6 +54,7 @@ def __init__(
self.transformed_record_count: int = 0
self.skipped_record_count: int = 0
self.deleted_records: list[str] = []
self.run_id = self.set_run_id(run_id)

@final
def __iter__(self) -> Iterator[timdex.TimdexRecord]:
Expand All @@ -72,6 +78,15 @@ def __next__(self) -> timdex.TimdexRecord:
self.transformed_record_count += 1
return record

def set_run_id(self, run_id: str | None) -> str:
"""Method to set run_id for Transmogrifier run."""
if not run_id:
logger.info("explicit run_id not passed, minting new UUID")
run_id = str(uuid.uuid4())
message = f"run_id set: '{run_id}'"
logger.info(message)
return run_id

@final
@classmethod
def get_transformer(cls, source: str) -> type[Transformer]:
Expand All @@ -90,17 +105,20 @@ def get_transformer(cls, source: str) -> type[Transformer]:

@final
@classmethod
def load(cls, source: str, source_file: str) -> Transformer:
def load(
cls, source: str, source_file: str, run_id: str | None = None
) -> Transformer:
"""
Instantiate specified transformer class and populate with source records.
Args:
source: Source repository label. Must match a source key from config.SOURCES.
source_file: A file containing source records to be transformed.
run_id: A unique identifier for this invocation of Transmogrifier.
"""
transformer_class = cls.get_transformer(source)
source_records = transformer_class.parse_source_file(source_file)
return transformer_class(source, source_records)
return transformer_class(source, source_records, run_id=run_id)

@final
def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord:
Expand Down

0 comments on commit 12c2c6b

Please sign in to comment.