From 2f7a3d61ff2f21b072f7b1d00b55000ba9bd0900 Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Thu, 18 Jul 2024 13:46:17 +0200
Subject: [PATCH] Update dtype converter and add tests.

Move ``convert_to_type`` from ``background_ingestor.py`` into
``scicat_dataset.py`` as a lookup table of per-dtype converters, add
unit tests for it, and tidy formatting in ``background_ingestor.py``.

---
Review notes (not part of the commit message):
- convert_to_type: the ValueError message is now a single f-string, so
  the offending dtype description is actually interpolated.
- create_scicat_dataset: ``dataset`` is annotated as ``dict``, and main()
  passes the ``logger`` argument that the new signature requires
  (assumes ``logger`` is already bound in main() - please confirm).
- Line breaks and indentation were rebuilt from a whitespace-collapsed
  copy of this patch. Hunk line counts are unchanged, but indentation of
  context/removed lines is inferred and the post-image hashes on the
  ``index`` lines no longer match the edited content.

 src/background_ingestor.py   | 121 ++++++++---------------------------
 src/scicat_dataset.py        |  48 ++++++++++++++
 tests/test_scicat_dataset.py |  43 +++++++++++++
 3 files changed, 116 insertions(+), 96 deletions(-)
 create mode 100644 tests/test_scicat_dataset.py

diff --git a/src/background_ingestor.py b/src/background_ingestor.py
index f41a938..6c08582 100644
--- a/src/background_ingestor.py
+++ b/src/background_ingestor.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
 # import scippnexus as snx
-import datetime
 import json
+import logging
 import pathlib
 from urllib.parse import urljoin
 
@@ -13,6 +13,7 @@
     build_background_ingestor_arg_parser,
     build_scicat_background_ingester_config,
 )
+from scicat_dataset import convert_to_type
 from scicat_logging import build_logger
 from scicat_metadata import collect_schemas, select_applicable_schema
 from system_helpers import exit_at_exceptions
@@ -23,24 +24,6 @@ def replace_variables_values(url: str, values: dict) -> str:
         url = url.replace("{" + key + "}", str(value))
     return url
 
-def convert_to_type(input_value, value_type: str) :
-    output_value = None
-    if value_type == "string":
-        output_value = str(input_value)
-    elif value_type == "string[]":
-        output_value = [str(v) for v in input_value]
-    elif value_type == "integer":
-        output_value = int(input_value)
-    elif value_type == "float":
-        output_value = float(input_value)
-    elif value_type == "date" and isinstance(input_value, int):
-        output_value = datetime.datetime.fromtimestamp(input_value, tz=datetime.UTC).isoformat()
-    elif value_type == "date" and isinstance(input_value, str):
-        output_value = datetime.datetime.fromisoformat(input_value).isoformat()
-    else
-        raise Exception("Invalid value type")
-    return output_value
-
 
 def extract_variables_values(
     variables: dict, h5file, config: BackgroundIngestorConfig
@@ -82,90 +65,40 @@ def extract_variables_values(
         else:
             raise Exception("Invalid variable source configuration")
 
-        values[variable] = convert_to_type(value,variables[variable]["value_type"])
+        values[variable] = convert_to_type(value, variables[variable]["value_type"])
 
     return values
 
 
 def prepare_scicat_dataset(metadata_schema, values):
-    """
-    Prepare scicat dataset as dictionary ready to be sent over to scicat as a POST request
-
-    This is an example:
-    {
-        "pid": "20.500.12269/e3690b21-ee8c-40d6-9409-6b6fdca776d2",
-        "datasetName": "this is a dataset",
-        "description": "this is the description of the dataset",
-        "principalInvestigator": "Massimiliano Novelli",
-        "creationLocation": "ESS:CODA",
-        "scientificMetadata": {
-            "run_number": {
-                "value": 18856,
-                "unit": "",
-                "human_name": "Run Number",
-                "type": "integer"
-            },
-            "sample_temperature": {
-                "value": 20.4,
-                "unit": "C",
-                "human_name": "Sample Temperature",
-                "type": "quantity"
-            },
-            "start_time" : {
-                "value" : "2024-07-16T09:30:12.987Z",
-                "unit" : "",
-                "human_name" : "Start Time",
-                "type" : "date"
-            }
-        },
-        "owner": "Massimiliano Novelli",
-        "ownerEmail": "max.novelli@ess.eu",
-        "sourceFolder": "/ess/data/coda/2024/616254",
-        "contactEmail": "max.novelli@ess.eu",
-        "creationTime": "2024-07-16T10:00:00.000Z",
-        "type": "raw",
-        "techniques": [
-            {
-                "pid": "http://purl.org/pan-science/PaNET/PaNET01155",
-                "names": "absorption and phase contrast nanotomography"
-            }
-        ],
-        "instrumentId": "20.500.12269/765b3dc3-f658-410e-b371-04dd1adcd520",
-        "sampleId": "bd31725a-dbfd-4c32-87db-1c1ebe61e5ca",
-        "proposalId": "616254",
-        "ownerGroup": "ess_proposal_616254",
-        "accessGroups": [
-            "scientific information management systems group"
-        ]
-}
-    """
-    schema = metadata_schema["schema"]
+    """Prepare scicat dataset as dictionary ready to be ``POST``ed."""
+    schema: dict = metadata_schema["schema"]
     dataset = {}
     scientific_metadata = {
-        'ingestor_metadata_schema_id' : {
+        'ingestor_metadata_schema_id': {
             "value": metadata_schema["id"],
             "unit": "",
             "human_name": "Ingestor Metadata Schema Id",
-            "type": "string"
+            "type": "string",
         }
     }
-    for key, field in schema.items():
+    for field in schema.values():
         machine_name = field["machine_name"]
         field_type = field["type"]
         if field["field_type"] == "high_level":
             dataset[machine_name] = convert_to_type(
-                replace_variables_values(field["value"],values),
-                field_type
+                replace_variables_values(field["value"], values), field_type
             )
         elif field["field_type"] == "scientific_metadata":
             scientific_metadata[machine_name] = {
-                "value" : convert_to_type(
-                    replace_variables_values(field["value"],values),
-                    field_type
+                "value": convert_to_type(
+                    replace_variables_values(field["value"], values), field_type
                 ),
-                "unit" : "",
-                "human_name" : field["human_name"] if "human_name" is in field.keys() and field["human_name"] else machine_name,
-                "type" : field_type
+                "unit": "",
+                "human_name": field["human_name"]
+                if field.get("human_name", None)
+                else machine_name,
+                "type": field_type,
             }
         else:
             raise Exception("Metadata schema field type invalid")
@@ -175,7 +108,7 @@ def prepare_scicat_dataset(metadata_schema, values):
     return dataset
 
 
-def create_scicat_dataset(dataset,config):
+def create_scicat_dataset(dataset: dict, config: dict, logger: logging.Logger) -> dict:
     """
     Execute a POST request to scicat to create a dataset
     """
@@ -191,19 +124,17 @@ def create_scicat_dataset(dataset,config):
 
     result = response.json()
     if response.ok:
-
+        ...
     else:
         err = result.get("error", {})
         raise Exception(f"Error creating new dataset: {err}")
 
-    logger.info(
-        "Dataset create successfully. Dataset pid: %s",
-        result['pid']
-    )
+    logger.info("Dataset create successfully. Dataset pid: %s", result['pid'])
     return result
 
-def prepare_files_list(nexus_file,done_writing_message_file,config): ...
-def prepare_scicat_origdatablock(files_list,config): ...
+
+def prepare_files_list(nexus_file, done_writing_message_file, config): ...
+def prepare_scicat_origdatablock(files_list, config): ...
 def create_scicat_origdatablock(
     scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
 ): ...
@@ -254,15 +185,13 @@ def main() -> None:
         )
 
         # create files list with b2blake hash of all the files
-        files_list = prepare_files_list(nexus_file_path,done_writing_message_file,config)
+        _ = prepare_files_list(nexus_file_path, done_writing_message_file, config)
 
         # create and populate scicat dataset entry
-        scicat_dataset = prepare_scicat_dataset(
-            metadata_schema, variables_values
-        )
+        scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values)
 
         # create dataset in scicat
-        scicat_dataset = create_scicat_dataset(scicat_dataset,config)
+        scicat_dataset = create_scicat_dataset(scicat_dataset, config, logger)
         scicat_dataset_pid = scicat_dataset["pid"]
 
         # create and populate scicat origdatablock entry
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 1e9a63f..7dab49d 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -1,9 +1,57 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
+import datetime
+from types import MappingProxyType
+from typing import Any
+
 from jinja2 import Template
 from scicat_path_helpers import get_dataset_schema_template_path
 
 
+def to_string(value: Any) -> str:
+    return str(value)
+
+
+def to_string_array(value: list[Any]) -> list[str]:
+    return [str(v) for v in value]
+
+
+def to_integer(value: Any) -> int:
+    return int(value)
+
+
+def to_float(value: Any) -> float:
+    return float(value)
+
+
+def to_date(value: Any) -> str | None:
+    if isinstance(value, str):
+        return datetime.datetime.fromisoformat(value).isoformat()
+    elif isinstance(value, int | float):
+        return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
+    return None
+
+
+_DtypeConvertingMap = MappingProxyType(
+    {
+        "string": to_string,
+        "string[]": to_string_array,
+        "integer": to_integer,
+        "float": to_float,
+        "date": to_date,
+    }
+)
+
+
+def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
+    if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
+        raise ValueError(
+            "Invalid dtype description. Must be one of: "
+            f"string, string[], integer, float, date.\nGot: {dtype_desc}"
+        )
+    return converter(input_value)
+
+
 def build_dataset_schema(
     *,
     nxs_dataset_pid: str,
diff --git a/tests/test_scicat_dataset.py b/tests/test_scicat_dataset.py
new file mode 100644
index 0000000..e53b492
--- /dev/null
+++ b/tests/test_scicat_dataset.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
+import pytest
+from scicat_dataset import convert_to_type
+
+
+def test_dtype_string_converter() -> None:
+    assert convert_to_type("test", "string") == "test"
+    assert convert_to_type(123, "string") == "123"
+    assert convert_to_type(123.456, "string") == "123.456"
+
+
+def test_dtype_string_array_converter() -> None:
+    assert convert_to_type("test", "string[]") == ["t", "e", "s", "t"]
+    assert convert_to_type([1, 2, 3], "string[]") == ["1", "2", "3"]
+    assert convert_to_type([1.1, 2.2, 3.3], "string[]") == ["1.1", "2.2", "3.3"]
+
+
+def test_dtype_integer_converter() -> None:
+    assert convert_to_type("123", "integer") == 123
+    assert convert_to_type(123, "integer") == 123
+    assert convert_to_type(123.456, "integer") == 123
+
+
+def test_dtype_float_converter() -> None:
+    assert convert_to_type("123.456", "float") == 123.456
+    assert convert_to_type(123, "float") == 123.0
+    assert convert_to_type(123.456, "float") == 123.456
+
+
+def test_dtype_date_converter() -> None:
+    import datetime
+
+    test_datetime_isoformat = "1994-06-28T10:20:30+00:00"
+    test_datetime = datetime.datetime.fromisoformat(test_datetime_isoformat)
+    assert convert_to_type("1994-06-28T10:20:30Z", "date") == test_datetime_isoformat
+    assert convert_to_type(test_datetime.timestamp(), "date") == test_datetime_isoformat
+    assert convert_to_type(object(), "date") is None
+
+
+def test_dtype_converter_invalid_dtype_raises() -> None:
+    with pytest.raises(ValueError, match="Invalid dtype description."):
+        convert_to_type("test", "invalid_type")