Update dtype converter and add tests.
YooSunYoung committed Jul 18, 2024
1 parent 9cf66f6 commit 2f7a3d6
Showing 3 changed files with 116 additions and 96 deletions.
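
In short, the if/elif chain that used to live in src/background_ingestor.py is replaced by a table of per-dtype converter functions in src/scicat_dataset.py, and the new convert_to_type gains unit tests. A minimal sketch of that dispatch pattern, using the same names that appear in the diff below (only a subset of the descriptors is shown):

from types import MappingProxyType
from typing import Any

# One small converter per supported dtype descriptor.
_DtypeConvertingMap = MappingProxyType(
    {
        "string": str,
        "integer": int,
        "float": float,
    }
)


def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
    # Unknown descriptors fail loudly instead of falling through.
    if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
        raise ValueError(f"Invalid dtype description. Got: {dtype_desc}")
    return converter(input_value)
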
121 changes: 25 additions & 96 deletions src/background_ingestor.py
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
# import scippnexus as snx
import datetime
import json
import logging
import pathlib
from urllib.parse import urljoin

@@ -13,6 +13,7 @@
build_background_ingestor_arg_parser,
build_scicat_background_ingester_config,
)
from scicat_dataset import convert_to_type
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from system_helpers import exit_at_exceptions
@@ -23,24 +24,6 @@ def replace_variables_values(url: str, values: dict) -> str:
url = url.replace("{" + key + "}", str(value))
return url

def convert_to_type(input_value, value_type: str) :
output_value = None
if value_type == "string":
output_value = str(input_value)
elif value_type == "string[]":
output_value = [str(v) for v in input_value]
elif value_type == "integer":
output_value = int(input_value)
elif value_type == "float":
output_value = float(input_value)
elif value_type == "date" and isinstance(input_value, int):
output_value = datetime.datetime.fromtimestamp(input_value, tz=datetime.UTC).isoformat()
elif value_type == "date" and isinstance(input_value, str):
output_value = datetime.datetime.fromisoformat(input_value).isoformat()
else:
raise Exception("Invalid value type")
return output_value


def extract_variables_values(
variables: dict, h5file, config: BackgroundIngestorConfig
@@ -82,90 +65,40 @@ def extract_variables_values(
else:
raise Exception("Invalid variable source configuration")

values[variable] = convert_to_type(value,variables[variable]["value_type"])
values[variable] = convert_to_type(value, variables[variable]["value_type"])

return values


def prepare_scicat_dataset(metadata_schema, values):
"""
Prepare scicat dataset as dictionary ready to be sent over to scicat as a POST request
This is an example:
{
"pid": "20.500.12269/e3690b21-ee8c-40d6-9409-6b6fdca776d2",
"datasetName": "this is a dataset",
"description": "this is the description of the dataset",
"principalInvestigator": "Massimiliano Novelli",
"creationLocation": "ESS:CODA",
"scientificMetadata": {
"run_number": {
"value": 18856,
"unit": "",
"human_name": "Run Number",
"type": "integer"
},
"sample_temperature": {
"value": 20.4,
"unit": "C",
"human_name": "Sample Temperature",
"type": "quantity"
},
"start_time" : {
"value" : "2024-07-16T09:30:12.987Z",
"unit" : "",
"human_name" : "Start Time",
"type" : "date"
}
},
"owner": "Massimiliano Novelli",
"ownerEmail": "[email protected]",
"sourceFolder": "/ess/data/coda/2024/616254",
"contactEmail": "[email protected]",
"creationTime": "2024-07-16T10:00:00.000Z",
"type": "raw",
"techniques": [
{
"pid": "http://purl.org/pan-science/PaNET/PaNET01155",
"names": "absorption and phase contrast nanotomography"
}
],
"instrumentId": "20.500.12269/765b3dc3-f658-410e-b371-04dd1adcd520",
"sampleId": "bd31725a-dbfd-4c32-87db-1c1ebe61e5ca",
"proposalId": "616254",
"ownerGroup": "ess_proposal_616254",
"accessGroups": [
"scientific information management systems group"
]
}
"""
schema = metadata_schema["schema"]
"""Prepare scicat dataset as dictionary ready to be ``POST``ed."""
schema: dict = metadata_schema["schema"]
dataset = {}
scientific_metadata = {
'ingestor_metadata_schema_id' : {
'ingestor_metadata_schema_id': {
"value": metadata_schema["id"],
"unit": "",
"human_name": "Ingestor Metadata Schema Id",
"type": "string"
"type": "string",
}
}
for key, field in schema.items():
for field in schema.values():
machine_name = field["machine_name"]
field_type = field["type"]
if field["field_type"] == "high_level":
dataset[machine_name] = convert_to_type(
replace_variables_values(field["value"],values),
field_type
replace_variables_values(field["value"], values), field_type
)
elif field["field_type"] == "scientific_metadata":
scientific_metadata[machine_name] = {
"value" : convert_to_type(
replace_variables_values(field["value"],values),
field_type
"value": convert_to_type(
replace_variables_values(field["value"], values), field_type
),
"unit" : "",
"human_name" : field["human_name"] if "human_name" is in field.keys() and field["human_name"] else machine_name,
"type" : field_type
"unit": "",
"human_name": field["human_name"]
if field.get("human_name", None)
else machine_name,
"type": field_type,
}
else:
raise Exception("Metadata schema field type invalid")
@@ -175,7 +108,7 @@ def prepare_scicat_dataset(metadata_schema, values):
return dataset


def create_scicat_dataset(dataset,config):
def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> dict:
"""
Execute a POST request to scicat to create a dataset
"""
@@ -191,19 +124,17 @@ def create_scicat_dataset(dataset,config):

result = response.json()
if response.ok:

...
else:
err = result.get("error", {})
raise Exception(f"Error creating new dataset: {err}")

logger.info(
"Dataset created successfully. Dataset pid: %s",
result['pid']
)
logger.info("Dataset created successfully. Dataset pid: %s", result['pid'])
return result

def prepare_files_list(nexus_file,done_writing_message_file,config): ...
def prepare_scicat_origdatablock(files_list,config): ...

def prepare_files_list(nexus_file, done_writing_message_file, config): ...
def prepare_scicat_origdatablock(files_list, config): ...
def create_scicat_origdatablock(
scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
): ...
@@ -254,15 +185,13 @@ def main() -> None:
)

# create files list with blake2b hash of all the files
files_list = prepare_files_list(nexus_file_path,done_writing_message_file,config)
_ = prepare_files_list(nexus_file_path, done_writing_message_file, config)

# create and populate scicat dataset entry
scicat_dataset = prepare_scicat_dataset(
metadata_schema, variables_values
)
scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values)

# create dataset in scicat
scicat_dataset = create_scicat_dataset(scicat_dataset,config)
scicat_dataset = create_scicat_dataset(scicat_dataset, config)
scicat_dataset_pid = scicat_dataset["pid"]

# create and populate scicat origdatablock entry
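The diff only shows fragments of the new create_scicat_dataset body. A rough, self-contained sketch of what the new signature implies, assuming requests as the HTTP client and "scicat_url"/"token" as hypothetical config keys (neither the client nor those keys appears in the diff):

import logging
from urllib.parse import urljoin

import requests  # assumed HTTP client; the diff does not show which one is used


def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> dict:
    """Execute a POST request to scicat to create a dataset."""
    # "scicat_url" and "token" are illustrative config keys, not taken from the diff.
    response = requests.post(
        urljoin(config["scicat_url"], "datasets"),
        headers={
            "Authorization": f"Bearer {config['token']}",
            "Content-Type": "application/json",
        },
        data=dataset,
        timeout=10,
    )
    result = response.json()
    if not response.ok:
        err = result.get("error", {})
        raise Exception(f"Error creating new dataset: {err}")
    logger.info("Dataset created successfully. Dataset pid: %s", result["pid"])
    return result
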
48 changes: 48 additions & 0 deletions src/scicat_dataset.py
@@ -1,9 +1,57 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import datetime
from types import MappingProxyType
from typing import Any

from jinja2 import Template
from scicat_path_helpers import get_dataset_schema_template_path


def to_string(value: Any) -> str:
return str(value)


def to_string_array(value: list[Any]) -> list[str]:
return [str(v) for v in value]


def to_integer(value: Any) -> int:
return int(value)


def to_float(value: Any) -> float:
return float(value)


def to_date(value: Any) -> str | None:
if isinstance(value, str):
return datetime.datetime.fromisoformat(value).isoformat()
elif isinstance(value, int | float):
return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
return None


_DtypeConvertingMap = MappingProxyType(
{
"string": to_string,
"string[]": to_string_array,
"integer": to_integer,
"float": to_float,
"date": to_date,
}
)


def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
raise ValueError(
"Invalid dtype description. Must be one of: ",
"string, string[], integer, float, date.\nGot: {dtype_desc}",
)
return converter(input_value)


def build_dataset_schema(
*,
nxs_dataset_pid: str,
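A side note on the design: the converter table is wrapped in MappingProxyType, so importers get a read-only view and new descriptors have to be added in the module itself. A small sketch of that behaviour (the names and the "boolean" entry here are illustrative, not taken from the diff):

from types import MappingProxyType

# A read-only view over a plain dict: lookups work, assignment does not.
_converters = MappingProxyType({"string": str, "integer": int})

assert _converters["integer"]("42") == 42

try:
    _converters["boolean"] = bool  # hypothetical new descriptor
except TypeError as error:
    print(error)  # mappingproxy objects do not support item assignment
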
43 changes: 43 additions & 0 deletions tests/test_scicat_dataset.py
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import pytest
from scicat_dataset import convert_to_type


def test_dtype_string_converter() -> None:
assert convert_to_type("test", "string") == "test"
assert convert_to_type(123, "string") == "123"
assert convert_to_type(123.456, "string") == "123.456"


def test_dtype_string_array_converter() -> None:
assert convert_to_type("test", "string[]") == ["t", "e", "s", "t"]
assert convert_to_type([1, 2, 3], "string[]") == ["1", "2", "3"]
assert convert_to_type([1.1, 2.2, 3.3], "string[]") == ["1.1", "2.2", "3.3"]


def test_dtype_integer_converter() -> None:
assert convert_to_type("123", "integer") == 123
assert convert_to_type(123, "integer") == 123
assert convert_to_type(123.456, "integer") == 123


def test_dtype_float_converter() -> None:
assert convert_to_type("123.456", "float") == 123.456
assert convert_to_type(123, "float") == 123.0
assert convert_to_type(123.456, "float") == 123.456


def test_dtype_date_converter() -> None:
import datetime

test_datetime_isoformat = "1994-06-28T10:20:30+00:00"
test_datetime = datetime.datetime.fromisoformat(test_datetime_isoformat)
assert convert_to_type("1994-06-28T10:20:30Z", "date") == test_datetime_isoformat
assert convert_to_type(test_datetime.timestamp(), "date") == test_datetime_isoformat
assert convert_to_type(object(), "date") is None


def test_dtype_converter_invalid_dtype_raises() -> None:
with pytest.raises(ValueError, match="Invalid dtype description."):
convert_to_type("test", "invalid_type")
