Update dtype converter and add tests.
YooSunYoung committed Jul 18, 2024
1 parent 9cf66f6 commit 2f7a3d6
Showing 3 changed files with 116 additions and 96 deletions.
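
In short, the if/elif chain that used to live in src/background_ingestor.py is replaced by a table of per-dtype converter functions in src/scicat_dataset.py, and the new convert_to_type gains unit tests. A minimal sketch of that dispatch pattern, using the same names that appear in the diff below (only a subset of the descriptors is shown):

from types import MappingProxyType
from typing import Any

# One small converter per supported dtype descriptor.
_DtypeConvertingMap = MappingProxyType(
    {
        "string": str,
        "integer": int,
        "float": float,
    }
)


def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
    # Unknown descriptors fail loudly instead of falling through.
    if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
        raise ValueError(f"Invalid dtype description. Got: {dtype_desc}")
    return converter(input_value)
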
121 changes: 25 additions & 96 deletions src/background_ingestor.py
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
# import scippnexus as snx
import datetime
import json
import logging
import pathlib
from urllib.parse import urljoin

@@ -13,6 +13,7 @@
build_background_ingestor_arg_parser,
build_scicat_background_ingester_config,
)
from scicat_dataset import convert_to_type
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from system_helpers import exit_at_exceptions
@@ -23,24 +24,6 @@ def replace_variables_values(url: str, values: dict) -> str:
url = url.replace("{" + key + "}", str(value))
return url

def convert_to_type(input_value, value_type: str) :
output_value = None
if value_type == "string":
output_value = str(input_value)
elif value_type == "string[]":
output_value = [str(v) for v in input_value]
elif value_type == "integer":
output_value = int(input_value)
elif value_type == "float":
output_value = float(input_value)
elif value_type == "date" and isinstance(input_value, int):
output_value = datetime.datetime.fromtimestamp(input_value, tz=datetime.UTC).isoformat()
elif value_type == "date" and isinstance(input_value, str):
output_value = datetime.datetime.fromisoformat(input_value).isoformat()
else:
raise Exception("Invalid value type")
return output_value


def extract_variables_values(
variables: dict, h5file, config: BackgroundIngestorConfig
@@ -82,90 +65,40 @@ def extract_variables_values(
else:
raise Exception("Invalid variable source configuration")

values[variable] = convert_to_type(value,variables[variable]["value_type"])
values[variable] = convert_to_type(value, variables[variable]["value_type"])

return values


def prepare_scicat_dataset(metadata_schema, values):
"""
Prepare scicat dataset as dictionary ready to be sent over to scicat as a POST request
This is an example:
{
"pid": "20.500.12269/e3690b21-ee8c-40d6-9409-6b6fdca776d2",
"datasetName": "this is a dataset",
"description": "this is the description of the dataset",
"principalInvestigator": "Massimiliano Novelli",
"creationLocation": "ESS:CODA",
"scientificMetadata": {
"run_number": {
"value": 18856,
"unit": "",
"human_name": "Run Number",
"type": "integer"
},
"sample_temperature": {
"value": 20.4,
"unit": "C",
"human_name": "Sample Temperature",
"type": "quantity"
},
"start_time" : {
"value" : "2024-07-16T09:30:12.987Z",
"unit" : "",
"human_name" : "Start Time",
"type" : "date"
}
},
"owner": "Massimiliano Novelli",
"ownerEmail": "[email protected]",
"sourceFolder": "/ess/data/coda/2024/616254",
"contactEmail": "[email protected]",
"creationTime": "2024-07-16T10:00:00.000Z",
"type": "raw",
"techniques": [
{
"pid": "http://purl.org/pan-science/PaNET/PaNET01155",
"names": "absorption and phase contrast nanotomography"
}
],
"instrumentId": "20.500.12269/765b3dc3-f658-410e-b371-04dd1adcd520",
"sampleId": "bd31725a-dbfd-4c32-87db-1c1ebe61e5ca",
"proposalId": "616254",
"ownerGroup": "ess_proposal_616254",
"accessGroups": [
"scientific information management systems group"
]
}
"""
schema = metadata_schema["schema"]
"""Prepare scicat dataset as dictionary ready to be ``POST``ed."""
schema: dict = metadata_schema["schema"]
dataset = {}
scientific_metadata = {
'ingestor_metadata_schema_id' : {
'ingestor_metadata_schema_id': {
"value": metadata_schema["id"],
"unit": "",
"human_name": "Ingestor Metadata Schema Id",
"type": "string"
"type": "string",
}
}
for key, field in schema.items():
for field in schema.values():
machine_name = field["machine_name"]
field_type = field["type"]
if field["field_type"] == "high_level":
dataset[machine_name] = convert_to_type(
replace_variables_values(field["value"],values),
field_type
replace_variables_values(field["value"], values), field_type
)
elif field["field_type"] == "scientific_metadata":
scientific_metadata[machine_name] = {
"value" : convert_to_type(
replace_variables_values(field["value"],values),
field_type
"value": convert_to_type(
replace_variables_values(field["value"], values), field_type
),
"unit" : "",
"human_name" : field["human_name"] if "human_name" is in field.keys() and field["human_name"] else machine_name,
"type" : field_type
"unit": "",
"human_name": field["human_name"]
if field.get("human_name", None)
else machine_name,
"type": field_type,
}
else:
raise Exception("Metadata schema field type invalid")
@@ -175,7 +108,7 @@ def prepare_scicat_dataset(metadata_schema, values):
return dataset


def create_scicat_dataset(dataset,config):
def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> dict:
"""
Execute a POST request to scicat to create a dataset
"""
@@ -191,19 +124,17 @@ def create_scicat_dataset(dataset,config):

result = response.json()
if response.ok:

...
else:
err = result.get("error", {})
raise Exception(f"Error creating new dataset: {err}")

logger.info(
"Dataset created successfully. Dataset pid: %s",
result['pid']
)
logger.info("Dataset created successfully. Dataset pid: %s", result['pid'])
return result

def prepare_files_list(nexus_file,done_writing_message_file,config): ...
def prepare_scicat_origdatablock(files_list,config): ...

def prepare_files_list(nexus_file, done_writing_message_file, config): ...
def prepare_scicat_origdatablock(files_list, config): ...
def create_scicat_origdatablock(
scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
): ...
@@ -254,15 +185,13 @@ def main() -> None:
)

# create files list with blake2b hash of all the files
files_list = prepare_files_list(nexus_file_path,done_writing_message_file,config)
_ = prepare_files_list(nexus_file_path, done_writing_message_file, config)

# create and populate scicat dataset entry
scicat_dataset = prepare_scicat_dataset(
metadata_schema, variables_values
)
scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values)

# create dataset in scicat
scicat_dataset = create_scicat_dataset(scicat_dataset,config)
scicat_dataset = create_scicat_dataset(scicat_dataset, config)
scicat_dataset_pid = scicat_dataset["pid"]

# create and populate scicat origdatablock entry
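The diff only shows fragments of the new create_scicat_dataset body. A rough, self-contained sketch of what the new signature implies, assuming requests as the HTTP client and "scicat_url"/"token" as hypothetical config keys (neither the client nor those keys appears in the diff):

import logging
from urllib.parse import urljoin

import requests  # assumed HTTP client; the diff does not show which one is used


def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> dict:
    """Execute a POST request to scicat to create a dataset."""
    # "scicat_url" and "token" are illustrative config keys, not taken from the diff.
    response = requests.post(
        urljoin(config["scicat_url"], "datasets"),
        headers={
            "Authorization": f"Bearer {config['token']}",
            "Content-Type": "application/json",
        },
        data=dataset,
        timeout=10,
    )
    result = response.json()
    if not response.ok:
        err = result.get("error", {})
        raise Exception(f"Error creating new dataset: {err}")
    logger.info("Dataset created successfully. Dataset pid: %s", result["pid"])
    return result
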
48 changes: 48 additions & 0 deletions src/scicat_dataset.py
@@ -1,9 +1,57 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import datetime
from types import MappingProxyType
from typing import Any

from jinja2 import Template
from scicat_path_helpers import get_dataset_schema_template_path


def to_string(value: Any) -> str:
return str(value)


def to_string_array(value: list[Any]) -> list[str]:
return [str(v) for v in value]


def to_integer(value: Any) -> int:
return int(value)


def to_float(value: Any) -> float:
return float(value)


def to_date(value: Any) -> str | None:
if isinstance(value, str):
return datetime.datetime.fromisoformat(value).isoformat()
elif isinstance(value, int | float):
return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
return None


_DtypeConvertingMap = MappingProxyType(
{
"string": to_string,
"string[]": to_string_array,
"integer": to_integer,
"float": to_float,
"date": to_date,
}
)


def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
raise ValueError(
"Invalid dtype description. Must be one of: ",
"string, string[], integer, float, date.\nGot: {dtype_desc}",
)
return converter(input_value)


def build_dataset_schema(
*,
nxs_dataset_pid: str,
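A side note on the design: the converter table is wrapped in MappingProxyType, so importers get a read-only view and new descriptors have to be added in the module itself. A small sketch of that behaviour (the names and the "boolean" entry here are illustrative, not taken from the diff):

from types import MappingProxyType

# A read-only view over a plain dict: lookups work, assignment does not.
_converters = MappingProxyType({"string": str, "integer": int})

assert _converters["integer"]("42") == 42

try:
    _converters["boolean"] = bool  # hypothetical new descriptor
except TypeError as error:
    print(error)  # mappingproxy objects do not support item assignment
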
43 changes: 43 additions & 0 deletions tests/test_scicat_dataset.py
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import pytest
from scicat_dataset import convert_to_type


def test_dtype_string_converter() -> None:
assert convert_to_type("test", "string") == "test"
assert convert_to_type(123, "string") == "123"
assert convert_to_type(123.456, "string") == "123.456"


def test_dtype_string_array_converter() -> None:
assert convert_to_type("test", "string[]") == ["t", "e", "s", "t"]
assert convert_to_type([1, 2, 3], "string[]") == ["1", "2", "3"]
assert convert_to_type([1.1, 2.2, 3.3], "string[]") == ["1.1", "2.2", "3.3"]


def test_dtype_integer_converter() -> None:
assert convert_to_type("123", "integer") == 123
assert convert_to_type(123, "integer") == 123
assert convert_to_type(123.456, "integer") == 123


def test_dtype_float_converter() -> None:
assert convert_to_type("123.456", "float") == 123.456
assert convert_to_type(123, "float") == 123.0
assert convert_to_type(123.456, "float") == 123.456


def test_dtype_date_converter() -> None:
import datetime

test_datetime_isoformat = "1994-06-28T10:20:30+00:00"
test_datetime = datetime.datetime.fromisoformat(test_datetime_isoformat)
assert convert_to_type("1994-06-28T10:20:30Z", "date") == test_datetime_isoformat
assert convert_to_type(test_datetime.timestamp(), "date") == test_datetime_isoformat
assert convert_to_type(object(), "date") is None


def test_dtype_converter_invalid_dtype_raises() -> None:
with pytest.raises(ValueError, match="Invalid dtype description."):
convert_to_type("test", "invalid_type")
