diff --git a/docs/specifications/build.R b/docs/specifications/build.R index 8761536..2c1a8fc 100644 --- a/docs/specifications/build.R +++ b/docs/specifications/build.R @@ -1,7 +1,7 @@ library(knitr) dir.create("compiled", showWarnings=FALSE) -for (v in c("1.0", "1.1", "1.2", "1.3")) { +for (v in c("1.0", "1.1", "1.2", "1.3", "1.4")) { .version <- package_version(v) knitr::knit("hdf5.Rmd", output=file.path("compiled", paste0("hdf5-", v, ".md"))) } diff --git a/docs/specifications/hdf5.Rmd b/docs/specifications/hdf5.Rmd index b4c3cb5..56084d3 100644 --- a/docs/specifications/hdf5.Rmd +++ b/docs/specifications/hdf5.Rmd @@ -213,6 +213,39 @@ This should be interpreted as a boolean where a non-zero value specifies that we } ``` +```{r, results="asis", echo=FALSE} +if (.version >= package_version("1.4")) { +cat('### Variable length string arrays + +Arrays of strings can be stored in [**ritsuko**\'s custom variable length string (VLS) array](https://github.com/ArtifactDB/ritsuko). +This is represented as a HDF5 group (`**/`) with the following attributes: + +- `uzuki_object`, a scalar string dataset containing the value `"vector"`. + This should use a datatype that can be represented by a UTF-8 encoded string. +- `uzuki_type`, a scalar string dataset containing `"vls"`. + +This group should contain the `data` and `heap` datasets. + +- The `**/data` dataset should be a 1-dimensional or scalar dataset of a compound datatype of 2 members, `"offset"` and `"length"`. + Each member should be of a datatype that can be represented by an unsigned 64-bit integer. + If the dataset is scalar, the length of the VLS array is defined as 1. +- The `**/heap` dataset should be a 1-dimensional dataset of unsigned 8-bit integers. + +Each entry of `**/data` refers to a slice `[offset, offset + length)` of the `**/heap` dataset. 
+This slice defines a variable-length UTF-8 encoded string of length `length` - unless the slice contains a null terminator, in which case the string is defined as the interval to the first null. +Pointers may be in any order, overlapping or non-contiguous, as long as `[offset, offset + length)` lies within the boundaries of the heap. + +A `missing-value-placeholder` attribute on the `**/data` dataset may be present, defining a placeholder for missing values. +The attribute should be a scalar and should be of any HDF5 string datatype that can be represented by a UTF-8 encoded string. +An entry of `**/data` should be considered as missing if its corresponding string is equal to the placeholder. + +The group may also contain `**/names`, a 1-dimensional string dataset of length equal to `**/data`. +This should use a datatype that can be represented by a UTF-8 encoded string. +If `**/data` is a scalar, `**/names` should have length 1. +') +} +``` + ### Nothing A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes: diff --git a/include/uzuki2/parse_hdf5.hpp b/include/uzuki2/parse_hdf5.hpp index f60eb8c..19c7595 100644 --- a/include/uzuki2/parse_hdf5.hpp +++ b/include/uzuki2/parse_hdf5.hpp @@ -20,6 +20,7 @@ #include "ritsuko/ritsuko.hpp" #include "ritsuko/hdf5/hdf5.hpp" +#include "ritsuko/hdf5/vls/vls.hpp" /** * @file parse_hdf5.hpp @@ -286,6 +287,48 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const present.insert(std::move(x)); } + } else if (vector_type == "vls" && !version.lt(1, 4)) { + ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64); + auto hhandle = ritsuko::hdf5::vls::open_heap(handle, "heap"); + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, "missing-value-placeholder"); + + auto ptr = Provisioner::new_String(len, named, is_scalar, StringVector::NONE); + output.reset(ptr); + + if (is_scalar) { + 
ritsuko::hdf5::vls::Pointer vlsptr; + dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype()); + + hsize_t len = vlsptr.length; + H5::DataSpace mspace(1, &len); + hsize_t offset = vlsptr.offset; + hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle, false); + H5::DataSpace dspace(1, &hlen); + dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset); + + std::vector buffer(vlsptr.length); + hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace); + auto cptr = reinterpret_cast(buffer.data()); + std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length)); + + if (missingness.has_value() && str == *missingness) { + ptr->set_missing(0); + } else { + ptr->set(0, std::move(str)); + } + + } else { + ritsuko::hdf5::vls::Stream1dArray stream(&dhandle, &hhandle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (missingness.has_value() && x == *missingness) { + ptr->set_missing(i); + } else { + ptr->set(i, std::move(x)); + } + } + } + } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) { StringVector::Format format = StringVector::NONE; if (version.equals(1, 0)) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bd29d6c..f2bf232 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,6 +23,7 @@ add_executable( src/factor.cpp src/number.cpp src/string.cpp + src/vls.cpp src/date.cpp src/datetime.cpp src/external.cpp diff --git a/tests/src/string.cpp b/tests/src/string.cpp index 8b0c375..b7c4bdf 100644 --- a/tests/src/string.cpp +++ b/tests/src/string.cpp @@ -235,3 +235,5 @@ TEST(JsonStringTest, CheckError) { *** See integer.cpp for vector error tests. 
*** ***********************************************/ } + + diff --git a/tests/src/utils.h b/tests/src/utils.h index eccb9df..44a4285 100644 --- a/tests/src/utils.h +++ b/tests/src/utils.h @@ -99,15 +99,7 @@ H5::DataSet create_dataset(const H5::Group& parent, const std::string& name, con } auto dhandle = parent.createDataSet(name, dtype, dspace, cplist); - - if constexpr(std::is_same::value) { - dhandle.write(values.data(), H5::PredType::NATIVE_INT); - } else if constexpr(std::is_same::value) { - dhandle.write(values.data(), H5::PredType::NATIVE_DOUBLE); - } else { - throw std::runtime_error("unknown type!"); - } - + dhandle.write(values.data(), ritsuko::hdf5::as_numeric_datatype()); return dhandle; } diff --git a/tests/src/vls.cpp b/tests/src/vls.cpp new file mode 100644 index 0000000..8ddf71a --- /dev/null +++ b/tests/src/vls.cpp @@ -0,0 +1,191 @@ +#include +#include + +#include "uzuki2/parse_hdf5.hpp" + +#include "utils.h" + +TEST(Hdf5VlsTest, Basic) { + auto path = "TEST-vls.h5"; + std::string heap = "abcdefghijklmno"; + size_t nlen = 10; + + { + H5::H5File handle(path, H5F_ACC_TRUNC); + auto vhandle = vector_opener(handle, "blub", "vls"); + add_version(vhandle, "1.4"); + + auto hhandle = create_dataset(vhandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8); + const unsigned char* hptr = reinterpret_cast(heap.c_str()); + hhandle.write(hptr, H5::PredType::NATIVE_UCHAR); + + std::vector > pointers(nlen); + size_t n = 0; + for (size_t i = 0; i < nlen; ++i) { + pointers[i].offset = n; + size_t count = (i % 2) + 1; // for some interesting differences. + pointers[i].length = count; + n += count; + } + auto ptype = ritsuko::hdf5::vls::define_pointer_datatype(); + auto phandle = create_dataset(vhandle, "data", pointers.size(), ptype); + phandle.write(pointers.data(), ptype); + } + + // Check that it works correctly. 
+ { + auto parsed = load_hdf5(path, "blub"); + EXPECT_EQ(parsed->type(), uzuki2::STRING); + auto sptr = static_cast(parsed.get()); + EXPECT_EQ(sptr->size(), nlen); + std::vector expected { "a", "bc", "d", "ef", "g", "hi", "j", "kl", "m", "no" }; + EXPECT_EQ(sptr->base.values, expected); + } + + // Adding a missing value placeholder. + { + { + H5::H5File handle(path, H5F_ACC_RDWR); + auto vhandle = handle.openDataSet("blub/data"); + H5::StrType stype(0, H5T_VARIABLE); + auto ahandle = vhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); + ahandle.write(stype, std::string("hi")); + } + + auto parsed = load_hdf5(path, "blub"); + EXPECT_EQ(parsed->type(), uzuki2::STRING); + auto sptr = static_cast(parsed.get()); + EXPECT_EQ(sptr->base.values[5], "ich bin missing"); // the test's missing placeholder. + + // Adding the wrong missing value placeholder. + { + H5::H5File handle(path, H5F_ACC_RDWR); + auto vhandle = handle.openDataSet("blub/data"); + vhandle.removeAttr("missing-value-placeholder"); + vhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT, H5S_SCALAR); + } + expect_hdf5_error(path, "blub", "string datatype"); + + // Removing for the next checks. + { + H5::H5File handle(path, H5F_ACC_RDWR); + auto vhandle = handle.openDataSet("blub/data"); + vhandle.removeAttr("missing-value-placeholder"); + } + } +} + +TEST(Hdf5VlsTest, Failures) { + auto path = "TEST-vls.h5"; + std::string heap = "abcdefghijklmno"; + size_t nlen = 10; + + // Shortening the heap to check that we perform bounds checks on the pointers. 
+ { + H5::H5File handle(path, H5F_ACC_TRUNC); + auto ghandle = vector_opener(handle, "blub", "vls"); + add_version(ghandle, "1.4"); + + hsize_t zero = 0; + H5::DataSpace hspace(1, &zero); + ghandle.createDataSet("heap", H5::PredType::NATIVE_UINT8, hspace); + + std::vector > pointers(nlen); + for (size_t i = 0; i < nlen; ++i) { + pointers[i].offset = i; + pointers[i].length = 1; + } + auto ptype = ritsuko::hdf5::vls::define_pointer_datatype(); + auto phandle = create_dataset(ghandle, "data", pointers.size(), ptype); + phandle.write(pointers.data(), ptype); + } + expect_hdf5_error(path, "blub", "out of range"); + + // Checking that we check for 64-bit unsigned integer types. + { + H5::H5File handle(path, H5F_ACC_RDWR); + auto ghandle = handle.openGroup("blub"); + ghandle.unlink("data"); + + std::vector > pointers(3); + for (auto& p : pointers) { + p.offset = 0; + p.length = 0; + } + hsize_t plen = pointers.size(); + H5::DataSpace pspace(1, &plen); + auto ptype = ritsuko::hdf5::vls::define_pointer_datatype(); + auto phandle = ghandle.createDataSet("data", ptype, pspace); + phandle.write(pointers.data(), ptype); + } + expect_hdf5_error(path, "blub", "64-bit unsigned integer"); + + // Checking that this only works in the latest version. 
+ { + H5::H5File handle(path, H5F_ACC_RDWR); + auto vhandle = handle.openGroup("blub"); + vhandle.removeAttr("uzuki_version"); + } + expect_hdf5_error(path, "blub", "unknown vector type"); +} + +TEST(Hdf5VlsTest, Scalar) { + auto path = "TEST-vls.h5"; + std::string heap = "abcdefghijklmno"; + + { + H5::H5File handle(path, H5F_ACC_TRUNC); + auto ghandle = vector_opener(handle, "blub", "vls"); + add_version(ghandle, "1.4"); + + auto hhandle = create_dataset(ghandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8); + const unsigned char* hptr = reinterpret_cast(heap.c_str()); + hhandle.write(hptr, H5::PredType::NATIVE_UCHAR); + + ritsuko::hdf5::vls::Pointer ptr; + ptr.offset = 0; ptr.length = 10; + auto ptype = ritsuko::hdf5::vls::define_pointer_datatype(); + auto phandle = ghandle.createDataSet("data", ptype, H5S_SCALAR); + phandle.write(&ptr, ptype); + } + { + auto parsed = load_hdf5(path, "blub"); + EXPECT_EQ(parsed->type(), uzuki2::STRING); + auto sptr = static_cast(parsed.get()); + EXPECT_EQ(sptr->size(), 1); + EXPECT_EQ(sptr->base.values.front(), "abcdefghij"); + } + + // Checking that it works correctly with early termination. + { + H5::H5File handle(path, H5F_ACC_RDWR); + auto ghandle = handle.openGroup("blub"); + auto hhandle = ghandle.openDataSet("heap"); + std::vector replacement(heap.size()); + hhandle.write(replacement.data(), H5::PredType::NATIVE_UINT8); + } + { + auto parsed = load_hdf5(path, "blub"); + EXPECT_EQ(parsed->type(), uzuki2::STRING); + auto sptr = static_cast(parsed.get()); + EXPECT_EQ(sptr->size(), 1); + EXPECT_EQ(sptr->base.values.front(), ""); + } + + // Checking that scalar works correctly with missing values. 
+ { + H5::H5File handle(path, H5F_ACC_RDWR); + auto ghandle = handle.openGroup("blub"); + auto dhandle = ghandle.openDataSet("data"); + H5::StrType stype(0, 10); + auto ahandle = dhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); + ahandle.write(stype, std::string{}); + } + { + auto parsed = load_hdf5(path, "blub"); + EXPECT_EQ(parsed->type(), uzuki2::STRING); + auto sptr = static_cast(parsed.get()); + EXPECT_EQ(sptr->size(), 1); + EXPECT_EQ(sptr->base.values.front(), "ich bin missing"); + } +}