diff --git a/docs/specifications/build.R b/docs/specifications/build.R
index 8761536..2c1a8fc 100644
--- a/docs/specifications/build.R
+++ b/docs/specifications/build.R
@@ -1,7 +1,7 @@
library(knitr)
dir.create("compiled", showWarnings=FALSE)
-for (v in c("1.0", "1.1", "1.2", "1.3")) {
+for (v in c("1.0", "1.1", "1.2", "1.3", "1.4")) {
.version <- package_version(v)
knitr::knit("hdf5.Rmd", output=file.path("compiled", paste0("hdf5-", v, ".md")))
}
diff --git a/docs/specifications/hdf5.Rmd b/docs/specifications/hdf5.Rmd
index b4c3cb5..56084d3 100644
--- a/docs/specifications/hdf5.Rmd
+++ b/docs/specifications/hdf5.Rmd
@@ -213,6 +213,39 @@ This should be interpreted as a boolean where a non-zero value specifies that we
}
```
+```{r, results="asis", echo=FALSE}
+if (.version >= package_version("1.4")) {
+cat('### Variable length string arrays
+
+Arrays of strings can be stored in [**ritsuko**\'s custom variable length string (VLS) array](https://github.com/ArtifactDB/ritsuko).
+This is represented as a HDF5 group (`**/`) with the following attributes:
+
+- `uzuki_object`, a scalar string dataset containing the value `"vector"`.
+ This should use a datatype that can be represented by a UTF-8 encoded string.
+- `uzuki_type`, a scalar string dataset containing `"vls"`.
+
+This group should contain the `data` and `heap` datasets.
+
+- The `**/data` dataset should be a 1-dimensional or scalar dataset of a compound datatype of 2 members, `"offset"` and `"length"`.
+ Each member should be of a datatype that can be represented by an unsigned 64-bit integer.
+ If the dataset is scalar, the length of the VLS array is defined as 1.
+- The `**/heap` dataset should be a 1-dimensional dataset of unsigned 8-bit integers.
+
+Each entry of `**/data` refers to a slice `[offset, offset + length)` of the `**/heap` dataset.
+This slice defines a variable-length UTF-8 encoded string of length `length` - unless the slice contains a null terminator, in which case the string ends immediately before the first null.
+Pointers may be in any order, overlapping or non-contiguous, as long as `[offset, offset + length)` lies within the boundaries of the heap.
+
+A `missing-value-placeholder` attribute on the `**/data` dataset may be present, defining a placeholder for missing values.
+The attribute should be a scalar and should be of any HDF5 string datatype that can be represented by a UTF-8 encoded string.
+An entry of `**/data` should be considered as missing if its corresponding string is equal to the placeholder.
+
+The group may also contain `**/names`, a 1-dimensional string dataset with the same length as `**/data`.
+This should use a datatype that can be represented by a UTF-8 encoded string.
+If `**/data` is a scalar, `**/names` should have length 1.
+')
+}
+```
+
### Nothing
A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes:
diff --git a/include/uzuki2/parse_hdf5.hpp b/include/uzuki2/parse_hdf5.hpp
index f60eb8c..19c7595 100644
--- a/include/uzuki2/parse_hdf5.hpp
+++ b/include/uzuki2/parse_hdf5.hpp
@@ -20,6 +20,7 @@
#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"
+#include "ritsuko/hdf5/vls/vls.hpp"
/**
* @file parse_hdf5.hpp
@@ -286,6 +287,48 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const
present.insert(std::move(x));
}
+ } else if (vector_type == "vls" && !version.lt(1, 4)) {
+ ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64);
+ auto hhandle = ritsuko::hdf5::vls::open_heap(handle, "heap");
+ auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, "missing-value-placeholder");
+
+ auto ptr = Provisioner::new_String(len, named, is_scalar, StringVector::NONE);
+ output.reset(ptr);
+
+ if (is_scalar) {
+ ritsuko::hdf5::vls::Pointer vlsptr;
+ dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype());
+
+ hsize_t len = vlsptr.length;
+ H5::DataSpace mspace(1, &len);
+ hsize_t offset = vlsptr.offset;
+ hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle, false);
+ H5::DataSpace dspace(1, &hlen);
+ dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset);
+
+ std::vector buffer(vlsptr.length);
+ hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace);
+ auto cptr = reinterpret_cast(buffer.data());
+ std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length));
+
+ if (missingness.has_value() && str == *missingness) {
+ ptr->set_missing(0);
+ } else {
+ ptr->set(0, std::move(str));
+ }
+
+ } else {
+ ritsuko::hdf5::vls::Stream1dArray stream(&dhandle, &hhandle, len, buffer_size);
+ for (hsize_t i = 0; i < len; ++i, stream.next()) {
+ auto x = stream.steal();
+ if (missingness.has_value() && x == *missingness) {
+ ptr->set_missing(i);
+ } else {
+ ptr->set(i, std::move(x));
+ }
+ }
+ }
+
} else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
StringVector::Format format = StringVector::NONE;
if (version.equals(1, 0)) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bd29d6c..f2bf232 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,6 +23,7 @@ add_executable(
src/factor.cpp
src/number.cpp
src/string.cpp
+ src/vls.cpp
src/date.cpp
src/datetime.cpp
src/external.cpp
diff --git a/tests/src/string.cpp b/tests/src/string.cpp
index 8b0c375..b7c4bdf 100644
--- a/tests/src/string.cpp
+++ b/tests/src/string.cpp
@@ -235,3 +235,5 @@ TEST(JsonStringTest, CheckError) {
*** See integer.cpp for vector error tests. ***
***********************************************/
}
+
+
diff --git a/tests/src/utils.h b/tests/src/utils.h
index eccb9df..44a4285 100644
--- a/tests/src/utils.h
+++ b/tests/src/utils.h
@@ -99,15 +99,7 @@ H5::DataSet create_dataset(const H5::Group& parent, const std::string& name, con
}
auto dhandle = parent.createDataSet(name, dtype, dspace, cplist);
-
- if constexpr(std::is_same::value) {
- dhandle.write(values.data(), H5::PredType::NATIVE_INT);
- } else if constexpr(std::is_same::value) {
- dhandle.write(values.data(), H5::PredType::NATIVE_DOUBLE);
- } else {
- throw std::runtime_error("unknown type!");
- }
-
+ dhandle.write(values.data(), ritsuko::hdf5::as_numeric_datatype());
return dhandle;
}
diff --git a/tests/src/vls.cpp b/tests/src/vls.cpp
new file mode 100644
index 0000000..8ddf71a
--- /dev/null
+++ b/tests/src/vls.cpp
@@ -0,0 +1,191 @@
+#include
+#include
+
+#include "uzuki2/parse_hdf5.hpp"
+
+#include "utils.h"
+
+TEST(Hdf5VlsTest, Basic) { // Writes a 10-element "vls" vector, loads it via load_hdf5(), then exercises the missing-value placeholder.
+ auto path = "TEST-vls.h5";
+ std::string heap = "abcdefghijklmno";
+ size_t nlen = 10;
+
+ {
+ H5::H5File handle(path, H5F_ACC_TRUNC);
+ auto vhandle = vector_opener(handle, "blub", "vls");
+ add_version(vhandle, "1.4");
+
+ auto hhandle = create_dataset(vhandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
+ const unsigned char* hptr = reinterpret_cast(heap.c_str());
+ hhandle.write(hptr, H5::PredType::NATIVE_UCHAR);
+
+ std::vector > pointers(nlen);
+ size_t n = 0; // running offset of the next string within the heap.
+ for (size_t i = 0; i < nlen; ++i) {
+ pointers[i].offset = n;
+ size_t count = (i % 2) + 1; // alternate between lengths of 1 and 2 so that the strings differ in size.
+ pointers[i].length = count;
+ n += count;
+ }
+ auto ptype = ritsuko::hdf5::vls::define_pointer_datatype();
+ auto phandle = create_dataset(vhandle, "data", pointers.size(), ptype);
+ phandle.write(pointers.data(), ptype);
+ }
+
+ // The pointers tile the 15-character heap back to back, so each string is the next 1 or 2 characters of 'heap'.
+ {
+ auto parsed = load_hdf5(path, "blub");
+ EXPECT_EQ(parsed->type(), uzuki2::STRING);
+ auto sptr = static_cast(parsed.get());
+ EXPECT_EQ(sptr->size(), nlen);
+ std::vector expected { "a", "bc", "d", "ef", "g", "hi", "j", "kl", "m", "no" };
+ EXPECT_EQ(sptr->base.values, expected);
+ }
+
+ // Adding a variable-length string placeholder equal to the sixth string ("hi"), so that entry 5 becomes missing.
+ {
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto vhandle = handle.openDataSet("blub/data");
+ H5::StrType stype(0, H5T_VARIABLE);
+ auto ahandle = vhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);
+ ahandle.write(stype, std::string("hi"));
+ }
+
+ auto parsed = load_hdf5(path, "blub");
+ EXPECT_EQ(parsed->type(), uzuki2::STRING);
+ auto sptr = static_cast(parsed.get());
+ EXPECT_EQ(sptr->base.values[5], "ich bin missing"); // the test's missing placeholder.
+
+ // Replacing the placeholder with an integer attribute, which should be rejected as it is not a string datatype.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto vhandle = handle.openDataSet("blub/data");
+ vhandle.removeAttr("missing-value-placeholder");
+ vhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT, H5S_SCALAR);
+ }
+ expect_hdf5_error(path, "blub", "string datatype");
+
+ // Removing the invalid placeholder to leave the file clean; no further checks follow in this test.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto vhandle = handle.openDataSet("blub/data");
+ vhandle.removeAttr("missing-value-placeholder");
+ }
+ }
+}
+
+TEST(Hdf5VlsTest, Failures) { // Checks that malformed "vls" vectors and a missing version attribute produce the expected error messages.
+ auto path = "TEST-vls.h5";
+ std::string heap = "abcdefghijklmno"; // NOTE(review): not referenced anywhere else in this test.
+ size_t nlen = 10;
+
+ // Creating a zero-length heap so that every pointer (offset i, length 1) falls outside of it.
+ {
+ H5::H5File handle(path, H5F_ACC_TRUNC);
+ auto ghandle = vector_opener(handle, "blub", "vls");
+ add_version(ghandle, "1.4");
+
+ hsize_t zero = 0;
+ H5::DataSpace hspace(1, &zero);
+ ghandle.createDataSet("heap", H5::PredType::NATIVE_UINT8, hspace);
+
+ std::vector > pointers(nlen);
+ for (size_t i = 0; i < nlen; ++i) {
+ pointers[i].offset = i;
+ pointers[i].length = 1;
+ }
+ auto ptype = ritsuko::hdf5::vls::define_pointer_datatype();
+ auto phandle = create_dataset(ghandle, "data", pointers.size(), ptype);
+ phandle.write(pointers.data(), ptype);
+ }
+ expect_hdf5_error(path, "blub", "out of range");
+
+ // Re-creating 'data' with three zeroed pointers; the parser is expected to reject its pointer datatype.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto ghandle = handle.openGroup("blub");
+ ghandle.unlink("data");
+
+ std::vector > pointers(3);
+ for (auto& p : pointers) {
+ p.offset = 0;
+ p.length = 0;
+ }
+ hsize_t plen = pointers.size();
+ H5::DataSpace pspace(1, &plen);
+ auto ptype = ritsuko::hdf5::vls::define_pointer_datatype();
+ auto phandle = ghandle.createDataSet("data", ptype, pspace);
+ phandle.write(pointers.data(), ptype);
+ }
+ expect_hdf5_error(path, "blub", "64-bit unsigned integer");
+
+ // Dropping the 'uzuki_version' attribute; without it, the "vls" type is expected to be reported as unknown.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto vhandle = handle.openGroup("blub");
+ vhandle.removeAttr("uzuki_version");
+ }
+ expect_hdf5_error(path, "blub", "unknown vector type");
+}
+
+TEST(Hdf5VlsTest, Scalar) { // Exercises a scalar 'data' dataset: a plain read, a read cut short by a null, and missing-value detection.
+ auto path = "TEST-vls.h5";
+ std::string heap = "abcdefghijklmno";
+
+ {
+ H5::H5File handle(path, H5F_ACC_TRUNC);
+ auto ghandle = vector_opener(handle, "blub", "vls");
+ add_version(ghandle, "1.4");
+
+ auto hhandle = create_dataset(ghandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
+ const unsigned char* hptr = reinterpret_cast(heap.c_str());
+ hhandle.write(hptr, H5::PredType::NATIVE_UCHAR);
+
+ ritsuko::hdf5::vls::Pointer ptr;
+ ptr.offset = 0; ptr.length = 10; // the first 10 bytes of the heap, i.e., "abcdefghij".
+ auto ptype = ritsuko::hdf5::vls::define_pointer_datatype();
+ auto phandle = ghandle.createDataSet("data", ptype, H5S_SCALAR);
+ phandle.write(&ptr, ptype);
+ }
+ {
+ auto parsed = load_hdf5(path, "blub");
+ EXPECT_EQ(parsed->type(), uzuki2::STRING);
+ auto sptr = static_cast(parsed.get());
+ EXPECT_EQ(sptr->size(), 1);
+ EXPECT_EQ(sptr->base.values.front(), "abcdefghij");
+ }
+
+ // Overwriting the heap with a value-initialized buffer (presumably all zero bytes), so the string should end at its first byte.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto ghandle = handle.openGroup("blub");
+ auto hhandle = ghandle.openDataSet("heap");
+ std::vector replacement(heap.size());
+ hhandle.write(replacement.data(), H5::PredType::NATIVE_UINT8);
+ }
+ {
+ auto parsed = load_hdf5(path, "blub");
+ EXPECT_EQ(parsed->type(), uzuki2::STRING);
+ auto sptr = static_cast(parsed.get());
+ EXPECT_EQ(sptr->size(), 1);
+ EXPECT_EQ(sptr->base.values.front(), "");
+ }
+
+ // Adding an empty fixed-length (10 byte) placeholder; the empty string read above should now be treated as missing.
+ {
+ H5::H5File handle(path, H5F_ACC_RDWR);
+ auto ghandle = handle.openGroup("blub");
+ auto dhandle = ghandle.openDataSet("data");
+ H5::StrType stype(0, 10);
+ auto ahandle = dhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);
+ ahandle.write(stype, std::string{});
+ }
+ {
+ auto parsed = load_hdf5(path, "blub");
+ EXPECT_EQ(parsed->type(), uzuki2::STRING);
+ auto sptr = static_cast(parsed.get());
+ EXPECT_EQ(sptr->size(), 1);
+ EXPECT_EQ(sptr->base.values.front(), "ich bin missing");
+ }
+}