diff --git a/include/highfive/H5DataType.hpp b/include/highfive/H5DataType.hpp index 68921c249..c06b7a733 100644 --- a/include/highfive/H5DataType.hpp +++ b/include/highfive/H5DataType.hpp @@ -153,16 +153,18 @@ class FixedLengthStringType: public StringType { /// UTF8. In particular, a string with `n` UFT8 characters in general /// requires `4*n` bytes. /// - /// The string padding is subtle, essentially it's just a hint. A - /// null-terminated string is guaranteed to have one `'\0'` which marks the - /// semantic end of the string. The length of the buffer must be at least - /// `size` bytes regardless. HDF5 will read or write `size` bytes, - /// irrespective of the when the `\0` occurs. - /// - /// Note that when writing passing `StringPadding::NullTerminated` is a + /// The string padding is subtle, essentially it's just a hint. While + /// commonly, a null-terminated string is guaranteed to have one `'\0'` + /// which marks the semantic end of the string, this is not enforced by + /// HDF5. In fact, there are HDF5 files that contain strings that claim to + /// be null-terminated but aren't. The length of the buffer must be at + /// least `size` bytes regardless of the padding. HDF5 will read or write + /// `size` bytes, irrespective of when (if at all) the `\0` occurs. + /// + /// Note that when writing, passing `StringPadding::NullTerminated` is a /// guarantee to the reader that it contains a `\0`. Therefore, make sure - /// that the string really is nullterminated. Otherwise prefer a - /// null-padded string which only means states that the buffer is filled up + /// that the string really is null-terminated. Otherwise prefer a + /// null-padded string. This mearly states that the buffer is filled up /// with 0 or more `\0`. FixedLengthStringType(size_t size, StringPadding padding, diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index ed387702f..073e222f4 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -103,14 +103,14 @@ enum class BufferMode { Read, Write }; /// /// \brief String length in bytes excluding the `\0`. /// -inline size_t char_buffer_size(char const* const str, size_t max_string_length) { - for (size_t i = 0; i <= max_string_length; ++i) { +inline size_t char_buffer_length(char const* const str, size_t max_string_size) { + for (size_t i = 0; i < max_string_size; ++i) { if (str[i] == '\0') { return i; } } - return max_string_length; + return max_string_size; } @@ -190,7 +190,7 @@ struct StringBuffer { } else if (buffer.isFixedLengthString()) { // If the buffer is fixed-length and null-terminated, then // `buffer.string_length` doesn't include the null-character. - if (length > buffer.string_length) { + if (length > buffer.string_max_length) { throw std::invalid_argument("String length too big."); } @@ -229,9 +229,9 @@ struct StringBuffer { /// `length() + 1` bytes long. size_t length() const { if (buffer.isNullTerminated()) { - return char_buffer_size(data(), buffer.string_length); + return char_buffer_length(data(), buffer.string_size); } else { - return buffer.string_length; + return buffer.string_max_length; } } @@ -272,7 +272,7 @@ struct StringBuffer { : file_datatype(_file_datatype.asStringType()) , padding(file_datatype.getPadding()) , string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize()) - , string_length(string_size - size_t(isNullTerminated())) + , string_max_length(string_size - size_t(isNullTerminated())) , dims(_dims) { if (string_size == 0 && isNullTerminated()) { throw DataTypeException( @@ -322,9 +322,11 @@ struct StringBuffer { private: StringType file_datatype; StringPadding padding; - size_t string_size; // Size of buffer required to store the string. - // Meaningful for fixed length strings only. - size_t string_length; // Semantic length of string. + // Size of buffer required to store the string. + // Meaningful for fixed length strings only. + size_t string_size; + // Maximum length of string. + size_t string_max_length; std::vector dims; std::vector fixed_length_buffer; diff --git a/tests/unit/tests_high_five_base.cpp b/tests/unit/tests_high_five_base.cpp index f7cf67532..c811cc000 100644 --- a/tests/unit/tests_high_five_base.cpp +++ b/tests/unit/tests_high_five_base.cpp @@ -2318,6 +2318,34 @@ void check_multiple_string(Proxy proxy, size_t string_length) { } } +template +void check_supposedly_nullterm(Proxy proxy) { + auto dataspace = HighFive::DataSpace::Scalar(); + auto datatype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated); + proxy.create("not_null_terminated", dataspace, datatype); + auto obj = proxy.get("not_null_terminated"); + + // Creates a 5 byte, "null-terminated", fixed-length string. The first five + // bytes are filled with "GROUP". Clearly, this isn't null-terminated. However, + // h5py will read it back as "GROUP", HDF5 allows us to create these; and they're + // found in the wild. + std::string value = "GROUP"; + obj.write_raw(value.c_str(), datatype); + + auto actual = obj.template read(); + REQUIRE(actual == value); +} + +TEST_CASE("HighFiveSTDString (attribute, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_attribute.h5", HighFive::File::Truncate); + check_supposedly_nullterm(ForwardToAttribute(file)); +} + +TEST_CASE("HighFiveSTDString (dataset, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_dataset.h5", HighFive::File::Truncate); + check_supposedly_nullterm(ForwardToDataSet(file)); +} + TEST_CASE("HighFiveSTDString (dataset, single, short)") { File file("std_string_dataset_single_short.h5", File::Truncate); check_single_string(ForwardToDataSet(file), 3);