From 757e50e43c18d88365d1225854f23099649755c9 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 4 Jan 2021 12:47:49 -0800 Subject: [PATCH] ARROW-6582: [R] Arrow to R fails with embedded nuls in strings Closes #8365 from nealrichardson/r-nul Lead-authored-by: Benjamin Kietzman Co-authored-by: Neal Richardson Signed-off-by: Neal Richardson --- r/src/array_to_vector.cpp | 94 ++++++++++++++++++++++++++++++----- r/tests/testthat/test-Array.R | 24 ++++++++- 2 files changed, 105 insertions(+), 13 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index c9331b9f92d52..c9a1f6cf9eed0 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -18,6 +18,8 @@ #include "./arrow_types.h" #if defined(ARROW_R_WITH_ARROW) +#include + #include #include #include @@ -288,36 +290,104 @@ struct Converter_String : public Converter { } StringArrayType* string_array = static_cast(array.get()); - if (array->null_count()) { - // need to watch for nulls - arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), - array->offset(), n); + + const bool all_valid = array->null_count() == 0; + const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false); + + bool nul_was_stripped = false; + + if (all_valid) { + // no need to watch for missing strings cpp11::unwind_protect([&] { - for (int i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsSet()) { - SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); - } else { - SET_STRING_ELT(data, start + i, NA_STRING); + if (strip_out_nuls) { + for (int i = 0; i < n; i++) { + SET_STRING_ELT(data, start + i, + r_string_from_view_strip_nul(string_array->GetView(i), + &nul_was_stripped)); } + return; + } + + for (int i = 0; i < n; i++) { + SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); } }); } else { cpp11::unwind_protect([&] { - for (int i = 0; i < n; i++) { - SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); + arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(), + array->offset(), n); + + if (strip_out_nuls) { + for (int i = 0; i < n; i++, validity_reader.Next()) { + if (validity_reader.IsSet()) { + SET_STRING_ELT(data, start + i, + r_string_from_view_strip_nul(string_array->GetView(i), + &nul_was_stripped)); + } else { + SET_STRING_ELT(data, start + i, NA_STRING); + } + } + return; + } + + for (int i = 0; i < n; i++, validity_reader.Next()) { + if (validity_reader.IsSet()) { + SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); + } else { + SET_STRING_ELT(data, start + i, NA_STRING); + } } }); } + if (nul_was_stripped) { + cpp11::warning("Stripping '\\0' (nul) from character vector"); + } + return Status::OK(); } bool Parallel() const { return false; } private: - static SEXP r_string_from_view(const arrow::util::string_view& view) { + static SEXP r_string_from_view(arrow::util::string_view view) { return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); } + + static SEXP r_string_from_view_strip_nul(arrow::util::string_view view, + bool* nul_was_stripped) { + const char* old_string = view.data(); + + std::string stripped_string; + size_t stripped_len = 0, nul_count = 0; + + for (size_t i = 0; i < view.size(); i++) { + if (old_string[i] == '\0') { + ++nul_count; + + if (nul_count == 1) { + // first nul spotted: allocate stripped string storage + stripped_string = view.to_string(); + stripped_len = i; + } + + // don't copy old_string[i] (which is \0) into stripped_string + continue; + } + + if (nul_count > 0) { + stripped_string[stripped_len++] = old_string[i]; + } + } + + if (nul_count > 0) { + *nul_was_stripped = true; + stripped_string.resize(stripped_len); + return r_string_from_view(stripped_string); + } + + return r_string_from_view(view); + } }; class Converter_Boolean : public Converter { diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 59c6dd9866aea..126c9f62b43df 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -616,6 +616,29 @@ test_that("Array$create() handles vector -> fixed size list arrays", { ) }) +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76))), + class = c("arrow_binary", "vctrs_vctr", "list")) + expect_error(rawToChar(raws[[3]]), "nul") # See? + array_with_nul <- Array$create(raws)$cast(utf8()) + expect_error(as.vector(array_with_nul), "nul") + + options(arrow.skip_nul = TRUE) + expect_warning( + expect_identical( + as.vector(array_with_nul), + c("person", "woman", "man", "fan", "camera", "tv") + ), + "Stripping '\\\\0' \\(nul\\) from character vector" + ) +}) + test_that("Array$create() should have helpful error", { expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector") expect_error(Array$create(list(numeric(0)), list_of(int32())), "Expecting an integer vector") @@ -762,4 +785,3 @@ test_that("auto int64 conversion to int can be disabled (ARROW-10093)", { tab <- Table$create(x = a) expect_true(inherits(as.data.frame(batch)$x, "integer64")) }) -