Skip to content

Commit

Permalink
ARROW-6582: [R] Arrow to R fails with embedded nuls in strings
Browse files Browse the repository at this point in the history
Closes apache#8365 from nealrichardson/r-nul

Lead-authored-by: Benjamin Kietzman <[email protected]>
Co-authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
bkietz and nealrichardson committed Jan 4, 2021
1 parent f7d47a3 commit 757e50e
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 13 deletions.
94 changes: 82 additions & 12 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "./arrow_types.h"
#if defined(ARROW_R_WITH_ARROW)

#include <type_traits>

#include <arrow/array.h>
#include <arrow/builder.h>
#include <arrow/datum.h>
Expand Down Expand Up @@ -288,36 +290,104 @@ struct Converter_String : public Converter {
}

StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
if (array->null_count()) {
// need to watch for nulls
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);

const bool all_valid = array->null_count() == 0;
const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);

bool nul_was_stripped = false;

if (all_valid) {
// no need to watch for missing strings
cpp11::unwind_protect([&] {
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsSet()) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
if (strip_out_nuls) {
for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i,
r_string_from_view_strip_nul(string_array->GetView(i),
&nul_was_stripped));
}
return;
}

for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
}
});
} else {
cpp11::unwind_protect([&] {
for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(),
array->offset(), n);

if (strip_out_nuls) {
for (int i = 0; i < n; i++, validity_reader.Next()) {
if (validity_reader.IsSet()) {
SET_STRING_ELT(data, start + i,
r_string_from_view_strip_nul(string_array->GetView(i),
&nul_was_stripped));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
return;
}

for (int i = 0; i < n; i++, validity_reader.Next()) {
if (validity_reader.IsSet()) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
});
}

if (nul_was_stripped) {
cpp11::warning("Stripping '\\0' (nul) from character vector");
}

return Status::OK();
}

bool Parallel() const { return false; }

private:
static SEXP r_string_from_view(const arrow::util::string_view& view) {
static SEXP r_string_from_view(arrow::util::string_view view) {
return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8);
}

static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
bool* nul_was_stripped) {
const char* old_string = view.data();

std::string stripped_string;
size_t stripped_len = 0, nul_count = 0;

for (size_t i = 0; i < view.size(); i++) {
if (old_string[i] == '\0') {
++nul_count;

if (nul_count == 1) {
// first nul spotted: allocate stripped string storage
stripped_string = view.to_string();
stripped_len = i;
}

// don't copy old_string[i] (which is \0) into stripped_string
continue;
}

if (nul_count > 0) {
stripped_string[stripped_len++] = old_string[i];
}
}

if (nul_count > 0) {
*nul_was_stripped = true;
stripped_string.resize(stripped_len);
return r_string_from_view(stripped_string);
}

return r_string_from_view(view);
}
};

class Converter_Boolean : public Converter {
Expand Down
24 changes: 23 additions & 1 deletion r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,29 @@ test_that("Array$create() handles vector -> fixed size list arrays", {
)
})

test_that("Handling string data with embedded nuls", {
raws <- structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
expect_error(rawToChar(raws[[3]]), "nul") # See?
array_with_nul <- Array$create(raws)$cast(utf8())
expect_error(as.vector(array_with_nul), "nul")

options(arrow.skip_nul = TRUE)
expect_warning(
expect_identical(
as.vector(array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\\\0' \\(nul\\) from character vector"
)
})

test_that("Array$create() should have helpful error", {
expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector")
expect_error(Array$create(list(numeric(0)), list_of(int32())), "Expecting an integer vector")
Expand Down Expand Up @@ -762,4 +785,3 @@ test_that("auto int64 conversion to int can be disabled (ARROW-10093)", {
tab <- Table$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))
})

0 comments on commit 757e50e

Please sign in to comment.