Skip to content

Commit

Permalink
ARROW-3308: [R] Convert R character vector with data exceeding 2GB to…
Browse files Browse the repository at this point in the history
… Large type

Closes apache#7611 from nealrichardson/large-char

Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
  • Loading branch information
nealrichardson authored and wesm committed Jul 10, 2020
1 parent d792661 commit fa85a63
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 24 deletions.
3 changes: 2 additions & 1 deletion r/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@

# Package version, read from the Version field of the DESCRIPTION file
VERSION=$(shell grep ^Version DESCRIPTION | sed s/Version:\ //)
# Value exported as ARROW_R_DEV for the install and test commands below
ARROW_R_DEV="TRUE"
# Large-memory tests default to the same setting as ARROW_R_DEV
ARROW_LARGE_MEMORY_TESTS=$(ARROW_R_DEV)

# Regenerate the roxygen2 documentation and stage the man/*.Rd files;
# the leading "-" lets make continue if the git command fails
doc:
R -s -e 'roxygen2::roxygenize()'
-git add --all man/*.Rd

# Install the package together with its tests, then run the installed test
# suite with testthat. The "file" variable is passed as the test filter and
# "r" selects the reporter ("summary" is used when "r" is empty).
test:
export ARROW_R_DEV=$(ARROW_R_DEV) && R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile .
export NOT_CRAN=true && export ARROW_R_DEV=$(ARROW_R_DEV) && export AWS_EC2_METADATA_DISABLED=TRUE && R -s -e 'library(testthat); setwd(file.path(.libPaths()[1], "arrow", "tests")); system.time(test_check("arrow", filter="${file}", reporter=ifelse(nchar("${r}"), "${r}", "summary")))'
export NOT_CRAN=true && export ARROW_R_DEV=$(ARROW_R_DEV) && export AWS_EC2_METADATA_DISABLED=TRUE && export ARROW_LARGE_MEMORY_TESTS=$(ARROW_LARGE_MEMORY_TESTS) && R -s -e 'library(testthat); setwd(file.path(.libPaths()[1], "arrow", "tests")); system.time(test_check("arrow", filter="${file}", reporter=ifelse(nchar("${r}"), "${r}", "summary")))'

# Install devtools and the package's development dependencies into the
# library named by R_LIB (falling back to the first entry of .libPaths())
deps:
R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)'
Expand Down
21 changes: 20 additions & 1 deletion r/src/array_from_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,25 @@ std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<REALSXP>(SEXP x) {
return float64();
}

// Infer the Arrow string type for an R character vector.
//
// Adds up the UTF-8 byte length of every non-NA element of `x` and returns
// large_utf8() as soon as the running total exceeds arrow::kBinaryMemoryLimit
// (the 2GB capacity of the utf8 type); otherwise returns utf8().
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<STRSXP>(SEXP x) {
  const R_xlen_t n_strings = XLENGTH(x);
  int64_t total_bytes = 0;

  for (R_xlen_t idx = 0; idx < n_strings; ++idx) {
    SEXP element = STRING_ELT(x, idx);
    if (element == NA_STRING) {
      // NA elements are not counted towards the total
      continue;
    }

    // Measure the element after translating it to UTF-8
    SEXP element_utf8 = Rf_mkCharCE(Rf_translateCharUTF8(element), CE_UTF8);
    total_bytes += XLENGTH(element_utf8);

    // The total only grows here, so this is the only place the limit can be
    // crossed; stop scanning as soon as it is
    if (total_bytes > arrow::kBinaryMemoryLimit) {
      return large_utf8();
    }
  }

  return utf8();
}

static inline std::shared_ptr<arrow::DataType> InferArrowTypeFromDataFrame(SEXP x) {
R_xlen_t n = XLENGTH(x);
SEXP names = Rf_getAttrib(x, R_NamesSymbol);
Expand Down Expand Up @@ -1205,7 +1224,7 @@ std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
case RAWSXP:
return int8();
case STRSXP:
return utf8();
return InferArrowTypeFromVector<STRSXP>(x);
case VECSXP:
return InferArrowTypeFromVector<VECSXP>(x);
default:
Expand Down
5 changes: 5 additions & 0 deletions r/tests/testthat/helper-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,8 @@ example_with_times <- tibble::tibble(
posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10),
posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10)
)

make_big_string <- function() {
  # Four random lowercase strings of 2047 to 2050 characters (8194 in total),
  # repeated 2^18 times: just over 2^31 bytes of string data, which would
  # exceed the capacity of BinaryArray
  random_letters <- function(n_chars) {
    paste(sample(letters, n_chars, replace = TRUE), collapse = "")
  }
  rep(vapply(2047:2050, random_letters, character(1)), 2^18)
}
7 changes: 7 additions & 0 deletions r/tests/testthat/helper-skip.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,10 @@ skip_if_not_dev_mode <- function() {
"environment variable ARROW_R_DEV"
)
}

skip_if_not_running_large_memory_tests <- function() {
  # Opt-in switch: the tests run only when ARROW_LARGE_MEMORY_TESTS is set to
  # "true" (compared case-insensitively); otherwise the calling test is skipped
  flag <- tolower(Sys.getenv("ARROW_LARGE_MEMORY_TESTS"))
  skip_if_not(
    identical(flag, "true"),
    "environment variable ARROW_LARGE_MEMORY_TESTS"
  )
}
30 changes: 17 additions & 13 deletions r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,13 @@ test_that("Array supports character vectors (ARROW-3339)", {
expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())
})

test_that("Character vectors > 2GB become large_utf8", {
  # Opt-in only: skipped on CRAN and unless large-memory tests are enabled
  skip_on_cran()
  skip_if_not_running_large_memory_tests()

  # With no explicit type requested, the array should come out as large_utf8
  oversized_chr <- make_big_string()
  expect_array_roundtrip(oversized_chr, large_utf8())
})

test_that("empty arrays are supported", {
expect_array_roundtrip(character(), utf8())
expect_array_roundtrip(character(), large_utf8(), as = large_utf8())
Expand Down Expand Up @@ -374,12 +381,7 @@ test_that("Array$create() does not convert doubles to integer", {
for (type in c(int_types, uint_types)) {
a <- Array$create(10, type = type)
expect_type_equal(a$type, type)

# exception for now because we cannot handle
# unsigned 64 bit integers yet
if (type != uint64()) {
expect_true(as.vector(a) == 10L)
}
expect_true(as.vector(a) == 10L)
}
})

Expand All @@ -388,13 +390,15 @@ test_that("Array$create() converts raw vectors to uint8 arrays (ARROW-3794)", {
})

test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", {
# Every value from -128 to 127, cast to int8, should come back unchanged
a <- Array$create((-128):127)$cast(int8())
expect_equal(a$type, int8())
expect_equal(a$as_vector(), (-128):127)

# Every value from 0 to 255, cast to uint8, should come back unchanged
a <- Array$create(0:255)$cast(uint8())
expect_equal(a$type, uint8())
expect_equal(a$as_vector(), 0:255)
# Same int8 check, using expect_type_equal() and as.vector()
i8 <- (-128):127
a <- Array$create(i8)$cast(int8())
expect_type_equal(a, int8())
expect_equal(as.vector(a), i8)

# Same uint8 check, using expect_type_equal() and as.vector()
u8 <- 0:255
a <- Array$create(u8)$cast(uint8())
expect_type_equal(a, uint8())
expect_equal(as.vector(a), u8)
})

test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", {
Expand Down
10 changes: 10 additions & 0 deletions r/tests/testthat/test-feather.R
Original file line number Diff line number Diff line change
Expand Up @@ -185,4 +185,14 @@ test_that("read_feather closes connection to file", {
expect_false(file.exists(tf))
})

test_that("Character vectors > 2GB can write to feather", {
  # Opt-in only: skipped on CRAN and unless large-memory tests are enabled
  skip_on_cran()
  skip_if_not_running_large_memory_tests()

  big_df <- tibble::tibble(big = make_big_string())

  # Write to a temporary file that is removed when the test exits
  feather_path <- tempfile()
  on.exit(unlink(feather_path))
  write_feather(big_df, feather_path)

  # Reading the file back must reproduce the original data frame exactly
  expect_identical(read_feather(feather_path), big_df)
})

unlink(feather_file)
20 changes: 11 additions & 9 deletions r/vignettes/arrow.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,21 @@ In the tables, entries with a `-` are not currently implemented.
| logical | boolean |
| integer | int32 |
| double ("numeric") | float64 |
| character | utf8 |
| character | utf8^1^ |
| factor | dictionary |
| raw | uint8 |
| Date | date32 |
| POSIXct | timestamp |
| POSIXlt | struct |
| data.frame | struct |
| list^+^ | list |
| list^2^ | list |
| bit64::integer64 | int64 |
| difftime | time32 |
| vctrs::vctrs_unspecified | null |

^+^: Only lists where all elements are the same type are able to be translated to Arrow list type (which is a "list of" some type).
^1^: If the total size of the strings in a character vector exceeds 2GB, it is converted to the `large_utf8` Arrow type.

^2^: Only lists where all elements are the same type are able to be translated to Arrow list type (which is a "list of" some type).

### Arrow to R

Expand All @@ -109,11 +111,11 @@ In the tables, entries with a `-` are not currently implemented.
| int8 | integer |
| int16 | integer |
| int32 | integer |
| int64 | integer^++^ |
| int64 | integer^3^ |
| uint8 | integer |
| uint16 | integer |
| uint32 | integer^++^ |
| uint64 | integer^++^ |
| uint32 | integer^3^ |
| uint64 | integer^3^ |
| float16 | - |
| float32 | double |
| float64 | double |
Expand All @@ -127,7 +129,7 @@ In the tables, entries with a `-` are not currently implemented.
| timestamp | POSIXct |
| duration | - |
| decimal | double |
| dictionary | factor^+++^ |
| dictionary | factor^4^ |
| list | vctrs::vctrs_list_of |
| fixed_size_list | - |
| struct | data.frame |
Expand All @@ -138,9 +140,9 @@ In the tables, entries with a `-` are not currently implemented.
| large_binary | vctrs::vctrs_list_of(raw) |
| large_list | vctrs::vctrs_list_of |

^++^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`.
^3^: These integer types may contain values that exceed the range of R's `integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are converted to `double` ("numeric") and `int64` is converted to `bit64::integer64`.

^+++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.
^4^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings.

### R object attributes

Expand Down

0 comments on commit fa85a63

Please sign in to comment.