Bump rust-polars to 0.42.0 (#1183)
Co-authored-by: eitsupi <[email protected]>
etiennebacher and eitsupi authored Aug 19, 2024
1 parent 1294fcc commit 086d6df
Showing 68 changed files with 1,725 additions and 1,461 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -119,4 +119,4 @@ Collate:
Config/rextendr/version: 0.3.1
VignetteBuilder: knitr
Config/polars/LibVersion: 0.41.1
Config/polars/RustToolchainVersion: nightly-2024-06-23
Config/polars/RustToolchainVersion: nightly-2024-07-26
25 changes: 25 additions & 0 deletions NEWS.md
@@ -4,14 +4,39 @@

### Breaking changes

- Updated rust-polars to 0.42.0 (#1183).
- `$describe_plan()` and `$describe_optimized_plan()` are removed. Use
`$explain(optimized = FALSE)` and `$explain()` instead, respectively (#1182).
- The parameter `inherit_optimization` is removed from all functions that had it
(#1183).
- In `$write_parquet()` and `$sink_parquet()`, the parameter `data_pagesize_limit`
is renamed `data_page_size` (#1183).
- The LazyFrame method `$get_optimization_toggle()` is removed, and
`$set_optimization_toggle()` is renamed `$optimization_toggle()` (#1183).
- In `$unpivot()`, the parameter `streamable` is removed (#1183).
- Some functions have a parameter `future` that determines the compatibility level
when exporting Polars' internal data structures. This parameter is renamed
`compat_level`, which takes `FALSE` for the oldest flavor (more compatible)
and `TRUE` for the newest one (less compatible). It can also take an integer
determining a specific compatibility level when more are added in the future.
For now, `future = FALSE` can be replaced by `compat_level = FALSE` (#1183).
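The `future` to `compat_level` migration above can be sketched as follows. This is a minimal, hypothetical example (it assumes the polars package is attached and uses `$write_ipc()`, one of the affected functions):

```r
library(polars)

df = pl$DataFrame(a = 1:3, b = c("x", "y", "z"))
tmp = tempfile(fileext = ".arrow")

# Before this release (no longer accepted):
# df$write_ipc(tmp, future = FALSE)

# After this release: `compat_level = FALSE` selects the oldest, most
# compatible flavor; `TRUE` selects the newest. An integer can pin a
# specific level once more are added.
df$write_ipc(tmp, compat_level = FALSE)
```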

### New features

- New method `$str$extract_many()` (#1163).
- Converting a `nanoarrow_array` with zero rows to an `RPolarsDataFrame` via
`as_polars_df()` now keeps the original schema (#1177).
- `$write_parquet()` has two new arguments `partition_by` and
`partition_chunk_size_bytes` to write a `DataFrame` to a hive-partitioned
directory (#1183).
- New method `$bin$size()` (#1183).
- In `$scan_parquet()` and `$read_parquet()`, the `parallel` argument can take
the new value `"prefiltered"` (#1183).
- `$scan_parquet()`, `$scan_ipc()` and `$read_parquet()` have a new argument
`include_file_paths` to automatically add a column containing the path to the
source file(s) (#1183).
- `$scan_ipc()` can read a hive-partitioned directory with its new arguments
`hive_partitioning`, `hive_schema`, and `try_parse_hive_dates` (#1183).
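Several of the new scan arguments above combine naturally in a single call. A sketch, assuming a hypothetical hive-partitioned directory layout (e.g. `year=2024/month=08/*.arrow`) and the polars package attached:

```r
library(polars)

# Lazily scan a hive-partitioned IPC dataset, parse the partition
# values (including dates), and record each row's source file in a
# new "source_file" column.
lf = pl$scan_ipc(
  "dataset/**/*.arrow",
  hive_partitioning = TRUE,
  try_parse_hive_dates = TRUE,
  include_file_paths = "source_file"
)
# lf$collect() materializes the DataFrame
```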

### Other changes

1 change: 0 additions & 1 deletion R/as_polars.R
@@ -172,7 +172,6 @@ as_polars_df.RPolarsLazyFrame = function(
cluster_with_columns = TRUE,
streaming = FALSE,
no_optimization = FALSE,
inherit_optimization = FALSE,
collect_in_background = FALSE) {
# capture all args and modify some to match lower level function
args = as.list(environment())
55 changes: 36 additions & 19 deletions R/dataframe__frame.R
@@ -1981,10 +1981,13 @@ DataFrame_write_csv = function(
#'
#' @inherit DataFrame_write_csv params return
#' @inheritParams LazyFrame_sink_ipc
#' @param future Setting this to `TRUE` will write Polars' internal data structures that
#' might not be available by other Arrow implementations.
#' This functionality is considered **unstable**.
#' It may be changed at any point without it being considered a breaking change.
#' @param compat_level Use a specific compatibility level when exporting Polars’
#' internal data structures. This can be:
#' * an integer indicating the compatibility version (currently only 0 for oldest
#' and 1 for newest);
#' * a logical value with `TRUE` for the newest version and `FALSE` for the oldest
#' version.
#'
#' @rdname IO_write_ipc
#' @seealso
#' - [`<DataFrame>$to_raw_ipc()`][DataFrame_to_raw_ipc]
@@ -2001,16 +2004,12 @@ DataFrame_write_ipc = function(
file,
compression = c("uncompressed", "zstd", "lz4"),
...,
future = FALSE) {
if (isTRUE(future)) {
warning("The `future` parameter of `$write_ipc()` is considered unstable.")
}

compat_level = TRUE) {
.pr$DataFrame$write_ipc(
self,
file,
compression %||% "uncompressed",
future
compat_level
) |>
unwrap("in $write_ipc():")

@@ -2019,35 +2018,53 @@


#' Write to parquet file
#'
#' @inherit DataFrame_write_csv params return
#' @inheritParams LazyFrame_sink_parquet
#' @param file File path to which the result should be written. This should be
#' a path to a directory if writing a partitioned dataset.
#' @param partition_by Column(s) to partition by. A partitioned dataset will be
#' written if this is specified.
#' @param partition_chunk_size_bytes Approximate size to split DataFrames within
#' a single partition when writing. Note this is calculated using the size of
#' the DataFrame in memory - the size of the output file may differ depending
#' on the file format / compression.
#'
#' @rdname IO_write_parquet
#'
#' @examples
#' # write table 'mtcars' from mem to parquet
#' @examplesIf requireNamespace("withr", quietly = TRUE)
#' dat = pl$DataFrame(mtcars)
#'
#' destination = tempfile(fileext = ".parquet")
#' # write data to a single parquet file
#' destination = withr::local_tempfile(fileext = ".parquet")
#' dat$write_parquet(destination)
#'
#' # write data to folder with a hive-partitioned structure
#' dest_folder = withr::local_tempdir()
#' dat$write_parquet(dest_folder, partition_by = c("gear", "cyl"))
#' list.files(dest_folder, recursive = TRUE)
DataFrame_write_parquet = function(
file,
...,
compression = "zstd",
compression_level = 3,
statistics = TRUE,
row_group_size = NULL,
data_pagesize_limit = NULL) {
data_page_size = NULL,
partition_by = NULL,
partition_chunk_size_bytes = 4294967296) {
statistics = translate_statistics(statistics) |>
unwrap("in $write_parquet():")
.pr$DataFrame$write_parquet(
self,
file,
compression,
compression_level,
statistics,
row_group_size,
data_pagesize_limit
compression = compression,
compression_level = compression_level,
statistics = statistics,
row_group_size = row_group_size,
data_page_size = data_page_size,
partition_by = partition_by,
partition_chunk_size_bytes = partition_chunk_size_bytes
) |>
unwrap("in $write_parquet():")

98 changes: 83 additions & 15 deletions R/expr__binary.R
@@ -4,37 +4,70 @@
# expr_bin_make_sub_ns = macro_new_subnamespace("^ExprBin_", "RPolarsExprBinNameSpace")


#' contains
#' Check if binaries contain a binary substring
#'
#' @param literal The binary substring to look for.
#'
#' @aliases expr_bin_contains
#' @description Check if binaries in Series contain a binary substring.
#' @keywords ExprBin
#' @param lit The binary substring to look for
#' @return Expr returning a Boolean
ExprBin_contains = function(lit) {
unwrap(.pr$Expr$bin_contains(self, lit))
#'
#' @examples
#' colors = pl$DataFrame(
#' name = c("black", "yellow", "blue"),
#' code = as_polars_series(c("\x00\x00\x00", "\xff\xff\x00", "\x00\x00\xff"))$cast(pl$Binary),
#' lit = as_polars_series(c("\x00", "\xff\x00", "\xff\xff"))$cast(pl$Binary)
#' )
#'
#' colors$select(
#' "name",
#' contains_with_lit = pl$col("code")$bin$contains("\xff"),
#' contains_with_expr = pl$col("code")$bin$contains(pl$col("lit"))
#' )
ExprBin_contains = function(literal) {
unwrap(.pr$Expr$bin_contains(self, literal))
}


#' starts_with
#' Check if values start with a binary substring
#'
#' @aliases expr_bin_starts_with
#' @description Check if values start with a binary substring.
#' @keywords ExprBin
#' @param sub Prefix substring.
#'
#' @return Expr returning a Boolean
#'
#' @examples
#' colors = pl$DataFrame(
#' name = c("black", "yellow", "blue"),
#' code = as_polars_series(c("\x00\x00\x00", "\xff\xff\x00", "\x00\x00\xff"))$cast(pl$Binary),
#' prefix = as_polars_series(c("\x00", "\xff\x00", "\xff\xff"))$cast(pl$Binary)
#' )
#'
#' colors$select(
#' "name",
#' starts_with_lit = pl$col("code")$bin$starts_with("\xff"),
#' starts_with_expr = pl$col("code")$bin$starts_with(pl$col("prefix"))
#' )
ExprBin_starts_with = function(sub) {
unwrap(.pr$Expr$bin_starts_with(self, sub))
}


#' ends_with
#' Check if values end with a binary substring
#'
#' @aliases expr_bin_ends_with
#' @description Check if values end with a binary substring.
#' @param suffix Suffix substring.
#' @keywords ExprBin
#'
#' @return Expr returning a Boolean
#'
#' @examples
#' colors = pl$DataFrame(
#' name = c("black", "yellow", "blue"),
#' code = as_polars_series(c("\x00\x00\x00", "\xff\xff\x00", "\x00\x00\xff"))$cast(pl$Binary),
#' suffix = as_polars_series(c("\x00", "\xff\x00", "\xff\xff"))$cast(pl$Binary)
#' )
#'
#' colors$select(
#' "name",
#' ends_with_lit = pl$col("code")$bin$ends_with("\xff"),
#' ends_with_expr = pl$col("code")$bin$ends_with(pl$col("suffix"))
#' )
ExprBin_ends_with = function(suffix) {
unwrap(.pr$Expr$bin_ends_with(self, suffix))
}
@@ -100,3 +133,38 @@ ExprBin_decode = function(encoding, ..., strict = TRUE) {

unwrap(res, "in $bin$decode():")
}

#' Get the size of binary values in the given unit
#'
#' @param unit Scale the returned size to the given unit. Can be `"b"`, `"kb"`,
#' `"mb"`, `"gb"`, `"tb"`, or their full names (`"kilobytes"`, etc.).
#'
#' @return [Expr][Expr_class] of data type UInt32 (for `"b"`) or Float64 (for
#' scaled units).
#'
#' @examples
#' df = pl$DataFrame(
#' name = c("black", "yellow", "blue"),
#' code_hex = as_polars_series(c("000000", "ffff00", "0000ff"))$cast(pl$Binary)
#' )
#'
#' df$with_columns(
#' n_bytes = pl$col("code_hex")$bin$size(),
#' n_kilobytes = pl$col("code_hex")$bin$size("kb")
#' )
ExprBin_size = function(unit = "b") {
sz = .pr$Expr$bin_size_bytes(self)
switch(unit,
"b" = ,
"bytes" = sz,
"kb" = ,
"kilobytes" = sz / 1024,
"mb" = ,
"megabytes" = sz / 1024**2,
"gb" = ,
"gigabytes" = sz / 1024**3,
"tb" = ,
"terabytes" = sz / 1024**4,
Err_plain("`unit` must be one of 'b', 'kb', 'mb', 'gb', 'tb'") |>
unwrap("in $bin$size():")
)
}
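The unit handling in `ExprBin_size()` above divides a byte count by powers of 1024. The same conversion in plain base R, with no polars dependency (`bytes_to_unit` is an illustrative helper, not part of the package):

```r
# Mirror of the switch above: binary (base-1024) multiples, with both
# short and long unit names falling through to the same branch.
bytes_to_unit = function(n_bytes, unit = "b") {
  switch(unit,
    "b" = ,
    "bytes" = n_bytes,
    "kb" = ,
    "kilobytes" = n_bytes / 1024,
    "mb" = ,
    "megabytes" = n_bytes / 1024^2,
    "gb" = ,
    "gigabytes" = n_bytes / 1024^3,
    "tb" = ,
    "terabytes" = n_bytes / 1024^4,
    stop("`unit` must be one of 'b', 'kb', 'mb', 'gb', 'tb'")
  )
}

bytes_to_unit(2048, "kb") # 2
bytes_to_unit(3 * 1024^2, "megabytes") # 3
```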
4 changes: 3 additions & 1 deletion R/expr__expr.R
@@ -184,10 +184,12 @@ wrap_e_result = function(e, str_to_lit = TRUE, argname = NULL) {
#' @return Expr
#' @examples .pr$env$wrap_elist_result(list(pl$lit(42), 42, 1:3))
wrap_elist_result = function(elist, str_to_lit = TRUE) {
if (!is.list(elist) && length(elist) == 1L) {
elist = list(elist)
}
element_i = 0L
result(
{
if (!is.list(elist) && length(elist) == 1L) elist = list(elist)
lapply(elist, \(e) {
element_i <<- element_i + 1L
wrap_e(e, str_to_lit)
3 changes: 2 additions & 1 deletion R/expr__meta.R
@@ -66,7 +66,8 @@ ExprMeta_neq = function(other) {
#' e2
#' e2$meta$pop()
ExprMeta_pop = function() {
.pr$Expr$meta_pop(self)
.pr$Expr$meta_pop(self) |>
unwrap("in $meta$pop():")
}


24 changes: 12 additions & 12 deletions R/extendr-wrappers.R
@@ -94,11 +94,11 @@ concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series,

new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, memory_map) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, memory_map)
import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, memory_map, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, memory_map, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths)

new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors)

new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, glob) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, glob)
new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, glob, include_file_paths) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, glob, include_file_paths)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

@@ -200,7 +200,7 @@ RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest,

RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key)

RPolarsDataFrame$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr, pl_flavor))
RPolarsDataFrame$export_stream <- function(stream_ptr, compat_level) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr, compat_level))

RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr)

@@ -222,13 +222,13 @@ RPolarsDataFrame$clear <- function() .Call(wrap__RPolarsDataFrame__clear, self)

RPolarsDataFrame$write_csv <- function(file, include_bom, include_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__RPolarsDataFrame__write_csv, self, file, include_bom, include_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style)

RPolarsDataFrame$write_ipc <- function(file, compression, future) .Call(wrap__RPolarsDataFrame__write_ipc, self, file, compression, future)
RPolarsDataFrame$write_ipc <- function(file, compression, compat_level) .Call(wrap__RPolarsDataFrame__write_ipc, self, file, compression, compat_level)

RPolarsDataFrame$to_raw_ipc <- function(compression, future) .Call(wrap__RPolarsDataFrame__to_raw_ipc, self, compression, future)
RPolarsDataFrame$to_raw_ipc <- function(compression, compat_level) .Call(wrap__RPolarsDataFrame__to_raw_ipc, self, compression, compat_level)

RPolarsDataFrame$from_raw_ipc <- function(bits, n_rows, row_name, row_index) .Call(wrap__RPolarsDataFrame__from_raw_ipc, bits, n_rows, row_name, row_index)

RPolarsDataFrame$write_parquet <- function(file, compression_method, compression_level, statistics, row_group_size, data_pagesize_limit) .Call(wrap__RPolarsDataFrame__write_parquet, self, file, compression_method, compression_level, statistics, row_group_size, data_pagesize_limit)
RPolarsDataFrame$write_parquet <- function(file, compression_method, compression_level, statistics, row_group_size, data_page_size, partition_by, partition_chunk_size_bytes) .Call(wrap__RPolarsDataFrame__write_parquet, self, file, compression_method, compression_level, statistics, row_group_size, data_page_size, partition_by, partition_chunk_size_bytes)

RPolarsDataFrame$write_json <- function(file, pretty, row_oriented) .Call(wrap__RPolarsDataFrame__write_json, self, file, pretty, row_oriented)

@@ -1120,6 +1120,8 @@ RPolarsExpr$bin_hex_decode <- function(strict) .Call(wrap__RPolarsExpr__bin_hex_

RPolarsExpr$bin_base64_decode <- function(strict) .Call(wrap__RPolarsExpr__bin_base64_decode, self, strict)

RPolarsExpr$bin_size_bytes <- function() .Call(wrap__RPolarsExpr__bin_size_bytes, self)

RPolarsExpr$struct_field_by_name <- function(name) .Call(wrap__RPolarsExpr__struct_field_by_name, self, name)

RPolarsExpr$struct_rename_fields <- function(names) .Call(wrap__RPolarsExpr__struct_rename_fields, self, names)
@@ -1190,7 +1192,7 @@ RPolarsLazyFrame$serialize <- function() .Call(wrap__RPolarsLazyFrame__serialize

RPolarsLazyFrame$deserialize <- function(json) .Call(wrap__RPolarsLazyFrame__deserialize, json)

RPolarsLazyFrame$sink_parquet <- function(path, compression_method, compression_level, statistics, row_group_size, data_pagesize_limit, maintain_order) .Call(wrap__RPolarsLazyFrame__sink_parquet, self, path, compression_method, compression_level, statistics, row_group_size, data_pagesize_limit, maintain_order)
RPolarsLazyFrame$sink_parquet <- function(path, compression_method, compression_level, statistics, row_group_size, data_page_size, maintain_order) .Call(wrap__RPolarsLazyFrame__sink_parquet, self, path, compression_method, compression_level, statistics, row_group_size, data_page_size, maintain_order)

RPolarsLazyFrame$sink_ipc <- function(path, compression, maintain_order) .Call(wrap__RPolarsLazyFrame__sink_ipc, self, path, compression, maintain_order)

@@ -1260,17 +1262,15 @@ RPolarsLazyFrame$join <- function(other, left_on, right_on, how, validate, join_

RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order, multithreaded)

RPolarsLazyFrame$unpivot <- function(on, index, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__unpivot, self, on, index, value_name, variable_name, streamable)
RPolarsLazyFrame$unpivot <- function(on, index, value_name, variable_name) .Call(wrap__RPolarsLazyFrame__unpivot, self, on, index, value_name, variable_name)

RPolarsLazyFrame$rename <- function(existing, new) .Call(wrap__RPolarsLazyFrame__rename, self, existing, new)

RPolarsLazyFrame$schema <- function() .Call(wrap__RPolarsLazyFrame__schema, self)

RPolarsLazyFrame$fetch <- function(n_rows) .Call(wrap__RPolarsLazyFrame__fetch, self, n_rows)

RPolarsLazyFrame$set_optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, streaming, eager) .Call(wrap__RPolarsLazyFrame__set_optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, streaming, eager)

RPolarsLazyFrame$get_optimization_toggle <- function() .Call(wrap__RPolarsLazyFrame__get_optimization_toggle, self)
RPolarsLazyFrame$optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, streaming, eager) .Call(wrap__RPolarsLazyFrame__optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, streaming, eager)

RPolarsLazyFrame$profile <- function() .Call(wrap__RPolarsLazyFrame__profile, self)

@@ -1408,7 +1408,7 @@ RPolarsSeries$set_sorted_mut <- function(descending) invisible(.Call(wrap__RPola

RPolarsSeries$struct_fields <- function() .Call(wrap__RPolarsSeries__struct_fields, self)

RPolarsSeries$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsSeries__export_stream, self, stream_ptr, pl_flavor))
RPolarsSeries$export_stream <- function(stream_ptr, compat_level) invisible(.Call(wrap__RPolarsSeries__export_stream, self, stream_ptr, compat_level))

RPolarsSeries$import_stream <- function(name, stream_ptr) .Call(wrap__RPolarsSeries__import_stream, name, stream_ptr)
