feat: import_stream internal method for Series to support Arrow C stream interface #1078

Merged 27 commits on May 8, 2024.
Changes shown below are from 11 of the 27 commits.
Commits (27)
- `98a81ec` refactor: import_stream method for Series (eitsupi, May 5, 2024)
- `16ec40f` test: add test (eitsupi, May 5, 2024)
- `9f095c1` test: more tests for nanoarrow (eitsupi, May 6, 2024)
- `2b976c1` refactor: handle name in rust side (eitsupi, May 6, 2024)
- `c6b4ee7` refactor!: use the Arrow C Stream interface inside `as_polars_df(<arr… (eitsupi, May 6, 2024)
- `8de1e21` refactor: simplify type check for struct nanoarrow_array_stream (eitsupi, May 6, 2024)
- `bbba938` feat: as_polars_df for arrow::RecordBatchReader (eitsupi, May 6, 2024)
- `350689c` refactor!: rewrite `as_polars_df` to use Arrow C Stream interface (eitsupi, May 6, 2024)
- `e4fdaad` docs(news): add bluets (eitsupi, May 6, 2024)
- `f7d917a` Merge remote-tracking branch 'upstream/main' into import_stream (eitsupi, May 6, 2024)
- `a70fbf6` test: ensure auto rechunk (eitsupi, May 6, 2024)
- `a543e95` test: add comment about the test case (eitsupi, May 6, 2024)
- `4cab33b` docs: update examples to remove removed options (eitsupi, May 6, 2024)
- `fa5e600` Revert "refactor!: rewrite `as_polars_df` to use Arrow C Stream inter… (eitsupi, May 6, 2024)
- `e04762c` feat: add the experimental argument to use the import_stream method i… (eitsupi, May 6, 2024)
- `fe06f29` feat: re add `$from_arrow_record_batches` (eitsupi, May 7, 2024)
- `2fde92a` feat: add `experimental` option to use C stream interface (eitsupi, May 7, 2024)
- `303ea0b` test: re add tests for as_polars_df(<ArrowTabular>) (eitsupi, May 7, 2024)
- `89fc55c` test: update snapshot (eitsupi, May 7, 2024)
- `e3b9567` chore: auto formatting (eitsupi, May 7, 2024)
- `5a97690` docs(news): update NEWS about C stream interface (eitsupi, May 7, 2024)
- `2b5e8a6` refactor: simplify (eitsupi, May 7, 2024)
- `038afb6` fix: `experimental` argument for as_polars_df(<nanoarrow_array_stream… (eitsupi, May 8, 2024)
- `b2277fb` docs(news): more notes about the Arrow C stream interface (eitsupi, May 8, 2024)
- `db284af` test: skip judges should be done inside `.cases` (eitsupi, May 8, 2024)
- `ccb51e7` refactor: simplify (eitsupi, May 8, 2024)
- `b6cc521` test: add more tests for the experimental argument (eitsupi, May 8, 2024)
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -157,6 +157,7 @@ S3method(as_polars_df,RPolarsLazyFrame)
 S3method(as_polars_df,RPolarsLazyGroupBy)
 S3method(as_polars_df,RPolarsRollingGroupBy)
 S3method(as_polars_df,RPolarsSeries)
+S3method(as_polars_df,RecordBatchReader)
 S3method(as_polars_df,data.frame)
 S3method(as_polars_df,default)
 S3method(as_polars_df,nanoarrow_array)
@@ -171,6 +172,7 @@ S3method(as_polars_series,RPolarsChainedThen)
 S3method(as_polars_series,RPolarsExpr)
 S3method(as_polars_series,RPolarsSeries)
 S3method(as_polars_series,RPolarsThen)
+S3method(as_polars_series,RecordBatchReader)
 S3method(as_polars_series,clock_sys_time)
 S3method(as_polars_series,clock_time_point)
 S3method(as_polars_series,clock_zoned_time)
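With these registrations, the new conversions become reachable through ordinary S3 dispatch once polars is loaded. A minimal sketch (assuming a build of this branch):

```r
library(polars)

# The RecordBatchReader methods should now appear among the registered methods:
methods(as_polars_df)
methods(as_polars_series)
```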
9 changes: 9 additions & 0 deletions NEWS.md
@@ -2,6 +2,13 @@

 ## Polars R Package (development version)

+### Breaking changes
+
+- The following arguments are removed from `as_polars_df(<ArrowTabular>)` (#1078).
+  - `schema` and `schema_overrides`. Use the `<DataFrame>$select()` method and
+    the `<Expr>$cast()` method after conversion to `DataFrame` instead.
+  - `rechunk`. All chunks are automatically rechunked now.
+
 ### New features

 - `pl$read_ipc()` can read a raw vector of Apache Arrow IPC file (#1072).
@@ -15,6 +22,8 @@
 - New S3 methods `nanoarrow::as_nanoarrow_array_stream()` and `nanoarrow::infer_nanoarrow_schema()`
   for `RPolarsSeries` (#1076).
 - New method `$dt$is_leap_year()` (#1077).
+- `as_polars_series()` and `as_polars_df()` can create polars objects from `arrow::RecordBatchReader`
+  via the Apache Arrow C stream interface (#1078).

 ## Polars R Package 0.16.3
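To make the breaking change concrete, here is a minimal migration sketch (not part of the PR; `at` is a hypothetical `arrow::Table` built from `mtcars`):

```r
at = arrow::arrow_table(mtcars)

# Before (polars <= 0.16.3):
# df = as_polars_df(at, schema_overrides = list(mpg = pl$Float32), rechunk = TRUE)

# Now: convert first, then cast via $select(); rechunking happens automatically.
df = as_polars_df(at)$select(
  pl$col("mpg")$cast(pl$Float32),
  pl$col("cyl") # keep any other needed columns by selecting them explicitly
)
```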
88 changes: 38 additions & 50 deletions R/as_polars.R
@@ -65,6 +65,11 @@ as_polars_df.default = function(x, ...) {
 #' @param make_names_unique A logical flag to replace duplicated column names
 #'   with unique names. If `FALSE` and there are duplicated column names, an
 #'   error is thrown.
+#' @param schema named list of DataTypes, or character vector of column names.
+#'   Should match the number of columns in `x` and correspond to each column in `x` by position.
+#'   If a column in `x` does not match the name or type at the same position, it will be renamed/recast.
+#'   If `NULL` (default), convert columns as is.
+#' @param schema_overrides named list of DataTypes. Cast some columns to the DataType.
 #' @inheritParams as_polars_df.ArrowTabular
 #' @export
 as_polars_df.data.frame = function(
@@ -199,26 +204,16 @@ as_polars_df.RPolarsLazyGroupBy = function(x, ...) {

 # TODO: link to DataTypes documents
 #' @rdname as_polars_df
-#' @param rechunk A logical flag (default `TRUE`).
-#'   Make sure that all data of each column is in contiguous memory.
-#' @param schema named list of DataTypes, or character vector of column names.
-#'   Should match the number of columns in `x` and correspond to each column in `x` by position.
-#'   If a column in `x` does not match the name or type at the same position, it will be renamed/recast.
-#'   If `NULL` (default), convert columns as is.
-#' @param schema_overrides named list of DataTypes. Cast some columns to the DataType.
 #' @export
-as_polars_df.ArrowTabular = function(
-    x,
-    ...,
-    rechunk = TRUE,
-    schema = NULL,
-    schema_overrides = NULL) {
-  arrow_to_rpldf(
-    x,
-    rechunk = rechunk,
-    schema = schema,
-    schema_overrides = schema_overrides
-  )
+as_polars_df.ArrowTabular = function(x, ...) {
+  as_polars_df.RecordBatchReader(arrow::as_record_batch_reader(x))
 }
+
+
+#' @rdname as_polars_df
+#' @export
+as_polars_df.RecordBatchReader = function(x, ...) {
+  as_polars_series.RecordBatchReader(x, name = "")$to_frame()$unnest("")
+}


@@ -234,33 +229,19 @@ as_polars_df.nanoarrow_array = function(x, ...) {
       unwrap("in as_polars_df(<nanoarrow_array>):")
   }

-  series = as_polars_series.nanoarrow_array(x, name = NULL)
-
-  if (length(series)) {
-    series$to_frame()$unnest("")
-  } else {
-    # TODO: support 0-length array
-    pl$DataFrame()
-  }
+  as_polars_series.nanoarrow_array(x, name = "")$to_frame()$unnest("")
 }


 #' @rdname as_polars_df
 #' @export
 as_polars_df.nanoarrow_array_stream = function(x, ...) {
-  if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) {
+  if (!identical(nanoarrow::nanoarrow_schema_parse(x$get_schema())$type, "struct")) {
     Err_plain("Can't convert non-struct array stream to RPolarsDataFrame") |>
       unwrap("in as_polars_df(<nanoarrow_array_stream>):")
   }

-  series = as_polars_series.nanoarrow_array_stream(x, name = NULL)
-
-  if (length(series)) {
-    series$to_frame()$unnest("")
-  } else {
-    # TODO: support 0-length array stream
-    pl$DataFrame()
-  }
+  as_polars_series.nanoarrow_array_stream(x, name = "")$to_frame()$unnest("")
 }


@@ -397,6 +378,20 @@ as_polars_series.Array = function(x, name = NULL, ..., rechunk = TRUE) {
 as_polars_series.ChunkedArray = as_polars_series.Array


+#' @rdname as_polars_series
+#' @export
+as_polars_series.RecordBatchReader = function(x, name = NULL, ...) {
+  stream_out = polars_allocate_array_stream()
+  x$export_to_c(stream_out)
+
+  .pr$Series$import_stream(
+    name %||% "",
+    stream_out
+  ) |>
+    unwrap("in as_polars_series(<RecordBatchReader>):")
+}
+
+
 #' @rdname as_polars_series
 #' @export
 as_polars_series.nanoarrow_array = function(x, name = NULL, ...) {
@@ -411,21 +406,14 @@ as_polars_series.nanoarrow_array = function(x, name = NULL, ...) {
 as_polars_series.nanoarrow_array_stream = function(x, name = NULL, ...) {
   on.exit(x$release())

-  list_of_arrays = nanoarrow::collect_array_stream(x, validate = FALSE)
-
-  if (length(list_of_arrays) < 1L) {
-    # TODO: support 0-length array stream
-    out = pl$Series(name = name)
-  } else {
-    out = as_polars_series.nanoarrow_array(list_of_arrays[[1L]], name = name)
-    lapply(
-      list_of_arrays[-1L],
-      \(array) .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array))
-    ) |>
-      invisible()
-  }
-
-  out
+  stream_out = polars_allocate_array_stream()
+  nanoarrow::nanoarrow_pointer_export(x, stream_out)
+
+  .pr$Series$import_stream(
+    name %||% "",
+    stream_out
+  ) |>
+    unwrap("in as_polars_series(<nanoarrow_array_stream>):")
 }


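Taken together, the new S3 methods let both arrow and nanoarrow objects flow through the same C-stream import. A usage sketch (assuming the arrow and nanoarrow packages are installed; `mtcars` is just example data):

```r
library(polars)

reader = arrow::as_record_batch_reader(arrow::arrow_table(mtcars))
df = as_polars_df(reader) # consumes the reader via the Arrow C stream interface

# A reader can only be consumed once, so create a fresh one for the Series
# path; the result is a struct-typed Series whose name defaults to "".
reader2 = arrow::as_record_batch_reader(arrow::arrow_table(mtcars))
s = as_polars_series(reader2)

# nanoarrow array streams take the same route and are released afterwards:
stream = nanoarrow::as_nanoarrow_array_stream(data.frame(x = 1:3))
df2 = as_polars_df(stream)
```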
101 changes: 1 addition & 100 deletions R/construction.R
@@ -1,102 +1,3 @@
-#' Internal function of `as_polars_df()` for `arrow::Table` class objects.
-#'
-#' This is a copy of Python Polars' `arrow_to_pydf` function.
-#' @param at arrow::ArrowTabular (arrow::Table and arrow::RecordBatch)
-#' @param rechunk A logical flag (default `TRUE`).
-#'   Make sure that all data of each column is in contiguous memory.
-#' @param schema named list of DataTypes, or character vector of column names.
-#'   Should be the same length as the number of columns of `x`.
-#'   If schema names or types do not match `x`, the columns will be renamed/recast.
-#'   If `NULL` (default), convert columns as is.
-#' @param schema_overrides named list of DataTypes. Cast some columns to the DataType.
-#' @noRd
-#' @return RPolarsDataFrame
-arrow_to_rpldf = function(at, schema = NULL, schema_overrides = NULL, rechunk = TRUE) {
-  # new column names by schema, #todo get names if schema not NULL
-  n_cols = at$num_columns
-
-  new_schema = unpack_schema(
-    schema = schema %||% names(at),
-    schema_overrides = schema_overrides
-  )
-  col_names = names(new_schema)
-
-  if (length(col_names) != n_cols) {
-    Err_plain("schema length does not match column length") |>
-      unwrap()
-  }
-
-  data_cols = list()
-  # dictionaries cannot be built in different batches (categorical does not allow
-  # that) so we rechunk them and create them separately.
-  # struct columns don't work properly if they contain multiple chunks.
-  special_cols = list()
-
-  ## iter over columns, possibly do special conversion
-  for (i in seq_len(n_cols)) {
-    column = at$column(i - 1L)
-    col_name = col_names[i]
-
-    if (is_arrow_dictionary(column)) {
-      column = coerce_arrow(column)
-      special_cols[[col_name]] = as_polars_series.ChunkedArray(column, col_name, rechunk = rechunk)
-    } else if (is_arrow_struct(column) && column$num_chunks > 1L) {
-      special_cols[[col_name]] = as_polars_series.ChunkedArray(column, col_name, rechunk = rechunk)
-    } else {
-      data_cols[[col_name]] = column
-    }
-  }
-
-  if (length(data_cols)) {
-    tbl = do.call(arrow::arrow_table, data_cols)
-
-    if (tbl$num_rows == 0L) {
-      rdf = pl$DataFrame() # TODO: support creating 0-row DataFrame
-    } else {
-      rdf = unwrap(
-        .pr$DataFrame$from_arrow_record_batches(arrow::as_record_batch_reader(tbl)$batches())
-      )
-    }
-  } else {
-    rdf = pl$DataFrame()
-  }
-
-  if (rechunk) {
-    rdf = rdf$select(pl$all()$rechunk())
-  }
-
-  if (length(special_cols)) {
-    rdf = rdf$with_columns(
-      unname(lapply(special_cols, \(s) pl$lit(s)$alias(s$name)))
-    )$select(
-      pl$col(col_names)
-    )
-  }
-
-  # cast any imported arrow fields not matching schema
-  cast_these_fields = mapply(
-    new_schema,
-    rdf$schema,
-    FUN = \(new_field, df_field) {
-      if (is.null(new_field) || new_field == df_field) NULL else new_field
-    },
-    SIMPLIFY = FALSE
-  ) |> (\(l) l[!sapply(l, is.null)])()
-
-  if (length(cast_these_fields)) {
-    rdf = rdf$with_columns(
-      mapply(
-        cast_these_fields,
-        names(cast_these_fields),
-        FUN = \(dtype, name) pl$col(name)$cast(dtype),
-        SIMPLIFY = FALSE
-      ) |> unname()
-    )
-  }
-
-  rdf
-}
-
 unpack_schema = function(
     schema = NULL, # char vector of names or 'schema' a named list of DataTypes
     schema_overrides = NULL # named list of DataTypes
@@ -206,7 +107,7 @@ arrow_to_rseries_result = function(name, values, rechunk = TRUE) {

 #' Internal function of `as_polars_df()` for `data.frame` class objects.
 #'
-#' This is a copy of `arrow_to_rpldf`
+#' This is a copy of Python Polars' `arrow_to_pydf` function.
 #' @noRd
 #' @return RPolarsDataFrame
 df_to_rpldf = function(x, ..., schema = NULL, schema_overrides = NULL) {
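The deleted `arrow_to_rpldf()` special-cased dictionary columns and multi-chunk struct columns on the R side; with the C stream import those cases are handled while the Rust side consumes the stream. A hedged sketch of the dictionary case that motivated `special_cols` (the expected mapping is Arrow dictionary to polars Categorical):

```r
library(polars)

# factors become Arrow dictionary columns:
tbl = arrow::arrow_table(f = factor(c("a", "b", "a")))

# importing through the stream should yield a Categorical column without the
# per-column R-side handling that arrow_to_rpldf() used to do:
as_polars_df(tbl)$schema
```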
12 changes: 3 additions & 9 deletions R/extendr-wrappers.R
@@ -10,6 +10,8 @@
 #' @useDynLib polars, .registration = TRUE
 NULL

+polars_allocate_array_stream <- function() .Call(wrap__polars_allocate_array_stream)
+
 all_horizontal <- function(dotdotdot) .Call(wrap__all_horizontal, dotdotdot)

 any_horizontal <- function(dotdotdot) .Call(wrap__any_horizontal, dotdotdot)
@@ -58,12 +60,6 @@ struct_ <- function(exprs, eager, schema) .Call(wrap__struct_, exprs, eager, sch

 dtype_str_repr <- function(dtype) .Call(wrap__dtype_str_repr, dtype)

-new_arrow_stream <- function() .Call(wrap__new_arrow_stream)
-
-arrow_stream_to_df <- function(robj_str) .Call(wrap__arrow_stream_to_df, robj_str)
-
-arrow_stream_to_series <- function(robj_str) .Call(wrap__arrow_stream_to_series, robj_str)
-
 mem_address <- function(robj) .Call(wrap__mem_address, robj)

 clone_robj <- function(robj) .Call(wrap__clone_robj, robj)
@@ -204,8 +200,6 @@ RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call

 RPolarsDataFrame$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr, pl_flavor))

-RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr)
-
 RPolarsDataFrame$estimated_size <- function() .Call(wrap__RPolarsDataFrame__estimated_size, self)

 RPolarsDataFrame$null_count <- function() .Call(wrap__RPolarsDataFrame__null_count, self)
@@ -1376,7 +1370,7 @@ RPolarsSeries$struct_fields <- function() .Call(wrap__RPolarsSeries__struct_fiel

 RPolarsSeries$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsSeries__export_stream, self, stream_ptr, pl_flavor))

-RPolarsSeries$from_arrow_array_stream_str <- function(name, robj_str) .Call(wrap__RPolarsSeries__from_arrow_array_stream_str, name, robj_str)
+RPolarsSeries$import_stream <- function(name, stream_ptr) .Call(wrap__RPolarsSeries__import_stream, name, stream_ptr)

 RPolarsSeries$from_arrow_array_robj <- function(name, array) .Call(wrap__RPolarsSeries__from_arrow_array_robj, name, array)
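For orientation, these wrappers chain together exactly as in `as_polars_series.RecordBatchReader()` above: allocate an empty stream on the Rust side, let the producer export into it, then import it as a Series. A sketch using the internal API (`polars:::` objects are not exported; call shapes taken from the diff above):

```r
reader = arrow::as_record_batch_reader(arrow::arrow_table(x = 1:3))

# address of an empty ArrowArrayStream, passed around as a string:
stream_ptr = polars:::polars_allocate_array_stream()
reader$export_to_c(stream_ptr) # arrow fills the stream at that address

s = polars:::.pr$Series$import_stream("x", stream_ptr) |>
  polars:::unwrap()
```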
8 changes: 4 additions & 4 deletions man/as_polars_df.Rd

(Generated file; diff not rendered by default.)

3 changes: 3 additions & 0 deletions man/as_polars_series.Rd

(Generated file; diff not rendered by default.)

15 changes: 15 additions & 0 deletions src/rust/src/arrow_interop/mod.rs
@@ -1,5 +1,7 @@
 pub mod to_rust;

+use polars_core::utils::arrow;
+
 use extendr_api::prelude::*;
 use std::result::Result;
@@ -61,3 +63,16 @@ impl RPackage for NanoArrowRPackage {
         "#)
     }
 }
+
+#[extendr]
+pub fn polars_allocate_array_stream() -> Robj {
+    let aas = Box::new(arrow::ffi::ArrowArrayStream::empty());
+    let x = Box::leak(aas); // leak box to make lifetime static
+    let x = x as *mut arrow::ffi::ArrowArrayStream;
+    format!("{:?}", x as usize).into()
+}
+
+extendr_module! {
+    mod arrow_interop;
+    fn polars_allocate_array_stream;
+}
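`polars_allocate_array_stream()` leaks a boxed, empty `ArrowArrayStream` so the allocation outlives the `.Call()`, and hands its address to R as a decimal string; `$import_stream()` then receives that string and consumes the stream at that address. A hypothetical session showing the handoff value (the printed address varies per call):

```r
ptr = polars:::polars_allocate_array_stream()
ptr
#> [1] "105553131765888" # decimal address of the leaked ArrowArrayStream
```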