Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat[python]: Add masked array support to numpy interop API #17577

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4142,7 +4142,8 @@ def to_numpy(
allow_copy: bool = True,
use_pyarrow: bool | None = None,
zero_copy_only: bool | None = None,
) -> np.ndarray[Any, Any]:
masked: bool = False,
) -> np.ndarray[Any, Any] | np.ma.MaskedArray[Any, Any]:
"""
Convert this Series to a NumPy ndarray.

Expand Down Expand Up @@ -4182,6 +4183,9 @@ def to_numpy(
.. deprecated:: 0.20.10
Use the `allow_copy` parameter instead, which is the inverse of this
one.
masked
Whether to return a NumPy masked array rather than a plain ndarray.
See the `NumPy masked array documentation <https://numpy.org/doc/stable/reference/maskedarray.generic.html>`_ for details.

Examples
--------
Expand Down Expand Up @@ -4253,7 +4257,7 @@ def to_numpy(
zero_copy_only=not allow_copy, writable=writable
)

return self._s.to_numpy(writable=writable, allow_copy=allow_copy)
return self._s.to_numpy(writable=writable, allow_copy=allow_copy, masked=masked)

@unstable()
def to_jax(self, device: jax.Device | str | None = None) -> jax.Array:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/src/interop/numpy/to_numpy_df.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ fn df_columns_to_numpy(
writable: bool,
) -> PyResult<PyObject> {
let np_arrays = df.iter().map(|s| {
let mut arr = series_to_numpy(py, s, writable, true).unwrap();
let mut arr = series_to_numpy(py, s, writable, true, false).unwrap();

// Convert multidimensional arrays to 1D object arrays.
let shape: Vec<usize> = arr
Expand Down
80 changes: 63 additions & 17 deletions py-polars/src/interop/numpy/to_numpy_series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use polars_core::with_match_physical_numeric_polars_type;
use pyo3::exceptions::PyRuntimeError;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PySlice;
use pyo3::types::{IntoPyDict, PySlice};

use super::to_numpy_df::df_to_numpy;
use super::utils::{
Expand All @@ -25,8 +25,14 @@ impl PySeries {
/// This method copies data only when necessary. Set `allow_copy` to raise an error if copy
/// is required. Set `writable` to make sure the resulting array is writable, possibly requiring
/// copying the data.
fn to_numpy(&self, py: Python, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
series_to_numpy(py, &self.series, writable, allow_copy)
fn to_numpy(
&self,
py: Python,
writable: bool,
allow_copy: bool,
masked: bool,
) -> PyResult<PyObject> {
series_to_numpy(py, &self.series, writable, allow_copy, masked)
}

/// Create a view of the data as a NumPy ndarray.
Expand All @@ -46,11 +52,12 @@ pub(super) fn series_to_numpy(
s: &Series,
writable: bool,
allow_copy: bool,
masked: bool,
) -> PyResult<PyObject> {
if s.is_empty() {
if s.is_empty() && !masked {
// Take this path to ensure a writable array.
// This does not actually copy data for an empty Series.
return Ok(series_to_numpy_with_copy(py, s, true));
return Ok(series_to_numpy_with_copy(py, s, true, masked));
}
if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) {
if writable && !writable_flag {
Expand All @@ -61,18 +68,39 @@ pub(super) fn series_to_numpy(
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
}
return Ok(arr);
if masked {
let masked_arr = series_to_masked_series(py, arr, s)?;
return Ok(masked_arr);
} else {
return Ok(arr);
}
}

if !allow_copy {
return Err(PyRuntimeError::new_err(
"copy not allowed: cannot convert to a NumPy array without copying data",
));
}

Ok(series_to_numpy_with_copy(py, s, writable))
if masked {
let arr = series_to_numpy_with_copy(py, s, writable, masked);
let masked_arr = series_to_masked_series(py, arr, s)?;
Ok(masked_arr)
} else {
Ok(series_to_numpy_with_copy(py, s, writable, masked))
}
}

/// Wrap an existing NumPy array in a `numpy.ma` masked array.
///
/// The mask is built from the validity buffers of `s` and handed to `numpy.ma.array`
/// through its `mask` keyword argument. Any Python error raised while importing
/// `numpy.ma` or calling the constructor is propagated to the caller.
fn series_to_masked_series(py: Python, np_array: PyObject, s: &Series) -> PyResult<PyObject> {
    let mask = series_validity_buffer_to_numpy(py, s);
    // The caller already holds the GIL (witnessed by `py`), so it is used directly
    // instead of being re-acquired through `Python::with_gil`.
    let kwargs = [("mask", mask)].into_py_dict_bound(py);
    let masked_array = PyModule::import_bound(py, "numpy.ma")?
        .getattr("array")?
        .call((np_array,), Some(&kwargs))?;
    Ok(masked_array.into_py(py))
}
/// Create a NumPy view of the given Series.
fn try_series_to_numpy_view(
py: Python,
Expand Down Expand Up @@ -183,7 +211,7 @@ fn array_series_to_numpy_view(py: Python, s: &Series, writable: bool) -> PyObjec
/// Convert a Series to a NumPy ndarray, copying data in the process.
///
/// This method will cast integers to floats so that `null = np.nan`.
fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyObject {
fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool, masked: bool) -> PyObject {
use DataType::*;
match s.dtype() {
Int8 => numeric_series_to_numpy::<Int8Type, f32>(py, s),
Expand All @@ -196,7 +224,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyObject
UInt64 => numeric_series_to_numpy::<UInt64Type, f64>(py, s),
Float32 => numeric_series_to_numpy::<Float32Type, f32>(py, s),
Float64 => numeric_series_to_numpy::<Float64Type, f64>(py, s),
Boolean => boolean_series_to_numpy(py, s),
Boolean => boolean_series_to_numpy(py, s, masked),
Date => date_series_to_numpy(py, s),
Datetime(tu, _) => {
use numpy::datetime::{units, Datetime};
Expand Down Expand Up @@ -252,7 +280,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyObject
PyArray1::from_iter_bound(py, values).into_py(py)
},
List(_) => list_series_to_numpy(py, s, writable),
Array(_, _) => array_series_to_numpy(py, s, writable),
Array(_, _) => array_series_to_numpy(py, s, writable, masked),
Struct(_) => {
let ca = s.struct_().unwrap();
let df = ca.clone().unnest();
Expand All @@ -276,6 +304,24 @@ fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyObject
}
}

/// Build a NumPy boolean mask from the validity buffers of the Series.
///
/// NumPy masked arrays use `True` to mark a masked (invalid) entry, whereas a validity
/// bitmap yields `true` for a valid entry, so every validity bit is inverted here.
/// Chunks that carry no validity bitmap contribute a run of `false` (nothing masked).
fn series_validity_buffer_to_numpy(py: Python, s: &Series) -> PyObject {
    let mut mask: Vec<bool> = Vec::with_capacity(s.len());
    for chunk in s.chunks() {
        match chunk.validity() {
            // Invert: valid -> not masked, null -> masked.
            Some(validity) => mask.extend(validity.iter().map(|is_valid| !is_valid)),
            None => mask.resize(mask.len() + chunk.len(), false),
        }
    }

    PyArray1::from_iter_bound(py, mask).into_py(py)
}

/// Convert numeric types to f32 or f64 with NaN representing a null value.
fn numeric_series_to_numpy<T, U>(py: Python, s: &Series) -> PyObject
where
Expand All @@ -297,9 +343,9 @@ where
}
}
/// Convert booleans to a NumPy bool array if no nulls are present or a masked array is
/// requested, otherwise convert to objects.
fn boolean_series_to_numpy(py: Python, s: &Series) -> PyObject {
fn boolean_series_to_numpy(py: Python, s: &Series, masked: bool) -> PyObject {
let ca = s.bool().unwrap();
if s.null_count() == 0 {
if s.null_count() == 0 || masked {
let values = ca.into_no_null_iter();
PyArray1::<bool>::from_iter_bound(py, values).into_py(py)
} else {
Expand Down Expand Up @@ -345,7 +391,7 @@ fn list_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
let ca = s.list().unwrap();
let s_inner = ca.get_inner();

let np_array_flat = series_to_numpy(py, &s_inner, writable, true).unwrap();
let np_array_flat = series_to_numpy(py, &s_inner, writable, true, false).unwrap();

// Split the NumPy array into subarrays by offset.
// TODO: Downcast the NumPy array to Rust and split without calling into Python.
Expand All @@ -362,10 +408,10 @@ fn list_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
PyArray1::from_iter_bound(py, values).into_py(py)
}
/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
fn array_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
fn array_series_to_numpy(py: Python, s: &Series, writable: bool, masked: bool) -> PyObject {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner, writable);
let np_array_flat = series_to_numpy_with_copy(py, &s_inner, writable, masked);

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
Expand Down
24 changes: 24 additions & 0 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import TYPE_CHECKING, Any

import numpy as np
import numpy.ma as ma
import pytest
from hypothesis import given, settings
from numpy.testing import assert_array_equal
Expand All @@ -15,6 +16,7 @@

if TYPE_CHECKING:
import numpy.typing as npt
from numpy.ma import MaskedArray


def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
Expand Down Expand Up @@ -461,3 +463,25 @@ def test_to_numpy2(
# As Null values can't be encoded natively in a numpy array,
# this array will never be a view.
assert np_array_with_missing_values.flags.writeable == writable


def test_to_masked_numpy_array() -> None:
    """A Series without nulls converts to a masked array with nothing masked."""
    values = [1, 2, 3, 4]
    s = pl.Series(values)
    expected: MaskedArray[Any, Any] = ma.masked_array(np.array(values), [0, 0, 0, 0])  # type:ignore[no-untyped-call]
    result = s.to_numpy(masked=True)
    # Comparing values alone does not establish the container type or the mask,
    # so both are checked explicitly.
    assert isinstance(result, ma.MaskedArray)
    assert_array_equal(result, expected)
    assert not np.any(result.mask)

def test_optional_bool_array_to_masked() -> None:
    """Nulls in a boolean Series are carried by the mask; the data stays boolean."""
    values = [True, False, None, True]
    s = pl.Series(values)
    result = s.to_numpy(masked=True)
    assert isinstance(result, ma.MaskedArray)
    assert result.data.dtype == bool
    # NumPy convention: True in the mask marks an invalid (here: null) entry.
    assert_array_equal(result.mask, [False, False, True, False])

def test_optional_int_array_to_masked() -> None:
    """A null-free integer Series converts to a masked array with nothing masked."""
    values = [1, 2, 3, 4]
    s = pl.Series("a", values, pl.UInt8)
    result = s.to_numpy(masked=True)
    assert isinstance(result, ma.MaskedArray)
    # NOTE(review): assumes a null-free UInt8 Series keeps its integer dtype when
    # converted (no cast to float) - confirm once the numeric conversion semantics
    # for masked output are settled.
    assert result.dtype == np.uint8
    assert_array_equal(result, values)
    assert not np.any(result.mask)

Comment on lines +481 to +487
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WIP - will update once numeric conversion semantics are clear to me

Loading