From 0dd810463b94469749221d3deeebaf7151ad711f Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 13 Jul 2023 23:17:00 +0200 Subject: [PATCH 01/37] docs(python): add logo `link` entry to sphinx conf and factor-out website root paths (#9864) --- py-polars/docs/source/conf.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py index ccf9d53ec62f..31916f3c6106 100644 --- a/py-polars/docs/source/conf.py +++ b/py-polars/docs/source/conf.py @@ -90,18 +90,26 @@ autosummary_generate = True numpydoc_show_class_members = False +# key site root paths +static_assets_root = "https://raw.githubusercontent.com/pola-rs/polars-static/master" +github_root = "https://github.com/pola-rs/polars" +web_root = "https://pola-rs.github.io" + html_theme_options = { "external_links": [ { "name": "User Guide", - "url": "https://pola-rs.github.io/polars-book/user-guide/index.html", + "url": f"{web_root}/polars-book/user-guide/index.html", + }, + { + "name": "Powered by Xomnia", + "url": "https://www.xomnia.com/", }, - {"name": "Powered by Xomnia", "url": "https://www.xomnia.com/"}, ], "icon_links": [ { "name": "GitHub", - "url": "https://github.com/pola-rs/polars", + "url": github_root, "icon": "fa-brands fa-github", }, { @@ -116,8 +124,9 @@ }, ], "logo": { - "image_light": "https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars-logo-dark-medium.png", - "image_dark": "https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars-logo-dimmed-medium.png", + "image_light": f"{static_assets_root}/logos/polars-logo-dark-medium.png", + "image_dark": f"{static_assets_root}/logos/polars-logo-dimmed-medium.png", + "link": f"{web_root}/polars/py-polars/html/reference/index.html", }, } @@ -125,12 +134,12 @@ { "rel": "icon", "sizes": "32x32", - "href": "https://raw.githubusercontent.com/pola-rs/polars-static/master/icons/favicon-32x32.png", + "href": f"{static_assets_root}/icons/favicon-32x32.png", }, { "rel": "apple-touch-icon", "sizes": "180x180", - "href": "https://raw.githubusercontent.com/pola-rs/polars-static/master/icons/touchicon-180x180.png", + "href": f"{static_assets_root}/icons/touchicon-180x180.png", }, ] @@ -195,10 +204,7 @@ def linkcode_resolve(domain, info): polars_root = os.path.abspath(f"{conf_dir_path}/../../polars") fn = os.path.relpath(fn, start=polars_root) - - return ( - f"https://github.com/pola-rs/polars/blob/main/py-polars/polars/{fn}{linespec}" - ) + return f"{github_root}/blob/main/py-polars/polars/{fn}{linespec}" def _minify_classpaths(s: str) -> str: From ef69f4bf32e3cfadf4546f216ec762cceaa0006c Mon Sep 17 00:00:00 2001 From: messense Date: Fri, 14 Jul 2023 13:58:14 +0800 Subject: [PATCH 02/37] perf(python): Use `pyo3::intern` to avoid needlessly recreating PyString (#9853) --- py-polars/Cargo.lock | 2 +- py-polars/src/conversion.rs | 143 ++++++++++++++++++++---------------- 2 files changed, 80 insertions(+), 65 deletions(-) diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 29b6b3fd89ae..0bcb9e8e6f1b 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -1718,7 +1718,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "0.18.6" +version = "0.18.7" dependencies = [ "ahash", "built", diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index d6620682386e..ff63d360a75d 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -23,7 +23,7 @@ use pyo3::prelude::*; use pyo3::types::{ 
PyBool, PyBytes, PyDict, PyFloat, PyList, PySequence, PyString, PyTuple, PyType, }; -use pyo3::{PyAny, PyResult}; +use pyo3::{intern, PyAny, PyResult}; use smartstring::alias::String as SmartString; use crate::error::PyPolarsErr; @@ -71,17 +71,17 @@ impl From for Wrap { // extract a Rust DataFrame from a python DataFrame, that is DataFrame> pub(crate) fn get_df(obj: &PyAny) -> PyResult { - let pydf = obj.getattr("_df")?; + let pydf = obj.getattr(intern!(obj.py(), "_df"))?; Ok(pydf.extract::()?.df) } pub(crate) fn get_lf(obj: &PyAny) -> PyResult { - let pydf = obj.getattr("_ldf")?; + let pydf = obj.getattr(intern!(obj.py(), "_ldf"))?; Ok(pydf.extract::()?.ldf) } pub(crate) fn get_series(obj: &PyAny) -> PyResult { - let pydf = obj.getattr("_s")?; + let pydf = obj.getattr(intern!(obj.py(), "_s"))?; Ok(pydf.extract::()?.series) } @@ -226,11 +226,11 @@ impl IntoPy for Wrap> { s.into_py(py) } AnyValue::Date(v) => { - let convert = utils.getattr("_to_python_date").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_date")).unwrap(); convert.call1((v,)).unwrap().into_py(py) } AnyValue::Datetime(v, time_unit, time_zone) => { - let convert = utils.getattr("_to_python_datetime").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_datetime")).unwrap(); let time_unit = time_unit.to_ascii(); convert .call1((v, time_unit, time_zone.as_ref().map(|s| s.as_str()))) @@ -238,12 +238,12 @@ impl IntoPy for Wrap> { .into_py(py) } AnyValue::Duration(v, time_unit) => { - let convert = utils.getattr("_to_python_timedelta").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_timedelta")).unwrap(); let time_unit = time_unit.to_ascii(); convert.call1((v, time_unit)).unwrap().into_py(py) } AnyValue::Time(v) => { - let convert = utils.getattr("_to_python_time").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_time")).unwrap(); convert.call1((v,)).unwrap().into_py(py) } AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(), @@ -262,7 +262,7 @@ impl IntoPy for Wrap> { AnyValue::Binary(v) => v.into_py(py), AnyValue::BinaryOwned(v) => v.into_py(py), AnyValue::Decimal(v, scale) => { - let convert = utils.getattr("_to_python_decimal").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_decimal")).unwrap(); const N: usize = 3; let mut buf = [0_u128; N]; let n_digits = decimal_to_digits(v.abs(), &mut buf); @@ -287,84 +287,88 @@ impl ToPyObject for Wrap { let pl = POLARS.as_ref(py); match &self.0 { - DataType::Int8 => pl.getattr("Int8").unwrap().into(), - DataType::Int16 => pl.getattr("Int16").unwrap().into(), - DataType::Int32 => pl.getattr("Int32").unwrap().into(), - DataType::Int64 => pl.getattr("Int64").unwrap().into(), - DataType::UInt8 => pl.getattr("UInt8").unwrap().into(), - DataType::UInt16 => pl.getattr("UInt16").unwrap().into(), - DataType::UInt32 => pl.getattr("UInt32").unwrap().into(), - DataType::UInt64 => pl.getattr("UInt64").unwrap().into(), - DataType::Float32 => pl.getattr("Float32").unwrap().into(), - DataType::Float64 => pl.getattr("Float64").unwrap().into(), + DataType::Int8 => pl.getattr(intern!(py, "Int8")).unwrap().into(), + DataType::Int16 => pl.getattr(intern!(py, "Int16")).unwrap().into(), + DataType::Int32 => pl.getattr(intern!(py, "Int32")).unwrap().into(), + DataType::Int64 => pl.getattr(intern!(py, "Int64")).unwrap().into(), + DataType::UInt8 => pl.getattr(intern!(py, "UInt8")).unwrap().into(), + DataType::UInt16 => pl.getattr(intern!(py, "UInt16")).unwrap().into(), + DataType::UInt32 => pl.getattr(intern!(py, 
"UInt32")).unwrap().into(), + DataType::UInt64 => pl.getattr(intern!(py, "UInt64")).unwrap().into(), + DataType::Float32 => pl.getattr(intern!(py, "Float32")).unwrap().into(), + DataType::Float64 => pl.getattr(intern!(py, "Float64")).unwrap().into(), DataType::Decimal(precision, scale) => pl - .getattr("Decimal") + .getattr(intern!(py, "Decimal")) .unwrap() .call1((*scale, *precision)) .unwrap() .into(), - DataType::Boolean => pl.getattr("Boolean").unwrap().into(), - DataType::Utf8 => pl.getattr("Utf8").unwrap().into(), - DataType::Binary => pl.getattr("Binary").unwrap().into(), + DataType::Boolean => pl.getattr(intern!(py, "Boolean")).unwrap().into(), + DataType::Utf8 => pl.getattr(intern!(py, "Utf8")).unwrap().into(), + DataType::Binary => pl.getattr(intern!(py, "Binary")).unwrap().into(), DataType::Array(inner, size) => { let inner = Wrap(*inner.clone()).to_object(py); - let list_class = pl.getattr("Array").unwrap(); + let list_class = pl.getattr(intern!(py, "Array")).unwrap(); list_class.call1((*size, inner)).unwrap().into() } DataType::List(inner) => { let inner = Wrap(*inner.clone()).to_object(py); - let list_class = pl.getattr("List").unwrap(); + let list_class = pl.getattr(intern!(py, "List")).unwrap(); list_class.call1((inner,)).unwrap().into() } - DataType::Date => pl.getattr("Date").unwrap().into(), + DataType::Date => pl.getattr(intern!(py, "Date")).unwrap().into(), DataType::Datetime(tu, tz) => { - let datetime_class = pl.getattr("Datetime").unwrap(); + let datetime_class = pl.getattr(intern!(py, "Datetime")).unwrap(); datetime_class .call1((tu.to_ascii(), tz.clone())) .unwrap() .into() } DataType::Duration(tu) => { - let duration_class = pl.getattr("Duration").unwrap(); + let duration_class = pl.getattr(intern!(py, "Duration")).unwrap(); duration_class.call1((tu.to_ascii(),)).unwrap().into() } #[cfg(feature = "object")] - DataType::Object(_) => pl.getattr("Object").unwrap().into(), - DataType::Categorical(_) => pl.getattr("Categorical").unwrap().into(), - DataType::Time => pl.getattr("Time").unwrap().into(), + DataType::Object(_) => pl.getattr(intern!(py, "Object")).unwrap().into(), + DataType::Categorical(_) => pl.getattr(intern!(py, "Categorical")).unwrap().into(), + DataType::Time => pl.getattr(intern!(py, "Time")).unwrap().into(), DataType::Struct(fields) => { - let field_class = pl.getattr("Field").unwrap(); + let field_class = pl.getattr(intern!(py, "Field")).unwrap(); let iter = fields.iter().map(|fld| { let name = fld.name().as_str(); let dtype = Wrap(fld.data_type().clone()).to_object(py); field_class.call1((name, dtype)).unwrap() }); let fields = PyList::new(py, iter); - let struct_class = pl.getattr("Struct").unwrap(); + let struct_class = pl.getattr(intern!(py, "Struct")).unwrap(); struct_class.call1((fields,)).unwrap().into() } - DataType::Null => pl.getattr("Null").unwrap().into(), - DataType::Unknown => pl.getattr("Unknown").unwrap().into(), + DataType::Null => pl.getattr(intern!(py, "Null")).unwrap().into(), + DataType::Unknown => pl.getattr(intern!(py, "Unknown")).unwrap().into(), } } } impl FromPyObject<'_> for Wrap { fn extract(ob: &PyAny) -> PyResult { - let name = ob.getattr("name")?.str()?.to_str()?; - let dtype = ob.getattr("dtype")?.extract::>()?; + let py = ob.py(); + let name = ob.getattr(intern!(py, "name"))?.str()?.to_str()?; + let dtype = ob + .getattr(intern!(py, "dtype"))? 
+ .extract::>()?; Ok(Wrap(Field::new(name, dtype.0))) } } impl FromPyObject<'_> for Wrap { fn extract(ob: &PyAny) -> PyResult { + let py = ob.py(); let type_name = ob.get_type().name()?; let dtype = match type_name { "DataTypeClass" => { // just the class, not an object - let name = ob.getattr("__name__")?.str()?.to_str()?; + let name = ob.getattr(intern!(py, "__name__"))?.str()?.to_str()?; match name { "UInt8" => DataType::UInt8, "UInt16" => DataType::UInt16, @@ -400,36 +404,36 @@ impl FromPyObject<'_> for Wrap { } } "Duration" => { - let time_unit = ob.getattr("time_unit").unwrap(); + let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); let time_unit = time_unit.extract::>()?.0; DataType::Duration(time_unit) } "Datetime" => { - let time_unit = ob.getattr("time_unit").unwrap(); + let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); let time_unit = time_unit.extract::>()?.0; - let time_zone = ob.getattr("time_zone").unwrap(); + let time_zone = ob.getattr(intern!(py, "time_zone")).unwrap(); let time_zone = time_zone.extract()?; DataType::Datetime(time_unit, time_zone) } "Decimal" => { - let precision = ob.getattr("precision")?.extract()?; - let scale = ob.getattr("scale")?.extract()?; + let precision = ob.getattr(intern!(py, "precision"))?.extract()?; + let scale = ob.getattr(intern!(py, "scale"))?.extract()?; DataType::Decimal(precision, Some(scale)) } "List" => { - let inner = ob.getattr("inner").unwrap(); + let inner = ob.getattr(intern!(py, "inner")).unwrap(); let inner = inner.extract::>()?; DataType::List(Box::new(inner.0)) } "Array" => { - let inner = ob.getattr("inner").unwrap(); - let width = ob.getattr("width").unwrap(); + let inner = ob.getattr(intern!(py, "inner")).unwrap(); + let width = ob.getattr(intern!(py, "width")).unwrap(); let inner = inner.extract::>()?; let width = width.extract::()?; DataType::Array(Box::new(inner.0), width) } "Struct" => { - let fields = ob.getattr("fields")?; + let fields = ob.getattr(intern!(py, "fields"))?; let fields = fields .extract::>>()? 
.into_iter() @@ -503,7 +507,7 @@ impl ToPyObject for Wrap<&StructChunked> { impl ToPyObject for Wrap<&DurationChunked> { fn to_object(&self, py: Python) -> PyObject { let utils = UTILS.as_ref(py); - let convert = utils.getattr("_to_python_timedelta").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_timedelta")).unwrap(); let time_unit = Wrap(self.0.time_unit()).to_object(py); let iter = self .0 @@ -516,7 +520,7 @@ impl ToPyObject for Wrap<&DurationChunked> { impl ToPyObject for Wrap<&DatetimeChunked> { fn to_object(&self, py: Python) -> PyObject { let utils = UTILS.as_ref(py); - let convert = utils.getattr("_to_python_datetime").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_datetime")).unwrap(); let time_unit = Wrap(self.0.time_unit()).to_object(py); let time_zone = self.0.time_zone().to_object(py); let iter = self @@ -530,7 +534,7 @@ impl ToPyObject for Wrap<&DatetimeChunked> { impl ToPyObject for Wrap<&TimeChunked> { fn to_object(&self, py: Python) -> PyObject { let utils = UTILS.as_ref(py); - let convert = utils.getattr("_to_python_time").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_time")).unwrap(); let iter = self .0 .into_iter() @@ -542,7 +546,7 @@ impl ToPyObject for Wrap<&TimeChunked> { impl ToPyObject for Wrap<&DateChunked> { fn to_object(&self, py: Python) -> PyObject { let utils = UTILS.as_ref(py); - let convert = utils.getattr("_to_python_date").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_date")).unwrap(); let iter = self .0 .into_iter() @@ -554,7 +558,7 @@ impl ToPyObject for Wrap<&DateChunked> { impl ToPyObject for Wrap<&DecimalChunked> { fn to_object(&self, py: Python) -> PyObject { let utils = UTILS.as_ref(py); - let convert = utils.getattr("_to_python_decimal").unwrap(); + let convert = utils.getattr(intern!(py, "_to_python_decimal")).unwrap(); let py_scale = (-(self.0.scale() as i32)).to_object(py); // if we don't know precision, the only safe bet is to set it to 39 let py_precision = self.0.precision().unwrap_or(39).to_object(py); @@ -611,7 +615,7 @@ fn convert_date(ob: &PyAny) -> PyResult> { Python::with_gil(|py| { let date = UTILS .as_ref(py) - .getattr("_date_to_pl_date") + .getattr(intern!(py, "_date_to_pl_date")) .unwrap() .call1((ob,)) .unwrap(); @@ -624,7 +628,9 @@ fn convert_datetime(ob: &PyAny) -> PyResult> { // windows #[cfg(target_arch = "windows")] let (seconds, microseconds) = { - let convert = UTILS.getattr(py, "_datetime_for_anyvalue_windows").unwrap(); + let convert = UTILS + .getattr(py, intern!(py, "_datetime_for_anyvalue_windows")) + .unwrap(); let out = convert.call1(py, (ob,)).unwrap(); let out: (i64, i64) = out.extract(py).unwrap(); out @@ -632,7 +638,9 @@ fn convert_datetime(ob: &PyAny) -> PyResult> { // unix #[cfg(not(target_arch = "windows"))] let (seconds, microseconds) = { - let convert = UTILS.getattr(py, "_datetime_for_anyvalue").unwrap(); + let convert = UTILS + .getattr(py, intern!(py, "_datetime_for_anyvalue")) + .unwrap(); let out = convert.call1(py, (ob,)).unwrap(); let out: (i64, i64) = out.extract(py).unwrap(); out @@ -737,7 +745,7 @@ impl<'s> FromPyObject<'s> for Wrap> { } fn get_series_el(ob: &PyAny) -> PyResult>> { - let py_pyseries = ob.getattr("_s").unwrap(); + let py_pyseries = ob.getattr(intern!(ob.py(), "_s")).unwrap(); let series = py_pyseries.extract::().unwrap().series; Ok(Wrap(AnyValue::List(series))) } @@ -755,9 +763,9 @@ impl<'s> FromPyObject<'s> for Wrap> { Python::with_gil(|py| { let td = UTILS .as_ref(py) - .getattr("_timedelta_to_pl_timedelta") + 
.getattr(intern!(py, "_timedelta_to_pl_timedelta")) .unwrap() - .call1((ob, "us")) + .call1((ob, intern!(py, "us"))) .unwrap(); let v = td.extract::().unwrap(); Ok(Wrap(AnyValue::Duration(v, TimeUnit::Microseconds))) @@ -768,7 +776,7 @@ impl<'s> FromPyObject<'s> for Wrap> { Python::with_gil(|py| { let time = UTILS .as_ref(py) - .getattr("_time_to_pl_time") + .getattr(intern!(py, "_time_to_pl_time")) .unwrap() .call1((ob,)) .unwrap(); @@ -778,8 +786,11 @@ impl<'s> FromPyObject<'s> for Wrap> { } fn get_decimal(ob: &PyAny) -> PyResult> { - let (sign, digits, exp): (i8, Vec, i32) = - ob.call_method0("as_tuple").unwrap().extract().unwrap(); + let (sign, digits, exp): (i8, Vec, i32) = ob + .call_method0(intern!(ob.py(), "as_tuple")) + .unwrap() + .extract() + .unwrap(); // note: using Vec is not the most efficient thing here (input is a tuple) let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| { PyErr::from(PyPolarsErr::Other( @@ -820,7 +831,7 @@ impl<'s> FromPyObject<'s> for Wrap> { get_struct } else if ob.is_instance_of::() || ob.is_instance_of::() { get_list - } else if ob.hasattr("_s").unwrap() { + } else if ob.hasattr(intern!(py, "_s")).unwrap() { get_series_el } // TODO: this heap allocs on failure @@ -845,8 +856,12 @@ impl<'s> FromPyObject<'s> for Wrap> { // Can't use pyo3::types::PyDateTime with abi3-py37 feature, // so need this workaround instead of `isinstance(ob, datetime)`. - let bases = - ob.get_type().getattr("__bases__").unwrap().iter().unwrap(); + let bases = ob + .get_type() + .getattr(intern!(py, "__bases__")) + .unwrap() + .iter() + .unwrap(); for base in bases { let parent_type = base.unwrap().str().unwrap().to_str().unwrap(); From 1f440c8b038317aca9572c7ba42ed76f2a7ee4fb Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 14 Jul 2023 10:09:07 +0200 Subject: [PATCH 03/37] feat(rust, python): `polars_warn!` macro (#9868) --- polars/polars-core/src/prelude.rs | 4 ++- polars/polars-error/src/lib.rs | 4 +++ polars/polars-error/src/warning.rs | 32 +++++++++++++++++++ .../polars-lazy/polars-plan/src/dsl/string.rs | 1 - polars/polars-lazy/src/frame/mod.rs | 2 +- .../src/physical_plan/expressions/mod.rs | 2 +- py-polars/Cargo.lock | 1 + py-polars/Cargo.toml | 1 + py-polars/polars/utils/__init__.py | 3 +- py-polars/polars/utils/various.py | 9 ++++++ py-polars/src/on_startup.rs | 20 ++++++++++-- 11 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 polars/polars-error/src/warning.rs diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs index 922bb0aab783..891eb02e7ec5 100644 --- a/polars/polars-core/src/prelude.rs +++ b/polars/polars-core/src/prelude.rs @@ -32,7 +32,9 @@ pub use crate::chunked_array::temporal::conversion::*; pub use crate::chunked_array::ChunkedArray; pub(crate) use crate::chunked_array::{to_array, ChunkIdIter}; pub use crate::datatypes::*; -pub use crate::error::{polars_bail, polars_ensure, polars_err, PolarsError, PolarsResult}; +pub use crate::error::{ + polars_bail, polars_ensure, polars_err, polars_warn, PolarsError, PolarsResult, +}; #[cfg(feature = "asof_join")] pub use crate::frame::asof_join::*; pub use crate::frame::explode::MeltArgs; diff --git a/polars/polars-error/src/lib.rs b/polars/polars-error/src/lib.rs index 572922b61e64..57cc9446427c 100644 --- a/polars/polars-error/src/lib.rs +++ b/polars/polars-error/src/lib.rs @@ -1,9 +1,13 @@ +mod warning; + use std::borrow::Cow; use std::error::Error; use std::fmt::{self, Display, Formatter}; use std::ops::Deref; use std::{env, io}; +pub use 
warning::*;
+
 #[derive(Debug)]
 pub struct ErrString(Cow<'static, str>);
diff --git a/polars/polars-error/src/warning.rs b/polars/polars-error/src/warning.rs
new file mode 100644
index 000000000000..4a2edd77d533
--- /dev/null
+++ b/polars/polars-error/src/warning.rs
@@ -0,0 +1,32 @@
+type WarningFunction = fn(&str);
+static mut WARNING_FUNCTION: Option<WarningFunction> = None;
+
+/// Set the function that will be called by the `polars_warn!` macro.
+/// You can use this to set logging in polars.
+///
+/// # Safety
+/// The caller must ensure there is no other thread accessing this function
+/// or calling `polars_warn!`.
+pub unsafe fn set_warning_function(function: WarningFunction) {
+    WARNING_FUNCTION = Some(function)
+}
+
+fn eprintln(fmt: &str) {
+    eprintln!("{}", fmt);
+}
+
+pub fn get_warning_function() -> WarningFunction {
+    unsafe { WARNING_FUNCTION.unwrap_or(eprintln) }
+}
+#[macro_export]
+macro_rules! polars_warn {
+    ($fmt:literal, $($arg:tt)+) => {
+        {{
+            let func = $crate::get_warning_function();
+            func(format!($fmt, $($arg)+).as_ref())
+        }}
+    };
+    ($($arg:tt)+) => {
+        polars_warn!("{}", $($arg)+);
+    };
+}
diff --git a/polars/polars-lazy/polars-plan/src/dsl/string.rs b/polars/polars-lazy/polars-plan/src/dsl/string.rs
index f9eb6acfd61b..8d878d4be5a9 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/string.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/string.rs
@@ -1,4 +1,3 @@
-use polars_arrow::array::ValueSize;
 #[cfg(feature = "dtype-struct")]
 use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array};
 #[cfg(feature = "dtype-struct")]
diff --git a/polars/polars-lazy/src/frame/mod.rs b/polars/polars-lazy/src/frame/mod.rs
index b7a305c3445b..593333c96f82 100644
--- a/polars/polars-lazy/src/frame/mod.rs
+++ b/polars/polars-lazy/src/frame/mod.rs
@@ -482,7 +482,7 @@ impl LazyFrame {
         let streaming = self.opt_state.streaming;
         #[cfg(feature = "cse")]
         if streaming && self.opt_state.common_subplan_elimination {
-            eprintln!("Cannot combine 'streaming' with 'common_subplan_elimination'. CSE will be turned off.");
+            polars_warn!("Cannot combine 'streaming' with 'common_subplan_elimination'.
CSE will be turned off."); opt_state.common_subplan_elimination = false; } let lp_top = optimize(self.logical_plan, opt_state, lp_arena, expr_arena, scratch)?; diff --git a/polars/polars-lazy/src/physical_plan/expressions/mod.rs b/polars/polars-lazy/src/physical_plan/expressions/mod.rs index ef8c731f52a4..befd3655be99 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/mod.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/mod.rs @@ -402,7 +402,7 @@ impl<'a> AggregationContext<'a> { #[cfg(debug_assertions)] { if self.groups.len() > s.len() { - eprintln!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic groupby") + polars_warn!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic groupby") } } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 0bcb9e8e6f1b..9e09aa089cdc 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -1733,6 +1733,7 @@ dependencies = [ "polars", "polars-algo", "polars-core", + "polars-error", "polars-lazy", "pyo3", "pyo3-built", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index f8d06264b6da..9cf2d39685f9 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -24,6 +24,7 @@ numpy = "0.19" once_cell = "1" polars-algo = { path = "../polars/polars-algo", default-features = false } polars-core = { path = "../polars/polars-core", features = ["python"], default-features = false } +polars-error = { path = "../polars/polars-error" } polars-lazy = { path = "../polars/polars-lazy", features = ["python"], default-features = false } pyo3 = { version = "0.19", features = ["abi3-py38", "extension-module", "multiple-pymethods"] } pyo3-built = { version = "0.4", optional = true } diff --git a/py-polars/polars/utils/__init__.py b/py-polars/polars/utils/__init__.py index b955557aee12..97b5d6caa20a 100644 --- a/py-polars/polars/utils/__init__.py +++ b/py-polars/polars/utils/__init__.py @@ -19,7 +19,7 @@ ) from polars.utils.meta import get_idx_type, get_index_type, threadpool_size from polars.utils.show_versions import show_versions -from polars.utils.various import NoDefault, no_default +from polars.utils.various import NoDefault, _polars_warn, no_default __all__ = [ "NoDefault", @@ -41,4 +41,5 @@ "_to_python_timedelta", "_datetime_for_anyvalue", "_datetime_for_anyvalue_windows", + "_polars_warn", ] diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index ec15af5ed3a5..b5a50cfd9b49 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -4,6 +4,7 @@ import os import re import sys +import warnings from collections.abc import MappingView, Sized from enum import Enum from typing import TYPE_CHECKING, Any, Generator, Iterable, Literal, Sequence, TypeVar @@ -404,3 +405,11 @@ def _get_stack_locals( return objects stack_frame = stack_frame.f_back return objects + + +# this is called from rust +def _polars_warn(msg: str) -> None: + warnings.warn( + msg, + stacklevel=find_stacklevel(), + ) diff --git a/py-polars/src/on_startup.rs b/py-polars/src/on_startup.rs index be19ebe17a08..f0bbbb17c84c 100644 --- a/py-polars/src/on_startup.rs +++ b/py-polars/src/on_startup.rs @@ -8,12 +8,13 @@ use polars_core::chunked_array::object::registry::AnonymousObjectBuilder; use polars_core::error::PolarsError::ComputeError; use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; +use pyo3::intern; use pyo3::prelude::*; use crate::apply::lazy::{call_lambda_with_series, ToSeries}; use 
crate::dataframe::PyDataFrame; use crate::prelude::{python_udf, ObjectValue}; -use crate::py_modules::POLARS; +use crate::py_modules::{POLARS, UTILS}; use crate::Wrap; fn python_function_caller_series(s: Series, lambda: &PyObject) -> PolarsResult { @@ -56,6 +57,19 @@ fn python_function_caller_df(df: DataFrame, lambda: &PyObject) -> PolarsResult Date: Fri, 14 Jul 2023 10:13:20 +0200 Subject: [PATCH 04/37] chore(rust): Clean up workspace definition (#9861) --- .github/workflows/lint-rust.yml | 2 +- Cargo.toml | 25 ++++++------------------- polars/polars-core/Cargo.toml | 1 + 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/.github/workflows/lint-rust.yml b/.github/workflows/lint-rust.yml index a21f0b32c520..61cedbc4472e 100644 --- a/.github/workflows/lint-rust.yml +++ b/.github/workflows/lint-rust.yml @@ -60,7 +60,7 @@ jobs: save-if: ${{ github.ref_name == 'main' }} - name: Run cargo clippy - run: cargo clippy -- -D warnings + run: cargo clippy --all-targets -- -D warnings rustfmt: if: github.ref_name != 'main' diff --git a/Cargo.toml b/Cargo.toml index 56bdc542e159..6fa06baf9c27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,25 +3,12 @@ resolver = "2" members = [ "polars", "polars-cli", - "polars/polars-core", - "polars/polars-io", - "polars/polars-time", - "polars/polars-utils", - "polars/polars-ops", - "polars/polars-algo", - "polars/polars-lazy", - "polars/polars-lazy/polars-plan", - "polars/polars-lazy/polars-pipe", - "polars/polars-sql", - "polars/polars-error", - "polars/polars-row", - "polars/polars-json", - "examples/read_csv", - "examples/read_json", - "examples/read_parquet", - "examples/read_parquet_cloud", - "examples/string_filter", - "examples/python_rust_compiled_function", + "polars/polars-*", + "polars/polars-lazy/polars-*", + "examples/*", +] +exclude = [ + "examples/datasets", ] [workspace.package] diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 0b7821d04df2..a6ccda4ace62 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -183,6 +183,7 @@ wasm-timer = "0.2.5" [dev-dependencies] bincode = "1" +serde_json = "1" [package.metadata.docs.rs] # not all because arrow 4.3 does not compile with simd From 7b0527c02910653449c257db0756af8e4a102894 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 14 Jul 2023 10:27:38 +0200 Subject: [PATCH 05/37] feat(rust,python,cli): add `LENGTH` and `OCTET_LENGTH` string functions for SQL (#9860) --- .../polars-plan/src/dsl/function_expr/mod.rs | 8 +- .../src/dsl/function_expr/strings.rs | 149 ++++++++++-------- .../polars-lazy/polars-plan/src/dsl/string.rs | 14 +- polars/polars-sql/src/functions.rs | 15 ++ py-polars/src/expr/string.rs | 20 +-- py-polars/tests/unit/test_sql.py | 21 +++ 6 files changed, 138 insertions(+), 89 deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs index 52b1922bffb4..3ffc1abe0380 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs @@ -618,6 +618,9 @@ impl From for SpecialEq> { match func { #[cfg(feature = "regex")] Contains { literal, strict } => map_as_slice!(strings::contains, literal, strict), + CountMatch(pat) => { + map!(strings::count_match, &pat) + } EndsWith { .. } => map_as_slice!(strings::ends_with), StartsWith { .. 
} => map_as_slice!(strings::starts_with), Extract { pat, group_index } => { @@ -626,9 +629,8 @@ impl From for SpecialEq> { ExtractAll => { map_as_slice!(strings::extract_all) } - CountMatch(pat) => { - map!(strings::count_match, &pat) - } + NChars => map!(strings::n_chars), + Length => map!(strings::lengths), #[cfg(feature = "string_justify")] Zfill(alignment) => { map!(strings::zfill, alignment) diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs index da1d7350a76a..60486b1cc073 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs @@ -17,37 +17,39 @@ use super::*; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum StringFunction { + #[cfg(feature = "concat_str")] + ConcatHorizontal(String), + #[cfg(feature = "concat_str")] + ConcatVertical(String), #[cfg(feature = "regex")] Contains { literal: bool, strict: bool, }, - StartsWith, + CountMatch(String), EndsWith, + Explode, Extract { pat: String, group_index: usize, }, - #[cfg(feature = "string_justify")] - Zfill(usize), + ExtractAll, + #[cfg(feature = "string_from_radix")] + FromRadix(u32, bool), + NChars, + Length, #[cfg(feature = "string_justify")] LJust { width: usize, fillchar: char, }, - #[cfg(feature = "string_justify")] - RJust { - width: usize, - fillchar: char, + Lowercase, + LStrip(Option), + #[cfg(feature = "extract_jsonpath")] + JsonExtract { + dtype: Option, + infer_schema_len: Option, }, - ExtractAll, - CountMatch(String), - #[cfg(feature = "temporal")] - Strptime(DataType, StrptimeOptions), - #[cfg(feature = "concat_str")] - ConcatVertical(String), - #[cfg(feature = "concat_str")] - ConcatHorizontal(String), #[cfg(feature = "regex")] Replace { // negative is replace all @@ -55,56 +57,58 @@ pub enum StringFunction { n: i64, literal: bool, }, - Uppercase, - Lowercase, - #[cfg(feature = "nightly")] - Titlecase, - Strip(Option), + #[cfg(feature = "string_justify")] + RJust { + width: usize, + fillchar: char, + }, RStrip(Option), - LStrip(Option), - #[cfg(feature = "string_from_radix")] - FromRadix(u32, bool), Slice(i64, Option), - Explode, + StartsWith, + Strip(Option), + #[cfg(feature = "temporal")] + Strptime(DataType, StrptimeOptions), #[cfg(feature = "dtype-decimal")] ToDecimal(usize), - #[cfg(feature = "extract_jsonpath")] - JsonExtract { - dtype: Option, - infer_schema_len: Option, - }, + #[cfg(feature = "nightly")] + Titlecase, + Uppercase, + #[cfg(feature = "string_justify")] + Zfill(usize), } impl StringFunction { pub(super) fn get_field(&self, mapper: FieldsMapper) -> PolarsResult { use StringFunction::*; match self { + #[cfg(feature = "concat_str")] + ConcatVertical(_) | ConcatHorizontal(_) => mapper.with_same_dtype(), #[cfg(feature = "regex")] Contains { .. } => mapper.with_dtype(DataType::Boolean), + CountMatch(_) => mapper.with_dtype(DataType::UInt32), EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean), + Explode => mapper.with_same_dtype(), Extract { .. } => mapper.with_same_dtype(), ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::Utf8))), - CountMatch(_) => mapper.with_dtype(DataType::UInt32), - #[cfg(feature = "string_justify")] - Zfill { .. } | LJust { .. } | RJust { .. 
} => mapper.with_same_dtype(), - #[cfg(feature = "temporal")] - Strptime(dtype, _) => mapper.with_dtype(dtype.clone()), - #[cfg(feature = "concat_str")] - ConcatVertical(_) | ConcatHorizontal(_) => mapper.with_same_dtype(), + #[cfg(feature = "string_from_radix")] + FromRadix { .. } => mapper.with_dtype(DataType::Int32), + #[cfg(feature = "extract_jsonpath")] + JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()), + Length => mapper.with_dtype(DataType::UInt32), + NChars => mapper.with_dtype(DataType::UInt32), #[cfg(feature = "regex")] Replace { .. } => mapper.with_same_dtype(), - Uppercase | Lowercase | Strip(_) | LStrip(_) | RStrip(_) | Slice(_, _) => { - mapper.with_same_dtype() - } + #[cfg(feature = "temporal")] + Strptime(dtype, _) => mapper.with_dtype(dtype.clone()), #[cfg(feature = "nightly")] Titlecase => mapper.with_same_dtype(), - #[cfg(feature = "string_from_radix")] - FromRadix { .. } => mapper.with_dtype(DataType::Int32), - Explode => mapper.with_same_dtype(), #[cfg(feature = "dtype-decimal")] ToDecimal(_) => mapper.with_dtype(DataType::Decimal(None, None)), - #[cfg(feature = "extract_jsonpath")] - JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()), + Uppercase | Lowercase | Strip(_) | LStrip(_) | RStrip(_) | Slice(_, _) => { + mapper.with_same_dtype() + } + #[cfg(feature = "string_justify")] + Zfill { .. } | LJust { .. } | RJust { .. } => mapper.with_same_dtype(), } } } @@ -114,42 +118,43 @@ impl Display for StringFunction { let s = match self { #[cfg(feature = "regex")] StringFunction::Contains { .. } => "contains", - StringFunction::StartsWith { .. } => "starts_with", + StringFunction::CountMatch(_) => "count_match", StringFunction::EndsWith { .. } => "ends_with", StringFunction::Extract { .. } => "extract", - #[cfg(feature = "string_justify")] - StringFunction::Zfill(_) => "zfill", + #[cfg(feature = "concat_str")] + StringFunction::ConcatHorizontal(_) => "concat_horizontal", + #[cfg(feature = "concat_str")] + StringFunction::ConcatVertical(_) => "concat_vertical", + StringFunction::Explode => "explode", + StringFunction::ExtractAll => "extract_all", + #[cfg(feature = "string_from_radix")] + StringFunction::FromRadix { .. } => "from_radix", + #[cfg(feature = "extract_jsonpath")] + StringFunction::JsonExtract { .. } => "json_extract", #[cfg(feature = "string_justify")] StringFunction::LJust { .. } => "str.ljust", + StringFunction::LStrip(_) => "lstrip", + StringFunction::Length => "str_lengths", + StringFunction::Lowercase => "lowercase", + StringFunction::NChars => "n_chars", #[cfg(feature = "string_justify")] StringFunction::RJust { .. } => "rjust", - StringFunction::ExtractAll => "extract_all", - StringFunction::CountMatch(_) => "count_match", - #[cfg(feature = "temporal")] - StringFunction::Strptime(_, _) => "strptime", - #[cfg(feature = "concat_str")] - StringFunction::ConcatVertical(_) => "concat_vertical", - #[cfg(feature = "concat_str")] - StringFunction::ConcatHorizontal(_) => "concat_horizontal", + StringFunction::RStrip(_) => "rstrip", #[cfg(feature = "regex")] StringFunction::Replace { .. } => "replace", - StringFunction::Uppercase => "uppercase", - StringFunction::Lowercase => "lowercase", + StringFunction::Slice(_, _) => "str_slice", + StringFunction::StartsWith { .. 
} => "starts_with", + StringFunction::Strip(_) => "strip", + #[cfg(feature = "temporal")] + StringFunction::Strptime(_, _) => "strptime", #[cfg(feature = "nightly")] StringFunction::Titlecase => "titlecase", - StringFunction::Strip(_) => "strip", - StringFunction::LStrip(_) => "lstrip", - StringFunction::RStrip(_) => "rstrip", - #[cfg(feature = "string_from_radix")] - StringFunction::FromRadix { .. } => "from_radix", - StringFunction::Slice(_, _) => "str_slice", - StringFunction::Explode => "explode", #[cfg(feature = "dtype-decimal")] StringFunction::ToDecimal(_) => "to_decimal", - #[cfg(feature = "extract_jsonpath")] - StringFunction::JsonExtract { .. } => "json_extract", + StringFunction::Uppercase => "uppercase", + #[cfg(feature = "string_justify")] + StringFunction::Zfill(_) => "zfill", }; - write!(f, "str.{s}") } } @@ -170,6 +175,16 @@ pub(super) fn titlecase(s: &Series) -> PolarsResult { Ok(ca.to_titlecase().into_series()) } +pub(super) fn n_chars(s: &Series) -> PolarsResult { + let ca = s.utf8()?; + Ok(ca.str_n_chars().into_series()) +} + +pub(super) fn lengths(s: &Series) -> PolarsResult { + let ca = s.utf8()?; + Ok(ca.str_lengths().into_series()) +} + #[cfg(feature = "regex")] pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResult { let ca = &s[0].utf8()?; diff --git a/polars/polars-lazy/polars-plan/src/dsl/string.rs b/polars/polars-lazy/polars-plan/src/dsl/string.rs index 8d878d4be5a9..fe5409a3ef86 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/string.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/string.rs @@ -455,7 +455,7 @@ impl StringNameSpace { } #[cfg(feature = "string_from_radix")] - /// Parse string in base radix into decimal + /// Parse string in base radix into decimal. pub fn from_radix(self, radix: u32, strict: bool) -> Expr { self.0 .map_private(FunctionExpr::StringExpr(StringFunction::FromRadix( @@ -463,6 +463,18 @@ impl StringNameSpace { ))) } + /// Return the number of characters in the string (not bytes). + pub fn n_chars(self) -> Expr { + self.0 + .map_private(FunctionExpr::StringExpr(StringFunction::NChars)) + } + + /// Return the number of bytes in the string (not characters). + pub fn lengths(self) -> Expr { + self.0 + .map_private(FunctionExpr::StringExpr(StringFunction::Length)) + } + /// Slice the string values. 
pub fn str_slice(self, start: i64, length: Option) -> Expr { self.0 diff --git a/polars/polars-sql/src/functions.rs b/polars/polars-sql/src/functions.rs index 34a73e9faaa1..3507becbac7b 100644 --- a/polars/polars-sql/src/functions.rs +++ b/polars/polars-sql/src/functions.rs @@ -161,6 +161,11 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT LEFT(column_1, 3) from df; /// ``` Left, + /// SQL 'length' function (characters) + /// ```sql + /// SELECT LENGTH(column_1) from df; + /// ``` + Length, /// SQL 'lower' function /// ```sql /// SELECT LOWER(column_1) from df; @@ -171,6 +176,11 @@ pub(crate) enum PolarsSqlFunctions { /// SELECT LTRIM(column_1) from df; /// ``` LTrim, + /// SQL 'octet_length' function (bytes) + /// ```sql + /// SELECT OCTET_LENGTH(column_1) from df; + /// ``` + OctetLength, /// SQL 'regexp_like' function /// ```sql /// SELECT REGEXP_LIKE(column_1,'xyz', 'i') from df; @@ -368,6 +378,7 @@ impl PolarsSqlFunctions { "ltrim", "max", "min", + "octet_length", "pow", "radians", "round", @@ -428,9 +439,11 @@ impl TryFrom<&'_ SQLFunction> for PolarsSqlFunctions { // String functions // ---- "ends_with" => Self::EndsWith, + "length" => Self::Length, "left" => Self::Left, "lower" => Self::Lower, "ltrim" => Self::LTrim, + "octet_length" => Self::OctetLength, "regexp_like" => Self::RegexpLike, "rtrim" => Self::RTrim, "starts_with" => Self::StartsWith, @@ -532,6 +545,7 @@ impl SqlFunctionVisitor<'_> { } })) }), + Length => self.visit_unary(|e| e.str().n_chars()), Lower => self.visit_unary(|e| e.str().to_lowercase()), LTrim => match function.args.len() { 1 => self.visit_unary(|e| e.str().lstrip(None)), @@ -541,6 +555,7 @@ impl SqlFunctionVisitor<'_> { function.args.len() ), }, + OctetLength => self.visit_unary(|e| e.str().lengths()), RegexpLike => match function.args.len() { 2 => self.visit_binary(|e, s| e.str().contains(s, true)), 3 => self.try_visit_ternary(|e, pat, flags| { diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index 033d686f6b1e..3dc76dc2c1a9 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -89,27 +89,11 @@ impl PyExpr { } fn str_lengths(&self) -> Self { - let function = |s: Series| { - let ca = s.utf8()?; - Ok(Some(ca.str_lengths().into_series())) - }; - self.clone() - .inner - .map(function, GetOutput::from_type(DataType::UInt32)) - .with_fmt("str.lengths") - .into() + self.inner.clone().str().lengths().into() } fn str_n_chars(&self) -> Self { - let function = |s: Series| { - let ca = s.utf8()?; - Ok(Some(ca.str_n_chars().into_series())) - }; - self.clone() - .inner - .map(function, GetOutput::from_type(DataType::UInt32)) - .with_fmt("str.n_chars") - .into() + self.inner.clone().str().n_chars().into() } #[cfg(feature = "lazy_regex")] diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index 3e2fefe9a1d0..27347cf8edd2 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -682,6 +682,27 @@ def test_sql_round_ndigits_errors() -> None: ctx.execute("SELECT ROUND(n,-1) AS n FROM df") +def test_sql_string_lengths() -> None: + df = pl.DataFrame({"words": ["Café", None, "東京"]}) + + with pl.SQLContext(frame=df) as ctx: + res = ctx.execute( + """ + SELECT + words, + LENGTH(words) AS n_chars, + OCTET_LENGTH(words) AS n_bytes + FROM frame + """ + ).collect() + + assert res.to_dict(False) == { + "words": ["Café", None, "東京"], + "n_chars": [4, None, 2], + "n_bytes": [5, None, 6], + } + + def test_sql_substr() -> None: df = pl.DataFrame( { From 
bae44a0a49ea72a2686451f1ea5a7a19e5a0cb44 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 14 Jul 2023 10:51:46 +0200 Subject: [PATCH 06/37] feat(rust, python): Expr.cat.get_categories expression (#9869) --- .../logical/categorical/builder.rs | 13 ++++++---- polars/polars-lazy/polars-plan/src/dsl/cat.rs | 5 ++++ .../polars-plan/src/dsl/function_expr/cat.rs | 17 ++++++++++++- .../reference/expressions/categories.rst | 1 + .../source/reference/series/categories.rst | 1 + py-polars/polars/expr/categorical.py | 24 +++++++++++++++++++ py-polars/polars/series/categorical.py | 18 ++++++++++++++ py-polars/src/expr/categorical.rs | 4 ++++ .../tests/unit/namespaces/test_categorical.py | 6 +++++ 9 files changed, 84 insertions(+), 5 deletions(-) diff --git a/polars/polars-core/src/chunked_array/logical/categorical/builder.rs b/polars/polars-core/src/chunked_array/logical/categorical/builder.rs index dd1b2d7f112d..93b46dd9475d 100644 --- a/polars/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/polars/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -92,14 +92,19 @@ impl RevMapping { !self.is_global() } - /// Get the length of the [`RevMapping`] - pub fn len(&self) -> usize { + /// Get the categories in this RevMapping + pub fn get_categories(&self) -> &Utf8Array { match self { - Self::Global(_, a, _) => a.len(), - Self::Local(a) => a.len(), + Self::Global(_, a, _) => a, + Self::Local(a) => a, } } + /// Get the length of the [`RevMapping`] + pub fn len(&self) -> usize { + self.get_categories().len() + } + /// Categorical to str pub fn get(&self, idx: u32) -> &str { match self { diff --git a/polars/polars-lazy/polars-plan/src/dsl/cat.rs b/polars/polars-lazy/polars-plan/src/dsl/cat.rs index 60e234acde60..e4a8601e73aa 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/cat.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/cat.rs @@ -20,4 +20,9 @@ impl CategoricalNameSpace { self.0 .map_private(CategoricalFunction::SetOrdering { lexical }.into()) } + + pub fn get_categories(self) -> Expr { + self.0 + .map_private(CategoricalFunction::GetCategories.into()) + } } diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs index 07eeae7cf143..455ae39e8805 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/cat.rs @@ -5,11 +5,16 @@ use crate::map; #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum CategoricalFunction { SetOrdering { lexical: bool }, + GetCategories, } impl CategoricalFunction { pub(super) fn get_field(&self, mapper: FieldsMapper) -> PolarsResult { - mapper.with_dtype(DataType::Boolean) + use CategoricalFunction::*; + match self { + SetOrdering { .. } => mapper.with_same_dtype(), + GetCategories => mapper.with_dtype(DataType::Utf8), + } } } @@ -18,6 +23,7 @@ impl Display for CategoricalFunction { use CategoricalFunction::*; let s = match self { SetOrdering { .. 
} => "set_ordering", + GetCategories => "get_categories", }; write!(f, "{s}") } @@ -28,6 +34,7 @@ impl From for SpecialEq> { use CategoricalFunction::*; match func { SetOrdering { lexical } => map!(set_ordering, lexical), + GetCategories => map!(get_categories), } } } @@ -43,3 +50,11 @@ fn set_ordering(s: &Series, lexical: bool) -> PolarsResult { ca.set_lexical_sorted(lexical); Ok(ca.into_series()) } + +fn get_categories(s: &Series) -> PolarsResult { + // categorical check + let ca = s.categorical()?; + let DataType::Categorical(Some(rev_map)) = ca.dtype() else { unreachable!() }; + let arr = rev_map.get_categories().clone().boxed(); + Series::try_from((ca.name(), arr)) +} diff --git a/py-polars/docs/source/reference/expressions/categories.rst b/py-polars/docs/source/reference/expressions/categories.rst index 674702e23595..cd99c9a91d7a 100644 --- a/py-polars/docs/source/reference/expressions/categories.rst +++ b/py-polars/docs/source/reference/expressions/categories.rst @@ -9,4 +9,5 @@ The following methods are available under the `expr.cat` attribute. :toctree: api/ :template: autosummary/accessor_method.rst + Expr.cat.get_categories Expr.cat.set_ordering diff --git a/py-polars/docs/source/reference/series/categories.rst b/py-polars/docs/source/reference/series/categories.rst index 5b0c1e70e40b..1fd15ba47a32 100644 --- a/py-polars/docs/source/reference/series/categories.rst +++ b/py-polars/docs/source/reference/series/categories.rst @@ -9,4 +9,5 @@ The following methods are available under the `Series.cat` attribute. :toctree: api/ :template: autosummary/accessor_method.rst + Series.cat.get_categories Series.cat.set_ordering diff --git a/py-polars/polars/expr/categorical.py b/py-polars/polars/expr/categorical.py index 7fdc19b01663..ea25036d16e6 100644 --- a/py-polars/polars/expr/categorical.py +++ b/py-polars/polars/expr/categorical.py @@ -55,3 +55,27 @@ def set_ordering(self, ordering: CategoricalOrdering) -> Expr: """ return wrap_expr(self._pyexpr.cat_set_ordering(ordering)) + + def get_categories(self) -> Expr: + """ + Get the categories stored in this data type. + + Examples + -------- + >>> df = pl.Series( + ... "cats", ["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical + ... ).to_frame() + >>> df.select(pl.col("cats").cat.get_categories()) + shape: (3, 1) + ┌──────┐ + │ cats │ + │ --- │ + │ str │ + ╞══════╡ + │ foo │ + │ bar │ + │ ham │ + └──────┘ + + """ + return wrap_expr(self._pyexpr.cat_get_categories()) diff --git a/py-polars/polars/series/categorical.py b/py-polars/polars/series/categorical.py index f19a2ec8c2dd..29720880f94b 100644 --- a/py-polars/polars/series/categorical.py +++ b/py-polars/polars/series/categorical.py @@ -56,3 +56,21 @@ def set_ordering(self, ordering: CategoricalOrdering) -> Series: └──────┴──────┘ """ + + def get_categories(self) -> Series: + """ + Get the categories stored in this data type. 
+ + Examples + -------- + >>> s = pl.Series(["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical) + >>> s.cat.get_categories() + shape: (3,) + Series: '' [str] + [ + "foo" + "bar" + "ham" + ] + + """ diff --git a/py-polars/src/expr/categorical.rs b/py-polars/src/expr/categorical.rs index 8bbfe0b752ec..80e85e97315e 100644 --- a/py-polars/src/expr/categorical.rs +++ b/py-polars/src/expr/categorical.rs @@ -9,4 +9,8 @@ impl PyExpr { fn cat_set_ordering(&self, ordering: Wrap) -> Self { self.inner.clone().cat().set_ordering(ordering.0).into() } + + fn cat_get_categories(&self) -> Self { + self.inner.clone().cat().get_categories().into() + } } diff --git a/py-polars/tests/unit/namespaces/test_categorical.py b/py-polars/tests/unit/namespaces/test_categorical.py index 1159c2160a30..b2c2a5157864 100644 --- a/py-polars/tests/unit/namespaces/test_categorical.py +++ b/py-polars/tests/unit/namespaces/test_categorical.py @@ -72,3 +72,9 @@ def test_sort_categoricals_6014() -> None: assert out.to_dict(False) == {"key": ["bbb", "aaa", "ccc"]} out = df2.sort("key") assert out.to_dict(False) == {"key": ["aaa", "bbb", "ccc"]} + + +def test_categorical_get_categories() -> None: + assert pl.Series( + "cats", ["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical + ).cat.get_categories().to_list() == ["foo", "bar", "ham"] From f83e277d78da3238176de2027f23df0fef42565d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 14 Jul 2023 13:47:09 +0200 Subject: [PATCH 07/37] fix(rust, python): fmt unknown dtype (#9872) --- polars/polars-core/src/datatypes/dtype.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/polars-core/src/datatypes/dtype.rs b/polars/polars-core/src/datatypes/dtype.rs index cd3adc9c5f0d..57245791fb1c 100644 --- a/polars/polars-core/src/datatypes/dtype.rs +++ b/polars/polars-core/src/datatypes/dtype.rs @@ -332,7 +332,7 @@ impl Display for DataType { DataType::Categorical(_) => "cat", #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()), - DataType::Unknown => unreachable!(), + DataType::Unknown => "unknown", }; f.write_str(s) } From 5937c2251d782931e751ce025ebb39f2f5a91ab2 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 14 Jul 2023 14:43:52 +0200 Subject: [PATCH 08/37] feat(rust, python): allow set_sorted in streaming (#9876) --- polars/polars-core/src/series/series_trait.rs | 3 +++ .../polars-plan/src/dsl/function_expr/dispatch.rs | 6 ++++++ .../polars-lazy/polars-plan/src/dsl/function_expr/mod.rs | 3 +++ .../polars-plan/src/dsl/function_expr/schema.rs | 1 + polars/polars-lazy/polars-plan/src/dsl/mod.rs | 8 +------- polars/polars-lazy/src/physical_plan/streaming/checks.rs | 4 ++++ 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/polars/polars-core/src/series/series_trait.rs b/polars/polars-core/src/series/series_trait.rs index 95d04c772b2d..5a5871720861 100644 --- a/polars/polars-core/src/series/series_trait.rs +++ b/polars/polars-core/src/series/series_trait.rs @@ -4,6 +4,8 @@ use std::borrow::Cow; use std::sync::Arc; use polars_arrow::prelude::QuantileInterpolOptions; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; #[cfg(feature = "object")] use crate::chunked_array::object::PolarsObjectSafe; @@ -11,6 +13,7 @@ pub use crate::prelude::ChunkCompare; use crate::prelude::*; #[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum IsSorted { Ascending, Descending, diff --git 
a/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs
index 831eb2bbff8c..5ca34b389efa 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/dispatch.rs
@@ -26,3 +26,9 @@ pub(super) fn interpolate(s: &Series, method: InterpolationMethod) -> PolarsResu
 pub(super) fn to_physical(s: &Series) -> PolarsResult<Series> {
     Ok(s.to_physical_repr().into_owned())
 }
+
+pub(super) fn set_sorted_flag(s: &Series, sorted: IsSorted) -> PolarsResult<Series> {
+    let mut s = s.clone();
+    s.set_sorted_flag(sorted);
+    Ok(s)
+}
diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
index 3ffc1abe0380..0493c231a429 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
@@ -230,6 +230,7 @@ pub enum FunctionExpr {
         seed: Option<u64>,
         fixed_seed: bool,
     },
+    SetSortedFlag(IsSorted),
 }
 
 impl Display for FunctionExpr {
@@ -335,6 +336,7 @@ impl Display for FunctionExpr {
             ToPhysical => "to_physical",
             #[cfg(feature = "random")]
             Random { method, .. } => method.into(),
+            SetSortedFlag(_) => "set_sorted",
         };
         write!(f, "{s}")
     }
@@ -607,6 +609,7 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
                 seed,
                 fixed_seed
             ),
+            SetSortedFlag(sorted) => map!(dispatch::set_sorted_flag, sorted),
         }
     }
 }
diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs
index 00a031d3d63f..fc0f414b4c77 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs
@@ -250,6 +250,7 @@ impl FunctionExpr {
             ToPhysical => mapper.to_physical_type(),
             #[cfg(feature = "random")]
             Random { .. } => mapper.with_same_dtype(),
+            SetSortedFlag(_) => mapper.with_same_dtype(),
         }
     }
 }
diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs
index 5698a956f740..05d7923a3b82 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs
@@ -1746,13 +1746,7 @@ impl Expr {
     /// This can lead to incorrect results if this `Series` is not sorted!!
     /// Use with care!
     pub fn set_sorted_flag(self, sorted: IsSorted) -> Expr {
-        self.apply(
-            move |mut s| {
-                s.set_sorted_flag(sorted);
-                Ok(Some(s))
-            },
-            GetOutput::same_type(),
-        )
+        self.apply_private(FunctionExpr::SetSortedFlag(sorted))
     }
diff --git a/polars/polars-lazy/src/physical_plan/streaming/checks.rs b/polars/polars-lazy/src/physical_plan/streaming/checks.rs
index 70c00053ef02..47d51f2067ee 100644
--- a/polars/polars-lazy/src/physical_plan/streaming/checks.rs
+++ b/polars/polars-lazy/src/physical_plan/streaming/checks.rs
@@ -21,6 +21,10 @@ pub(super) fn is_streamable(node: Node, expr_arena: &Arena<AExpr>, context: Cont
     let mut seen_column = false;
     let mut seen_lit_range = false;
     let all = expr_arena.iter(node).all(|(_, ae)| match ae {
+        AExpr::Function {
+            function: FunctionExpr::SetSortedFlag(_),
+            ..
+        } => true,
         AExpr::Function { options, .. } | AExpr::AnonymousFunction { options, ..
} => match context { Context::Default => matches!( From e14238305900fd776c0684f7c2a1530d1e5d8918 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 14 Jul 2023 14:45:07 +0200 Subject: [PATCH 09/37] fix(python,rust,cli): preserve expression aliases when parsing SQL with `pl.sql_expr` (#9875) --- polars/polars-sql/src/functions.rs | 3 ++- polars/polars-sql/src/sql_expr.rs | 16 +++++++++++----- py-polars/tests/unit/test_sql.py | 14 ++++++++++++-- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/polars/polars-sql/src/functions.rs b/polars/polars-sql/src/functions.rs index 3507becbac7b..70a75718a12e 100644 --- a/polars/polars-sql/src/functions.rs +++ b/polars/polars-sql/src/functions.rs @@ -380,6 +380,7 @@ impl PolarsSqlFunctions { "min", "octet_length", "pow", + "power", "radians", "round", "rtrim", @@ -432,7 +433,7 @@ impl TryFrom<&'_ SQLFunction> for PolarsSqlFunctions { "log10" => Self::Log10, "log1p" => Self::Log1p, "log2" => Self::Log2, - "pow" => Self::Pow, + "pow" | "power" => Self::Pow, "round" => Self::Round, // ---- diff --git a/polars/polars-sql/src/sql_expr.rs b/polars/polars-sql/src/sql_expr.rs index 8450c4d9d1db..85e99ff8dcc3 100644 --- a/polars/polars-sql/src/sql_expr.rs +++ b/polars/polars-sql/src/sql_expr.rs @@ -5,8 +5,8 @@ use polars_lazy::prelude::*; use polars_plan::prelude::{col, lit, when}; use sqlparser::ast::{ ArrayAgg, BinaryOperator as SQLBinaryOperator, BinaryOperator, DataType as SQLDataType, - Expr as SqlExpr, Function as SQLFunction, JoinConstraint, OrderByExpr, TrimWhereField, - UnaryOperator, Value as SqlValue, + Expr as SqlExpr, Function as SQLFunction, JoinConstraint, OrderByExpr, SelectItem, + TrimWhereField, UnaryOperator, Value as SqlValue, }; use sqlparser::dialect::GenericDialect; use sqlparser::parser::{Parser, ParserOptions}; @@ -532,8 +532,14 @@ pub fn sql_expr>(s: S) -> PolarsResult { }); let mut ast = parser.try_with_sql(s.as_ref()).map_err(to_compute_err)?; + let expr = ast.parse_select_item().map_err(to_compute_err)?; - let expr = ast.parse_expr().map_err(to_compute_err)?; - - parse_sql_expr(&expr, &ctx) + Ok(match &expr { + SelectItem::ExprWithAlias { expr, alias } => { + let expr = parse_sql_expr(expr, &ctx)?; + expr.alias(&alias.value) + } + SelectItem::UnnamedExpr(expr) => parse_sql_expr(expr, &ctx)?, + _ => polars_bail!(InvalidOperation: "Unable to parse '{}' as Expr", s.as_ref()), + }) } diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index 27347cf8edd2..d2c9869ce718 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -773,7 +773,17 @@ def test_sql_expr() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["xyz", "abcde", None]}) sql_exprs = ( pl.sql_expr("MIN(a)"), - pl.sql_expr("SUBSTR(b,1,2)"), + pl.sql_expr("POWER(a,a) AS aa"), + pl.sql_expr("SUBSTR(b,1,2) AS b2"), + ) + expected = pl.DataFrame( + {"a": [1, 1, 1], "aa": [1, 4, 27], "b2": ["yz", "bc", None]} ) - expected = pl.DataFrame({"a": [1, 1, 1], "b": ["yz", "bc", None]}) assert df.select(sql_exprs).frame_equal(expected) + + # expect expressions that can't reasonably be parsed as expressions to raise + # (for example: those that explicitly reference tables and/or use wildcards) + with pytest.raises( + pl.InvalidOperationError, match=r"Unable to parse 'xyz\.\*' as Expr" + ): + pl.sql_expr("xyz.*") From cb04b5be178139b62bd4c6e06ca96aa6e8e7fe91 Mon Sep 17 00:00:00 2001 From: chielP Date: Fri, 14 Jul 2023 16:47:33 +0200 Subject: [PATCH 10/37] fix(rust): Allow None as exponent (#9880) --- 
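A minimal sketch of the user-facing behavior this fix targets (placed here, after the
`---`, so `git am` ignores it; the series name is illustrative, and the expected output
follows the test added to py-polars/tests/unit/test_series.py in this patch):

    import polars as pl

    a = pl.Series([1, 2])

    # Previously a null exponent raised ComputeError ("exponent is null").
    # After this fix it broadcasts to an all-null Float64 result:
    print(a ** None)  # -> [null, null], dtype: Float64
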
.../polars-lazy/polars-plan/src/dsl/function_expr/pow.rs | 8 ++++---- py-polars/polars/expr/expr.py | 2 +- py-polars/polars/series/series.py | 4 ++-- py-polars/tests/unit/test_series.py | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs index 09f7298ac301..bf8a7e92e0b0 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/pow.rs @@ -12,13 +12,13 @@ where ChunkedArray: IntoSeries, { let dtype = T::get_dtype(); - let exponent = exponent.cast(&dtype)?; + let exponent = exponent.strict_cast(&dtype)?; let exponent = base.unpack_series_matching_type(&exponent).unwrap(); if exponent.len() == 1 { - let exponent_value = exponent - .get(0) - .ok_or_else(|| polars_err!(ComputeError: "exponent is null"))?; + let Some(exponent_value) = exponent.get(0) else { + return Ok(Some(Series::full_null(base.name(), base.len(), &dtype))) + }; let s = match exponent_value.to_f64().unwrap() { a if a == 1.0 => base.clone().into_series(), // specialized sqrt will ensure (-inf)^0.5 = NaN diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 7fe6103e859c..768bed40458d 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -4568,7 +4568,7 @@ def truediv(self, other: Any) -> Self: """ return self.__truediv__(other) - def pow(self, exponent: int | float | Series | Expr) -> Self: + def pow(self, exponent: int | float | None | Series | Expr) -> Self: """ Method equivalent of exponentiation operator ``expr ** exponent``. diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index efff8054ef61..b36da7732e86 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -828,7 +828,7 @@ def __rmul__(self, other: Any) -> Series: raise ValueError("first cast to integer before multiplying datelike dtypes") return self._arithmetic(other, "mul", "mul_<>") - def __pow__(self, exponent: int | float | Series) -> Series: + def __pow__(self, exponent: int | float | None | Series) -> Series: return self.pow(exponent) def __rpow__(self, other: Any) -> Series: @@ -1421,7 +1421,7 @@ def product(self) -> int | float: """Reduce this Series to the product value.""" return self.to_frame().select(F.col(self.name).product()).to_series().item() - def pow(self, exponent: int | float | Series) -> Series: + def pow(self, exponent: int | float | None | Series) -> Series: """ Raise to the power of the given exponent. 
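The expression-level method accepts the same widened type, per the annotation
change to `Expr.pow` above. A minimal sketch (output shape assumed from the
Series behaviour; only the Series case is covered by the test below):

    import polars as pl

    df = pl.DataFrame({"a": [1.0, 2.0]})

    # The Expr method now also accepts None, matching the Series behaviour,
    # so this should produce a column of nulls rather than raising.
    print(df.select(pl.col("a").pow(None)))
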
diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py index 715150f5b5f0..5aa1215934b1 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/test_series.py @@ -382,6 +382,7 @@ def test_power() -> None: assert_series_equal(a**a, pl.Series([1.0, 4.0], dtype=Float64)) assert_series_equal(b**b, pl.Series([None, 4.0], dtype=Float64)) assert_series_equal(a**b, pl.Series([None, 4.0], dtype=Float64)) + assert_series_equal(a**None, pl.Series([None] * len(a), dtype=Float64)) with pytest.raises(ValueError): c**2 with pytest.raises(pl.ColumnNotFoundError): From 9291ee10fb84335ec8a5aa55c3f994cbfbf0b2cb Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Fri, 14 Jul 2023 17:11:08 +0200 Subject: [PATCH 11/37] depr(python): Deprecate functions series input (#9878) --- py-polars/polars/functions/lazy.py | 78 ++++++++++++------- .../tests/unit/functions/test_functions.py | 42 +++++++--- py-polars/tests/unit/test_df.py | 14 ---- 3 files changed, 85 insertions(+), 49 deletions(-) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index ae0337ecb9b4..f970b4f36c9d 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import warnings from datetime import date, datetime, time, timedelta from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence, overload @@ -26,6 +27,7 @@ _time_to_pl_time, _timedelta_to_pl_timedelta, ) +from polars.utils.various import find_stacklevel with contextlib.suppress(ImportError): # Module not available when building docs import polars.polars as plr @@ -320,6 +322,11 @@ def count(column: str | Series | None = None) -> Expr | int: return wrap_expr(plr.count()) if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `count` is deprecated. Use `Series.len()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.len() return col(column).count() @@ -377,6 +384,11 @@ def std(column: str | Series, ddof: int = 1) -> Expr | float | None: """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `std` is deprecated. Use `Series.std()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.std(ddof) return col(column).std(ddof) @@ -421,6 +433,11 @@ def var(column: str | Series, ddof: int = 1) -> Expr | float | None: """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `var` is deprecated. Use `Series.var()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.var(ddof) return col(column).var(ddof) @@ -451,11 +468,14 @@ def mean(column: str | Series) -> Expr | float | None: ╞═════╡ │ 4.0 │ └─────┘ - >>> pl.mean(df["a"]) - 4.0 """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `mean` is deprecated. Use `Series.mean()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.mean() return col(column).mean() @@ -486,8 +506,6 @@ def avg(column: str | Series) -> Expr | float: ╞═════╡ │ 4.0 │ └─────┘ - >>> pl.avg(df["a"]) - 4.0 """ return mean(column) @@ -519,11 +537,14 @@ def median(column: str | Series) -> Expr | float | int | None: ╞═════╡ │ 3.0 │ └─────┘ - >>> pl.median(df["a"]) - 3.0 """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `median` is deprecated. 
Use `Series.median()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.median() return col(column).median() @@ -554,11 +575,14 @@ def n_unique(column: str | Series) -> Expr | int: ╞═════╡ │ 2 │ └─────┘ - >>> pl.n_unique(df["a"]) - 2 """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `n_unique` is deprecated. Use `Series.n_unique()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.n_unique() return col(column).n_unique() @@ -643,14 +667,17 @@ def first(column: str | Series | None = None) -> Expr | Any: ╞═════╡ │ 1 │ └─────┘ - >>> pl.first(df["a"]) - 1 """ if column is None: return wrap_expr(plr.first()) if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `first` is deprecated. Use `series[0]` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) if column.len() > 0: return column[0] else: @@ -706,14 +733,17 @@ def last(column: str | Series | None = None) -> Expr: ╞═════╡ │ 3 │ └─────┘ - >>> pl.last(df["a"]) - 3 """ if column is None: return wrap_expr(plr.last()) if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `last` is deprecated. Use `series[-1]` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) if column.len() > 0: return column[-1] else: @@ -766,16 +796,14 @@ def head(column: str | Series, n: int = 10) -> Expr | Series: │ 1 │ │ 8 │ └─────┘ - >>> pl.head(df["a"], 2) - shape: (2,) - Series: 'a' [i64] - [ - 1 - 8 - ] """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `head` is deprecated. Use `Series.head()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.head(n) return col(column).head(n) @@ -825,16 +853,14 @@ def tail(column: str | Series, n: int = 10) -> Expr | Series: │ 8 │ │ 3 │ └─────┘ - >>> pl.tail(df["a"], 2) - shape: (2,) - Series: 'a' [i64] - [ - 8 - 3 - ] """ if isinstance(column, pl.Series): + warnings.warn( + "passing a Series to `tail` is deprecated. 
Use `Series.tail()` instead.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) return column.tail(n) return col(column).tail(n) diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py index 464f551aa0d7..cc5d0cca9595 100644 --- a/py-polars/tests/unit/functions/test_functions.py +++ b/py-polars/tests/unit/functions/test_functions.py @@ -7,7 +7,7 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal +from polars.testing import assert_frame_equal, assert_series_equal def test_concat_align() -> None: @@ -375,7 +375,8 @@ def test_lazy_functions() -> None: df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) out = df.select(pl.count("a")) assert list(out["a"]) == [3] - assert pl.count(df["a"]) == 3 + with pytest.deprecated_call(): + assert pl.count(df["a"]) == 3 out = df.select( [ pl.var("b").alias("1"), @@ -392,10 +393,12 @@ def test_lazy_functions() -> None: ) expected = 1.0 assert np.isclose(out.to_series(0), expected) - assert np.isclose(pl.var(df["b"]), expected) # type: ignore[arg-type] + with pytest.deprecated_call(): + assert np.isclose(pl.var(df["b"]), expected) # type: ignore[arg-type] expected = 1.0 assert np.isclose(out.to_series(1), expected) - assert np.isclose(pl.std(df["b"]), expected) # type: ignore[arg-type] + with pytest.deprecated_call(): + assert np.isclose(pl.std(df["b"]), expected) # type: ignore[arg-type] expected = 3 assert np.isclose(out.to_series(2), expected) with pytest.deprecated_call(): @@ -410,19 +413,24 @@ def test_lazy_functions() -> None: assert np.isclose(pl.sum(df["b"]), expected) expected = 2 assert np.isclose(out.to_series(5), expected) - assert np.isclose(pl.mean(df["b"]), expected) + with pytest.deprecated_call(): + assert np.isclose(pl.mean(df["b"]), expected) expected = 2 assert np.isclose(out.to_series(6), expected) - assert np.isclose(pl.median(df["b"]), expected) + with pytest.deprecated_call(): + assert np.isclose(pl.median(df["b"]), expected) expected = 3 assert np.isclose(out.to_series(7), expected) - assert np.isclose(pl.n_unique(df["b"]), expected) + with pytest.deprecated_call(): + assert np.isclose(pl.n_unique(df["b"]), expected) expected = 1 assert np.isclose(out.to_series(8), expected) - assert np.isclose(pl.first(df["b"]), expected) + with pytest.deprecated_call(): + assert np.isclose(pl.first(df["b"]), expected) expected = 3 assert np.isclose(out.to_series(9), expected) - assert np.isclose(pl.last(df["b"]), expected) + with pytest.deprecated_call(): + assert np.isclose(pl.last(df["b"]), expected) # regex selection out = df.select( @@ -435,3 +443,19 @@ def test_lazy_functions() -> None: assert out.rows() == [ ({"a": "foo", "b": 3}, {"b": 1, "c": 1.0}, {"a": None, "c": 6.0}) ] + + +def test_head_tail(fruits_cars: pl.DataFrame) -> None: + res_expr = fruits_cars.select([pl.head("A", 2)]) + with pytest.deprecated_call(): + res_series = pl.head(fruits_cars["A"], 2) + expected = pl.Series("A", [1, 2]) + assert_series_equal(res_expr.to_series(0), expected) + assert_series_equal(res_series, expected) + + res_expr = fruits_cars.select([pl.tail("A", 2)]) + with pytest.deprecated_call(): + res_series = pl.tail(fruits_cars["A"], 2) + expected = pl.Series("A", [4, 5]) + assert_series_equal(res_expr.to_series(0), expected) + assert_series_equal(res_series, expected) diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py index 366152fa1152..6f1ac0d0e019 100644 --- a/py-polars/tests/unit/test_df.py +++ 
b/py-polars/tests/unit/test_df.py
@@ -2967,20 +2967,6 @@ def test_fill_null_limits() -> None:
     }
 
 
-def test_head_tail(fruits_cars: pl.DataFrame) -> None:
-    res_expr = fruits_cars.select([pl.head("A", 2)])
-    res_series = pl.head(fruits_cars["A"], 2)
-    expected = pl.Series("A", [1, 2])
-    assert_series_equal(res_expr.to_series(0), expected)
-    assert_series_equal(res_series, expected)
-
-    res_expr = fruits_cars.select([pl.tail("A", 2)])
-    res_series = pl.tail(fruits_cars["A"], 2)
-    expected = pl.Series("A", [4, 5])
-    assert_series_equal(res_expr.to_series(0), expected)
-    assert_series_equal(res_series, expected)
-
-
 def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:
     res_expr = fruits_cars.select(pl.col("A").lower_bound())
     assert res_expr.item() == -9223372036854775808

From eb2797e90802ea59aa7fa917b73787d0c35aa745 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Fri, 14 Jul 2023 19:00:13 +0100
Subject: [PATCH 12/37] docs(python): note ordering guarantee for groupby
 (#9879)

Co-authored-by: Stijn de Gooijer
---
 py-polars/polars/dataframe/frame.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 41c59e2643f5..8e7b11ebd046 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -4641,6 +4641,15 @@ def groupby(
             Setting this to ``True`` prevents the query from
             running on the streaming engine.
 
+        .. note::
+            Within each group, the order of rows is always preserved, regardless
+            of this argument.
+
+        Returns
+        -------
+        GroupBy
+            Object which can be used to perform aggregations.
+
         Examples
         --------
         Group by one column and call ``agg`` to compute the grouped sum of another

From cde0be24e8f7c0e450dd7b2deae001cd52973916 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Fri, 14 Jul 2023 20:26:34 +0200
Subject: [PATCH 13/37] feat(rust, python): respect and allow more options in
 eager json parsing (#9882)

---
 polars/polars-core/src/schema.rs              |   7 ++
 polars/polars-io/src/json/mod.rs              | 104 +++++++++++++-----
 polars/polars-io/src/ndjson/buffer.rs         |  44 ++++++--
 polars/polars-io/src/ndjson/core.rs           |  38 +++++--
 polars/polars-io/src/utils.rs                 |  11 ++
 .../physical_plan/executors/scan/ndjson.rs    |   2 +-
 py-polars/polars/dataframe/frame.py           |  28 ++++-
 py-polars/polars/io/json.py                   |  26 ++++-
 py-polars/polars/io/ndjson.py                 |  32 +++++-
 py-polars/src/dataframe.rs                    |  89 +++++++++------
 py-polars/tests/unit/io/test_json.py          |  47 ++++++++
 11 files changed, 338 insertions(+), 90 deletions(-)

diff --git a/polars/polars-core/src/schema.rs b/polars/polars-core/src/schema.rs
index dd05c02e0dea..caac67990add 100644
--- a/polars/polars-core/src/schema.rs
+++ b/polars/polars-core/src/schema.rs
@@ -195,6 +195,13 @@ impl Schema {
             .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
     }
 
+    /// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist
+    pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut DataType> {
+        self.inner
+            .get_mut(name)
+            .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
+    }
+
     /// Return all data about the field named `name`: its index in the schema, its name, and its dtype
     ///
     /// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't.
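For orientation before the reader changes below, a minimal sketch of how the
new options surface in Python (argument names as added to `read_ndjson` in
this patch; the sample data is illustrative only):

    import io

    import polars as pl

    jsonl = b'{"a": 1, "b": "x"}\n{"a": 2, "b": null}\n'

    # "a" is forced to Int32 via schema_overrides while "b" is inferred;
    # with ignore_errors=True, values that cannot be parsed under the
    # resulting schema become null instead of raising.
    df = pl.read_ndjson(
        io.BytesIO(jsonl),
        schema_overrides={"a": pl.Int32},
        ignore_errors=True,
    )
    print(df.schema)  # expected: {'a': Int32, 'b': Utf8}
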
diff --git a/polars/polars-io/src/json/mod.rs b/polars/polars-io/src/json/mod.rs index 76cb6b452c10..79a246d5637b 100644 --- a/polars/polars-io/src/json/mod.rs +++ b/polars/polars-io/src/json/mod.rs @@ -157,20 +157,22 @@ where /// Reads JSON in one of the formats in [`JsonFormat`] into a DataFrame. #[must_use] -pub struct JsonReader +pub struct JsonReader<'a, R> where R: MmapBytesReader, { reader: R, rechunk: bool, + ignore_errors: bool, infer_schema_len: Option, batch_size: usize, projection: Option>, - schema: Option, + schema: Option, + schema_overwrite: Option<&'a Schema>, json_format: JsonFormat, } -impl SerReader for JsonReader +impl<'a, R> SerReader for JsonReader<'a, R> where R: MmapBytesReader, { @@ -178,10 +180,12 @@ where JsonReader { reader, rechunk: true, + ignore_errors: false, infer_schema_len: Some(100), batch_size: 8192, projection: None, schema: None, + schema_overwrite: None, json_format: JsonFormat::Json, } } @@ -201,32 +205,63 @@ where let out = match self.json_format { JsonFormat::Json => { + polars_ensure!(!self.ignore_errors, InvalidOperation: "'ignore_errors' only supported in ndjson"); let mut bytes = rb.deref().to_vec(); let json_value = simd_json::to_borrowed_value(&mut bytes).map_err(to_compute_err)?; - // likely struct type - let dtype = if let BorrowedValue::Array(values) = &json_value { - // struct types may have missing fields so find supertype - let dtype = values - .iter() - .take(self.infer_schema_len.unwrap_or(usize::MAX)) - .map(|value| { - infer(value) - .map_err(PolarsError::from) - .map(|dt| DataType::from(&dt)) - }) - .fold_first_(|l, r| { - let l = l?; - let r = r?; - try_get_supertype(&l, &r) - }) - .unwrap()?; - let dtype = DataType::List(Box::new(dtype)); - dtype.to_arrow() + // struct type + let dtype = if let Some(mut schema) = self.schema { + if let Some(overwrite) = self.schema_overwrite { + let mut_schema = Arc::make_mut(&mut schema); + overwrite_schema(mut_schema, overwrite)?; + } + DataType::Struct(schema.iter_fields().collect()).to_arrow() } else { - infer(&json_value)? 
+ // infer + if let BorrowedValue::Array(values) = &json_value { + polars_ensure!(self.schema_overwrite.is_none() && self.schema.is_none(), ComputeError: "schema arguments not yet supported for Array json"); + + // struct types may have missing fields so find supertype + let dtype = values + .iter() + .take(self.infer_schema_len.unwrap_or(usize::MAX)) + .map(|value| { + infer(value) + .map_err(PolarsError::from) + .map(|dt| DataType::from(&dt)) + }) + .fold_first_(|l, r| { + let l = l?; + let r = r?; + try_get_supertype(&l, &r) + }) + .unwrap()?; + let dtype = DataType::List(Box::new(dtype)); + dtype.to_arrow() + } else { + let dtype = infer(&json_value)?; + if let Some(overwrite) = self.schema_overwrite { + let ArrowDataType::Struct(fields) = dtype else { + polars_bail!(ComputeError: "can only deserialize json objects") + }; + + let mut schema = Schema::from_iter(fields.iter()); + overwrite_schema(&mut schema, overwrite)?; + + DataType::Struct( + schema + .into_iter() + .map(|(name, dt)| Field::new(&name, dt)) + .collect(), + ) + .to_arrow() + } else { + dtype + } + } }; + let arr = polars_json::json::deserialize(&json_value, dtype)?; let arr = arr.as_any().downcast_ref::().ok_or_else( || polars_err!(ComputeError: "can only deserialize json objects"), @@ -237,12 +272,14 @@ where let mut json_reader = CoreJsonReader::new( rb, None, - None, + self.schema, + self.schema_overwrite, None, 1024, // sample size 1 << 18, false, self.infer_schema_len, + self.ignore_errors, )?; let mut df: DataFrame = json_reader.as_df()?; if self.rechunk { @@ -252,6 +289,7 @@ where } }?; + // TODO! Ensure we don't materialize the columns we don't need if let Some(proj) = &self.projection { out.select(proj) } else { @@ -260,13 +298,19 @@ where } } -impl JsonReader +impl<'a, R> JsonReader<'a, R> where R: MmapBytesReader, { /// Set the JSON file's schema - pub fn with_schema(mut self, schema: &Schema) -> Self { - self.schema = Some(schema.to_arrow()); + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + + /// Overwrite parts of the inferred schema. + pub fn with_schema_overwrite(mut self, schema: &'a Schema) -> Self { + self.schema_overwrite = Some(schema); self } @@ -305,4 +349,10 @@ where self.json_format = format; self } + + /// Return a `null` if an error occurs during parsing. 
+ pub fn with_ignore_errors(mut self, ignore: bool) -> Self { + self.ignore_errors = ignore; + self + } } diff --git a/polars/polars-io/src/ndjson/buffer.rs b/polars/polars-io/src/ndjson/buffer.rs index c5f9635a2168..654190777ea5 100644 --- a/polars/polars-io/src/ndjson/buffer.rs +++ b/polars/polars-io/src/ndjson/buffer.rs @@ -20,19 +20,23 @@ impl<'a> Hash for BufferKey<'a> { } } -pub(crate) struct Buffer<'a>(&'a str, AnyValueBuffer<'a>); +pub(crate) struct Buffer<'a> { + name: &'a str, + ignore_errors: bool, + buf: AnyValueBuffer<'a>, +} impl Buffer<'_> { pub fn into_series(self) -> Series { - let mut s = self.1.into_series(); - s.rename(self.0); + let mut s = self.buf.into_series(); + s.rename(self.name); s } #[inline] pub(crate) fn add(&mut self, value: &Value) -> PolarsResult<()> { use AnyValueBuffer::*; - match &mut self.1 { + match &mut self.buf { Boolean(buf) => { match value { Value::Static(StaticNode::Bool(b)) => buf.append_value(*b), @@ -109,7 +113,7 @@ impl Buffer<'_> { Ok(()) } All(dtype, buf) => { - let av = deserialize_all(value, dtype)?; + let av = deserialize_all(value, dtype, self.ignore_errors)?; buf.push(av); Ok(()) } @@ -117,19 +121,27 @@ impl Buffer<'_> { } } pub fn add_null(&mut self) { - self.1.add(AnyValue::Null).expect("should not fail"); + self.buf.add(AnyValue::Null).expect("should not fail"); } } pub(crate) fn init_buffers( schema: &Schema, capacity: usize, + ignore_errors: bool, ) -> PolarsResult> { schema .iter() .map(|(name, dtype)| { let av_buf = (dtype, capacity).into(); let key = KnownKey::from(name.as_str()); - Ok((BufferKey(key), Buffer(name, av_buf))) + Ok(( + BufferKey(key), + Buffer { + name, + buf: av_buf, + ignore_errors, + }, + )) }) .collect() } @@ -163,7 +175,11 @@ where }) } -fn deserialize_all<'a>(json: &Value, dtype: &DataType) -> PolarsResult> { +fn deserialize_all<'a>( + json: &Value, + dtype: &DataType, + ignore_errors: bool, +) -> PolarsResult> { let out = match json { Value::Static(StaticNode::Bool(b)) => AnyValue::Boolean(*b), Value::Static(StaticNode::I64(i)) => AnyValue::Int64(*i), @@ -173,11 +189,14 @@ fn deserialize_all<'a>(json: &Value, dtype: &DataType) -> PolarsResult AnyValue::Utf8Owned(s.as_ref().into()), Value::Array(arr) => { let Some(inner_dtype) = dtype.inner_dtype() else { + if ignore_errors { + return Ok(AnyValue::Null) + } polars_bail!(ComputeError: "expected list/array in json value, got {}", dtype); }; let vals: Vec = arr .iter() - .map(|val| deserialize_all(val, inner_dtype)) + .map(|val| deserialize_all(val, inner_dtype, ignore_errors)) .collect::>()?; let s = Series::from_any_values_and_dtype("", &vals, inner_dtype, false)?; AnyValue::List(s) @@ -191,7 +210,7 @@ fn deserialize_all<'a>(json: &Value, dtype: &DataType) -> PolarsResult(json: &Value, dtype: &DataType) -> PolarsResult>>()?; AnyValue::StructOwned(Box::new((vals, fields.clone()))) } else { + if ignore_errors { + return Ok(AnyValue::Null); + } polars_bail!( - ComputeError: "expected {dtype} in json value, got object", + ComputeError: "expected {} in json value, got object", dtype, ); } } diff --git a/polars/polars-io/src/ndjson/core.rs b/polars/polars-io/src/ndjson/core.rs index 79a11501f20d..9417967e1e0e 100644 --- a/polars/polars-io/src/ndjson/core.rs +++ b/polars/polars-io/src/ndjson/core.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::fs::File; use std::io::Cursor; use std::path::PathBuf; @@ -30,9 +29,11 @@ where n_threads: Option, infer_schema_len: Option, chunk_size: usize, - schema: Option<&'a Schema>, + schema: Option, + schema_overwrite: 
Option<&'a Schema>, path: Option, low_memory: bool, + ignore_errors: bool, } impl<'a, R> JsonLineReader<'a, R> @@ -43,10 +44,16 @@ where self.n_rows = num_rows; self } - pub fn with_schema(mut self, schema: &'a Schema) -> Self { + pub fn with_schema(mut self, schema: SchemaRef) -> Self { self.schema = Some(schema); self } + + pub fn with_schema_overwrite(mut self, schema: &'a Schema) -> Self { + self.schema_overwrite = Some(schema); + self + } + pub fn with_rechunk(mut self, rechunk: bool) -> Self { self.rechunk = rechunk; self @@ -102,9 +109,11 @@ where n_threads: None, infer_schema_len: Some(128), schema: None, + schema_overwrite: None, path: None, chunk_size: 1 << 18, low_memory: false, + ignore_errors: false, } } fn finish(mut self) -> PolarsResult { @@ -114,11 +123,13 @@ where reader_bytes, self.n_rows, self.schema, + self.schema_overwrite, self.n_threads, 1024, // sample size self.chunk_size, self.low_memory, self.infer_schema_len, + self.ignore_errors, )?; let mut df: DataFrame = json_reader.as_df()?; @@ -132,28 +143,31 @@ where pub(crate) struct CoreJsonReader<'a> { reader_bytes: Option>, n_rows: Option, - schema: Cow<'a, Schema>, + schema: SchemaRef, n_threads: Option, sample_size: usize, chunk_size: usize, low_memory: bool, + ignore_errors: bool, } impl<'a> CoreJsonReader<'a> { #[allow(clippy::too_many_arguments)] pub(crate) fn new( reader_bytes: ReaderBytes<'a>, n_rows: Option, - schema: Option<&'a Schema>, + schema: Option, + schema_overwrite: Option<&Schema>, n_threads: Option, sample_size: usize, chunk_size: usize, low_memory: bool, infer_schema_len: Option, + ignore_errors: bool, ) -> PolarsResult> { let reader_bytes = reader_bytes; - let schema = match schema { - Some(schema) => Cow::Borrowed(schema), + let mut schema = match schema { + Some(schema) => schema, None => { let bytes: &[u8] = &reader_bytes; let mut cursor = Cursor::new(bytes); @@ -161,9 +175,14 @@ impl<'a> CoreJsonReader<'a> { let data_type = polars_json::ndjson::infer(&mut cursor, infer_schema_len)?; let schema = StructArray::get_fields(&data_type).iter().collect(); - Cow::Owned(schema) + Arc::new(schema) } }; + if let Some(overwriting_schema) = schema_overwrite { + let schema = Arc::make_mut(&mut schema); + overwrite_schema(schema, overwriting_schema)?; + } + Ok(CoreJsonReader { reader_bytes: Some(reader_bytes), schema, @@ -172,6 +191,7 @@ impl<'a> CoreJsonReader<'a> { n_threads, chunk_size, low_memory, + ignore_errors, }) } fn parse_json(&mut self, mut n_threads: usize, bytes: &[u8]) -> PolarsResult { @@ -212,7 +232,7 @@ impl<'a> CoreJsonReader<'a> { file_chunks .into_par_iter() .map(|(start_pos, stop_at_nbytes)| { - let mut buffers = init_buffers(&self.schema, capacity)?; + let mut buffers = init_buffers(&self.schema, capacity, self.ignore_errors)?; parse_lines(&bytes[start_pos..stop_at_nbytes], &mut buffers)?; DataFrame::new( buffers diff --git a/polars/polars-io/src/utils.rs b/polars/polars-io/src/utils.rs index 0c2297fd5218..e9785bf3c629 100644 --- a/polars/polars-io/src/utils.rs +++ b/polars/polars-io/src/utils.rs @@ -106,6 +106,17 @@ pub(crate) fn update_row_counts2(dfs: &mut [DataFrame], offset: IdxSize) { } } +#[cfg(feature = "json")] +pub(crate) fn overwrite_schema( + schema: &mut Schema, + overwriting_schema: &Schema, +) -> PolarsResult<()> { + for (k, value) in overwriting_schema.iter() { + *schema.try_get_mut(k)? 
= value.clone(); + } + Ok(()) +} + #[cfg(test)] mod tests { use std::path::PathBuf; diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs b/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs index 8687a22b3f74..d9e2cb70d63d 100644 --- a/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs +++ b/polars/polars-lazy/src/physical_plan/executors/scan/ndjson.rs @@ -10,7 +10,7 @@ impl AnonymousScan for LazyJsonLineReader { fn scan(&self, scan_opts: AnonymousScanOptions) -> PolarsResult { let schema = scan_opts.output_schema.unwrap_or(scan_opts.schema); JsonLineReader::from_path(&self.path)? - .with_schema(&schema) + .with_schema(schema) .with_rechunk(self.rechunk) .with_chunk_size(self.batch_size) .low_memory(self.low_memory) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 8e7b11ebd046..dc648c8a2883 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -991,7 +991,13 @@ def _read_ipc( return self @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + def _read_json( + cls, + source: str | Path | IOBase | bytes, + *, + schema: SchemaDefinition | None = None, + schema_overrides: SchemaDefinition | None = None, + ) -> Self: """ Read into a DataFrame from a JSON file. @@ -1008,11 +1014,20 @@ def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: source = normalise_filepath(source) self = cls.__new__(cls) - self._df = PyDataFrame.read_json(source, False) + self._df = PyDataFrame.read_json( + source, schema=schema, schema_overrides=schema_overrides + ) return self @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + def _read_ndjson( + cls, + source: str | Path | IOBase | bytes, + *, + schema: SchemaDefinition | None = None, + schema_overrides: SchemaDefinition | None = None, + ignore_errors: bool = False, + ) -> Self: """ Read into a DataFrame from a newline delimited JSON file. @@ -1029,7 +1044,12 @@ def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: source = normalise_filepath(source) self = cls.__new__(cls) - self._df = PyDataFrame.read_ndjson(source) + self._df = PyDataFrame.read_ndjson( + source, + ignore_errors=ignore_errors, + schema=schema, + schema_overrides=schema_overrides, + ) return self @property diff --git a/py-polars/polars/io/json.py b/py-polars/polars/io/json.py index b72872efb437..f8c13adfe05a 100644 --- a/py-polars/polars/io/json.py +++ b/py-polars/polars/io/json.py @@ -9,9 +9,15 @@ from pathlib import Path from polars import DataFrame + from polars.type_aliases import SchemaDefinition -def read_json(source: str | Path | IOBase | bytes) -> DataFrame: +def read_json( + source: str | Path | IOBase | bytes, + *, + schema: SchemaDefinition | None = None, + schema_overrides: SchemaDefinition | None = None, +) -> DataFrame: """ Read into a DataFrame from a JSON file. @@ -19,10 +25,26 @@ def read_json(source: str | Path | IOBase | bytes) -> DataFrame: ---------- source Path to a file or a file-like object. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+
+        If you supply a list of column names that does not match the names in the
+        underlying data, the names given here will overwrite them. The number
+        of names given in the schema should match the underlying data dimensions.
+    schema_overrides : dict, default None
+        Support type specification or override of one or more columns; note that
+        any dtypes inferred from the schema param will be overridden.
 
     See Also
     --------
     read_ndjson
 
     """
-    return pl.DataFrame._read_json(source)
+    return pl.DataFrame._read_json(
+        source, schema=schema, schema_overrides=schema_overrides
+    )
diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py
index e6607caa26b7..847a5ba30337 100644
--- a/py-polars/polars/io/ndjson.py
+++ b/py-polars/polars/io/ndjson.py
@@ -11,9 +11,16 @@
     from io import IOBase
 
     from polars import DataFrame, LazyFrame
+    from polars.type_aliases import SchemaDefinition
 
 
-def read_ndjson(source: str | Path | IOBase | bytes) -> DataFrame:
+def read_ndjson(
+    source: str | Path | IOBase | bytes,
+    *,
+    schema: SchemaDefinition | None = None,
+    schema_overrides: SchemaDefinition | None = None,
+    ignore_errors: bool = False,
+) -> DataFrame:
     """
     Read into a DataFrame from a newline delimited JSON file.
 
@@ -21,9 +28,30 @@ def read_ndjson(source: str | Path | IOBase | bytes) -> DataFrame:
     ----------
     source
         Path to a file or a file-like object.
+    schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
+        The DataFrame schema may be declared in several ways:
+
+        * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
+        * As a list of column names; in this case types are automatically inferred.
+        * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+
+        If you supply a list of column names that does not match the names in the
+        underlying data, the names given here will overwrite them. The number
+        of names given in the schema should match the underlying data dimensions.
+    schema_overrides : dict, default None
+        Support type specification or override of one or more columns; note that
+        any dtypes inferred from the schema param will be overridden.
+    ignore_errors
+        Return `Null` if parsing fails because of schema mismatches.
""" - return pl.DataFrame._read_ndjson(source) + return pl.DataFrame._read_ndjson( + source, + schema=schema, + schema_overrides=schema_overrides, + ignore_errors=ignore_errors, + ) def scan_ndjson( diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 2c03c27dbb32..3d8cf8de109a 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -351,41 +351,46 @@ impl PyDataFrame { #[staticmethod] #[cfg(feature = "json")] - pub fn read_json(py_f: &PyAny, json_lines: bool) -> PyResult { + pub fn read_json( + py_f: &PyAny, + schema: Option>, + schema_overrides: Option>, + ) -> PyResult { + // memmap the file first let mmap_bytes_r = get_mmap_bytes_reader(py_f)?; - if json_lines { - let out = JsonReader::new(mmap_bytes_r) - .with_json_format(JsonFormat::JsonLines) - .finish() - .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; - Ok(out.into()) - } else { - // memmap the file first - let mmap_bytes_r = get_mmap_bytes_reader(py_f)?; - let mmap_read: ReaderBytes = (&mmap_bytes_r).into(); - let bytes = mmap_read.deref(); - - // Happy path is our column oriented json as that is most performant - // on failure we try - match serde_json::from_slice::(bytes) { - Ok(df) => Ok(df.into()), - Err(e) => { - let msg = format!("{e}"); - // parsing succeeded, but the dataframe was invalid - if msg.contains("successful parse invalid data") { - let e = PyPolarsErr::from(PolarsError::ComputeError(msg.into())); - Err(PyErr::from(e)) + let mmap_read: ReaderBytes = (&mmap_bytes_r).into(); + let bytes = mmap_read.deref(); + + // Happy path is our column oriented json as that is most performant + // on failure we try + match serde_json::from_slice::(bytes) { + Ok(df) => Ok(df.into()), + Err(e) => { + let msg = format!("{e}"); + // parsing succeeded, but the dataframe was invalid + if msg.contains("successful parse invalid data") { + let e = PyPolarsErr::from(PolarsError::ComputeError(msg.into())); + Err(PyErr::from(e)) + } + // parsing error + // try arrow json reader instead + // this is row oriented + else { + let mut builder = + JsonReader::new(mmap_bytes_r).with_json_format(JsonFormat::Json); + + if let Some(schema) = schema { + builder = builder.with_schema(Arc::new(schema.0)); } - // parsing error - // try arrow json reader instead - // this is row oriented - else { - let out = JsonReader::new(mmap_bytes_r) - .with_json_format(JsonFormat::Json) - .finish() - .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; - Ok(out.into()) + + if let Some(schema) = schema_overrides.as_ref() { + builder = builder.with_schema_overwrite(&schema.0); } + + let out = builder + .finish() + .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; + Ok(out.into()) } } } @@ -393,11 +398,27 @@ impl PyDataFrame { #[staticmethod] #[cfg(feature = "json")] - pub fn read_ndjson(py_f: &PyAny) -> PyResult { + pub fn read_ndjson( + py_f: &PyAny, + ignore_errors: bool, + schema: Option>, + schema_overrides: Option>, + ) -> PyResult { let mmap_bytes_r = get_mmap_bytes_reader(py_f)?; - let out = JsonReader::new(mmap_bytes_r) + let mut builder = JsonReader::new(mmap_bytes_r) .with_json_format(JsonFormat::JsonLines) + .with_ignore_errors(ignore_errors); + + if let Some(schema) = schema { + builder = builder.with_schema(Arc::new(schema.0)); + } + + if let Some(schema) = schema_overrides.as_ref() { + builder = builder.with_schema_overwrite(&schema.0); + } + + let out = builder .finish() .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; Ok(out.into()) diff --git a/py-polars/tests/unit/io/test_json.py 
b/py-polars/tests/unit/io/test_json.py index 4bb40ba2e43b..f762e045a404 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -152,3 +152,50 @@ def test_json_deserialize_9687() -> None: result = pl.read_json(json.dumps(response).encode()) assert result.to_dict(False) == {k: [v] for k, v in response.items()} + + +def test_ndjson_ignore_errors() -> None: + # this schema is inconsistent as "value" is string and object + jsonl = r"""{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]} + {"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}""" + + buf = io.BytesIO(jsonl.encode()) + + # check if we can replace with nulls + assert pl.read_ndjson(buf, ignore_errors=True).to_dict(False) == { + "Type": ["insert", "insert"], + "Key": [[1], [1]], + "SeqNo": [1, 1], + "Timestamp": [1, 1], + "Fields": [ + [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": None}], + [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": None}], + ], + } + + schema = { + "Fields": pl.List( + pl.Struct([pl.Field("Name", pl.Utf8), pl.Field("Value", pl.Int64)]) + ) + } + # schema argument only parses Fields + assert pl.read_ndjson(buf, schema=schema, ignore_errors=True).to_dict(False) == { + "Fields": [ + [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}], + [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}], + ] + } + + # schema_overrides argument does schema inference, but overrides Fields + assert pl.read_ndjson(buf, schema_overrides=schema, ignore_errors=True).to_dict( + False + ) == { + "Type": ["insert", "insert"], + "Key": [[1], [1]], + "SeqNo": [1, 1], + "Timestamp": [1, 1], + "Fields": [ + [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}], + [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}], + ], + } From 147944c95a11643cf77da3b467938f8a35d6ed9e Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 14 Jul 2023 20:27:31 +0200 Subject: [PATCH 14/37] feat(python): convenience support for parsing a list of SQL strings with `sql_expr` (#9881) --- py-polars/polars/functions/lazy.py | 38 ++++++++++++++++++++++++++---- py-polars/tests/unit/test_sql.py | 12 ++++++---- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index f970b4f36c9d..97a2a9aefa7f 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -2093,17 +2093,29 @@ def rolling_corr( ) -def sql_expr(sql: str) -> Expr: +@overload +def sql_expr(sql: str) -> Expr: # type: ignore[misc] + ... + + +@overload +def sql_expr(sql: Sequence[str]) -> list[Expr]: + ... + + +def sql_expr(sql: str | Sequence[str]) -> Expr | list[Expr]: """ - Parse a SQL expression to a polars expression. + Parse one or more SQL expressions to polars expression(s). Parameters ---------- sql - SQL expression + One or more SQL expressions. Examples -------- + Parse a single SQL expression: + >>> df = pl.DataFrame({"a": [2, 1]}) >>> expr = pl.sql_expr("MAX(a)") >>> df.select(expr) @@ -2115,5 +2127,23 @@ def sql_expr(sql: str) -> Expr: ╞═════╡ │ 2 │ └─────┘ + + Parse multiple SQL expressions: + + >>> df.with_columns( + ... *pl.sql_expr(["POWER(a,a) AS a_a", "CAST(a AS TEXT) AS a_txt"]), + ... 
) + shape: (2, 3) + ┌─────┬─────┬───────┐ + │ a ┆ a_a ┆ a_txt │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═══════╡ + │ 2 ┆ 4.0 ┆ 2 │ + │ 1 ┆ 1.0 ┆ 1 │ + └─────┴─────┴───────┘ """ - return wrap_expr(plr.sql_expr(sql)) + if isinstance(sql, str): + return wrap_expr(plr.sql_expr(sql)) + else: + return [wrap_expr(plr.sql_expr(q)) for q in sql] diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index d2c9869ce718..86a78ddddfd2 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -771,15 +771,17 @@ def test_register_context() -> None: def test_sql_expr() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["xyz", "abcde", None]}) - sql_exprs = ( - pl.sql_expr("MIN(a)"), - pl.sql_expr("POWER(a,a) AS aa"), - pl.sql_expr("SUBSTR(b,1,2) AS b2"), + sql_exprs = pl.sql_expr( + [ + "MIN(a)", + "POWER(a,a) AS aa", + "SUBSTR(b,1,2) AS b2", + ] ) expected = pl.DataFrame( {"a": [1, 1, 1], "aa": [1, 4, 27], "b2": ["yz", "bc", None]} ) - assert df.select(sql_exprs).frame_equal(expected) + assert df.select(*sql_exprs).frame_equal(expected) # expect expressions that can't reasonably be parsed as expressions to raise # (for example: those that explicitly reference tables and/or use wildcards) From 134d43edeb5d462a77b8d6475ea7746a44656342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Havelund=20Welling?= Date: Fri, 14 Jul 2023 20:28:42 +0200 Subject: [PATCH 15/37] feat(rust): access `OptState` in `LazyFrame` to unit-test optimization toggle methods. (#9883) --- polars/polars-lazy/polars-plan/src/frame/opt_state.rs | 2 +- polars/polars-lazy/src/frame/mod.rs | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/polars/polars-lazy/polars-plan/src/frame/opt_state.rs b/polars/polars-lazy/polars-plan/src/frame/opt_state.rs index f99ae1f4f8b8..28cf448c76f9 100644 --- a/polars/polars-lazy/polars-plan/src/frame/opt_state.rs +++ b/polars/polars-lazy/polars-plan/src/frame/opt_state.rs @@ -1,4 +1,4 @@ -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] /// State of the allowed optimizations pub struct OptState { pub projection_pushdown: bool, diff --git a/polars/polars-lazy/src/frame/mod.rs b/polars/polars-lazy/src/frame/mod.rs index 593333c96f82..ed3641d0b001 100644 --- a/polars/polars-lazy/src/frame/mod.rs +++ b/polars/polars-lazy/src/frame/mod.rs @@ -110,6 +110,11 @@ impl LazyFrame { } } + /// Get current optimizations + pub fn get_current_optimizations(&self) -> OptState { + self.opt_state + } + /// Set allowed optimizations pub fn with_optimizations(mut self, opt_state: OptState) -> Self { self.opt_state = opt_state; From 9672a410bc60833b572a4c06599c26740e97f2ee Mon Sep 17 00:00:00 2001 From: Josh Magarick Date: Fri, 14 Jul 2023 23:08:05 -0700 Subject: [PATCH 16/37] refactor(rust): Rolling quantile and median use DynArgs (#9867) --- .../polars-arrow/src/kernels/rolling/mod.rs | 8 +- .../src/kernels/rolling/no_nulls/quantile.rs | 347 +++++----------- .../src/kernels/rolling/nulls/quantile.rs | 379 ++++++------------ .../src/kernels/rolling/window.rs | 11 +- polars/polars-arrow/src/prelude.rs | 2 +- .../src/frame/groupby/aggregations/mod.rs | 32 +- polars/polars-lazy/polars-plan/src/dsl/mod.rs | 9 +- .../src/chunkedarray/rolling_window/floats.rs | 63 +-- .../src/chunkedarray/rolling_window/ints.rs | 11 +- .../src/chunkedarray/rolling_window/mod.rs | 9 +- .../rolling_kernels/no_nulls.rs | 21 + polars/polars-time/src/series/_trait.rs | 18 +- .../src/series/implementations/floats.rs | 9 +- 
.../src/series/implementations/integers.rs | 9 +- polars/tests/it/core/rolling_window.rs | 104 +++-- py-polars/src/expr/general.rs | 17 +- 16 files changed, 369 insertions(+), 680 deletions(-) diff --git a/polars/polars-arrow/src/kernels/rolling/mod.rs b/polars/polars-arrow/src/kernels/rolling/mod.rs index 12ad13b47f1f..0b1f2343ae88 100644 --- a/polars/polars-arrow/src/kernels/rolling/mod.rs +++ b/polars/polars-arrow/src/kernels/rolling/mod.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use arrow::array::PrimitiveArray; use arrow::bitmap::{Bitmap, MutableBitmap}; use arrow::types::NativeType; -use num_traits::{Bounded, Float, NumCast, One, ToPrimitive, Zero}; +use num_traits::{Bounded, Float, NumCast, One, Zero}; use window::*; use crate::data_types::IsFloat; @@ -142,3 +142,9 @@ where pub struct RollingVarParams { pub ddof: u8, } + +#[derive(Clone, Copy, Debug)] +pub struct RollingQuantileParams { + pub prob: f64, + pub interpol: QuantileInterpolOptions, +} diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs b/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs index 2eaa76309f01..c790e2a1706b 100644 --- a/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs +++ b/polars/polars-arrow/src/kernels/rolling/no_nulls/quantile.rs @@ -5,199 +5,139 @@ use polars_error::polars_ensure; use super::QuantileInterpolOptions::*; use super::*; -use crate::index::IdxSize; -use crate::trusted_len::TrustedLen; -// used by agg_quantile -pub fn rolling_quantile_by_iter( - values: &[T], - quantile: f64, - interpolation: QuantileInterpolOptions, - offsets: O, -) -> ArrayRef -where - O: Iterator + TrustedLen, - T: std::iter::Sum - + NativeType - + Copy - + PartialOrd - + ToPrimitive - + NumCast - + Add - + Sub - + Div - + Mul - + IsFloat, -{ - if values.is_empty() { - let out: Vec = vec![]; - return Box::new(PrimitiveArray::new(T::PRIMITIVE.into(), out.into(), None)); - } - - let mut sorted_window = SortedBuf::new(values, 0, 1); - - let out = offsets - .map(|(start, len)| { - let end = start + len; - - // safety: - // we are in bounds - if start == end { - None - } else { - let window = unsafe { sorted_window.update(start as usize, end as usize) }; - Some(compute_quantile2(window, quantile, interpolation)) - } - }) - .collect::>(); - - Box::new(out) +pub struct QuantileWindow<'a, T: NativeType + IsFloat + PartialOrd> { + sorted: SortedBuf<'a, T>, + prob: f64, + interpol: QuantileInterpolOptions, } -pub(crate) fn compute_quantile2( - vals: &[T], - quantile: f64, - interpolation: QuantileInterpolOptions, -) -> T -where - T: std::iter::Sum - + Copy - + PartialOrd - + ToPrimitive - + NumCast - + Add - + Sub - + Div - + Mul - + IsFloat, +impl< + 'a, + T: NativeType + + IsFloat + + Float + + std::iter::Sum + + AddAssign + + SubAssign + + Div + + NumCast + + One + + Zero + + PartialOrd + + Sub, + > RollingAggWindowNoNulls<'a, T> for QuantileWindow<'a, T> { - let length = vals.len(); - - let mut idx = match interpolation { - QuantileInterpolOptions::Nearest => ((length as f64) * quantile) as usize, - QuantileInterpolOptions::Lower - | QuantileInterpolOptions::Midpoint - | QuantileInterpolOptions::Linear => ((length as f64 - 1.0) * quantile).floor() as usize, - QuantileInterpolOptions::Higher => ((length as f64 - 1.0) * quantile).ceil() as usize, - }; - - idx = std::cmp::min(idx, length - 1); + fn new(slice: &'a [T], start: usize, end: usize, params: DynArgs) -> Self { + let params = params.unwrap(); + let params = params.downcast_ref::().unwrap(); + Self { + sorted: SortedBuf::new(slice, 
start, end), + prob: params.prob, + interpol: params.interpol, + } + } - match interpolation { - QuantileInterpolOptions::Midpoint => { - let top_idx = ((length as f64 - 1.0) * quantile).ceil() as usize; - if top_idx == idx { - // safety - // we are in bounds - unsafe { *vals.get_unchecked(idx) } - } else { - // safety - // we are in bounds - let (mid, mid_plus_1) = - unsafe { (*vals.get_unchecked(idx), *vals.get_unchecked(idx + 1)) }; + unsafe fn update(&mut self, start: usize, end: usize) -> T { + let vals = self.sorted.update(start, end); + let length = vals.len(); - (mid + mid_plus_1) / T::from::(2.0f64).unwrap() + let mut idx = match self.interpol { + QuantileInterpolOptions::Nearest => ((length as f64) * self.prob) as usize, + QuantileInterpolOptions::Lower + | QuantileInterpolOptions::Midpoint + | QuantileInterpolOptions::Linear => { + ((length as f64 - 1.0) * self.prob).floor() as usize } - } - QuantileInterpolOptions::Linear => { - let float_idx = (length as f64 - 1.0) * quantile; - let top_idx = f64::ceil(float_idx) as usize; - - if top_idx == idx { + QuantileInterpolOptions::Higher => ((length as f64 - 1.0) * self.prob).ceil() as usize, + }; + + idx = std::cmp::min(idx, length - 1); + + match self.interpol { + QuantileInterpolOptions::Midpoint => { + let top_idx = ((length as f64 - 1.0) * self.prob).ceil() as usize; + if top_idx == idx { + // safety + // we are in bounds + unsafe { *vals.get_unchecked(idx) } + } else { + // safety + // we are in bounds + let (mid, mid_plus_1) = + unsafe { (*vals.get_unchecked(idx), *vals.get_unchecked(idx + 1)) }; + + (mid + mid_plus_1) / T::from::(2.0f64).unwrap() + } + } + QuantileInterpolOptions::Linear => { + let float_idx = (length as f64 - 1.0) * self.prob; + let top_idx = f64::ceil(float_idx) as usize; + + if top_idx == idx { + // safety + // we are in bounds + unsafe { *vals.get_unchecked(idx) } + } else { + let proportion = T::from(float_idx - idx as f64).unwrap(); + proportion * (vals[top_idx] - vals[idx]) + vals[idx] + } + } + _ => { // safety // we are in bounds unsafe { *vals.get_unchecked(idx) } - } else { - let proportion = T::from(float_idx - idx as f64).unwrap(); - proportion * (vals[top_idx] - vals[idx]) + vals[idx] } } - _ => { - // safety - // we are in bounds - unsafe { *vals.get_unchecked(idx) } - } } } -pub fn rolling_median( - values: &[T], - window_size: usize, - min_periods: usize, - center: bool, - weights: Option<&[f64]>, - _params: DynArgs, -) -> PolarsResult -where - T: NativeType - + std::iter::Sum - + PartialOrd - + ToPrimitive - + NumCast - + Add - + Sub - + Div - + Mul - + Zero - + IsFloat, -{ - rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - window_size, - min_periods, - center, - weights, - ) -} - pub fn rolling_quantile( values: &[T], - quantile: f64, - interpolation: QuantileInterpolOptions, window_size: usize, min_periods: usize, center: bool, weights: Option<&[f64]>, + params: DynArgs, ) -> PolarsResult where T: NativeType - + std::iter::Sum - + PartialOrd - + ToPrimitive - + NumCast - + Add - + Sub + + IsFloat + + Float + + std::iter::Sum + + AddAssign + + SubAssign + Div - + Mul + + NumCast + + One + Zero - + IsFloat, + + PartialOrd + + Sub, { let offset_fn = match center { true => det_offsets_center, false => det_offsets, }; match weights { - None => Ok(rolling_apply_quantile( + None => rolling_apply_agg_window::, _, _>( values, - quantile, - interpolation, window_size, min_periods, offset_fn, - compute_quantile2, - )), + params, + ), Some(weights) => { let wsum = 
weights.iter().sum(); polars_ensure!( wsum != 0.0, ComputeError: "Weighted quantile is undefined if weights sum to 0" ); + let params = params.unwrap(); + let params = params.downcast_ref::().unwrap(); Ok(rolling_apply_weighted_quantile( values, - quantile, - interpolation, + params.prob, + params.interpol, window_size, min_periods, offset_fn, @@ -208,43 +148,6 @@ where } } -fn rolling_apply_quantile( - values: &[T], - quantile: f64, - interpolation: QuantileInterpolOptions, - window_size: usize, - min_periods: usize, - det_offsets_fn: Fo, - aggregator: Fa, -) -> ArrayRef -where - Fo: Fn(Idx, WindowSize, Len) -> (Start, End), - Fa: Fn(&[T], f64, QuantileInterpolOptions) -> T, - T: Debug + NativeType + IsFloat + PartialOrd, -{ - let len = values.len(); - let (start, end) = det_offsets_fn(0, window_size, len); - let mut sorted_window = SortedBuf::new(values, start, end); - - let out = (0..len) - .map(|idx| { - let (start, end) = det_offsets_fn(idx, window_size, len); - - // Safety: - // we are in bounds - let window = unsafe { sorted_window.update(start, end) }; - aggregator(window, quantile, interpolation) - }) - .collect_trusted::>(); - - let validity = create_validity(min_periods, len, window_size, det_offsets_fn); - Box::new(PrimitiveArray::new( - T::PRIMITIVE.into(), - out.into(), - validity.map(|b| b.into()), - )) -} - #[inline] fn compute_wq(buf: &[(T, f64)], p: f64, wsum: f64, interp: QuantileInterpolOptions) -> T where @@ -348,73 +251,31 @@ mod test { #[test] fn test_rolling_median() { let values = &[1.0, 2.0, 3.0, 4.0]; - - let out = rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - 2, - 2, - false, - None, - ) - .unwrap(); + let med_pars = Some(Arc::new(RollingQuantileParams { + prob: 0.5, + interpol: Linear, + }) as Arc); + let out = rolling_quantile(values, 2, 2, false, None, med_pars.clone()).unwrap(); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[None, Some(1.5), Some(2.5), Some(3.5)]); - let out = rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - 2, - 1, - false, - None, - ) - .unwrap(); + let out = rolling_quantile(values, 2, 1, false, None, med_pars.clone()).unwrap(); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.0), Some(1.5), Some(2.5), Some(3.5)]); - let out = rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - 4, - 1, - false, - None, - ) - .unwrap(); + let out = rolling_quantile(values, 4, 1, false, None, med_pars.clone()).unwrap(); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.0), Some(1.5), Some(2.0), Some(2.5)]); - let out = rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - 4, - 1, - true, - None, - ) - .unwrap(); + let out = rolling_quantile(values, 4, 1, true, None, med_pars.clone()).unwrap(); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.5), Some(2.0), Some(2.5), Some(3.0)]); - let out = rolling_quantile( - values, - 0.5, - QuantileInterpolOptions::Linear, - 4, - 4, - true, - None, - ) - .unwrap(); + let out = rolling_quantile(values, 4, 4, true, None, med_pars.clone()).unwrap(); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[None, None, Some(2.5), 
None]); @@ -433,18 +294,26 @@ mod test { ]; for interpol in interpol_options { + let min_pars = Some(Arc::new(RollingQuantileParams { + prob: 0.0, + interpol, + }) as Arc); let out1 = rolling_min(values, 2, 2, false, None, None).unwrap(); let out1 = out1.as_any().downcast_ref::>().unwrap(); let out1 = out1.into_iter().map(|v| v.copied()).collect::>(); - let out2 = rolling_quantile(values, 0.0, interpol, 2, 2, false, None).unwrap(); + let out2 = rolling_quantile(values, 2, 2, false, None, min_pars).unwrap(); let out2 = out2.as_any().downcast_ref::>().unwrap(); let out2 = out2.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out1, out2); + let max_pars = Some(Arc::new(RollingQuantileParams { + prob: 1.0, + interpol, + }) as Arc); let out1 = rolling_max(values, 2, 2, false, None, None).unwrap(); let out1 = out1.as_any().downcast_ref::>().unwrap(); let out1 = out1.into_iter().map(|v| v.copied()).collect::>(); - let out2 = rolling_quantile(values, 1.0, interpol, 2, 2, false, None).unwrap(); + let out2 = rolling_quantile(values, 2, 2, false, None, max_pars).unwrap(); let out2 = out2.as_any().downcast_ref::>().unwrap(); let out2 = out2.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out1, out2); diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs b/polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs index 337971272715..08602957a7c9 100644 --- a/polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs +++ b/polars/polars-arrow/src/kernels/rolling/nulls/quantile.rs @@ -1,291 +1,134 @@ use super::*; -use crate::index::IdxSize; -use crate::trusted_len::TrustedLen; -// used by agg_quantile -#[allow(clippy::too_many_arguments)] -pub fn rolling_quantile_by_iter( - values: &[T], - bitmap: &Bitmap, - quantile: f64, - interpolation: QuantileInterpolOptions, - offsets: O, -) -> ArrayRef -where - O: Iterator + TrustedLen, - T: std::iter::Sum - + NativeType - + Copy - + PartialOrd - + ToPrimitive - + NumCast - + Add - + Sub - + Div - + Mul - + IsFloat - + AddAssign - + Zero, -{ - if values.is_empty() { - let out: Vec = vec![]; - return Box::new(PrimitiveArray::new(T::PRIMITIVE.into(), out.into(), None)); - } - - let len = values.len(); - // Safety - // we are in bounds - let mut sorted_window = unsafe { SortedBufNulls::new(values, bitmap, 0, 1) }; - - let mut validity = MutableBitmap::with_capacity(len); - validity.extend_constant(len, true); - - let out = offsets - .enumerate() - .map(|(idx, (start, len))| { - let end = start + len; - - if start == end { - validity.set(idx, false); - T::default() - } else { - // safety - // we are in bounds - unsafe { sorted_window.update(start as usize, end as usize) }; - let null_count = sorted_window.null_count; - let window = sorted_window.window(); - - match compute_quantile(window, null_count, quantile, interpolation, 1) { - Some(val) => val, - None => { - validity.set(idx, false); - T::default() - } - } - } - }) - .collect_trusted::>(); - - Box::new(PrimitiveArray::new( - T::PRIMITIVE.into(), - out.into(), - Some(validity.into()), - )) +pub struct QuantileWindow<'a, T: NativeType + IsFloat + PartialOrd> { + sorted: SortedBufNulls<'a, T>, + prob: f64, + interpol: QuantileInterpolOptions, } -#[allow(clippy::too_many_arguments)] -fn rolling_apply_quantile( - values: &[T], - bitmap: &Bitmap, - quantile: f64, - interpolation: QuantileInterpolOptions, - window_size: usize, - min_periods: usize, - det_offsets_fn: Fo, - aggregator: Fa, -) -> ArrayRef -where - Fo: Fn(Idx, WindowSize, Len) -> (Start, End) + Copy, - // &[Option] -> 
window values - // usize -> null_count - // f764 -> quantile - // QuantileInterpolOptions -> Interpolation option - // usize -> min_periods - Fa: Fn(&[Option], usize, f64, QuantileInterpolOptions, usize) -> Option, - T: Default + NativeType + IsFloat + PartialOrd, +impl< + 'a, + T: NativeType + + IsFloat + + Float + + std::iter::Sum + + AddAssign + + SubAssign + + Div + + NumCast + + One + + Zero + + PartialOrd + + Sub, + > RollingAggWindowNulls<'a, T> for QuantileWindow<'a, T> { - let len = values.len(); - let (start, end) = det_offsets_fn(0, window_size, len); - // Safety - // we are in bounds - let mut sorted_window = unsafe { SortedBufNulls::new(values, bitmap, start, end) }; - - let mut validity = match create_validity(min_periods, len, window_size, det_offsets_fn) { - Some(v) => v, - None => { - let mut validity = MutableBitmap::with_capacity(len); - validity.extend_constant(len, true); - validity + unsafe fn new( + slice: &'a [T], + validity: &'a Bitmap, + start: usize, + end: usize, + params: DynArgs, + ) -> Self { + let params = params.unwrap(); + let params = params.downcast_ref::().unwrap(); + Self { + sorted: SortedBufNulls::new(slice, validity, start, end), + prob: params.prob, + interpol: params.interpol, } - }; - - let out = (0..len) - .map(|idx| { - let (start, end) = det_offsets_fn(idx, window_size, len); - - // safety - // we are in bounds - unsafe { sorted_window.update(start, end) }; - let null_count = sorted_window.null_count; - let window = sorted_window.window(); - - match aggregator(window, null_count, quantile, interpolation, min_periods) { - Some(val) => val, - None => { - validity.set(idx, false); - T::default() - } - } - }) - .collect_trusted::>(); - - Box::new(PrimitiveArray::new( - T::PRIMITIVE.into(), - out.into(), - Some(validity.into()), - )) -} - -fn compute_quantile( - values: &[Option], - null_count: usize, - quantile: f64, - interpolation: QuantileInterpolOptions, - min_periods: usize, -) -> Option -where - T: NativeType - + std::iter::Sum - + Zero - + AddAssign - + PartialOrd - + ToPrimitive - + NumCast - + Default - + Add - + Sub - + Div - + Mul - + IsFloat, -{ - if (values.len() - null_count) < min_periods { - return None; } - // slice off nulls - let values = &values[null_count..]; - let length = values.len(); - let mut idx = match interpolation { - QuantileInterpolOptions::Nearest => ((length as f64) * quantile) as usize, - QuantileInterpolOptions::Lower - | QuantileInterpolOptions::Midpoint - | QuantileInterpolOptions::Linear => ((length as f64 - 1.0) * quantile).floor() as usize, - QuantileInterpolOptions::Higher => ((length as f64 - 1.0) * quantile).ceil() as usize, - }; - - idx = std::cmp::min(idx, length - 1); - - // we can unwrap because we sliced of the nulls - match interpolation { - QuantileInterpolOptions::Midpoint => { - let top_idx = ((length as f64 - 1.0) * quantile).ceil() as usize; - Some( - (values[idx].unwrap() + values[top_idx].unwrap()) / T::from::(2.0f64).unwrap(), - ) + unsafe fn update(&mut self, start: usize, end: usize) -> Option { + let (values, null_count) = self.sorted.update(start, end); + // The min periods_issue will be taken care of when actually rolling + if null_count == values.len() { + return None; } - QuantileInterpolOptions::Linear => { - let float_idx = (length as f64 - 1.0) * quantile; - let top_idx = f64::ceil(float_idx) as usize; + // Nulls are guaranteed to be at the front + let values = &values[null_count..]; + let length = values.len(); + + let mut idx = match self.interpol { + 
QuantileInterpolOptions::Nearest => ((length as f64) * self.prob) as usize,
+            QuantileInterpolOptions::Lower
+            | QuantileInterpolOptions::Midpoint
+            | QuantileInterpolOptions::Linear => {
+                ((length as f64 - 1.0) * self.prob).floor() as usize
+            }
+            QuantileInterpolOptions::Higher => ((length as f64 - 1.0) * self.prob).ceil() as usize,
+        };
+
+        idx = std::cmp::min(idx, length - 1);
 
-            if top_idx == idx {
-                Some(values[idx].unwrap())
-            } else {
-                let proportion = T::from(float_idx - idx as f64).unwrap();
+        // we can unwrap because we sliced off the nulls
+        match self.interpol {
+            QuantileInterpolOptions::Midpoint => {
+                let top_idx = ((length as f64 - 1.0) * self.prob).ceil() as usize;
                 Some(
-                    proportion * (values[top_idx].unwrap() - values[idx].unwrap())
-                        + values[idx].unwrap(),
+                    (values[idx].unwrap() + values[top_idx].unwrap())
+                        / T::from::<f64>(2.0f64).unwrap(),
                 )
             }
+            QuantileInterpolOptions::Linear => {
+                let float_idx = (length as f64 - 1.0) * self.prob;
+                let top_idx = f64::ceil(float_idx) as usize;
+
+                if top_idx == idx {
+                    Some(values[idx].unwrap())
+                } else {
+                    let proportion = T::from(float_idx - idx as f64).unwrap();
+                    Some(
+                        proportion * (values[top_idx].unwrap() - values[idx].unwrap())
+                            + values[idx].unwrap(),
+                    )
+                }
+            }
+            _ => Some(values[idx].unwrap()),
         }
-        _ => Some(values[idx].unwrap()),
     }
-}
 
-pub fn rolling_median<T>(
-    arr: &PrimitiveArray<T>,
-    window_size: usize,
-    min_periods: usize,
-    center: bool,
-    weights: Option<&[f64]>,
-    _params: DynArgs,
-) -> ArrayRef
-where
-    T: NativeType
-        + std::iter::Sum
-        + Zero
-        + AddAssign
-        + Copy
-        + PartialOrd
-        + ToPrimitive
-        + NumCast
-        + Default
-        + Add<Output = T>
-        + Sub<Output = T>
-        + Div<Output = T>
-        + Mul<Output = T>
-        + IsFloat,
-{
-    rolling_quantile(
-        arr,
-        0.5,
-        QuantileInterpolOptions::Linear,
-        window_size,
-        min_periods,
-        center,
-        weights,
-    )
+
+    fn is_valid(&self, min_periods: usize) -> bool {
+        self.sorted.is_valid(min_periods)
+    }
 }
 
 pub fn rolling_quantile<T>(
     arr: &PrimitiveArray<T>,
-    quantile: f64,
-    interpolation: QuantileInterpolOptions,
     window_size: usize,
     min_periods: usize,
     center: bool,
     weights: Option<&[f64]>,
+    params: DynArgs,
 ) -> ArrayRef
 where
     T: NativeType
+        + IsFloat
+        + Float
         + std::iter::Sum
-        + Zero
         + AddAssign
-        + Copy
-        + PartialOrd
-        + ToPrimitive
-        + NumCast
-        + Default
-        + Add
-        + Sub
+        + SubAssign
         + Div<Output = T>
-        + Mul
-        + IsFloat,
+        + NumCast
+        + One
+        + Zero
+        + PartialOrd
+        + Sub<Output = T>,
 {
     if weights.is_some() {
         panic!("weights not yet supported on array with null values")
     }
-    if center {
-        rolling_apply_quantile(
-            arr.values().as_slice(),
-            arr.validity().as_ref().unwrap(),
-            quantile,
-            interpolation,
-            window_size,
-            min_periods,
-            det_offsets_center,
-            compute_quantile,
-        )
-    } else {
-        rolling_apply_quantile(
-            arr.values().as_slice(),
-            arr.validity().as_ref().unwrap(),
-            quantile,
-            interpolation,
-            window_size,
-            min_periods,
-            det_offsets,
-            compute_quantile,
-        )
-    }
+    let offset_fn = match center {
+        true => det_offsets_center,
+        false => det_offsets,
+    };
+    rolling_apply_agg_window::<QuantileWindow<_>, _, _>(
+        arr.values().as_slice(),
+        arr.validity().as_ref().unwrap(),
+        window_size,
+        min_periods,
+        offset_fn,
+        params,
+    )
 }
 
 #[cfg(test)]
@@ -304,28 +147,32 @@ mod test {
             buf,
             Some(Bitmap::from(&[true, false, true, true])),
         );
+        let med_pars = Some(Arc::new(RollingQuantileParams {
+            prob: 0.5,
+            interpol: QuantileInterpolOptions::Linear,
+        }) as Arc<dyn Any + Send + Sync>);
 
-        let out = rolling_quantile(arr, 0.5, QuantileInterpolOptions::Linear, 2, 2, false, None);
+        let out = rolling_quantile(arr, 2, 2, false, None, med_pars.clone());
        let out =
out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[None, None, None, Some(3.5)]); - let out = rolling_quantile(arr, 0.5, QuantileInterpolOptions::Linear, 2, 1, false, None); + let out = rolling_quantile(arr, 2, 1, false, None, med_pars.clone()); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.0), Some(1.0), Some(3.0), Some(3.5)]); - let out = rolling_quantile(arr, 0.5, QuantileInterpolOptions::Linear, 4, 1, false, None); + let out = rolling_quantile(arr, 4, 1, false, None, med_pars.clone()); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.0), Some(1.0), Some(2.0), Some(3.0)]); - let out = rolling_quantile(arr, 0.5, QuantileInterpolOptions::Linear, 4, 1, true, None); + let out = rolling_quantile(arr, 4, 1, true, None, med_pars.clone()); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[Some(1.0), Some(2.0), Some(3.0), Some(3.5)]); - let out = rolling_quantile(arr, 0.5, QuantileInterpolOptions::Linear, 4, 4, true, None); + let out = rolling_quantile(arr, 4, 4, true, None, med_pars.clone()); let out = out.as_any().downcast_ref::>().unwrap(); let out = out.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out, &[None, None, None, None]); @@ -350,18 +197,26 @@ mod test { ]; for interpol in interpol_options { + let min_pars = Some(Arc::new(RollingQuantileParams { + prob: 0.0, + interpol, + }) as Arc); let out1 = rolling_min(values, 2, 1, false, None, None); let out1 = out1.as_any().downcast_ref::>().unwrap(); let out1 = out1.into_iter().map(|v| v.copied()).collect::>(); - let out2 = rolling_quantile(values, 0.0, interpol, 2, 1, false, None); + let out2 = rolling_quantile(values, 2, 1, false, None, min_pars); let out2 = out2.as_any().downcast_ref::>().unwrap(); let out2 = out2.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out1, out2); + let max_pars = Some(Arc::new(RollingQuantileParams { + prob: 1.0, + interpol, + }) as Arc); let out1 = rolling_max(values, 2, 1, false, None, None); let out1 = out1.as_any().downcast_ref::>().unwrap(); let out1 = out1.into_iter().map(|v| v.copied()).collect::>(); - let out2 = rolling_quantile(values, 1.0, interpol, 2, 1, false, None); + let out2 = rolling_quantile(values, 2, 1, false, None, max_pars); let out2 = out2.as_any().downcast_ref::>().unwrap(); let out2 = out2.into_iter().map(|v| v.copied()).collect::>(); assert_eq!(out1, out2); diff --git a/polars/polars-arrow/src/kernels/rolling/window.rs b/polars/polars-arrow/src/kernels/rolling/window.rs index 2ab2d07ce10a..7471c4e2f174 100644 --- a/polars/polars-arrow/src/kernels/rolling/window.rs +++ b/polars/polars-arrow/src/kernels/rolling/window.rs @@ -166,15 +166,11 @@ impl<'a, T: NativeType + IsFloat + PartialOrd> SortedBufNulls<'a, T> { out } - pub(super) fn window(&self) -> &[Option] { - &self.buf - } - /// Update the window position by setting the `start` index and the `end` index. 
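    /// Returns the freshly updated window (sorted, with nulls at the front) together
    /// with its current null count, replacing the removed `window()` accessor.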
/// # Safety /// The caller must ensure that `start` and `end` are within bounds of `self.slice` /// - pub(super) unsafe fn update(&mut self, start: usize, end: usize) { + pub(super) unsafe fn update(&mut self, start: usize, end: usize) -> (&[Option], usize) { // swap the whole buffer if start >= self.last_end { self.fill_and_sort_buf(start, end); @@ -221,6 +217,11 @@ impl<'a, T: NativeType + IsFloat + PartialOrd> SortedBufNulls<'a, T> { } self.last_start = start; self.last_end = end; + (&self.buf, self.null_count) + } + + pub(super) fn is_valid(&self, min_periods: usize) -> bool { + ((self.last_end - self.last_start) - self.null_count) >= min_periods } } diff --git a/polars/polars-arrow/src/prelude.rs b/polars/polars-arrow/src/prelude.rs index ba928d6b2f43..e1b1fd012f2a 100644 --- a/polars/polars-arrow/src/prelude.rs +++ b/polars/polars-arrow/src/prelude.rs @@ -6,7 +6,7 @@ pub use crate::bitmap::mutable::MutableBitmapExtension; pub use crate::data_types::*; pub use crate::index::*; pub use crate::kernels::rolling::no_nulls::QuantileInterpolOptions; -pub use crate::kernels::rolling::{DynArgs, RollingVarParams}; +pub use crate::kernels::rolling::{DynArgs, RollingQuantileParams, RollingVarParams}; pub type LargeStringArray = Utf8Array; pub type LargeBinaryArray = BinaryArray; diff --git a/polars/polars-core/src/frame/groupby/aggregations/mod.rs b/polars/polars-core/src/frame/groupby/aggregations/mod.rs index b7d3445e601d..c9400cd7d14c 100644 --- a/polars/polars-core/src/frame/groupby/aggregations/mod.rs +++ b/polars/polars-core/src/frame/groupby/aggregations/mod.rs @@ -12,10 +12,10 @@ use num_traits::{Bounded, Float, Num, NumCast, ToPrimitive, Zero}; use polars_arrow::data_types::IsFloat; use polars_arrow::kernels::rolling; use polars_arrow::kernels::rolling::no_nulls::{ - MaxWindow, MeanWindow, MinWindow, RollingAggWindowNoNulls, SumWindow, VarWindow, + MaxWindow, MeanWindow, MinWindow, QuantileWindow, RollingAggWindowNoNulls, SumWindow, VarWindow, }; use polars_arrow::kernels::rolling::nulls::RollingAggWindowNulls; -use polars_arrow::kernels::rolling::{DynArgs, RollingVarParams}; +use polars_arrow::kernels::rolling::{DynArgs, RollingQuantileParams, RollingVarParams}; use polars_arrow::kernels::take_agg::*; use polars_arrow::prelude::QuantileInterpolOptions; use polars_arrow::trusted_len::TrustedLenPush; @@ -271,6 +271,7 @@ where ChunkedArray: QuantileDispatcher, ChunkedArray: IntoSeries, K: PolarsNumericType, + ::Native: num_traits::Float, { let invalid_quantile = !(0.0..=1.0).contains(&quantile); if invalid_quantile { @@ -298,19 +299,25 @@ where let values = arr.values().as_slice(); let offset_iter = groups.iter().map(|[first, len]| (*first, *len)); let arr = match arr.validity() { - None => rolling::no_nulls::rolling_quantile_by_iter( + None => _rolling_apply_agg_window_no_nulls::, _, _>( values, - quantile, - interpol, - offset_iter, - ), - Some(validity) => rolling::nulls::rolling_quantile_by_iter( - values, - validity, - quantile, - interpol, offset_iter, + Some(Arc::new(RollingQuantileParams { + prob: quantile, + interpol, + })), ), + Some(validity) => { + _rolling_apply_agg_window_nulls::, _, _>( + values, + validity, + offset_iter, + Some(Arc::new(RollingQuantileParams { + prob: quantile, + interpol, + })), + ) + } }; // the rolling kernels works on the dtype, this is not yet the float // output type we need. 
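The `DynArgs` slot threaded through these kernels is a type-erased parameter channel: the caller packs a params struct behind a shared `Any` handle and each kernel downcasts it back, as `QuantileWindow::new` does above. A minimal self-contained sketch of that pattern, using plain `std::any` and hypothetical names rather than the actual polars types:

    use std::any::Any;
    use std::sync::Arc;

    // Assumed shape of the type-erased argument slot (a stand-in for the polars alias).
    type DynArgs = Option<Arc<dyn Any + Send + Sync>>;

    struct QuantileParams {
        prob: f64,
    }

    // A kernel recovers the concrete params it expects with a downcast.
    fn quantile_kernel(params: DynArgs) -> f64 {
        let params = params.unwrap();
        let params = params.downcast_ref::<QuantileParams>().unwrap();
        params.prob
    }

    fn main() {
        let args: DynArgs = Some(Arc::new(QuantileParams { prob: 0.5 }));
        assert_eq!(quantile_kernel(args), 0.5);
    }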
@@ -342,6 +349,7 @@ where ChunkedArray: QuantileDispatcher, ChunkedArray: IntoSeries, K: PolarsNumericType, + ::Native: num_traits::Float, { match groups { GroupsProxy::Idx(groups) => { diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs index 05d7923a3b82..3cf8f1cf29ae 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs @@ -1352,17 +1352,12 @@ impl Expr { /// /// See: [`RollingAgg::rolling_quantile`] #[cfg(feature = "rolling_window")] - pub fn rolling_quantile( - self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptions, - ) -> Expr { + pub fn rolling_quantile(self, options: RollingOptions) -> Expr { self.finish_rolling( options, "rolling_quantile", "rolling_quantile_by", - Arc::new(move |s, options| s.rolling_quantile(quantile, interpolation, options)), + Arc::new(|s, options| s.rolling_quantile(options)), GetOutput::float_type(), ) } diff --git a/polars/polars-time/src/chunkedarray/rolling_window/floats.rs b/polars/polars-time/src/chunkedarray/rolling_window/floats.rs index 98f645876b30..6a87c6ca4943 100644 --- a/polars/polars-time/src/chunkedarray/rolling_window/floats.rs +++ b/polars/polars-time/src/chunkedarray/rolling_window/floats.rs @@ -80,58 +80,35 @@ where /// A window of length `window_size` will traverse the array. The values that fill this window /// will (optionally) be weighted according to the `weights` vector. fn rolling_median(&self, options: RollingOptionsImpl) -> PolarsResult { - if options.by.is_some() { - panic!("'rolling by' not yet supported for 'rolling_median', consider using 'groupby_rolling'") - } + // At the last possible second, right before we do computations, make sure we're using the + // right quantile parameters to get a median. This also lets us have the convenience of + // calling `rolling_median` from Rust without a bunch of dedicated functions that just call + // out to the `rolling_quantile` anyway. + let mut options = options.clone(); + options.fn_params = Some(Arc::new(RollingQuantileParams { + prob: 0.5, + interpol: QuantileInterpolOptions::Linear, + }) as Arc); rolling_agg( &self.0, options, - &rolling::no_nulls::rolling_median, - &rolling::nulls::rolling_median, - None, + &rolling::no_nulls::rolling_quantile, + &rolling::nulls::rolling_quantile, + Some(&super::rolling_kernels::no_nulls::rolling_quantile), ) } /// Apply a rolling quantile (moving quantile) over the values in this array. /// A window of length `window_size` will traverse the array. The values that fill this window /// will (optionally) be weighted according to the `weights` vector. 
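    /// With `prob: 0.5` and `interpol: Linear` in `fn_params` this reproduces the
    /// `rolling_median` above exactly.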
- fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult { - if options.by.is_some() { - panic!("'rolling by' not yet supported for 'rolling_quantile', consider using 'groupby_rolling'") - } - - let options: RollingOptionsFixedWindow = options.into(); - check_input(options.window_size, options.min_periods)?; - let ca = self.0.rechunk(); - - let arr = ca.downcast_iter().next().unwrap(); - let arr = match self.0.has_validity() { - false => rolling::no_nulls::rolling_quantile( - arr.values(), - quantile, - interpolation, - options.window_size, - options.min_periods, - options.center, - options.weights.as_deref(), - ) - .unwrap(), - _ => rolling::nulls::rolling_quantile( - arr, - quantile, - interpolation, - options.window_size, - options.min_periods, - options.center, - options.weights.as_deref(), - ), - }; - Series::try_from((self.0.name(), arr)) + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult { + rolling_agg( + &self.0, + options, + &rolling::no_nulls::rolling_quantile, + &rolling::nulls::rolling_quantile, + Some(&super::rolling_kernels::no_nulls::rolling_quantile), + ) } fn rolling_var(&self, options: RollingOptionsImpl) -> PolarsResult { diff --git a/polars/polars-time/src/chunkedarray/rolling_window/ints.rs b/polars/polars-time/src/chunkedarray/rolling_window/ints.rs index 0e9b163a6ec7..a25664f98cae 100644 --- a/polars/polars-time/src/chunkedarray/rolling_window/ints.rs +++ b/polars/polars-time/src/chunkedarray/rolling_window/ints.rs @@ -32,15 +32,8 @@ where self.0.cast(&DataType::Float64)?.rolling_median(options) } - fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult { - self.0 - .cast(&DataType::Float64)? - .rolling_quantile(quantile, interpolation, options) + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult { + self.0.cast(&DataType::Float64)?.rolling_quantile(options) } fn rolling_min(&self, options: RollingOptionsImpl) -> PolarsResult { diff --git a/polars/polars-time/src/chunkedarray/rolling_window/mod.rs b/polars/polars-time/src/chunkedarray/rolling_window/mod.rs index 72ba20addd2c..ead6d9012eb9 100644 --- a/polars/polars-time/src/chunkedarray/rolling_window/mod.rs +++ b/polars/polars-time/src/chunkedarray/rolling_window/mod.rs @@ -14,8 +14,6 @@ use polars_arrow::data_types::IsFloat; use polars_arrow::export::arrow; #[cfg(feature = "rolling_window")] use polars_arrow::kernels::rolling; -#[cfg(feature = "rolling_window")] -use polars_arrow::prelude::QuantileInterpolOptions; use polars_core::prelude::*; #[cfg(feature = "rolling_window")] @@ -191,12 +189,7 @@ pub trait RollingAgg { /// Apply a rolling quantile (moving quantile) over the values in this array. /// A window of length `window_size` will traverse the array. The values that fill this window /// will (optionally) be weighted according to the `weights` vector. - fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult; + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult; /// Apply a rolling var (moving var) over the values in this array. /// A window of length `window_size` will traverse the array. 
The values that fill this window diff --git a/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs b/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs index 67522451833e..79a349009ead 100644 --- a/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs +++ b/polars/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs @@ -150,3 +150,24 @@ where }; rolling_apply_agg_window::, _, _>(values, offset_iter, params) } + +#[allow(clippy::too_many_arguments)] +pub(crate) fn rolling_quantile( + values: &[T], + period: Duration, + time: &[i64], + closed_window: ClosedWindow, + tu: TimeUnit, + tz: Option<&TimeZone>, + params: DynArgs, +) -> PolarsResult +where + T: NativeType + Float + std::iter::Sum + SubAssign + AddAssign + IsFloat, +{ + let offset_iter = match tz { + #[cfg(feature = "timezones")] + Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => groupby_values_iter(period, time, closed_window, tu, None), + }; + rolling_apply_agg_window::, _, _>(values, offset_iter, params) +} diff --git a/polars/polars-time/src/series/_trait.rs b/polars/polars-time/src/series/_trait.rs index 03722e5ac06c..a4a913f39131 100644 --- a/polars/polars-time/src/series/_trait.rs +++ b/polars/polars-time/src/series/_trait.rs @@ -34,12 +34,7 @@ pub trait SeriesOpsTime { } /// Apply a rolling quantile to a Series. #[cfg(feature = "rolling_window")] - fn rolling_quantile( - &self, - _quantile: f64, - _interpolation: QuantileInterpolOptions, - _options: RollingOptionsImpl, - ) -> PolarsResult { + fn rolling_quantile(&self, _options: RollingOptionsImpl) -> PolarsResult { invalid_operation!(self) } @@ -85,16 +80,9 @@ impl SeriesOpsTime for Series { } /// Apply a rolling quantile to a Series. 
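    /// The probability and interpolation strategy now travel in
    /// `RollingOptionsImpl::fn_params` as a `RollingQuantileParams`.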
#[cfg(feature = "rolling_window")] - fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult { - self.to_ops() - .rolling_quantile(quantile, interpolation, options) + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult { + self.to_ops().rolling_quantile(options) } - #[cfg(feature = "rolling_window")] fn rolling_min(&self, options: RollingOptionsImpl) -> PolarsResult { self.to_ops().rolling_min(options) diff --git a/polars/polars-time/src/series/implementations/floats.rs b/polars/polars-time/src/series/implementations/floats.rs index e47e792483cd..d6a8d9377dab 100644 --- a/polars/polars-time/src/series/implementations/floats.rs +++ b/polars/polars-time/src/series/implementations/floats.rs @@ -27,13 +27,8 @@ where } #[cfg(feature = "rolling_window")] - fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult { - RollingAgg::rolling_quantile(self, quantile, interpolation, options) + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult { + RollingAgg::rolling_quantile(self, options) } #[cfg(feature = "rolling_window")] diff --git a/polars/polars-time/src/series/implementations/integers.rs b/polars/polars-time/src/series/implementations/integers.rs index 1d35649ef93e..8ede537ba771 100644 --- a/polars/polars-time/src/series/implementations/integers.rs +++ b/polars/polars-time/src/series/implementations/integers.rs @@ -24,13 +24,8 @@ where } #[cfg(feature = "rolling_window")] - fn rolling_quantile( - &self, - quantile: f64, - interpolation: QuantileInterpolOptions, - options: RollingOptionsImpl, - ) -> PolarsResult { - RollingAgg::rolling_quantile(self, quantile, interpolation, options) + fn rolling_quantile(&self, options: RollingOptionsImpl) -> PolarsResult { + RollingAgg::rolling_quantile(self, options) } #[cfg(feature = "rolling_window")] diff --git a/polars/tests/it/core/rolling_window.rs b/polars/tests/it/core/rolling_window.rs index 6d037a5722db..babb587f3dba 100644 --- a/polars/tests/it/core/rolling_window.rs +++ b/polars/tests/it/core/rolling_window.rs @@ -1,3 +1,7 @@ +use std::any::Any; + +use polars_core::prelude::QuantileInterpolOptions::Linear; + use super::*; #[test] @@ -309,29 +313,27 @@ fn test_median_quantile_types() { }) .unwrap(); + let rq_params = Some(Arc::new(RollingQuantileParams { + prob: 0.3, + interpol: Linear, + }) as Arc); let rol_quantile = s - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - ..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); let rol_quantile_weighted = s - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - weights: Some(vec![1.0, 2.0]), - ..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + weights: Some(vec![1.0, 2.0]), + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); assert_eq!(*rol_med.dtype(), DataType::Float64); @@ -358,28 +360,22 @@ fn test_median_quantile_types() { .unwrap(); let rol_quantile = s - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - 
..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); let rol_quantile_weighted = s - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - weights: Some(vec![1.0, 2.0]), - ..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + weights: Some(vec![1.0, 2.0]), + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); assert_eq!(*rol_med.dtype(), DataType::Float32); @@ -406,28 +402,22 @@ fn test_median_quantile_types() { .unwrap(); let rol_quantile = s1 - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - ..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); let rol_quantile_weighted = s1 - .rolling_quantile( - 0.3, - QuantileInterpolOptions::Linear, - RollingOptionsImpl { - window_size: Duration::new(2), - min_periods: 1, - weights: Some(vec![1.0, 2.0]), - ..Default::default() - }, - ) + .rolling_quantile(RollingOptionsImpl { + window_size: Duration::new(2), + min_periods: 1, + weights: Some(vec![1.0, 2.0]), + fn_params: rq_params.clone(), + ..Default::default() + }) .unwrap(); assert_eq!(*rol_med.dtype(), DataType::Float64); diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index 8a6ea0834bf7..fa7d0bf638e3 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -925,9 +925,12 @@ impl PyExpr { center, by, closed_window: closed.map(|c| c.0), - ..Default::default() + fn_params: Some(Arc::new(RollingQuantileParams { + prob: 0.5, + interpol: QuantileInterpolOptions::Linear, + }) as Arc), }; - self.inner.clone().rolling_median(options).into() + self.inner.clone().rolling_quantile(options).into() } #[pyo3(signature = (quantile, interpolation, window_size, weights, min_periods, center, by, closed))] @@ -950,13 +953,13 @@ impl PyExpr { center, by, closed_window: closed.map(|c| c.0), - ..Default::default() + fn_params: Some(Arc::new(RollingQuantileParams { + prob: quantile, + interpol: interpolation.0, + }) as Arc), }; - self.inner - .clone() - .rolling_quantile(quantile, interpolation.0, options) - .into() + self.inner.clone().rolling_quantile(options).into() } fn rolling_skew(&self, window_size: usize, bias: bool) -> Self { From 003db95ddaefd2b995c0c280102c91c74d58043a Mon Sep 17 00:00:00 2001 From: Josh Magarick Date: Sat, 15 Jul 2023 00:15:57 -0700 Subject: [PATCH 17/37] perf(rust, python): Rolling min/max for partially sorted data (#9819) --- .../src/kernels/rolling/no_nulls/min_max.rs | 545 +++++++----------- 1 file changed, 211 insertions(+), 334 deletions(-) diff --git a/polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs b/polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs index 58de1a0ec9ee..42be11b9d9ae 100644 --- a/polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs +++ b/polars/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs @@ -3,180 +3,206 @@ use no_nulls::{rolling_apply_agg_window, RollingAggWindowNoNulls}; use super::*; -pub struct SortedMinMax<'a, T: NativeType> { - slice: &'a [T], +#[inline] +fn new_is_min(old: &T, new: &T) -> bool { + compare_fn_nan_min(old, new).is_ge() } -impl<'a, 
T: NativeType> RollingAggWindowNoNulls<'a, T> for SortedMinMax<'a, T> { - fn new(slice: &'a [T], _start: usize, _end: usize, _params: DynArgs) -> Self { - Self { slice } - } - - #[inline] - unsafe fn update(&mut self, start: usize, _end: usize) -> T { - *self.slice.get_unchecked(start) - } +#[inline] +fn new_is_max(old: &T, new: &T) -> bool { + compare_fn_nan_max(old, new).is_le() } #[inline] -unsafe fn get_min_and_idx(slice: &[T], start: usize, end: usize) -> Option<(usize, &T)> +unsafe fn get_min_and_idx( + slice: &[T], + start: usize, + end: usize, + sorted_to: usize, +) -> Option<(usize, &T)> where T: NativeType + IsFloat + PartialOrd, { - // Reversed because min_by returns the first min if there's a tie but we want the last - slice - .get_unchecked(start..end) - .iter() - .enumerate() - .rev() - .min_by(|&a, &b| compare_fn_nan_min(a.1, b.1)) -} - -pub struct MinWindow<'a, T: NativeType + PartialOrd + IsFloat> { - slice: &'a [T], - min: T, - min_idx: usize, - last_start: usize, - last_end: usize, -} - -impl<'a, T: NativeType + IsFloat + PartialOrd> RollingAggWindowNoNulls<'a, T> for MinWindow<'a, T> { - fn new(slice: &'a [T], start: usize, end: usize, _params: DynArgs) -> Self { - let (idx, min) = - unsafe { get_min_and_idx(slice, start, end).unwrap_or((0, &slice[start])) }; - Self { - slice, - min: *min, - min_idx: start + idx, - last_start: start, - last_end: end, - } - } - - unsafe fn update(&mut self, start: usize, end: usize) -> T { - //For details see: https://github.com/pola-rs/polars/pull/9277#issuecomment-1581401692 - self.last_start = start; // Don't care where the last one started - let old_last_end = self.last_end; // But we need this - self.last_end = end; - - let entering_start = std::cmp::max(old_last_end, start); - let entering = get_min_and_idx(self.slice, entering_start, end); - let empty_overlap = old_last_end <= start; - - if entering.is_some_and(|em| compare_fn_nan_min(&self.min, em.1).is_ge() || empty_overlap) { - // If the entering min <= the current min return early, since no value in the overlap can be smaller than either. - self.min = *entering.unwrap().1; - self.min_idx = entering_start + entering.unwrap().0; - return self.min; - } else if self.min_idx >= start || empty_overlap { - // If the entering min isn't the smallest but the current min is between start and end we can still ignore the overlap - return self.min; - } - // Otherwise get the min of the overlapping window and the entering min - match (get_min_and_idx(self.slice, start, old_last_end), entering) { - (Some(pm), Some(em)) => { - if compare_fn_nan_min(pm.1, em.1).is_ge() { - self.min = *em.1; - self.min_idx = entering_start + em.0; + if sorted_to >= end { + // If we're sorted past the end we can just take the first element because this function + // won't be called on intervals that contain the previous min + Some((start, slice.get_unchecked(start))) + } else if sorted_to <= start { + // We have to inspect the whole range + // Reversed because min_by returns the first min if there's a tie but we want the last + slice + .get_unchecked(start..end) + .iter() + .enumerate() + .rev() + .min_by(|&a, &b| compare_fn_nan_min(a.1, b.1)) + .map(|v| (v.0 + start, v.1)) + } else { + // It's sorted in range start..sorted_to. 
Compare slice[start] to min over sorted_to..end + let s = (start, slice.get_unchecked(start)); + slice + .get_unchecked(sorted_to..end) + .iter() + .enumerate() + .rev() + .min_by(|&a, &b| compare_fn_nan_min(a.1, b.1)) + .map(|v| { + if new_is_min(s.1, v.1) { + (v.0 + sorted_to, v.1) } else { - self.min = *pm.1; - self.min_idx = start + pm.0; + s } - } - (Some(pm), None) => { - self.min = *pm.1; - self.min_idx = start + pm.0; - } - (None, Some(em)) => { - self.min = *em.1; - self.min_idx = entering_start + em.0; - } - // We shouldn't reach this, but it means - (None, None) => {} - } - - self.min + }) } } #[inline] -unsafe fn get_max_and_idx(slice: &[T], start: usize, end: usize) -> Option<(usize, &T)> +unsafe fn get_max_and_idx( + slice: &[T], + start: usize, + end: usize, + sorted_to: usize, +) -> Option<(usize, &T)> where T: NativeType + IsFloat + PartialOrd, { + if sorted_to >= end { + Some((start, slice.get_unchecked(start))) + } else if sorted_to <= start { + slice + .get_unchecked(start..end) + .iter() + .enumerate() + .max_by(|&a, &b| compare_fn_nan_max(a.1, b.1)) + .map(|v| (v.0 + start, v.1)) + } else { + let s = (start, slice.get_unchecked(start)); + slice + .get_unchecked(sorted_to..end) + .iter() + .enumerate() + .max_by(|&a, &b| compare_fn_nan_max(a.1, b.1)) + .map(|v| { + if new_is_max(s.1, v.1) { + (v.0 + sorted_to, v.1) + } else { + s + } + }) + } +} + +#[inline] +fn n_sorted_past_min(slice: &[T]) -> usize { slice - .get_unchecked(start..end) - .iter() - .enumerate() - .max_by(|&a, &b| compare_fn_nan_max(a.1, b.1)) + .windows(2) + .position(|x| compare_fn_nan_min(&x[0], &x[1]).is_gt()) + .unwrap_or(slice.len() - 1) } -pub struct MaxWindow<'a, T: NativeType> { - slice: &'a [T], - max: T, - max_idx: usize, - last_start: usize, - last_end: usize, +#[inline] +fn n_sorted_past_max(slice: &[T]) -> usize { + slice + .windows(2) + .position(|x| compare_fn_nan_max(&x[0], &x[1]).is_lt()) + .unwrap_or(slice.len() - 1) } -impl<'a, T: NativeType + IsFloat + PartialOrd> RollingAggWindowNoNulls<'a, T> for MaxWindow<'a, T> { - fn new(slice: &'a [T], start: usize, end: usize, _params: DynArgs) -> Self { - let (idx, max) = - unsafe { get_max_and_idx(slice, start, end).unwrap_or((0, &slice[start])) }; - Self { - slice, - max: *max, - max_idx: start + idx, - last_start: start, - last_end: end, +// Min and max really are the same thing up to a difference in comparison direction, as represented +// here by helpers we pass in. Making both with a macro helps keep behavior synchronized +macro_rules! minmax_window { + ($m_window:tt, $get_m_and_idx:ident, $new_is_m:ident, $n_sorted_past:ident) => { + pub struct $m_window<'a, T: NativeType + PartialOrd + IsFloat> { + slice: &'a [T], + m: T, + m_idx: usize, + sorted_to: usize, + last_start: usize, + last_end: usize, } - } - unsafe fn update(&mut self, start: usize, end: usize) -> T { - self.last_start = start; // Don't care where the last one started - let old_last_end = self.last_end; // But we need this - self.last_end = end; + impl<'a, T: NativeType + IsFloat + PartialOrd> $m_window<'a, T> { + #[inline] + unsafe fn update_m_and_m_idx(&mut self, m_and_idx: (usize, &T)) { + self.m = *m_and_idx.1; + self.m_idx = m_and_idx.0; + if self.sorted_to <= self.m_idx { + // Track how far past the current extremum values are sorted. Direction depends on min/max + // Tracking sorted ranges lets us only do comparisons when we have to. 
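+                    // Illustrative (hypothetical input, not from the patch): for
+                    // slice = [1, 2, 3, 0] with a new min at m_idx = 0,
+                    // n_sorted_past_min(&slice[0..]) returns 2, so sorted_to becomes
+                    // 0 + 1 + 2 = 3 and any later window with end <= 3 can read
+                    // slice[start] directly instead of scanning.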
+ self.sorted_to = + self.m_idx + 1 + $n_sorted_past(&self.slice.get_unchecked(self.m_idx..)); + } + } + } - let entering_start = std::cmp::max(old_last_end, start); - let entering = get_max_and_idx(self.slice, entering_start, end); - let empty_overlap = old_last_end < start; + impl<'a, T: NativeType + IsFloat + PartialOrd> RollingAggWindowNoNulls<'a, T> + for $m_window<'a, T> + { + fn new(slice: &'a [T], start: usize, end: usize, _params: DynArgs) -> Self { + let (idx, m) = + unsafe { $get_m_and_idx(slice, start, end, 0).unwrap_or((0, &slice[start])) }; + Self { + slice, + m: *m, + m_idx: idx, + sorted_to: idx + 1 + $n_sorted_past(&slice[idx..]), + last_start: start, + last_end: end, + } + } - if entering.is_some_and(|em| compare_fn_nan_max(&self.max, em.1).is_le() || empty_overlap) { - // If the entering max >= the current max return early, since no value in the overlap can be larger than either. - self.max = *entering.unwrap().1; - self.max_idx = entering_start + entering.unwrap().0; - return self.max; - } else if self.max_idx >= start || empty_overlap { - // If the entering max isn't the largest but the current max is between start and end we can still ignore the overlap - return self.max; - } - // Otherwise get the max of the overlapping window and the entering max - match (get_max_and_idx(self.slice, start, old_last_end), entering) { - (Some(pm), Some(em)) => { - if compare_fn_nan_max(pm.1, em.1).is_le() { - self.max = *em.1; - self.max_idx = entering_start + em.0; + unsafe fn update(&mut self, start: usize, end: usize) -> T { + //For details see: https://github.com/pola-rs/polars/pull/9277#issuecomment-1581401692 + self.last_start = start; // Don't care where the last one started + let old_last_end = self.last_end; // But we need this + self.last_end = end; + let entering_start = std::cmp::max(old_last_end, start); + let entering = if end - entering_start == 1 { + // Faster in the special, but common, case of a fixed window rolling by one + Some((entering_start, self.slice.get_unchecked(entering_start))) + } else if old_last_end == end { + // Edge case for shrinking windows + None } else { - self.max = *pm.1; - self.max_idx = start + pm.0; + $get_m_and_idx(self.slice, entering_start, end, self.sorted_to) + }; + let empty_overlap = old_last_end <= start; + + if entering.is_some_and(|em| $new_is_m(&self.m, em.1) || empty_overlap) { + // The entering extremum "beats" the previous extremum so we can ignore the overlap + self.update_m_and_m_idx(entering.unwrap()); + return self.m; + } else if self.m_idx >= start || empty_overlap { + // The previous extremum didn't drop off. 
Keep it + return self.m; } + // Otherwise get the min of the overlapping window and the entering min + match ( + $get_m_and_idx(self.slice, start, old_last_end, self.sorted_to), + entering, + ) { + (Some(pm), Some(em)) => { + if $new_is_m(pm.1, em.1) { + self.update_m_and_m_idx(em); + } else { + self.update_m_and_m_idx(pm); + } + } + (Some(pm), None) => self.update_m_and_m_idx(pm), + (None, Some(em)) => self.update_m_and_m_idx(em), + // This would mean both the entering and previous windows are empty + (None, None) => unreachable!(), + } + + self.m } - (Some(pm), None) => { - self.max = *pm.1; - self.max_idx = start + pm.0; - } - (None, Some(em)) => { - self.max = *em.1; - self.max_idx = entering_start + em.0; - } - // We shouldn't reach this, but it means - (None, None) => {} } - - self.max - } + }; } +minmax_window!(MinWindow, get_min_and_idx, new_is_min, n_sorted_past_min); +minmax_window!(MaxWindow, get_max_and_idx, new_is_max, n_sorted_past_max); + pub(crate) fn compute_min_weights(values: &[T], weights: &[T]) -> T where T: NativeType + PartialOrd + std::ops::Mul, @@ -206,206 +232,57 @@ where max } -pub fn is_reverse_sorted_max(values: &[T]) -> bool { - values - .windows(2) - .all(|w| match compare_fn_nan_min(&w[0], &w[1]) { - Ordering::Equal => true, - Ordering::Greater => true, - Ordering::Less => false, - }) -} - -pub fn rolling_max( - values: &[T], - window_size: usize, - min_periods: usize, - center: bool, - weights: Option<&[f64]>, - _params: DynArgs, -) -> PolarsResult -where - T: NativeType + PartialOrd + IsFloat + Bounded + NumCast + Mul, -{ - match (center, weights) { - (true, None) => { - // will be O(n2) if we don't take this path we hope that we hit an early return on not sorted data - if is_reverse_sorted_max(values) { - rolling_apply_agg_window::, _, _>( +// Same as the window definition. The dispatch is identical up to the name. +macro_rules! 
rolling_minmax_func { + ($rolling_m:ident, $window:tt, $wtd_f:ident) => { + pub fn $rolling_m( + values: &[T], + window_size: usize, + min_periods: usize, + center: bool, + weights: Option<&[f64]>, + _params: DynArgs, + ) -> PolarsResult + where + T: NativeType + PartialOrd + IsFloat + Bounded + NumCast + Mul, + { + let offset_fn = match center { + true => det_offsets_center, + false => det_offsets, + }; + match weights { + None => rolling_apply_agg_window::<$window<_>, _, _>( values, window_size, min_periods, - det_offsets_center, + offset_fn, None, - ) - } else { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets_center, - None, - ) - } - } - (false, None) => { - if is_reverse_sorted_max(values) { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets, - None, - ) - } else { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets, - None, - ) + ), + Some(weights) => { + assert!( + T::is_float(), + "implementation error, should only be reachable by float types" + ); + let weights = weights + .iter() + .map(|v| NumCast::from(*v).unwrap()) + .collect::>(); + no_nulls::rolling_apply_weights( + values, + window_size, + min_periods, + offset_fn, + $wtd_f, + &weights, + ) + } } } - (true, Some(weights)) => { - assert!( - T::is_float(), - "implementation error, should only be reachable by float types" - ); - let weights = weights - .iter() - .map(|v| NumCast::from(*v).unwrap()) - .collect::>(); - no_nulls::rolling_apply_weights( - values, - window_size, - min_periods, - det_offsets_center, - compute_max_weights, - &weights, - ) - } - (false, Some(weights)) => { - assert!( - T::is_float(), - "implementation error, should only be reachable by float types" - ); - let weights = weights - .iter() - .map(|v| NumCast::from(*v).unwrap()) - .collect::>(); - no_nulls::rolling_apply_weights( - values, - window_size, - min_periods, - det_offsets, - compute_max_weights, - &weights, - ) - } - } + }; } -pub fn is_sorted_min(values: &[T]) -> bool { - values - .windows(2) - .all(|w| match compare_fn_nan_min(&w[0], &w[1]) { - Ordering::Equal => true, - Ordering::Less => true, - Ordering::Greater => false, - }) -} - -pub fn rolling_min( - values: &[T], - window_size: usize, - min_periods: usize, - center: bool, - weights: Option<&[f64]>, - _params: DynArgs, -) -> PolarsResult -where - T: NativeType + PartialOrd + NumCast + Mul + Bounded + IsFloat, -{ - match (center, weights) { - (true, None) => { - // will be O(n2) if we don't take this path we hope that we hit an early return on not sorted data - if is_sorted_min(values) { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets_center, - None, - ) - } else { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets_center, - None, - ) - } - } - (false, None) => { - // will be O(n2) - if is_sorted_min(values) { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets, - None, - ) - } else { - rolling_apply_agg_window::, _, _>( - values, - window_size, - min_periods, - det_offsets, - None, - ) - } - } - (true, Some(weights)) => { - assert!( - T::is_float(), - "implementation error, should only be reachable by float types" - ); - let weights = weights - .iter() - .map(|v| NumCast::from(*v).unwrap()) - .collect::>(); - no_nulls::rolling_apply_weights( - values, - window_size, - min_periods, - det_offsets_center, - compute_min_weights, - &weights, - ) - } - 
(false, Some(weights)) => { - assert!( - T::is_float(), - "implementation error, should only be reachable by float types" - ); - let weights = weights - .iter() - .map(|v| NumCast::from(*v).unwrap()) - .collect::>(); - no_nulls::rolling_apply_weights( - values, - window_size, - min_periods, - det_offsets, - compute_min_weights, - &weights, - ) - } - } -} +rolling_minmax_func!(rolling_min, MinWindow, compute_min_weights); +rolling_minmax_func!(rolling_max, MaxWindow, compute_max_weights); #[cfg(test)] mod test { From 5810a1dc9081eb0e8da32cca209d0a289cf79e52 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 09:57:32 +0200 Subject: [PATCH 18/37] fix(rust, python): sum aggregation empty set is 0, not null (#9894) --- .../src/chunked_array/upstream_traits.rs | 11 ++--- .../src/frame/groupby/aggregations/mod.rs | 41 +++++++++++++++---- polars/polars-lazy/src/tests/queries.rs | 2 +- polars/tests/it/lazy/groupby.rs | 2 +- py-polars/polars/expr/expr.py | 32 +++++++-------- .../unit/operations/test_aggregations.py | 4 ++ .../tests/unit/operations/test_rolling.py | 8 +++- .../tests/unit/operations/test_window.py | 2 +- py-polars/tests/unit/test_queries.py | 2 +- 9 files changed, 66 insertions(+), 38 deletions(-) diff --git a/polars/polars-core/src/chunked_array/upstream_traits.rs b/polars/polars-core/src/chunked_array/upstream_traits.rs index 4c3dbb4ae232..45e514c42234 100644 --- a/polars/polars-core/src/chunked_array/upstream_traits.rs +++ b/polars/polars-core/src/chunked_array/upstream_traits.rs @@ -421,14 +421,9 @@ where fn from_par_iter>(iter: I) -> Self { // Get linkedlist filled with different vec result from different threads let vectors = collect_into_linked_list(iter); - let capacity: usize = get_capacity_from_par_results(&vectors); - - let mut av = Vec::::with_capacity(capacity); - for v in vectors { - av.extend_from_slice(&v) - } - let arr = to_array::(av, None); - unsafe { NoNull::new(ChunkedArray::from_chunks("", vec![arr])) } + let vectors = vectors.into_iter().collect::>(); + let values = flatten_par(&vectors); + NoNull::new(ChunkedArray::new_vec("", values)) } } diff --git a/polars/polars-core/src/frame/groupby/aggregations/mod.rs b/polars/polars-core/src/frame/groupby/aggregations/mod.rs index c9400cd7d14c..2c17f80fc3c2 100644 --- a/polars/polars-core/src/frame/groupby/aggregations/mod.rs +++ b/polars/polars-core/src/frame/groupby/aggregations/mod.rs @@ -29,6 +29,7 @@ use crate::frame::groupby::GroupsIndicator; use crate::prelude::*; use crate::series::implementations::SeriesWrap; use crate::series::IsSorted; +use crate::utils::NoNull; use crate::{apply_method_physical_integer, POOL}; fn idx2usize(idx: &[IdxSize]) -> impl Iterator + ExactSizeIterator + '_ { @@ -167,6 +168,17 @@ where ca.into_series() } +// same helper as `_agg_helper_idx` but for aggregations that don't return an Option +pub fn _agg_helper_idx_no_null(groups: &GroupsIdx, f: F) -> Series +where + F: Fn((IdxSize, &Vec)) -> T::Native + Send + Sync, + T: PolarsNumericType, + ChunkedArray: IntoSeries, +{ + let ca: NoNull> = POOL.install(|| groups.into_par_iter().map(f).collect()); + ca.into_inner().into_series() +} + // helper that iterates on the `all: Vec` collection // this doesn't have traverse the `first: Vec` memory and is therefore faster fn agg_helper_idx_on_all(groups: &GroupsIdx, f: F) -> Series @@ -189,6 +201,16 @@ where ca.into_series() } +pub fn _agg_helper_slice_no_null(groups: &[[IdxSize; 2]], f: F) -> Series +where + F: Fn([IdxSize; 2]) -> T::Native + Send + Sync, + T: PolarsNumericType, + 
ChunkedArray: IntoSeries, +{ + let ca: NoNull> = POOL.install(|| groups.par_iter().copied().map(f).collect()); + ca.into_inner().into_series() +} + #[inline(always)] fn take_min(a: T, b: T) -> T { if a < b { @@ -548,19 +570,19 @@ where let ca = self.rechunk(); let arr = ca.downcast_iter().next().unwrap(); let no_nulls = arr.null_count() == 0; - _agg_helper_idx::(groups, |(first, idx)| { + _agg_helper_idx_no_null::(groups, |(first, idx)| { debug_assert!(idx.len() <= self.len()); if idx.is_empty() { - None + T::Native::zero() } else if idx.len() == 1 { - arr.get(first as usize) + arr.get(first as usize).unwrap_or(T::Native::zero()) } else if no_nulls { - Some(take_agg_no_null_primitive_iter_unchecked( + take_agg_no_null_primitive_iter_unchecked( arr, idx2usize(idx), |a, b| a + b, T::Native::zero(), - )) + ) } else { take_agg_primitive_iter_unchecked::( arr, @@ -569,6 +591,7 @@ where T::Native::zero(), idx.len() as IdxSize, ) + .unwrap_or(T::Native::zero()) } }) } @@ -593,14 +616,14 @@ where }; Self::from_chunks("", vec![arr]).into_series() } else { - _agg_helper_slice::(groups, |[first, len]| { + _agg_helper_slice_no_null::(groups, |[first, len]| { debug_assert!(len <= self.len() as IdxSize); match len { - 0 => None, - 1 => self.get(first as usize), + 0 => T::Native::zero(), + 1 => self.get(first as usize).unwrap_or(T::Native::zero()), _ => { let arr_group = _slice_from_offsets(self, first, len); - arr_group.sum() + arr_group.sum().unwrap_or(T::Native::zero()) } } }) diff --git a/polars/polars-lazy/src/tests/queries.rs b/polars/polars-lazy/src/tests/queries.rs index a1bc18dd27f7..2a6dc72fc4f9 100644 --- a/polars/polars-lazy/src/tests/queries.rs +++ b/polars/polars-lazy/src/tests/queries.rs @@ -907,7 +907,7 @@ fn test_lazy_groupby_filter() -> PolarsResult<()> { assert_eq!( Vec::from(out.column("b_sum").unwrap().i32().unwrap()), - [Some(6), None, None] + [Some(6), Some(0), Some(0)] ); assert_eq!( Vec::from(out.column("b_first").unwrap().i32().unwrap()), diff --git a/polars/tests/it/lazy/groupby.rs b/polars/tests/it/lazy/groupby.rs index c74cf3363ed0..6e3ed7e09666 100644 --- a/polars/tests/it/lazy/groupby.rs +++ b/polars/tests/it/lazy/groupby.rs @@ -27,7 +27,7 @@ fn test_filter_sort_diff_2984() -> PolarsResult<()> { .sort("group", Default::default()) .collect()?; - assert_eq!(Vec::from(out.column("id")?.i32()?), &[Some(1), None]); + assert_eq!(Vec::from(out.column("id")?.i32()?), &[Some(1), Some(0)]); Ok(()) } diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 768bed40458d..f1e7853c618d 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3492,14 +3492,14 @@ def filter(self, predicate: Expr) -> Self: ... ] ... ).sort("group_col") shape: (2, 3) - ┌───────────┬──────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪══════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ null ┆ 3 │ - └───────────┴──────┴─────┘ + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ """ return self._from_pyexpr(self._pyexpr.filter(predicate._pyexpr)) @@ -3530,14 +3530,14 @@ def where(self, predicate: Expr) -> Self: ... ] ... 
).sort("group_col") shape: (2, 3) - ┌───────────┬──────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪══════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ null ┆ 3 │ - └───────────┴──────┴─────┘ + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ """ return self.filter(predicate) diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/test_aggregations.py index 425729fc95e9..b0e731c3c536 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/test_aggregations.py @@ -276,3 +276,7 @@ def test_sum_empty_and_null_set() -> None: series = pl.Series("a", [None]) assert series.sum() == 0 + + df = pl.DataFrame({"a": [None, None, None], "b": [1, 1, 1]}) + assert df.select(pl.sum("a")).item() == 0.0 + assert df.groupby("b").agg(pl.sum("a"))["a"].item() == 0.0 diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 9218de269fb1..81096064da74 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -50,7 +50,13 @@ def test_rolling_kernels_and_groupby_rolling( out1 = example_df.select( [ pl.col("dt"), - pl.col("values").rolling_sum(period, by="dt", closed=closed).alias("sum"), + # this differs from groupby aggregation because the empty window is + # null here + # where the sum aggregation of an empty set is 0 + pl.col("values") + .rolling_sum(period, by="dt", closed=closed) + .fill_null(0) + .alias("sum"), pl.col("values").rolling_var(period, by="dt", closed=closed).alias("var"), pl.col("values").rolling_mean(period, by="dt", closed=closed).alias("mean"), pl.col("values").rolling_std(period, by="dt", closed=closed).alias("std"), diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index 54643364cee5..67ccd2c9ce63 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -308,7 +308,7 @@ def test_window_5868() -> None: df = pl.DataFrame({"a": [None, 1, 2, 3, 3, 3, 4, 4]}) result = df.select(pl.col("a").sum().over("a")).get_column("a") - expected = pl.Series("a", [None, 1, 2, 9, 9, 9, 8, 8]) + expected = pl.Series("a", [0, 1, 2, 9, 9, 9, 8, 8]) assert_series_equal(result, expected) result = ( diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 3e333b852df1..6f658157c275 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -194,7 +194,7 @@ def test_groupby_agg_equals_zero_3535() -> None: ).to_dict(False) == { "key": ["aa", "bb", "cc"], "val1": [10, 0, -99], - "val2": [None, 0.0, 10.5], + "val2": [0.0, 0.0, 10.5], } From c8a98f9d4c0fb0273e59b51b653dd6dcec187f99 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 15 Jul 2023 10:03:15 +0200 Subject: [PATCH 19/37] fix(python): Handle `DataFrame.vstack` stacking itself (#9895) --- py-polars/polars/dataframe/frame.py | 24 ++++++---- py-polars/polars/series/series.py | 4 +- py-polars/src/dataframe.rs | 24 +++++----- .../tests/unit/{ => dataframe}/test_df.py | 16 ------- py-polars/tests/unit/dataframe/test_vstack.py | 46 +++++++++++++++++++ py-polars/tests/unit/test_lazy.py | 2 +- 6 files changed, 77 insertions(+), 39 deletions(-) rename py-polars/tests/unit/{ => 
dataframe}/test_df.py (99%) create mode 100644 py-polars/tests/unit/dataframe/test_vstack.py diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index dc648c8a2883..f3f1a71b8f0d 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5811,16 +5811,17 @@ def hstack( else: return self._from_pydf(self._df.hstack([s._s for s in columns])) - def vstack(self, df: DataFrame, *, in_place: bool = False) -> Self: + @deprecated_alias(df="other") + def vstack(self, other: DataFrame, *, in_place: bool = False) -> Self: """ Grow this DataFrame vertically by stacking a DataFrame to it. Parameters ---------- - df + other DataFrame to stack. in_place - Modify in place + Modify in place. Examples -------- @@ -5853,12 +5854,19 @@ def vstack(self, df: DataFrame, *, in_place: bool = False) -> Self: """ if in_place: - self._df.vstack_mut(df._df) - return self - else: - return self._from_pydf(self._df.vstack(df._df)) + try: + self._df.vstack_mut(other._df) + return self + except RuntimeError as exc: + if str(exc) == "Already mutably borrowed": + self._df.vstack_mut(other._df.clone()) + return self + else: + raise exc + + return self._from_pydf(self._df.vstack(other._df)) - def extend(self, other: Self) -> Self: + def extend(self, other: DataFrame) -> Self: """ Extend the memory backed by this `DataFrame` with the values from `other`. diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index b36da7732e86..3f7b1a3f5c38 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2358,12 +2358,12 @@ def append(self, other: Series, *, append_chunks: bool = True) -> Series: self._s.append(other._s) else: self._s.extend(other._s) + return self except RuntimeError as exc: if str(exc) == "Already mutably borrowed": - self.append(other.clone(), append_chunks=append_chunks) + return self.append(other.clone(), append_chunks=append_chunks) else: raise exc - return self def filter(self, predicate: Series | list[bool]) -> Self: """ diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 3d8cf8de109a..11aeb760cb60 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -940,33 +940,33 @@ impl PyDataFrame { self.df.width() } + pub fn hstack(&self, columns: Vec) -> PyResult { + let columns = columns.to_series(); + let df = self.df.hstack(&columns).map_err(PyPolarsErr::from)?; + Ok(df.into()) + } + pub fn hstack_mut(&mut self, columns: Vec) -> PyResult<()> { let columns = columns.to_series(); self.df.hstack_mut(&columns).map_err(PyPolarsErr::from)?; Ok(()) } - pub fn hstack(&self, columns: Vec) -> PyResult { - let columns = columns.to_series(); - let df = self.df.hstack(&columns).map_err(PyPolarsErr::from)?; + pub fn vstack(&self, other: &PyDataFrame) -> PyResult { + let df = self.df.vstack(&other.df).map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn extend(&mut self, df: &PyDataFrame) -> PyResult<()> { - self.df.extend(&df.df).map_err(PyPolarsErr::from)?; + pub fn vstack_mut(&mut self, other: &PyDataFrame) -> PyResult<()> { + self.df.vstack_mut(&other.df).map_err(PyPolarsErr::from)?; Ok(()) } - pub fn vstack_mut(&mut self, df: &PyDataFrame) -> PyResult<()> { - self.df.vstack_mut(&df.df).map_err(PyPolarsErr::from)?; + pub fn extend(&mut self, other: &PyDataFrame) -> PyResult<()> { + self.df.extend(&other.df).map_err(PyPolarsErr::from)?; Ok(()) } - pub fn vstack(&mut self, df: &PyDataFrame) -> PyResult { - let df = 
self.df.vstack(&df.df).map_err(PyPolarsErr::from)?; - Ok(df.into()) - } - pub fn drop_in_place(&mut self, name: &str) -> PyResult { let s = self.df.drop_in_place(name).map_err(PyPolarsErr::from)?; Ok(PySeries { series: s }) diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/dataframe/test_df.py similarity index 99% rename from py-polars/tests/unit/test_df.py rename to py-polars/tests/unit/dataframe/test_df.py index 6f1ac0d0e019..a2056ee74bef 100644 --- a/py-polars/tests/unit/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -704,22 +704,6 @@ def test_hstack_dataframe(in_place: bool) -> None: assert_frame_equal(df_out, expected) -@pytest.mark.parametrize("in_place", [True, False]) -def test_vstack(in_place: bool) -> None: - df1 = pl.DataFrame({"foo": [1, 2], "bar": [6, 7], "ham": ["a", "b"]}) - df2 = pl.DataFrame({"foo": [3, 4], "bar": [8, 9], "ham": ["c", "d"]}) - - expected = pl.DataFrame( - {"foo": [1, 2, 3, 4], "bar": [6, 7, 8, 9], "ham": ["a", "b", "c", "d"]} - ) - - out = df1.vstack(df2, in_place=in_place) - if in_place: - assert_frame_equal(df1, expected) - else: - assert_frame_equal(out, expected) - - def test_extend() -> None: with pl.StringCache(): df1 = pl.DataFrame( diff --git a/py-polars/tests/unit/dataframe/test_vstack.py b/py-polars/tests/unit/dataframe/test_vstack.py new file mode 100644 index 000000000000..ecf88a2f987f --- /dev/null +++ b/py-polars/tests/unit/dataframe/test_vstack.py @@ -0,0 +1,46 @@ +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +@pytest.fixture() +def df1() -> pl.DataFrame: + return pl.DataFrame({"foo": [1, 2], "bar": [6, 7], "ham": ["a", "b"]}) + + +@pytest.fixture() +def df2() -> pl.DataFrame: + return pl.DataFrame({"foo": [3, 4], "bar": [8, 9], "ham": ["c", "d"]}) + + +def test_vstack(df1: pl.DataFrame, df2: pl.DataFrame) -> None: + result = df1.vstack(df2) + expected = pl.DataFrame( + {"foo": [1, 2, 3, 4], "bar": [6, 7, 8, 9], "ham": ["a", "b", "c", "d"]} + ) + assert_frame_equal(result, expected) + + +def test_vstack_in_place(df1: pl.DataFrame, df2: pl.DataFrame) -> None: + df1.vstack(df2, in_place=True) + expected = pl.DataFrame( + {"foo": [1, 2, 3, 4], "bar": [6, 7, 8, 9], "ham": ["a", "b", "c", "d"]} + ) + assert_frame_equal(df1, expected) + + +def test_vstack_self(df1: pl.DataFrame) -> None: + result = df1.vstack(df1) + expected = pl.DataFrame( + {"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]} + ) + assert_frame_equal(result, expected) + + +def test_vstack_self_in_place(df1: pl.DataFrame) -> None: + df1.vstack(df1, in_place=True) + expected = pl.DataFrame( + {"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]} + ) + assert_frame_equal(df1, expected) diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 1d885bcd893c..377ccb624b55 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -1099,7 +1099,7 @@ def test_lazy_concat(df: pl.DataFrame) -> None: out = pl.concat([df.lazy(), df.lazy()]).collect() assert out.shape == shape - assert_frame_equal(out, df.vstack(df.clone())) + assert_frame_equal(out, df.vstack(df)) def test_self_join() -> None: From edc9894b1e5737ab6a0e1ba7b4c35e82d9f1ba21 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 10:49:07 +0200 Subject: [PATCH 20/37] feat(rust, python): pass through unknown schema in unnest (#9896) --- .../src/logical_plan/functions/mod.rs | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 
deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs index 540ce8962960..550040af3b73 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs @@ -196,15 +196,23 @@ impl FunctionNode { let mut new_schema = Schema::with_capacity(input_schema.len() * 2); for (name, dtype) in input_schema.iter() { if _columns.iter().any(|item| item.as_ref() == name.as_str()) { - if let DataType::Struct(flds) = dtype { - for fld in flds { - new_schema - .with_column(fld.name().clone(), fld.data_type().clone()); + match dtype { + DataType::Struct(flds) => { + for fld in flds { + new_schema.with_column( + fld.name().clone(), + fld.data_type().clone(), + ); + } + } + DataType::Unknown => { + // pass through unknown + } + _ => { + polars_bail!( + SchemaMismatch: "expected struct dtype, got: `{}`", dtype + ); } - } else { - polars_bail!( - SchemaMismatch: "expected struct dtype, got: `{}`", dtype - ); } } else { new_schema.with_column(name.clone(), dtype.clone()); From af8596582a7f1c300d225cc5ab5aba067a0434c8 Mon Sep 17 00:00:00 2001 From: Bela Stoyan Date: Sat, 15 Jul 2023 10:49:31 +0200 Subject: [PATCH 21/37] fix(rust,python) respect original series dtype when constructing `LitIter` (#9886) --- .../physical_plan/expressions/group_iter.rs | 55 ++++++++++++------- py-polars/tests/unit/test_schema.py | 22 ++++++++ 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/polars/polars-lazy/src/physical_plan/expressions/group_iter.rs b/polars/polars-lazy/src/physical_plan/expressions/group_iter.rs index c7813506f18b..048b53e7e649 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/group_iter.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/group_iter.rs @@ -14,22 +14,29 @@ impl<'a> AggregationContext<'a> { self.groups(); let s = self.series().rechunk(); let name = if keep_names { s.name() } else { "" }; - Box::new(LitIter::new( - s.array_ref(0).clone(), - self.groups.len(), - name, - )) + // safety: dtype is correct + unsafe { + Box::new(LitIter::new( + s.array_ref(0).clone(), + self.groups.len(), + s._dtype(), + name, + )) + } } AggState::AggregatedFlat(_) => { self.groups(); let s = self.series(); let name = if keep_names { s.name() } else { "" }; - Box::new(FlatIter::new( - s.array_ref(0).clone(), - self.groups.len(), - s.dtype(), - name, - )) + // safety: dtype is correct + unsafe { + Box::new(FlatIter::new( + s.array_ref(0).clone(), + self.groups.len(), + s.dtype(), + name, + )) + } } AggState::AggregatedList(_) => { let s = self.series(); @@ -59,8 +66,15 @@ struct LitIter<'a> { } impl<'a> LitIter<'a> { - fn new(array: ArrayRef, len: usize, name: &str) -> Self { - let mut series_container = Box::pin(Series::try_from((name, array.clone())).unwrap()); + /// # Safety + /// Caller must ensure the given `logical` dtype belongs to `array`. 
+ unsafe fn new(array: ArrayRef, len: usize, logical: &DataType, name: &str) -> Self { + let mut series_container = Box::pin(Series::from_chunks_and_dtype_unchecked( + name, + vec![array], + logical, + )); + let ref_s = &mut *series_container as *mut Series; Self { offset: 0, @@ -100,13 +114,14 @@ struct FlatIter<'a> { } impl<'a> FlatIter<'a> { - fn new(array: ArrayRef, len: usize, logical: &DataType, name: &str) -> Self { - let mut series_container = Box::pin( - Series::try_from((name, array.clone())) - .unwrap() - .cast(logical) - .unwrap(), - ); + /// # Safety + /// Caller must ensure the given `logical` dtype belongs to `array`. + unsafe fn new(array: ArrayRef, len: usize, logical: &DataType, name: &str) -> Self { + let mut series_container = Box::pin(Series::from_chunks_and_dtype_unchecked( + name, + vec![array.clone()], + logical, + )); let ref_s = &mut *series_container as *mut Series; Self { array, diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 3f3a28cc0767..d88bc41df905 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import date, timedelta from typing import Any import pytest @@ -513,3 +514,24 @@ def test_concat_vertically_relaxed() -> None: "a": [1.0, 0.2, 1.0, 2.0], "b": [None, 0.1, 2.0, 1.0], } + + +def test_lit_iter_schema() -> None: + df = pl.DataFrame( + { + "key": ["A", "A", "A", "A"], + "dates": [ + date(1970, 1, 1), + date(1970, 1, 1), + date(1970, 1, 2), + date(1970, 1, 3), + ], + } + ) + + assert df.groupby("key").agg(pl.col("dates").unique() + timedelta(days=1)).to_dict( + False + ) == { + "key": ["A"], + "dates": [[date(1970, 1, 2), date(1970, 1, 3), date(1970, 1, 4)]], + } From 672922491bac1f144747d39b864106d90010fd1e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 11:50:03 +0200 Subject: [PATCH 22/37] rust polars 0.31.0 (#9898) --- Cargo.toml | 8 ++++---- polars-cli/Cargo.toml | 6 +++--- polars/Cargo.toml | 14 +++++++------- polars/polars-algo/Cargo.toml | 6 +++--- polars/polars-arrow/Cargo.toml | 2 +- polars/polars-core/Cargo.toml | 8 ++++---- polars/polars-error/Cargo.toml | 2 +- polars/polars-io/Cargo.toml | 12 ++++++------ polars/polars-json/Cargo.toml | 6 +++--- polars/polars-lazy/Cargo.toml | 18 +++++++++--------- polars/polars-lazy/polars-pipe/Cargo.toml | 14 +++++++------- polars/polars-lazy/polars-plan/Cargo.toml | 12 ++++++------ polars/polars-ops/Cargo.toml | 8 ++++---- .../src/chunked_array/array/min_max.rs | 2 +- .../src/chunked_array/list/any_all.rs | 2 +- polars/polars-row/Cargo.toml | 4 ++-- polars/polars-sql/Cargo.toml | 8 ++++---- polars/polars-time/Cargo.toml | 8 ++++---- 18 files changed, 70 insertions(+), 70 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6fa06baf9c27..be81549ee883 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ exclude = [ ] [workspace.package] -version = "0.30.0" +version = "0.31.1" [workspace.dependencies] rayon = "1.6" @@ -33,11 +33,11 @@ strum_macros = "0.25" [workspace.dependencies.arrow] package = "arrow2" # git = "https://github.com/jorgecarleitao/arrow2" -git = "https://github.com/ritchie46/arrow2" +# git = "https://github.com/ritchie46/arrow2" # rev = "2d2e7053f9a50810bfe9cecff25ab39089aef98e" # path = "../arrow2" -branch = "polars_2023-06-26" -version = "0.17" +# branch = "polars_2023-06-26" +version = "0.17.2" default-features = false features = [ "compute_aggregate", diff --git a/polars-cli/Cargo.toml b/polars-cli/Cargo.toml 
index 8eca407c93ed..f6d8bd3c56c9 100644 --- a/polars-cli/Cargo.toml +++ b/polars-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars-cli" -version = "0.2.0" +version = "0.3.0" edition = "2021" license = "MIT" repository = "https://github.com/pola-rs/polars" @@ -28,11 +28,11 @@ ciborium = "0.2.0" clap = { version = "4.2.2", features = ["derive", "cargo"] } nu-ansi-term = { version = "0.47.0", optional = true } once_cell.workspace = true -polars = { version = "0.30.0", path = "../polars", features = ["lazy", "sql", "dtype-full", "serde-lazy"] } +polars = { version = "0.31.1", path = "../polars", features = ["lazy", "sql", "dtype-full", "serde-lazy"] } reedline = { version = "0.21.0" } serde = { version = "1.0.160", features = ["derive"] } sqlparser = "0.34" tmp_env = "0.1.1" [target.'cfg(target_os = "linux")'.dependencies] -jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } +jemallocator = { version = "0.5.0", features = ["disable_initial_exec_tls"] } diff --git a/polars/Cargo.toml b/polars/Cargo.toml index cdd74c68a9cd..48a3ef38dc15 100644 --- a/polars/Cargo.toml +++ b/polars/Cargo.toml @@ -312,13 +312,13 @@ bench = [ ] [dependencies] -polars-algo = { version = "0.30.0", path = "./polars-algo", optional = true } -polars-core = { version = "0.30.0", path = "./polars-core", features = ["docs"], default-features = false } -polars-io = { version = "0.30.0", path = "./polars-io", features = [], default-features = false, optional = true } -polars-lazy = { version = "0.30.0", path = "./polars-lazy", features = [], default-features = false, optional = true } -polars-ops = { version = "0.30.0", path = "./polars-ops" } -polars-sql = { version = "0.30.0", path = "./polars-sql", default-features = false, optional = true } -polars-time = { version = "0.30.0", path = "./polars-time", default-features = false, optional = true } +polars-algo = { version = "0.31.1", path = "./polars-algo", optional = true } +polars-core = { version = "0.31.1", path = "./polars-core", features = ["docs"], default-features = false } +polars-io = { version = "0.31.1", path = "./polars-io", features = [], default-features = false, optional = true } +polars-lazy = { version = "0.31.1", path = "./polars-lazy", features = [], default-features = false, optional = true } +polars-ops = { version = "0.31.1", path = "./polars-ops" } +polars-sql = { version = "0.31.1", path = "./polars-sql", default-features = false, optional = true } +polars-time = { version = "0.31.1", path = "./polars-time", default-features = false, optional = true } # enable js feature for getrandom to work in wasm [target.'cfg(target_family = "wasm")'.dependencies.getrandom] diff --git a/polars/polars-algo/Cargo.toml b/polars/polars-algo/Cargo.toml index 8f3d38b2659e..f3b2dc979ccf 100644 --- a/polars/polars-algo/Cargo.toml +++ b/polars/polars-algo/Cargo.toml @@ -9,9 +9,9 @@ description = "Algorithms built upon Polars primitives" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -polars-core = { version = "0.30.0", path = "../polars-core", features = ["dtype-categorical", "asof_join"], default-features = false } -polars-lazy = { version = "0.30.0", path = "../polars-lazy", features = ["asof_join", "concat_str", "strings"] } -polars-ops = { version = "0.30.0", path = "../polars-ops", features = ["dtype-categorical", "asof_join"], default-features = false } +polars-core = { version = "0.31.1", path = "../polars-core", features = ["dtype-categorical", "asof_join"], 
default-features = false } +polars-lazy = { version = "0.31.1", path = "../polars-lazy", features = ["asof_join", "concat_str", "strings"] } +polars-ops = { version = "0.31.1", path = "../polars-ops", features = ["dtype-categorical", "asof_join"], default-features = false } [package.metadata.docs.rs] all-features = true diff --git a/polars/polars-arrow/Cargo.toml b/polars/polars-arrow/Cargo.toml index ad7f29ad3334..ef42608fe1cc 100644 --- a/polars/polars-arrow/Cargo.toml +++ b/polars/polars-arrow/Cargo.toml @@ -17,7 +17,7 @@ ethnum = { version = "1.3.2", optional = true } hashbrown.workspace = true multiversion.workspace = true num-traits.workspace = true -polars-error = { version = "0.30.0", path = "../polars-error" } +polars-error = { version = "0.31.1", path = "../polars-error" } serde = { version = "1", features = ["derive"], optional = true } thiserror.workspace = true diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index a6ccda4ace62..f09c883b08c8 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -162,10 +162,10 @@ ndarray = { version = "0.15", optional = true, default_features = false } num-traits.workspace = true object_store = { version = "0.6.0", default-features = false, optional = true } once_cell.workspace = true -polars-arrow = { version = "0.30.0", path = "../polars-arrow", features = ["compute"] } -polars-error = { version = "0.30.0", path = "../polars-error" } -polars-row = { version = "0.30.0", path = "../polars-row" } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", features = ["compute"] } +polars-error = { version = "0.31.1", path = "../polars-error" } +polars-row = { version = "0.31.1", path = "../polars-row" } +polars-utils = { version = "0.31.1", path = "../polars-utils" } rand = { version = "0.8", optional = true, features = ["small_rng", "std"] } rand_distr = { version = "0.4", optional = true } rayon.workspace = true diff --git a/polars/polars-error/Cargo.toml b/polars/polars-error/Cargo.toml index a5ded6c1bfe3..1f4fb3a9a393 100644 --- a/polars/polars-error/Cargo.toml +++ b/polars/polars-error/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars-error" -version = "0.30.0" +version.workspace = true edition = "2021" license = "MIT" repository = "https://github.com/pola-rs/polars" diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index da041b52edf0..4c04dcd1d23c 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -67,12 +67,12 @@ memmap = { package = "memmap2", version = "0.5.2", optional = true } num-traits.workspace = true object_store = { version = "0.6.0", default-features = false, optional = true } once_cell = "1" -polars-arrow = { version = "0.30.0", path = "../polars-arrow" } -polars-core = { version = "0.30.0", path = "../polars-core", features = [], default-features = false } -polars-error = { version = "0.30.0", path = "../polars-error", default-features = false } -polars-json = { version = "0.30.0", optional = true, path = "../polars-json" } -polars-time = { version = "0.30.0", path = "../polars-time", features = [], default-features = false, optional = true } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow" } +polars-core = { version = "0.31.1", path = "../polars-core", features = [], default-features = false } +polars-error = { version = "0.31.1", path = "../polars-error", default-features = 
false } +polars-json = { version = "0.31.1", optional = true, path = "../polars-json" } +polars-time = { version = "0.31.1", path = "../polars-time", features = [], default-features = false, optional = true } +polars-utils = { version = "0.31.1", path = "../polars-utils" } rayon.workspace = true regex = "1.6" serde = { version = "1", features = ["derive"], optional = true } diff --git a/polars/polars-json/Cargo.toml b/polars/polars-json/Cargo.toml index 1a365e579e3b..bc3fef2e38fa 100644 --- a/polars/polars-json/Cargo.toml +++ b/polars/polars-json/Cargo.toml @@ -16,7 +16,7 @@ fallible-streaming-iterator = "0.1" hashbrown.workspace = true indexmap.workspace = true num-traits.workspace = true -polars-arrow = { version = "0.30.0", path = "../polars-arrow", default-features = false } -polars-error = { version = "0.30.0", path = "../polars-error" } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", default-features = false } +polars-error = { version = "0.31.1", path = "../polars-error" } +polars-utils = { version = "0.31.1", path = "../polars-utils" } simd-json = { version = "0.10", features = ["allow-non-simd", "known-key"] } diff --git a/polars/polars-lazy/Cargo.toml b/polars/polars-lazy/Cargo.toml index ff3a9048b524..e43dba048ce8 100644 --- a/polars/polars-lazy/Cargo.toml +++ b/polars/polars-lazy/Cargo.toml @@ -17,15 +17,15 @@ ahash.workspace = true bitflags.workspace = true glob = "0.3" once_cell = "1" -polars-arrow = { version = "0.30.0", path = "../polars-arrow" } -polars-core = { version = "0.30.0", path = "../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } -polars-io = { version = "0.30.0", path = "../polars-io", features = ["lazy", "csv"], default-features = false } -polars-json = { version = "0.30.0", path = "../polars-json", optional = true } -polars-ops = { version = "0.30.0", path = "../polars-ops", default-features = false } -polars-pipe = { version = "0.30.0", path = "./polars-pipe", optional = true } -polars-plan = { version = "0.30.0", path = "./polars-plan" } -polars-time = { version = "0.30.0", path = "../polars-time", optional = true } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow" } +polars-core = { version = "0.31.1", path = "../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } +polars-io = { version = "0.31.1", path = "../polars-io", features = ["lazy", "csv"], default-features = false } +polars-json = { version = "0.31.1", path = "../polars-json", optional = true } +polars-ops = { version = "0.31.1", path = "../polars-ops", default-features = false } +polars-pipe = { version = "0.31.1", path = "./polars-pipe", optional = true } +polars-plan = { version = "0.31.1", path = "./polars-plan" } +polars-time = { version = "0.31.1", path = "../polars-time", optional = true } +polars-utils = { version = "0.31.1", path = "../polars-utils" } pyo3 = { version = "0.19", optional = true } rayon.workspace = true smartstring.workspace = true diff --git a/polars/polars-lazy/polars-pipe/Cargo.toml b/polars/polars-lazy/polars-pipe/Cargo.toml index 608648f9940d..5ee405d57da8 100644 --- a/polars/polars-lazy/polars-pipe/Cargo.toml +++ b/polars/polars-lazy/polars-pipe/Cargo.toml @@ -14,13 +14,13 @@ crossbeam-queue = { version = "0.3", optional = true } enum_dispatch = "0.3" hashbrown.workspace = true num-traits.workspace = true -polars-arrow = { version = "0.30.0", 
path = "../../polars-arrow", default-features = false } -polars-core = { version = "0.30.0", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } -polars-io = { version = "0.30.0", path = "../../polars-io", default-features = false, features = ["ipc", "async"] } -polars-ops = { version = "0.30.0", path = "../../polars-ops", features = ["search_sorted"] } -polars-plan = { version = "0.30.0", path = "../polars-plan", default-features = false, features = ["compile"] } -polars-row = { version = "0.30.0", path = "../../polars-row" } -polars-utils = { version = "0.30.0", path = "../../polars-utils", features = ["sysinfo"] } +polars-arrow = { version = "0.31.1", path = "../../polars-arrow", default-features = false } +polars-core = { version = "0.31.1", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } +polars-io = { version = "0.31.1", path = "../../polars-io", default-features = false, features = ["ipc", "async"] } +polars-ops = { version = "0.31.1", path = "../../polars-ops", features = ["search_sorted"] } +polars-plan = { version = "0.31.1", path = "../polars-plan", default-features = false, features = ["compile"] } +polars-row = { version = "0.31.1", path = "../../polars-row" } +polars-utils = { version = "0.31.1", path = "../../polars-utils", features = ["sysinfo"] } rayon.workspace = true smartstring = { version = "1" } diff --git a/polars/polars-lazy/polars-plan/Cargo.toml b/polars/polars-lazy/polars-plan/Cargo.toml index 0ef24d05de08..090f7eded60a 100644 --- a/polars/polars-lazy/polars-plan/Cargo.toml +++ b/polars/polars-lazy/polars-plan/Cargo.toml @@ -16,12 +16,12 @@ chrono-tz = { version = "0.8", optional = true } ciborium = { version = "0.2", optional = true } futures = { version = "0.3.25", optional = true } once_cell.workspace = true -polars-arrow = { version = "0.30.0", path = "../../polars-arrow" } -polars-core = { version = "0.30.0", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } -polars-io = { version = "0.30.0", path = "../../polars-io", features = ["lazy", "csv"], default-features = false } -polars-ops = { version = "0.30.0", path = "../../polars-ops", default-features = false } -polars-time = { version = "0.30.0", path = "../../polars-time", optional = true } -polars-utils = { version = "0.30.0", path = "../../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../../polars-arrow" } +polars-core = { version = "0.31.1", path = "../../polars-core", features = ["lazy", "zip_with", "random"], default-features = false } +polars-io = { version = "0.31.1", path = "../../polars-io", features = ["lazy", "csv"], default-features = false } +polars-ops = { version = "0.31.1", path = "../../polars-ops", default-features = false } +polars-time = { version = "0.31.1", path = "../../polars-time", optional = true } +polars-utils = { version = "0.31.1", path = "../../polars-utils" } pyo3 = { version = "0.19", optional = true } rayon.workspace = true regex = { version = "1.6", optional = true } diff --git a/polars/polars-ops/Cargo.toml b/polars/polars-ops/Cargo.toml index dfd0ce03a412..dae80e271e0c 100644 --- a/polars/polars-ops/Cargo.toml +++ b/polars/polars-ops/Cargo.toml @@ -18,10 +18,10 @@ hex = { version = "0.4", optional = true } indexmap.workspace = true jsonpath_lib = { version = "0.3.0", optional = true, git = "https://github.com/ritchie46/jsonpath", branch = "improve_compiled" } memchr.workspace = true -polars-arrow = { version = 
"0.30.0", path = "../polars-arrow", default-features = false } -polars-core = { version = "0.30.0", path = "../polars-core", features = [], default-features = false } -polars-json = { version = "0.30.0", optional = true, path = "../polars-json", default-features = false } -polars-utils = { version = "0.30.0", path = "../polars-utils", default-features = false } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", default-features = false } +polars-core = { version = "0.31.1", path = "../polars-core", features = [], default-features = false } +polars-json = { version = "0.31.1", optional = true, path = "../polars-json", default-features = false } +polars-utils = { version = "0.31.1", path = "../polars-utils", default-features = false } serde = { version = "1", features = ["derive"], optional = true } serde_json = { version = "1", optional = true } smartstring.workspace = true diff --git a/polars/polars-ops/src/chunked_array/array/min_max.rs b/polars/polars-ops/src/chunked_array/array/min_max.rs index 22744120f7bb..ac4c85ced9ed 100644 --- a/polars/polars-ops/src/chunked_array/array/min_max.rs +++ b/polars/polars-ops/src/chunked_array/array/min_max.rs @@ -30,7 +30,7 @@ where (0..values.len()) .step_by(width) .map(|start| { - let sliced = values.clone().sliced_unchecked(start, start + width); + let sliced = unsafe { values.clone().sliced_unchecked(start, start + width) }; arr_agg(sliced) }) .collect() diff --git a/polars/polars-ops/src/chunked_array/list/any_all.rs b/polars/polars-ops/src/chunked_array/list/any_all.rs index 689955056865..550863cce5a2 100644 --- a/polars/polars-ops/src/chunked_array/list/any_all.rs +++ b/polars/polars-ops/src/chunked_array/list/any_all.rs @@ -30,7 +30,7 @@ where // TODO! // we can speed this upp if the boolean array doesn't have nulls // Then we can work directly on the byte slice. 
- let val = values.clone().sliced_unchecked(start, len); + let val = unsafe { values.clone().sliced_unchecked(start, len) }; start = end; op(&val) }); diff --git a/polars/polars-row/Cargo.toml b/polars/polars-row/Cargo.toml index d3120a649b64..5ccc9a647763 100644 --- a/polars/polars-row/Cargo.toml +++ b/polars/polars-row/Cargo.toml @@ -10,5 +10,5 @@ description = "Row encodings for the Polars DataFrame library" [dependencies] arrow.workspace = true -polars-error = { version = "0.30.0", path = "../polars-error" } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-error = { version = "0.31.1", path = "../polars-error" } +polars-utils = { version = "0.31.1", path = "../polars-utils" } diff --git a/polars/polars-sql/Cargo.toml b/polars/polars-sql/Cargo.toml index 30586f61797a..db7871220d6b 100644 --- a/polars/polars-sql/Cargo.toml +++ b/polars/polars-sql/Cargo.toml @@ -15,10 +15,10 @@ ipc = ["polars-lazy/ipc"] parquet = ["polars-lazy/parquet"] [dependencies] -polars-arrow = { version = "0.30.0", path = "../polars-arrow", features = ["like"] } -polars-core = { version = "0.30.0", path = "../polars-core", features = [] } -polars-lazy = { version = "0.30.0", path = "../polars-lazy", features = ["compile", "strings", "cross_join", "trigonometry", "abs", "round_series", "log", "regex", "is_in", "meta", "cum_agg"] } -polars-plan = { version = "0.30.0", path = "../polars-lazy/polars-plan", features = ["compile"] } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", features = ["like"] } +polars-core = { version = "0.31.1", path = "../polars-core", features = [] } +polars-lazy = { version = "0.31.1", path = "../polars-lazy", features = ["compile", "strings", "cross_join", "trigonometry", "abs", "round_series", "log", "regex", "is_in", "meta", "cum_agg"] } +polars-plan = { version = "0.31.1", path = "../polars-lazy/polars-plan", features = ["compile"] } serde = "1" serde_json = { version = "1" } # sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = "ae3b5844c839072c235965fe0d1bddc473dced87" } diff --git a/polars/polars-time/Cargo.toml b/polars/polars-time/Cargo.toml index 4eb725ff7150..e7e0ff20adee 100644 --- a/polars/polars-time/Cargo.toml +++ b/polars/polars-time/Cargo.toml @@ -15,10 +15,10 @@ chrono = { version = "0.4", default-features = false, features = ["std"] } chrono-tz = { version = "0.8", optional = true } now = "0.1" once_cell.workspace = true -polars-arrow = { version = "0.30.0", path = "../polars-arrow", features = ["compute", "temporal"] } -polars-core = { version = "0.30.0", path = "../polars-core", default-features = false, features = ["dtype-datetime", "dtype-duration", "dtype-time", "dtype-date"] } -polars-ops = { version = "0.30.0", path = "../polars-ops" } -polars-utils = { version = "0.30.0", path = "../polars-utils" } +polars-arrow = { version = "0.31.1", path = "../polars-arrow", features = ["compute", "temporal"] } +polars-core = { version = "0.31.1", path = "../polars-core", default-features = false, features = ["dtype-datetime", "dtype-duration", "dtype-time", "dtype-date"] } +polars-ops = { version = "0.31.1", path = "../polars-ops" } +polars-utils = { version = "0.31.1", path = "../polars-utils" } regex = "1.7.1" serde = { version = "1", features = ["derive"], optional = true } smartstring.workspace = true From c35b9817a8230f50694b1cb5290ffd379e4777f0 Mon Sep 17 00:00:00 2001 From: Thomas Aarholt Date: Sat, 15 Jul 2023 16:41:17 +0200 Subject: [PATCH 23/37] docs(python): Mention func_horizontal on deprecated func 
docstrings (#9863) Co-authored-by: Stijn de Gooijer --- .../polars/functions/aggregation/vertical.py | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index 1f0e3adfbee1..fa345cbeb43c 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -40,7 +40,7 @@ def all( Otherwise, this function computes the bitwise AND horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.all_horizontal`` instead. Parameters ---------- @@ -50,6 +50,10 @@ def all( *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + See Also + -------- + all_horizontal + Examples -------- Selecting all columns. @@ -126,7 +130,11 @@ def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | b Otherwise, this function computes the bitwise OR horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.any_horizontal`` instead. + + See Also + -------- + any_horizontal Parameters ---------- @@ -195,7 +203,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A Otherwise, this function computes the maximum value horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.max_horizontal`` instead. Parameters ---------- @@ -205,6 +213,10 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + See Also + -------- + max_horizontal + Examples -------- Get the maximum value of a column by passing a single column name. @@ -291,7 +303,7 @@ def min( Otherwise, this function computes the minimum value horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.min_horizontal`` instead. Parameters ---------- @@ -301,6 +313,10 @@ def min( *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + See Also + -------- + min_horizontal + Examples -------- Get the minimum value of a column by passing a single column name. @@ -387,7 +403,7 @@ def sum( **This functionality is deprecated**. Otherwise, this function computes the sum horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.sum_horizontal`` instead. Parameters ---------- @@ -397,6 +413,10 @@ def sum( *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + See Also + -------- + sum_horizontal + Examples -------- Sum a column by name: @@ -485,7 +505,7 @@ def cumsum( Otherwise, this function computes the cumulative sum horizontally across multiple columns. - **This functionality is deprecated**. + **This functionality is deprecated**, use ``pl.cumsum_horizontal`` instead. Parameters ---------- @@ -495,6 +515,10 @@ def cumsum( *more_exprs Additional columns to use in the aggregation, specified as positional arguments. 
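The migration these docstring notes point to is mechanical. A minimal sketch of the deprecated horizontal use next to the dedicated `*_horizontal` replacement (the example frame and column names are illustrative, not taken from this patch):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2], "b": [10, 20]})

    # Deprecated: passing multiple columns to pl.sum() computed a row-wise
    # (horizontal) sum and now emits a DeprecationWarning.
    # Preferred: the dedicated horizontal function.
    out = df.select(pl.sum_horizontal("a", "b").alias("row_sum"))
    # row_sum: [11, 22]

The same substitution applies to `all`, `any`, `min`, `max` and `cumsum`.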
+ See Also + -------- + cumsum_horizontal + Examples -------- >>> df = pl.DataFrame( From 2f95f84f4ed0c50eb57938d98f2b9be0ee0d9cfc Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 15 Jul 2023 17:46:14 +0200 Subject: [PATCH 24/37] chore(python): Workaround for PyCharm deprecation warning (#9907) --- .../polars/functions/aggregation/vertical.py | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index fa345cbeb43c..5d210e9fb1e8 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -100,11 +100,7 @@ def all( elif isinstance(exprs, str): return F.col(exprs).all() - warnings.warn( - "using `all` for horizontal computation is deprecated. Use `all_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) + _warn_for_deprecated_horizontal_use("all") return F.all_horizontal(exprs, *more_exprs) @@ -174,11 +170,7 @@ def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | b elif isinstance(exprs, str): return F.col(exprs).any() - warnings.warn( - "using `any` for horizontal computation is deprecated. Use `any_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) + _warn_for_deprecated_horizontal_use("any") return F.any_horizontal(exprs, *more_exprs) @@ -272,11 +264,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A elif isinstance(exprs, str): return F.col(exprs).max() - warnings.warn( - "using `max` for horizontal computation is deprecated. Use `max_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) + _warn_for_deprecated_horizontal_use("max") return F.max_horizontal(exprs, *more_exprs) @@ -372,11 +360,7 @@ def min( elif isinstance(exprs, str): return F.col(exprs).min() - warnings.warn( - "using `min` for horizontal computation is deprecated. Use `min_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) + _warn_for_deprecated_horizontal_use("min") return F.min_horizontal(exprs, *more_exprs) @@ -473,11 +457,7 @@ def sum( elif isinstance(exprs, str): return F.col(exprs).sum() - warnings.warn( - "using `sum` for horizontal computation is deprecated. Use `sum_horizontal` instead.", - DeprecationWarning, - stacklevel=find_stacklevel(), - ) + _warn_for_deprecated_horizontal_use("sum") return F.sum_horizontal(exprs, *more_exprs) @@ -551,9 +531,13 @@ def cumsum( elif isinstance(exprs, str): return F.col(exprs).cumsum() + _warn_for_deprecated_horizontal_use("cumsum") + return F.cumsum_horizontal(exprs, *more_exprs) + + +def _warn_for_deprecated_horizontal_use(name: str) -> None: warnings.warn( - "using `cumsum` for horizontal computation is deprecated. Use `cumsum_horizontal` instead.", + f"using `{name}` for horizontal computation is deprecated. 
Use `{name}_horizontal` instead.", DeprecationWarning, stacklevel=find_stacklevel(), ) - return F.cumsum_horizontal(exprs, *more_exprs) From d6b8fb5cf894b3fd9d3b9c26194a63d359e96b63 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 15 Jul 2023 17:47:12 +0200 Subject: [PATCH 25/37] chore(python): Update autolabeler (#9885) --- .github/release-drafter.yml | 40 +++++++++++++------------------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index fea9adc2ad04..3a1ae627cd7e 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,8 +1,10 @@ categories: - title: 🏆 Highlights labels: highlight - - title: ⚠️ Breaking changes + - title: 💥 Breaking changes labels: breaking + - title: ⚠️ Deprecations + labels: deprecation - title: 🚀 Performance improvements labels: performance - title: ✨ Enhancements @@ -12,13 +14,8 @@ categories: - title: 🛠️ Other improvements labels: - build - - chore - - ci - documentation - - refactor - - revert - - style - - test + - internal exclude-labels: - skip-changelog @@ -28,7 +25,7 @@ change-template: '- $TITLE (#$NUMBER)' change-title-escapes: '\<*_&' replacers: # Remove conventional commits from titles - - search: '/- (build|chore|ci|docs|feat|fix|perf|refactor|release|revert|style|test)(\(.*\))?(\!)?\: /g' + - search: '/- (build|chore|depr|docs|feat|fix|perf|release)(\(.*\))?(\!)?\: /g' replace: '- ' version-resolver: @@ -39,22 +36,25 @@ version-resolver: autolabeler: - label: rust title: - - '/^(build|chore|ci|docs|feat|fix|perf|refactor|release|revert|style|test)\(.*rust.*\)/' + - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*rust.*\))?\!?\:) /' - label: python title: - - '/^(build|chore|ci|docs|feat|fix|perf|refactor|release|revert|style|test)\(.*python.*\)/' + - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*python.*\))?\!?\:) /' + - label: cli + title: + - '/^(build|chore|depr|docs|feat|fix|perf|release)\(.*cli.*\)\!?\:) /' # CLI tag not in global scope - label: breaking title: - - '/^(build|chore|ci|docs|feat|fix|perf|refactor|release|revert|style|test)(\(.*\))?\!\: /' + - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*\))?\!\: /' - label: build title: - '/^build/' - - label: chore + - label: internal title: - '/^chore/' - - label: ci + - label: deprecation title: - - '/^ci/' + - '/^depr/' - label: documentation title: - '/^docs/' @@ -67,21 +67,9 @@ autolabeler: - label: performance title: - '/^perf/' - - label: refactor - title: - - '/^refactor/' - label: release title: - '/^release/' - - label: revert - title: - - '/^revert/' - - label: style - title: - - '/^style/' - - label: test - title: - - '/^test/' template: | $CHANGES From 66ce209dafb402f7f61c59db97827571c8dc39c2 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 15 Jul 2023 18:00:35 +0200 Subject: [PATCH 26/37] chore: Fix autolabeler regex (#9909) --- .github/release-drafter.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 3a1ae627cd7e..d7fb09be71da 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -36,13 +36,13 @@ version-resolver: autolabeler: - label: rust title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*rust.*\))?\!?\:) /' + - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*rust.*\))?\!?\: /' - label: python title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*python.*\))?\!?\:) /' + - 
'/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*python.*\))?\!?\: /' - label: cli title: - - '/^(build|chore|depr|docs|feat|fix|perf|release)\(.*cli.*\)\!?\:) /' # CLI tag not in global scope + - '/^(build|chore|depr|docs|feat|fix|perf|release)\(.*cli.*\)\!?\: /' # CLI tag not in global scope - label: breaking title: - '/^(build|chore|depr|docs|feat|fix|perf|release)(\(.*\))?\!\: /' From fd871ebcab4caed85ec63c9276b23d69d92b243b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 15 Jul 2023 18:27:00 +0100 Subject: [PATCH 27/37] docs(python): add big warnings about using apply (#9906) --- py-polars/polars/dataframe/frame.py | 4 ++++ py-polars/polars/dataframe/groupby.py | 4 ++++ py-polars/polars/expr/expr.py | 4 ++++ py-polars/polars/functions/lazy.py | 4 ++++ py-polars/polars/lazyframe/groupby.py | 4 ++++ py-polars/polars/series/series.py | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index f3f1a71b8f0d..6ce26c79211f 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5682,6 +5682,10 @@ def apply( """ Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + The UDF will receive each row as a tuple of values: ``udf(row)``. Implementing logic using a Python function is almost always _significantly_ diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/groupby.py index cad4e0a15b60..573938c59a08 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/groupby.py @@ -252,6 +252,10 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: """ Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + Implementing logic using a Python function is almost always _significantly_ slower and more memory intensive than implementing the same logic using the native expression API because: diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index f1e7853c618d..27dd05178986 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3613,6 +3613,10 @@ def apply( """ Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + Depending on the context it has the following behavior: * Selection diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 97a2a9aefa7f..3503a3b0b791 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1164,6 +1164,10 @@ def apply( """ Apply a custom/user-defined function (UDF) in a GroupBy context. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. 
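The cost this warning describes is easy to see side by side. A rough sketch of a Python UDF versus the equivalent native expression (the data and the lambda are illustrative assumptions, not from the patch):

    import polars as pl

    df = pl.DataFrame({"x": [1, 2, 3]})

    # Slow: the UDF calls back into Python for every element.
    slow = df.select(pl.col("x").apply(lambda v: v * 2))

    # Fast: the same logic stays inside the native engine.
    fast = df.select(pl.col("x") * 2)

    assert slow.frame_equal(fast)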
+ Depending on the context it has the following behavior: * Select diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/groupby.py index fef864b8be3d..a75726cde9ec 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/groupby.py @@ -163,6 +163,10 @@ def apply( """ Apply a custom/user-defined function (UDF) over the groups as a new DataFrame. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + Using this is considered an anti-pattern. This will be very slow because: - it forces the engine to materialize the whole `DataFrames` for the groups. diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 3f7b1a3f5c38..a831c272b358 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4364,6 +4364,10 @@ def apply( """ Apply a custom/user-defined function (UDF) over elements in this Series. + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + If the function returns a different datatype, the return_dtype arg should be set, otherwise the method will fail. From ed7724ce1cb9302ffb1b7fdcea2aaf4eb9d5f765 Mon Sep 17 00:00:00 2001 From: J van Zundert Date: Sat, 15 Jul 2023 19:02:06 +0100 Subject: [PATCH 28/37] chore(python): Add various unit tests (#9903) --- py-polars/polars/config.py | 2 +- py-polars/polars/utils/show_versions.py | 2 +- py-polars/polars/utils/various.py | 6 --- py-polars/tests/unit/namespaces/test_array.py | 11 +++++ .../tests/unit/namespaces/test_struct.py | 2 + py-polars/tests/unit/test_api.py | 25 +++++++++++ py-polars/tests/unit/test_cfg.py | 43 ++++++++++++++++++- py-polars/tests/unit/test_show_graph.py | 15 +++++++ .../tests/unit/utils/test_parse_expr_input.py | 8 ++++ py-polars/tests/unit/utils/test_utils.py | 13 +++++- 10 files changed, 117 insertions(+), 10 deletions(-) create mode 100644 py-polars/tests/unit/test_show_graph.py diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index c16f95733a59..fc4e49f86929 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -10,7 +10,7 @@ # dummy func required (so docs build) -def _get_float_fmt() -> str: +def _get_float_fmt() -> str: # pragma: no cover return "n/a" diff --git a/py-polars/polars/utils/show_versions.py b/py-polars/polars/utils/show_versions.py index ce4b39ac3719..f34db533ec26 100644 --- a/py-polars/polars/utils/show_versions.py +++ b/py-polars/polars/utils/show_versions.py @@ -87,6 +87,6 @@ def _get_dependency_version(dep_name: str) -> str: if hasattr(module, "__version__"): module_version = module.__version__ else: - module_version = importlib.metadata.version(dep_name) + module_version = importlib.metadata.version(dep_name) # pragma: no cover return module_version diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index b5a50cfd9b49..69d3cb502fe6 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -19,7 +19,6 @@ Int64, Time, Utf8, - is_polars_dtype, unpack_dtypes, ) from polars.dependencies import _PYARROW_AVAILABLE @@ -72,11 +71,6 @@ def is_bool_sequence(val: object) -> TypeGuard[Sequence[bool]]: return isinstance(val, Sequence) and _is_iterable_of(val, bool) -def is_dtype_sequence(val: object) -> TypeGuard[Sequence[PolarsDataType]]: - """Check whether the given object is a sequence of polars DataTypes.""" - return 
isinstance(val, Sequence) and all(is_polars_dtype(x) for x in val) - - def is_int_sequence(val: object) -> TypeGuard[Sequence[int]]: """Check whether the given sequence is a sequence of integers.""" return isinstance(val, Sequence) and _is_iterable_of(val, int) diff --git a/py-polars/tests/unit/namespaces/test_array.py b/py-polars/tests/unit/namespaces/test_array.py index f12b76172a56..ac69510cd8ed 100644 --- a/py-polars/tests/unit/namespaces/test_array.py +++ b/py-polars/tests/unit/namespaces/test_array.py @@ -1,6 +1,7 @@ import numpy as np import polars as pl +from polars.testing import assert_frame_equal def test_arr_min_max() -> None: @@ -14,6 +15,16 @@ def test_arr_sum() -> None: assert s.arr.sum().to_list() == [3, 7] +def test_arr_unique() -> None: + df = pl.DataFrame( + {"a": pl.Series("a", [[1, 1], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64))} + ) + + out = df.select(pl.col("a").arr.unique(maintain_order=True)) + expected = pl.DataFrame({"a": [[1], [4, 3]]}) + assert_frame_equal(out, expected) + + def test_array_to_numpy() -> None: s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(width=2, inner=pl.Int64)) assert (s.to_numpy() == np.array([[1, 2], [3, 4], [5, 6]])).all() diff --git a/py-polars/tests/unit/namespaces/test_struct.py b/py-polars/tests/unit/namespaces/test_struct.py index b0ab63ea094d..db9dec236160 100644 --- a/py-polars/tests/unit/namespaces/test_struct.py +++ b/py-polars/tests/unit/namespaces/test_struct.py @@ -15,6 +15,8 @@ def test_struct_various() -> None: assert s[1] == {"int": 2, "str": "b", "bool": None, "list": [3]} assert s.struct.field("list").to_list() == [[1, 2], [3]] assert s.struct.field("int").to_list() == [1, 2] + assert s.struct["list"].to_list() == [[1, 2], [3]] + assert s.struct["int"].to_list() == [1, 2] assert_frame_equal(df.to_struct("my_struct").struct.unnest(), df) assert s.struct._ipython_key_completions_() == s.struct.fields diff --git a/py-polars/tests/unit/test_api.py b/py-polars/tests/unit/test_api.py index 94e130895652..206b7173b9b2 100644 --- a/py-polars/tests/unit/test_api.py +++ b/py-polars/tests/unit/test_api.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import polars as pl from polars.testing import assert_frame_equal @@ -137,3 +139,26 @@ def test_class_namespaces_are_registered() -> None: assert ( ns in namespaces ), f"{ns!r} should be registered in {pcls.__name__}._accessors" + + +def test_namespace_cannot_override_builtin() -> None: + with pytest.raises(AttributeError): + + @pl.api.register_dataframe_namespace("dt") + class CustomDt: + def __init__(self, df: pl.DataFrame): + self._df = df + + +def test_namespace_warning_on_override() -> None: + @pl.api.register_dataframe_namespace("math") + class CustomMath: + def __init__(self, df: pl.DataFrame): + self._df = df + + with pytest.raises(UserWarning): + + @pl.api.register_dataframe_namespace("math") + class CustomMath2: + def __init__(self, df: pl.DataFrame): + self._df = df diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py index e4b9512cc537..1027a739fd51 100644 --- a/py-polars/tests/unit/test_cfg.py +++ b/py-polars/tests/unit/test_cfg.py @@ -513,7 +513,7 @@ def test_string_cache() -> None: @pytest.mark.write_disk() def test_config_load_save(tmp_path: Path) -> None: - for file in (None, tmp_path / "polars.config"): + for file in (None, tmp_path / "polars.config", str(tmp_path / "polars.config")): # set some config options... 
pl.Config.set_tbl_cols(12) pl.Config.set_verbose(True) @@ -577,3 +577,44 @@ def test_config_scope() -> None: # expect scope-exit to restore original state assert pl.Config.state() == initial_state + + +def test_config_raise_error_if_not_exist() -> None: + with pytest.raises(AttributeError), pl.Config(i_do_not_exist=True): + pass + + +def test_config_state_env_only() -> None: + pl.Config.set_verbose(False) + pl.Config.set_fmt_float("full") + + state_all = pl.Config.state(env_only=False) + state_env_only = pl.Config.state(env_only=True) + assert len(state_env_only) < len(state_all) + assert "set_fmt_float" in state_all + assert "set_fmt_float" not in state_env_only + + +def test_activate_decimals() -> None: + with pl.Config() as cfg: + cfg.activate_decimals(True) + assert os.environ.get("POLARS_ACTIVATE_DECIMAL") == "1" + cfg.activate_decimals(False) + assert "POLARS_ACTIVATE_DECIMAL" not in os.environ + + +def test_set_streaming_chunk_size() -> None: + with pl.Config() as cfg: + cfg.set_streaming_chunk_size(8) + assert os.environ.get("POLARS_STREAMING_CHUNK_SIZE") == "8" + + with pytest.raises(ValueError), pl.Config() as cfg: + cfg.set_streaming_chunk_size(0) + + +def test_set_fmt_str_lengths_invalid_length() -> None: + with pl.Config() as cfg: + with pytest.raises(ValueError): + cfg.set_fmt_str_lengths(0) + with pytest.raises(ValueError): + cfg.set_fmt_str_lengths(-2) diff --git a/py-polars/tests/unit/test_show_graph.py b/py-polars/tests/unit/test_show_graph.py new file mode 100644 index 000000000000..09a9b9484933 --- /dev/null +++ b/py-polars/tests/unit/test_show_graph.py @@ -0,0 +1,15 @@ +import polars as pl + + +def test_show_graph() -> None: + # only test raw output, otherwise we need graphviz and matplotlib + ldf = pl.LazyFrame( + { + "a": ["a", "b", "a", "b", "b", "c"], + "b": [1, 2, 3, 4, 5, 6], + "c": [6, 5, 4, 3, 2, 1], + } + ) + query = ldf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort("a") + out = query.show_graph(raw_output=True) + assert isinstance(out, str) diff --git a/py-polars/tests/unit/utils/test_parse_expr_input.py b/py-polars/tests/unit/utils/test_parse_expr_input.py index 4e039382c1e7..b1eae283b31a 100644 --- a/py-polars/tests/unit/utils/test_parse_expr_input.py +++ b/py-polars/tests/unit/utils/test_parse_expr_input.py @@ -92,3 +92,11 @@ def test_parse_as_expression_structify() -> None: result = wrap_expr(parse_as_expression(pl.col("a", "b"), structify=True)) expected = pl.struct("a", "b") assert_expr_equal(result, expected) + + +def test_parse_as_expression_structify_multiple_outputs() -> None: + # note: this only works because assert_expr_equal evaluates on a dataframe with + # columns "a" and "b" + result = wrap_expr(parse_as_expression(pl.col("*"), structify=True)) + expected = pl.struct("a", "b") + assert_expr_equal(result, expected) diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py index 388b6c2f5855..7119dab5cfed 100644 --- a/py-polars/tests/unit/utils/test_utils.py +++ b/py-polars/tests/unit/utils/test_utils.py @@ -16,7 +16,8 @@ _timedelta_to_pl_timedelta, ) from polars.utils.decorators import deprecate_nonkeyword_arguments, redirect -from polars.utils.various import parse_version +from polars.utils.meta import get_idx_type +from polars.utils.various import _in_notebook, parse_version if TYPE_CHECKING: from polars.type_aliases import TimeUnit @@ -158,3 +159,13 @@ def bar(self, upper: bool = False) -> str: return "BAZ" if upper else "baz" assert DemoClass2().foo() == "BAZ" # type: ignore[attr-defined] 
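Several of the new configuration tests above rely on `pl.Config` doubling as a context manager that restores the prior state on exit; the pattern looks roughly like this (option values are illustrative):

    import polars as pl

    with pl.Config() as cfg:
        cfg.set_tbl_cols(12)
        cfg.set_verbose(True)
        # scoped: both options apply only inside this block

    # here the previous configuration has been restored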
+ + +def test_get_idx_type_deprecation() -> None: + with pytest.deprecated_call(): + get_idx_type() + + +def test_in_notebook() -> None: + # private function, but easier to test this separately and mock it in the callers + assert not _in_notebook() From b016b9ca8e85c531f57b87a8bb94a439f26b265d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 15 Jul 2023 20:03:24 +0200 Subject: [PATCH 29/37] chore(rust): Minor makeover for Rust Makefile (#9874) --- .github/workflows/lint-rust.yml | 10 ++-- polars/Makefile | 85 +++++++++++++++++++-------------- py-polars/Makefile | 2 +- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/.github/workflows/lint-rust.yml b/.github/workflows/lint-rust.yml index 61cedbc4472e..bc7c18399145 100644 --- a/.github/workflows/lint-rust.yml +++ b/.github/workflows/lint-rust.yml @@ -90,9 +90,9 @@ jobs: env: MIRIFLAGS: -Zmiri-disable-isolation -Zmiri-ignore-leaks -Zmiri-disable-stacked-borrows POLARS_ALLOW_EXTENSION: '1' - run: | - cargo miri test \ - --no-default-features \ - --features object \ - -p polars-core \ + run: > + cargo miri test + --no-default-features + --features object + -p polars-core -p polars-arrow diff --git a/polars/Makefile b/polars/Makefile index cdbc267f754e..8cdfaff4d96c 100644 --- a/polars/Makefile +++ b/polars/Makefile @@ -1,26 +1,46 @@ .DEFAULT_GOAL := help +SHELL=/bin/bash BASE ?= main -.PHONY: fmt check check-features clippy clippy-default test test-doc integration-tests - -fmt: +.PHONY: fmt +fmt: ## Run rustfmt and dprint cargo fmt --all dprint fmt -generate_test_files: - cargo run -p polars-cli "select * from read_csv('../examples/datasets/foods1.csv')" -o parquet > ../examples/datasets/foods1.parquet - cargo run -p polars-cli "select * from read_csv('../examples/datasets/foods1.csv')" -o arrow > ../examples/datasets/foods1.ipc - -check: +.PHONY: check +check: ## Run cargo check with all features cargo check --all-targets --all-features -clippy: +.PHONY: clippy +clippy: ## Run clippy with all features cargo clippy --all-targets --all-features -clippy-default: - cargo clippy +.PHONY: clippy-default +clippy-default: ## Run clippy with default features + cargo clippy --all-targets + +.PHONY: pre-commit +pre-commit: fmt clippy clippy-default ## Run autoformatting and linting + +.PHONY: check-features +check-features: ## Run cargo check for feature flag combinations (warning: slow) + cargo hack check --each-feature --no-dev-deps + +.PHONY: miri +miri: ## Run miri + # not tested on all features because miri does not support SIMD + # some tests are also filtered, because miri cannot deal with the rayon threadpool + # we ignore leaks because the thread pool of rayon is never killed. + MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-ignore-leaks -Zmiri-disable-stacked-borrows" \ + POLARS_ALLOW_EXTENSION=1 \ + cargo miri test \ + --no-default-features \ + --features object \ + -p polars-core \ + -p polars-arrow +.PHONY: test test: ## Run tests cargo test --all-features \ -p polars-lazy \ @@ -34,22 +54,12 @@ test: ## Run tests -- \ --test-threads=2 -integration-tests: - cargo test --all-features --test it -- --test-threads=2 - -miri: - # not tested on all features because miri does not support SIMD - # some tests are also filtered, because miri cannot deal with the rayon threadpool - # we ignore leaks because the thread pool of rayon is never killed. 
- MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-ignore-leaks -Zmiri-disable-stacked-borrows" \ - POLARS_ALLOW_EXTENSION=1 \ - cargo miri test \ - --no-default-features \ - --features object \ - -p polars-core \ - -p polars-arrow +.PHONY: integration-tests +integration-tests: ## Run integration tests + cargo test --all-features --test it -test-doc: +.PHONY: test-doc +test-doc: ## Run doc examples cargo test --doc \ -p polars-lazy \ -p polars-io \ @@ -57,19 +67,21 @@ test-doc: -p polars-arrow \ -p polars-sql -pre-commit: fmt clippy clippy-default ## Run autoformatting and linting - - -check-features: - cargo hack check --each-feature --no-dev-deps +.PHONY: generate_test_files +generate_test_files: ## Generate some datasets + cargo run -p polars-cli "select * from read_csv('../examples/datasets/foods1.csv')" -o parquet > ../examples/datasets/foods1.parquet + cargo run -p polars-cli "select * from read_csv('../examples/datasets/foods1.csv')" -o arrow > ../examples/datasets/foods1.ipc -bench-save: +.PHONY: bench-save +bench-save: ## Run benchmark and save cargo bench --features=random --bench $(BENCH) -- --save-baseline $(SAVE) -bench-cmp: +.PHONY: bench-cmp +bench-cmp: ## Run benchmark and compare cargo bench --features=random --bench $(BENCH) -- --load-baseline $(FEAT) --baseline $(BASE) -doctest: +.PHONY: doctest +doctest: ## Check that documentation builds cargo doc --all-features -p polars-arrow cargo doc --all-features -p polars-utils cargo doc --features=docs-selection -p polars-core @@ -80,7 +92,8 @@ doctest: cargo doc --features=docs-selection -p polars cargo doc --all-features -p polars-sql -publish: +.PHONY: publish +publish: ## Publish Polars crates cargo publish --allow-dirty -p polars-error cargo publish --allow-dirty -p polars-utils cargo publish --allow-dirty -p polars-row @@ -100,5 +113,5 @@ publish: .PHONY: help help: ## Display this help screen - @echo -e "\033[1mAvailable commands:\033[0m\n" + @echo -e "\033[1mAvailable commands:\033[0m" @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-18s\033[0m %s\n", $$1, $$2}' | sort diff --git a/py-polars/Makefile b/py-polars/Makefile index ec717c3b0d09..fc636fcbcd2e 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -82,5 +82,5 @@ clean: ## Clean up caches and build artifacts .PHONY: help help: ## Display this help screen - @echo -e "\033[1mAvailable commands:\033[0m\n" + @echo -e "\033[1mAvailable commands:\033[0m" @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-18s\033[0m %s\n", $$1, $$2}' | sort From e51326f89b6a2689ba173469107afbe76ccd8c3d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 20:16:18 +0200 Subject: [PATCH 30/37] chore(rust): fix docs build and add to CI (#9904) --- .github/workflows/docs-rust.yml | 3 +- polars/polars-core/Cargo.toml | 1 + polars/polars-core/src/datatypes/_serde.rs | 4 +-- .../src/series/implementations/decimal.rs | 1 + polars/polars-core/src/series/ops/moment.rs | 4 +-- polars/polars-io/Cargo.toml | 11 +++++- .../polars-plan/src/dsl/functions/temporal.rs | 4 +-- .../polars-lazy/polars-plan/src/dsl/list.rs | 2 +- polars/polars-lazy/polars-plan/src/dsl/mod.rs | 2 +- py-polars/Cargo.lock | 35 ++++++++++--------- 10 files changed, 40 insertions(+), 27 deletions(-) diff --git a/.github/workflows/docs-rust.yml b/.github/workflows/docs-rust.yml index b20a479f1650..0439b307b1e0 100644 --- a/.github/workflows/docs-rust.yml +++ b/.github/workflows/docs-rust.yml @@ -32,7 +32,8 @@ 
jobs: - name: Build Rust documentation env: RUSTDOCFLAGS: --cfg docsrs -D warnings - run: cargo doc --features=docs-selection --package polars + working-directory: polars + run: make doctest - name: Create redirect to Polars crate and set no-jekyll if: ${{ github.ref_name == 'main' }} diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index f09c883b08c8..695ea1fc613e 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -128,6 +128,7 @@ docs-selection = [ "diff", "moment", "dtype-categorical", + "dtype-decimal", "rank", "diagonal_concat", "horizontal_concat", diff --git a/polars/polars-core/src/datatypes/_serde.rs b/polars/polars-core/src/datatypes/_serde.rs index 7ae368fdc30a..4277432ae5f3 100644 --- a/polars/polars-core/src/datatypes/_serde.rs +++ b/polars/polars-core/src/datatypes/_serde.rs @@ -1,8 +1,8 @@ //! Having `Object<&;static> in [`DataType`] make serde tag the `Deserialize` trait bound 'static //! even though we skip serializing `Object`. //! -//! We could use https://github.com/serde-rs/serde/issues/1712, but that gave problems caused by -//! https://github.com/rust-lang/rust/issues/96956, so we make a dummy type without static +//! We could use [serde_1712](https://github.com/serde-rs/serde/issues/1712), but that gave problems caused by +//! [rust_96956](https://github.com/rust-lang/rust/issues/96956), so we make a dummy type without static pub use arrow::datatypes::DataType as ArrowDataType; use serde::{Deserialize, Deserializer, Serialize, Serializer}; diff --git a/polars/polars-core/src/series/implementations/decimal.rs b/polars/polars-core/src/series/implementations/decimal.rs index fd7828165a64..2f054673d864 100644 --- a/polars/polars-core/src/series/implementations/decimal.rs +++ b/polars/polars-core/src/series/implementations/decimal.rs @@ -52,6 +52,7 @@ impl private::PrivateSeries for SeriesWrap { self.0.dtype() } + #[cfg(feature = "zip_with")] fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { Ok(self .0 diff --git a/polars/polars-core/src/series/ops/moment.rs b/polars/polars-core/src/series/ops/moment.rs index 50785ae709f5..32e68945e7fa 100644 --- a/polars/polars-core/src/series/ops/moment.rs +++ b/polars/polars-core/src/series/ops/moment.rs @@ -48,7 +48,7 @@ impl Series { /// function `skewtest` can be used to determine if the skewness value /// is close enough to zero, statistically speaking. 
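At the Python level the distinction documented here surfaces through the `bias` flag; a small sketch (the sample values are an assumption for illustration):

    import polars as pl

    s = pl.Series("x", [1.0, 2.0, 3.0, 4.0, 10.0])

    s.skew()            # biased moment estimator (the default)
    s.skew(bias=False)  # corrected for statistical bias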
 ///
-    /// see: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024
+    /// see: [scipy](https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024)
     pub fn skew(&self, bias: bool) -> PolarsResult<Option<f64>> {
         let mean = match self.mean() {
             Some(mean) => mean,
@@ -76,7 +76,7 @@ impl Series {
     /// If bias is `false` then the kurtosis is calculated using k statistics to
     /// eliminate bias coming from biased moment estimators
     ///
-    /// see: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1027
+    /// see: [scipy](https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1027)
     pub fn kurtosis(&self, fisher: bool, bias: bool) -> PolarsResult<Option<f64>> {
         let mean = match self.mean() {
             Some(mean) => mean,
diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml
index 4c04dcd1d23c..09209622a32d 100644
--- a/polars/polars-io/Cargo.toml
+++ b/polars/polars-io/Cargo.toml
@@ -11,7 +11,16 @@ description = "IO related logic for the Polars DataFrame library"

 [features]
 # support for arrow's json parsing
-json = ["arrow/io_json_write", "polars-json", "simd-json", "memmap", "lexical", "lexical-core", "serde_json"]
+json = [
+    "arrow/io_json_write",
+    "polars-json",
+    "simd-json",
+    "memmap",
+    "lexical",
+    "lexical-core",
+    "serde_json",
+    "dtype-struct",
+]
 # support for arrow's ipc file parsing
 ipc = ["arrow/io_ipc", "arrow/io_ipc_compression", "memmap"]
 # support for arrow's streaming ipc file parsing
diff --git a/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs b/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs
index c663c7e1bd62..53c556262b7a 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/functions/temporal.rs
@@ -10,7 +10,7 @@ macro_rules! impl_unit_setter {
     };
 }

-/// Arguments used by [`datetime`] in order to produce an `Expr` of `Datetime`
+/// Arguments used by `datetime` in order to produce an `Expr` of `Datetime`
 ///
 /// Construct a `DatetimeArgs` with `DatetimeArgs::new(y, m, d)`. This will set the other time units to `lit(0)`. You
 /// can then set the other fields with the `with_*` methods, or use `with_hms` to set `hour`, `minute`, and `second` all
@@ -175,7 +175,7 @@ pub fn datetime(args: DatetimeArgs) -> Expr {
         .alias("datetime")
 }

-/// Arguments used by [`duration`] in order to produce an `Expr` of `Duration`
+/// Arguments used by `duration` in order to produce an `Expr` of `Duration`
 ///
 /// To construct a `DurationArgs`, use struct literal syntax with `..Default::default()` to leave unspecified fields at
 /// their default value of `lit(0)`, as demonstrated below.
diff --git a/polars/polars-lazy/polars-plan/src/dsl/list.rs b/polars/polars-lazy/polars-plan/src/dsl/list.rs
index bc362e01aef7..5f61f6e9195e 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/list.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/list.rs
@@ -243,7 +243,7 @@ impl ListNameSpace {
     ///
     /// # Schema
     ///
-    /// A polars [`LazyFrame`] needs to know the schema at all time. The caller therefore must provide
+    /// A polars `LazyFrame` needs to know the schema at all times. The caller therefore must provide
     /// an `upper_bound` of struct fields that will be set.
     /// If this is incorrect, a downstream operation may fail. For instance an `all().sum()` expression
     /// will look in the current schema to determine which columns to select.
diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs index 3cf8f1cf29ae..e43b1efb3537 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs @@ -1529,7 +1529,7 @@ impl Expr { /// function `skewtest` can be used to determine if the skewness value /// is close enough to zero, statistically speaking. /// - /// see: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024 + /// see: [scipy](https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024) pub fn skew(self, bias: bool) -> Expr { self.apply( move |s| { diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 9e09aa089cdc..4be7e133bb57 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -98,8 +98,9 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.17.1" -source = "git+https://github.com/ritchie46/arrow2?branch=polars_2023-06-26#e71d66689f6ebde0e01f185bad0db8ef46f5fc8e" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e44f27e89e3edd8738a07c5e2c881efaa25e69be97a816d2df051685d460670c" dependencies = [ "ahash", "arrow-format", @@ -1429,7 +1430,7 @@ dependencies = [ [[package]] name = "polars" -version = "0.30.0" +version = "0.31.1" dependencies = [ "getrandom", "polars-core", @@ -1443,7 +1444,7 @@ dependencies = [ [[package]] name = "polars-algo" -version = "0.30.0" +version = "0.31.1" dependencies = [ "polars-core", "polars-lazy", @@ -1452,7 +1453,7 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.30.0" +version = "0.31.1" dependencies = [ "arrow2", "atoi", @@ -1469,7 +1470,7 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "arrow2", @@ -1502,7 +1503,7 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.30.0" +version = "0.31.1" dependencies = [ "arrow2", "regex", @@ -1511,7 +1512,7 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "arrow2", @@ -1546,7 +1547,7 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "arrow2", @@ -1562,7 +1563,7 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "bitflags", @@ -1584,7 +1585,7 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.30.0" +version = "0.31.1" dependencies = [ "argminmax", "arrow2", @@ -1605,7 +1606,7 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.30.0" +version = "0.31.1" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1625,7 +1626,7 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "arrow2", @@ -1649,7 +1650,7 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.30.0" +version = "0.31.1" dependencies = [ "arrow2", "polars-error", @@ -1658,7 +1659,7 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.30.0" +version = "0.31.1" dependencies = [ "polars-arrow", "polars-core", @@ -1671,7 +1672,7 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.30.0" +version = "0.31.1" dependencies = [ "arrow2", "atoi", @@ -1690,7 +1691,7 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.30.0" +version = "0.31.1" dependencies = [ "ahash", "hashbrown 
0.14.0", From cc0795f7b980da385268d77216a9ae36dccdfcf4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 15 Jul 2023 20:52:54 +0200 Subject: [PATCH 31/37] fix(python): don't SO on align_frames (#9911) --- py-polars/polars/functions/eager.py | 37 +++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/py-polars/polars/functions/eager.py b/py-polars/polars/functions/eager.py index f8b6ac474164..10a2139ab25a 100644 --- a/py-polars/polars/functions/eager.py +++ b/py-polars/polars/functions/eager.py @@ -347,17 +347,34 @@ def align_frames( # create aligned master frame (this is the most expensive part; afterwards # we just subselect out the columns representing the component frames) eager = isinstance(frames[0], pl.DataFrame) - alignment_frame: LazyFrame = ( - reduce( # type: ignore[attr-defined] - lambda x, y: x.lazy().join( # type: ignore[arg-type, return-value] - y.lazy(), how=how, on=align_on, suffix=str(id(y)) - ), - frames, + + # we stackoverflow on many frames + # so we branch on an arbitrary chosen large number of frames + if len(frames) < 250: + # lazy variant + # this can SO + alignment_frame: LazyFrame = ( + reduce( # type: ignore[attr-defined] + lambda x, y: x.lazy().join( # type: ignore[arg-type, return-value] + y.lazy(), how=how, on=align_on, suffix=str(id(y)) + ), + frames, + ) + .sort(by=align_on, descending=descending) + .collect(no_optimization=True) + .lazy() + ) + else: + # eager variant + # this doesn't SO + alignment_frame = ( + reduce( + lambda x, y: x.join(y, how=how, on=align_on, suffix=str(id(y))), + frames, + ) + .sort(by=align_on, descending=descending) + .lazy() ) - .sort(by=align_on, descending=descending) - .collect() - .lazy() - ) # select-out aligned components from the master frame aligned_cols = set(alignment_frame.columns) From da4df4fe65273eaec1c21af76eb951c29e7f6dd1 Mon Sep 17 00:00:00 2001 From: J van Zundert Date: Sun, 16 Jul 2023 07:50:40 +0100 Subject: [PATCH 32/37] chore(python): Use Pathlib everywhere (#9914) --- py-polars/docs/source/conf.py | 7 ++++--- py-polars/polars/config.py | 8 ++++---- py-polars/polars/io/_utils.py | 2 +- py-polars/polars/lazyframe/frame.py | 5 ++--- py-polars/polars/utils/various.py | 15 +++++++-------- py-polars/pyproject.toml | 1 + py-polars/scripts/check_stacklevels.py | 6 +++--- py-polars/tests/benchmark/test_release.py | 5 ++--- py-polars/tests/unit/io/conftest.py | 4 +--- py-polars/tests/unit/io/test_csv.py | 8 +++----- py-polars/tests/unit/io/test_database.py | 7 ++----- py-polars/tests/unit/io/test_lazy_csv.py | 12 +++++------- py-polars/tests/unit/io/test_lazy_json.py | 4 ++-- py-polars/tests/unit/io/test_other.py | 12 ++++++------ py-polars/tests/unit/io/test_parquet.py | 6 ++---- py-polars/tests/unit/streaming/conftest.py | 4 +--- py-polars/tests/unit/test_cfg.py | 8 +++----- py-polars/tests/unit/test_sql.py | 5 ++--- 18 files changed, 51 insertions(+), 68 deletions(-) diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py index 31916f3c6106..6c4f0891e9a6 100644 --- a/py-polars/docs/source/conf.py +++ b/py-polars/docs/source/conf.py @@ -16,11 +16,12 @@ import re import sys import warnings +from pathlib import Path import sphinx_autosummary_accessors # add polars directory -sys.path.insert(0, os.path.abspath("../..")) +sys.path.insert(0, str(Path("../..").resolve())) # -- Project information ----------------------------------------------------- @@ -200,8 +201,8 @@ def linkcode_resolve(domain, info): linespec = f"#L{lineno}-L{lineno + len(source) - 1}" if 
lineno else "" - conf_dir_path = os.path.dirname(os.path.realpath(__file__)) - polars_root = os.path.abspath(f"{conf_dir_path}/../../polars") + conf_dir_path = Path(__file__).absolute().parent + polars_root = (conf_dir_path.parent.parent / "polars").absolute() fn = os.path.relpath(fn, start=polars_root) return f"{github_root}/blob/main/py-polars/polars/{fn}{linespec}" diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index fc4e49f86929..2ef42ad14af3 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -156,7 +156,7 @@ def load(cls, cfg: Path | str) -> type[Config]: """ options = json.loads( Path(normalise_filepath(cfg)).read_text() - if isinstance(cfg, Path) or os.path.exists(cfg) + if isinstance(cfg, Path) or Path(cfg).exists() else cfg ) os.environ.update(options.get("environment", {})) @@ -221,9 +221,9 @@ def save(cls, file: Path | str | None = None) -> str: separators=(",", ":"), ) if isinstance(file, (str, Path)): - file = os.path.abspath(normalise_filepath(file)) - Path(file).write_text(options) - return file + file = Path(normalise_filepath(file)).resolve() + file.write_text(options) + return str(file) return options diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index a0bdceb7f20e..46d55343ec27 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -166,7 +166,7 @@ def managed_file(file: Any) -> Iterator[Any]: if isinstance(file, str): file = normalise_filepath(file, check_not_dir) if has_non_utf8_non_utf8_lossy_encoding: - with open(file, encoding=encoding_str) as f: + with Path(file).open(encoding=encoding_str) as f: return _check_empty( BytesIO(f.read().encode("utf8")), context=f"{file!r}" ) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 66d91f95fa87..f4ac1d5dced5 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -894,7 +894,7 @@ def show_graph( *, optimized: bool = True, show: bool = True, - output_path: str | None = None, + output_path: str | Path | None = None, raw_output: bool = False, figsize: tuple[float, float] = (16.0, 12.0), type_coercion: bool = True, @@ -975,8 +975,7 @@ def show_graph( raise ImportError("Graphviz dot binary should be on your PATH") from None if output_path: - with Path(output_path).open(mode="wb") as file: - file.write(graph) + Path(output_path).write_bytes(graph) if not show: return None diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 69d3cb502fe6..d6820aaf3b23 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -1,12 +1,12 @@ from __future__ import annotations import inspect -import os import re import sys import warnings from collections.abc import MappingView, Sized from enum import Enum +from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Iterable, Literal, Sequence, TypeVar import polars as pl @@ -25,7 +25,6 @@ if TYPE_CHECKING: from collections.abc import Reversible - from pathlib import Path from polars import DataFrame, Series from polars.type_aliases import PolarsDataType, PolarsIntegerType, SizeUnit @@ -183,10 +182,10 @@ def can_create_dicts_with_pyarrow(dtypes: Sequence[PolarsDataType]) -> bool: def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str: """Create a string path, expanding the home directory if present.""" - path = os.path.expanduser(path) - if check_not_directory and os.path.exists(path) and os.path.isdir(path): + 
path = Path(path).expanduser() + if check_not_directory and path.exists() and path.is_dir(): raise IsADirectoryError(f"Expected a file path; {path!r} is a directory") - return path + return str(path) def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: @@ -358,15 +357,15 @@ def find_stacklevel() -> int: Taken from: https://github.com/pandas-dev/pandas/blob/ab89c53f48df67709a533b6a95ce3d911871a0a8/pandas/util/_exceptions.py#L30-L51 """ - pkg_dir = os.path.dirname(pl.__file__) - test_dir = os.path.join(pkg_dir, "tests") + pkg_dir = Path(pl.__file__).parent + test_dir = pkg_dir / "tests" # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 while frame: fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): + if fname.startswith(str(pkg_dir)) and not fname.startswith(str(test_dir)): frame = frame.f_back n += 1 else: diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index ca01851a144f..724f6638ce6e 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -119,6 +119,7 @@ select = [ "UP", # pyupgrade "PT", # flake8-pytest-style "RUF", # Ruff-specific rules + "PTH", # flake8-use-pathlib ] ignore = [ diff --git a/py-polars/scripts/check_stacklevels.py b/py-polars/scripts/check_stacklevels.py index 93805063acad..2ff14283ea01 100644 --- a/py-polars/scripts/check_stacklevels.py +++ b/py-polars/scripts/check_stacklevels.py @@ -7,6 +7,7 @@ import subprocess import sys from ast import NodeVisitor +from pathlib import Path # Files in which it's OK to set the stacklevel manually. # `git ls-files` lists files with forwards-slashes @@ -38,10 +39,9 @@ def visit_Call(self, node: ast.Call) -> None: for file in files: if file in EXCLUDE: continue - if not file.endswith(".py"): + if Path(file).suffix != ".py": continue - with open(file) as fd: - content = fd.read() + content = Path(file).read_text() tree = ast.parse(content) stacklevel_checker = StackLevelChecker(file) stacklevel_checker.visit(tree) diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py index bbea4df6b928..e6ebdac7dcbe 100644 --- a/py-polars/tests/benchmark/test_release.py +++ b/py-polars/tests/benchmark/test_release.py @@ -5,7 +5,6 @@ To run these tests: pytest -m benchmark """ -import os import time from pathlib import Path from typing import cast @@ -21,12 +20,12 @@ @pytest.mark.skipif( - not (Path(os.path.dirname(__file__)) / "G1_1e7_1e2_5_0.csv").is_file(), + not (Path(__file__).parent / "G1_1e7_1e2_5_0.csv").is_file(), reason="Dataset must be generated before running this test.", ) def test_read_scan_large_csv() -> None: filename = "G1_1e7_1e2_5_0.csv" - path = Path(os.path.dirname(__file__)) / filename + path = Path(__file__).parent / filename predicate = pl.col("v2") < 5 diff --git a/py-polars/tests/unit/io/conftest.py b/py-polars/tests/unit/io/conftest.py index b488a9d29c44..fd174486b25f 100644 --- a/py-polars/tests/unit/io/conftest.py +++ b/py-polars/tests/unit/io/conftest.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from pathlib import Path import pytest @@ -8,5 +7,4 @@ @pytest.fixture() def io_files_path() -> Path: - current_dir = os.path.dirname(__file__) - return Path(current_dir) / "files" + return Path(__file__).parent / "files" diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index f4a18436e751..8f065bdff851 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ 
b/py-polars/tests/unit/io/test_csv.py @@ -379,8 +379,7 @@ def test_read_csv_encoding(tmp_path: Path) -> None: ) file_path = tmp_path / "encoding.csv" - with open(file_path, "wb") as f: - f.write(bts) + file_path.write_bytes(bts) file_str = str(file_path) bytesio = io.BytesIO(bts) @@ -487,9 +486,8 @@ def test_compressed_csv(io_files_path: Path) -> None: def test_partial_decompression(foods_file_path: Path) -> None: f_out = io.BytesIO() - with open(foods_file_path, "rb") as f_read: # noqa: SIM117 - with gzip.GzipFile(fileobj=f_out, mode="w") as f: - f.write(f_read.read()) + with gzip.GzipFile(fileobj=f_out, mode="w") as f: + f.write(foods_file_path.read_bytes()) csv_bytes = f_out.getvalue() for n_rows in [1, 5, 26]: diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index a292af1217ca..4466a57761c6 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -1,8 +1,8 @@ from __future__ import annotations -import os import sys from datetime import date +from pathlib import Path from typing import TYPE_CHECKING import pytest @@ -11,8 +11,6 @@ from polars.testing import assert_frame_equal if TYPE_CHECKING: - from pathlib import Path - from polars.type_aliases import ( DbReadEngine, DbWriteEngine, @@ -35,8 +33,7 @@ def sample_df() -> pl.DataFrame: def create_temp_sqlite_db(test_db: str) -> None: import sqlite3 - if os.path.exists(test_db): - os.unlink(test_db) + Path(test_db).unlink(missing_ok=True) # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index cd5aea1a1e05..2eaa730b0bc8 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -42,8 +42,7 @@ def test_invalid_utf8(tmp_path: Path) -> None: bts = bytes(np.random.randint(0, 255, 200)) file_path = tmp_path / "nonutf8.csv" - with open(file_path, "wb") as f: - f.write(bts) + file_path.write_bytes(bts) a = pl.read_csv(file_path, has_header=False, encoding="utf8-lossy") b = pl.scan_csv(file_path, has_header=False, encoding="utf8-lossy").collect() @@ -192,9 +191,8 @@ def test_glob_skip_rows(tmp_path: Path) -> None: for i in range(2): file_path = tmp_path / f"test_{i}.csv" - with open(file_path, "w") as f: - f.write( - f""" + file_path.write_text( + f""" metadata goes here file number {i} foo,bar,baz @@ -202,7 +200,7 @@ def test_glob_skip_rows(tmp_path: Path) -> None: 4,5,6 7,8,9 """ - ) + ) file_path = tmp_path / "*.csv" assert pl.read_csv(file_path, skip_rows=2).to_dict(False) == { "foo": [1, 4, 7, 1, 4, 7], @@ -227,7 +225,7 @@ def test_glob_n_rows(io_files_path: Path) -> None: } -def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: str) -> None: +def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> None: df = ( pl.scan_csv( foods_file_path, diff --git a/py-polars/tests/unit/io/test_lazy_json.py b/py-polars/tests/unit/io/test_lazy_json.py index 924d59aba717..8c16e9039e2c 100644 --- a/py-polars/tests/unit/io/test_lazy_json.py +++ b/py-polars/tests/unit/io/test_lazy_json.py @@ -57,8 +57,8 @@ def test_scan_with_projection(tmp_path: Path) -> None: json_bytes = bytes(json, "utf-8") file_path = tmp_path / "escape_chars.json" - with open(file_path, "wb") as f: - f.write(json_bytes) + file_path.write_bytes(json_bytes) + actual = 
pl.scan_ndjson(file_path).select(["id", "text"]).collect() expected = pl.DataFrame( diff --git a/py-polars/tests/unit/io/test_other.py b/py-polars/tests/unit/io/test_other.py index 7e7746b9ef98..8b068708bdc4 100644 --- a/py-polars/tests/unit/io/test_other.py +++ b/py-polars/tests/unit/io/test_other.py @@ -1,7 +1,7 @@ from __future__ import annotations import copy -import os.path +from pathlib import Path from typing import cast import polars as pl @@ -51,8 +51,8 @@ def test_unit_io_subdir_has_no_init() -> None: # -------------------------------------------------------------------------------- # TLDR: it can mask the builtin 'io' module, causing a fatal python error. # -------------------------------------------------------------------------------- - io_dir = os.path.dirname(__file__) - assert io_dir.endswith(f"unit{os.path.sep}io") - assert not os.path.exists( - f"{io_dir}{os.path.sep}__init__.py" - ), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory" + io_dir = Path(__file__).parent + assert io_dir.parts[-2:] == ("unit", "io") + assert not ( + io_dir / "__init__.py" + ).exists(), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory" diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 556a3efbbf39..b57765242a24 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1,8 +1,8 @@ from __future__ import annotations import io -import os from datetime import datetime, timezone +from pathlib import Path from typing import TYPE_CHECKING import numpy as np @@ -19,8 +19,6 @@ ) if TYPE_CHECKING: - from pathlib import Path - from polars.type_aliases import ParquetCompression @@ -513,7 +511,7 @@ def test_parquet_string_cache() -> None: def test_tz_aware_parquet_9586() -> None: result = pl.read_parquet( - os.path.join("tests", "unit", "io", "files", "tz_aware.parquet") + Path("tests") / "unit" / "io" / "files" / "tz_aware.parquet" ) expected = pl.DataFrame( {"UTC_DATETIME_ID": [datetime(2023, 6, 26, 14, 15, 0, tzinfo=timezone.utc)]} diff --git a/py-polars/tests/unit/streaming/conftest.py b/py-polars/tests/unit/streaming/conftest.py index 31e98521a2a2..b7b476474316 100644 --- a/py-polars/tests/unit/streaming/conftest.py +++ b/py-polars/tests/unit/streaming/conftest.py @@ -1,4 +1,3 @@ -import os from pathlib import Path import pytest @@ -6,5 +5,4 @@ @pytest.fixture() def io_files_path() -> Path: - current_dir = os.path.dirname(__file__) - return Path(current_dir) / ".." / "io" / "files" + return Path(__file__).parent.parent / "io" / "files" diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py index 1027a739fd51..1d1be35d45dc 100644 --- a/py-polars/tests/unit/test_cfg.py +++ b/py-polars/tests/unit/test_cfg.py @@ -1,7 +1,8 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Iterator +from pathlib import Path +from typing import Iterator import pytest @@ -10,9 +11,6 @@ from polars.exceptions import StringCacheMismatchError from polars.testing import assert_frame_equal -if TYPE_CHECKING: - from pathlib import Path - @pytest.fixture(autouse=True) def _environ() -> Iterator[None]: @@ -531,7 +529,7 @@ def test_config_load_save(tmp_path: Path) -> None: # ...load back from config... if file is not None: - assert os.path.isfile(cfg) + assert Path(cfg).is_file() pl.Config.load(cfg) # ...and confirm the saved options were set. 
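
Aside: every hunk in this patch applies the same mechanical os.path-to-pathlib
translation. A minimal sketch of the recurring equivalences, runnable on its
own (the CSV path below is hypothetical and used only for illustration):

    from pathlib import Path

    p = Path("~/datasets/foods1.csv").expanduser()  # os.path.expanduser(path)
    here = Path(__file__).parent                    # os.path.dirname(__file__)
    resolved = p.resolve()                          # os.path.abspath(path)
    # os.path.exists(path) / os.path.isdir(path) become methods on the path:
    if p.exists() and not p.is_dir():
        data = p.read_bytes()                       # open(path, "rb").read()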
diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index 86a78ddddfd2..5f328390ac1b 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import warnings from pathlib import Path @@ -13,8 +12,8 @@ # TODO: Do not rely on I/O for these tests @pytest.fixture() -def foods_ipc_path() -> str: - return str(Path(os.path.dirname(__file__)) / "io" / "files" / "foods1.ipc") +def foods_ipc_path() -> Path: + return Path(__file__).parent / "io" / "files" / "foods1.ipc" def test_sql_cast() -> None: From 4a12df1e5cf65f2dcbf818ab0b479fa2e6bd0905 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 16 Jul 2023 08:50:53 +0200 Subject: [PATCH 33/37] feat(python): Add `Series.extend` (#9901) --- .../source/reference/series/modify_select.rst | 1 + py-polars/polars/series/series.py | 114 ++++++++++++++++-- .../polars/testing/parametric/primitives.py | 2 +- py-polars/polars/utils/_construction.py | 2 +- py-polars/tests/unit/dataframe/__init__.py | 0 py-polars/tests/unit/datatypes/test_struct.py | 50 -------- py-polars/tests/unit/series/__init__.py | 0 py-polars/tests/unit/series/test_append.py | 96 +++++++++++++++ py-polars/tests/unit/series/test_extend.py | 34 ++++++ .../tests/unit/{ => series}/test_series.py | 14 --- 10 files changed, 236 insertions(+), 77 deletions(-) create mode 100644 py-polars/tests/unit/dataframe/__init__.py create mode 100644 py-polars/tests/unit/series/__init__.py create mode 100644 py-polars/tests/unit/series/test_append.py create mode 100644 py-polars/tests/unit/series/test_extend.py rename py-polars/tests/unit/{ => series}/test_series.py (99%) diff --git a/py-polars/docs/source/reference/series/modify_select.rst b/py-polars/docs/source/reference/series/modify_select.rst index 2738415e5010..d9808a9a0651 100644 --- a/py-polars/docs/source/reference/series/modify_select.rst +++ b/py-polars/docs/source/reference/series/modify_select.rst @@ -21,6 +21,7 @@ Manipulation/selection Series.drop_nans Series.drop_nulls Series.explode + Series.extend Series.extend_constant Series.fill_nan Series.fill_null diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index a831c272b358..e7ff377258fa 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2303,7 +2303,7 @@ def slice(self, offset: int, length: int | None = None) -> Series: """ - def append(self, other: Series, *, append_chunks: bool = True) -> Series: + def append(self, other: Series, *, append_chunks: bool | None = None) -> Self: """ Append a Series to this one. @@ -2312,6 +2312,11 @@ def append(self, other: Series, *, append_chunks: bool = True) -> Series: other Series to append. append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + If set to `True` the append operation will add the chunks from `other` to self. This is super cheap. @@ -2335,13 +2340,21 @@ def append(self, other: Series, *, append_chunks: bool = True) -> Series: to store them in a single `Series`. In the latter case, finish the sequence of `append_chunks` operations with a `rechunk`. + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. 
+ + See Also + -------- + extend Examples -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s.append(s2) - shape: (6,) + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) Series: 'a' [i64] [ 1 @@ -2349,21 +2362,100 @@ def append(self, other: Series, *, append_chunks: bool = True) -> Series: 3 4 5 - 6 ] + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + """ + if append_chunks is not None: + warnings.warn( + "the `append_chunks` argument will be removed and `append` will change" + " to always behave like `append_chunks=True` (the previous default)." + " For the behavior of `append_chunks=False`, use `Series.extend`.", + DeprecationWarning, + stacklevel=find_stacklevel(), + ) + else: + append_chunks = True + + if not append_chunks: + return self.extend(other) + try: - if append_chunks: - self._s.append(other._s) + self._s.append(other._s) + except RuntimeError as exc: + if str(exc) == "Already mutably borrowed": + self._s.append(other._s.clone()) else: - self._s.extend(other._s) - return self + raise exc + return self + + def extend(self, other: Series) -> Self: + """ + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: 'a' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. 
+ + >>> a.n_chunks() + 1 + + """ + try: + self._s.extend(other._s) except RuntimeError as exc: if str(exc) == "Already mutably borrowed": - return self.append(other.clone(), append_chunks=append_chunks) + self._s.extend(other._s.clone()) else: raise exc + return self def filter(self, predicate: Series | list[bool]) -> Self: """ diff --git a/py-polars/polars/testing/parametric/primitives.py b/py-polars/polars/testing/parametric/primitives.py index d6b24e2956dc..b88b6bf776b7 100644 --- a/py-polars/polars/testing/parametric/primitives.py +++ b/py-polars/polars/testing/parametric/primitives.py @@ -438,7 +438,7 @@ def draw_series(draw: DrawFn) -> Series: s = s.cast(Categorical) if series_size and (chunked or (chunked is None and draw(booleans()))): split_at = series_size // 2 - s = s[:split_at].append(s[split_at:], append_chunks=True) + s = s[:split_at].append(s[split_at:]) return s return draw_series() diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index d2f76fad2a9b..25efe25d6a32 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -288,7 +288,7 @@ def to_series_chunk(values: list[Any], dtype: PolarsDataType | None) -> Series: series = schunk dtype = series.dtype else: - series.append(schunk, append_chunks=True) + series.append(schunk) n_chunks += 1 if series is None: diff --git a/py-polars/tests/unit/dataframe/__init__.py b/py-polars/tests/unit/dataframe/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 22ec2abbd96c..72e9316448d5 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -5,7 +5,6 @@ import pandas as pd import pyarrow as pa -import pytest import polars as pl from polars.testing import assert_frame_equal @@ -517,55 +516,6 @@ def test_struct_order() -> None: ) == [{"a": 1, "b": 10}, {"a": 2, "b": None}] -def test_struct_schema_on_append_extend_3452() -> None: - housing1_data = [ - { - "city": "Chicago", - "address": "100 Main St", - "price": 250000, - "nbr_bedrooms": 3, - }, - { - "city": "New York", - "address": "100 First Ave", - "price": 450000, - "nbr_bedrooms": 2, - }, - ] - - housing2_data = [ - { - "address": "303 Mockingbird Lane", - "city": "Los Angeles", - "nbr_bedrooms": 2, - "price": 450000, - }, - { - "address": "404 Moldave Dr", - "city": "Miami Beach", - "nbr_bedrooms": 1, - "price": 250000, - }, - ] - housing1, housing2 = pl.Series(housing1_data), pl.Series(housing2_data) - with pytest.raises( - pl.SchemaError, - match=( - 'cannot append field with name "address" ' - 'to struct with field name "city"' - ), - ): - housing1.append(housing2, append_chunks=True) - with pytest.raises( - pl.SchemaError, - match=( - 'cannot extend field with name "address" ' - 'to struct with field name "city"' - ), - ): - housing1.append(housing2, append_chunks=False) - - def test_struct_arr_eval() -> None: df = pl.DataFrame( {"col_struct": [[{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]]} diff --git a/py-polars/tests/unit/series/__init__.py b/py-polars/tests/unit/series/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/py-polars/tests/unit/series/test_append.py b/py-polars/tests/unit/series/test_append.py new file mode 100644 index 000000000000..46520ae67204 --- /dev/null +++ b/py-polars/tests/unit/series/test_append.py @@ -0,0 +1,96 @@ +import pytest + +import polars 
as pl +from polars.testing import assert_series_equal + + +def test_append() -> None: + a = pl.Series("a", [1, 2]) + b = pl.Series("b", [8, 9, None]) + + result = a.append(b) + + expected = pl.Series("a", [1, 2, 8, 9, None]) + assert_series_equal(a, expected) + assert_series_equal(result, expected) + assert a.n_chunks() == 2 + + +def test_append_deprecated_append_chunks() -> None: + a = pl.Series("a", [1, 2]) + b = pl.Series("b", [8, 9, None]) + + with pytest.deprecated_call(): + a.append(b, append_chunks=False) + + expected = pl.Series("a", [1, 2, 8, 9, None]) + assert_series_equal(a, expected) + assert a.n_chunks() == 1 + + +def test_append_self_3915() -> None: + a = pl.Series("a", [1, 2]) + + a.append(a) + + expected = pl.Series("a", [1, 2, 1, 2]) + assert_series_equal(a, expected) + assert a.n_chunks() == 2 + + +def test_append_bad_input() -> None: + a = pl.Series("a", [1, 2]) + b = a.to_frame() + + with pytest.raises(AttributeError): + a.append(b) # type: ignore[arg-type] + + +def test_struct_schema_on_append_extend_3452() -> None: + housing1_data = [ + { + "city": "Chicago", + "address": "100 Main St", + "price": 250000, + "nbr_bedrooms": 3, + }, + { + "city": "New York", + "address": "100 First Ave", + "price": 450000, + "nbr_bedrooms": 2, + }, + ] + + housing2_data = [ + { + "address": "303 Mockingbird Lane", + "city": "Los Angeles", + "nbr_bedrooms": 2, + "price": 450000, + }, + { + "address": "404 Moldave Dr", + "city": "Miami Beach", + "nbr_bedrooms": 1, + "price": 250000, + }, + ] + housing1, housing2 = pl.Series(housing1_data), pl.Series(housing2_data) + with pytest.raises( + pl.SchemaError, + match=( + 'cannot append field with name "address" ' + 'to struct with field name "city"' + ), + ): + housing1.append(housing2) + + with pytest.raises( + pl.SchemaError, + match=( + 'cannot extend field with name "address" ' + 'to struct with field name "city"' + ), + ): + housing1.extend(housing2) diff --git a/py-polars/tests/unit/series/test_extend.py b/py-polars/tests/unit/series/test_extend.py new file mode 100644 index 000000000000..9e3b71acd075 --- /dev/null +++ b/py-polars/tests/unit/series/test_extend.py @@ -0,0 +1,34 @@ +import pytest + +import polars as pl +from polars.testing import assert_series_equal + + +def test_extend() -> None: + a = pl.Series("a", [1, 2]) + b = pl.Series("b", [8, 9, None]) + + result = a.extend(b) + + expected = pl.Series("a", [1, 2, 8, 9, None]) + assert_series_equal(a, expected) + assert_series_equal(result, expected) + assert a.n_chunks() == 1 + + +def test_extend_self() -> None: + a = pl.Series("a", [1, 2]) + + a.extend(a) + + expected = pl.Series("a", [1, 2, 1, 2]) + assert_series_equal(a, expected) + assert a.n_chunks() == 1 + + +def test_extend_bad_input() -> None: + a = pl.Series("a", [1, 2]) + b = a.to_frame() + + with pytest.raises(AttributeError): + a.extend(b) # type: ignore[arg-type] diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/series/test_series.py similarity index 99% rename from py-polars/tests/unit/test_series.py rename to py-polars/tests/unit/series/test_series.py index 5aa1215934b1..624147a8b69a 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -409,15 +409,6 @@ def test_add_string() -> None: assert_series_equal(result, pl.Series(["pfx:hello", "pfx:weird"])) -def test_append_extend() -> None: - a = pl.Series("a", [1, 2]) - b = pl.Series("b", [8, 9, None]) - a.append(b, append_chunks=False) - expected = pl.Series("a", [1, 2, 8, 9, None]) - 
assert_series_equal(a, expected)
-    assert a.n_chunks() == 1
-
-
 @pytest.mark.parametrize(
     ("data", "expected_dtype"),
     [
@@ -2305,11 +2296,6 @@ def test_clip() -> None:
     assert s.clip(1, 10).to_list() == [1, 5, None, 10]


-def test_mutable_borrowed_append_3915() -> None:
-    s = pl.Series("s", [1, 2, 3])
-    assert s.append(s).to_list() == [1, 2, 3, 1, 2, 3]
-
-
 def test_set_at_idx() -> None:
     s = pl.Series("s", [1, 2, 3])

From bb36e4c6841523f3189f30cc6152b16d99f12c49 Mon Sep 17 00:00:00 2001
From: Josh Magarick
Date: Sat, 15 Jul 2023 23:51:36 -0700
Subject: [PATCH 34/37] feat(rust, python): Optional three-valued logic for
 any/all (#9848)

---
 .../src/chunked_array/comparison/mod.rs       | 17 ++++++
 .../src/dsl/function_expr/boolean.rs          | 32 +++++------
 polars/polars-lazy/polars-plan/src/dsl/mod.rs |  8 +--
 polars/polars-sql/src/sql_expr.rs             |  4 +-
 polars/tests/it/lazy/expressions/window.rs    |  4 +-
 py-polars/polars/expr/expr.py                 | 57 +++++++++++++++++--
 .../polars/functions/aggregation/vertical.py  |  6 +-
 py-polars/polars/series/series.py             |  8 +--
 py-polars/src/expr/general.rs                 |  8 +--
 9 files changed, 104 insertions(+), 40 deletions(-)

diff --git a/polars/polars-core/src/chunked_array/comparison/mod.rs b/polars/polars-core/src/chunked_array/comparison/mod.rs
index 1008fdd676e2..2400b568cbe7 100644
--- a/polars/polars-core/src/chunked_array/comparison/mod.rs
+++ b/polars/polars-core/src/chunked_array/comparison/mod.rs
@@ -1004,6 +1004,23 @@ impl BooleanChunked {
     pub fn any(&self) -> bool {
         self.downcast_iter().any(compute::boolean::any)
     }
+
+    // Three-valued versions which can return None
+    pub fn all_3val(&self, drop_nulls: bool) -> Option<bool> {
+        if drop_nulls || self.null_count() == 0 {
+            Some(self.all())
+        } else {
+            None
+        }
+    }
+    pub fn any_3val(&self, drop_nulls: bool) -> Option<bool> {
+        let res = self.any();
+        if drop_nulls || res {
+            Some(res)
+        } else {
+            None
+        }
+    }
 }

 // private
diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs
index a71f240d913b..5575dfc31f32 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/boolean.rs
@@ -8,8 +8,12 @@ use crate::wrap;
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Clone, PartialEq, Debug, Eq, Hash)]
 pub enum BooleanFunction {
-    All,
-    Any,
+    All {
+        drop_nulls: bool,
+    },
+    Any {
+        drop_nulls: bool,
+    },
     IsNot,
     IsNull,
     IsNotNull,
@@ -37,8 +41,8 @@ impl Display for BooleanFunction {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         use BooleanFunction::*;
         let s = match self {
-            All => "all",
-            Any => "any",
+            All { .. } => "all",
+            Any { ..
} => "any",
             IsNot => "is_not",
             IsNull => "is_null",
             IsNotNull => "is_not_null",
@@ -63,8 +67,8 @@ impl From<BooleanFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
     fn from(func: BooleanFunction) -> Self {
         use BooleanFunction::*;
         match func {
-            All => map!(all),
-            Any => map!(any),
+            All { drop_nulls } => map!(all, drop_nulls),
+            Any { drop_nulls } => map!(any, drop_nulls),
             IsNot => map!(is_not),
             IsNull => map!(is_null),
             IsNotNull => map!(is_not_null),
@@ -90,22 +94,14 @@ impl From<BooleanFunction> for FunctionExpr {
     }
 }

-fn all(s: &Series) -> PolarsResult<Series> {
+fn all(s: &Series, drop_nulls: bool) -> PolarsResult<Series> {
     let boolean = s.bool()?;
-    if boolean.all() {
-        Ok(Series::new(s.name(), [true]))
-    } else {
-        Ok(Series::new(s.name(), [false]))
-    }
+    Ok(Series::new(s.name(), [boolean.all_3val(drop_nulls)]))
 }

-fn any(s: &Series) -> PolarsResult<Series> {
+fn any(s: &Series, drop_nulls: bool) -> PolarsResult<Series> {
     let boolean = s.bool()?;
-    if boolean.any() {
-        Ok(Series::new(s.name(), [true]))
-    } else {
-        Ok(Series::new(s.name(), [false]))
-    }
+    Ok(Series::new(s.name(), [boolean.any_3val(drop_nulls)]))
 }

 fn is_not(s: &Series) -> PolarsResult<Series> {
diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs
index e43b1efb3537..120ec51da67d 100644
--- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs
+++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs
@@ -1639,8 +1639,8 @@ impl Expr {
     }

     /// Check if any boolean value is `true`
-    pub fn any(self) -> Self {
-        self.apply_private(BooleanFunction::Any.into())
+    pub fn any(self, drop_nulls: bool) -> Self {
+        self.apply_private(BooleanFunction::Any { drop_nulls }.into())
             .with_function_options(|mut opt| {
                 opt.auto_explode = true;
                 opt
@@ -1655,8 +1655,8 @@ impl Expr {
     }

     /// Check if all boolean values are `true`
-    pub fn all(self) -> Self {
-        self.apply_private(BooleanFunction::All.into())
+    pub fn all(self, drop_nulls: bool) -> Self {
+        self.apply_private(BooleanFunction::All { drop_nulls }.into())
             .with_function_options(|mut opt| {
                 opt.auto_explode = true;
                 opt
diff --git a/polars/polars-sql/src/sql_expr.rs b/polars/polars-sql/src/sql_expr.rs
index 85e99ff8dcc3..31aaabf56de7 100644
--- a/polars/polars-sql/src/sql_expr.rs
+++ b/polars/polars-sql/src/sql_expr.rs
@@ -60,8 +60,8 @@ pub(crate) struct SqlExprVisitor<'a> {
 impl SqlExprVisitor<'_> {
     fn visit_expr(&self, expr: &SqlExpr) -> PolarsResult<Expr> {
         match expr {
-            SqlExpr::AllOp(_) => Ok(self.visit_expr(expr)?.all()),
-            SqlExpr::AnyOp(expr) => Ok(self.visit_expr(expr)?.any()),
+            SqlExpr::AllOp(_) => Ok(self.visit_expr(expr)?.all(true)),
+            SqlExpr::AnyOp(expr) => Ok(self.visit_expr(expr)?.any(true)),
             SqlExpr::ArrayAgg(expr) => self.visit_arr_agg(expr),
             SqlExpr::Between {
                 expr,
diff --git a/polars/tests/it/lazy/expressions/window.rs b/polars/tests/it/lazy/expressions/window.rs
index b5c4d5fc6326..c150378d9352 100644
--- a/polars/tests/it/lazy/expressions/window.rs
+++ b/polars/tests/it/lazy/expressions/window.rs
@@ -364,8 +364,8 @@ fn test_window_exprs_any_all() -> PolarsResult<()> {
     ]?
.lazy() .select([ - col("var2").any().over([col("var1")]).alias("any"), - col("var2").all().over([col("var1")]).alias("all"), + col("var2").any(true).over([col("var1")]).alias("any"), + col("var2").all(true).over([col("var1")]).alias("all"), ]) .collect()?; diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 27dd05178986..3bacfff59766 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -311,10 +311,15 @@ def to_physical(self) -> Self: """ return self._from_pyexpr(self._pyexpr.to_physical()) - def any(self) -> Self: + def any(self, drop_nulls: bool = True) -> Self: """ Check if any boolean value in a Boolean column is `True`. + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + Returns ------- Boolean literal @@ -331,17 +336,42 @@ def any(self) -> Self: ╞══════╪═══════╡ │ true ┆ false │ └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.any()) + return self._from_pyexpr(self._pyexpr.any(drop_nulls)) - def all(self) -> Self: + def all(self, drop_nulls: bool = True) -> Self: """ Check if all boolean values in a Boolean column are `True`. This method is an expression - not to be confused with :func:`polars.all` which is a function to select all columns. + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + Returns ------- Boolean literal @@ -360,9 +390,28 @@ def all(self) -> Self: ╞══════╪═══════╪═══════╡ │ true ┆ false ┆ false │ └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.all()) + return self._from_pyexpr(self._pyexpr.all(drop_nulls)) def arg_true(self) -> Self: """ diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index 5d210e9fb1e8..85b3cc698dac 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -28,7 +28,7 @@ def all( @deprecated_alias(columns="exprs") def all( exprs: IntoExpr | Iterable[IntoExpr] | None = None, *more_exprs: IntoExpr -) -> Expr | bool: +) -> Expr | bool | None: """ Either return an expression representing all columns, or evaluate a bitwise AND operation. @@ -115,7 +115,9 @@ def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: @deprecated_alias(columns="exprs") -def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | bool: +def any( + exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr +) -> Expr | bool | None: """ Evaluate a bitwise OR operation. 
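
Aside: the new `drop_nulls` flag opts these reductions into Kleene
(three-valued) logic. A minimal sketch of the semantics, assuming a polars
build that includes this patch (`drop_nulls` defaults to True, which keeps
the old two-valued behaviour):

    import polars as pl

    s = pl.Series([None, False])

    print(s.any())                  # False: nulls are dropped by default
    print(s.any(drop_nulls=False))  # None: no True found, but nulls present
    print(s.all())                  # False
    print(s.all(drop_nulls=False))  # None: a null makes the conjunction unknown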
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index e7ff377258fa..55d6c7fed39a 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1196,7 +1196,7 @@ def sqrt(self) -> Series: """ - def any(self) -> bool: + def any(self, drop_nulls: bool = True) -> bool | None: """ Check if any boolean value in the column is `True`. @@ -1205,9 +1205,9 @@ def any(self) -> bool: Boolean literal """ - return self.to_frame().select(F.col(self.name).any()).to_series()[0] + return self.to_frame().select(F.col(self.name).any(drop_nulls)).to_series()[0] - def all(self) -> bool: + def all(self, drop_nulls: bool = True) -> bool | None: """ Check if all boolean values in the column are `True`. @@ -1216,7 +1216,7 @@ def all(self) -> bool: Boolean literal """ - return self.to_frame().select(F.col(self.name).all()).to_series()[0] + return self.to_frame().select(F.col(self.name).all(drop_nulls)).to_series()[0] def log(self, base: float = math.e) -> Series: """Compute the logarithm to a given base.""" diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index fa7d0bf638e3..f7a0f8f79352 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -1112,12 +1112,12 @@ impl PyExpr { .with_fmt("extend") .into() } - fn any(&self) -> Self { - self.inner.clone().any().into() + fn any(&self, drop_nulls: bool) -> Self { + self.inner.clone().any(drop_nulls).into() } - fn all(&self) -> Self { - self.inner.clone().all().into() + fn all(&self, drop_nulls: bool) -> Self { + self.inner.clone().all(drop_nulls).into() } fn log(&self, base: f64) -> Self { From 20212499e1e39649b96ede8cd28edc8f00ac669d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 16 Jul 2023 14:49:45 +0200 Subject: [PATCH 35/37] fix(python): Handle `DataFrame.extend` extending by itself (#9897) --- py-polars/polars/dataframe/frame.py | 40 ++++++--- py-polars/tests/unit/dataframe/test_df.py | 55 ------------- py-polars/tests/unit/dataframe/test_extend.py | 81 +++++++++++++++++++ py-polars/tests/unit/dataframe/test_vstack.py | 14 ++++ 4 files changed, 125 insertions(+), 65 deletions(-) create mode 100644 py-polars/tests/unit/dataframe/test_extend.py diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 6ce26c79211f..f86fbb4c45b1 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5827,6 +5827,10 @@ def vstack(self, other: DataFrame, *, in_place: bool = False) -> Self: in_place Modify in place. + See Also + -------- + extend + Examples -------- >>> df1 = pl.DataFrame( @@ -5874,26 +5878,36 @@ def extend(self, other: DataFrame) -> Self: """ Extend the memory backed by this `DataFrame` with the values from `other`. - Different from `vstack` which adds the chunks from `other` to the chunks of this - `DataFrame` `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation. + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. If this does not cause a reallocation, the resulting data structure will not have any extra chunks and thus will yield faster queries. - Prefer `extend` over `vstack` when you want to do a query after a single append. - For instance during online operations where you add `n` rows and rerun a query. 
+ Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. - Prefer `vstack` over `extend` when you want to append many times before doing a - query. For instance when you read in multiple files and when to store them in a - single `DataFrame`. In the latter case, finish the sequence of `vstack` - operations with a `rechunk`. + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. Parameters ---------- other DataFrame to vertically add. + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + Examples -------- >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) @@ -5914,7 +5928,13 @@ def extend(self, other: DataFrame) -> Self: └─────┴─────┘ """ - self._df.extend(other._df) + try: + self._df.extend(other._df) + except RuntimeError as exc: + if str(exc) == "Already mutably borrowed": + self._df.extend(other._df.clone()) + else: + raise exc return self def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index a2056ee74bef..6739a9a6d72f 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -704,61 +704,6 @@ def test_hstack_dataframe(in_place: bool) -> None: assert_frame_equal(df_out, expected) -def test_extend() -> None: - with pl.StringCache(): - df1 = pl.DataFrame( - { - "foo": [1, 2], - "bar": [True, False], - "ham": ["a", "b"], - "cat": ["A", "B"], - "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)], - } - ).with_columns( - [ - pl.col("cat").cast(pl.Categorical), - ] - ) - df2 = pl.DataFrame( - { - "foo": [3, 4], - "bar": [True, None], - "ham": ["c", "d"], - "cat": ["C", "B"], - "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)], - } - ).with_columns( - [ - pl.col("cat").cast(pl.Categorical), - ] - ) - - df1.extend(df2) - expected = pl.DataFrame( - { - "foo": [1, 2, 3, 4], - "bar": [True, False, True, None], - "ham": ["a", "b", "c", "d"], - "cat": ["A", "B", "C", "B"], - "dates": [ - datetime(2021, 1, 1), - datetime(2021, 2, 1), - datetime(2022, 9, 1), - datetime(2021, 2, 1), - ], - } - ).with_columns( - pl.col("cat").cast(pl.Categorical), - ) - assert_frame_equal(df1, expected) - - # 8745 - df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}]) - df = df[:-1] - tail = pl.DataFrame([{"age": 8}]) - assert df.extend(tail).to_dict(False) == {"age": [1, 2, 8]} - - def test_file_buffer() -> None: f = BytesIO() f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12") diff --git a/py-polars/tests/unit/dataframe/test_extend.py b/py-polars/tests/unit/dataframe/test_extend.py new file mode 100644 index 000000000000..08359cc85c19 --- /dev/null +++ b/py-polars/tests/unit/dataframe/test_extend.py @@ -0,0 +1,81 @@ +from datetime import datetime + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +def test_extend_various_dtypes() -> None: + with pl.StringCache(): + df1 = pl.DataFrame( + { + "foo": [1, 2], + "bar": [True, False], + "ham": ["a", "b"], + "cat": ["A", "B"], + "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)], + }, + schema_overrides={"cat": 
pl.Categorical}, + ) + df2 = pl.DataFrame( + { + "foo": [3, 4], + "bar": [True, None], + "ham": ["c", "d"], + "cat": ["C", "B"], + "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)], + }, + schema_overrides={"cat": pl.Categorical}, + ) + + df1.extend(df2) + + expected = pl.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": [True, False, True, None], + "ham": ["a", "b", "c", "d"], + "cat": ["A", "B", "C", "B"], + "dates": [ + datetime(2021, 1, 1), + datetime(2021, 2, 1), + datetime(2022, 9, 1), + datetime(2021, 2, 1), + ], + }, + schema_overrides={"cat": pl.Categorical}, + ) + assert_frame_equal(df1, expected) + + +def test_extend_slice_offset_8745() -> None: + df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}]) + df = df[:-1] + tail = pl.DataFrame([{"age": 8}]) + assert df.extend(tail).to_dict(False) == {"age": [1, 2, 8]} + + +def test_extend_self() -> None: + df = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + + df.extend(df) + + expected = pl.DataFrame({"a": [1, 2, 1, 2], "b": [True, False, True, False]}) + assert_frame_equal(df, expected) + + +def test_extend_column_number_mismatch() -> None: + df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + df2 = df1.drop("a") + + with pytest.raises(pl.ShapeError): + df1.extend(df2) + + +def test_extend_column_name_mismatch() -> None: + df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + df2 = df1.with_columns(pl.col("a").alias("c")) + + with pytest.raises(pl.ShapeError): + df1.extend(df2) diff --git a/py-polars/tests/unit/dataframe/test_vstack.py b/py-polars/tests/unit/dataframe/test_vstack.py index ecf88a2f987f..504ae9a24b97 100644 --- a/py-polars/tests/unit/dataframe/test_vstack.py +++ b/py-polars/tests/unit/dataframe/test_vstack.py @@ -44,3 +44,17 @@ def test_vstack_self_in_place(df1: pl.DataFrame) -> None: {"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]} ) assert_frame_equal(df1, expected) + + +def test_vstack_column_number_mismatch(df1: pl.DataFrame) -> None: + df2 = df1.drop("ham") + + with pytest.raises(pl.ShapeError): + df1.vstack(df2) + + +def test_vstack_column_name_mismatch(df1: pl.DataFrame) -> None: + df2 = df1.with_columns(pl.col("foo").alias("oof")) + + with pytest.raises(pl.ShapeError): + df1.vstack(df2) From f93e79665edb78dbe7e950187c6b02d15f1232e1 Mon Sep 17 00:00:00 2001 From: Marshall Date: Sun, 16 Jul 2023 16:39:45 -0400 Subject: [PATCH 36/37] refactor(python): deprecate `bins` argument and rename to `breaks` in `Series.cut` (#9913) --- py-polars/polars/expr/expr.py | 2 +- py-polars/polars/series/series.py | 15 ++++++++------- .../tests/unit/operations/test_statistics.py | 8 ++++---- py-polars/tests/unit/test_errors.py | 4 +++- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 3bacfff59766..f113a20cba29 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3298,7 +3298,7 @@ def cut( breaks A list of unique cut points. labels - Labels to assign to bins. If given, the length must be len(probs) + 1. + Labels to assign to bins. If given, the length must be len(breaks) + 1. 
left_closed Whether intervals should be [) instead of the default of (] include_breaks diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 55d6c7fed39a..594a6ef54ba4 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1604,9 +1604,10 @@ def to_dummies(self, separator: str = "_") -> DataFrame: """ return wrap_df(self._s.to_dummies(separator)) + @deprecated_alias(bins="breaks") def cut( self, - bins: list[float], + breaks: list[float], labels: list[str] | None = None, break_point_label: str = "break_point", category_label: str = "category", @@ -1620,11 +1621,11 @@ def cut( Parameters ---------- - bins - Bins to create. + breaks + A list of unique cut points. labels Labels to assign to the bins. If given the length of labels must be - len(bins) + 1. + len(breaks) + 1. break_point_label Name given to the breakpoint column/field. Only used if series == False or include_breaks == True @@ -1707,14 +1708,14 @@ def cut( return ( self.to_frame() .with_columns( - F.col(n).cut(bins, labels, left_closed, True).alias(n + "_bin") + F.col(n).cut(breaks, labels, left_closed, True).alias(n + "_bin") ) .unnest(n + "_bin") .rename({"brk": break_point_label, n + "_bin": category_label}) ) res = ( self.to_frame() - .select(F.col(n).cut(bins, labels, left_closed, include_breaks)) + .select(F.col(n).cut(breaks, labels, left_closed, include_breaks)) .to_series() ) if include_breaks: @@ -1743,7 +1744,7 @@ def qcut( We expect quantiles ``0.0 <= quantile <= 1`` labels Labels to assign to the quantiles. If given the length of labels must be - len(bins) + 1. + len(breaks) + 1. break_point_label Name given to the breakpoint column/field. Only used if series == False or include_breaks == True diff --git a/py-polars/tests/unit/operations/test_statistics.py b/py-polars/tests/unit/operations/test_statistics.py index 12d578d6f8b9..e052862b9b06 100644 --- a/py-polars/tests/unit/operations/test_statistics.py +++ b/py-polars/tests/unit/operations/test_statistics.py @@ -27,7 +27,7 @@ def test_corr() -> None: def test_cut() -> None: a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) - out = cast(pl.DataFrame, a.cut(bins=[-1, 1], series=False)) + out = cast(pl.DataFrame, a.cut(breaks=[-1, 1], series=False)) assert out.shape == (12, 3) assert out.filter(pl.col("break_point") < 1e9).to_dict(False) == { @@ -50,7 +50,7 @@ def test_cut() -> None: inf = float("inf") df = pl.DataFrame({"a": list(range(5))}) ser = df.select("a").to_series() - assert cast(pl.DataFrame, ser.cut(bins=[-1, 1], series=False)).rows() == [ + assert cast(pl.DataFrame, ser.cut(breaks=[-1, 1], series=False)).rows() == [ (0.0, 1.0, "(-1, 1]"), (1.0, 1.0, "(-1, 1]"), (2.0, inf, "(1, inf]"), @@ -78,8 +78,8 @@ def test_cut() -> None: ) np.random.seed(1) a = pl.Series("a", np.random.randint(0, 10, 10)) - out = cast(pl.DataFrame, a.cut(bins=[-1, 1], series=False)) - out_s = cast(pl.Series, a.cut(bins=[-1, 1], series=True)) + out = cast(pl.DataFrame, a.cut(breaks=[-1, 1], series=False)) + out_s = cast(pl.Series, a.cut(breaks=[-1, 1], series=True)) assert out["a"].cast(int).series_equal(a) # Compare strings and categoricals without a hassle assert_frame_equal(expected_df, out, check_dtype=False) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 6a551d3e98e2..afb443272591 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -43,7 +43,9 @@ def test_error_on_reducing_map() -> None: ), ): df.select( - 
pl.col("x").map(lambda x: x.cut(bins=[1, 2, 3], series=False)).over("group") + pl.col("x") + .map(lambda x: x.cut(breaks=[1, 2, 3], series=False)) + .over("group") ) From f5a8c6cf070d6e269490833587fdb17bb532a97a Mon Sep 17 00:00:00 2001 From: Ray Zhang Date: Mon, 17 Jul 2023 01:46:15 -0400 Subject: [PATCH 37/37] feat(rust, python): Add cloudpickle for serializing python UDFs (#9921) --- .../polars-lazy/polars-plan/src/dsl/python_udf.rs | 14 +++++++++----- py-polars/polars/utils/show_versions.py | 1 + py-polars/pyproject.toml | 3 ++- py-polars/requirements-dev.txt | 1 + py-polars/tests/unit/test_serde.py | 14 ++++++++++++++ 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/dsl/python_udf.rs b/polars/polars-lazy/polars-plan/src/dsl/python_udf.rs index e5d73d696869..46bf0d97795b 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/python_udf.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/python_udf.rs @@ -56,8 +56,9 @@ impl Serialize for PythonFunction { S: Serializer, { Python::with_gil(|py| { - let pickle = PyModule::import(py, "pickle") - .expect("Unable to import 'pickle'") + let pickle = PyModule::import(py, "cloudpickle") + .or(PyModule::import(py, "pickle")) + .expect("Unable to import 'cloudpickle' or 'pickle'") .getattr("dumps") .unwrap(); @@ -83,7 +84,8 @@ impl<'a> Deserialize<'a> for PythonFunction { let bytes = Vec::::deserialize(deserializer)?; Python::with_gil(|py| { - let pickle = PyModule::import(py, "pickle") + let pickle = PyModule::import(py, "cloudpickle") + .or(PyModule::import(py, "pickle")) .expect("Unable to import 'pickle'") .getattr("loads") .unwrap(); @@ -122,7 +124,8 @@ impl PythonUdfExpression { let remainder = &buf[reader.position() as usize..]; Python::with_gil(|py| { - let pickle = PyModule::import(py, "pickle") + let pickle = PyModule::import(py, "cloudpickle") + .or(PyModule::import(py, "pickle")) .expect("Unable to import 'pickle'") .getattr("loads") .unwrap(); @@ -169,7 +172,8 @@ impl SeriesUdf for PythonUdfExpression { ciborium::ser::into_writer(&self.output_type, &mut *buf).unwrap(); Python::with_gil(|py| { - let pickle = PyModule::import(py, "pickle") + let pickle = PyModule::import(py, "cloudpickle") + .or(PyModule::import(py, "pickle")) .expect("Unable to import 'pickle'") .getattr("dumps") .unwrap(); diff --git a/py-polars/polars/utils/show_versions.py b/py-polars/polars/utils/show_versions.py index f34db533ec26..2f7ff9ed42f6 100644 --- a/py-polars/polars/utils/show_versions.py +++ b/py-polars/polars/utils/show_versions.py @@ -59,6 +59,7 @@ def _get_dependency_info() -> dict[str, str]: # see the list of dependencies in pyproject.toml opt_deps = [ "adbc_driver_sqlite", + "cloudpickle", "connectorx", "deltalake", "fsspec", diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 724f6638ce6e..b6a615424d50 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -51,8 +51,9 @@ pydantic = ["pydantic"] sqlalchemy = ["sqlalchemy", "pandas"] xlsxwriter = ["xlsxwriter"] adbc = ["adbc_driver_sqlite"] +cloudpickle = ["cloudpickle"] all = [ - "polars[pyarrow,pandas,numpy,fsspec,connectorx,xlsx2csv,deltalake,timezone,matplotlib,pydantic,sqlalchemy,xlsxwriter,adbc]", + "polars[pyarrow,pandas,numpy,fsspec,connectorx,xlsx2csv,deltalake,timezone,matplotlib,pydantic,sqlalchemy,xlsxwriter,adbc,cloudpickle]", ] [tool.mypy] diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 1d05bc42f0bc..96d05a38d370 100644 --- a/py-polars/requirements-dev.txt +++ 
b/py-polars/requirements-dev.txt @@ -17,6 +17,7 @@ xlsx2csv XlsxWriter adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows' connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released +cloudpickle # Tooling hypothesis==6.79.4; python_version < '3.8' diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index 0734a8eb9b5a..fe5939e1bc56 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -152,3 +152,17 @@ def test_pickle_lazyframe_udf() -> None: q = pickle.loads(b) assert q.collect()["a"].to_list() == [2, 4, 6] + + +def test_pickle_lazyframe_nested_function_udf() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + + # NOTE: This is only possible when we're using cloudpickle. + def inner_df_times2(df: pl.DataFrame) -> pl.DataFrame: + return df.select(pl.all() * 2) + + q = df.lazy().map(inner_df_times2) + b = pickle.dumps(q) + + q = pickle.loads(b) + assert q.collect()["a"].to_list() == [2, 4, 6]
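
A brief illustration of the `extend` vs `vstack` guidance in the docstring above, sketched with invented frame contents and assuming a polars build that includes these changes:

    import polars as pl

    # Single append followed immediately by a query: prefer `extend`, which
    # copies `other`'s data into the existing chunks of `df`, so the query
    # that follows scans contiguous memory.
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
    df.extend(pl.DataFrame({"foo": [10], "bar": [7]}))
    print(df.select(pl.col("foo").sum()))

    # Many appends before any query: prefer `vstack`, which only links new
    # chunks without copying, then finish with a single `rechunk`.
    parts = [pl.DataFrame({"foo": [i], "bar": [i * 2]}) for i in range(5)]
    acc = parts[0]
    for part in parts[1:]:
        acc.vstack(part, in_place=True)
    acc = acc.rechunk()

The trade-off: `extend` pays the copy up front so later queries avoid chunked scans, while `vstack` defers all copying to the final `rechunk`.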
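
The try/except added to `DataFrame.extend` exists for the self-extend case: `df.extend(df)` needs mutable access to the Rust-side frame on the left and shared access to the same frame on the right, which PyO3 surfaces as the RuntimeError "Already mutably borrowed"; the patch retries with a clone of the right-hand side. A minimal sketch of the behavior the new `test_extend_self` test pins down:

    import polars as pl

    df = pl.DataFrame({"a": [1, 2], "b": [True, False]})

    # With the borrow guard in place this appends the frame's own rows
    # in place instead of raising "Already mutably borrowed".
    df.extend(df)
    assert df.to_dict(False) == {"a": [1, 2, 1, 2], "b": [True, False, True, False]}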
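
For [PATCH 36/37], the `deprecated_alias(bins="breaks")` decorator keeps the old keyword working while steering callers to the new name. A sketch of the migration path; the warning class checked below is an assumption, based on polars' deprecation helpers conventionally emitting `DeprecationWarning`:

    import warnings

    import polars as pl

    s = pl.Series("a", [-2.0, 0.0, 2.0])

    # New spelling: `breaks` is the list of unique cut points.
    new = s.cut(breaks=[-1.0, 1.0], series=True)

    # Old spelling still works through the alias decorator, but warns.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old = s.cut(bins=[-1.0, 1.0], series=True)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # `cut` returns a Categorical series; compare via Utf8 to avoid
    # string-cache mismatches between the two results.
    assert old.cast(pl.Utf8).series_equal(new.cast(pl.Utf8))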
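
For [PATCH 37/37], the reason `cloudpickle` is tried before `pickle`: the standard library serializes functions by reference (module plus qualified name), so a UDF defined inside another function, like `inner_df_times2` in the new serde test, cannot be pickled at all, whereas cloudpickle serializes the function body by value. A self-contained sketch of that difference, independent of polars; it assumes `cloudpickle` is installed, which the new requirements entry provides:

    import pickle

    import cloudpickle

    def make_udf():
        def times2(x):  # nested, so it has no importable qualified name
            return x * 2

        return times2

    udf = make_udf()

    # Plain pickle records only a module/qualname reference and fails here.
    try:
        pickle.dumps(udf)
    except (AttributeError, pickle.PicklingError):
        print("pickle cannot serialize the nested function")

    # cloudpickle ships the code object itself, so it round-trips.
    restored = cloudpickle.loads(cloudpickle.dumps(udf))
    assert restored(21) == 42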