diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index de73bd294c523..774eedfde53cf 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -2,6 +2,8 @@ use base64::engine::general_purpose; #[cfg(feature = "string_encoding")] use base64::Engine as _; +#[cfg(feature = "dtype-struct")] +use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array}; use polars_arrow::kernels::string::*; #[cfg(feature = "string_from_radix")] use polars_core::export::num::Num; @@ -376,6 +378,119 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { } } + #[cfg(feature = "dtype-struct")] + fn split_exact(&self, by: &str, n: usize) -> PolarsResult { + let ca = self.as_utf8(); + + let mut arrs = (0..n + 1) + .map(|_| MutableUtf8Array::::with_capacity(ca.len())) + .collect::>(); + + ca.for_each(|opt_s| match opt_s { + None => { + for arr in &mut arrs { + arr.push_null() + } + }, + Some(s) => { + let mut arr_iter = arrs.iter_mut(); + let split_iter = s.split(by); + (split_iter) + .zip(&mut arr_iter) + .for_each(|(splitted, arr)| arr.push(Some(splitted))); + // fill the remaining with null + for arr in arr_iter { + arr.push_null() + } + }, + }); + + let fields = arrs + .into_iter() + .enumerate() + .map(|(i, mut arr)| { + Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() + }) + .collect::>(); + + StructChunked::new(ca.name(), &fields) + } + + #[cfg(feature = "dtype-struct")] + fn split_exact_inclusive(&self, by: &str, n: usize) -> PolarsResult { + let ca = self.as_utf8(); + + let mut arrs = (0..n + 1) + .map(|_| MutableUtf8Array::::with_capacity(ca.len())) + .collect::>(); + + ca.for_each(|opt_s| match opt_s { + None => { + for arr in &mut arrs { + arr.push_null() + } + }, + Some(s) => { + let mut arr_iter = arrs.iter_mut(); + let split_iter = s.split_inclusive(by); + (split_iter) + .zip(&mut arr_iter) + .for_each(|(splitted, arr)| arr.push(Some(splitted))); + // fill the remaining with null + for arr in arr_iter { + arr.push_null() + } + }, + }); + + let fields = arrs + .into_iter() + .enumerate() + .map(|(i, mut arr)| { + Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() + }) + .collect::>(); + + StructChunked::new(ca.name(), &fields) + } + + #[cfg(feature = "dtype-struct")] + fn splitn(&self, by: &str, n: usize) -> PolarsResult { + let ca = self.as_utf8(); + + let mut arrs = (0..n) + .map(|_| MutableUtf8Array::::with_capacity(ca.len())) + .collect::>(); + + ca.for_each(|opt_s| match opt_s { + None => { + for arr in &mut arrs { + arr.push_null() + } + }, + Some(s) => { + let mut arr_iter = arrs.iter_mut(); + let split_iter = s.splitn(n, &by); + (split_iter) + .zip(&mut arr_iter) + .for_each(|(splitted, arr)| arr.push(Some(splitted))); + // fill the remaining with null + for arr in arr_iter { + arr.push_null() + } + }, + }); + let fields = arrs + .into_iter() + .enumerate() + .map(|(i, mut arr)| { + Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() + }) + .collect::>(); + + StructChunked::new(ca.name(), &fields) + } + fn split(&self, by: &str) -> ListChunked { let ca = self.as_utf8(); let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index ebc72544e71a9..8eed67a0229ec 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -726,6 +726,12 @@ impl From for SpecialEq> { SplitInclusive => { map_as_slice!(strings::split_inclusive) }, + #[cfg(feature = "dtype-struct")] + SplitExact { by, n } => map!(strings::split_exact, &by, n), + #[cfg(feature = "dtype-struct")] + SplitExactInclusive { by, n } => map!(strings::split_exact_inclusive, &by, n), + #[cfg(feature = "dtype-struct")] + SplitN { by, n } => map!(strings::splitn, &by, n), #[cfg(feature = "concat_str")] ConcatVertical(delimiter) => map!(strings::concat, &delimiter), #[cfg(feature = "concat_str")] diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 12260dba49dd3..269de89f4cf3c 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -12,6 +12,9 @@ use serde::{Deserialize, Serialize}; static TZ_AWARE_RE: Lazy = Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap()); +#[cfg(feature = "dtype-struct")] +use polars_utils::format_smartstring; + use super::*; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -73,6 +76,21 @@ pub enum StringFunction { StripCharsEnd(Option), StripPrefix, StripSuffix, + #[cfg(feature = "dtype-struct")] + SplitExact { + by: String, + n: usize, + }, + #[cfg(feature = "dtype-struct")] + SplitExactInclusive { + by: String, + n: usize, + }, + #[cfg(feature = "dtype-struct")] + SplitN { + by: String, + n: usize, + }, #[cfg(feature = "temporal")] Strptime(DataType, StrptimeOptions), Split, @@ -126,6 +144,22 @@ impl StringFunction { | Slice(_, _) => mapper.with_same_dtype(), #[cfg(feature = "string_justify")] Zfill { .. } | LJust { .. } | RJust { .. } => mapper.with_same_dtype(), + #[cfg(feature = "dtype-struct")] + SplitExact { by: _, n } | SplitExactInclusive { by: _, n } => { + mapper.with_dtype(DataType::Struct( + (0..n + 1) + .map(|i| { + Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) + }) + .collect(), + )) + }, + #[cfg(feature = "dtype-struct")] + SplitN { by: _, n } => mapper.with_dtype(DataType::Struct( + (0..*n) + .map(|i| Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8)) + .collect(), + )), } } } @@ -166,6 +200,12 @@ impl Display for StringFunction { StringFunction::StripCharsEnd(_) => "strip_chars_end", StringFunction::StripPrefix => "strip_prefix", StringFunction::StripSuffix => "strip_suffix", + #[cfg(feature = "dtype-struct")] + StringFunction::SplitExact { .. } => "split_exact", + #[cfg(feature = "dtype-struct")] + StringFunction::SplitExactInclusive { .. } => "split_exact_inclusive", + #[cfg(feature = "dtype-struct")] + StringFunction::SplitN { .. } => "splitn", #[cfg(feature = "temporal")] StringFunction::Strptime(_, _) => "strptime", StringFunction::Split => "split", @@ -393,6 +433,24 @@ pub(super) fn strptime( } } +#[cfg(feature = "dtype-struct")] +pub(super) fn split_exact(s: &Series, by: &str, n: usize) -> PolarsResult { + let ca = s.utf8()?; + ca.split_exact(by, n).map(|ca| ca.into_series()) +} + +#[cfg(feature = "dtype-struct")] +pub(super) fn split_exact_inclusive(s: &Series, by: &str, n: usize) -> PolarsResult { + let ca = s.utf8()?; + ca.split_exact_inclusive(by, n).map(|ca| ca.into_series()) +} + +#[cfg(feature = "dtype-struct")] +pub(super) fn splitn(s: &Series, by: &str, n: usize) -> PolarsResult { + let ca = s.utf8()?; + ca.splitn(by, n).map(|ca| ca.into_series()) +} + pub(super) fn split(s: &[Series]) -> PolarsResult { let ca = s[0].utf8()?; let by = s[1].utf8()?; diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 1022d87bed528..382e9b4694dbb 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -1,8 +1,3 @@ -#[cfg(feature = "dtype-struct")] -use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array}; -#[cfg(feature = "dtype-struct")] -use polars_utils::format_smartstring; - use super::function_expr::StringFunction; use super::*; /// Specialized expressions for [`Series`] of [`DataType::Utf8`]. @@ -199,18 +194,12 @@ impl StringNameSpace { /// * `delimiter` - A string that will act as delimiter between values. #[cfg(feature = "concat_str")] pub fn concat(self, delimiter: &str) -> Expr { - let delimiter = delimiter.to_owned(); - - Expr::Function { - input: vec![self.0], - function: StringFunction::ConcatVertical(delimiter).into(), - options: FunctionOptions { - collect_groups: ApplyOptions::ApplyGroups, - input_wildcard_expansion: false, - auto_explode: true, - ..Default::default() - }, - } + self.0 + .apply_private(StringFunction::ConcatVertical(delimiter.to_owned()).into()) + .with_function_options(|mut options| { + options.auto_explode = true; + options + }) } /// Split the string by a substring. The resulting dtype is `List`. @@ -230,52 +219,8 @@ impl StringNameSpace { pub fn split_exact(self, by: &str, n: usize) -> Expr { let by = by.to_string(); - let function = move |s: Series| { - let ca = s.utf8()?; - - let mut arrs = (0..n + 1) - .map(|_| MutableUtf8Array::::with_capacity(ca.len())) - .collect::>(); - - ca.into_iter().for_each(|opt_s| match opt_s { - None => { - for arr in &mut arrs { - arr.push_null() - } - }, - Some(s) => { - let mut arr_iter = arrs.iter_mut(); - let split_iter = s.split(&by); - (split_iter) - .zip(&mut arr_iter) - .for_each(|(splitted, arr)| arr.push(Some(splitted))); - // fill the remaining with null - for arr in arr_iter { - arr.push_null() - } - }, - }); - let fields = arrs - .into_iter() - .enumerate() - .map(|(i, mut arr)| { - Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() - }) - .collect::>(); - Ok(Some(StructChunked::new(ca.name(), &fields)?.into_series())) - }; self.0 - .map( - function, - GetOutput::from_type(DataType::Struct( - (0..n + 1) - .map(|i| { - Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) - }) - .collect(), - )), - ) - .with_fmt("str.split_exact") + .map_private(StringFunction::SplitExact { by, n }.into()) } #[cfg(feature = "dtype-struct")] @@ -284,52 +229,8 @@ impl StringNameSpace { pub fn split_exact_inclusive(self, by: &str, n: usize) -> Expr { let by = by.to_string(); - let function = move |s: Series| { - let ca = s.utf8()?; - - let mut arrs = (0..n + 1) - .map(|_| MutableUtf8Array::::with_capacity(ca.len())) - .collect::>(); - - ca.into_iter().for_each(|opt_s| match opt_s { - None => { - for arr in &mut arrs { - arr.push_null() - } - }, - Some(s) => { - let mut arr_iter = arrs.iter_mut(); - let split_iter = s.split_inclusive(&by); - (split_iter) - .zip(&mut arr_iter) - .for_each(|(splitted, arr)| arr.push(Some(splitted))); - // fill the remaining with null - for arr in arr_iter { - arr.push_null() - } - }, - }); - let fields = arrs - .into_iter() - .enumerate() - .map(|(i, mut arr)| { - Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() - }) - .collect::>(); - Ok(Some(StructChunked::new(ca.name(), &fields)?.into_series())) - }; self.0 - .map( - function, - GetOutput::from_type(DataType::Struct( - (0..n + 1) - .map(|i| { - Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) - }) - .collect(), - )), - ) - .with_fmt("str.split_exact") + .map_private(StringFunction::SplitExactInclusive { by, n }.into()) } #[cfg(feature = "dtype-struct")] @@ -338,52 +239,7 @@ impl StringNameSpace { pub fn splitn(self, by: &str, n: usize) -> Expr { let by = by.to_string(); - let function = move |s: Series| { - let ca = s.utf8()?; - - let mut arrs = (0..n) - .map(|_| MutableUtf8Array::::with_capacity(ca.len())) - .collect::>(); - - ca.into_iter().for_each(|opt_s| match opt_s { - None => { - for arr in &mut arrs { - arr.push_null() - } - }, - Some(s) => { - let mut arr_iter = arrs.iter_mut(); - let split_iter = s.splitn(n, &by); - (split_iter) - .zip(&mut arr_iter) - .for_each(|(splitted, arr)| arr.push(Some(splitted))); - // fill the remaining with null - for arr in arr_iter { - arr.push_null() - } - }, - }); - let fields = arrs - .into_iter() - .enumerate() - .map(|(i, mut arr)| { - Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() - }) - .collect::>(); - Ok(Some(StructChunked::new(ca.name(), &fields)?.into_series())) - }; - self.0 - .map( - function, - GetOutput::from_type(DataType::Struct( - (0..n) - .map(|i| { - Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) - }) - .collect(), - )), - ) - .with_fmt("str.splitn") + self.0.map_private(StringFunction::SplitN { by, n }.into()) } #[cfg(feature = "regex")]