Skip to content

Commit

Permalink
refactor(rust): bind all remaining method StringNameSpace to function…
Browse files Browse the repository at this point in the history
… expr
  • Loading branch information
reswqa committed Sep 21, 2023
1 parent e739743 commit 7568525
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 153 deletions.
115 changes: 115 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
use base64::engine::general_purpose;
#[cfg(feature = "string_encoding")]
use base64::Engine as _;
#[cfg(feature = "dtype-struct")]
use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array};
use polars_arrow::kernels::string::*;
#[cfg(feature = "string_from_radix")]
use polars_core::export::num::Num;
Expand Down Expand Up @@ -376,6 +378,119 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
}
}

/// Splits every string on `by` into a struct with `n + 1` Utf8 fields.
///
/// The fields are named `field_0` through `field_n`. For each input row the
/// k-th piece produced by `str::split` is written to the k-th field; pieces
/// beyond the last field are discarded, and fields with no matching piece
/// are set to null. A null input row yields null in every field.
///
/// # Errors
/// Returns an error if a field array cannot be converted into a `Series`
/// or if `StructChunked::new` rejects the assembled fields.
#[cfg(feature = "dtype-struct")]
fn split_exact(&self, by: &str, n: usize) -> PolarsResult<StructChunked> {
    let ca = self.as_utf8();

    // One output column per struct field, each pre-sized to the row count.
    let mut columns = (0..n + 1)
        .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
        .collect::<Vec<_>>();

    ca.for_each(|opt_s| match opt_s {
        None => {
            for column in &mut columns {
                column.push_null()
            }
        },
        Some(s) => {
            let mut remaining = columns.iter_mut();
            // The split iterator must stay on the left-hand side of `zip`:
            // zip polls the left side first, so once the pieces run out no
            // column is consumed and `remaining` still holds every column
            // that needs a null for this row.
            s.split(by)
                .zip(&mut remaining)
                .for_each(|(piece, column)| column.push(Some(piece)));
            // Fill the columns that received no piece with null.
            for column in remaining {
                column.push_null()
            }
        },
    });

    // Propagate conversion failures instead of panicking.
    let fields = columns
        .into_iter()
        .enumerate()
        .map(|(i, mut column)| {
            Series::try_from((format!("field_{i}").as_str(), column.as_box()))
        })
        .collect::<PolarsResult<Vec<_>>>()?;

    StructChunked::new(ca.name(), &fields)
}

/// Splits every string on `by` into a struct with `n + 1` Utf8 fields,
/// keeping the separator at the end of each piece.
///
/// The fields are named `field_0` through `field_n`. For each input row the
/// k-th piece produced by `str::split_inclusive` is written to the k-th
/// field; pieces beyond the last field are discarded, and fields with no
/// matching piece are set to null. A null input row yields null in every
/// field.
///
/// # Errors
/// Returns an error if a field array cannot be converted into a `Series`
/// or if `StructChunked::new` rejects the assembled fields.
#[cfg(feature = "dtype-struct")]
fn split_exact_inclusive(&self, by: &str, n: usize) -> PolarsResult<StructChunked> {
    let ca = self.as_utf8();

    // One output column per struct field, each pre-sized to the row count.
    let mut columns = (0..n + 1)
        .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
        .collect::<Vec<_>>();

    ca.for_each(|opt_s| match opt_s {
        None => {
            for column in &mut columns {
                column.push_null()
            }
        },
        Some(s) => {
            let mut remaining = columns.iter_mut();
            // The split iterator must stay on the left-hand side of `zip`:
            // zip polls the left side first, so once the pieces run out no
            // column is consumed and `remaining` still holds every column
            // that needs a null for this row.
            s.split_inclusive(by)
                .zip(&mut remaining)
                .for_each(|(piece, column)| column.push(Some(piece)));
            // Fill the columns that received no piece with null.
            for column in remaining {
                column.push_null()
            }
        },
    });

    // Propagate conversion failures instead of panicking.
    let fields = columns
        .into_iter()
        .enumerate()
        .map(|(i, mut column)| {
            Series::try_from((format!("field_{i}").as_str(), column.as_box()))
        })
        .collect::<PolarsResult<Vec<_>>>()?;

    StructChunked::new(ca.name(), &fields)
}

/// Splits every string on `by` into at most `n` pieces and returns them as
/// a struct with `n` Utf8 fields.
///
/// The fields are named `field_0` through `field_{n-1}`. Pieces come from
/// `str::splitn(n, by)`, so the last piece holds the unsplit remainder of
/// the string. Fields with no matching piece are set to null, and a null
/// input row yields null in every field.
///
/// # Errors
/// Returns an error if a field array cannot be converted into a `Series`
/// or if `StructChunked::new` rejects the assembled fields.
#[cfg(feature = "dtype-struct")]
fn splitn(&self, by: &str, n: usize) -> PolarsResult<StructChunked> {
    let ca = self.as_utf8();

    // One output column per struct field, each pre-sized to the row count.
    let mut columns = (0..n)
        .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
        .collect::<Vec<_>>();

    ca.for_each(|opt_s| match opt_s {
        None => {
            for column in &mut columns {
                column.push_null()
            }
        },
        Some(s) => {
            let mut remaining = columns.iter_mut();
            // The split iterator must stay on the left-hand side of `zip`:
            // zip polls the left side first, so once the pieces run out no
            // column is consumed and `remaining` still holds every column
            // that needs a null for this row.
            // `by` is already a `&str`, so it is passed without re-borrowing.
            s.splitn(n, by)
                .zip(&mut remaining)
                .for_each(|(piece, column)| column.push(Some(piece)));
            // Fill the columns that received no piece with null.
            for column in remaining {
                column.push_null()
            }
        },
    });

    // Propagate conversion failures instead of panicking.
    let fields = columns
        .into_iter()
        .enumerate()
        .map(|(i, mut column)| {
            Series::try_from((format!("field_{i}").as_str(), column.as_box()))
        })
        .collect::<PolarsResult<Vec<_>>>()?;

    StructChunked::new(ca.name(), &fields)
}

fn split(&self, by: &str) -> ListChunked {
let ca = self.as_utf8();
let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size());
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,12 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
SplitInclusive => {
map_as_slice!(strings::split_inclusive)
},
#[cfg(feature = "dtype-struct")]
SplitExact { by, n } => map!(strings::split_exact, &by, n),
#[cfg(feature = "dtype-struct")]
SplitExactInclusive { by, n } => map!(strings::split_exact_inclusive, &by, n),
#[cfg(feature = "dtype-struct")]
SplitN { by, n } => map!(strings::splitn, &by, n),
#[cfg(feature = "concat_str")]
ConcatVertical(delimiter) => map!(strings::concat, &delimiter),
#[cfg(feature = "concat_str")]
Expand Down
58 changes: 58 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ use serde::{Deserialize, Serialize};
static TZ_AWARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap());

#[cfg(feature = "dtype-struct")]
use polars_utils::format_smartstring;

use super::*;

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
Expand Down Expand Up @@ -73,6 +76,21 @@ pub enum StringFunction {
StripCharsEnd(Option<String>),
StripPrefix,
StripSuffix,
#[cfg(feature = "dtype-struct")]
SplitExact {
by: String,
n: usize,
},
#[cfg(feature = "dtype-struct")]
SplitExactInclusive {
by: String,
n: usize,
},
#[cfg(feature = "dtype-struct")]
SplitN {
by: String,
n: usize,
},
#[cfg(feature = "temporal")]
Strptime(DataType, StrptimeOptions),
Split,
Expand Down Expand Up @@ -126,6 +144,22 @@ impl StringFunction {
| Slice(_, _) => mapper.with_same_dtype(),
#[cfg(feature = "string_justify")]
Zfill { .. } | LJust { .. } | RJust { .. } => mapper.with_same_dtype(),
#[cfg(feature = "dtype-struct")]
SplitExact { by: _, n } | SplitExactInclusive { by: _, n } => {
mapper.with_dtype(DataType::Struct(
(0..n + 1)
.map(|i| {
Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8)
})
.collect(),
))
},
#[cfg(feature = "dtype-struct")]
SplitN { by: _, n } => mapper.with_dtype(DataType::Struct(
(0..*n)
.map(|i| Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8))
.collect(),
)),
}
}
}
Expand Down Expand Up @@ -166,6 +200,12 @@ impl Display for StringFunction {
StringFunction::StripCharsEnd(_) => "strip_chars_end",
StringFunction::StripPrefix => "strip_prefix",
StringFunction::StripSuffix => "strip_suffix",
#[cfg(feature = "dtype-struct")]
StringFunction::SplitExact { .. } => "split_exact",
#[cfg(feature = "dtype-struct")]
StringFunction::SplitExactInclusive { .. } => "split_exact_inclusive",
#[cfg(feature = "dtype-struct")]
StringFunction::SplitN { .. } => "splitn",
#[cfg(feature = "temporal")]
StringFunction::Strptime(_, _) => "strptime",
StringFunction::Split => "split",
Expand Down Expand Up @@ -393,6 +433,24 @@ pub(super) fn strptime(
}
}

/// Expression-level entry point for `split_exact`: treats `s` as a Utf8
/// column, splits each value on `by` into `n + 1` struct fields, and returns
/// the struct as a `Series`.
///
/// Fails if `s` is not a Utf8 series or if the underlying split fails.
#[cfg(feature = "dtype-struct")]
pub(super) fn split_exact(s: &Series, by: &str, n: usize) -> PolarsResult<Series> {
    let parts = s.utf8()?.split_exact(by, n)?;
    Ok(parts.into_series())
}

/// Expression-level entry point for `split_exact_inclusive`: treats `s` as a
/// Utf8 column, splits each value on `by` (keeping the separator) into
/// struct fields, and returns the struct as a `Series`.
///
/// Fails if `s` is not a Utf8 series or if the underlying split fails.
#[cfg(feature = "dtype-struct")]
pub(super) fn split_exact_inclusive(s: &Series, by: &str, n: usize) -> PolarsResult<Series> {
    let parts = s.utf8()?.split_exact_inclusive(by, n)?;
    Ok(parts.into_series())
}

/// Expression-level entry point for `splitn`: treats `s` as a Utf8 column,
/// splits each value on `by` into at most `n` pieces stored as struct
/// fields, and returns the struct as a `Series`.
///
/// Fails if `s` is not a Utf8 series or if the underlying split fails.
#[cfg(feature = "dtype-struct")]
pub(super) fn splitn(s: &Series, by: &str, n: usize) -> PolarsResult<Series> {
    let parts = s.utf8()?.splitn(by, n)?;
    Ok(parts.into_series())
}

pub(super) fn split(s: &[Series]) -> PolarsResult<Series> {
let ca = s[0].utf8()?;
let by = s[1].utf8()?;
Expand Down
Loading

0 comments on commit 7568525

Please sign in to comment.