From 18e903c4fae06abc2a20344146eae3dfeae53f6b Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sat, 28 Dec 2024 00:17:10 +0100 Subject: [PATCH] improve following comment --- .../src/chunked_array/strings/normalize.rs | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/normalize.rs b/crates/polars-ops/src/chunked_array/strings/normalize.rs index 7f974b7bdde2..29c4939d5679 100644 --- a/crates/polars-ops/src/chunked_array/strings/normalize.rs +++ b/crates/polars-ops/src/chunked_array/strings/normalize.rs @@ -1,5 +1,4 @@ -use polars_core::prelude::arity::unary_elementwise; -use polars_core::prelude::StringChunked; +use polars_core::prelude::{StringChunked, StringChunkedBuilder}; use unicode_normalization::UnicodeNormalization; #[derive(Clone, Eq, PartialEq, Hash, Debug)] @@ -11,13 +10,29 @@ pub enum UnicodeForm { NFKD, } +pub fn normalize_with<F: Fn(&str, &mut String)>( + ca: &StringChunked, + normalizer: F, +) -> StringChunked { + let mut buffer = String::new(); + let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len()); + for opt_s in ca.iter() { + if let Some(s) = opt_s { + buffer.clear(); + normalizer(s, &mut buffer); + builder.append_value(&buffer); + } else { + builder.append_null(); + } + } + builder.finish() +} + pub fn normalize(ca: &StringChunked, form: UnicodeForm) -> StringChunked { - unary_elementwise(ca, |val| { - val.map(|x| match form { - UnicodeForm::NFC => x.nfc().collect::<String>(), - UnicodeForm::NFKC => x.nfkc().collect::<String>(), - UnicodeForm::NFD => x.nfd().collect::<String>(), - UnicodeForm::NFKD => x.nfkd().collect::<String>(), - }) - }) + match form { + UnicodeForm::NFC => normalize_with(ca, |s, b| b.extend(s.nfc())), + UnicodeForm::NFKC => normalize_with(ca, |s, b| b.extend(s.nfkc())), + UnicodeForm::NFD => normalize_with(ca, |s, b| b.extend(s.nfd())), + UnicodeForm::NFKD => normalize_with(ca, |s, b| b.extend(s.nfkd())), + } }