Skip to content

Commit

Permalink
improve following comment
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher committed Dec 27, 2024
1 parent dae704c commit 18e903c
Showing 1 changed file with 25 additions and 10 deletions.
35 changes: 25 additions & 10 deletions crates/polars-ops/src/chunked_array/strings/normalize.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use polars_core::prelude::arity::unary_elementwise;
use polars_core::prelude::StringChunked;
use polars_core::prelude::{StringChunked, StringChunkedBuilder};
use unicode_normalization::UnicodeNormalization;

#[derive(Clone, Eq, PartialEq, Hash, Debug)]
Expand All @@ -11,13 +10,29 @@ pub enum UnicodeForm {
NFKD,
}

/// Builds a new `StringChunked` by running `normalizer` over every non-null value of `ca`.
///
/// `normalizer` is called with the input string and a scratch `String` that has just been
/// cleared; whatever the scratch buffer holds when the call returns is appended to the output.
/// Null entries in `ca` are appended to the output as nulls.
///
/// The builder is created with a clone of `ca`'s name and with `ca.len()`.
pub fn normalize_with<F: Fn(&str, &mut String)>(
    ca: &StringChunked,
    normalizer: F,
) -> StringChunked {
    let mut out = StringChunkedBuilder::new(ca.name().clone(), ca.len());
    // A single scratch buffer is shared by all rows; it is cleared before each use so
    // its allocation can be reused instead of creating a new `String` per value.
    let mut scratch = String::new();
    for value in ca.iter() {
        match value {
            Some(text) => {
                scratch.clear();
                normalizer(text, &mut scratch);
                out.append_value(&scratch);
            },
            None => out.append_null(),
        }
    }
    out.finish()
}

/// Applies the Unicode normalization selected by `form` to the strings in `ca` and returns
/// the result as a new `StringChunked`.
///
/// NOTE(review): this listing looks like a diff rendered without +/- markers, so two bodies
/// appear back to back below. The `unary_elementwise` call appears to be the implementation
/// removed by this commit, and the `match form` block its replacement. Only one of them can
/// be the live body; confirm against the repository before editing.
pub fn normalize(ca: &StringChunked, form: UnicodeForm) -> StringChunked {
// Apparently the pre-commit body: for each non-null value, the iterator for the chosen
// form is collected into a freshly allocated `String`.
unary_elementwise(ca, |val| {
val.map(|x| match form {
UnicodeForm::NFC => x.nfc().collect::<String>(),
UnicodeForm::NFKC => x.nfkc().collect::<String>(),
UnicodeForm::NFD => x.nfd().collect::<String>(),
UnicodeForm::NFKD => x.nfkd().collect::<String>(),
})
})
// Apparently the post-commit body: `form` is matched once, and each arm calls
// `normalize_with` with a closure that extends the supplied buffer `b` with the output of
// the matching normalization iterator over `s`.
match form {
UnicodeForm::NFC => normalize_with(ca, |s, b| b.extend(s.nfc())),
UnicodeForm::NFKC => normalize_with(ca, |s, b| b.extend(s.nfkc())),
UnicodeForm::NFD => normalize_with(ca, |s, b| b.extend(s.nfd())),
UnicodeForm::NFKD => normalize_with(ca, |s, b| b.extend(s.nfkd())),
}
}

0 comments on commit 18e903c

Please sign in to comment.