From 34a4c90b7c63c15a5803769f42bd58c02d78df83 Mon Sep 17 00:00:00 2001
From: Weijie Guo
Date: Tue, 19 Sep 2023 13:53:52 +0800
Subject: [PATCH] perf: Using cache for str.contains regex compilation (#11183)

---
Review notes (not part of the commit message):

* Formatting: this copy of the patch had its line breaks collapsed and its
  generic parameter lists stripped. The line structure below is rebuilt to
  match the hunk line counts; indentation is conventional and may differ
  from the upstream file. `<Q, F, E>`, `Borrow<Q>`, `ToOwned<Owned = K>`,
  `Result<V, E>` and `Option<usize>` are restored from how the values are
  used in the body. The author's e-mail address is absent from `From:`.

* namespace.rs, first hunk: the non-literal branches no longer compile a
  regex per row. Each branch creates a `FastFixedCache` with
  `sqrt(ca.len())` as its size argument and looks patterns up through
  `try_get_or_insert_with`. With `strict`, a compile error is returned
  through `?`; otherwise `.ok()` turns it into `None` for that row.
  The second hunk swaps a manual zip loop for `binary_elementwise_for_each`
  and the third swaps a collect-and-rename for `apply_generic`.

* cache.rs: `try_get_or_insert_with` is the fallible counterpart of
  `get_or_insert_with`. On a hit it returns the cached value. On a miss it
  calls `f` on the owned key and returns `f`'s error before anything is
  inserted.

* Fix relative to the original commit: the lookup now reuses the hash `h`
  computed on the previous line instead of calling `self.hash(&key)` a
  second time. This needs `HashResult` to be `Copy` (its definition is
  outside this patch) -- confirm before applying. The `unsafe` block also
  relies on `raw_get` returning only in-bounds, initialized slot indices;
  `raw_get`'s body is not part of this patch either.

 .../src/chunked_array/strings/namespace.rs | 38 ++++++++++---------
 crates/polars-utils/src/cache.rs           | 19 ++++++++++
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs
index e72f738bc800..c5fddf7583a0 100644
--- a/crates/polars-ops/src/chunked_array/strings/namespace.rs
+++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs
@@ -112,14 +112,23 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
                         src.contains(pat)
                     }))
                 } else if strict {
-                    try_binary_elementwise_values(ca, pat, |src, pat| {
-                        Ok(Regex::new(pat)?.is_match(src))
+                    // A sqrt(n) regex cache is not too small, not too large.
+                    let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
+                    try_binary_elementwise(ca, pat, |opt_src, opt_pat| match (opt_src, opt_pat) {
+                        (Some(src), Some(pat)) => {
+                            let reg = reg_cache.try_get_or_insert_with(pat, |p| Regex::new(p))?;
+                            Ok(Some(reg.is_match(src)))
+                        },
+                        _ => Ok(None),
                     })
                 } else {
+                    // A sqrt(n) regex cache is not too small, not too large.
+                    let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
                     Ok(binary_elementwise(ca, pat, |opt_src, opt_pat| {
                         match (opt_src, opt_pat) {
                             (Some(src), Some(pat)) => {
-                                Regex::new(pat).ok().map(|re| re.is_match(src))
+                                let reg = reg_cache.try_get_or_insert_with(pat, |p| Regex::new(p));
+                                reg.ok().map(|re| re.is_match(src))
                             },
                             _ => None,
                         }
@@ -399,15 +408,13 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
         // A sqrt(n) regex cache is not too small, not too large.
         let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
         let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size());
-        for (opt_s, opt_pat) in ca.into_iter().zip(pat) {
-            match (opt_s, opt_pat) {
-                (_, None) | (None, _) => builder.append_null(),
-                (Some(s), Some(pat)) => {
-                    let reg = reg_cache.get_or_insert_with(pat, |p| Regex::new(p).unwrap());
-                    builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str()));
-                },
-            }
-        }
+        binary_elementwise_for_each(ca, pat, |opt_s, opt_pat| match (opt_s, opt_pat) {
+            (_, None) | (None, _) => builder.append_null(),
+            (Some(s), Some(pat)) => {
+                let reg = reg_cache.get_or_insert_with(pat, |p| Regex::new(p).unwrap());
+                builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str()));
+            },
+        });
         Ok(builder.finish())
     }
 
@@ -427,12 +434,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
             Regex::new(pat)?
         };
 
-        let mut out: UInt32Chunked = ca
-            .into_iter()
-            .map(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32))
-            .collect();
-        out.rename(ca.name());
-        Ok(out)
+        Ok(ca.apply_generic(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32)))
     }
 
     /// Count all successive non-overlapping regex matches.
diff --git a/crates/polars-utils/src/cache.rs b/crates/polars-utils/src/cache.rs
index 21a022e6f604..26c6a932bfb5 100644
--- a/crates/polars-utils/src/cache.rs
+++ b/crates/polars-utils/src/cache.rs
@@ -111,6 +111,25 @@ impl FastFixedCache {
         }
     }
 
+    pub fn try_get_or_insert_with<Q, F, E>(&mut self, key: &Q, f: F) -> Result<&mut V, E>
+    where
+        K: Borrow<Q>,
+        Q: Hash + Eq + ToOwned<Owned = K> + ?Sized,
+        F: FnOnce(&K) -> Result<V, E>,
+    {
+        unsafe {
+            let h = self.hash(key);
+            if let Some(slot_idx) = self.raw_get(h, key) {
+                let slot = self.slots.get_unchecked_mut(slot_idx);
+                return Ok(slot.value.assume_init_mut());
+            }
+
+            let key = key.to_owned();
+            let val = f(&key)?;
+            Ok(self.raw_insert(h, key, val))
+        }
+    }
+
     unsafe fn raw_get<Q>(&self, h: HashResult, key: &Q) -> Option<usize>
     where
         K: Borrow<Q>,