Skip to content

Commit

Permalink
perf: Using cache for str.contains regex compilation (#11183)
Browse files Browse the repository at this point in the history
  • Loading branch information
reswqa authored Sep 19, 2023
1 parent 43c57a6 commit 34a4c90
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 18 deletions.
38 changes: 20 additions & 18 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,23 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
src.contains(pat)
}))
} else if strict {
try_binary_elementwise_values(ca, pat, |src, pat| {
Ok(Regex::new(pat)?.is_match(src))
// A sqrt(n) regex cache is not too small, not too large.
let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
try_binary_elementwise(ca, pat, |opt_src, opt_pat| match (opt_src, opt_pat) {
(Some(src), Some(pat)) => {
let reg = reg_cache.try_get_or_insert_with(pat, |p| Regex::new(p))?;
Ok(Some(reg.is_match(src)))
},
_ => Ok(None),
})
} else {
// A sqrt(n) regex cache is not too small, not too large.
let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
Ok(binary_elementwise(ca, pat, |opt_src, opt_pat| {
match (opt_src, opt_pat) {
(Some(src), Some(pat)) => {
Regex::new(pat).ok().map(|re| re.is_match(src))
let reg = reg_cache.try_get_or_insert_with(pat, |p| Regex::new(p));
reg.ok().map(|re| re.is_match(src))
},
_ => None,
}
Expand Down Expand Up @@ -399,15 +408,13 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
// A sqrt(n) regex cache is not too small, not too large.
let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize);
let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size());
for (opt_s, opt_pat) in ca.into_iter().zip(pat) {
match (opt_s, opt_pat) {
(_, None) | (None, _) => builder.append_null(),
(Some(s), Some(pat)) => {
let reg = reg_cache.get_or_insert_with(pat, |p| Regex::new(p).unwrap());
builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str()));
},
}
}
binary_elementwise_for_each(ca, pat, |opt_s, opt_pat| match (opt_s, opt_pat) {
(_, None) | (None, _) => builder.append_null(),
(Some(s), Some(pat)) => {
let reg = reg_cache.get_or_insert_with(pat, |p| Regex::new(p).unwrap());
builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str()));
},
});
Ok(builder.finish())
}

Expand All @@ -427,12 +434,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
Regex::new(pat)?
};

let mut out: UInt32Chunked = ca
.into_iter()
.map(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32))
.collect();
out.rename(ca.name());
Ok(out)
Ok(ca.apply_generic(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32)))
}

/// Count all successive non-overlapping regex matches.
Expand Down
19 changes: 19 additions & 0 deletions crates/polars-utils/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,25 @@ impl<K: Hash + Eq, V> FastFixedCache<K, V> {
}
}

/// Fallible variant of a get-or-insert lookup.
///
/// Returns a mutable reference to the value stored under `key`. If the key
/// is not found, `key` is converted to its owned form, `f` is called with
/// that owned key to build the value, and the pair is inserted.
///
/// # Errors
///
/// Returns the error produced by `f`. In that case nothing is inserted
/// into the cache.
pub fn try_get_or_insert_with<Q, F, E>(&mut self, key: &Q, f: F) -> Result<&mut V, E>
where
    K: Borrow<Q>,
    Q: Hash + Eq + ToOwned<Owned = K> + ?Sized,
    F: FnOnce(&K) -> Result<V, E>,
{
    unsafe {
        // Hash the key exactly once; the same hash drives both the lookup
        // and, on a miss, the insertion.
        let h = self.hash(key);
        if let Some(slot_idx) = self.raw_get(h, key) {
            // SAFETY: `slot_idx` was just returned by `raw_get`.
            // NOTE(review): this relies on `raw_get` only ever reporting
            // in-bounds slots whose value is initialized — confirm against
            // its contract.
            let slot = self.slots.get_unchecked_mut(slot_idx);
            return Ok(slot.value.assume_init_mut());
        }

        // Miss: only now pay for the owned key and the (fallible) value
        // construction. An error from `f` returns before any insertion.
        let key = key.to_owned();
        let val = f(&key)?;
        Ok(self.raw_insert(h, key, val))
    }
}

unsafe fn raw_get<Q: Eq + ?Sized>(&self, h: HashResult, key: &Q) -> Option<usize>
where
K: Borrow<Q>,
Expand Down

0 comments on commit 34a4c90

Please sign in to comment.