Skip to content

Commit

Permalink
feat(python, rust!): Add disable_string_cache (#11020)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Sep 25, 2023
1 parent 5e0dc4d commit 843ea92
Show file tree
Hide file tree
Showing 29 changed files with 528 additions and 267 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -512,12 +512,12 @@ impl CategoricalChunked {
mod test {
use crate::chunked_array::categorical::CategoricalChunkedBuilder;
use crate::prelude::*;
use crate::{enable_string_cache, reset_string_cache, SINGLE_LOCK};
use crate::{disable_string_cache, enable_string_cache, SINGLE_LOCK};

#[test]
fn test_categorical_rev() -> PolarsResult<()> {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
disable_string_cache();
let slice = &[
Some("foo"),
None,
Expand All @@ -532,7 +532,7 @@ mod test {
assert_eq!(out.get_rev_map().len(), 2);

// test the global branch
enable_string_cache(true);
enable_string_cache();
// empty global cache
let out = ca.cast(&DataType::Categorical(None))?;
let out = out.categorical().unwrap().clone();
Expand All @@ -556,11 +556,13 @@ mod test {

#[test]
fn test_categorical_builder() {
use crate::{enable_string_cache, reset_string_cache};
use crate::{disable_string_cache, enable_string_cache};
let _lock = crate::SINGLE_LOCK.lock();
for b in &[false, true] {
reset_string_cache();
enable_string_cache(*b);
for use_string_cache in [false, true] {
disable_string_cache();
if use_string_cache {
enable_string_cache();
}

// Use 2 builders to check if the global string cache
// does not interfere with the index mapping
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,13 @@ impl CategoricalChunked {
mod test {
use super::*;
use crate::chunked_array::categorical::CategoricalChunkedBuilder;
use crate::{enable_string_cache, reset_string_cache, IUseStringCache};
use crate::{disable_string_cache, enable_string_cache, StringCacheHolder};

#[test]
fn test_merge_rev_map() {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
let _sc = IUseStringCache::hold();
disable_string_cache();
let _sc = StringCacheHolder::hold();

let mut builder1 = CategoricalChunkedBuilder::new("foo", 10);
let mut builder2 = CategoricalChunkedBuilder::new("foo", 10);
Expand Down
13 changes: 6 additions & 7 deletions crates/polars-core/src/chunked_array/logical/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod builder;
mod from;
mod merge;
mod ops;
pub mod stringcache;
pub mod string_cache;

use bitflags::bitflags;
pub use builder::*;
Expand Down Expand Up @@ -265,12 +265,12 @@ mod test {
use std::convert::TryFrom;

use super::*;
use crate::{enable_string_cache, reset_string_cache, SINGLE_LOCK};
use crate::{disable_string_cache, enable_string_cache, SINGLE_LOCK};

#[test]
fn test_categorical_round_trip() -> PolarsResult<()> {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
disable_string_cache();
let slice = &[
Some("foo"),
None,
Expand All @@ -295,8 +295,8 @@ mod test {
#[test]
fn test_append_categorical() {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
enable_string_cache(true);
disable_string_cache();
enable_string_cache();

let mut s1 = Series::new("1", vec!["a", "b", "c"])
.cast(&DataType::Categorical(None))
Expand Down Expand Up @@ -329,8 +329,7 @@ mod test {
#[test]
fn test_categorical_flow() -> PolarsResult<()> {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
enable_string_cache(false);
disable_string_cache();

// tests several things that may lose the dtype information
let s = Series::new("a", vec!["a", "b", "c"]).cast(&DataType::Categorical(None))?;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};

use ahash::RandomState;
use hashbrown::hash_map::RawEntryMut;
Expand All @@ -11,80 +11,103 @@ use crate::datatypes::PlIdHashMap;
use crate::hashing::_HASHMAP_INIT_SIZE;
use crate::prelude::InitHashMaps;

/// We use atomic reference counting
/// to determine how many threads use the string cache
/// if the refcount is zero, we may clear the string cache.
pub(crate) static USE_STRING_CACHE: AtomicU32 = AtomicU32::new(0);
/// We use atomic reference counting to determine how many threads use the
/// string cache. If the refcount is zero, we may clear the string cache.
static STRING_CACHE_REFCOUNT: Mutex<u32> = Mutex::new(0);
static STRING_CACHE_ENABLED_GLOBALLY: AtomicBool = AtomicBool::new(false);
static STRING_CACHE_UUID_CTR: AtomicU32 = AtomicU32::new(0);

/// RAII for the string cache
/// If an operation creates categoricals and uses them in a join
/// or comparison that operation must hold this cache via
/// `let handle = IUseStringCache::hold()`
/// The cache is valid until `handle` is dropped.
/// Enable the global string cache as long as the object is alive ([RAII]).
///
/// # Examples
///
/// Enable the string cache by initializing the object:
///
/// ```
/// use polars_core::StringCacheHolder;
///
/// let _sc = StringCacheHolder::hold();
/// ```
///
/// The string cache is enabled until `handle` is dropped.
///
/// # De-allocation
///
/// Multiple threads can hold the string cache at the same time.
/// The contents of the cache will only get dropped when no
/// thread holds it.
pub struct IUseStringCache {
/// The contents of the cache will only get dropped when no thread holds it.
///
/// [RAII]: https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization
pub struct StringCacheHolder {
// only added so that it will never be constructed directly
#[allow(dead_code)]
private_zst: (),
}

impl Default for IUseStringCache {
impl Default for StringCacheHolder {
fn default() -> Self {
Self::hold()
}
}

impl IUseStringCache {
impl StringCacheHolder {
/// Hold the StringCache
pub fn hold() -> IUseStringCache {
enable_string_cache(true);
IUseStringCache { private_zst: () }
pub fn hold() -> StringCacheHolder {
increment_string_cache_refcount();
StringCacheHolder { private_zst: () }
}
}

impl Drop for IUseStringCache {
impl Drop for StringCacheHolder {
fn drop(&mut self) {
enable_string_cache(false)
decrement_string_cache_refcount();
}
}

pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T {
enable_string_cache(true);
let out = func();
enable_string_cache(false);
out
fn increment_string_cache_refcount() {
let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
*refcount += 1;
}
fn decrement_string_cache_refcount() {
let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
*refcount -= 1;
if *refcount == 0 {
STRING_CACHE.clear()
}
}

/// Use a global string cache for the Categorical Types.
/// Enable the global string cache.
///
/// This is used to cache the string categories locally.
/// This allows join operations on categorical types.
pub fn enable_string_cache(toggle: bool) {
if toggle {
USE_STRING_CACHE.fetch_add(1, Ordering::Release);
} else {
let previous = USE_STRING_CACHE.fetch_sub(1, Ordering::Release);
if previous == 0 || previous == 1 {
USE_STRING_CACHE.store(0, Ordering::Release);
STRING_CACHE.clear()
}
/// [`Categorical`] columns created under the same global string cache have the
/// same underlying physical value when string values are equal. This allows the
/// columns to be concatenated or used in a join operation, for example.
///
/// Note that enabling the global string cache introduces some overhead.
/// The amount of overhead depends on the number of categories in your data.
/// It is advised to enable the global string cache only when strictly necessary.
///
/// [`Categorical`]: crate::datatypes::DataType::Categorical
pub fn enable_string_cache() {
let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(true, Ordering::AcqRel);
if !was_enabled {
increment_string_cache_refcount();
}
}

/// Reset the global string cache used for the Categorical Types.
pub fn reset_string_cache() {
USE_STRING_CACHE.store(0, Ordering::Release);
STRING_CACHE.clear()
/// Disable and clear the global string cache.
///
/// Note: Consider using [`StringCacheHolder`] for a more reliable way of
/// enabling and disabling the string cache.
pub fn disable_string_cache() {
let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(false, Ordering::AcqRel);
if was_enabled {
decrement_string_cache_refcount();
}
}

/// Check if string cache is set.
/// Check whether the global string cache is enabled.
pub fn using_string_cache() -> bool {
USE_STRING_CACHE.load(Ordering::Acquire) > 0
let refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
*refcount > 0
}

// This is the hash and the Index offset in the linear buffer
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -827,9 +827,9 @@ pub(crate) mod test {
#[test]
#[cfg(feature = "dtype-categorical")]
fn test_iter_categorical() {
use crate::{reset_string_cache, SINGLE_LOCK};
use crate::{disable_string_cache, SINGLE_LOCK};
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
disable_string_cache();
let ca = Utf8Chunked::new("", &[Some("foo"), None, Some("bar"), Some("ham")]);
let ca = ca.cast(&DataType::Categorical(None)).unwrap();
let ca = ca.categorical().unwrap();
Expand Down
20 changes: 13 additions & 7 deletions crates/polars-core/src/chunked_array/ops/sort/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ impl CategoricalChunked {
#[cfg(test)]
mod test {
use crate::prelude::*;
use crate::{enable_string_cache, reset_string_cache, SINGLE_LOCK};
use crate::{disable_string_cache, enable_string_cache, SINGLE_LOCK};

fn assert_order(ca: &CategoricalChunked, cmp: &[&str]) {
let s = ca.cast(&DataType::Utf8).unwrap();
Expand All @@ -133,9 +133,12 @@ mod test {
let init = &["c", "b", "a", "d"];

let _lock = SINGLE_LOCK.lock();
for toggle in [true, false] {
reset_string_cache();
enable_string_cache(toggle);
for use_string_cache in [true, false] {
disable_string_cache();
if use_string_cache {
enable_string_cache();
}

let s = Series::new("", init).cast(&DataType::Categorical(None))?;
let ca = s.categorical()?;
let mut ca_lexical = ca.clone();
Expand All @@ -157,13 +160,16 @@ mod test {
}

#[test]

fn test_cat_lexical_sort_multiple() -> PolarsResult<()> {
let init = &["c", "b", "a", "a"];

let _lock = SINGLE_LOCK.lock();
for enable in [true, false] {
enable_string_cache(enable);
for use_string_cache in [true, false] {
disable_string_cache();
if use_string_cache {
enable_string_cache();
}

let s = Series::new("", init).cast(&DataType::Categorical(None))?;
let ca = s.categorical()?;
let mut ca_lexical: CategoricalChunked = ca.clone();
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ use once_cell::sync::Lazy;
use rayon::{ThreadPool, ThreadPoolBuilder};

#[cfg(feature = "dtype-categorical")]
pub use crate::chunked_array::logical::categorical::stringcache::*;
pub use crate::chunked_array::logical::categorical::string_cache::*;

pub static PROCESS_ID: Lazy<u128> = Lazy::new(|| {
SystemTime::now()
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-error/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ Help: if you're using Python, this may look something like:
Alternatively, if the performance cost is acceptable, you could just set:
import polars as pl
pl.enable_string_cache(True)
pl.enable_string_cache()
on startup."#.trim_start())
};
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-io/src/csv/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ where

#[cfg(feature = "dtype-categorical")]
if _has_cat {
_cat_lock = Some(polars_core::IUseStringCache::hold())
_cat_lock = Some(polars_core::StringCacheHolder::hold())
}

let mut csv_reader = self.core_reader(Some(Arc::new(schema)), to_cast)?;
Expand All @@ -602,7 +602,7 @@ where
})
.unwrap_or(false);
if has_cat {
_cat_lock = Some(polars_core::IUseStringCache::hold())
_cat_lock = Some(polars_core::StringCacheHolder::hold())
}
}
let mut csv_reader = self.core_reader(self.schema.clone(), vec![])?;
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-io/src/csv/read_impl/batched_mmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ impl<'a> CoreReader<'a> {
// RAII structure that will ensure we maintain a global stringcache
#[cfg(feature = "dtype-categorical")]
let _cat_lock = if _has_cat {
Some(polars_core::IUseStringCache::hold())
Some(polars_core::StringCacheHolder::hold())
} else {
None
};
Expand Down Expand Up @@ -196,7 +196,7 @@ pub struct BatchedCsvReaderMmap<'a> {
schema: SchemaRef,
rows_read: IdxSize,
#[cfg(feature = "dtype-categorical")]
_cat_lock: Option<polars_core::IUseStringCache>,
_cat_lock: Option<polars_core::StringCacheHolder>,
#[cfg(not(feature = "dtype-categorical"))]
_cat_lock: Option<u8>,
}
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-io/src/csv/read_impl/batched_read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ impl<'a> CoreReader<'a> {
// RAII structure that will ensure we maintain a global stringcache
#[cfg(feature = "dtype-categorical")]
let _cat_lock = if _has_cat {
Some(polars_core::IUseStringCache::hold())
Some(polars_core::StringCacheHolder::hold())
} else {
None
};
Expand Down Expand Up @@ -279,7 +279,7 @@ pub struct BatchedCsvReaderRead<'a> {
schema: SchemaRef,
rows_read: IdxSize,
#[cfg(feature = "dtype-categorical")]
_cat_lock: Option<polars_core::IUseStringCache>,
_cat_lock: Option<polars_core::StringCacheHolder>,
#[cfg(not(feature = "dtype-categorical"))]
_cat_lock: Option<u8>,
}
Expand Down
Loading

0 comments on commit 843ea92

Please sign in to comment.