From 3fbb34b3ecac25a85b6a30785d331755e60f97c8 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 24 Oct 2023 13:30:09 +0800 Subject: [PATCH] chore(rust): move unique_counts to ops (#11963) --- crates/polars-core/Cargo.toml | 1 - crates/polars-core/src/series/ops/mod.rs | 1 - crates/polars-core/src/series/ops/unique.rs | 49 ------------------- crates/polars-ops/Cargo.toml | 1 + crates/polars-ops/src/series/ops/mod.rs | 4 ++ crates/polars-ops/src/series/ops/unique.rs | 42 ++++++++++++++++ crates/polars-plan/Cargo.toml | 2 +- .../src/dsl/function_expr/dispatch.rs | 2 +- crates/polars/Cargo.toml | 2 +- 9 files changed, 50 insertions(+), 54 deletions(-) delete mode 100644 crates/polars-core/src/series/ops/unique.rs create mode 100644 crates/polars-ops/src/series/ops/unique.rs diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index d34488684198..46534d76e899 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -151,7 +151,6 @@ docs-selection = [ "abs", "dataframe_arithmetic", "product", - "unique_counts", "describe", "chunked_ids", "partition_by", diff --git a/crates/polars-core/src/series/ops/mod.rs b/crates/polars-core/src/series/ops/mod.rs index 159991052e1a..aa47b007bcba 100644 --- a/crates/polars-core/src/series/ops/mod.rs +++ b/crates/polars-core/src/series/ops/mod.rs @@ -2,7 +2,6 @@ mod downcast; mod extend; mod null; mod to_list; -mod unique; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; diff --git a/crates/polars-core/src/series/ops/unique.rs b/crates/polars-core/src/series/ops/unique.rs deleted file mode 100644 index 9daee89710d3..000000000000 --- a/crates/polars-core/src/series/ops/unique.rs +++ /dev/null @@ -1,49 +0,0 @@ -#[cfg(feature = "unique_counts")] -use std::hash::Hash; - -#[cfg(feature = "unique_counts")] -use crate::hashing::_HASHMAP_INIT_SIZE; -use crate::prelude::*; -#[cfg(feature = "unique_counts")] -use crate::utils::NoNull; - -#[cfg(feature = "unique_counts")] -fn unique_counts(items: I) -> IdxCa -where - I: Iterator, - J: Hash + Eq, -{ - let mut map = PlIndexMap::with_capacity_and_hasher(_HASHMAP_INIT_SIZE, Default::default()); - for item in items { - map.entry(item) - .and_modify(|cnt| { - *cnt += 1; - }) - .or_insert(1 as IdxSize); - } - let out: NoNull = map.into_values().collect(); - out.into_inner() -} - -impl Series { - /// Returns a count of the unique values in the order of appearance. - #[cfg(feature = "unique_counts")] - pub fn unique_counts(&self) -> IdxCa { - if self.dtype().to_physical().is_numeric() { - if self.bit_repr_is_large() { - let ca = self.bit_repr_large(); - unique_counts(ca.into_iter()) - } else { - let ca = self.bit_repr_small(); - unique_counts(ca.into_iter()) - } - } else { - match self.dtype() { - DataType::Utf8 => unique_counts(self.utf8().unwrap().into_iter()), - dt => { - panic!("'unique_counts' not implemented for {dt} data types") - }, - } - } - } -} diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 24f429134949..27cc2df8e3df 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -65,6 +65,7 @@ round_series = [] is_first_distinct = [] is_last_distinct = [] is_unique = [] +unique_counts = [] approx_unique = [] fused = [] cutqcut = ["dtype-categorical", "dtype-struct"] diff --git a/crates/polars-ops/src/series/ops/mod.rs b/crates/polars-ops/src/series/ops/mod.rs index b422b4e28509..e12c38cbf7cb 100644 --- a/crates/polars-ops/src/series/ops/mod.rs +++ b/crates/polars-ops/src/series/ops/mod.rs @@ -44,6 +44,8 @@ mod round; mod search_sorted; #[cfg(feature = "to_dummies")] mod to_dummies; +#[cfg(feature = "unique_counts")] +mod unique; mod various; pub use approx_algo::*; @@ -93,6 +95,8 @@ pub use round::*; pub use search_sorted::*; #[cfg(feature = "to_dummies")] pub use to_dummies::*; +#[cfg(feature = "unique_counts")] +pub use unique::*; pub use various::*; pub trait SeriesSealed { diff --git a/crates/polars-ops/src/series/ops/unique.rs b/crates/polars-ops/src/series/ops/unique.rs new file mode 100644 index 000000000000..01fa766426dc --- /dev/null +++ b/crates/polars-ops/src/series/ops/unique.rs @@ -0,0 +1,42 @@ +use std::hash::Hash; + +use polars_core::hashing::_HASHMAP_INIT_SIZE; +use polars_core::prelude::*; +use polars_core::utils::NoNull; + +fn unique_counts_helper(items: I) -> IdxCa +where + I: Iterator, + J: Hash + Eq, +{ + let mut map = PlIndexMap::with_capacity_and_hasher(_HASHMAP_INIT_SIZE, Default::default()); + for item in items { + map.entry(item) + .and_modify(|cnt| { + *cnt += 1; + }) + .or_insert(1 as IdxSize); + } + let out: NoNull = map.into_values().collect(); + out.into_inner() +} + +/// Returns a count of the unique values in the order of appearance. +pub fn unique_counts(s: &Series) -> PolarsResult { + if s.dtype().to_physical().is_numeric() { + if s.bit_repr_is_large() { + let ca = s.bit_repr_large(); + Ok(unique_counts_helper(ca.into_iter()).into_series()) + } else { + let ca = s.bit_repr_small(); + Ok(unique_counts_helper(ca.into_iter()).into_series()) + } + } else { + match s.dtype() { + DataType::Utf8 => Ok(unique_counts_helper(s.utf8().unwrap().into_iter()).into_series()), + dt => { + polars_bail!(opq = unique_counts, dt) + }, + } + } +} diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index b8d55668befe..fbe48390df6d 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -115,7 +115,7 @@ random = ["polars-core/random"] dynamic_group_by = ["polars-core/dynamic_group_by"] ewma = ["polars-ops/ewma"] dot_diagram = [] -unique_counts = ["polars-core/unique_counts"] +unique_counts = ["polars-ops/unique_counts"] log = ["polars-ops/log"] chunked_ids = ["polars-core/chunked_ids"] list_to_struct = ["polars-ops/list_to_struct"] diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index eb00120e970f..20586189ca1d 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -54,7 +54,7 @@ pub(super) fn value_counts(s: &Series, sort: bool, parallel: bool) -> PolarsResu #[cfg(feature = "unique_counts")] pub(super) fn unique_counts(s: &Series) -> PolarsResult { - Ok(s.unique_counts().into_series()) + polars_ops::prelude::unique_counts(s) } pub(super) fn backward_fill(s: &Series, limit: FillNullLimit) -> PolarsResult { diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 01f9e520eb51..e23bce18d84e 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -156,7 +156,7 @@ ewma = ["polars-ops/ewma", "polars-lazy?/ewma"] dot_diagram = ["polars-lazy?/dot_diagram"] dataframe_arithmetic = ["polars-core/dataframe_arithmetic"] product = ["polars-core/product"] -unique_counts = ["polars-core/unique_counts", "polars-lazy?/unique_counts"] +unique_counts = ["polars-ops/unique_counts", "polars-lazy?/unique_counts"] log = ["polars-ops/log", "polars-lazy?/log"] partition_by = ["polars-core/partition_by"] semi_anti_join = ["polars-lazy?/semi_anti_join", "polars-ops/semi_anti_join", "polars-sql?/semi_anti_join"]