diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs index 77c42a23c7a..40abf0c435b 100644 --- a/experimental/zerotrie/src/builder/nonconst/builder.rs +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -10,49 +10,11 @@ use super::store::TrieBuilderStore; use crate::builder::bytestr::ByteStr; use crate::byte_phf::PerfectByteHashMapCacheOwned; use crate::error::Error; +use crate::options::*; use crate::varint; use alloc::borrow::Cow; use alloc::vec::Vec; -/// Whether to use the perfect hash function in the ZeroTrie. -pub enum PhfMode { - /// Use binary search for all branch nodes. - BinaryOnly, - /// Use the perfect hash function for large branch nodes. - UsePhf, -} - -/// Whether to support non-ASCII data in the ZeroTrie. -pub enum AsciiMode { - /// Support only ASCII, returning an error if non-ASCII is found. - AsciiOnly, - /// Support all data, creating span nodes for non-ASCII bytes. - BinarySpans, -} - -/// Whether to enforce a limit to the capacity of the ZeroTrie. -pub enum CapacityMode { - /// Return an error if the trie requires a branch of more than 2^32 bytes. - Normal, - /// Construct the trie without returning an error. - Extended, -} - -/// Whether to permit strings that have inconsistent ASCII case at a node, such as "abc" and "Abc" -pub enum MixedCaseMode { - /// Allows strings regardless of case. - Allow, - /// Returns an error if a node exists with the same character in ambiguous case. - Reject, -} - -pub struct ZeroTrieBuilderOptions { - pub phf_mode: PhfMode, - pub ascii_mode: AsciiMode, - pub capacity_mode: CapacityMode, - pub mixed_case_mode: MixedCaseMode, -} - /// A low-level builder for ZeroTrie. Supports all options. pub(crate) struct ZeroTrieBuilder { data: S, @@ -144,7 +106,7 @@ impl ZeroTrieBuilder { items.sort_by(|a, b| cmp_keys_values(&options, *a, *b)); let ascii_str_slice = items.as_slice(); let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice); - Self::from_sorted_tuple_slice(byte_str_slice, options) + Self::from_sorted_tuple_slice_impl(byte_str_slice, options) } /// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted, @@ -158,12 +120,20 @@ impl ZeroTrieBuilder { options: ZeroTrieBuilderOptions, ) -> Result { let mut items = Cow::Borrowed(items); - if matches!(options.mixed_case_mode, MixedCaseMode::Reject) { + if matches!(options.case_sensitivity, CaseSensitivity::IgnoreCase) { // We need to re-sort the items with our custom comparator. items.to_mut().sort_by(|a, b| { cmp_keys_values(&options, (a.0.as_bytes(), a.1), (b.0.as_bytes(), b.1)) }); } + Self::from_sorted_tuple_slice_impl(&items, options) + } + + /// Internal constructor that does not re-sort the items. + fn from_sorted_tuple_slice_impl( + items: &[(&ByteStr, usize)], + options: ZeroTrieBuilderOptions, + ) -> Result { for ab in items.windows(2) { debug_assert!(cmp_keys_values( &options, @@ -177,7 +147,7 @@ impl ZeroTrieBuilder { phf_cache: PerfectByteHashMapCacheOwned::new_empty(), options, }; - let total_size = result.create(&items)?; + let total_size = result.create(items)?; debug_assert!(total_size == result.data.atbs_len()); Ok(result) } @@ -267,7 +237,9 @@ impl ZeroTrieBuilder { if ascii_i == key_ascii && ascii_j == key_ascii { let len = self.prepend_ascii(key_ascii)?; current_len += len; - if matches!(self.options.mixed_case_mode, MixedCaseMode::Reject) && i == new_i + 2 { + if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase) + && i == new_i + 2 + { // This can happen if two strings were picked up, each with a different case return Err(Error::MixedCase); } @@ -324,7 +296,7 @@ impl ZeroTrieBuilder { }; let mut branch_metas = lengths_stack.pop_many_or_panic(total_count); let original_keys = branch_metas.map_to_ascii_bytes(); - if matches!(self.options.mixed_case_mode, MixedCaseMode::Reject) { + if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase) { // Check to see if we have the same letter in two different cases let mut seen_ascii_alpha = [false; 26]; for c in original_keys.as_const_slice().as_slice() { @@ -435,7 +407,7 @@ fn cmp_keys_values( a: (&[u8], usize), b: (&[u8], usize), ) -> Ordering { - if matches!(options.mixed_case_mode, MixedCaseMode::Allow) { + if matches!(options.case_sensitivity, CaseSensitivity::Sensitive) { a.0.cmp(b.0) } else { let a_iter = a.0.iter().map(|x| x.to_ascii_lowercase()); diff --git a/experimental/zerotrie/src/builder/nonconst/mod.rs b/experimental/zerotrie/src/builder/nonconst/mod.rs index c060eb2856a..6ed78d715ba 100644 --- a/experimental/zerotrie/src/builder/nonconst/mod.rs +++ b/experimental/zerotrie/src/builder/nonconst/mod.rs @@ -7,39 +7,3 @@ mod store; pub(crate) use builder::*; pub(crate) use store::TrieBuilderStore; - -impl crate::ZeroTrieSimpleAscii { - pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { - phf_mode: PhfMode::BinaryOnly, - ascii_mode: AsciiMode::AsciiOnly, - capacity_mode: CapacityMode::Normal, - mixed_case_mode: MixedCaseMode::Allow, - }; -} - -impl crate::ZeroAsciiIgnoreCaseTrie { - pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { - phf_mode: PhfMode::BinaryOnly, - ascii_mode: AsciiMode::AsciiOnly, - capacity_mode: CapacityMode::Normal, - mixed_case_mode: MixedCaseMode::Reject, - }; -} - -impl crate::ZeroTriePerfectHash { - pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { - phf_mode: PhfMode::UsePhf, - ascii_mode: AsciiMode::BinarySpans, - capacity_mode: CapacityMode::Normal, - mixed_case_mode: MixedCaseMode::Allow, - }; -} - -impl crate::ZeroTrieExtendedCapacity { - pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { - phf_mode: PhfMode::UsePhf, - ascii_mode: AsciiMode::BinarySpans, - capacity_mode: CapacityMode::Extended, - mixed_case_mode: MixedCaseMode::Allow, - }; -} diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 655e064ec34..6764c551603 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -60,6 +60,7 @@ mod cursor; mod error; #[macro_use] mod helpers; +mod options; mod reader; #[cfg(feature = "serde")] mod serde; diff --git a/experimental/zerotrie/src/options.rs b/experimental/zerotrie/src/options.rs new file mode 100644 index 00000000000..13e15451e55 --- /dev/null +++ b/experimental/zerotrie/src/options.rs @@ -0,0 +1,93 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Options for building and reading from a ZeroTrie. +//! +//! These options are internal to the crate. A small selection of options +//! are exported by way of the different public types on this crate. + +/// Whether to use the perfect hash function in the ZeroTrie. +pub(crate) enum PhfMode { + /// Use binary search for all branch nodes. + BinaryOnly, + /// Use the perfect hash function for large branch nodes. + UsePhf, +} + +/// Whether to support non-ASCII data in the ZeroTrie. +pub(crate) enum AsciiMode { + /// Support only ASCII, returning an error if non-ASCII is found. + AsciiOnly, + /// Support all data, creating span nodes for non-ASCII bytes. + BinarySpans, +} + +/// Whether to enforce a limit to the capacity of the ZeroTrie. +pub(crate) enum CapacityMode { + /// Return an error if the trie requires a branch of more than 2^32 bytes. + Normal, + /// Construct the trie without returning an error. + Extended, +} + +/// How to handle strings with mixed ASCII case at a node, such as "abc" and "Abc" +pub(crate) enum CaseSensitivity { + /// Allow all strings and sort them by byte value. + Sensitive, + /// Reject strings with different case and sort them as if `to_ascii_lowercase` is called. + IgnoreCase, +} + +pub(crate) struct ZeroTrieBuilderOptions { + pub phf_mode: PhfMode, + pub ascii_mode: AsciiMode, + pub capacity_mode: CapacityMode, + pub case_sensitivity: CaseSensitivity, +} + +pub(crate) trait ZeroTrieWithOptions { + const OPTIONS: ZeroTrieBuilderOptions; +} + +/// All branch nodes are binary search +/// and there are no span nodes. +impl ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii { + const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::BinaryOnly, + ascii_mode: AsciiMode::AsciiOnly, + capacity_mode: CapacityMode::Normal, + case_sensitivity: CaseSensitivity::Sensitive, + }; +} + +/// All branch nodes are binary search +/// and nodes use case-insensitive matching. +impl ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie { + const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::BinaryOnly, + ascii_mode: AsciiMode::AsciiOnly, + capacity_mode: CapacityMode::Normal, + case_sensitivity: CaseSensitivity::IgnoreCase, + }; +} + +/// Branch nodes could be either binary search or PHF. +impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash { + const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::UsePhf, + ascii_mode: AsciiMode::BinarySpans, + capacity_mode: CapacityMode::Normal, + case_sensitivity: CaseSensitivity::Sensitive, + }; +} + +/// No limited capacity assertion. +impl ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity { + const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::UsePhf, + ascii_mode: AsciiMode::BinarySpans, + capacity_mode: CapacityMode::Extended, + case_sensitivity: CaseSensitivity::Sensitive, + }; +} diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 11493aa1571..410392e3189 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -205,6 +205,7 @@ use crate::byte_phf::PerfectByteHashMap; use crate::helpers::*; +use crate::options::*; use crate::varint::read_varint_meta2; use crate::varint::read_varint_meta3; @@ -295,17 +296,11 @@ fn byte_type(b: u8) -> NodeType { } } -// DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w. -// -// | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 | -// |---------------|-------------------|--------------------|-------------------|--------------------| -// | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns | -// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | -// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | - -/// Query the trie assuming all branch nodes are binary search -/// and there are no span nodes. -pub fn get_ascii_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { +#[inline] +pub(crate) fn get_parameterized( + mut trie: &[u8], + mut ascii: &[u8], +) -> Option { loop { let (b, x, i, search); (b, trie) = trie.split_first()?; @@ -313,126 +308,25 @@ pub fn get_ascii_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option (0, trie), NodeType::Span => { - debug_assert!(false, "Span node found in ASCII trie!"); - return None; - } - NodeType::Value => read_varint_meta3(*b, trie), - NodeType::Branch => read_varint_meta2(*b, trie), - }; - if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, NodeType::Ascii) { - if b == c { - // Matched a byte - ascii = temp; - continue; + if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) { + read_varint_meta3(*b, trie) } else { - // Byte that doesn't match + debug_assert!(false, "Span node found in ASCII trie!"); return None; } } - if matches!(byte_type, NodeType::Value) { - // Value node, but not at end of string - continue; - } - // Branch node - let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; - // See comment above regarding this assertion - debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); - let w = w & 0x3; - let x = if x == 0 { 256 } else { x }; - // Always use binary search - (search, trie) = trie.debug_split_at(x); - i = search.binary_search(c).ok()?; - trie = if w == 0 { - get_branch_w0(trie, i, x) - } else { - get_branch(trie, i, x, w) - }; - ascii = temp; - continue; - } else { - if matches!(byte_type, NodeType::Value) { - // Value node at end of string - return Some(x); - } - return None; - } - } -} - -/// Query the trie assuming all branch nodes are binary search -/// and nodes use case-insensitive matching. -pub fn get_ascii_bsearch_only_ignore_case(mut trie: &[u8], mut ascii: &[u8]) -> Option { - loop { - let (b, x, i, search); - (b, trie) = trie.split_first()?; - let byte_type = byte_type(*b); - (x, trie) = match byte_type { - NodeType::Ascii => (0, trie), - NodeType::Span => { - debug_assert!(false, "Span node found in ASCII trie!"); - return None; - } NodeType::Value => read_varint_meta3(*b, trie), NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { - if b.to_ascii_lowercase() == c.to_ascii_lowercase() { - // Matched a byte - ascii = temp; - continue; + let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) + { + b.to_ascii_lowercase() == c.to_ascii_lowercase() } else { - // Byte that doesn't match - return None; - } - } - if matches!(byte_type, NodeType::Value) { - // Value node, but not at end of string - continue; - } - // Branch node - let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; - // See comment above regarding this assertion - debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); - let w = w & 0x3; - let x = if x == 0 { 256 } else { x }; - // Always use binary search - (search, trie) = trie.debug_split_at(x); - i = search - .binary_search_by_key(&c.to_ascii_lowercase(), |x| x.to_ascii_lowercase()) - .ok()?; - trie = if w == 0 { - get_branch_w0(trie, i, x) - } else { - get_branch(trie, i, x, w) - }; - ascii = temp; - continue; - } else { - if matches!(byte_type, NodeType::Value) { - // Value node at end of string - return Some(x); - } - return None; - } - } -} - -/// Query the trie assuming branch nodes could be either binary search or PHF. -pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { - loop { - let (b, x, i, search); - (b, trie) = trie.split_first()?; - let byte_type = byte_type(*b); - (x, trie) = match byte_type { - NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), - NodeType::Branch => read_varint_meta2(*b, trie), - }; - if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, NodeType::Ascii) { - if b == c { + b == c + }; + if is_match { // Matched a byte ascii = temp; continue; @@ -445,7 +339,9 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { // Value node, but not at end of string continue; } - if matches!(byte_type, NodeType::Span) { + if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) + && matches!(byte_type, NodeType::Span) + { let (trie_span, ascii_span); (trie_span, trie) = trie.debug_split_at(x); (ascii_span, ascii) = ascii.maybe_split_at(x)?; @@ -459,81 +355,26 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; - // See comment above regarding this assertion - debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); - let w = w & 0x3; - let x = if x == 0 { 256 } else { x }; - if x < 16 { - // binary search - (search, trie) = trie.debug_split_at(x); - i = search.binary_search(c).ok()?; + let w = if matches!(T::OPTIONS.capacity_mode, CapacityMode::Extended) { + w } else { - // phf - (search, trie) = trie.debug_split_at(x * 2 + 1); - i = PerfectByteHashMap::from_store(search).get(*c)?; - } - trie = if w == 0 { - get_branch_w0(trie, i, x) - } else { - get_branch(trie, i, x, w) + // See the table below regarding this assertion + debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); + w & 0x3 }; - ascii = temp; - continue; - } else { - if matches!(byte_type, NodeType::Value) { - // Value node at end of string - return Some(x); - } - return None; - } - } -} - -/// Query the trie without the limited capacity assertion. -pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { - loop { - let (b, x, i, search); - (b, trie) = trie.split_first()?; - let byte_type = byte_type(*b); - (x, trie) = match byte_type { - NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), - NodeType::Branch => read_varint_meta2(*b, trie), - }; - if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, NodeType::Ascii) { - if b == c { - // Matched a byte - ascii = temp; - continue; - } else { - // Byte that doesn't match - return None; - } - } - if matches!(byte_type, NodeType::Value) { - // Value node, but not at end of string - continue; - } - if matches!(byte_type, NodeType::Span) { - let (trie_span, ascii_span); - (trie_span, trie) = trie.debug_split_at(x); - (ascii_span, ascii) = ascii.maybe_split_at(x)?; - if trie_span == ascii_span { - // Matched a byte span - continue; - } else { - // Byte span that doesn't match - return None; - } - } - // Branch node - let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; let x = if x == 0 { 256 } else { x }; - if x < 16 { + if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 { // binary search (search, trie) = trie.debug_split_at(x); - i = search.binary_search(c).ok()?; + let bsearch_result = + if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { + search.binary_search_by_key(&c.to_ascii_lowercase(), |x| { + x.to_ascii_lowercase() + }) + } else { + search.binary_search(c) + }; + i = bsearch_result.ok()?; } else { // phf (search, trie) = trie.debug_split_at(x * 2 + 1); @@ -556,6 +397,14 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } } +// DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w. +// +// | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 | +// |---------------|-------------------|--------------------|-------------------|--------------------| +// | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns | +// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | +// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | + /// Steps one node into the trie assuming all branch nodes are binary search and that /// there are no span nodes. /// diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 04e79df7f64..4b79611a685 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -2,7 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::reader::*; +use crate::reader; use core::borrow::Borrow; @@ -236,7 +236,7 @@ impl ZeroTrieExtendedCapacity { } macro_rules! impl_zerotrie_subtype { - ($name:ident, $getter_fn:path, $iter_ty:ty, $iter_fn:path, $cnv_fn:path) => { + ($name:ident, $iter_ty:ty, $iter_fn:path, $cnv_fn:path) => { impl $name { /// Create a trie directly from a store. /// @@ -274,10 +274,9 @@ macro_rules! impl_zerotrie_subtype { Store: AsRef<[u8]> + ?Sized, { /// Queries the trie for a string. - #[inline] pub fn get(&self, key: K) -> Option where K: AsRef<[u8]> { // TODO: Should this be AsRef or Borrow? - $getter_fn(self.store.as_ref(), key.as_ref()) + reader::get_parameterized::(self.store.as_ref(), key.as_ref()) } /// Returns `true` if the trie is empty. #[inline] @@ -387,9 +386,10 @@ macro_rules! impl_zerotrie_subtype { #[cfg(feature = "alloc")] impl $name> { pub(crate) fn try_from_tuple_slice(items: &[(&ByteStr, usize)]) -> Result { + use crate::options::ZeroTrieWithOptions; ZeroTrieBuilder::>::from_sorted_tuple_slice( items, - Self::BUILDER_OPTIONS, + Self::OPTIONS, ) .map(|s| Self { store: s.to_bytes(), @@ -402,10 +402,11 @@ macro_rules! impl_zerotrie_subtype { K: AsRef<[u8]> { fn from_iter>(iter: T) -> Self { + use crate::options::ZeroTrieWithOptions; use crate::builder::nonconst::ZeroTrieBuilder; ZeroTrieBuilder::>::from_bytes_iter( iter, - Self::BUILDER_OPTIONS + Self::OPTIONS ) .map(|s| Self { store: s.to_bytes(), @@ -624,30 +625,26 @@ fn string_to_box_u8(input: String) -> Box<[u8]> { impl_zerotrie_subtype!( ZeroTrieSimpleAscii, - get_ascii_bsearch_only, String, - get_iter_ascii_or_panic, + reader::get_iter_ascii_or_panic, string_to_box_u8 ); impl_zerotrie_subtype!( ZeroAsciiIgnoreCaseTrie, - get_ascii_bsearch_only_ignore_case, String, - get_iter_ascii_or_panic, + reader::get_iter_ascii_or_panic, string_to_box_u8 ); impl_zerotrie_subtype!( ZeroTriePerfectHash, - get_phf_limited, Vec, - get_iter_phf, + reader::get_iter_phf, Vec::into_boxed_slice ); impl_zerotrie_subtype!( ZeroTrieExtendedCapacity, - get_phf_extended, Vec, - get_iter_phf, + reader::get_iter_phf, Vec::into_boxed_slice );