Skip to content

Commit

Permalink
ZeroTrie feedback and refactoring (#4553)
Browse files Browse the repository at this point in the history
Follow-ups to #4549
  • Loading branch information
sffc authored Jan 30, 2024
1 parent 7a7e93c commit aeef4f6
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 287 deletions.
62 changes: 17 additions & 45 deletions experimental/zerotrie/src/builder/nonconst/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,11 @@ use super::store::TrieBuilderStore;
use crate::builder::bytestr::ByteStr;
use crate::byte_phf::PerfectByteHashMapCacheOwned;
use crate::error::Error;
use crate::options::*;
use crate::varint;
use alloc::borrow::Cow;
use alloc::vec::Vec;

/// Whether to use the perfect hash function in the ZeroTrie.
pub enum PhfMode {
/// Use binary search for all branch nodes.
BinaryOnly,
/// Use the perfect hash function for large branch nodes.
UsePhf,
}

/// Whether to support non-ASCII data in the ZeroTrie.
pub enum AsciiMode {
/// Support only ASCII, returning an error if non-ASCII is found.
AsciiOnly,
/// Support all data, creating span nodes for non-ASCII bytes.
BinarySpans,
}

/// Whether to enforce a limit to the capacity of the ZeroTrie.
pub enum CapacityMode {
/// Return an error if the trie requires a branch of more than 2^32 bytes.
Normal,
/// Construct the trie without returning an error.
Extended,
}

/// Whether to permit strings that have inconsistent ASCII case at a node, such as "abc" and "Abc"
pub enum MixedCaseMode {
/// Allows strings regardless of case.
Allow,
/// Returns an error if a node exists with the same character in ambiguous case.
Reject,
}

pub struct ZeroTrieBuilderOptions {
pub phf_mode: PhfMode,
pub ascii_mode: AsciiMode,
pub capacity_mode: CapacityMode,
pub mixed_case_mode: MixedCaseMode,
}

/// A low-level builder for ZeroTrie. Supports all options.
pub(crate) struct ZeroTrieBuilder<S> {
data: S,
Expand Down Expand Up @@ -144,7 +106,7 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
items.sort_by(|a, b| cmp_keys_values(&options, *a, *b));
let ascii_str_slice = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice);
Self::from_sorted_tuple_slice(byte_str_slice, options)
Self::from_sorted_tuple_slice_impl(byte_str_slice, options)
}

/// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted,
Expand All @@ -158,12 +120,20 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
options: ZeroTrieBuilderOptions,
) -> Result<Self, Error> {
let mut items = Cow::Borrowed(items);
if matches!(options.mixed_case_mode, MixedCaseMode::Reject) {
if matches!(options.case_sensitivity, CaseSensitivity::IgnoreCase) {
// We need to re-sort the items with our custom comparator.
items.to_mut().sort_by(|a, b| {
cmp_keys_values(&options, (a.0.as_bytes(), a.1), (b.0.as_bytes(), b.1))
});
}
Self::from_sorted_tuple_slice_impl(&items, options)
}

/// Internal constructor that does not re-sort the items.
fn from_sorted_tuple_slice_impl(
items: &[(&ByteStr, usize)],
options: ZeroTrieBuilderOptions,
) -> Result<Self, Error> {
for ab in items.windows(2) {
debug_assert!(cmp_keys_values(
&options,
Expand All @@ -177,7 +147,7 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
phf_cache: PerfectByteHashMapCacheOwned::new_empty(),
options,
};
let total_size = result.create(&items)?;
let total_size = result.create(items)?;
debug_assert!(total_size == result.data.atbs_len());
Ok(result)
}
Expand Down Expand Up @@ -267,7 +237,9 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
if ascii_i == key_ascii && ascii_j == key_ascii {
let len = self.prepend_ascii(key_ascii)?;
current_len += len;
if matches!(self.options.mixed_case_mode, MixedCaseMode::Reject) && i == new_i + 2 {
if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase)
&& i == new_i + 2
{
// This can happen if two strings were picked up, each with a different case
return Err(Error::MixedCase);
}
Expand Down Expand Up @@ -324,7 +296,7 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
};
let mut branch_metas = lengths_stack.pop_many_or_panic(total_count);
let original_keys = branch_metas.map_to_ascii_bytes();
if matches!(self.options.mixed_case_mode, MixedCaseMode::Reject) {
if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase) {
// Check to see if we have the same letter in two different cases
let mut seen_ascii_alpha = [false; 26];
for c in original_keys.as_const_slice().as_slice() {
Expand Down Expand Up @@ -435,7 +407,7 @@ fn cmp_keys_values(
a: (&[u8], usize),
b: (&[u8], usize),
) -> Ordering {
if matches!(options.mixed_case_mode, MixedCaseMode::Allow) {
if matches!(options.case_sensitivity, CaseSensitivity::Sensitive) {
a.0.cmp(b.0)
} else {
let a_iter = a.0.iter().map(|x| x.to_ascii_lowercase());
Expand Down
36 changes: 0 additions & 36 deletions experimental/zerotrie/src/builder/nonconst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,3 @@ mod store;

pub(crate) use builder::*;
pub(crate) use store::TrieBuilderStore;

impl<S: ?Sized> crate::ZeroTrieSimpleAscii<S> {
pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::BinaryOnly,
ascii_mode: AsciiMode::AsciiOnly,
capacity_mode: CapacityMode::Normal,
mixed_case_mode: MixedCaseMode::Allow,
};
}

impl<S: ?Sized> crate::ZeroAsciiIgnoreCaseTrie<S> {
pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::BinaryOnly,
ascii_mode: AsciiMode::AsciiOnly,
capacity_mode: CapacityMode::Normal,
mixed_case_mode: MixedCaseMode::Reject,
};
}

impl<S: ?Sized> crate::ZeroTriePerfectHash<S> {
pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::UsePhf,
ascii_mode: AsciiMode::BinarySpans,
capacity_mode: CapacityMode::Normal,
mixed_case_mode: MixedCaseMode::Allow,
};
}

impl<S: ?Sized> crate::ZeroTrieExtendedCapacity<S> {
pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::UsePhf,
ascii_mode: AsciiMode::BinarySpans,
capacity_mode: CapacityMode::Extended,
mixed_case_mode: MixedCaseMode::Allow,
};
}
1 change: 1 addition & 0 deletions experimental/zerotrie/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ mod cursor;
mod error;
#[macro_use]
mod helpers;
mod options;
mod reader;
#[cfg(feature = "serde")]
mod serde;
Expand Down
93 changes: 93 additions & 0 deletions experimental/zerotrie/src/options.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Options for building and reading from a ZeroTrie.
//!
//! These options are internal to the crate. A small selection of options
//! are exported by way of the different public types on this crate.
/// Whether to use the perfect hash function in the ZeroTrie.
pub(crate) enum PhfMode {
/// Use binary search for all branch nodes.
BinaryOnly,
/// Use the perfect hash function for large branch nodes.
UsePhf,
}

/// Whether to support non-ASCII data in the ZeroTrie.
pub(crate) enum AsciiMode {
/// Support only ASCII, returning an error if non-ASCII is found.
AsciiOnly,
/// Support all data, creating span nodes for non-ASCII bytes.
BinarySpans,
}

/// Whether to enforce a limit to the capacity of the ZeroTrie.
pub(crate) enum CapacityMode {
/// Return an error if the trie requires a branch of more than 2^32 bytes.
Normal,
/// Construct the trie without returning an error.
Extended,
}

/// How to handle strings with mixed ASCII case at a node, such as "abc" and "Abc"
pub(crate) enum CaseSensitivity {
/// Allow all strings and sort them by byte value.
Sensitive,
/// Reject strings with different case and sort them as if `to_ascii_lowercase` is called.
IgnoreCase,
}

pub(crate) struct ZeroTrieBuilderOptions {
pub phf_mode: PhfMode,
pub ascii_mode: AsciiMode,
pub capacity_mode: CapacityMode,
pub case_sensitivity: CaseSensitivity,
}

pub(crate) trait ZeroTrieWithOptions {
const OPTIONS: ZeroTrieBuilderOptions;
}

/// All branch nodes are binary search
/// and there are no span nodes.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii<S> {
const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::BinaryOnly,
ascii_mode: AsciiMode::AsciiOnly,
capacity_mode: CapacityMode::Normal,
case_sensitivity: CaseSensitivity::Sensitive,
};
}

/// All branch nodes are binary search
/// and nodes use case-insensitive matching.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie<S> {
const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::BinaryOnly,
ascii_mode: AsciiMode::AsciiOnly,
capacity_mode: CapacityMode::Normal,
case_sensitivity: CaseSensitivity::IgnoreCase,
};
}

/// Branch nodes could be either binary search or PHF.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTriePerfectHash<S> {
const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::UsePhf,
ascii_mode: AsciiMode::BinarySpans,
capacity_mode: CapacityMode::Normal,
case_sensitivity: CaseSensitivity::Sensitive,
};
}

/// No limited capacity assertion.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity<S> {
const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
phf_mode: PhfMode::UsePhf,
ascii_mode: AsciiMode::BinarySpans,
capacity_mode: CapacityMode::Extended,
case_sensitivity: CaseSensitivity::Sensitive,
};
}
Loading

0 comments on commit aeef4f6

Please sign in to comment.