diff --git a/zebra-state/src/service/finalized_state/disk_db.rs b/zebra-state/src/service/finalized_state/disk_db.rs index 3772fb7a789..c042ea6db0d 100644 --- a/zebra-state/src/service/finalized_state/disk_db.rs +++ b/zebra-state/src/service/finalized_state/disk_db.rs @@ -21,6 +21,7 @@ use std::{ use itertools::Itertools; use rlimit::increase_nofile_limit; +use rocksdb::ReadOptions; use zebra_chain::parameters::Network; use crate::{ @@ -194,7 +195,7 @@ pub trait ReadDisk { fn zs_first_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk; /// Returns the highest key in `cf`, and the corresponding value. @@ -203,7 +204,7 @@ pub trait ReadDisk { fn zs_last_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk; /// Returns the first key greater than or equal to `lower_bound` in `cf`, @@ -321,34 +322,22 @@ impl ReadDisk for DiskDb { fn zs_first_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk, { // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, rocksdb::IteratorMode::Start) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, .., false).next() } #[allow(clippy::unwrap_in_result)] fn zs_last_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk, { // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, rocksdb::IteratorMode::End) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, .., true).next() } #[allow(clippy::unwrap_in_result)] @@ -358,17 +347,8 @@ impl ReadDisk for DiskDb { K: IntoDisk + FromDisk, V: FromDisk, { - let lower_bound = lower_bound.as_bytes(); - let from = rocksdb::IteratorMode::From(lower_bound.as_ref(), rocksdb::Direction::Forward); - // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, from) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, lower_bound.., false).next() } #[allow(clippy::unwrap_in_result)] @@ -378,17 +358,8 @@ impl ReadDisk for DiskDb { K: IntoDisk + FromDisk, V: FromDisk, { - let upper_bound = upper_bound.as_bytes(); - let from = rocksdb::IteratorMode::From(upper_bound.as_ref(), rocksdb::Direction::Reverse); - // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, from) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, ..=upper_bound, true).next() } fn zs_items_in_range_ordered(&self, cf: &C, range: R) -> BTreeMap @@ -398,7 +369,7 @@ impl ReadDisk for DiskDb { V: FromDisk, R: RangeBounds, { - self.zs_range_iter(cf, range).collect() + self.zs_range_iter(cf, range, false).collect() } fn zs_items_in_range_unordered(&self, cf: &C, range: R) -> HashMap @@ -408,7 +379,7 @@ impl ReadDisk for DiskDb { V: FromDisk, R: RangeBounds, { - self.zs_range_iter(cf, range).collect() + self.zs_range_iter(cf, range, false).collect() } } @@ -430,15 +401,24 @@ impl DiskWriteBatch { impl DiskDb { /// Returns an iterator over the items in `cf` in `range`. /// + /// Accepts a `reverse` argument. If it is `true`, creates the iterator with an + /// [`IteratorMode`](rocksdb::IteratorMode) of [`End`](rocksdb::IteratorMode::End), or + /// [`From`](rocksdb::IteratorMode::From) with [`Direction::Reverse`](rocksdb::Direction::Reverse). + /// /// Holding this iterator open might delay block commit transactions. - pub fn zs_range_iter(&self, cf: &C, range: R) -> impl Iterator + '_ + pub fn zs_range_iter( + &self, + cf: &C, + range: R, + reverse: bool, + ) -> impl Iterator + '_ where C: rocksdb::AsColumnFamilyRef, K: IntoDisk + FromDisk, V: FromDisk, R: RangeBounds, { - self.zs_range_iter_with_direction(cf, range, false) + self.zs_range_iter_with_direction(cf, range, reverse) } /// Returns a reverse iterator over the items in `cf` in `range`. @@ -495,11 +475,12 @@ impl DiskDb { let range = (start_bound, end_bound); let mode = Self::zs_iter_mode(&range, reverse); + let opts = Self::zs_iter_opts(&range); // Reading multiple items from iterators has caused database hangs, // in previous RocksDB versions self.db - .iterator_cf(cf, mode) + .iterator_cf_opt(cf, opts, mode) .map(|result| result.expect("unexpected database failure")) .map(|(key, value)| (key.to_vec(), value)) // Skip excluded "from" bound and empty ranges. The `mode` already skips keys @@ -514,6 +495,64 @@ impl DiskDb { .map(|(key, value)| (K::from_bytes(key), V::from_bytes(value))) } + /// Returns the RocksDB ReadOptions with a lower and upper bound for a range. + fn zs_iter_opts(range: &R) -> ReadOptions + where + R: RangeBounds>, + { + let mut opts = ReadOptions::default(); + let (lower_bound, upper_bound) = Self::zs_iter_bounds(range); + + if let Some(bound) = lower_bound { + opts.set_iterate_lower_bound(bound); + }; + + if let Some(bound) = upper_bound { + opts.set_iterate_upper_bound(bound); + }; + + opts + } + + /// Returns a lower and upper iterate bounds for a range. + /// + /// Note: Since upper iterate bounds are always exclusive in RocksDB, this method + /// will increment the upper bound by 1 if the end bound of the provided range + /// is inclusive. + fn zs_iter_bounds(range: &R) -> (Option>, Option>) + where + R: RangeBounds>, + { + use std::ops::Bound::*; + + let lower_bound = match range.start_bound() { + Included(bound) | Excluded(bound) => Some(bound.clone()), + Unbounded => None, + }; + + let upper_bound = match range.end_bound().cloned() { + Included(mut bound) => { + // Increment the last byte in the upper bound that is less than u8::MAX, and + // clear any bytes after it to increment the next key in lexicographic order + // (next big-endian number) this Vec represents to RocksDB. + let is_wrapped_overflow = bound.iter_mut().rev().all(|v| { + *v = v.wrapping_add(1); + v == &0 + }); + + if is_wrapped_overflow { + bound.insert(0, 0x01) + } + + Some(bound) + } + Excluded(bound) => Some(bound), + Unbounded => None, + }; + + (lower_bound, upper_bound) + } + /// Returns the RocksDB iterator "from" mode for `range`. /// /// RocksDB iterators are ordered by increasing key bytes by default. diff --git a/zebra-state/src/service/finalized_state/disk_db/tests.rs b/zebra-state/src/service/finalized_state/disk_db/tests.rs index 17613e8b3b5..20fecbbf127 100644 --- a/zebra-state/src/service/finalized_state/disk_db/tests.rs +++ b/zebra-state/src/service/finalized_state/disk_db/tests.rs @@ -24,3 +24,50 @@ impl DiskDb { rocksdb::DB::list_cf(&opts, path) } } + +/// Check that zs_iter_opts returns an upper bound one greater than provided inclusive end bounds. +#[test] +fn zs_iter_opts_increments_key_by_one() { + let _init_guard = zebra_test::init(); + + // TODO: add an empty key (`()` type or `[]` when serialized) test case + let keys: [u32; 14] = [ + 0, + 1, + 200, + 255, + 256, + 257, + 65535, + 65536, + 65537, + 16777215, + 16777216, + 16777217, + 16777218, + u32::MAX, + ]; + + for key in keys { + let (_, bytes) = DiskDb::zs_iter_bounds(&..=key.to_be_bytes().to_vec()); + let mut extra_bytes = bytes.expect("there should be an upper bound"); + let bytes = extra_bytes.split_off(extra_bytes.len() - 4); + let upper_bound = u32::from_be_bytes(bytes.clone().try_into().expect("should be 4 bytes")); + let expected_upper_bound = key.wrapping_add(1); + + assert_eq!( + expected_upper_bound, upper_bound, + "the upper bound should be 1 greater than the original key" + ); + + if expected_upper_bound == 0 { + assert_eq!( + extra_bytes, + vec![1], + "there should be an extra byte with a value of 1" + ); + } else { + assert_eq!(extra_bytes.len(), 0, "there should be no extra bytes"); + } + } +} diff --git a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs index 58d9e43a6c1..77992a1128f 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs @@ -209,7 +209,7 @@ impl ZebraDb { R: std::ops::RangeBounds, { let sapling_trees = self.db.cf_handle("sapling_note_commitment_tree").unwrap(); - self.db.zs_range_iter(&sapling_trees, range) + self.db.zs_range_iter(&sapling_trees, range, false) } /// Returns the Sapling note commitment trees in the reversed range, in decreasing height order. @@ -282,7 +282,7 @@ impl ZebraDb { if let Some(exclusive_end_bound) = exclusive_end_bound { list = self .db - .zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound) + .zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound, false) .collect(); } else { // If there is no end bound, just return all the trees. @@ -291,7 +291,7 @@ impl ZebraDb { // the trees run out.) list = self .db - .zs_range_iter(&sapling_subtrees, start_index..) + .zs_range_iter(&sapling_subtrees, start_index.., false) .collect(); } @@ -382,7 +382,7 @@ impl ZebraDb { R: std::ops::RangeBounds, { let orchard_trees = self.db.cf_handle("orchard_note_commitment_tree").unwrap(); - self.db.zs_range_iter(&orchard_trees, range) + self.db.zs_range_iter(&orchard_trees, range, false) } /// Returns the Orchard note commitment trees in the reversed range, in decreasing height order. @@ -455,7 +455,7 @@ impl ZebraDb { if let Some(exclusive_end_bound) = exclusive_end_bound { list = self .db - .zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound) + .zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound, false) .collect(); } else { // If there is no end bound, just return all the trees. @@ -464,7 +464,7 @@ impl ZebraDb { // the trees run out.) list = self .db - .zs_range_iter(&orchard_subtrees, start_index..) + .zs_range_iter(&orchard_subtrees, start_index.., false) .collect(); }