From f42fad31883288a5fbfb99d015ae8e56fb21231a Mon Sep 17 00:00:00 2001
From: Little-Wallace
Date: Fri, 25 Mar 2022 16:25:24 +0800
Subject: [PATCH 01/18] copy files

Signed-off-by: Little-Wallace
---
 CMakeLists.txt            |   1 +
 memtable/art.h            | 264 ++++++++++++++++++++++++++++++++++++
 memtable/art_inner_node.h | 277 +++++++++++++++++++++++++++++++++++++
 memtable/art_leaf_node.h  |  26 ++++
 memtable/art_node.h       |  58 ++++++++
 memtable/art_node_16.h    | 150 ++++++++++++++++++++
 memtable/art_node_256.h   | 100 ++++++++++++++
 memtable/art_node_4.h     | 126 +++++++++++++++++
 memtable/art_node_48.h    | 139 +++++++++++++++++++
 memtable/art_tree_it.h    | 206 ++++++++++++++++++++++++++++
 memtable/artrep.cc        | 278 ++++++++++++++++++++++++++++++++++++++
 11 files changed, 1625 insertions(+)
 create mode 100644 memtable/art.h
 create mode 100644 memtable/art_inner_node.h
 create mode 100644 memtable/art_leaf_node.h
 create mode 100644 memtable/art_node.h
 create mode 100644 memtable/art_node_16.h
 create mode 100644 memtable/art_node_256.h
 create mode 100644 memtable/art_node_4.h
 create mode 100644 memtable/art_node_48.h
 create mode 100644 memtable/art_tree_it.h
 create mode 100644 memtable/artrep.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b66f0dfaee9..606cbe5b77f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -576,6 +576,7 @@ set(SOURCES
   memtable/hash_linklist_rep.cc
   memtable/hash_skiplist_rep.cc
   memtable/skiplistrep.cc
+  memtable/artrep.cc
   memtable/vectorrep.cc
   memtable/write_buffer_manager.cc
   monitoring/histogram.cc
diff --git a/memtable/art.h b/memtable/art.h
new file mode 100644
index 00000000000..d57008186fc
--- /dev/null
+++ b/memtable/art.h
@@ -0,0 +1,264 @@
+/**
+ * @file adaptive radix tree
+ * @author Rafael Kallis
+ */
+
+#pragma once
+
+#include "memtable/art_leaf_node.h"
+#include "memtable/art_inner_node.h"
+#include "memtable/art_node.h"
+#include "memtable/art_node_4.h"
+#include "memtable/art_tree_it.h"
+#include <algorithm>
+#include <cstring>
+#include <stack>
+
+namespace rocksdb {
+
+class AdaptiveRadixTree {
+public:
+  ~AdaptiveRadixTree();
+
+  /**
+   * Finds the value associated with the given key.
+   *
+   * @param key - The key to find.
+   * @return the value associated with the key or a nullptr.
+   */
+  char* get(const char *key) const;
+
+  /**
+   * Associates the given key with the given value.
+   * If another value is already associated with the given key,
+   * it is replaced and the previous value is returned to the caller,
+   * since the method consumer is the resource owner.
+   *
+   * @param key - The key to associate with the value.
+   * @param value - The value to be associated with the key.
+   * @return nullptr if no other value was associated with the key, or the
+   * previously associated value.
+   */
+  char *set(const char *key, char *value);
+
+  /**
+   * Deletes the given key and returns its associated value.
+   * The associated value is returned,
+   * since the method consumer is the resource owner.
+   * If no value is associated with the given key, nullptr is returned.
+   *
+   * @param key - The key to delete.
+   * @return the value associated with the key, or nullptr otherwise.
+   */
+  char* del(const char *key);
+
+  /**
+   * Forward iterator that traverses the tree in lexicographic order.
+   */
+  TreeIter begin();
+
+  /**
+   * Forward iterator that traverses the tree in lexicographic order starting
+   * from the provided key.
+   */
+  TreeIter begin(const char *key);
+
+  /**
+   * Iterator to the end of the lexicographic order.
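+   *
+   * Illustrative usage (hypothetical caller-owned value buffers v1, v2):
+   *
+   *   AdaptiveRadixTree tree;
+   *   tree.set("aa", v1);
+   *   tree.set("ab", v2);
+   *   for (TreeIter it = tree.begin(), e = tree.end(); it != e; ++it) {
+   *     char *value = *it;  // visits v1, then v2 (lexicographic key order)
+   *   }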
+   */
+  TreeIter end();
+
+private:
+  Node *root_ = nullptr;
+};
+
+AdaptiveRadixTree::~AdaptiveRadixTree() {
+  if (root_ == nullptr) {
+    return;
+  }
+  std::stack<Node *> node_stack;
+  node_stack.push(root_);
+  Node *cur;
+  InnerNode *cur_inner;
+  ChildIter it, it_end;
+  while (!node_stack.empty()) {
+    cur = node_stack.top();
+    node_stack.pop();
+    if (!cur->is_leaf()) {
+      cur_inner = static_cast<InnerNode *>(cur);
+      for (it = cur_inner->begin(), it_end = cur_inner->end(); it != it_end; ++it) {
+        node_stack.push(*cur_inner->find_child(*it));
+      }
+    }
+    if (cur->prefix_ != nullptr) {
+      delete[] cur->prefix_;
+    }
+    delete cur;
+  }
+}
+
+char* AdaptiveRadixTree::get(const char *key) const {
+  Node *cur = root_, **child;
+  int depth = 0, key_len = std::strlen(key) + 1;
+  while (cur != nullptr) {
+    if (cur->prefix_len_ != cur->check_prefix(key + depth, key_len - depth)) {
+      /* prefix mismatch */
+      return nullptr;
+    }
+    if (cur->prefix_len_ == key_len - depth) {
+      /* exact match */
+      return cur->is_leaf() ? static_cast<LeafNode *>(cur)->value_ : nullptr;
+    }
+    child = static_cast<InnerNode *>(cur)->find_child(key[depth + cur->prefix_len_]);
+    depth += (cur->prefix_len_ + 1);
+    cur = child != nullptr ? *child : nullptr;
+  }
+  return nullptr;
+}
+
+char* AdaptiveRadixTree::set(const char *key, char* value) {
+  int key_len = std::strlen(key) + 1, depth = 0, prefix_match_len;
+  if (root_ == nullptr) {
+    root_ = new LeafNode(value);
+    root_->prefix_ = new char[key_len];
+    std::copy(key, key + key_len, root_->prefix_);
+    root_->prefix_len_ = key_len;
+    return nullptr;
+  }
+
+  Node **cur = &root_, **child;
+  InnerNode **cur_inner;
+  char child_partial_key;
+  bool is_prefix_match;
+
+  while (true) {
+    /* number of bytes of the current node's prefix that match the key */
+    prefix_match_len = (**cur).check_prefix(key + depth, key_len - depth);
+
+    /* true if the current node's prefix matches with a part of the key */
+    is_prefix_match = (std::min<int>((**cur).prefix_len_, key_len - depth)) ==
+                      prefix_match_len;
+
+    if (is_prefix_match && (**cur).prefix_len_ == key_len - depth) {
+      /* exact match:
+       * => "replace"
+       * => replace value of current node.
+       * => return old value to caller to handle.
+       *        _                             _
+       *        |                             |
+       *       (aa)                          (aa)
+       *    a /    \ b     +[aaaaa,v3]    a /    \ b
+       *     /      \      ==========>     /      \
+       * *(aa)->v1  ()->v2             *(aa)->v3  ()->v2
+       *
+       */
+
+      /* cur must be a leaf */
+      auto cur_leaf = static_cast<LeafNode *>(*cur);
+      char *old_value = cur_leaf->value_;
+      cur_leaf->value_ = value;
+      return old_value;
+    }
+
+    if (!is_prefix_match) {
+      /* prefix mismatch:
+       * => new parent node with common prefix and no associated value.
+       * => new node with value to insert.
+       * => current and new node become children of new parent node.
+       *
+       *       |                        |
+       *     *(aa)                    +(a)->Ø
+       *    a /    \ b   +[ab,v3]   a /    \ b
+       *     /      \    =======>    /      \
+       * (aa)->v1  ()->v2         *()->Ø   +()->v3
+       *                         a /   \ b
+       *                          /     \
+       *                      (aa)->v1  ()->v2
+       *                       /|\       /|\
+       */
+
+      auto new_parent = new Node4();
+      new_parent->prefix_ = new char[prefix_match_len];
+      std::copy((**cur).prefix_, (**cur).prefix_ + prefix_match_len,
+                new_parent->prefix_);
+      new_parent->prefix_len_ = prefix_match_len;
+      new_parent->set_child((**cur).prefix_[prefix_match_len], *cur);
+
+      // TODO(rafaelkallis): shrink?
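+      /* Illustrative trace (hypothetical keys): if the current node's
+       * prefix is "aa" and the remaining key is "ab", prefix_match_len is 1,
+       * so the new parent keeps prefix "a"; the old node is re-registered
+       * under partial key 'a' (byte 1 of its old prefix) and the new leaf
+       * under partial key 'b' (byte 1 of the remaining key). */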
+      /* memmove((**cur).prefix_, (**cur).prefix_ + prefix_match_len + 1, */
+      /*         (**cur).prefix_len_ - prefix_match_len - 1); */
+      /* (**cur).prefix_len_ -= prefix_match_len + 1; */
+
+      auto old_prefix = (**cur).prefix_;
+      auto old_prefix_len = (**cur).prefix_len_;
+      (**cur).prefix_ = new char[old_prefix_len - prefix_match_len - 1];
+      (**cur).prefix_len_ = old_prefix_len - prefix_match_len - 1;
+      std::copy(old_prefix + prefix_match_len + 1, old_prefix + old_prefix_len,
+                (**cur).prefix_);
+      delete[] old_prefix;
+
+      auto new_node = new LeafNode(value);
+      new_node->prefix_ = new char[key_len - depth - prefix_match_len - 1];
+      std::copy(key + depth + prefix_match_len + 1, key + key_len,
+                new_node->prefix_);
+      new_node->prefix_len_ = key_len - depth - prefix_match_len - 1;
+      new_parent->set_child(key[depth + prefix_match_len], new_node);
+
+      *cur = new_parent;
+      return nullptr;
+    }
+
+    /* must be inner node */
+    cur_inner = reinterpret_cast<InnerNode **>(cur);
+    child_partial_key = key[depth + (**cur).prefix_len_];
+    child = (**cur_inner).find_child(child_partial_key);
+
+    if (child == nullptr) {
+      /*
+       * no child associated with the next partial key.
+       * => create new node with value to insert.
+       * => new node becomes current node's child.
+       *
+       *      *(aa)->Ø              *(aa)->Ø
+       *    a /        +[aab,v2]  a /    \ b
+       *     /         ========>   /      \
+       *  (a)->v1               (a)->v1  +()->v2
+       */
+
+      if ((**cur_inner).is_full()) {
+        *cur_inner = (**cur_inner).grow();
+      }
+
+      auto new_node = new LeafNode(value);
+      new_node->prefix_ = new char[key_len - depth - (**cur).prefix_len_ - 1];
+      std::copy(key + depth + (**cur).prefix_len_ + 1, key + key_len,
+                new_node->prefix_);
+      new_node->prefix_len_ = key_len - depth - (**cur).prefix_len_ - 1;
+      (**cur_inner).set_child(child_partial_key, new_node);
+      return nullptr;
+    }
+
+    /* propagate down and repeat:
+     *
+     *     *(aa)->Ø                  (aa)->Ø
+     *   a /    \ b   +[aaba,v3]   a /    \ b        repeat
+     *    /      \    =========>    /      \        ========>  ...
+     * (a)->v1  ()->v2           (a)->v1  *()->v2
+     */
+
+    depth += (**cur).prefix_len_ + 1;
+    cur = child;
+  }
+}
+
+
+TreeIter AdaptiveRadixTree::begin() {
+  return TreeIter::min(this->root_);
+}
+
+TreeIter AdaptiveRadixTree::begin(const char *key) {
+  return TreeIter::greater_equal(this->root_, key);
+}
+
+TreeIter AdaptiveRadixTree::end() { return TreeIter(); }
+
+} // namespace rocksdb
diff --git a/memtable/art_inner_node.h b/memtable/art_inner_node.h
new file mode 100644
index 00000000000..167d8040d83
--- /dev/null
+++ b/memtable/art_inner_node.h
@@ -0,0 +1,277 @@
+/**
+ * @file InnerNode header
+ * @author Rafael Kallis
+ */
+
+#pragma once
+
+#include "memtable/art_leaf_node.h"
+#include "memtable/art_node.h"
+#include <cassert>
+#include <iterator>
+#include <stdexcept>
+
+namespace rocksdb {
+
+class ChildIter;
+
+class InnerNode : public Node {
+public:
+  virtual ~InnerNode() = default;
+
+  InnerNode() = default;
+  InnerNode(const InnerNode &other) = default;
+  InnerNode(InnerNode &&other) noexcept = default;
+  InnerNode &operator=(const InnerNode &other) = default;
+  InnerNode &operator=(InnerNode &&other) noexcept = default;
+
+  bool is_leaf() const override;
+
+  /**
+   * Finds and returns the child Node identified by the given partial key.
+   *
+   * @param partial_key - The partial key associated with the child.
+   * @return Child Node identified by the given partial key or
+   * a null pointer if no child Node is associated with the partial key.
+   */
+  virtual Node **find_child(char partial_key) = 0;
+
+  /**
+   * Adds the given Node to the Node's children.
+   * No bounds checking is done.
+ * If a child already exists under the given partial key, the child + * is overwritten without deleting it. + * + * @pre Node should not be full. + * @param partial_key - The partial key associated with the child. + * @param child - The child Node. + */ + virtual void set_child(char partial_key, Node *child) = 0; + + /** + * Deletes the child associated with the given partial key. + * + * @param partial_key - The partial key associated with the child. + */ + virtual Node *del_child(char partial_key) = 0; + + /** + * Creates and returns a new Node with bigger children capacity. + * The current Node gets deleted. + * + * @return Node with bigger capacity + */ + virtual InnerNode *grow() = 0; + + /** + * Determines if the Node is full, i.e. can carry no more child Nodes. + */ + virtual bool is_full() const = 0; + + /** + * Determines if the Node is underfull, i.e. carries less child Nodes than + * intended. + */ + virtual bool is_underfull() const = 0; + + virtual int n_children() const = 0; + + virtual char next_partial_key(char partial_key) const = 0; + + virtual char prev_partial_key(char partial_key) const = 0; + + /** + * Iterator on the first child Node. + * + * @return Iterator on the first child Node. + */ + ChildIter begin(); + std::reverse_iterator rbegin(); + + /** + * Iterator on after the last child Node. + * + * @return Iterator on after the last child Node. + */ + ChildIter end(); + std::reverse_iterator rend(); +}; + +bool InnerNode::is_leaf() const { return false; } + +class ChildIter { + public: + ChildIter() = default; + ChildIter(const ChildIter &other) = default; + ChildIter(ChildIter &&other) noexcept = default; + ChildIter &operator=(const ChildIter &other) = default; + ChildIter &operator=(ChildIter &&other) noexcept = default; + + explicit ChildIter(InnerNode *n); + ChildIter(InnerNode *n, int relative_index); + + using iterator_category = std::bidirectional_iterator_tag; + using value_type = const char; + using difference_type = int; + using pointer = value_type *; + using reference = value_type &; + + reference operator*() const; + pointer operator->() const; + ChildIter &operator++(); + ChildIter operator++(int); + ChildIter &operator--(); + ChildIter operator--(int); + bool operator==(const ChildIter &rhs) const; + bool operator!=(const ChildIter &rhs) const; + bool operator<(const ChildIter &rhs) const; + bool operator>(const ChildIter &rhs) const; + bool operator<=(const ChildIter &rhs) const; + bool operator>=(const ChildIter &rhs) const; + + char get_partial_key() const; + Node *get_child_node() const; + + private: + InnerNode *node_ = nullptr; + char cur_partial_key_ = -128; + int relative_index_ = 0; +}; + +ChildIter::ChildIter(InnerNode *n) : ChildIter(n, 0) {} + + +ChildIter::ChildIter(InnerNode *n, int relative_index) + : node_(n), cur_partial_key_(0), relative_index_(relative_index) { + if (relative_index_ < 0) { + /* relative_index is out of bounds, no seek */ + return; + } + + if (relative_index_ >= node_->n_children()) { + /* relative_index is out of bounds, no seek */ + return; + } + + if (relative_index_ == node_->n_children() - 1) { + cur_partial_key_ = node_->prev_partial_key(127); + return; + } + + cur_partial_key_ = node_->next_partial_key(-128); + for (int i = 0; i < relative_index_; ++i) { + cur_partial_key_ = node_->next_partial_key(cur_partial_key_ + 1); + } +} + + +typename ChildIter::reference ChildIter::operator*() const { + if (relative_index_ < 0 || relative_index_ >= node_->n_children()) { + throw std::out_of_range("child iterator is 
out of range"); + } + + return cur_partial_key_; +} + + +typename ChildIter::pointer ChildIter::operator->() const { + if (relative_index_ < 0 || relative_index_ >= node_->n_children()) { + throw std::out_of_range("child iterator is out of range"); + } + + return &cur_partial_key_; +} + +ChildIter &ChildIter::operator++() { + ++relative_index_; + if (relative_index_ < 0) { + return *this; + } else if (relative_index_ == 0) { + cur_partial_key_ = node_->next_partial_key(-128); + } else if (relative_index_ < node_->n_children()) { + cur_partial_key_ = node_->next_partial_key(cur_partial_key_ + 1); + } + return *this; +} + +ChildIter ChildIter::operator++(int) { + auto old = *this; + operator++(); + return old; +} + +ChildIter &ChildIter::operator--() { + --relative_index_; + if (relative_index_ > node_->n_children() - 1) { + return *this; + } else if (relative_index_ == node_->n_children() - 1) { + cur_partial_key_ = node_->prev_partial_key(127); + } else if (relative_index_ >= 0) { + cur_partial_key_ = node_->prev_partial_key(cur_partial_key_ - 1); + } + return *this; +} + +ChildIter ChildIter::operator--(int) { + auto old = *this; + operator--(); + return old; +} + +bool ChildIter::operator==(const ChildIter &rhs) const { + return node_ == rhs.node_ && relative_index_ == rhs.relative_index_; +} + +bool ChildIter::operator<(const ChildIter &rhs) const { + return node_ == rhs.node_ && relative_index_ < rhs.relative_index_; +} + +bool ChildIter::operator!=(const ChildIter &rhs) const { + return !((*this) == rhs); +} + +bool ChildIter::operator>=(const ChildIter &rhs) const { + return !((*this) < rhs); +} + +bool ChildIter::operator<=(const ChildIter &rhs) const { + return (rhs >= (*this)); +} + +bool ChildIter::operator>(const ChildIter &rhs) const { + return (rhs < (*this)); +} + + +char ChildIter::get_partial_key() const { + return cur_partial_key_; +} + +Node *ChildIter::get_child_node() const { + assert(0 <= relative_index_ && relative_index_ < node_->n_children()); + return *node_->find_child(cur_partial_key_); +} + +ChildIter InnerNode::begin() { + return ChildIter(this); +} + +ChildIter InnerNode::end() { + return ChildIter(this, n_children()); +} + +std::reverse_iterator InnerNode::rbegin() { + return std::reverse_iterator(end()); +} + +std::reverse_iterator InnerNode::rend() { + return std::reverse_iterator(begin()); +} + + + +} // namespace art diff --git a/memtable/art_leaf_node.h b/memtable/art_leaf_node.h new file mode 100644 index 00000000000..ffe7c2aef2c --- /dev/null +++ b/memtable/art_leaf_node.h @@ -0,0 +1,26 @@ +/** + * @file LeafNode header + * @author Rafael Kallis + */ + + +#pragma once + +#include "memtable/art_node.h" + +namespace rocksdb { + +class LeafNode : public Node { +public: + explicit LeafNode(char *value); + bool is_leaf() const override; + + char* value_; +}; + +LeafNode::LeafNode(char *value): value_(value) {} + +bool LeafNode::is_leaf() const { return true; } + +} // namespace rocksdb + diff --git a/memtable/art_node.h b/memtable/art_node.h new file mode 100644 index 00000000000..859cae05e0b --- /dev/null +++ b/memtable/art_node.h @@ -0,0 +1,58 @@ +/** + * @file trie Nodes header. 
+ * @author Rafael Kallis + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +class Node { +public: + virtual ~Node() = default; + + Node() = default; + Node(const Node &other) = default; + Node(Node &&other) noexcept = default; + Node &operator=(const Node &other) = default; + Node &operator=(Node &&other) noexcept = default; + + /** + * Determines if this Node is a leaf Node, i.e., contains a value. + * Needed for downcasting a Node instance to a leaf_Node or inner_Node instance. + */ + virtual bool is_leaf() const = 0; + + /** + * Determines the number of matching bytes between the Node's prefix and the key. + * + * Given a Node with prefix: "abbbd", a key "abbbccc", + * check_prefix returns 4, since byte 4 of the prefix ('d') does not + * match byte 4 of the key ('c'). + * + * key: "abbbccc" + * prefix: "abbbd" + * ^^^^* + * index: 01234 + */ + int check_prefix(const char *key, int key_len) const; + + char *prefix_ = nullptr; + uint16_t prefix_len_ = 0; +}; + +int Node::check_prefix(const char *key, int key_len) const { + key_len = std::min(key_len, (int)prefix_len_); + return std::mismatch(prefix_, prefix_ + key_len, key).second - key; +} + +} // namespace rocksdb + diff --git a/memtable/art_node_16.h b/memtable/art_node_16.h new file mode 100644 index 00000000000..397088053da --- /dev/null +++ b/memtable/art_node_16.h @@ -0,0 +1,150 @@ +/** + * @file Node16 header + * @author Rafael Kallis + */ + +#pragma once + +#include "memtable/art_inner_node.h" +#include "memtable/art_node_48.h" +#include +#include +#include +#include + +#if defined(__i386__) || defined(__amd64__) +#include +#endif + +namespace rocksdb { + + + class Node16 : public InnerNode { +friend class Node48; +public: + Node **find_child(char partial_key) override; + void set_child(char partial_key, Node *child) override; + Node *del_child(char partial_key) override; + InnerNode *grow() override; + InnerNode *shrink() override; + bool is_full() const override; + bool is_underfull() const override; + + char next_partial_key(char partial_key) const override; + + char prev_partial_key(char partial_key) const override; + + int n_children() const override; + + uint8_t n_children_ = 0; +private: + char keys_[16]; + Node *children_[16]; +}; + + Node **Node16::find_child(char partial_key) { +#if defined(__i386__) || defined(__amd64__) + int bitfield = + _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(partial_key), + _mm_loadu_si128((__m128i *)keys_))) & + ((1 << n_children_) - 1); + return (bool)bitfield ? 
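+  // _mm_cmpeq_epi8 compares all 16 key slots at once; bit i of the movemask
+  // is set iff keys_[i] == partial_key, the (1 << n_children_) - 1 mask
+  // discards hits in unused slots, and __builtin_ctz picks the lowest match: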
&children_[__builtin_ctz(bitfield)] : nullptr; +#else + int lo, mid, hi; + lo = 0; + hi = n_children_; + while (lo < hi) { + mid = (lo + hi) / 2; + if (partial_key < keys_[mid]) { + hi = mid; + } else if (partial_key > keys_[mid]) { + lo = mid + 1; + } else { + return &children_[mid]; + } + } + return nullptr; +#endif +} + + +void Node16::set_child(char partial_key, Node *child) { + /* determine index for child */ + int child_i; + for (int i = this->n_children_ - 1;; --i) { + if (i >= 0 && partial_key < this->keys_[i]) { + /* move existing sibling to the right */ + this->keys_[i + 1] = this->keys_[i]; + this->children_[i + 1] = this->children_[i]; + } else { + child_i = i + 1; + break; + } + } + + this->keys_[child_i] = partial_key; + this->children_[child_i] = child; + ++n_children_; +} + + Node *Node16::del_child(char partial_key) { + Node *child_to_delete = nullptr; + for (int i = 0; i < n_children_; ++i) { + if (child_to_delete == nullptr && partial_key == keys_[i]) { + child_to_delete = children_[i]; + } + if (child_to_delete != nullptr) { + /* move existing sibling to the left */ + keys_[i] = i < n_children_ - 1 ? keys_[i + 1] : 0; + children_[i] = i < n_children_ - 1 ? children_[i + 1] : nullptr; + } + } + if (child_to_delete != nullptr) { + --n_children_; + } + return child_to_delete; +} + + InnerNode *Node16::grow() { + auto new_node = new Node48(); + new_node->prefix_ = this->prefix_; + new_node->prefix_len_ = this->prefix_len_; + std::copy(this->children_, this->children_ + this->n_children_, new_node->children_); + for (int i = 0; i < n_children_; ++i) { + new_node->indexes_[(uint8_t) this->keys_[i]] = i; + } + delete this; + return new_node; +} + + + bool Node16::is_full() const { + return n_children_ == 16; +} + + bool Node16::is_underfull() const { + return n_children_ == 4; +} + + char Node16::next_partial_key(char partial_key) const { + for (int i = 0; i < n_children_; ++i) { + if (keys_[i] >= partial_key) { + return keys_[i]; + } + } + throw std::out_of_range("provided partial key does not have a successor"); +} + + char Node16::prev_partial_key(char partial_key) const { + for (int i = n_children_ - 1; i >= 0; --i) { + if (keys_[i] <= partial_key) { + return keys_[i]; + } + } + throw std::out_of_range("provided partial key does not have a predecessor"); +} + + int Node16::n_children() const { return n_children_; } + +} // namespace rocksdb + diff --git a/memtable/art_node_256.h b/memtable/art_node_256.h new file mode 100644 index 00000000000..71fa82f42aa --- /dev/null +++ b/memtable/art_node_256.h @@ -0,0 +1,100 @@ +/** + * @file Node256 header + * @author Rafael Kallis + */ + +#pragma once + +#include "memtable/art_inner_node.h" +#include +#include + +namespace rocksdb { + + + class Node256 : public InnerNode { +public: + Node256(); + + Node **find_child(char partial_key) override; + void set_child(char partial_key, Node *child) override; + Node *del_child(char partial_key) override; + InnerNode *grow() override; + InnerNode *shrink() override; + bool is_full() const override; + bool is_underfull() const override; + + char next_partial_key(char partial_key) const override; + + char prev_partial_key(char partial_key) const override; + + int n_children() const override; + +private: + uint16_t n_children_ = 0; + std::array children_; +}; + + Node256::Node256() { children_.fill(nullptr); } + + Node **Node256::find_child(char partial_key) { + return children_[128 + partial_key] != nullptr ? 
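+  // partial keys are signed chars, so the +128 offset maps the key range
+  // [-128, 127] onto valid array indexes [0, 255]: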
&children_[128 + partial_key] + : nullptr; +} + + +void Node256::set_child(char partial_key, Node *child) { + children_[128 + partial_key] = child; + ++n_children_; +} + + Node *Node256::del_child(char partial_key) { + Node *child_to_delete = children_[128 + partial_key]; + if (child_to_delete != nullptr) { + children_[128 + partial_key] = nullptr; + --n_children_; + } + return child_to_delete; +} + + InnerNode *Node256::grow() { + throw std::runtime_error("Node256 cannot grow"); +} + + + bool Node256::is_full() const { + return n_children_ == 256; +} + + bool Node256::is_underfull() const { + return n_children_ == 48; +} + + char Node256::next_partial_key(char partial_key) const { + while (true) { + if (children_[128 + partial_key] != nullptr) { + return partial_key; + } + if (partial_key == 127) { + throw std::out_of_range("provided partial key does not have a successor"); + } + ++partial_key; + } +} + + char Node256::prev_partial_key(char partial_key) const { + while (true) { + if (children_[128 + partial_key] != nullptr) { + return partial_key; + } + if (partial_key == -128) { + throw std::out_of_range( + "provided partial key does not have a predecessor"); + } + --partial_key; + } +} + + int Node256::n_children() const { return n_children_; } + +} // namespace rocksdb diff --git a/memtable/art_node_4.h b/memtable/art_node_4.h new file mode 100644 index 00000000000..a49298cf3e4 --- /dev/null +++ b/memtable/art_node_4.h @@ -0,0 +1,126 @@ +/** + * @file Node4 header + * @author Rafael Kallis + */ + +#pragma once + +#include "memtable/art_inner_node.h" +#include "memtable/art_node_16.h" +#include +#include +#include +#include +#include + +namespace rocksdb { + +class Node16; + +class Node4 : public InnerNode { + friend class Node16; + +public: + Node **find_child(char partial_key) override; + void set_child(char partial_key, Node *child) override; + Node *del_child(char partial_key) override; + InnerNode *grow() override; + bool is_full() const override; + bool is_underfull() const override; + + char next_partial_key(char partial_key) const override; + + char prev_partial_key(char partial_key) const override; + + int n_children() const override; + +private: + uint8_t n_children_ = 0; + char keys_[4]; + Node *children_[4]; +}; + + Node **Node4::find_child(char partial_key) { + for (int i = 0; i < n_children_; ++i) { + if (keys_[i] == partial_key) { + return &children_[i]; + } + } + return nullptr; +} + + void Node4::set_child(char partial_key, Node *child) { + /* determine index for child */ + int c_i; + for (c_i = 0; c_i < n_children_ && partial_key >= keys_[c_i]; ++c_i) { + } + std::memmove(keys_ + c_i + 1, keys_ + c_i, n_children_ - c_i); + std::memmove(children_ + c_i + 1, children_ + c_i, + (n_children_ - c_i) * sizeof(void *)); + + keys_[c_i] = partial_key; + children_[c_i] = child; + ++n_children_; +} + + Node *Node4::del_child(char partial_key) { + Node *child_to_delete = nullptr; + for (int i = 0; i < n_children_; ++i) { + if (child_to_delete == nullptr && partial_key == keys_[i]) { + child_to_delete = children_[i]; + } + if (child_to_delete != nullptr) { + /* move existing sibling to the left */ + keys_[i] = i < n_children_ - 1 ? keys_[i + 1] : 0; + children_[i] = i < n_children_ - 1 ? 
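+      // the shift keeps keys_ sorted, which next_partial_key and
+      // prev_partial_key rely on: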
children_[i + 1] : nullptr; + } + } + if (child_to_delete != nullptr) { + --n_children_; + } + return child_to_delete; +} + + InnerNode *Node4::grow() { + auto new_node = new Node16(); + new_node->prefix_ = this->prefix_; + new_node->prefix_len_ = this->prefix_len_; + new_node->n_children_ = this->n_children_; + std::copy(this->keys_, this->keys_ + this->n_children_, new_node->keys_); + std::copy(this->children_, this->children_ + this->n_children_, new_node->children_); + delete this; + return new_node; +} + + + bool Node4::is_full() const { return n_children_ == 4; } + + bool Node4::is_underfull() const { + return false; +} + + char Node4::next_partial_key(char partial_key) const { + for (int i = 0; i < n_children_; ++i) { + if (keys_[i] >= partial_key) { + return keys_[i]; + } + } + /* return 0; */ + throw std::out_of_range("provided partial key does not have a successor"); +} + + char Node4::prev_partial_key(char partial_key) const { + for (int i = n_children_ - 1; i >= 0; --i) { + if (keys_[i] <= partial_key) { + return keys_[i]; + } + } + /* return 255; */ + throw std::out_of_range("provided partial key does not have a predecessor"); +} + + int Node4::n_children() const { + return this->n_children_; +} + +} // namespace rocksdb diff --git a/memtable/art_node_48.h b/memtable/art_node_48.h new file mode 100644 index 00000000000..b090ad86cb0 --- /dev/null +++ b/memtable/art_node_48.h @@ -0,0 +1,139 @@ +/** + * @file Node48 header + * @author Rafael Kallis + */ + +#pragma once + +#include "memtable/art_node_256.h" +#include +#include +#include +#include + +namespace rocksdb { + + + class Node48 : public InnerNode { + friend class Node16; + friend class Node256; + +public: + Node48(); + + Node **find_child(char partial_key) override; + void set_child(char partial_key, Node *child) override; + Node *del_child(char partial_key) override; + InnerNode *grow() override; + bool is_full() const override; + bool is_underfull() const override; + + char next_partial_key(char partial_key) const override; + char prev_partial_key(char partial_key) const override; + + int n_children() const override; + +private: + static const char EMPTY; + + uint8_t n_children_ = 0; + char indexes_[256]; + Node *children_[48]; +}; + + Node48::Node48() { + std::fill(this->indexes_, this->indexes_ + 256, Node48::EMPTY); + std::fill(this->children_, this->children_ + 48, nullptr); +} + + Node **Node48::find_child(char partial_key) { + // TODO(rafaelkallis): direct lookup instead of temp save? + uint8_t index = indexes_[128 + partial_key]; + return Node48::EMPTY != index ? &children_[index] : nullptr; +} + + +void Node48::set_child(char partial_key, Node *child) { + + // TODO(rafaelkallis): pick random starting entry in order to increase + // performance? i.e. 
for (int i = random([0,48)); i != (i-1) % 48; i = (i+1) % + // 48){} + + /* find empty child entry */ + for (int i = 0; i < 48; ++i) { + if (children_[i] == nullptr) { + indexes_[128 + partial_key] = (uint8_t) i; + children_[i] = child; + break; + } + } + ++n_children_; +} + + Node *Node48::del_child(char partial_key) { + Node *child_to_delete = nullptr; + unsigned char index = indexes_[128 + partial_key]; + if (index != Node48::EMPTY) { + child_to_delete = children_[index]; + indexes_[128 + partial_key] = Node48::EMPTY; + children_[index] = nullptr; + --n_children_; + } + return child_to_delete; +} + + InnerNode *Node48::grow() { + auto new_node = new Node256(); + new_node->prefix_ = this->prefix_; + new_node->prefix_len_ = this->prefix_len_; + uint8_t index; + for (int partial_key = -128; partial_key < 127; ++partial_key) { + index = indexes_[128 + partial_key]; + if (index != Node48::EMPTY) { + new_node->set_child(partial_key, children_[index]); + } + } + delete this; + return new_node; +} + + + bool Node48::is_full() const { + return n_children_ == 48; +} + + bool Node48::is_underfull() const { + return n_children_ == 16; +} + + const char Node48::EMPTY = 48; + + char Node48::next_partial_key(char partial_key) const { + while (true) { + if (indexes_[128 + partial_key] != Node48::EMPTY) { + return partial_key; + } + if (partial_key == 127) { + throw std::out_of_range("provided partial key does not have a successor"); + } + ++partial_key; + } +} + + char Node48::prev_partial_key(char partial_key) const { + while (true) { + if (indexes_[128 + partial_key] != Node48::EMPTY) { + return partial_key; + } + if (partial_key == -128) { + throw std::out_of_range( + "provided partial key does not have a predecessor"); + } + --partial_key; + } +} + + int Node48::n_children() const { return n_children_; } + +} // namespace rocksdb + diff --git a/memtable/art_tree_it.h b/memtable/art_tree_it.h new file mode 100644 index 00000000000..1474f198250 --- /dev/null +++ b/memtable/art_tree_it.h @@ -0,0 +1,206 @@ +/** + * @file tree iterator + * @author Rafael Kallis + */ + +#pragma once + +#include "memtable/art_inner_node.h" +#include "memtable/art_leaf_node.h" +#include +#include +#include +#include +#include + +namespace rocksdb { + +class TreeIter { +public: + struct Step { + Node *node_; + int depth_; + ChildIter child_it_; + ChildIter child_it_end_; + + Step(int depth, ChildIter c_it, ChildIter c_it_end); + Step(Node *node, int depth, ChildIter c_it, ChildIter c_it_end); + + Step &operator++(); + Step operator++(int); + }; + + TreeIter() = default; + explicit TreeIter(std::vector traversal_stack); + + static TreeIter min(Node *root); + static TreeIter greater_equal(Node *root, const char *key); + + using iterator_category = std::forward_iterator_tag; + using difference_type = int; + /* using reference = const value_type &; */ + + /* reference operator*(); */ + char* operator*(); + char** operator->(); + TreeIter &operator++(); + TreeIter operator++(int); + bool operator==(const TreeIter &rhs) const; + bool operator!=(const TreeIter &rhs) const; + + Node *get_node() const; + int get_depth() const; + +private: + Step &get_Step(); + const Step &get_Step() const; + void seek_leaf(); + + std::vector traversal_stack_; +}; + +TreeIter::Step::Step(Node *node, int depth, ChildIter c_it, ChildIter c_it_end) + : node_(node), depth_(depth), child_it_(c_it), child_it_end_(c_it_end) {} + +TreeIter::Step::Step(int depth, ChildIter c_it, ChildIter c_it_end) + : Step(c_it != c_it_end ? 
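+    // an exhausted child iterator leaves node_ null; seek_leaf() treats such
+    // a Step as "pop and resume in the parent":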
c_it.get_child_node() : nullptr, depth, c_it, c_it_end) {} + +TreeIter::Step &TreeIter::Step::operator++() { + assert(child_it_ != child_it_end_); + ++child_it_; + node_ = child_it_ != child_it_end_ + ? child_it_.get_child_node() + : nullptr; + return *this; +} + +TreeIter::Step TreeIter::Step::operator++(int) { + auto old = *this; + operator++(); + return old; +} + +TreeIter::TreeIter(std::vector traversal_stack) : traversal_stack_(traversal_stack) { + seek_leaf(); +} + +TreeIter TreeIter::min(Node *root) { + return TreeIter::greater_equal(root, ""); +} + +TreeIter TreeIter::greater_equal(Node *root, const char *key) { + assert(root != nullptr); + + int key_len = std::strlen(key); + InnerNode *cur_InnerNode; + ChildIter child_it, child_it_end; + std::vector traversal_stack; + + // sentinel child iterator for root + traversal_stack.push_back({root, 0, {nullptr, -2}, {nullptr, -1}}); + + while (true) { + TreeIter::Step &cur_Step = traversal_stack.back(); + Node *cur_node = cur_Step.node_; + int cur_depth = cur_Step.depth_; + + int prefix_match_len = std::min(cur_node->check_prefix(key + cur_depth, key_len - cur_depth), key_len - cur_depth); + // if search key "equals" the prefix + if (key_len == cur_depth + prefix_match_len) { + return TreeIter(traversal_stack); + } + // if search key is "greater than" the prefix + if (prefix_match_len < cur_node->prefix_len_ && key[cur_depth + prefix_match_len] > cur_node->prefix_[prefix_match_len]) { + ++cur_Step; + return TreeIter(traversal_stack); + } + if (cur_node->is_leaf()) { + continue; + } + // seek subtree where search key is "lesser than or equal" the subtree partial key + cur_InnerNode = static_cast(cur_node); + child_it = cur_InnerNode->begin(); + child_it_end = cur_InnerNode->end(); + // TODO more efficient with specialized node search method? 
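+    // children are iterated in ascending partial-key order, so the first
+    // child whose partial key is >= the next key byte roots the leftmost
+    // subtree that can still contain keys >= the search key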
+ for (; child_it != child_it_end; ++child_it) { + if (key[cur_depth + cur_node->prefix_len_] <= child_it.get_partial_key()) { + break; + } + } + traversal_stack.push_back({cur_depth + cur_node->prefix_len_ + 1, child_it, child_it_end}); + } +} + +char* TreeIter::operator*() { + assert(get_node()->is_leaf()); + return static_cast(get_node())->value_; +} + +char** TreeIter::operator->() { + assert(get_node()->is_leaf()); + return &static_cast(get_node())->value_; +} + +TreeIter &TreeIter::operator++() { + assert(get_node()->is_leaf()); + ++get_Step(); + seek_leaf(); + return *this; +} + +TreeIter TreeIter::operator++(int) { + auto old = *this; + operator++(); + return old; +} + +bool TreeIter::operator==(const TreeIter &rhs) const { + return (traversal_stack_.empty() && rhs.traversal_stack_.empty()) || + (!traversal_stack_.empty() && !rhs.traversal_stack_.empty() && + get_node() == rhs.get_node()); +} + + bool TreeIter::operator!=(const TreeIter &rhs) const { + return !(*this == rhs); +} + +Node * TreeIter::get_node() const { + return get_Step().node_; +} + +int TreeIter::get_depth() const { + return get_Step().depth_; +} + +TreeIter::Step &TreeIter::get_Step() { + assert(!traversal_stack_.empty()); + return traversal_stack_.back(); +} + +const TreeIter::Step &TreeIter::get_Step() const { + assert(!traversal_stack_.empty()); + return traversal_stack_.back(); +} + +void TreeIter::seek_leaf() { + + /* traverse up until a node on the right is found or stack gets empty */ + for (; get_Step().child_it_ == get_Step().child_it_end_; ++get_Step()) { + traversal_stack_.pop_back(); + if (traversal_stack_.empty()) { + return; + } + } + + /* find leftmost leaf node */ + while (!get_node()->is_leaf()) { + InnerNode *cur_InnerNode = static_cast(get_node()); + int depth = get_Step().depth_ + get_node()->prefix_len_ + 1; + ChildIter c_it = cur_InnerNode->begin(); + ChildIter c_it_end = cur_InnerNode->end(); + traversal_stack_.push_back({depth, c_it, c_it_end}); + } +} + +} // namespace art + diff --git a/memtable/artrep.cc b/memtable/artrep.cc new file mode 100644 index 00000000000..07b3f2a371f --- /dev/null +++ b/memtable/artrep.cc @@ -0,0 +1,278 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/art.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { +namespace { + +class AdaptiveRadixTreeRep : public MemTableRep { + AdaptiveRadixTree skip_list_; + const MemTableRep::KeyComparator& cmp_; + const SliceTransform* transform_; + const size_t lookahead_; + + friend class LookaheadIterator; +public: + explicit AdaptiveRadixTreeRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, const SliceTransform* transform, + const size_t lookahead) + : MemTableRep(allocator), + skip_list_(compare, allocator), + cmp_(compare), + transform_(transform), + lookahead_(lookahead) {} + + KeyHandle Allocate(const size_t len, char** buf) override { + *buf = skip_list_.AllocateKey(len); + return static_cast(*buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. 
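+  // NOTE: in this first "copy files" commit the rep still delegates to
+  // SkipList; artrep.cc is reworked by the follow-up "fix for rocksdb"
+  // commit.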
+ void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); + } + + bool InsertKey(KeyHandle handle) override { + return skip_list_.Insert(static_cast(handle)); + } + + void InsertWithHint(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHint(static_cast(handle), hint); + } + + bool InsertKeyWithHint(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHint(static_cast(handle), hint); + } + + void InsertConcurrently(KeyHandle handle) override { + skip_list_.InsertConcurrently(static_cast(handle)); + } + + bool InsertKeyConcurrently(KeyHandle handle) override { + return skip_list_.InsertConcurrently(static_cast(handle)); + } + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + size_t ApproximateMemoryUsage() override { + // All memory is allocated through allocator; nothing to report here + return 0; + } + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { + AdaptiveRadixTreeRep::Iterator iter(&skip_list_); + Slice dummy_slice; + for (iter.Seek(dummy_slice, k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { + } + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + std::string tmp; + uint64_t start_count = + skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey)); + uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey)); + return (end_count >= start_count) ? (end_count - start_count) : 0; + } + + ~AdaptiveRadixTreeRep() override {} + + // Iteration over the contents of a skip list + class Iterator : public MemTableRep::Iterator { + typename SkipList::Iterator iter_; + + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator( + const SkipList* list) + : iter_(list) {} + + ~Iterator() override {} + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const override { return iter_.Valid(); } + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const override { return iter_.key(); } + + // Advances to the next position. + // REQUIRES: Valid() + void Next() override { iter_.Next(); } + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev() override { iter_.Prev(); } + + // Advance to the first entry with a key >= target + void Seek(const Slice& user_key, const char* memtable_key) override { + if (memtable_key != nullptr) { + iter_.Seek(memtable_key); + } else { + iter_.Seek(EncodeKey(&tmp_, user_key)); + } + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& user_key, const char* memtable_key) override { + if (memtable_key != nullptr) { + iter_.SeekForPrev(memtable_key); + } else { + iter_.SeekForPrev(EncodeKey(&tmp_, user_key)); + } + } + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst() override { iter_.SeekToFirst(); } + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast() override { iter_.SeekToLast(); } + + protected: + std::string tmp_; // For passing to EncodeKey + }; + + // Iterator over the contents of a skip list which also keeps track of the + // previously visited node. 
In Seek(), it examines a few nodes after it + // first, falling back to O(log n) search from the head of the list only if + // the target key hasn't been found. + class LookaheadIterator : public MemTableRep::Iterator { + public: + explicit LookaheadIterator(const AdaptiveRadixTreeRep& rep) : + rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} + + ~LookaheadIterator() override {} + + bool Valid() const override { return iter_.Valid(); } + + const char* key() const override { + assert(Valid()); + return iter_.key(); + } + + void Next() override { + assert(Valid()); + + bool advance_prev = true; + if (prev_.Valid()) { + auto k1 = rep_.UserKey(prev_.key()); + auto k2 = rep_.UserKey(iter_.key()); + + if (k1.compare(k2) == 0) { + // same user key, don't move prev_ + advance_prev = false; + } else if (rep_.transform_) { + // only advance prev_ if it has the same prefix as iter_ + auto t1 = rep_.transform_->Transform(k1); + auto t2 = rep_.transform_->Transform(k2); + advance_prev = t1.compare(t2) == 0; + } + } + + if (advance_prev) { + prev_ = iter_; + } + iter_.Next(); + } + + void Prev() override { + assert(Valid()); + iter_.Prev(); + prev_ = iter_; + } + + void Seek(const Slice& internal_key, const char* memtable_key) override { + const char *encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + + if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { + // prev_.key() is smaller or equal to our target key; do a quick + // linear search (at most lookahead_ steps) starting from prev_ + iter_ = prev_; + + size_t cur = 0; + while (cur++ <= rep_.lookahead_ && iter_.Valid()) { + if (rep_.cmp_(encoded_key, iter_.key()) <= 0) { + return; + } + Next(); + } + } + + iter_.Seek(encoded_key); + prev_ = iter_; + } + + void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.SeekForPrev(encoded_key); + prev_ = iter_; + } + + void SeekToFirst() override { + iter_.SeekToFirst(); + prev_ = iter_; + } + + void SeekToLast() override { + iter_.SeekToLast(); + prev_ = iter_; + } + + protected: + std::string tmp_; // For passing to EncodeKey + + private: + const AdaptiveRadixTreeRep& rep_; + typename SkipList::Iterator iter_; + typename SkipList::Iterator prev_; + }; + + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + if (lookahead_ > 0) { + void *mem = + arena ? arena->AllocateAligned(sizeof(AdaptiveRadixTreeRep::LookaheadIterator)) + : operator new(sizeof(AdaptiveRadixTreeRep::LookaheadIterator)); + return new (mem) AdaptiveRadixTreeRep::LookaheadIterator(*this); + } else { + void *mem = + arena ? 
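+          // arena-backed iterator memory is reclaimed with the arena itself,
+          // so the placement new below needs no matching delete: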
arena->AllocateAligned(sizeof(AdaptiveRadixTreeRep::Iterator)) + : operator new(sizeof(AdaptiveRadixTreeRep::Iterator)); + return new (mem) AdaptiveRadixTreeRep::Iterator(&skip_list_); + } + } +}; +} + +MemTableRep* SkipListFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) { + return new AdaptiveRadixTreeRep(compare, allocator, transform, lookahead_); +} + +MemTableRep* DoublySkipListFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) { + return new AdaptiveRadixTreeRep(compare, allocator, transform, lookahead_); +} + +} // namespace rocksdb From d4737e6f8d701cf488e79dad09f5ebae84f950a3 Mon Sep 17 00:00:00 2001 From: Little-Wallace Date: Thu, 31 Mar 2022 22:08:21 +0800 Subject: [PATCH 02/18] fix for rocksdb Signed-off-by: Little-Wallace --- memtable/art.h | 230 +++++++++++++----------------- memtable/art_inner_node.h | 221 +---------------------------- memtable/art_leaf_node.h | 26 ---- memtable/art_node.h | 43 ++++-- memtable/art_node_16.h | 163 +++++++++++----------- memtable/art_node_256.h | 39 ++---- memtable/art_node_4.h | 111 +++++++-------- memtable/art_node_48.h | 90 +++++------- memtable/artrep.cc | 285 ++++++++++---------------------------- 9 files changed, 379 insertions(+), 829 deletions(-) delete mode 100644 memtable/art_leaf_node.h diff --git a/memtable/art.h b/memtable/art.h index d57008186fc..0b26c290faa 100644 --- a/memtable/art.h +++ b/memtable/art.h @@ -5,11 +5,10 @@ #pragma once -#include "memtable/art_leaf_node.h" #include "memtable/art_inner_node.h" #include "memtable/art_node.h" #include "memtable/art_node_4.h" -#include "memtable/art_tree_it.h" +#include "memory/allocator.h" #include #include #include @@ -18,6 +17,7 @@ namespace rocksdb { class AdaptiveRadixTree { public: + AdaptiveRadixTree(Allocator* allocator); ~AdaptiveRadixTree(); /** @@ -26,7 +26,7 @@ class AdaptiveRadixTree { * @param key - The key to find. * @return the value associated with the key or a nullptr. */ - char* get(const char *key) const; + char* Get(const char *key) const; /** * Associates the given key with the given value. @@ -38,107 +38,86 @@ class AdaptiveRadixTree { * @return a nullptr if no other value is associated with they or the * previously associated value. */ - char *set(const char *key, char *value); + char *Insert(const char *key, int key_len, char* v); - /** - * Deletes the given key and returns it's associated value. - * The associated value is returned, - * since the method consumer is the resource owner. - * If no value is associated with the given key, nullptr is returned. - * - * @param key - The key to delete. - * @return the values assciated with they key or a nullptr otherwise. - */ - char* del(const char *key); - /** - * Forward iterator that traverses the tree in lexicographic order. - */ - TreeIter begin(); - - /** - * Forward iterator that traverses the tree in lexicographic order starting - * from the provided key. - */ - TreeIter begin(const char *key); - - /** - * Iterator to the end of the lexicographic order. 
- */ - TreeIter end(); + char* AllocateLeafNode(const char* v, size_t value_size); + Node* AllocateNode(InnerNode* inner, size_t prefix_size); private: - Node *root_ = nullptr; + std::atomic root_; + Allocator* allocator_; }; AdaptiveRadixTree::~AdaptiveRadixTree() { - if (root_ == nullptr) { - return; - } - std::stack node_stack; - node_stack.push(root_); - Node *cur; - InnerNode *cur_inner; - ChildIter it, it_end; - while (!node_stack.empty()) { - cur = node_stack.top(); - node_stack.pop(); - if (!cur->is_leaf()) { - cur_inner = static_cast(cur); - for (it = cur_inner->begin(), it_end = cur_inner->end(); it != it_end; ++it) { - node_stack.push(*cur_inner->find_child(*it)); - } - } - if (cur->prefix_ != nullptr) { - delete[] cur->prefix_; - } - delete cur; - } } -char* AdaptiveRadixTree::get(const char *key) const { - Node *cur = root_, **child; +AdaptiveRadixTree::AdaptiveRadixTree(Allocator* allocator) + : allocator_(allocator){ + root_.store(nullptr, std::memory_order_relaxed); +} + +char* AdaptiveRadixTree::Get(const char *key) const { + Node *cur = root_.load(std::memory_order_acquire); + std::atomic* child = nullptr; int depth = 0, key_len = std::strlen(key) + 1; while (cur != nullptr) { - if (cur->prefix_len_ != cur->check_prefix(key + depth, key_len - depth)) { + if (cur->prefix_len != cur->check_prefix(key, depth, key_len)) { /* prefix mismatch */ return nullptr; } - if (cur->prefix_len_ == key_len - depth) { + if (cur->prefix_len == key_len - depth) { /* exact match */ - return cur->is_leaf() ? static_cast(cur)->value_ : nullptr; + return cur->value; + } + + if (cur->inner == nullptr) { + return nullptr; } - child = static_cast(cur)->find_child(key[depth + cur->prefix_len_]); - depth += (cur->prefix_len_ + 1); - cur = child != nullptr ? *child : nullptr; + child = cur->inner->find_child(key[depth + cur->prefix_len]); + depth += cur->prefix_len + 1; + cur = child != nullptr ? 
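+    // the acquire load is intended to pair with the release stores in
+    // Insert, so a reader that observes a child pointer also observes the
+    // child's initialized fields: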
child->load(std::memory_order_acquire) : nullptr; } return nullptr; } -char* AdaptiveRadixTree::set(const char *key, char* value) { +Node* AdaptiveRadixTree::AllocateNode(InnerNode* inner, size_t prefix_size) { + size_t extra_prefix = prefix_size; + if (extra_prefix > 0) { + extra_prefix -= 1; + } + char* addr = allocator_->AllocateAligned(sizeof(Node) + extra_prefix); + Node* node = reinterpret_cast(addr); + node->inner = inner; + node->value = nullptr; + node->prefix_len = prefix_size; + return node; +} + + +char* AdaptiveRadixTree::Insert(const char *key, int l, char* leaf) { int key_len = std::strlen(key) + 1, depth = 0, prefix_match_len; - if (root_ == nullptr) { - root_ = new LeafNode(value); - root_->prefix_ = new char[key_len]; - std::copy(key, key + key_len + 1, root_->prefix_); - root_->prefix_len_ = key_len; + + std::atomic* cur_address = &root_; + Node *cur = root_.load(std::memory_order_relaxed); + if (cur == nullptr) { + Node* root = AllocateNode(nullptr, l); + root->value = leaf; + memcpy(root->prefix, key, l); + root_.store(root, std::memory_order_release); return nullptr; } - Node **cur = &root_, **child; - InnerNode **cur_inner; char child_partial_key; - bool is_prefix_match; while (true) { /* number of bytes of the current node's prefix that match the key */ - prefix_match_len = (**cur).check_prefix(key + depth, key_len - depth); + prefix_match_len = cur->check_prefix(key, depth, key_len); /* true if the current node's prefix matches with a part of the key */ - is_prefix_match = (std::min((**cur).prefix_len_, key_len - depth)) == - prefix_match_len; + bool is_prefix_match = cur->prefix_len == prefix_match_len; - if (is_prefix_match && (**cur).prefix_len_ == key_len - depth) { + if (is_prefix_match && cur->prefix_len == key_len - depth) { /* exact match: * => "replace" * => replace value of current node. @@ -153,13 +132,9 @@ char* AdaptiveRadixTree::set(const char *key, char* value) { */ /* cur must be a leaf */ - auto cur_leaf = static_cast(*cur); - char *old_value = cur_leaf->value_; - cur_leaf->value_ = value; - return old_value; - } - - if (!is_prefix_match) { + cur->value = leaf; + return cur->value; + } else if (!is_prefix_match) { /* prefix mismatch: * => new parent node with common prefix and no associated value. * => new node with value to insert. @@ -176,43 +151,44 @@ char* AdaptiveRadixTree::set(const char *key, char* value) { * /|\ /|\ */ - auto new_parent = new Node4(); - new_parent->prefix_ = new char[prefix_match_len]; - std::copy((**cur).prefix_, (**cur).prefix_ + prefix_match_len, - new_parent->prefix_); - new_parent->prefix_len_ = prefix_match_len; - new_parent->set_child((**cur).prefix_[prefix_match_len], *cur); - - // TODO(rafaelkallis): shrink? 
- /* memmove((**cur).prefix_, (**cur).prefix_ + prefix_match_len + 1, */ - /* (**cur).prefix_len_ - prefix_match_len - 1); */ - /* (**cur).prefix_len_ -= prefix_match_len + 1; */ - - auto old_prefix = (**cur).prefix_; - auto old_prefix_len = (**cur).prefix_len_; - (**cur).prefix_ = new char[old_prefix_len - prefix_match_len - 1]; - (**cur).prefix_len_ = old_prefix_len - prefix_match_len - 1; - std::copy(old_prefix + prefix_match_len + 1, old_prefix + old_prefix_len, - (**cur).prefix_); - delete old_prefix; + InnerNode* inner = new (allocator_->AllocateAligned(sizeof(Node4)))Node4(); + Node* new_parent = AllocateNode(inner, prefix_match_len); + memcpy(new_parent->prefix, cur->prefix, prefix_match_len); - auto new_node = new LeafNode(value); - new_node->prefix_ = new char[key_len - depth - prefix_match_len - 1]; - std::copy(key + depth + prefix_match_len + 1, key + key_len, - new_node->prefix_); - new_node->prefix_len_ = key_len - depth - prefix_match_len - 1; - new_parent->set_child(key[depth + prefix_match_len], new_node); + int old_prefix_len = cur->prefix_len; + int new_prefix_len = old_prefix_len - prefix_match_len - 1; - *cur = new_parent; + Node* new_cur = AllocateNode(cur->inner, new_prefix_len); + new_cur->value = cur->value; + if (new_prefix_len > 0) { + memcpy(new_cur->prefix, cur->prefix + prefix_match_len + 1, new_prefix_len); + } + inner->set_child(cur->prefix[prefix_match_len], cur); + if (depth + prefix_match_len < key_len) { + size_t leaf_prefix_len = key_len - depth - prefix_match_len - 1; + Node* new_node = AllocateNode(nullptr, leaf_prefix_len); + new_node->value = leaf; + if (leaf_prefix_len > 0) { + memcpy(new_node->prefix, key + depth + prefix_match_len + 1, leaf_prefix_len); + } + inner->set_child(key[depth + prefix_match_len], new_node); + } else { + new_parent->value = leaf; + } + cur_address->store(new_parent, std::memory_order_release); return nullptr; } + assert(depth + cur->prefix_len < key_len); /* must be inner node */ - cur_inner = reinterpret_cast(cur); - child_partial_key = key[depth + (**cur).prefix_len_]; - child = (**cur_inner).find_child(child_partial_key); + child_partial_key = key[depth + cur->prefix_len]; + if (cur->inner == nullptr) { + Node4* new_inner = new (allocator_->AllocateAligned(sizeof(Node4))) Node4(); + cur->inner = new_inner; + } + std::atomic* child = cur->inner->find_child(child_partial_key); - if (child == nullptr) { + if (child == nullptr || child->load(std::memory_order_relaxed) == nullptr) { /* * no child associated with the next partial key. * => create new node with value to insert. 
@@ -224,16 +200,16 @@ char* AdaptiveRadixTree::set(const char *key, char* value) { * (a)->v1 (a)->v1 +()->v2 */ - if ((**cur_inner).is_full()) { - *cur_inner = (**cur_inner).grow(); + if (cur->inner->is_full()) { + cur->inner = cur->inner->grow(allocator_); } - - auto new_node = new LeafNode(value); - new_node->prefix_ = new char[key_len - depth - (**cur).prefix_len_ - 1]; - std::copy(key + depth + (**cur).prefix_len_ + 1, key + key_len, - new_node->prefix_); - new_node->prefix_len_ = key_len - depth - (**cur).prefix_len_ - 1; - (**cur_inner).set_child(child_partial_key, new_node); + size_t leaf_prefix_len = key_len - depth - cur->prefix_len - 1; + Node* new_node = AllocateNode(nullptr, leaf_prefix_len); + new_node->value = leaf; + if (leaf_prefix_len > 0) { + memcpy(new_node->prefix, key + depth + cur->prefix_len + 1, leaf_prefix_len); + } + cur->inner->set_child(child_partial_key, new_node); return nullptr; } @@ -245,20 +221,8 @@ char* AdaptiveRadixTree::set(const char *key, char* value) { * (a)->v1 ()->v2 (a)->v1 *()->v2 */ - depth += (**cur).prefix_len_ + 1; - cur = child; + depth += cur->prefix_len + 1; + cur = child->load(std::memory_order_relaxed); } } - - -TreeIter AdaptiveRadixTree::begin() { - return TreeIter::min(this->root_); -} - -TreeIter AdaptiveRadixTree::begin(const char *key) { - return TreeIter::greater_equal(this->root_, key); -} - -TreeIter AdaptiveRadixTree::end() { return TreeIter(); } - } // namespace rocksdb diff --git a/memtable/art_inner_node.h b/memtable/art_inner_node.h index 167d8040d83..7ea6718eb0a 100644 --- a/memtable/art_inner_node.h +++ b/memtable/art_inner_node.h @@ -5,8 +5,6 @@ #pragma once -#include "memtable/art_leaf_node.h" -#include "memtable/art_node.h" #include #include #include @@ -17,19 +15,11 @@ namespace rocksdb { -class ChildIter; +struct Node; -class InnerNode : public Node { +class InnerNode { public: - virtual ~InnerNode() = default; - - InnerNode() = default; - InnerNode(const InnerNode &other) = default; - InnerNode(InnerNode &&other) noexcept = default; - InnerNode &operator=(const InnerNode &other) = default; - InnerNode &operator=(InnerNode &&other) noexcept = default; - - bool is_leaf() const override; + virtual ~InnerNode() {} /** * Finds and returns the child Node identified by the given partial key. @@ -38,7 +28,7 @@ class InnerNode : public Node { * @return Child Node identified by the given partial key or * a null pointer of no child Node is associated with the partial key. */ - virtual Node **find_child(char partial_key) = 0; + virtual std::atomic* find_child(char partial_key) = 0; /** * Adds the given Node to the Node's children. @@ -52,226 +42,25 @@ class InnerNode : public Node { */ virtual void set_child(char partial_key, Node *child) = 0; - /** - * Deletes the child associated with the given partial key. - * - * @param partial_key - The partial key associated with the child. - */ - virtual Node *del_child(char partial_key) = 0; - /** * Creates and returns a new Node with bigger children capacity. * The current Node gets deleted. * * @return Node with bigger capacity */ - virtual InnerNode *grow() = 0; + virtual InnerNode *grow(Allocator* allocator) = 0; /** * Determines if the Node is full, i.e. can carry no more child Nodes. */ virtual bool is_full() const = 0; - /** - * Determines if the Node is underfull, i.e. carries less child Nodes than - * intended. 
- */ - virtual bool is_underfull() const = 0; - virtual int n_children() const = 0; virtual char next_partial_key(char partial_key) const = 0; virtual char prev_partial_key(char partial_key) const = 0; - - /** - * Iterator on the first child Node. - * - * @return Iterator on the first child Node. - */ - ChildIter begin(); - std::reverse_iterator rbegin(); - - /** - * Iterator on after the last child Node. - * - * @return Iterator on after the last child Node. - */ - ChildIter end(); - std::reverse_iterator rend(); }; -bool InnerNode::is_leaf() const { return false; } - -class ChildIter { - public: - ChildIter() = default; - ChildIter(const ChildIter &other) = default; - ChildIter(ChildIter &&other) noexcept = default; - ChildIter &operator=(const ChildIter &other) = default; - ChildIter &operator=(ChildIter &&other) noexcept = default; - - explicit ChildIter(InnerNode *n); - ChildIter(InnerNode *n, int relative_index); - - using iterator_category = std::bidirectional_iterator_tag; - using value_type = const char; - using difference_type = int; - using pointer = value_type *; - using reference = value_type &; - - reference operator*() const; - pointer operator->() const; - ChildIter &operator++(); - ChildIter operator++(int); - ChildIter &operator--(); - ChildIter operator--(int); - bool operator==(const ChildIter &rhs) const; - bool operator!=(const ChildIter &rhs) const; - bool operator<(const ChildIter &rhs) const; - bool operator>(const ChildIter &rhs) const; - bool operator<=(const ChildIter &rhs) const; - bool operator>=(const ChildIter &rhs) const; - - char get_partial_key() const; - Node *get_child_node() const; - - private: - InnerNode *node_ = nullptr; - char cur_partial_key_ = -128; - int relative_index_ = 0; -}; - -ChildIter::ChildIter(InnerNode *n) : ChildIter(n, 0) {} - - -ChildIter::ChildIter(InnerNode *n, int relative_index) - : node_(n), cur_partial_key_(0), relative_index_(relative_index) { - if (relative_index_ < 0) { - /* relative_index is out of bounds, no seek */ - return; - } - - if (relative_index_ >= node_->n_children()) { - /* relative_index is out of bounds, no seek */ - return; - } - - if (relative_index_ == node_->n_children() - 1) { - cur_partial_key_ = node_->prev_partial_key(127); - return; - } - - cur_partial_key_ = node_->next_partial_key(-128); - for (int i = 0; i < relative_index_; ++i) { - cur_partial_key_ = node_->next_partial_key(cur_partial_key_ + 1); - } -} - - -typename ChildIter::reference ChildIter::operator*() const { - if (relative_index_ < 0 || relative_index_ >= node_->n_children()) { - throw std::out_of_range("child iterator is out of range"); - } - - return cur_partial_key_; -} - -typename ChildIter::pointer ChildIter::operator->() const { - if (relative_index_ < 0 || relative_index_ >= node_->n_children()) { - throw std::out_of_range("child iterator is out of range"); - } - - return &cur_partial_key_; -} - -ChildIter &ChildIter::operator++() { - ++relative_index_; - if (relative_index_ < 0) { - return *this; - } else if (relative_index_ == 0) { - cur_partial_key_ = node_->next_partial_key(-128); - } else if (relative_index_ < node_->n_children()) { - cur_partial_key_ = node_->next_partial_key(cur_partial_key_ + 1); - } - return *this; -} - -ChildIter ChildIter::operator++(int) { - auto old = *this; - operator++(); - return old; -} - -ChildIter &ChildIter::operator--() { - --relative_index_; - if (relative_index_ > node_->n_children() - 1) { - return *this; - } else if (relative_index_ == node_->n_children() - 1) { - cur_partial_key_ = 
node_->prev_partial_key(127); - } else if (relative_index_ >= 0) { - cur_partial_key_ = node_->prev_partial_key(cur_partial_key_ - 1); - } - return *this; -} - -ChildIter ChildIter::operator--(int) { - auto old = *this; - operator--(); - return old; -} - -bool ChildIter::operator==(const ChildIter &rhs) const { - return node_ == rhs.node_ && relative_index_ == rhs.relative_index_; -} - -bool ChildIter::operator<(const ChildIter &rhs) const { - return node_ == rhs.node_ && relative_index_ < rhs.relative_index_; -} - -bool ChildIter::operator!=(const ChildIter &rhs) const { - return !((*this) == rhs); -} - -bool ChildIter::operator>=(const ChildIter &rhs) const { - return !((*this) < rhs); -} - -bool ChildIter::operator<=(const ChildIter &rhs) const { - return (rhs >= (*this)); } - -bool ChildIter::operator>(const ChildIter &rhs) const { - return (rhs < (*this)); -} - - -char ChildIter::get_partial_key() const { - return cur_partial_key_; -} - -Node *ChildIter::get_child_node() const { - assert(0 <= relative_index_ && relative_index_ < node_->n_children()); - return *node_->find_child(cur_partial_key_); -} - -ChildIter InnerNode::begin() { - return ChildIter(this); -} - -ChildIter InnerNode::end() { - return ChildIter(this, n_children()); -} - -std::reverse_iterator InnerNode::rbegin() { - return std::reverse_iterator(end()); -} - -std::reverse_iterator InnerNode::rend() { - return std::reverse_iterator(begin()); -} - - - -} // namespace art diff --git a/memtable/art_leaf_node.h b/memtable/art_leaf_node.h deleted file mode 100644 index ffe7c2aef2c..00000000000 --- a/memtable/art_leaf_node.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file LeafNode header - * @author Rafael Kallis - */ - - -#pragma once - -#include "memtable/art_node.h" - -namespace rocksdb { - -class LeafNode : public Node { -public: - explicit LeafNode(char *value); - bool is_leaf() const override; - - char* value_; -}; - -LeafNode::LeafNode(char *value): value_(value) {} - -bool LeafNode::is_leaf() const { return true; } - -} // namespace rocksdb - diff --git a/memtable/art_node.h b/memtable/art_node.h index 859cae05e0b..35a664d1c3f 100644 --- a/memtable/art_node.h +++ b/memtable/art_node.h @@ -12,24 +12,29 @@ #include #include #include +#include "memtable/art_inner_node.h" namespace rocksdb { -class Node { -public: - virtual ~Node() = default; +enum NodeType { + kNode4, + kNode16, + kNode48, + kNode256, +}; + + - Node() = default; - Node(const Node &other) = default; - Node(Node &&other) noexcept = default; - Node &operator=(const Node &other) = default; - Node &operator=(Node &&other) noexcept = default; +struct Node { + Node() {} /** * Determines if this Node is a leaf Node, i.e., contains a value. * Needed for downcasting a Node instance to a leaf_Node or inner_Node instance. */ - virtual bool is_leaf() const = 0; + bool is_leaf() const { + return value != nullptr; + } /** * Determines the number of matching bytes between the Node's prefix and the key. 
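The hunk below collapses the old virtual Node hierarchy into a single struct whose trailing prefix[1] array is over-allocated from the arena (the flexible-array idiom) and whose non-null value pointer marks a leaf; check_prefix now takes the search depth instead of a pre-offset key. A standalone sketch of the matching logic it implements (hypothetical free function, not the patch itself):

    #include <algorithm>

    // Length of the common span between a node's compressed prefix and the
    // key suffix starting at depth; returns the index of the first mismatch.
    int CheckPrefix(const char* prefix, int prefix_len,
                    const char* key, int key_len, int depth) {
      int limit = std::min(prefix_len, key_len - depth);
      for (int i = 0; i < limit; ++i) {
        if (key[depth + i] != prefix[i]) {
          return i;
        }
      }
      return limit;  // prefix fully matched up to the shorter length
    }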
@@ -43,15 +48,23 @@ class Node { * ^^^^* * index: 01234 */ - int check_prefix(const char *key, int key_len) const; + int check_prefix(const char *key, int depth, int key_len) const; - char *prefix_ = nullptr; - uint16_t prefix_len_ = 0; + NodeType inner_type; + InnerNode* inner; + char* value; + uint16_t prefix_len; + char prefix[1]; }; -int Node::check_prefix(const char *key, int key_len) const { - key_len = std::min(key_len, (int)prefix_len_); - return std::mismatch(prefix_, prefix_ + key_len, key).second - key; +int Node::check_prefix(const char *key, int depth, int key_len) const { + int l = std::min((int)prefix_len, key_len - depth); + for (int i = 0; i < l; i ++) { + if (key[i + depth] != prefix[i]) { + return i; + } + } + return l; } } // namespace rocksdb diff --git a/memtable/art_node_16.h b/memtable/art_node_16.h index 397088053da..00158b2411b 100644 --- a/memtable/art_node_16.h +++ b/memtable/art_node_16.h @@ -20,15 +20,14 @@ namespace rocksdb { class Node16 : public InnerNode { -friend class Node48; public: - Node **find_child(char partial_key) override; + Node16() { + } + ~Node16() {} + std::atomic* find_child(char partial_key) override; void set_child(char partial_key, Node *child) override; - Node *del_child(char partial_key) override; - InnerNode *grow() override; - InnerNode *shrink() override; + InnerNode *grow(Allocator* allocator) override; bool is_full() const override; - bool is_underfull() const override; char next_partial_key(char partial_key) const override; @@ -36,84 +35,69 @@ friend class Node48; int n_children() const override; - uint8_t n_children_ = 0; + std::atomic n_children_; private: - char keys_[16]; - Node *children_[16]; + std::atomic keys_[2]; + std::atomic children_[16]; }; - Node **Node16::find_child(char partial_key) { -#if defined(__i386__) || defined(__amd64__) - int bitfield = - _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(partial_key), - _mm_loadu_si128((__m128i *)keys_))) & - ((1 << n_children_) - 1); - return (bool)bitfield ? 
&children_[__builtin_ctz(bitfield)] : nullptr; -#else - int lo, mid, hi; - lo = 0; - hi = n_children_; - while (lo < hi) { - mid = (lo + hi) / 2; - if (partial_key < keys_[mid]) { - hi = mid; - } else if (partial_key > keys_[mid]) { - lo = mid + 1; - } else { - return &children_[mid]; - } - } + std::atomic *Node16::find_child(char partial_key) { + uint8_t key = partial_key + 128; + uint8_t n_children = n_children_.load(std::memory_order_acquire); + uint32_t keys = keys_[0].load(std::memory_order_acquire); + uint8_t l = std::min(n_children, (uint8_t)8); + for (uint8_t i = 0; i < l; ++i) { + if ((keys & 255) == key) { + return &children_[i]; + } + keys >>= 8; + } + if (n_children > 8) { + n_children -= 8; + keys = keys_[1].load(std::memory_order_acquire); + for (uint8_t i = 0; i < n_children; ++i) { + if ((keys & 255) == key) { + return &children_[i + 8]; + } + keys >>= 8; + } + } return nullptr; -#endif } void Node16::set_child(char partial_key, Node *child) { /* determine index for child */ - int child_i; - for (int i = this->n_children_ - 1;; --i) { - if (i >= 0 && partial_key < this->keys_[i]) { - /* move existing sibling to the right */ - this->keys_[i + 1] = this->keys_[i]; - this->children_[i + 1] = this->children_[i]; - } else { - child_i = i + 1; - break; - } + uint8_t child_i = n_children_.load(std::memory_order_relaxed); + uint8_t key = partial_key + 128; + if (child_i < 8) { + uint64_t k = keys_[0].load(std::memory_order_relaxed); + keys_[0].store(k | key << child_i, std::memory_order_release); + } else { + uint64_t k = keys_[1].load(std::memory_order_relaxed); + keys_[1].store(k | key << (child_i - 8), std::memory_order_release); } - - this->keys_[child_i] = partial_key; - this->children_[child_i] = child; - ++n_children_; + children_[child_i].store(child, std::memory_order_release); + n_children_.store(child_i + 1, std::memory_order_release); } - Node *Node16::del_child(char partial_key) { - Node *child_to_delete = nullptr; - for (int i = 0; i < n_children_; ++i) { - if (child_to_delete == nullptr && partial_key == keys_[i]) { - child_to_delete = children_[i]; - } - if (child_to_delete != nullptr) { - /* move existing sibling to the left */ - keys_[i] = i < n_children_ - 1 ? keys_[i + 1] : 0; - children_[i] = i < n_children_ - 1 ? 
children_[i + 1] : nullptr; - } - } - if (child_to_delete != nullptr) { - --n_children_; + InnerNode *Node16::grow(Allocator* allocator) { + auto new_node = new (allocator->AllocateAligned(sizeof(Node48)))Node48(); + uint8_t n_children = n_children_.load(std::memory_order_acquire); + uint32_t keys = keys_[0].load(std::memory_order_acquire); + uint8_t l = std::min(n_children, (uint8_t)8); + for (uint8_t i = 0; i < l; ++i) { + new_node->set_child(keys & 255, children_[i].load(std::memory_order_relaxed)); + keys >>= 8; } - return child_to_delete; -} - - InnerNode *Node16::grow() { - auto new_node = new Node48(); - new_node->prefix_ = this->prefix_; - new_node->prefix_len_ = this->prefix_len_; - std::copy(this->children_, this->children_ + this->n_children_, new_node->children_); - for (int i = 0; i < n_children_; ++i) { - new_node->indexes_[(uint8_t) this->keys_[i]] = i; + if (n_children > 8) { + n_children -= 8; + keys = keys_[1].load(std::memory_order_acquire); + for (uint8_t i = 0; i < n_children; ++i) { + new_node->set_child(keys & 255, children_[i + 8].load(std::memory_order_relaxed)); + keys >>= 8; + } } - delete this; return new_node; } @@ -122,25 +106,36 @@ void Node16::set_child(char partial_key, Node *child) { return n_children_ == 16; } - bool Node16::is_underfull() const { - return n_children_ == 4; -} - char Node16::next_partial_key(char partial_key) const { - for (int i = 0; i < n_children_; ++i) { - if (keys_[i] >= partial_key) { - return keys_[i]; - } - } + uint8_t n_children = n_children_.load(std::memory_order_acquire); + uint8_t key = partial_key + 128; + uint32_t keys = keys_[0].load(std::memory_order_acquire); + uint8_t l = std::min(n_children, (uint8_t)8); + for (uint8_t i = 0; i < l; ++i) { + if ((keys & 255) >= key) { + return (keys & 255) - 128; + } + keys >>= 8; + } + if (n_children > 8) { + n_children -= 8; + keys = keys_[1].load(std::memory_order_acquire); + for (uint8_t i = 0; i < n_children; ++i) { + if ((keys & 255) >= key) { + return (keys & 255) - 128; + } + keys >>= 8; + } + } throw std::out_of_range("provided partial key does not have a successor"); } char Node16::prev_partial_key(char partial_key) const { - for (int i = n_children_ - 1; i >= 0; --i) { - if (keys_[i] <= partial_key) { - return keys_[i]; - } - } +// for (int i = n_children_ - 1; i >= 0; --i) { +// if (keys_[i] <= partial_key) { +// return keys_[i]; +// } +// } throw std::out_of_range("provided partial key does not have a predecessor"); } diff --git a/memtable/art_node_256.h b/memtable/art_node_256.h index 71fa82f42aa..4c503eee510 100644 --- a/memtable/art_node_256.h +++ b/memtable/art_node_256.h @@ -14,15 +14,13 @@ namespace rocksdb { class Node256 : public InnerNode { public: - Node256(); + Node256() { n_children_.store(0); } + virtual ~Node256() {} - Node **find_child(char partial_key) override; + std::atomic*find_child(char partial_key) override; void set_child(char partial_key, Node *child) override; - Node *del_child(char partial_key) override; - InnerNode *grow() override; - InnerNode *shrink() override; + InnerNode *grow(Allocator* allocator) override; bool is_full() const override; - bool is_underfull() const override; char next_partial_key(char partial_key) const override; @@ -31,44 +29,27 @@ namespace rocksdb { int n_children() const override; private: - uint16_t n_children_ = 0; - std::array children_; + std::atomic n_children_; + std::atomic children_[256]; }; - Node256::Node256() { children_.fill(nullptr); } - - Node **Node256::find_child(char partial_key) { - return 
children_[128 + partial_key] != nullptr ? &children_[128 + partial_key] - : nullptr; + std::atomic*Node256::find_child(char partial_key) { + return &children_[128 + partial_key]; } - void Node256::set_child(char partial_key, Node *child) { - children_[128 + partial_key] = child; + children_[128 + partial_key].store(child, std::memory_order_release); ++n_children_; } - Node *Node256::del_child(char partial_key) { - Node *child_to_delete = children_[128 + partial_key]; - if (child_to_delete != nullptr) { - children_[128 + partial_key] = nullptr; - --n_children_; - } - return child_to_delete; -} - - InnerNode *Node256::grow() { + InnerNode *Node256::grow(Allocator* _allocator) { throw std::runtime_error("Node256 cannot grow"); } - bool Node256::is_full() const { return n_children_ == 256; } - bool Node256::is_underfull() const { - return n_children_ == 48; -} char Node256::next_partial_key(char partial_key) const { while (true) { diff --git a/memtable/art_node_4.h b/memtable/art_node_4.h index a49298cf3e4..e641af44598 100644 --- a/memtable/art_node_4.h +++ b/memtable/art_node_4.h @@ -15,18 +15,13 @@ namespace rocksdb { -class Node16; - class Node4 : public InnerNode { - friend class Node16; - public: - Node **find_child(char partial_key) override; + ~Node4() {} + std::atomic *find_child(char partial_key) override; void set_child(char partial_key, Node *child) override; - Node *del_child(char partial_key) override; - InnerNode *grow() override; + InnerNode *grow(Allocator* allocator) override; bool is_full() const override; - bool is_underfull() const override; char next_partial_key(char partial_key) const override; @@ -35,14 +30,17 @@ class Node4 : public InnerNode { int n_children() const override; private: - uint8_t n_children_ = 0; - char keys_[4]; - Node *children_[4]; + std::atomic n_children_; + std::atomic keys_; + std::atomic children_[4]; }; - Node **Node4::find_child(char partial_key) { - for (int i = 0; i < n_children_; ++i) { - if (keys_[i] == partial_key) { + std::atomic *Node4::find_child(char partial_key) { + uint8_t key = partial_key; + uint8_t n_children = n_children_.load(std::memory_order_acquire); + uint32_t keys = keys_.load(std::memory_order_acquire); + for (uint8_t i = 0; i < n_children; ++i) { + if (((keys >> (i * 8)) & 255) == key) { return &children_[i]; } } @@ -51,70 +49,57 @@ class Node4 : public InnerNode { void Node4::set_child(char partial_key, Node *child) { /* determine index for child */ - int c_i; - for (c_i = 0; c_i < n_children_ && partial_key >= keys_[c_i]; ++c_i) { - } - std::memmove(keys_ + c_i + 1, keys_ + c_i, n_children_ - c_i); - std::memmove(children_ + c_i + 1, children_ + c_i, - (n_children_ - c_i) * sizeof(void *)); - - keys_[c_i] = partial_key; - children_[c_i] = child; - ++n_children_; + uint8_t n_children = n_children_.load(std::memory_order_relaxed); + uint8_t c_i = partial_key; + uint32_t idx_value = (uint32_t)c_i << (n_children * 8); + uint32_t key = keys_.load(std::memory_order_relaxed); + keys_.store(key | idx_value, std::memory_order_release); + children_[c_i].store(child, std::memory_order_release); + n_children_.store(n_children + 1, std::memory_order_release); } - Node *Node4::del_child(char partial_key) { - Node *child_to_delete = nullptr; - for (int i = 0; i < n_children_; ++i) { - if (child_to_delete == nullptr && partial_key == keys_[i]) { - child_to_delete = children_[i]; + InnerNode *Node4::grow(Allocator* allocator) { + Node16* new_node = new (allocator->AllocateAligned(sizeof(Node16)))Node16(); + uint8_t n_children = 
n_children_.load(std::memory_order_acquire); + new_node->n_children_.store(n_children, std::memory_order_relaxed); + uint32_t keys = keys_.load(std::memory_order_acquire); + + for (uint8_t i = 0; i < n_children; ++i) { + uint8_t c = (keys >> (i * 8)) & 255; + if (c >= 128) { + new_node->set_child(c - 128, children_[i].load(std::memory_order_relaxed)); + } else { + new_node->set_child((char)c - 128, children_[i].load(std::memory_order_relaxed)); } - if (child_to_delete != nullptr) { - /* move existing sibling to the left */ - keys_[i] = i < n_children_ - 1 ? keys_[i + 1] : 0; - children_[i] = i < n_children_ - 1 ? children_[i + 1] : nullptr; - } - } - if (child_to_delete != nullptr) { - --n_children_; } - return child_to_delete; -} - - InnerNode *Node4::grow() { - auto new_node = new Node16(); - new_node->prefix_ = this->prefix_; - new_node->prefix_len_ = this->prefix_len_; - new_node->n_children_ = this->n_children_; - std::copy(this->keys_, this->keys_ + this->n_children_, new_node->keys_); - std::copy(this->children_, this->children_ + this->n_children_, new_node->children_); - delete this; return new_node; } - bool Node4::is_full() const { return n_children_ == 4; } - bool Node4::is_underfull() const { - return false; -} - char Node4::next_partial_key(char partial_key) const { - for (int i = 0; i < n_children_; ++i) { - if (keys_[i] >= partial_key) { - return keys_[i]; - } - } + uint32_t keys = keys_.load(std::memory_order_acquire); + uint8_t n_children = n_children_.load(std::memory_order_acquire); + for (uint8_t i = 0; i < n_children; ++i) { + uint8_t c = keys & 255; + if ((char)c >= partial_key) { + return (char)c; + } + keys >>= 8; + } /* return 0; */ throw std::out_of_range("provided partial key does not have a successor"); } char Node4::prev_partial_key(char partial_key) const { - for (int i = n_children_ - 1; i >= 0; --i) { - if (keys_[i] <= partial_key) { - return keys_[i]; - } - } + uint32_t keys = keys_.load(std::memory_order_acquire); + uint8_t n_children = n_children_.load(std::memory_order_acquire); + for (uint8_t i = n_children - 1; i >= 0; --i) { + uint8_t c = (keys >> (i * 8)) & 255; + if ((char)c <= partial_key) { + return (char)c; + } + } /* return 255; */ throw std::out_of_range("provided partial key does not have a predecessor"); } diff --git a/memtable/art_node_48.h b/memtable/art_node_48.h index b090ad86cb0..e163c4b06d5 100644 --- a/memtable/art_node_48.h +++ b/memtable/art_node_48.h @@ -15,85 +15,73 @@ namespace rocksdb { class Node48 : public InnerNode { - friend class Node16; - friend class Node256; public: Node48(); + virtual ~Node48() {} - Node **find_child(char partial_key) override; + std::atomic *find_child(char partial_key) override; void set_child(char partial_key, Node *child) override; - Node *del_child(char partial_key) override; - InnerNode *grow() override; + InnerNode *grow(Allocator* allocator) override; bool is_full() const override; - bool is_underfull() const override; char next_partial_key(char partial_key) const override; char prev_partial_key(char partial_key) const override; + uint8_t get_index(uint8_t key) const; + void set_index(uint8_t key, uint8_t index); int n_children() const override; private: - static const char EMPTY; + static const uint8_t EMPTY; - uint8_t n_children_ = 0; - char indexes_[256]; - Node *children_[48]; + std::atomic n_children_; + std::atomic indexes_[32]; + std::atomic children_[48]; }; Node48::Node48() { - std::fill(this->indexes_, this->indexes_ + 256, Node48::EMPTY); - std::fill(this->children_, this->children_ + 
48, nullptr);
+  for (int i = 0; i < 32; i++) {
+    indexes_[i].store(0, std::memory_order_relaxed);
+  }
+  for (int i = 0; i < 48; i++) {
+    children_[i].store(nullptr, std::memory_order_relaxed);
+  }
+  n_children_.store(0, std::memory_order_relaxed);
 }

- Node **Node48::find_child(char partial_key) {
+ std::atomic<Node*> *Node48::find_child(char partial_key) {
   // TODO(rafaelkallis): direct lookup instead of temp save?
-  uint8_t index = indexes_[128 + partial_key];
+  uint8_t index = get_index(partial_key + 128);
   return Node48::EMPTY != index ? &children_[index] : nullptr;
 }

+uint8_t Node48::get_index(uint8_t key) const {
+  uint64_t index = indexes_[key >> 3].load(std::memory_order_acquire);
+  return (index >> ((key & 7) << 3) & 255) - 1;
+}

-void Node48::set_child(char partial_key, Node *child) {
-
-  // TODO(rafaelkallis): pick random starting entry in order to increase
-  // performance? i.e. for (int i = random([0,48)); i != (i-1) % 48; i = (i+1) %
-  // 48){}
-
-  /* find empty child entry */
-  for (int i = 0; i < 48; ++i) {
-    if (children_[i] == nullptr) {
-      indexes_[128 + partial_key] = (uint8_t) i;
-      children_[i] = child;
-      break;
-    }
-  }
-  ++n_children_;
+void Node48::set_index(uint8_t key, uint8_t index) {
+  uint64_t old_index = indexes_[key >> 3].load(std::memory_order_acquire);
+  // Write (index + 1) into byte (key & 7) of the word, mirroring the byte
+  // addressing used by get_index above.
+  indexes_[key >> 3].store(old_index | ((uint64_t)(index + 1) << ((key & 7) << 3)), std::memory_order_release);
 }

- Node *Node48::del_child(char partial_key) {
-  Node *child_to_delete = nullptr;
-  unsigned char index = indexes_[128 + partial_key];
-  if (index != Node48::EMPTY) {
-    child_to_delete = children_[index];
-    indexes_[128 + partial_key] = Node48::EMPTY;
-    children_[index] = nullptr;
-    --n_children_;
-  }
-  return child_to_delete;
+void Node48::set_child(char partial_key, Node *child) {
+  uint8_t n_children = n_children_.load(std::memory_order_relaxed);
+  set_index(partial_key + 128, n_children);
+  children_[n_children].store(child, std::memory_order_release);
+  n_children_.store(n_children + 1, std::memory_order_release);
 }

- InnerNode *Node48::grow() {
-  auto new_node = new Node256();
-  new_node->prefix_ = this->prefix_;
-  new_node->prefix_len_ = this->prefix_len_;
+ InnerNode *Node48::grow(Allocator* allocator) {
+  auto new_node = new (allocator->AllocateAligned(sizeof(Node256)))Node256();
   uint8_t index;
   for (int partial_key = -128; partial_key < 127; ++partial_key) {
-    index = indexes_[128 + partial_key];
+    index = get_index(partial_key + 128);
     if (index != Node48::EMPTY) {
-      new_node->set_child(partial_key, children_[index]);
+      new_node->set_child(partial_key, children_[index].load(std::memory_order_relaxed));
     }
   }
-  delete this;
   return new_node;
 }
@@ -102,15 +90,12 @@ void Node48::set_child(char partial_key, Node *child) {
   return n_children_ == 48;
 }

- bool Node48::is_underfull() const {
-  return n_children_ == 16;
-}
-
- const char Node48::EMPTY = 48;
+ const uint8_t Node48::EMPTY = 255;

 char Node48::next_partial_key(char partial_key) const {
   while (true) {
-    if (indexes_[128 + partial_key] != Node48::EMPTY) {
+    uint8_t index = get_index(partial_key + 128);
+    if (index != Node48::EMPTY) {
       return partial_key;
     }
     if (partial_key == 127) {
@@ -122,7 +107,8 @@ void Node48::set_child(char partial_key, Node *child) {
 char Node48::prev_partial_key(char partial_key) const {
   while (true) {
-    if (indexes_[128 + partial_key] != Node48::EMPTY) {
+    uint8_t index = get_index(partial_key + 128);
+    if (index != Node48::EMPTY) {
       return partial_key;
     }
     if (partial_key == -128) {
diff --git a/memtable/artrep.cc b/memtable/artrep.cc
index 07b3f2a371f..f1b6c8475e2 100644
--- a/memtable/artrep.cc
+++ b/memtable/artrep.cc
@@ -23,256
+23,119 @@ class AdaptiveRadixTreeRep : public MemTableRep { Allocator* allocator, const SliceTransform* transform, const size_t lookahead) : MemTableRep(allocator), - skip_list_(compare, allocator), + skip_list_(allocator), cmp_(compare), transform_(transform), lookahead_(lookahead) {} KeyHandle Allocate(const size_t len, char** buf) override { - *buf = skip_list_.AllocateKey(len); - return static_cast(*buf); + // *buf = skip_list_.AllocateKey(len); + return static_cast(nullptr); } // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. void Insert(KeyHandle handle) override { - skip_list_.Insert(static_cast(handle)); + skip_list_.Insert(static_cast(handle), 0, nullptr); } bool InsertKey(KeyHandle handle) override { - return skip_list_.Insert(static_cast(handle)); - } - - void InsertWithHint(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHint(static_cast(handle), hint); - } - - bool InsertKeyWithHint(KeyHandle handle, void** hint) override { - return skip_list_.InsertWithHint(static_cast(handle), hint); - } - - void InsertConcurrently(KeyHandle handle) override { - skip_list_.InsertConcurrently(static_cast(handle)); - } - - bool InsertKeyConcurrently(KeyHandle handle) override { - return skip_list_.InsertConcurrently(static_cast(handle)); + return skip_list_.Insert(static_cast(handle), 0, nullptr); } // Returns true iff an entry that compares equal to key is in the list. bool Contains(const char* key) const override { - return skip_list_.Contains(key); + return skip_list_.Get(key) != nullptr; } size_t ApproximateMemoryUsage() override { - // All memory is allocated through allocator; nothing to report here + // All memory is allocated through allocator_; nothing to report here return 0; } void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)) override { - AdaptiveRadixTreeRep::Iterator iter(&skip_list_); - Slice dummy_slice; - for (iter.Seek(dummy_slice, k.memtable_key().data()); - iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { - } + callback_func(callback_args, skip_list_.Get(k.user_key().data())); } uint64_t ApproximateNumEntries(const Slice& start_ikey, const Slice& end_ikey) override { - std::string tmp; - uint64_t start_count = - skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey)); - uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey)); - return (end_count >= start_count) ? (end_count - start_count) : 0; +// std::string tmp; +// uint64_t start_count = +// skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey)); +// uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey)); +// return (end_count >= start_count) ? (end_count - start_count) : 0; + return 0; } ~AdaptiveRadixTreeRep() override {} - // Iteration over the contents of a skip list - class Iterator : public MemTableRep::Iterator { - typename SkipList::Iterator iter_; - - public: - // Initialize an iterator over the specified list. - // The returned iterator is not valid. - explicit Iterator( - const SkipList* list) - : iter_(list) {} - - ~Iterator() override {} - - // Returns true iff the iterator is positioned at a valid node. - bool Valid() const override { return iter_.Valid(); } - - // Returns the key at the current position. - // REQUIRES: Valid() - const char* key() const override { return iter_.key(); } - - // Advances to the next position. - // REQUIRES: Valid() - void Next() override { iter_.Next(); } - - // Advances to the previous position. 
- // REQUIRES: Valid() - void Prev() override { iter_.Prev(); } - - // Advance to the first entry with a key >= target - void Seek(const Slice& user_key, const char* memtable_key) override { - if (memtable_key != nullptr) { - iter_.Seek(memtable_key); - } else { - iter_.Seek(EncodeKey(&tmp_, user_key)); - } - } - - // Retreat to the last entry with a key <= target - void SeekForPrev(const Slice& user_key, const char* memtable_key) override { - if (memtable_key != nullptr) { - iter_.SeekForPrev(memtable_key); - } else { - iter_.SeekForPrev(EncodeKey(&tmp_, user_key)); - } - } - - // Position at the first entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToFirst() override { iter_.SeekToFirst(); } - - // Position at the last entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToLast() override { iter_.SeekToLast(); } - - protected: - std::string tmp_; // For passing to EncodeKey - }; - - // Iterator over the contents of a skip list which also keeps track of the - // previously visited node. In Seek(), it examines a few nodes after it - // first, falling back to O(log n) search from the head of the list only if - // the target key hasn't been found. - class LookaheadIterator : public MemTableRep::Iterator { - public: - explicit LookaheadIterator(const AdaptiveRadixTreeRep& rep) : - rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} - - ~LookaheadIterator() override {} - - bool Valid() const override { return iter_.Valid(); } - - const char* key() const override { - assert(Valid()); - return iter_.key(); - } - - void Next() override { - assert(Valid()); - - bool advance_prev = true; - if (prev_.Valid()) { - auto k1 = rep_.UserKey(prev_.key()); - auto k2 = rep_.UserKey(iter_.key()); - - if (k1.compare(k2) == 0) { - // same user key, don't move prev_ - advance_prev = false; - } else if (rep_.transform_) { - // only advance prev_ if it has the same prefix as iter_ - auto t1 = rep_.transform_->Transform(k1); - auto t2 = rep_.transform_->Transform(k2); - advance_prev = t1.compare(t2) == 0; - } - } - - if (advance_prev) { - prev_ = iter_; - } - iter_.Next(); - } - - void Prev() override { - assert(Valid()); - iter_.Prev(); - prev_ = iter_; - } - - void Seek(const Slice& internal_key, const char* memtable_key) override { - const char *encoded_key = - (memtable_key != nullptr) ? - memtable_key : EncodeKey(&tmp_, internal_key); - - if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { - // prev_.key() is smaller or equal to our target key; do a quick - // linear search (at most lookahead_ steps) starting from prev_ - iter_ = prev_; - - size_t cur = 0; - while (cur++ <= rep_.lookahead_ && iter_.Valid()) { - if (rep_.cmp_(encoded_key, iter_.key()) <= 0) { - return; - } - Next(); - } - } - - iter_.Seek(encoded_key); - prev_ = iter_; - } - - void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { - const char* encoded_key = (memtable_key != nullptr) - ? memtable_key - : EncodeKey(&tmp_, internal_key); - iter_.SeekForPrev(encoded_key); - prev_ = iter_; - } - - void SeekToFirst() override { - iter_.SeekToFirst(); - prev_ = iter_; - } - - void SeekToLast() override { - iter_.SeekToLast(); - prev_ = iter_; - } - - protected: - std::string tmp_; // For passing to EncodeKey +// // Iteration over the contents of a skip list +// class Iterator : public MemTableRep::Iterator { +// public: +// // Initialize an iterator over the specified list. +// // The returned iterator is not valid. 
+// explicit Iterator( +// const AdaptiveRadixTreeRep* list) +// : iter_(list) {} +// +// ~Iterator() override {} +// +// // Returns true iff the iterator is positioned at a valid node. +// bool Valid() const override { return iter_.Valid(); } +// +// // Returns the key at the current position. +// // REQUIRES: Valid() +// const char* key() const override { return iter_.key(); } +// +// // Advances to the next position. +// // REQUIRES: Valid() +// void Next() override { iter_.Next(); } +// +// // Advances to the previous position. +// // REQUIRES: Valid() +// void Prev() override { iter_.Prev(); } +// +// // Advance to the first entry with a key >= target +// void Seek(const Slice& user_key, const char* memtable_key) override { +// if (memtable_key != nullptr) { +// iter_.Seek(memtable_key); +// } else { +// iter_.Seek(EncodeKey(&tmp_, user_key)); +// } +// } +// +// // Retreat to the last entry with a key <= target +// void SeekForPrev(const Slice& user_key, const char* memtable_key) override { +// if (memtable_key != nullptr) { +// iter_.SeekForPrev(memtable_key); +// } else { +// iter_.SeekForPrev(EncodeKey(&tmp_, user_key)); +// } +// } +// +// // Position at the first entry in list. +// // Final state of iterator is Valid() iff list is not empty. +// void SeekToFirst() override { iter_.SeekToFirst(); } +// +// // Position at the last entry in list. +// // Final state of iterator is Valid() iff list is not empty. +// void SeekToLast() override { iter_.SeekToLast(); } +// +// protected: +// std::string tmp_; // For passing to EncodeKey +// }; - private: - const AdaptiveRadixTreeRep& rep_; - typename SkipList::Iterator iter_; - typename SkipList::Iterator prev_; - }; MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { - if (lookahead_ > 0) { - void *mem = - arena ? arena->AllocateAligned(sizeof(AdaptiveRadixTreeRep::LookaheadIterator)) - : operator new(sizeof(AdaptiveRadixTreeRep::LookaheadIterator)); - return new (mem) AdaptiveRadixTreeRep::LookaheadIterator(*this); - } else { - void *mem = - arena ? arena->AllocateAligned(sizeof(AdaptiveRadixTreeRep::Iterator)) - : operator new(sizeof(AdaptiveRadixTreeRep::Iterator)); - return new (mem) AdaptiveRadixTreeRep::Iterator(&skip_list_); - } +// void *mem = +// arena ? 
arena->AllocateAligned(sizeof(AdaptiveRadixTreeRep::Iterator)) +// : operator new(sizeof(AdaptiveRadixTreeRep::Iterator)); +// return new (mem) AdaptiveRadixTreeRep::Iterator(&skip_list_); + return nullptr; } }; } -MemTableRep* SkipListFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* /*logger*/) { - return new AdaptiveRadixTreeRep(compare, allocator, transform, lookahead_); -} - -MemTableRep* DoublySkipListFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* /*logger*/) { - return new AdaptiveRadixTreeRep(compare, allocator, transform, lookahead_); -} } // namespace rocksdb From 62b54879ecb8709fea575598ab4d60788403efed Mon Sep 17 00:00:00 2001 From: Little-Wallace Date: Thu, 31 Mar 2022 22:26:07 +0800 Subject: [PATCH 03/18] fix warn Signed-off-by: Little-Wallace --- db/compaction/compaction_picker_test.cc | 2 -- db/db_test.cc | 2 -- db/db_test2.cc | 5 ----- options/db_options.h | 1 - 4 files changed, 10 deletions(-) diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index c75b14a7946..7c7c550d475 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -516,13 +516,11 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { // verify whether compaction is needed based on the current // size of L0 files. - uint64_t current_size = 0; for (int i = 1; i <= kFileCount; ++i) { NewVersionStorage(1, kCompactionStyleFIFO); Add(0, i, ToString((i + 100) * 1000).c_str(), ToString((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, i * 100 + 99); - current_size += kFileSize; UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), vstorage_->CompactionScore(0) >= 1); diff --git a/db/db_test.cc b/db/db_test.cc index 16ac9f79173..ce903b6c774 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1021,7 +1021,6 @@ TEST_F(DBTest, FailMoreDbPaths) { void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { uint64_t cf_size = 0; - uint64_t cf_csize = 0; size_t file_count = 0; for (auto level_meta : cf_meta.levels) { uint64_t level_size = 0; @@ -1032,7 +1031,6 @@ void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { } ASSERT_EQ(level_meta.size, level_size); cf_size += level_size; - cf_csize += level_csize; } ASSERT_EQ(cf_meta.file_count, file_count); ASSERT_EQ(cf_meta.size, cf_size); diff --git a/db/db_test2.cc b/db/db_test2.cc index a7b3d2b4aba..33e7cc20eb5 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2199,7 +2199,6 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { Close(); Reopen(options); - uint64_t total_useful_bytes = 0; std::set read_keys; std::string value; // Iter1: Read half the DB, Read even keys @@ -2210,8 +2209,6 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } @@ -2237,8 +2234,6 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } diff --git a/options/db_options.h b/options/db_options.h index 01c737bc4c3..39ebf680db6 100644 --- 
a/options/db_options.h
+++ b/options/db_options.h
@@ -87,7 +87,6 @@ struct ImmutableDBOptions {

 struct MutableDBOptions {
   MutableDBOptions();
-  explicit MutableDBOptions(const MutableDBOptions& options) = default;
   explicit MutableDBOptions(const DBOptions& options);

   void Dump(Logger* log) const;

From 2fe3f27babb85602c33edd2a812e856b0659786e Mon Sep 17 00:00:00 2001
From: Little-Wallace
Date: Sun, 3 Apr 2022 21:07:54 +0800
Subject: [PATCH 04/18] finish iterator next and seek

Signed-off-by: Little-Wallace
---
 CMakeLists.txt                |   1 +
 include/rocksdb/memtablerep.h |  17 +
 memtable/art.h                | 311 +++++++++++++++++-
 memtable/art_node.h           |   2 +-
 memtable/art_node_16.h        |   2 +-
 memtable/art_node_256.h       |  37 ++-
 memtable/art_node_4.h         |  25 +-
 memtable/art_node_48.h        |  37 +--
 memtable/art_test.cc          | 590 ++++++++++++++++++++++++++++++++++
 memtable/artrep.cc            | 155 +++++----
 10 files changed, 1035 insertions(+), 142 deletions(-)
 create mode 100644 memtable/art_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 606cbe5b77f..972e654d662 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -974,6 +974,7 @@ if(WITH_TESTS)
         memory/arena_test.cc
         memtable/inlineskiplist_test.cc
         memtable/skiplist_test.cc
+        memtable/art_test.cc
         memtable/write_buffer_manager_test.cc
         monitoring/histogram_test.cc
         monitoring/iostats_context_test.cc
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index dca9650d8a3..594bb976bc9 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -323,6 +323,23 @@ class DoublySkipListFactory : public MemTableRepFactory {
   const size_t lookahead_;
 };

+// This uses an adaptive radix tree (ART) to store keys. Inserts are not
+// concurrent, but the tree is written with atomic stores so that readers
+// can run alongside the single writer.
+class AdaptiveRadixTreeFactory : public MemTableRepFactory {
+ public:
+  AdaptiveRadixTreeFactory() {}
+
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+  const char* Name() const override { return "AdaptiveRadixTreeFactory"; }
+
+  bool IsInsertConcurrentlySupported() const override { return false; }
+
+  bool CanHandleDuplicatedKey() const override { return false; }
+};
+
 #ifndef ROCKSDB_LITE
 // This creates MemTableReps that are backed by an std::vector. On iteration,
 // the vector is sorted. This is useful for workloads where iteration is very
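The art.h changes below implement iteration with an explicit stack of per-level NodeIterator frames rather than parent pointers: Seek descends while prefixes match, SeekLeftLeaf/SeekRightLeaf extend the stack down to a bounding leaf, and Next pops exhausted frames until one still has a right sibling. Conceptually, on a toy tree (simplified hypothetical types, not the code added here):

    #include <utility>
    #include <vector>

    struct ToyNode {
      const char* value = nullptr;     // non-null marks a leaf
      std::vector<ToyNode*> children;  // ordered by partial key
    };

    // Mirrors SeekLeftLeaf(): push a frame per level until a leaf is on top,
    // recording the path so a later Next() can resume at the right sibling.
    const char* LeftmostLeaf(ToyNode* root,
                             std::vector<std::pair<ToyNode*, size_t>>* stack) {
      ToyNode* cur = root;
      while (cur != nullptr && cur->value == nullptr) {
        stack->emplace_back(cur, 0);
        cur = cur->children.empty() ? nullptr : cur->children[0];
      }
      return cur != nullptr ? cur->value : nullptr;
    }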
This is useful for workloads where iteration is very diff --git a/memtable/art.h b/memtable/art.h index 0b26c290faa..cce7a5194fd 100644 --- a/memtable/art.h +++ b/memtable/art.h @@ -16,6 +16,39 @@ namespace rocksdb { class AdaptiveRadixTree { + struct NodeIterator { + Node* node_; + int depth_; + char cur_partial_key_ = -128; + Node* child = nullptr; + + NodeIterator(Node* node, int depth) : node_(node), depth_(depth) {} + + void Next(); + void Prev(); + bool Valid(); + void SeekToFirst(); + void SeekToLast(); + }; + + public: + struct Iterator { + std::atomic* root_; + std::vector traversal_stack_; + explicit Iterator(AdaptiveRadixTree* tree) : root_(&tree->root_) {} + void Seek(const char* key, int l); + void SeekToFirst(); + void SeekToLast(); + void SeekForPrev(const char* key, int l); + void SeekForPrevImpl(const char* key, int l); + void Next(); + bool Valid() const; + const char* Value() const { return traversal_stack_.back().node_->value; } + void SeekImpl(const char* key, int key_len); + void SeekLeftLeaf(); + void SeekRightLeaf(); + }; + public: AdaptiveRadixTree(Allocator* allocator); ~AdaptiveRadixTree(); @@ -26,7 +59,7 @@ class AdaptiveRadixTree { * @param key - The key to find. * @return the value associated with the key or a nullptr. */ - char* Get(const char *key) const; + const char* Get(const char* key) const; /** * Associates the given key with the given value. @@ -38,10 +71,8 @@ class AdaptiveRadixTree { * @return a nullptr if no other value is associated with they or the * previously associated value. */ - char *Insert(const char *key, int key_len, char* v); - + const char* Insert(const char* key, int key_len, const char* v); - char* AllocateLeafNode(const char* v, size_t value_size); Node* AllocateNode(InnerNode* inner, size_t prefix_size); private: @@ -57,7 +88,7 @@ AdaptiveRadixTree::AdaptiveRadixTree(Allocator* allocator) root_.store(nullptr, std::memory_order_relaxed); } -char* AdaptiveRadixTree::Get(const char *key) const { +const char* AdaptiveRadixTree::Get(const char* key) const { Node *cur = root_.load(std::memory_order_acquire); std::atomic* child = nullptr; int depth = 0, key_len = std::strlen(key) + 1; @@ -94,8 +125,8 @@ Node* AdaptiveRadixTree::AllocateNode(InnerNode* inner, size_t prefix_size) { return node; } - -char* AdaptiveRadixTree::Insert(const char *key, int l, char* leaf) { +const char* AdaptiveRadixTree::Insert(const char* key, int l, + const char* leaf) { int key_len = std::strlen(key) + 1, depth = 0, prefix_match_len; std::atomic* cur_address = &root_; @@ -201,7 +232,12 @@ char* AdaptiveRadixTree::Insert(const char *key, int l, char* leaf) { */ if (cur->inner->is_full()) { - cur->inner = cur->inner->grow(allocator_); + Node* old = cur; + cur = AllocateNode(cur->inner->grow(allocator_), old->prefix_len); + if (old->prefix_len > 0) { + memcpy(cur->prefix, old->prefix, old->prefix_len); + cur_address->store(cur, std::memory_order_release); + } } size_t leaf_prefix_len = key_len - depth - cur->prefix_len - 1; Node* new_node = AllocateNode(nullptr, leaf_prefix_len); @@ -222,7 +258,266 @@ char* AdaptiveRadixTree::Insert(const char *key, int l, char* leaf) { */ depth += cur->prefix_len + 1; + cur_address = child; cur = child->load(std::memory_order_relaxed); } } + +void AdaptiveRadixTree::NodeIterator::SeekToLast() { + cur_partial_key_ = node_->inner->prev_partial_key(127); + auto next = node_->inner->find_child(cur_partial_key_); + if (next != nullptr) { + child = next->load(std::memory_order_acquire); + } else { + child = nullptr; + } +} + +void 
AdaptiveRadixTree::NodeIterator::SeekToFirst() { + cur_partial_key_ = node_->inner->next_partial_key(-128); + auto next = node_->inner->find_child(cur_partial_key_); + if (next != nullptr) { + child = next->load(std::memory_order_acquire); + } else { + child = nullptr; + } +} + +void AdaptiveRadixTree::NodeIterator::Next() { + if (cur_partial_key_ == 127) { + child = nullptr; + return; + } + cur_partial_key_ = node_->inner->next_partial_key(cur_partial_key_ + 1); + auto next = node_->inner->find_child(cur_partial_key_); + if (next != nullptr) { + child = next->load(std::memory_order_acquire); + } else { + child = nullptr; + } +} +void AdaptiveRadixTree::NodeIterator::Prev() { + if (cur_partial_key_ == -128) { + child = nullptr; + return; + } + cur_partial_key_ = node_->inner->prev_partial_key(cur_partial_key_ - 1); + auto next = node_->inner->find_child(cur_partial_key_); + if (next != nullptr) { + child = next->load(std::memory_order_acquire); + } else { + child = nullptr; + } +} + +bool AdaptiveRadixTree::NodeIterator::Valid() { return child != nullptr; } + +void AdaptiveRadixTree::Iterator::Seek(const char* key, int l) { + SeekImpl(key, l); + if (!traversal_stack_.empty()) { + SeekLeftLeaf(); + } +} + +bool AdaptiveRadixTree::Iterator::Valid() const { + return !traversal_stack_.empty(); +} + +void AdaptiveRadixTree::Iterator::Next() { + NodeIterator& step = traversal_stack_.back(); + assert(step.node_->value != nullptr); + if (step.node_->inner == nullptr) { + traversal_stack_.pop_back(); + while (!traversal_stack_.empty()) { + NodeIterator& cur_step = traversal_stack_.back(); + cur_step.Next(); + if (cur_step.Valid()) { + traversal_stack_.emplace_back( + cur_step.child, cur_step.depth_ + cur_step.node_->prefix_len + 1); + break; + } + traversal_stack_.pop_back(); + } + if (!traversal_stack_.empty()) { + SeekLeftLeaf(); + } + } else { + step.SeekToFirst(); + traversal_stack_.emplace_back(step.child, + step.depth_ + step.node_->prefix_len + 1); + SeekLeftLeaf(); + } +} + +void AdaptiveRadixTree::Iterator::SeekLeftLeaf() { + if (traversal_stack_.empty()) { + return; + } + while (!traversal_stack_.back().node_->is_leaf()) { + NodeIterator& cur_step = traversal_stack_.back(); + cur_step.SeekToFirst(); + traversal_stack_.emplace_back( + cur_step.child, cur_step.depth_ + cur_step.node_->prefix_len + 1); + } +} +void AdaptiveRadixTree::Iterator::SeekRightLeaf() { + if (traversal_stack_.empty()) { + return; + } + while (!traversal_stack_.back().node_->is_leaf()) { + NodeIterator& cur_step = traversal_stack_.back(); + cur_step.SeekToLast(); + traversal_stack_.emplace_back( + cur_step.child, cur_step.depth_ + cur_step.node_->prefix_len + 1); + } +} + +void AdaptiveRadixTree::Iterator::SeekToFirst() { + traversal_stack_.clear(); + traversal_stack_.emplace_back(root_->load(std::memory_order_acquire), 0); + SeekLeftLeaf(); +} + +void AdaptiveRadixTree::Iterator::SeekToLast() { + traversal_stack_.clear(); + traversal_stack_.emplace_back(root_->load(std::memory_order_acquire), 0); +} + +void AdaptiveRadixTree::Iterator::SeekForPrev(const char* key, int key_len) { + SeekForPrevImpl(key, key_len); + SeekRightLeaf(); +} + +void AdaptiveRadixTree::Iterator::SeekForPrevImpl(const char* key, + int key_len) { + Node* cur = root_->load(std::memory_order_acquire); + + // sentinel child iterator for root + traversal_stack_.clear(); + traversal_stack_.push_back(NodeIterator(cur, 0)); + + while (!traversal_stack_.empty()) { + NodeIterator& cur_step = traversal_stack_.back(); + Node* cur_node = cur_step.node_; + 
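+      // Comparing the key against this node's compressed prefix (below)
+      // yields one of three cases: (a) the key is exhausted inside the
+      // prefix, so this subtree bounds the answer and the caller seeks its
+      // extreme leaf; (b) the bytes diverge, so every key in the subtree is
+      // entirely greater or entirely smaller than the target; (c) the
+      // prefix matches fully, so descend one level via the next key byte.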
int cur_depth = cur_step.depth_; + int prefix_match_len = cur_node->check_prefix(key, cur_depth, key_len); + // if search key "equals" the prefix + if (key_len == cur_depth + prefix_match_len) { + // if search key is "equal" or "less" than the prefix, + // we only need to seek to left leaf in this tree. + return; + } else if (prefix_match_len < cur_node->prefix_len) { + if (key[cur_depth + prefix_match_len] > + cur_node->prefix[prefix_match_len]) { + // if search key is "less than" the prefix, + // we only need to seek to left leaf in this tree. + return; + } else { + // this prefix is less than target key, it means that no key in this + // tree is greater than the target key. + traversal_stack_.clear(); + return; + } + } else { + assert(prefix_match_len == cur_node->prefix_len && + key_len > cur_depth + prefix_match_len); + // seek subtree where search key is "lesser than or equal" the subtree + // partial key + if (cur_node->is_leaf() && cur_node->inner == nullptr) { + traversal_stack_.clear(); + return; + } + std::atomic* child = + cur_node->inner->find_child(key[cur_depth + cur_node->prefix_len]); + if (child != nullptr) { + Node* next = child->load(std::memory_order_acquire); + if (next != nullptr) { + traversal_stack_.emplace_back(next, + cur_depth + cur_node->prefix_len + 1); + continue; + } + } + cur_step.SeekToLast(); + for (; cur_step.Valid(); cur_step.Prev()) { + if (key[cur_depth + cur_node->prefix_len] > cur_step.cur_partial_key_) { + break; + } + } + if (cur_step.Valid()) { + traversal_stack_.emplace_back(cur_step.child, + cur_depth + cur_node->prefix_len + 1); + } else { + traversal_stack_.clear(); + } + return; + } + } +} + +void AdaptiveRadixTree::Iterator::SeekImpl(const char* key, int key_len) { + Node* cur = root_->load(std::memory_order_acquire); + + // sentinel child iterator for root + traversal_stack_.clear(); + traversal_stack_.push_back(NodeIterator(cur, 0)); + + while (!traversal_stack_.empty()) { + NodeIterator& cur_step = traversal_stack_.back(); + Node* cur_node = cur_step.node_; + int cur_depth = cur_step.depth_; + int prefix_match_len = std::min( + cur_node->check_prefix(key, cur_depth, key_len), key_len - cur_depth); + // if search key "equals" the prefix + if (key_len == cur_depth + prefix_match_len) { + // if search key is "equal" or "less" than the prefix, + // we only need to seek to left leaf in this tree. + return; + } else if (prefix_match_len < cur_node->prefix_len) { + if (key[cur_depth + prefix_match_len] < + cur_node->prefix[prefix_match_len]) { + // if search key is "less than" the prefix, + // we only need to seek to left leaf in this tree. + return; + } else { + // this prefix is less than target key, it means that no key in this + // tree is greater than the target key. 
+ traversal_stack_.clear(); + return; + } + } else { + assert(prefix_match_len == cur_node->prefix_len && + key_len > cur_depth + prefix_match_len); + // seek subtree where search key is "lesser than or equal" the subtree + // partial key + if (cur_node->is_leaf() && cur_node->inner == nullptr) { + traversal_stack_.clear(); + return; + } + std::atomic* child = + cur_node->inner->find_child(key[cur_depth + cur_node->prefix_len]); + if (child != nullptr) { + Node* next = child->load(std::memory_order_acquire); + if (next != nullptr) { + traversal_stack_.emplace_back(next, + cur_depth + cur_node->prefix_len + 1); + continue; + } + } + cur_step.SeekToFirst(); + for (; cur_step.Valid(); cur_step.Next()) { + if (key[cur_depth + cur_node->prefix_len] < cur_step.cur_partial_key_) { + break; + } + } + if (cur_step.Valid()) { + traversal_stack_.emplace_back(cur_step.child, + cur_depth + cur_node->prefix_len + 1); + } else { + traversal_stack_.clear(); + } + return; + } + } +} + } // namespace rocksdb diff --git a/memtable/art_node.h b/memtable/art_node.h index 35a664d1c3f..8d9a9cae2e8 100644 --- a/memtable/art_node.h +++ b/memtable/art_node.h @@ -52,7 +52,7 @@ struct Node { NodeType inner_type; InnerNode* inner; - char* value; + const char* value; uint16_t prefix_len; char prefix[1]; }; diff --git a/memtable/art_node_16.h b/memtable/art_node_16.h index 00158b2411b..4619cdbc805 100644 --- a/memtable/art_node_16.h +++ b/memtable/art_node_16.h @@ -127,7 +127,7 @@ void Node16::set_child(char partial_key, Node *child) { keys >>= 8; } } - throw std::out_of_range("provided partial key does not have a successor"); + return 127; } char Node16::prev_partial_key(char partial_key) const { diff --git a/memtable/art_node_256.h b/memtable/art_node_256.h index 4c503eee510..da8476b3cd1 100644 --- a/memtable/art_node_256.h +++ b/memtable/art_node_256.h @@ -52,28 +52,27 @@ void Node256::set_child(char partial_key, Node *child) { char Node256::next_partial_key(char partial_key) const { - while (true) { - if (children_[128 + partial_key] != nullptr) { - return partial_key; - } - if (partial_key == 127) { - throw std::out_of_range("provided partial key does not have a successor"); - } - ++partial_key; - } + uint8_t key = 128 + partial_key; + while (key < 255) { + if (children_[key] != nullptr) { + return partial_key; + } + ++partial_key; + ++key; + } + return partial_key; } char Node256::prev_partial_key(char partial_key) const { - while (true) { - if (children_[128 + partial_key] != nullptr) { - return partial_key; - } - if (partial_key == -128) { - throw std::out_of_range( - "provided partial key does not have a predecessor"); - } - --partial_key; - } + uint8_t key = 128 + partial_key; + while (key > 0) { + if (children_[key] != nullptr) { + return partial_key; + } + --partial_key; + --key; + } + return partial_key; } int Node256::n_children() const { return n_children_; } diff --git a/memtable/art_node_4.h b/memtable/art_node_4.h index e641af44598..9fd95167a1a 100644 --- a/memtable/art_node_4.h +++ b/memtable/art_node_4.h @@ -36,7 +36,7 @@ class Node4 : public InnerNode { }; std::atomic *Node4::find_child(char partial_key) { - uint8_t key = partial_key; + uint8_t key = partial_key + 128; uint8_t n_children = n_children_.load(std::memory_order_acquire); uint32_t keys = keys_.load(std::memory_order_acquire); for (uint8_t i = 0; i < n_children; ++i) { @@ -50,7 +50,7 @@ class Node4 : public InnerNode { void Node4::set_child(char partial_key, Node *child) { /* determine index for child */ uint8_t n_children = 
n_children_.load(std::memory_order_relaxed); - uint8_t c_i = partial_key; + uint8_t c_i = partial_key + 128; uint32_t idx_value = (uint32_t)c_i << (n_children * 8); uint32_t key = keys_.load(std::memory_order_relaxed); keys_.store(key | idx_value, std::memory_order_release); @@ -66,11 +66,7 @@ class Node4 : public InnerNode { for (uint8_t i = 0; i < n_children; ++i) { uint8_t c = (keys >> (i * 8)) & 255; - if (c >= 128) { - new_node->set_child(c - 128, children_[i].load(std::memory_order_relaxed)); - } else { - new_node->set_child((char)c - 128, children_[i].load(std::memory_order_relaxed)); - } + new_node->set_child(c - 128, children_[i].load(std::memory_order_relaxed)); } return new_node; } @@ -78,26 +74,27 @@ class Node4 : public InnerNode { bool Node4::is_full() const { return n_children_ == 4; } char Node4::next_partial_key(char partial_key) const { + uint8_t key = partial_key + 128; uint32_t keys = keys_.load(std::memory_order_acquire); uint8_t n_children = n_children_.load(std::memory_order_acquire); for (uint8_t i = 0; i < n_children; ++i) { uint8_t c = keys & 255; - if ((char)c >= partial_key) { - return (char)c; - } + if (c >= key) { + return c - 128; + } keys >>= 8; } - /* return 0; */ - throw std::out_of_range("provided partial key does not have a successor"); + return 127; } char Node4::prev_partial_key(char partial_key) const { + uint8_t key = partial_key + 128; uint32_t keys = keys_.load(std::memory_order_acquire); uint8_t n_children = n_children_.load(std::memory_order_acquire); for (uint8_t i = n_children - 1; i >= 0; --i) { uint8_t c = (keys >> (i * 8)) & 255; - if ((char)c <= partial_key) { - return (char)c; + if (c <= key) { + return c - 128; } } /* return 255; */ diff --git a/memtable/art_node_48.h b/memtable/art_node_48.h index e163c4b06d5..33cc357a268 100644 --- a/memtable/art_node_48.h +++ b/memtable/art_node_48.h @@ -93,30 +93,25 @@ void Node48::set_child(char partial_key, Node *child) { const uint8_t Node48::EMPTY = 255; char Node48::next_partial_key(char partial_key) const { - while (true) { - uint8_t index = get_index(partial_key + 128); - if (index != Node48::EMPTY) { - return partial_key; - } - if (partial_key == 127) { - throw std::out_of_range("provided partial key does not have a successor"); - } - ++partial_key; - } + while (partial_key < 127) { + uint8_t index = get_index(partial_key + 128); + if (index != Node48::EMPTY) { + break; + } + ++partial_key; + } + return partial_key; } char Node48::prev_partial_key(char partial_key) const { - while (true) { - uint8_t index = get_index(partial_key + 128); - if (index != Node48::EMPTY) { - return partial_key; - } - if (partial_key == -128) { - throw std::out_of_range( - "provided partial key does not have a predecessor"); - } - --partial_key; - } + while (partial_key > -128) { + uint8_t index = get_index(partial_key + 128); + if (index != Node48::EMPTY) { + return partial_key; + } + --partial_key; + } + return partial_key; } int Node48::n_children() const { return n_children_; } diff --git a/memtable/art_test.cc b/memtable/art_test.cc new file mode 100644 index 00000000000..3fea530493f --- /dev/null +++ b/memtable/art_test.cc @@ -0,0 +1,590 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memtable/art.h"
+
+#include <set>
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/hash.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+typedef uint64_t Key;
+
+static const char* Encode(const uint64_t* key) {
+  return reinterpret_cast<const char*>(key);
+}
+
+static Key Decode(const char* key) {
+  Key rv;
+  memcpy(&rv, key, sizeof(Key));
+  return rv;
+}
+
+class ArtTest : public testing::Test {};
+
+TEST_F(ArtTest, Empty) {
+  Arena arena;
+  AdaptiveRadixTree list(&arena);
+
+  AdaptiveRadixTree::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek("ancd", 4);
+  ASSERT_TRUE(!iter.Valid());
+  // iter.SeekForPrev(100);
+  // ASSERT_TRUE(!iter.Valid());
+  // iter.SeekToLast();
+  // ASSERT_TRUE(!iter.Valid());
+}
+
+TEST_F(ArtTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  AdaptiveRadixTree list(&arena);
+  const char* v = "abc";
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      char* buf = arena.AllocateAligned(sizeof(Key));
+      memcpy(buf, &key, sizeof(Key));
+      list.Insert(buf, sizeof(key), v);
+    }
+  }
+
+  for (Key i = 0; i < R; i++) {
+    if (list.Get(Encode(&i))) {
+      ASSERT_EQ(keys.count(i), 1U);
+    } else {
+      ASSERT_EQ(keys.count(i), 0U);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    AdaptiveRadixTree::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    uint64_t zero = 0;
+    iter.Seek(Encode(&zero), 8);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), Decode(iter.Value()));
+
+    uint64_t max_key = R - 1;
+    iter.SeekForPrev(Encode(&max_key));
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), Decode(iter.Value()));
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), Decode(iter.Value()));
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), Decode(iter.Value()));
+  }
+
+  // Forward iteration test
+  for (Key i = 0; i < R; i++) {
+    AdaptiveRadixTree::Iterator iter(&list);
+    iter.Seek(Encode(&i));
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, Decode(iter.Value()));
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  for (Key i = 0; i < R; i++) {
+    AdaptiveRadixTree::Iterator iter(&list);
+    iter.SeekForPrev(Encode(&i));
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.upper_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.begin()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*--model_iter, Decode(iter.Value()));
+        iter.Prev();
+      }
+    }
+  }
+}
+
+TEST_F(ArtTest, InsertWithHint_Sequential) {
+  const int N = 100000;
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hint = nullptr;
+  for (int i = 0; i < N; i++) {
+    Key key = i;
+    InsertWithHint(&list, key, &hint);
+  }
+  Validate(&list);
+}
+
+TEST_F(ArtTest, InsertWithHint_MultipleHints) {
+  const int N = 100000;
+  const int S = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hints[S];
+  Key last_key[S];
+  for (int i = 0; i < S; i++) {
+    hints[i] = nullptr;
+    last_key[i] = 0;
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S);
+    Key key = (s << 32) + (++last_key[s]);
+    InsertWithHint(&list, key, &hints[s]);
+  }
+  Validate(&list);
+}
+
+TEST_F(ArtTest, InsertWithHint_MultipleHintsRandom) {
+  const int N = 100000;
+  const int S = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hints[S];
+  for (int i = 0; i < S; i++) {
+    hints[i] = nullptr;
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S);
+    Key key = (s << 32) + rnd.Next();
+    InsertWithHint(&list, key, &hints[s]);
+  }
+  Validate(&list);
+}
+
+TEST_F(ArtTest, InsertWithHint_CompatibleWithInsertWithoutHint) {
+  const int N = 100000;
+  const int S1 = 100;
+  const int S2 = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  std::unordered_set<Key> used;
+  Key with_hint[S1];
+  Key without_hint[S2];
+  void* hints[S1];
+  for (int i = 0; i < S1; i++) {
+    hints[i] = nullptr;
+    while (true) {
+      Key s = rnd.Next();
+      if (used.insert(s).second) {
+        with_hint[i] = s;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < S2; i++) {
+    while (true) {
+      Key s = rnd.Next();
+      if (used.insert(s).second) {
+        without_hint[i] = s;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S1 + S2);
+    if (s < S1) {
+      Key key = (with_hint[s] << 32) + rnd.Next();
+      InsertWithHint(&list, key, &hints[s]);
+    } else {
+      Key key = (without_hint[s - S1] << 32) + rnd.Next();
+      Insert(&list, key);
+    }
+  }
+  Validate(&list);
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed. Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key. We then iterate, including random
+// calls to Next() and Seek(). For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
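The comment above fixes the key layout at 24 high bits of key, 32 bits of generation, and a low byte carrying a hash of the other two, which is exactly what the key()/gen()/hash() accessors of ConcurrentTest below decode. As a sanity check, here is a minimal standalone round-trip of that packing; it is illustrative only (not part of the patch), and the hash byte is an arbitrary stand-in for HashNumbers(key, gen):

// Standalone sketch: round-trips the <key,gen,hash> packing described above.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t k = 5;     // key part; must fit in 24 bits
  const uint64_t g = 42;    // generation part; must fit in 32 bits
  const uint64_t h = 0x9c;  // hash-check byte (stand-in for Hash(key,gen))
  const uint64_t packed = (k << 40) | (g << 8) | (h & 0xff);
  assert((packed >> 40) == k);                 // key(packed)
  assert(((packed >> 8) & 0xffffffffu) == g);  // gen(packed)
  assert((packed & 0xff) == h);                // hash(packed)
  return 0;
}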
+class ConcurrentTest {
+ public:
+  static const uint32_t K = 8;
+
+ private:
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = {k, g};
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    std::atomic<int> generation[K];
+    void Set(int k, int v) {
+      generation[k].store(v, std::memory_order_release);
+    }
+    int Get(int k) { return generation[k].load(std::memory_order_acquire); }
+
+    State() {
+      for (unsigned int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // The tree is not protected by any mutex: a single writer thread
+  // performs all modifications.
+  AdaptiveRadixTree list_;
+
+ public:
+  ConcurrentTest() : list_(TestComparator(), &arena_) {}
+
+  // REQUIRES: No concurrent calls to WriteStep or ConcurrentWriteStep
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    char* buf = list_.AllocateKey(sizeof(Key));
+    memcpy(buf, &new_key, sizeof(Key));
+    list_.Insert(buf);
+    current_.Set(k, g);
+  }
+
+  // REQUIRES: No concurrent calls for the same k
+  void ConcurrentWriteStep(uint32_t k) {
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    char* buf = list_.AllocateKey(sizeof(Key));
+    memcpy(buf, &new_key, sizeof(Key));
+    list_.InsertConcurrently(buf);
+    ASSERT_EQ(g, current_.Get(k) + 1);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (unsigned int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    AdaptiveRadixTree::Iterator iter(&list_);
+    iter.Seek(Encode(&pos));
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = Decode(iter.Value());
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0U) ||
+                    (gen(pos) > static_cast<uint64_t>(initial_state.Get(
+                                    static_cast<int>(key(pos))))))
+            << "key: " << key(pos) << "; gen: " << gen(pos)
+            << "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(Encode(&new_target));
+        }
+      }
+    }
+  }
+};
+
+template
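A note on the node-level hunks earlier in this patch: next_partial_key() and prev_partial_key() no longer throw std::out_of_range when a node has no successor or predecessor; they saturate at the extreme partial keys (127 and -128) and leave it to callers to disambiguate by probing find_child(). A minimal standalone sketch of that convention, under the assumption that child keys are kept sorted; the function and parameter names here are illustrative, not the patch's:

#include <cstdint>

// Child keys are stored biased by +128 so that unsigned comparison matches
// signed char order, the same trick Node4::find_child relies on.
char NextPartialKeySketch(const uint8_t* sorted_keys, int n, char partial_key) {
  uint8_t key = static_cast<uint8_t>(partial_key + 128);
  for (int i = 0; i < n; ++i) {
    if (sorted_keys[i] >= key) {
      return static_cast<char>(sorted_keys[i] - 128);  // smallest key >= input
    }
  }
  return 127;  // sentinel: no successor in this node
}

Because 127 doubles as a legitimate partial key and the exhaustion sentinel, a caller that receives 127 presumably still has to probe find_child(127) before concluding the node is exhausted.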