Skip to content

Commit

Permalink
RCORE-2064 String EQ/NEQ optimisations for compressed strings (#7820)
Browse files Browse the repository at this point in the history
* find_first optimization for compressed strings

* core test passing

* compression tests for collection of strings

* code review

* Fixes (#7872)

---------

Co-authored-by: Jørgen Edelbo <[email protected]>
  • Loading branch information
nicola-cab and jedelbo authored Jul 10, 2024
1 parent ce6f196 commit 1afc39e
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 26 deletions.
23 changes: 20 additions & 3 deletions src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,14 @@ StringData ArrayString::get(size_t ndx) const
return {};
}

std::optional<StringID> realm::ArrayString::get_string_id(size_t ndx) const
{
if (m_type == Type::interned_strings) {
return StringID(static_cast<Array*>(m_arr)->get(ndx));
}
return m_string_interner->lookup(get(ndx));
}

Mixed ArrayString::get_any(size_t ndx) const
{
return Mixed(get(ndx));
Expand Down Expand Up @@ -274,6 +282,16 @@ void ArrayString::clear()
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept
{
// This should only be called if we don't have a string id for this particular array (aka no string interner)
std::optional<StringID> id;
if (m_type == Type::interned_strings)
id = m_string_interner->lookup(value);

return find_first(value, begin, end, id);
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional<StringID> id) const noexcept
{
switch (m_type) {
case Type::small_strings:
Expand All @@ -289,14 +307,13 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
break;
}
case Type::interned_strings: {
// we need a way to avoid this lookup for each leaf array. The lookup must appear
// higher up the call stack and passed down.
auto id = m_string_interner->lookup(value);
if (id) {
return static_cast<Array*>(m_arr)->find_first(*id, begin, end);
}
break;
}
default:
break;
}
return not_found;
}
Expand Down
9 changes: 9 additions & 0 deletions src/realm/array_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <realm/array_string_short.hpp>
#include <realm/array_blobs_small.hpp>
#include <realm/array_blobs_big.hpp>
#include <realm/string_interner.hpp>

namespace realm {

Expand Down Expand Up @@ -74,6 +75,10 @@ class ArrayString : public ArrayPayload {
{
m_string_interner = string_interner;
}
bool is_compressed() const
{
return m_type == Type::interned_strings;
}

void update_parent()
{
Expand All @@ -99,6 +104,7 @@ class ArrayString : public ArrayPayload {
}
void insert(size_t ndx, StringData value);
StringData get(size_t ndx) const;
std::optional<StringID> get_string_id(size_t ndx) const;
Mixed get_any(size_t ndx) const override;
bool is_null(size_t ndx) const;
void erase(size_t ndx);
Expand All @@ -107,6 +113,9 @@ class ArrayString : public ArrayPayload {

size_t find_first(StringData value, size_t begin, size_t end) const noexcept;

/// Special version for searching in an array or compressed strings.
size_t find_first(StringData value, size_t begin, size_t end, std::optional<StringID>) const noexcept;

size_t lower_bound(StringData value);

/// Get the specified element without the cost of constructing an
Expand Down
21 changes: 13 additions & 8 deletions src/realm/query_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ bool StringNode<Equal>::do_consume_condition(ParentNode& node)
size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
{
if (m_needles.empty()) {
return m_leaf->find_first(m_string_value, start, end);
return m_leaf->find_first(m_string_value, start, end, m_interned_string_id);
}
else {
if (end == npos)
Expand Down Expand Up @@ -505,7 +505,8 @@ size_t StringNode<EqualIns>::_find_first_local(size_t start, size_t end)
}

StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm)
: StringNodeEqualBase(v, column)
: m_value(v)
, m_col(column)
, m_link_map(std::move(lm))
{
if (!m_link_map)
Expand All @@ -518,17 +519,21 @@ void StringNodeFulltext::table_changed()
}

StringNodeFulltext::StringNodeFulltext(const StringNodeFulltext& other)
: StringNodeEqualBase(other)
: ParentNode(other)
, m_value(other.m_value)
, m_col(other.m_col)
, m_link_map(std::make_unique<LinkMap>(*other.m_link_map))
{
m_link_map = std::make_unique<LinkMap>(*other.m_link_map);
}

void StringNodeFulltext::_search_index_init()
void StringNodeFulltext::init(bool will_query_ranges)
{
StringIndex* index = m_link_map->get_target_table()->get_string_index(ParentNode::m_condition_column_key);
ParentNode::init(will_query_ranges);

StringIndex* index = m_link_map->get_target_table()->get_string_index(m_col);
REALM_ASSERT(index && index->is_fulltext_index());
m_index_matches.clear();
index->find_all_fulltext(m_index_matches, StringNodeBase::m_string_value);
index->find_all_fulltext(m_index_matches, m_value);

// If links exists, use backlinks to find the original objects
if (m_link_map->links_exist()) {
Expand All @@ -541,7 +546,7 @@ void StringNodeFulltext::_search_index_init()
}

m_index_evaluator = IndexEvaluator{};
m_index_evaluator->init(&m_index_matches);
m_index_evaluator.init(&m_index_matches);
}

std::unique_ptr<ArrayPayload> TwoColumnsNodeBase::update_cached_leaf_pointers_for_column(Allocator& alloc,
Expand Down
48 changes: 40 additions & 8 deletions src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ class ParentNode {
{
m_dD = 100.0;

if (m_condition_column_key)
m_table->check_column(m_condition_column_key);
if (m_child)
m_child->init(will_query_ranges);
}
Expand Down Expand Up @@ -1647,6 +1649,11 @@ class StringNodeBase : public ParentNode {
m_dT = 10.0;
}

void table_changed() override
{
m_string_interner = m_table.unchecked_ptr()->get_string_interner(m_condition_column_key);
}

void cluster_changed() override
{
m_leaf.emplace(m_table.unchecked_ptr()->get_alloc());
Expand All @@ -1662,6 +1669,7 @@ class StringNodeBase : public ParentNode {
m_end_s = 0;
m_leaf_start = 0;
m_leaf_end = 0;
m_interned_string_id = m_string_interner->lookup(m_value);
}

virtual void clear_leaf_state()
Expand All @@ -1673,6 +1681,8 @@ class StringNodeBase : public ParentNode {
: ParentNode(from)
, m_value(from.m_value)
, m_string_value(m_value)
, m_string_interner(from.m_string_interner)
, m_interned_string_id(from.m_interned_string_id)
{
}

Expand All @@ -1687,6 +1697,8 @@ class StringNodeBase : public ParentNode {
std::optional<std::string> m_value;
std::optional<ArrayString> m_leaf;
StringData m_string_value;
StringInterner* m_string_interner = nullptr;
std::optional<StringID> m_interned_string_id;

size_t m_end_s = 0;
size_t m_leaf_start = 0;
Expand All @@ -1703,7 +1715,7 @@ template <class TConditionFunction>
class StringNode : public StringNodeBase {
public:
constexpr static bool case_sensitive_comparison =
is_any_v<TConditionFunction, Greater, GreaterEqual, Less, LessEqual>;
is_any_v<TConditionFunction, NotEqual, Greater, GreaterEqual, Less, LessEqual>;
StringNode(StringData v, ColKey column)
: StringNodeBase(v, column)
{
Expand Down Expand Up @@ -1732,8 +1744,21 @@ class StringNode : public StringNodeBase {
TConditionFunction cond;

for (size_t s = start; s < end; ++s) {
if constexpr (std::is_same_v<TConditionFunction, NotEqual>) {
if (m_leaf->is_compressed()) {
if (m_interned_string_id) {
// The search string has been interned, so there might be a match
// We can compare the string IDs directly
const auto id = m_leaf->get_string_id(s);
if (m_string_interner->compare(*m_interned_string_id, *id) == 0) {
// The value matched, so we continue to the next value
continue;
}
}
return s;
}
}
StringData t = get_string(s);

if constexpr (case_sensitive_comparison) {
// case insensitive not implemented for: >, >=, <, <=
if (cond(t, m_string_value))
Expand Down Expand Up @@ -2061,20 +2086,24 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
size_t _find_first_local(size_t start, size_t end) override;
};


class StringNodeFulltext : public StringNodeEqualBase {
class StringNodeFulltext : public ParentNode {
public:
StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm = {});

void table_changed() override;

void _search_index_init() override;
void init(bool will_query_ranges) override;

bool has_search_index() const override
{
return true; // it's a required precondition for fulltext queries
}

const IndexEvaluator* index_based_keys() override
{
return &m_index_evaluator;
}

std::unique_ptr<ParentNode> clone() const override
{
return std::unique_ptr<ParentNode>(new StringNodeFulltext(*this));
Expand All @@ -2086,13 +2115,16 @@ class StringNodeFulltext : public StringNodeEqualBase {
}

private:
std::vector<ObjKey> m_index_matches;
std::string m_value;
ColKey m_col;
std::unique_ptr<LinkMap> m_link_map;
IndexEvaluator m_index_evaluator;
std::vector<ObjKey> m_index_matches;
StringNodeFulltext(const StringNodeFulltext&);

size_t _find_first_local(size_t, size_t) override
size_t find_first_local(size_t start, size_t end) override
{
REALM_UNREACHABLE();
return m_index_evaluator.do_search_index(m_cluster, start, end);
}
};

Expand Down
20 changes: 18 additions & 2 deletions src/realm/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1735,9 +1735,25 @@ ObjKey Table::find_first(ColKey col_key, T value) const
using LeafType = typename ColumnTypeTraits<T>::cluster_leaf_type;
LeafType leaf(get_alloc());

auto f = [&key, &col_key, &value, &leaf](const Cluster* cluster) {
// In case of a string column we can try to look up the StringID of the search string,
// and search for that in case the leaf is compressed.
std::optional<StringID> string_id;
if constexpr (std::is_same_v<T, StringData>) {
auto string_interner = get_string_interner(col_key);
REALM_ASSERT(string_interner != nullptr);
string_id = string_interner->lookup(value);
}

auto f = [&](const Cluster* cluster) {
cluster->init_leaf(col_key, &leaf);
size_t row = leaf.find_first(value, 0, cluster->node_size());
size_t row;
if constexpr (std::is_same_v<T, StringData>) {
row = leaf.find_first(value, 0, cluster->node_size(), string_id);
}
else {
row = leaf.find_first(value, 0, cluster->node_size());
}

if (row != realm::npos) {
key = cluster->get_real_key(row);
return IteratorControl::Stop;
Expand Down
40 changes: 35 additions & 5 deletions test/test_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,13 @@ columns or queries involved
*/


TEST(Query_NextGen_StringConditions)
TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type)
{
Group group;
TableRef table1 = group.add_table("table1");
SHARED_GROUP_TEST_PATH(path);

auto db = DB::create(make_in_realm_history(), path);
auto wt = db->start_write();
TableRef table1 = wt->add_table("table1");
auto col_str1 = table1->add_column(type_String, "str1");
auto col_str2 = table1->add_column(type_String, "str2");

Expand All @@ -342,6 +345,11 @@ TEST(Query_NextGen_StringConditions)
table1->create_object().set_all("!", "x").get_key();
ObjKey key_1_2 = table1->create_object().set_all("bar", "r").get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

ObjKey m;
// Equal
m = table1->column<String>(col_str1).equal("bar", false).find();
Expand Down Expand Up @@ -433,7 +441,7 @@ TEST(Query_NextGen_StringConditions)
CHECK_EQUAL(m, null_key);

// Test various compare operations with null
TableRef table2 = group.add_table("table2");
TableRef table2 = wt->add_table("table2");
auto col_str3 = table2->add_column(type_String, "str3", true);

ObjKey key_2_0 = table2->create_object().set(col_str3, "foo").get_key();
Expand All @@ -442,6 +450,11 @@ TEST(Query_NextGen_StringConditions)
ObjKey key_2_3 = table2->create_object().set(col_str3, "bar").get_key();
ObjKey key_2_4 = table2->create_object().set(col_str3, "").get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

size_t cnt;
cnt = table2->column<String>(col_str3).contains(StringData("")).count();
CHECK_EQUAL(cnt, 4);
Expand Down Expand Up @@ -522,6 +535,12 @@ TEST(Query_NextGen_StringConditions)
}
};

// not equal
check_results((table2->column<String>(col_str3) != StringData("")), {StringData(), "foo", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData()), {"", "foo", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData("foo")), {StringData(), "", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData("barr")), {StringData(), "", "foo", "bar", "!"});

// greater
check_results((table2->column<String>(col_str3) > StringData("")), {"foo", "bar", "!"});
check_results((table2->column<String>(col_str3) > StringData("b")), {"foo", "bar"});
Expand Down Expand Up @@ -553,7 +572,7 @@ TEST(Query_NextGen_StringConditions)
check_results((table2->column<String>(col_str3) <= StringData("barrrr")), {"bar", "", "!", StringData()});
check_results((table2->column<String>(col_str3) <= StringData("z")), {"foo", "bar", "", "!", StringData()});

TableRef table3 = group.add_table(StringData("table3"));
TableRef table3 = wt->add_table(StringData("table3"));
auto col_link1 = table3->add_column(*table2, "link1");

table3->create_object().set(col_link1, key_2_0);
Expand All @@ -562,6 +581,11 @@ TEST(Query_NextGen_StringConditions)
table3->create_object().set(col_link1, key_2_3);
table3->create_object().set(col_link1, key_2_4);

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

cnt = table3->link(col_link1).column<String>(col_str3).contains(StringData("")).count();
CHECK_EQUAL(cnt, 4);

Expand Down Expand Up @@ -638,8 +662,14 @@ TEST(Query_NextGen_StringConditions)
"This is a long search string that does not contain the word being searched for!, "
"This is a long search string that does not contain the word being searched for!, "
"needle";

table2->create_object().set(col_str3, long_string).get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

cnt = table2->column<String>(col_str3).contains(search_1, false).count();
CHECK_EQUAL(cnt, 1);

Expand Down

0 comments on commit 1afc39e

Please sign in to comment.