Skip to content

Commit

Permalink
reorder facts to minimize ownership data
Browse files Browse the repository at this point in the history
Summary: In exchange for a ~10% regression in indexing time, we can increase write speed with ownership by ~2x. See the comment for details.

Reviewed By: donsbot

Differential Revision: D67284128

fbshipit-source-id: 43913ed229e9e5278bad271fe2a5dd1c9e135f06
  • Loading branch information
Simon Marlow authored and facebook-github-bot committed Dec 19, 2024
1 parent 52861fa commit 9e6e4b9
Showing 1 changed file with 134 additions and 8 deletions.
142 changes: 134 additions & 8 deletions glean/cpp/glean.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,152 @@ void BatchBase::beginUnit(std::string unit) {
current = &owned.back();
}

/*
Minimizing ownership data
We start with all the facts produced for a translation unit in a single
ownership interval
[A, B)
The fact batch goes through a few rounds of de-duplication on its
way to the DB. If we end up de-duplicating some of the facts, the ownership
data remains, so we end up with
[a,b) [c,d) [e,f) [A, B)
Large numbers of intervals are expensive to process each time we de-duplicate
this batch.
So what we do is to partition the facts into two: "root" facts that are not
reachable from any other fact, and facts that are reachable. As long as the
root facts are covered by ownership, we don't need to cover the reachable
facts since the ownership will be automatically propagated by the server
later. We sort the facts so that the root facts are all together, meaning
we still have a single ownership interval for the batch.
By having fewer facts covered by ownership, we will have fewer
ownership intervals to process. This speeds up writing, and also results in
less ownership data to process on the server during the propagation phase.
*/
// Rebuilds `facts` into a fresh FactSet in which every fact in [start, end)
// that is not referenced by another fact in that range is placed at the end,
// in one contiguous run of ids.
//
// Parameters:
//   inventory - used to look up the predicate for each fact's type.
//   facts     - the existing fact buffer; it is read here, not modified.
//   anchor    - passed to the rts::Stacked<rts::Define> that the new facts
//               are defined through.
//   start/end - half-open id range [start, end) to minimise ownership for.
//               The computation uses `end - 1`, and the only caller visible
//               here invokes this with end > start.
//
// Returns a pair of:
//   first  - an interval set holding the single closed interval
//            [owned_start, owned_end - 1] of ids in the new buffer;
//   second - the newly built FactSet.
std::pair<rts::closed_interval_set<Id>, rts::FactSet> minimalOwnership(
const rts::Inventory& inventory,
rts::FactSet& facts,
rts::LookupCache::Anchor& anchor,
Id start,
Id end) {
// Start by treating every fact in [start, end) as owned; referenced facts
// are removed below.
rts::closed_interval_set<Id> owned;
owned.add({start, end - 1});

// iterate through the facts in reverse order
// remove referenced facts from the interval_set
{
// Callback handed to predicate->traverse: any id it receives that lies in
// [start, end) is dropped from `owned`. Ids outside the range are ignored.
// (Presumably traverse invokes it once per fact reference in the clause --
// confirm against rts::Predicate.)
const auto eachRef = rts::syscall([&owned, start, end](Id id, Pid) {
if (id >= start && id < end) {
owned.erase(id);
}
});

auto it = facts.enumerateBack(end, start);
while (auto fact = it->get()) {
const auto* predicate = inventory.lookupPredicate(fact.type);
// NOTE(review): an unknown predicate is only caught by assert here, whereas
// addFact below reports it via error(); confirm whether the difference is
// intentional.
assert(predicate);
predicate->traverse(eachRef, fact.clause);
it->next();
}
}

// The replacement buffer starts at the same id as the original. `subst`
// records, for each old id, the id the fact received in the new buffer.
rts::FactSet new_buffer(facts.startingId());
rts::Stacked<rts::Define> new_facts(&anchor, &new_buffer);
rts::MutableSubstitution subst(facts.startingId(), facts.size());

// Maps a referenced old id to its new id via `subst`.
auto rename =
rts::Predicate::Rename([&](Id id, Pid) { return subst.subst(id); });

// Copies fact `id` into the new buffer: the clause is run through
// predicate->typecheck with `rename` so the output clause is built from the
// renamed references, the result is defined in new_facts, and the
// old -> new mapping is recorded in `subst`. Because `rename` consults
// `subst`, facts must be added after the facts they reference.
auto addFact = [&](Id id) {
facts.factById(id, [&](Pid type, rts::Fact::Clause fact) {
if (const auto* predicate = inventory.lookupPredicate(type)) {
binary::Output out;
uint64_t key_size;
predicate->typecheck(rename, fact, out, key_size);
const auto clause = rts::Fact::Clause::from(out.bytes(), key_size);
auto new_id = new_facts.define(type, clause, Id::invalid());
CHECK(new_id != Id::invalid());
subst.set(id, new_id);
} else {
error("invalid predicate id {}", type);
}
});
};

// first add all the facts that are not owned
{
// `last` is the first id not yet handled; the gaps between consecutive
// owned intervals are exactly the non-owned facts.
Id last = facts.startingId();
auto it = owned.begin();
if (it != owned.end()) {
// up to the first owned fact, we can just copy because the substitution
// is the identity
for (auto id = last; id < it->lower(); id++) {
facts.factById(id, [&](Pid type, rts::Fact::Clause fact) {
auto new_id = new_facts.define(type, fact, Id::invalid());
CHECK(new_id != Id::invalid());
subst.set(id, new_id);
});
}
last = it->upper() + 1;

// Remaining gaps: these facts come after at least one owned fact that has
// been skipped, so they go through addFact (which renames references).
it++;
for (; it != owned.end(); it++) {
for (auto id = last; id < it->lower(); id++) {
addFact(id);
}
last = it->upper() + 1;
}
}
// Anything after the last owned interval (or everything, if `owned` is
// empty) up to the end of the original buffer.
for (auto id = last; id < facts.firstFreeId(); id++) {
addFact(id);
}
}

// next add all the facts that are owned, so we have one interval
auto owned_start = new_facts.firstFreeId();
for (auto it = owned.begin(); it != owned.end(); it++) {
for (auto id = it->lower(); id <= it->upper(); id++) {
addFact(id);
}
}
auto owned_end = new_facts.firstFreeId();

// NOTE(review): if `owned` were empty, owned_end == owned_start and the
// interval added below would be {owned_start, owned_start - 1}; confirm
// whether that case can occur and how closed_interval_set treats it.
rts::closed_interval_set<Id> res;
res.add({owned_start, owned_end - 1});
return std::make_pair(std::move(res), std::move(new_buffer));
}

// Closes the unit that `current` points at (set in beginUnit).
//
// If the unit produced any facts (finish > start), the buffer is rebuilt via
// minimalOwnership: the unit's ownership set is replaced by the interval set
// it returns, and `buffer` is replaced by the reordered fact set. The unit's
// `finish` is then refreshed from the (possibly replaced) buffer, `current`
// is cleared and `seen_units` is incremented.
//
// Calling this with no open unit only logs an error.
void BatchBase::endUnit() {
  if (current == nullptr) {
    LOG(ERROR) << "mismatched endUnit";
    return;
  }

  // Provisional end of the unit, measured on the buffer before reordering.
  current->finish = buffer.firstFreeId();
  if (current->finish > current->start) {
    auto reordered = minimalOwnership(
        inventory->inventory,
        buffer,
        anchor,
        current->start,
        current->finish);
    current->facts = std::move(reordered.first);
    buffer = std::move(reordered.second);
  }

  LOG(INFO) << current->unit << ": " << current->facts.size() << " / "
            << current->facts.iterative_size();

  // Re-read the end id: the buffer may have just been replaced above.
  current->finish = buffer.firstFreeId();
  current = nullptr;
  ++seen_units;
}

// Defines a fact of predicate `ty` with the given clause via `facts` and
// returns the resulting id.
//
// When the define yields a truthy id and a unit is currently open, the id is
// also added to that unit's ownership set.
//
// The previous body had a second, unreachable
// `return facts.define(ty, clause);` after `return id;`; it has been removed.
//
// NOTE(review): endUnit overwrites `current->facts` with the result of
// minimalOwnership whenever the unit is non-empty, so the per-fact add below
// may be redundant -- confirm whether this function is meant to reduce to a
// plain `return facts.define(ty, clause);`.
Id BatchBase::define(Pid ty, rts::Fact::Clause clause) {
  const auto id = facts.define(ty, clause);
  if (id && current != nullptr) {
    current->facts.add(id);
  }
  return id;
}

void BatchBase::logEnd() const {
Expand Down

0 comments on commit 9e6e4b9

Please sign in to comment.