-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement lazy `Distinct` operation
#1558
base: master
Are you sure you want to change the base?
Changes from all commits
eb9f1c6
22ad6b6
4f27992
81614cd
f50cf30
dce9e4f
5210c7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,10 +4,7 @@ | |
|
||
#include "./Distinct.h" | ||
|
||
#include <sstream> | ||
|
||
#include "engine/CallFixedSize.h" | ||
#include "engine/Engine.h" | ||
#include "engine/QueryExecutionTree.h" | ||
|
||
using std::endl; | ||
|
@@ -19,7 +16,7 @@ size_t Distinct::getResultWidth() const { return subtree_->getResultWidth(); } | |
// _____________________________________________________________________________ | ||
// Builds the operation: forwards `qec` to the `Operation` base, moves
// `subtree` into `subtree_`, and copies `keepIndices` into `_keepIndices`.
// NOTE(review): this is a rendered diff, so the two `keepIndices` parameter
// lines below are presumably the removed and the added side of one change.
Distinct::Distinct(QueryExecutionContext* qec, | ||
std::shared_ptr<QueryExecutionTree> subtree, | ||
const vector<ColumnIndex>& keepIndices) | ||
const std::vector<ColumnIndex>& keepIndices) | ||
: Operation(qec), subtree_(std::move(subtree)), _keepIndices(keepIndices) {} | ||
|
||
// _____________________________________________________________________________ | ||
|
@@ -36,17 +33,107 @@ VariableToColumnMap Distinct::computeVariableToColumnMap() const { | |
return subtree_->getVariableColumns(); | ||
} | ||
|
||
// Coroutine that applies `distinct<WIDTH>` to every table produced by
// `originalGenerator`. The last row of the most recent non-empty result is
// handed to the next `distinct` call as `previousRow`, so rows at the start of
// the next table that match it on `keepIndices` are dropped as well.
// If `aggregateTable` holds a value, all non-empty results are appended to it
// and it is yielded once after the input is exhausted; otherwise every
// non-empty result is yielded as soon as it has been computed.
template <size_t WIDTH> | ||
cppcoro::generator<IdTable> Distinct::lazyDistinct( | ||
cppcoro::generator<IdTable> originalGenerator, | ||
RobinTF marked this conversation as resolved.
Show resolved
Hide resolved
|
||
std::vector<ColumnIndex> keepIndices, | ||
std::optional<IdTable> aggregateTable) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a good reason to not use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mainly for simplicity reasons. This function is currently static just like the regular distinct function and if a bool was passed instead you'd have to pass the width and the allocator to construct the |
||
// Last row emitted so far; empty until the first non-empty result.
std::optional<typename IdTableStatic<WIDTH>::row_type> previousRow = | ||
std::nullopt; | ||
for (IdTable& idTable : originalGenerator) { | ||
IdTable result = | ||
distinct<WIDTH>(std::move(idTable), keepIndices, previousRow); | ||
// Empty results are neither emitted nor used to update `previousRow`.
if (!result.empty()) { | ||
previousRow.emplace(result.asStaticView<WIDTH>().back()); | ||
if (aggregateTable.has_value()) { | ||
aggregateTable.value().insertAtEnd(result); | ||
} else { | ||
co_yield result; | ||
} | ||
} | ||
} | ||
// In aggregating mode the accumulated table is the only value yielded.
if (aggregateTable.has_value()) { | ||
co_yield aggregateTable.value(); | ||
} | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
// Computes the result of this operation from the result of `subtree_`.
// NOTE(review): this span is a rendered diff. It appears to interleave the
// removed lines of the previous implementation (the ones building `idTable`
// via `Engine::distinct`) with the added lines; the comments below describe
// the added lines only.
ProtoResult Distinct::computeResult([[maybe_unused]] bool requestLaziness) { | ||
IdTable idTable{getExecutionContext()->getAllocator()}; | ||
ProtoResult Distinct::computeResult(bool requestLaziness) { | ||
LOG(DEBUG) << "Getting sub-result for distinct result computation..." << endl; | ||
std::shared_ptr<const Result> subRes = subtree_->getResult(); | ||
// NOTE(review): the `true` argument presumably asks the child for a lazy
// result - confirm against `QueryExecutionTree::getResult`.
std::shared_ptr<const Result> subRes = subtree_->getResult(true); | ||
|
||
LOG(DEBUG) << "Distinct result computation..." << endl; | ||
idTable.setNumColumns(subRes->idTable().numColumns()); | ||
size_t width = subRes->idTable().numColumns(); | ||
CALL_FIXED_SIZE(width, &Engine::distinct, subRes->idTable(), _keepIndices, | ||
&idTable); | ||
LOG(DEBUG) << "Distinct result computation done." << endl; | ||
return {std::move(idTable), resultSortedOn(), subRes->getSharedLocalVocab()}; | ||
size_t width = subtree_->getResultWidth(); | ||
// Materialized child result: run `distinct` on a clone of its table, with no
// previous row to compare against.
if (subRes->isFullyMaterialized()) { | ||
IdTable idTable = | ||
CALL_FIXED_SIZE(width, &Distinct::distinct, subRes->idTable().clone(), | ||
_keepIndices, std::nullopt); | ||
LOG(DEBUG) << "Distinct result computation done." << endl; | ||
return {std::move(idTable), resultSortedOn(), | ||
subRes->getSharedLocalVocab()}; | ||
} | ||
|
||
// Otherwise wrap the child's `idTables()` in `lazyDistinct`. An aggregate
// table is only supplied when laziness was NOT requested, so that the
// generator collects everything into that one table.
auto generator = CALL_FIXED_SIZE( | ||
width, &Distinct::lazyDistinct, std::move(subRes->idTables()), | ||
_keepIndices, | ||
requestLaziness ? std::nullopt | ||
: std::optional{IdTable{width, allocator()}}); | ||
// NOTE(review): `getSingleElement` presumably extracts the one table the
// aggregating generator yields - confirm against its definition.
if (!requestLaziness) { | ||
IdTable result = cppcoro::getSingleElement(std::move(generator)); | ||
return {std::move(result), resultSortedOn(), subRes->getSharedLocalVocab()}; | ||
} | ||
return {std::move(generator), resultSortedOn(), | ||
subRes->getSharedLocalVocab()}; | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
// Removes duplicate rows from `dynInput` in place and returns the reduced
// table. Two rows count as duplicates if they agree on every column listed in
// `keepIndices`; each row is compared with the last row that was kept. If
// `previousRow` holds a value, the leading rows of `dynInput` that match it
// are removed as well. Checks (via `AD_CONTRACT_CHECK`) that `keepIndices`
// has no more entries than `dynInput` has columns.
template <size_t WIDTH> | ||
IdTable Distinct::distinct( | ||
IdTable dynInput, const std::vector<ColumnIndex>& keepIndices, | ||
std::optional<typename IdTableStatic<WIDTH>::row_type> previousRow) { | ||
AD_CONTRACT_CHECK(keepIndices.size() <= dynInput.numColumns()); | ||
LOG(DEBUG) << "Distinct on " << dynInput.size() << " elements.\n"; | ||
IdTableStatic<WIDTH> result = std::move(dynInput).toStatic<WIDTH>(); | ||
|
||
// True iff `a` and `b` agree on every column in `keepIndices`.
auto matchesRow = [&keepIndices](const auto& a, const auto& b) { | ||
for (ColumnIndex i : keepIndices) { | ||
if (a[i] != b[i]) { | ||
return false; | ||
} | ||
} | ||
return true; | ||
}; | ||
|
||
// Variant of `std::ranges::unique` that allows to skip the first rows of | ||
// elements found in the previous table. | ||
// `first` becomes the first row that does not match `previousRow`, or the
// very first row if there is no previous row.
auto first = std::ranges::find_if(result, [&matchesRow, | ||
&previousRow](const auto& row) { | ||
return !previousRow.has_value() || !matchesRow(row, previousRow.value()); | ||
}); | ||
auto last = result.end(); | ||
|
||
// `dest` is the write position; after the compaction loop below it points at
// the last row that was kept.
auto dest = result.begin(); | ||
// Nothing was skipped at the front: jump straight to the first pair of
// adjacent duplicates, so the duplicate-free prefix stays where it is.
if (first == dest) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I see it,there are two optimizations:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In total I think there might be a version with less code possible here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
// Optimization to avoid redundant move operations. | ||
first = std::ranges::adjacent_find(first, last, matchesRow); | ||
dest = first; | ||
if (first != last) { | ||
++first; | ||
} | ||
// Leading rows were skipped: move the first surviving row to the front.
} else if (first != last) { | ||
*dest = std::move(*first); | ||
} | ||
|
||
// Compact the remaining rows: a row is kept only if it differs from the last
// kept row (`*dest`) on the `keepIndices` columns.
if (first != last) { | ||
while (++first != last) { | ||
if (!matchesRow(*dest, *first)) { | ||
*++dest = std::move(*first); | ||
} | ||
} | ||
++dest; | ||
} | ||
// Drop the leftover tail. If every row matched `previousRow`, `dest` is still
// `begin()` and the table becomes empty.
result.erase(dest, last); | ||
|
||
LOG(DEBUG) << "Distinct done.\n"; | ||
return std::move(result).toDynamic(); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,30 +3,26 @@ | |
// Author: Björn Buchhold ([email protected]) | ||
#pragma once | ||
|
||
#include <utility> | ||
#include <vector> | ||
|
||
#include "engine/Operation.h" | ||
#include "engine/QueryExecutionTree.h" | ||
#include "parser/ParsedQuery.h" | ||
|
||
using std::vector; | ||
|
||
class Distinct : public Operation { | ||
private: | ||
std::shared_ptr<QueryExecutionTree> subtree_; | ||
vector<ColumnIndex> _keepIndices; | ||
std::vector<ColumnIndex> _keepIndices; | ||
|
||
public: | ||
Distinct(QueryExecutionContext* qec, | ||
std::shared_ptr<QueryExecutionTree> subtree, | ||
const vector<ColumnIndex>& keepIndices); | ||
const std::vector<ColumnIndex>& keepIndices); | ||
|
||
[[nodiscard]] size_t getResultWidth() const override; | ||
|
||
[[nodiscard]] string getDescriptor() const override; | ||
|
||
[[nodiscard]] vector<ColumnIndex> resultSortedOn() const override { | ||
[[nodiscard]] std::vector<ColumnIndex> resultSortedOn() const override { | ||
return subtree_->resultSortedOn(); | ||
} | ||
|
||
|
@@ -46,15 +42,32 @@ class Distinct : public Operation { | |
|
||
bool knownEmptyResult() override { return subtree_->knownEmptyResult(); } | ||
|
||
vector<QueryExecutionTree*> getChildren() override { | ||
std::vector<QueryExecutionTree*> getChildren() override { | ||
return {subtree_.get()}; | ||
} | ||
|
||
protected: | ||
[[nodiscard]] string getCacheKeyImpl() const override; | ||
|
||
private: | ||
ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; | ||
ProtoResult computeResult(bool requestLaziness) override; | ||
|
||
VariableToColumnMap computeVariableToColumnMap() const override; | ||
|
||
template <size_t WIDTH> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Short docstring please. |
||
static cppcoro::generator<IdTable> lazyDistinct( | ||
cppcoro::generator<IdTable> originalGenerator, | ||
std::vector<ColumnIndex> keepIndices, | ||
std::optional<IdTable> aggregateTable); | ||
|
||
// Removes all duplicates from input with regards to the columns | ||
// in keepIndices. The input needs to be sorted on the keep indices, | ||
// otherwise the result of this function is undefined. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please also document the `previousRow` parameter. |
||
template <size_t WIDTH> | ||
static IdTable distinct( | ||
IdTable dynInput, const std::vector<ColumnIndex>& keepIndices, | ||
std::optional<typename IdTableStatic<WIDTH>::row_type> previousRow); | ||
|
||
FRIEND_TEST(Distinct, distinct); | ||
FRIEND_TEST(Distinct, distinctWithEmptyInput); | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.