Skip to content

Commit

Permalink
Better size estimate for SpatialJoin (#1555)
Browse files Browse the repository at this point in the history
Up until now the `SpatialJoin` class always returned the product of the sizes of both children as its own size estimate. For a k-nearest-neighbor join this is usually way larger than the actual maximum size of the result. This PR implements a more precise size estimate.
  • Loading branch information
ullingerc authored Oct 18, 2024
1 parent 91c6184 commit 2b4e6b3
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 17 deletions.
18 changes: 16 additions & 2 deletions src/engine/SpatialJoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,19 @@ size_t SpatialJoin::getCostEstimate() {
// ____________________________________________________________________________
uint64_t SpatialJoin::getSizeEstimateBeforeLimit() {
if (childLeft_ && childRight_) {
return childLeft_->getSizeEstimate() * childRight_->getSizeEstimate();
// If we limit the number of results to k, even in the largest scenario, the
// result can be at most `|childLeft| * k`
auto maxResults = getMaxResults();
if (maxResults.has_value()) {
return childLeft_->getSizeEstimate() * maxResults.value();
}

// If we don't limit the number of results, we cannot draw conclusions about
// the size, other than the worst case `|childLeft| * |childRight|`. However
// to improve query planning for the average case, we apply a constant
// factor (the asymptotic behavior remains unchanged).
return (childLeft_->getSizeEstimate() * childRight_->getSizeEstimate()) /
SPATIAL_JOIN_MAX_DIST_SIZE_ESTIMATE;
}
return 1; // dummy return if not both children are added
}
Expand Down Expand Up @@ -263,7 +275,9 @@ float SpatialJoin::getMultiplicity(size_t col) {
column -= childLeft_->getResultWidth();
}
auto distinctnessChild = getDistinctness(child, column);
return static_cast<float>(getSizeEstimate()) / distinctnessChild;
return static_cast<float>(childLeft_->getSizeEstimate() *
childRight_->getSizeEstimate()) /
distinctnessChild;
} else {
return 1;
}
Expand Down
6 changes: 6 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ static const std::string MAX_DIST_IN_METERS = "<max-distance-in-meters:";
static constexpr auto MAX_DIST_IN_METERS_REGEX =
ctll::fixed_string{"<max-distance-in-meters:(?<dist>[0-9]+)>"};

// The spatial join operation without a limit on the maximum number of results
// can, in the worst case have a square number of results, but usually this is
// not the case. 1 divided by this constant is the damping factor for the
// estimated number of results.
static const size_t SPATIAL_JOIN_MAX_DIST_SIZE_ESTIMATE = 1000;

// This predicate is one of the supported identifiers for the SpatialJoin class.
// For a given triple of the form ?a <nearest-neighbors:XXXX:YYYY> ?b, it will
// produce for each value of ?a the nearest neighbors ?b to ?a. The number of
Expand Down
25 changes: 10 additions & 15 deletions test/engine/SpatialJoinTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "engine/Join.h"
#include "engine/QueryExecutionTree.h"
#include "engine/SpatialJoin.h"
#include "global/Constants.h"
#include "parser/data/Variable.h"

namespace { // anonymous namespace to avoid linker problems
Expand Down Expand Up @@ -608,12 +609,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst,
}
};

auto assert_double_with_bounds = [](double value1, double value2) {
// ASSERT_DOUBLE_EQ did not work properly
ASSERT_TRUE(value1 * 0.99999 < value2);
ASSERT_TRUE(value1 * 1.00001 > value2);
};

const double doubleBound = 0.00001;
std::string kg = createSmallDatasetWithPoints();

// add multiplicities to test knowledge graph
Expand Down Expand Up @@ -704,15 +700,15 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst,
auto sizeEstimateChild = inputChild->getSizeEstimate();
double distinctnessChild = sizeEstimateChild / multiplicityChild;
auto mult = spatialJoin->getMultiplicity(i);
auto sizeEst = spatialJoin->getSizeEstimate();
auto sizeEst = std::pow(sizeEstimateChild, 2);
double distinctness = sizeEst / mult;
// multiplicity, distinctness and size are related via the formula
// size = distinctness * multiplicity. Therefore if we have two of them,
// we can calculate the third one. Here we check, that this formula
// holds true. The distinctness must not change after the operation, the
// other two variables can change. Therefore we check the correctness
// via distinctness.
assert_double_with_bounds(distinctnessChild, distinctness);
ASSERT_NEAR(distinctnessChild, distinctness, doubleBound);
}
}
} else {
Expand All @@ -723,12 +719,12 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst,
ASSERT_EQ(spatialJoin->getSizeEstimate(), 1);
auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable);
spatialJoin = static_cast<SpatialJoin*>(spJoin2.get());
// the size should be 49, because both input tables have 7 rows and it is
// assumed, that the whole cross product is build
// the size should be at most 49, because both input tables have 7 rows and
// it is assumed, that in the worst case the whole cross product is build
auto estimate =
spatialJoin->onlyForTestingGetLeftChild()->getSizeEstimate() *
spatialJoin->onlyForTestingGetRightChild()->getSizeEstimate();
ASSERT_EQ(estimate, spatialJoin->getSizeEstimate());
ASSERT_LE(spatialJoin->getSizeEstimate(), estimate);
}

{ // new block for hard coded testing
Expand Down Expand Up @@ -790,9 +786,8 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst,
auto assertMultiplicity = [&](Variable var, double multiplicity,
SpatialJoin* spatialJoin,
VariableToColumnMap& varColsMap) {
assert_double_with_bounds(
spatialJoin->getMultiplicity(varColsMap[var].columnIndex_),
multiplicity);
ASSERT_NEAR(spatialJoin->getMultiplicity(varColsMap[var].columnIndex_),
multiplicity, doubleBound);
};
multiplicitiesBeforeAllChildrenAdded(spatialJoin);
auto spJoin1 = spatialJoin->addChild(firstChild, firstVariable);
Expand All @@ -815,7 +810,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst,
spatialJoin = static_cast<SpatialJoin*>(spJoin1.get());
auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable);
spatialJoin = static_cast<SpatialJoin*>(spJoin2.get());
ASSERT_EQ(spatialJoin->getSizeEstimate(), 49);
ASSERT_LE(spatialJoin->getSizeEstimate(), 49);
}
}
}
Expand Down

0 comments on commit 2b4e6b3

Please sign in to comment.