Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nereids] consider numNulls in filter estimation #29496

Merged
merged 1 commit into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation
colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr())
.setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr())
.setNdv(union.getDistinctValues());
double maxNumNulls = Math.max(leftColStats.numNulls, rightColStats.numNulls);
colBuilder.setNumNulls(Math.min(colBuilder.getCount(), maxNumNulls));
orStats.addColumnStats(slot, colBuilder.build());
}
}
Expand Down Expand Up @@ -175,7 +177,7 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon
}

private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context, boolean contains) {
ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr,
statsForRight.maxValue, statsForRight.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
Expand All @@ -185,7 +187,7 @@ private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic st
}

private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context, boolean contains) {
ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr,
statsForLeft.maxValue, statsForLeft.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
Expand All @@ -202,12 +204,9 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp,
return estimateEqualTo(cp, statsForLeft, statsForRight, context);
} else {
if (cp instanceof LessThan || cp instanceof LessThanEqual) {
return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight,
context, cp instanceof LessThanEqual);
return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context);
} else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) {

return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context,
cp instanceof GreaterThanEqual);
return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context);
} else {
throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql()));
}
Expand All @@ -225,6 +224,7 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats
} else {
selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv);
}
selectivity = getNotNullSelectivity(statsForLeft, selectivity);
Statistics equalStats = context.statistics.withSel(selectivity);
Expression left = cp.left();
equalStats.addColumnStats(left, statsForRight);
Expand Down Expand Up @@ -331,10 +331,12 @@ A not in (1, 2, 3, 100):
selectivity = 1.0;
}
}
compareExprStatsBuilder.setNumNulls(0);
Statistics estimated = new Statistics(context.statistics);
ColumnStatistic stats = compareExprStatsBuilder.build();
selectivity = getNotNullSelectivity(stats, selectivity);
estimated = estimated.withSel(selectivity);
estimated.addColumnStats(compareExpr,
compareExprStatsBuilder.build());
estimated.addColumnStats(compareExpr, stats);
context.addKeyIfSlot(compareExpr);
return estimated;
}
Expand Down Expand Up @@ -394,6 +396,11 @@ public Statistics visitNot(Not not, EstimationContext context) {
.setMaxValue(originColStats.maxValue)
.setMaxExpr(originColStats.maxExpr);
}
if (not.child().getInputSlots().size() == 1 && !(child instanceof IsNull)) {
// only consider the single column numNull, otherwise, ignore
rowCount = Math.max(rowCount - originColStats.numNulls, 1);
statisticsBuilder.setRowCount(rowCount);
}
statisticsBuilder.putColumnStatistics(slot, colBuilder.build());
}
}
Expand Down Expand Up @@ -460,15 +467,18 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnSta
.setMaxValue(Double.POSITIVE_INFINITY)
.setMaxExpr(null)
.setNdv(0)
.setCount(0);
.setCount(0)
.setNumNulls(0);
} else {
leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
.setMinValue(intersectRange.getLow())
.setMinExpr(intersectRange.getLowExpr())
.setMaxValue(intersectRange.getHigh())
.setMaxExpr(intersectRange.getHighExpr())
.setNdv(intersectRange.getDistinctValues());
.setNdv(intersectRange.getDistinctValues())
.setNumNulls(0);
double sel = leftRange.overlapPercentWith(rightRange);
sel = getNotNullSelectivity(leftStats, sel);
updatedStatistics = context.statistics.withSel(sel);
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
}
Expand All @@ -488,6 +498,7 @@ private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatis
intersectBuilder.setNdv(intersect.getDistinctValues());
intersectBuilder.setMinValue(intersect.getLow());
intersectBuilder.setMaxValue(intersect.getHigh());
intersectBuilder.setNumNulls(0);
double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv));
Statistics updatedStatistics = context.statistics.withSel(sel);
updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build());
Expand Down Expand Up @@ -568,10 +579,34 @@ public Statistics visitLike(Like like, EstimationContext context) {
"col stats not found. slot=%s in %s",
like.left().toSql(), like.toSql());
ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin);
colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0);
double selectivity = StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv);
double notNullSel = getNotNullSelectivity(origin, selectivity);
colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY)
.setCount(notNullSel * context.statistics.getRowCount()).setNumNulls(0);
statsBuilder.putColumnStatistics(like.left(), colBuilder.build());
context.addKeyIfSlot(like.left());
}
return statsBuilder.build();
}

/**
 * Scales a selectivity by the non-null fraction of a column.
 *
 * <p>The non-null fraction is {@code 1 - numNulls / count}, with the null ratio
 * passed through {@link #getValidSelectivity(double)} first. When {@code count}
 * is at most 1 the null ratio is ignored and the fraction is taken as 1.
 *
 * @param stats column statistics supplying {@code count} and {@code numNulls}
 * @param origSel selectivity computed without regard to null values
 * @return {@code origSel} multiplied by the non-null fraction, passed through
 *         {@link #getValidSelectivity(double)}
 */
private double getNotNullSelectivity(ColumnStatistic stats, double origSel) {
    double total = stats.count;
    double nulls = stats.numNulls;

    // numNulls is deliberately NOT capped at (rowCount - ndv) here: the current row
    // count and ndv may be inconsistent, e.g. the row count has already been reduced
    // by one filter while another filter column's ndv and numNulls still hold their
    // original values, so such a normalization would kick in unexpectedly.
    double nonNullFraction;
    if (total <= 1.0) {
        nonNullFraction = 1.0;
    } else {
        nonNullFraction = 1 - getValidSelectivity(nulls / total);
    }
    return getValidSelectivity(origSel * nonNullFraction);
}

/**
 * Limits a selectivity to the closed interval [0, 1].
 *
 * @param nullSel candidate selectivity
 * @return 0 if the input is negative, 1 if it is greater than 1, otherwise the input
 *         unchanged (a NaN input fails both comparisons and is returned as is)
 */
private static double getValidSelectivity(double nullSel) {
    if (nullSel < 0) {
        return 0;
    }
    if (nullSel > 1) {
        return 1;
    }
    return nullSel;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ public void testOrNaN() {
Or or = new Or(greaterThan1, lessThan);
Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
.setNumNulls(500).setDataSize(0)
.setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
.setNumNulls(500).setDataSize(0)
.setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
columnStat.put(a, aStats);
columnStat.put(b, bStats);
Expand All @@ -93,10 +93,10 @@ public void testAndNaN() {
And and = new And(greaterThan1, lessThan);
Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500)
.setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500)
.setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
columnStat.put(a, aStats);
columnStat.put(b, bStats);
Expand Down Expand Up @@ -185,13 +185,13 @@ public void test1() {
Or or = new Or(and, equalTo);
Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>();
ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500)
.setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500)
.setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
ColumnStatistic cStats = new ColumnStatisticBuilder().setCount(500).setNdv(500)
.setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
.setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
slotToColumnStat.put(a, aStats);
slotToColumnStat.put(b, bStats);
Expand Down Expand Up @@ -910,4 +910,193 @@ public void testIsNotNull() {
Statistics result = filterEstimation.estimate(not, stats);
Assertions.assertEquals(result.getRowCount(), 90);
}

/**
 * Filter {@code a = 1} on a column with count=10, numNulls=8, ndv=2 and range [1, 2].
 * The estimated row count is expected to be 1.0 (within 0.01).
 */
@Test
public void testNumNullsEqualTo() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    EqualTo equalTo = new EqualTo(a, int1);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(equalTo, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(1.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a > 1} on a column with count=10, numNulls=8, ndv=2 and range [1, 2].
 * The estimated row count is expected to be 2.0 (within 0.01).
 */
@Test
public void testNumNullsComparable() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    GreaterThan greaterThan = new GreaterThan(a, int1);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(greaterThan, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(2.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a in (1, 2)} on a column with count=10, numNulls=8, ndv=2 and range [1, 2].
 * The estimated row count is expected to be 10.0 (within 0.01).
 *
 * <p>NOTE(review): all 10 rows are expected even though 8 of them are null and cannot
 * satisfy an IN predicate; confirm this pins intended behaviour rather than a gap in
 * the numNulls handling of the in-predicate estimation.
 */
@Test
public void testNumNullsIn() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    IntegerLiteral int2 = new IntegerLiteral(2);
    InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(in, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(10.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code not (a = 1)} on a column with count=10, numNulls=8, ndv=2 and range [1, 2].
 * The estimated row count is expected to be 1.0 (within 0.01).
 */
@Test
public void testNumNullsNotEqualTo() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    EqualTo equalTo = new EqualTo(a, int1);
    Not not = new Not(equalTo);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(not, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(1.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a not in (1, 2)} on a column with count=10, numNulls=8, ndv=2 and
 * range [1, 2]. The estimated row count is expected to be 1.0 (within 0.01).
 */
@Test
public void testNumNullsNotIn() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    IntegerLiteral int2 = new IntegerLiteral(2);
    InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
    Not not = new Not(in);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(not, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(1.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a >= 1 and a <= 2} on a column with count=10, numNulls=8, ndv=2 and
 * range [1, 2]. The estimated row count is expected to be 2.0 (within 0.01).
 */
@Test
public void testNumNullsAnd() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    IntegerLiteral int2 = new IntegerLiteral(2);
    GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
    LessThanEqual lessThanEqual = new LessThanEqual(a, int2);
    And and = new And(greaterThanEqual, lessThanEqual);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(and, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(2.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a >= 2 or a <= 1} on a column with count=10, numNulls=8, ndv=2 and
 * range [1, 2]. The estimated row count is expected to be 2.0 (within 0.01).
 */
@Test
public void testNumNullsOr() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    IntegerLiteral int2 = new IntegerLiteral(2);
    // Note the bounds: the lower-bound predicate uses 2 and the upper-bound one uses 1.
    GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int2);
    LessThanEqual lessThanEqual = new LessThanEqual(a, int1);
    Or or = new Or(greaterThanEqual, lessThanEqual);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(or, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(2.0, result.getRowCount(), 0.01);
}

/**
 * Filter {@code a >= 1 or a is null} on a column with count=10, numNulls=8, ndv=2 and
 * range [1, 2]. The estimated row count is expected to be 10.0 (within 0.01).
 */
@Test
public void testNumNullsOrIsNull() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
    IsNull isNull = new IsNull(a);
    Or or = new Or(greaterThanEqual, isNull);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(or, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes
    // failure messages report the right "expected" value.
    Assertions.assertEquals(10.0, result.getRowCount(), 0.01);
}

}
Loading