diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index b716b350f24b84..c086aaef5c81bb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -117,6 +117,8 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr()) .setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr()) .setNdv(union.getDistinctValues()); + double maxNumNulls = Math.max(leftColStats.numNulls, rightColStats.numNulls); + colBuilder.setNumNulls(Math.min(colBuilder.getCount(), maxNumNulls)); orStats.addColumnStats(slot, colBuilder.build()); } } @@ -175,7 +177,7 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon } private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - ColumnStatistic statsForRight, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, statsForRight.maxValue, statsForRight.maxExpr, statsForLeft.ndv, leftExpr.getDataType()); @@ -185,7 +187,7 @@ private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic st } private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - ColumnStatistic statsForRight, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, statsForLeft.maxValue, statsForLeft.maxExpr, statsForLeft.ndv, leftExpr.getDataType()); @@ -202,12 +204,9 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, return 
estimateEqualTo(cp, statsForLeft, statsForRight, context); } else { if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, - context, cp instanceof LessThanEqual); + return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - - return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context, - cp instanceof GreaterThanEqual); + return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); } @@ -225,6 +224,7 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats } else { selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv); } + selectivity = getNotNullSelectivity(statsForLeft, selectivity); Statistics equalStats = context.statistics.withSel(selectivity); Expression left = cp.left(); equalStats.addColumnStats(left, statsForRight); @@ -331,10 +331,12 @@ A not in (1, 2, 3, 100): selectivity = 1.0; } } + compareExprStatsBuilder.setNumNulls(0); Statistics estimated = new Statistics(context.statistics); + ColumnStatistic stats = compareExprStatsBuilder.build(); + selectivity = getNotNullSelectivity(stats, selectivity); estimated = estimated.withSel(selectivity); - estimated.addColumnStats(compareExpr, - compareExprStatsBuilder.build()); + estimated.addColumnStats(compareExpr, stats); context.addKeyIfSlot(compareExpr); return estimated; } @@ -394,6 +396,11 @@ public Statistics visitNot(Not not, EstimationContext context) { .setMaxValue(originColStats.maxValue) .setMaxExpr(originColStats.maxExpr); } + if (not.child().getInputSlots().size() == 1 && !(child instanceof IsNull)) { + // only consider the single column numNull, otherwise, ignore + rowCount = Math.max(rowCount - originColStats.numNulls, 1); + statisticsBuilder.setRowCount(rowCount); + } 
statisticsBuilder.putColumnStatistics(slot, colBuilder.build()); } } @@ -460,15 +467,18 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnSta .setMaxValue(Double.POSITIVE_INFINITY) .setMaxExpr(null) .setNdv(0) - .setCount(0); + .setCount(0) + .setNumNulls(0); } else { leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) .setMinValue(intersectRange.getLow()) .setMinExpr(intersectRange.getLowExpr()) .setMaxValue(intersectRange.getHigh()) .setMaxExpr(intersectRange.getHighExpr()) - .setNdv(intersectRange.getDistinctValues()); + .setNdv(intersectRange.getDistinctValues()) + .setNumNulls(0); double sel = leftRange.overlapPercentWith(rightRange); + sel = getNotNullSelectivity(leftStats, sel); updatedStatistics = context.statistics.withSel(sel); leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); } @@ -488,6 +498,7 @@ private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatis intersectBuilder.setNdv(intersect.getDistinctValues()); intersectBuilder.setMinValue(intersect.getLow()); intersectBuilder.setMaxValue(intersect.getHigh()); + intersectBuilder.setNumNulls(0); double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); Statistics updatedStatistics = context.statistics.withSel(sel); updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build()); @@ -568,10 +579,34 @@ public Statistics visitLike(Like like, EstimationContext context) { "col stats not found. 
slot=%s in %s", like.left().toSql(), like.toSql()); ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin); - colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0); + double selectivity = StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv); + double notNullSel = getNotNullSelectivity(origin, selectivity); + colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY) + .setCount(notNullSel * context.statistics.getRowCount()).setNumNulls(0); statsBuilder.putColumnStatistics(like.left(), colBuilder.build()); context.addKeyIfSlot(like.left()); } return statsBuilder.build(); } + + private double getNotNullSelectivity(ColumnStatistic stats, double origSel) { + double rowCount = stats.count; + double numNulls = stats.numNulls; + + // the following check is commented out since the current rowCount and ndv may be inconsistent, + // e.g., rowCount has been reduced by one filter while another filter column's + // ndv and numNulls still keep their original values, which would unexpectedly trigger the following + // normalization. + + //if (numNulls > rowCount - ndv) { + // numNulls = rowCount - ndv > 0 ? rowCount - ndv : 0; + //} + double notNullSel = rowCount <= 1.0 ? 1.0 : 1 - getValidSelectivity(numNulls / rowCount); + double validSel = origSel * notNullSel; + return getValidSelectivity(validSel); + } + + private static double getValidSelectivity(double nullSel) { + return nullSel < 0 ? 0 : (nullSel > 1 ? 
1 : nullSel); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 888ec139df1cda..177fac64f16818 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -64,10 +64,10 @@ public void testOrNaN() { Or or = new Or(greaterThan1, lessThan); Map columnStat = new HashMap<>(); ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4) - .setNumNulls(500).setDataSize(0) + .setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4) - .setNumNulls(500).setDataSize(0) + .setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build(); columnStat.put(a, aStats); columnStat.put(b, bStats); @@ -93,10 +93,10 @@ public void testAndNaN() { And and = new And(greaterThan1, lessThan); Map columnStat = new HashMap<>(); ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) - .setAvgSizeByte(4).setNumNulls(500).setDataSize(0) + .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) - .setAvgSizeByte(4).setNumNulls(500).setDataSize(0) + .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build(); columnStat.put(a, aStats); columnStat.put(b, bStats); @@ -185,13 +185,13 @@ public void test1() { Or or = new Or(and, equalTo); Map slotToColumnStat = new HashMap<>(); ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) - .setAvgSizeByte(4).setNumNulls(500).setDataSize(0) + 
.setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) - .setAvgSizeByte(4).setNumNulls(500).setDataSize(0) + .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); ColumnStatistic cStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) - .setAvgSizeByte(4).setNumNulls(500).setDataSize(0) + .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); slotToColumnStat.put(a, aStats); slotToColumnStat.put(b, bStats); @@ -910,4 +910,193 @@ public void testIsNotNull() { Statistics result = filterEstimation.estimate(not, stats); Assertions.assertEquals(result.getRowCount(), 90); } + + /** + * a = 1 + */ + @Test + public void testNumNullsEqualTo() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + EqualTo equalTo = new EqualTo(a, int1); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(equalTo, stats); + Assertions.assertEquals(result.getRowCount(), 1.0, 0.01); + } + + /** + * a > 1 + */ + @Test + public void testNumNullsComparable() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + GreaterThan greaterThan = new GreaterThan(a, int1); + Statistics stats = new Statistics(10, new HashMap<>()); + 
stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(greaterThan, stats); + Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); + } + + /** + * a in (1, 2) + */ + @Test + public void testNumNullsIn() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + IntegerLiteral int2 = new IntegerLiteral(2); + InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2)); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(in, stats); + Assertions.assertEquals(result.getRowCount(), 10.0, 0.01); + } + + /** + * not a = 1 + */ + @Test + public void testNumNullsNotEqualTo() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + EqualTo equalTo = new EqualTo(a, int1); + Not not = new Not(equalTo); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(not, stats); + Assertions.assertEquals(result.getRowCount(), 1.0, 0.01); + } + + /** + * a not in (1, 2) + */ + @Test + public void testNumNullsNotIn() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + 
.setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + IntegerLiteral int2 = new IntegerLiteral(2); + InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2)); + Not not = new Not(in); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(not, stats); + Assertions.assertEquals(result.getRowCount(), 1.0, 0.01); + } + + /** + * a >= 1 and a <= 2 + */ + @Test + public void testNumNullsAnd() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + IntegerLiteral int2 = new IntegerLiteral(2); + GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1); + LessThanEqual lessThanEqual = new LessThanEqual(a, int2); + And and = new And(greaterThanEqual, lessThanEqual); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(and, stats); + Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); + } + + /** + * a >= 2 or a <= 1 + */ + @Test + public void testNumNullsOr() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + IntegerLiteral int2 = new IntegerLiteral(2); + GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int2); + LessThanEqual lessThanEqual = new LessThanEqual(a, int1); + Or or = new Or(greaterThanEqual, lessThanEqual); + Statistics stats = new 
Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(or, stats); + Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); + } + + /** + * a >= 1 or a is null + */ + @Test + public void testNumNullsOrIsNull() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(2) + .setAvgSizeByte(4) + .setNumNulls(8) + .setMaxValue(2) + .setMinValue(1) + .setCount(10); + IntegerLiteral int1 = new IntegerLiteral(1); + GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1); + IsNull isNull = new IsNull(a); + Or or = new Or(greaterThanEqual, isNull); + Statistics stats = new Statistics(10, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(or, stats); + Assertions.assertEquals(result.getRowCount(), 10.0, 0.01); + } + }