Skip to content

Commit

Permalink
[fix](nereids) "not is null" stats estimation fix (#28860)
Browse files Browse the repository at this point in the history
* fix not is null stats
  • Loading branch information
englefly authored Dec 27, 2023
1 parent 58e7ad8 commit 576a2b3
Show file tree
Hide file tree
Showing 14 changed files with 331 additions and 303 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,8 @@ public Statistics visitIsNull(IsNull isNull, EstimationContext context) {
.setMinValue(Double.NEGATIVE_INFINITY)
.setNdv(0);
StatisticsBuilder builder = new StatisticsBuilder(context.statistics);
builder.putColumnStatistics(isNull.child(), colBuilder.build());
builder.setRowCount(outputRowCount);
builder.putColumnStatistics(isNull, colBuilder.build());
context.addKeyIfSlot(isNull.child());
return builder.build();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.doris.nereids.trees.expressions.GreaterThan;
import org.apache.doris.nereids.trees.expressions.GreaterThanEqual;
import org.apache.doris.nereids.trees.expressions.InPredicate;
import org.apache.doris.nereids.trees.expressions.IsNull;
import org.apache.doris.nereids.trees.expressions.LessThan;
import org.apache.doris.nereids.trees.expressions.LessThanEqual;
import org.apache.doris.nereids.trees.expressions.Not;
Expand Down Expand Up @@ -872,4 +873,41 @@ public void testDateRangeSelectivity() {
Statistics result = filterEstimation.estimate(greaterThan, stats);
Assertions.assertEquals(result.getRowCount(), 10, 0.1);
}

/**
 * Checks the row-count estimate for the predicate {@code a IS NULL}.
 *
 * <p>Column {@code a} is registered with 10 nulls and the input statistics carry 100 rows;
 * the filter is expected to be estimated at 10 rows.
 */
@Test
public void testIsNull() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(100)
            .setAvgSizeByte(4)
            .setNumNulls(10)
            .setMaxValue(100)
            .setMinValue(0)
            .setCount(100);
    IsNull isNull = new IsNull(a);
    Statistics stats = new Statistics(100, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(isNull, stats);
    // JUnit's contract is (expected, actual, delta): keep the expected value first so a failure
    // message is labelled correctly, and compare the double row count with a tolerance, matching
    // the delta used by the other row-count assertions in this test class.
    Assertions.assertEquals(10, result.getRowCount(), 0.1);
}

/**
 * Checks the row-count estimate for the predicate {@code NOT (a IS NULL)}.
 *
 * <p>Column {@code a} is registered with 10 nulls and the input statistics carry 100 rows;
 * the negated filter is expected to be estimated at the remaining 90 rows.
 */
@Test
public void testIsNotNull() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(100)
            .setAvgSizeByte(4)
            .setNumNulls(10)
            .setMaxValue(100)
            .setMinValue(0)
            .setCount(100);
    IsNull isNull = new IsNull(a);
    Not not = new Not(isNull);
    Statistics stats = new Statistics(100, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(not, stats);
    // JUnit's contract is (expected, actual, delta): keep the expected value first so a failure
    // message is labelled correctly, and compare the double row count with a tolerance, matching
    // the delta used by the other row-count assertions in this test class.
    Assertions.assertEquals(90, result.getRowCount(), 0.1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,30 @@ PhysicalResultSink
--------------------------PhysicalOlapScan[item]
----------------PhysicalDistribute
------------------PhysicalProject
--------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
----------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk]
------------------------PhysicalProject
--------------------------filter(ws_promo_sk IS NULL)
----------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 RF3
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[date_dim]
--------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 ws_item_sk->[i_item_sk]
----------------------PhysicalProject
------------------------PhysicalOlapScan[item] apply RFs: RF3
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------PhysicalOlapScan[item]
----------------PhysicalDistribute
------------------PhysicalProject
--------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF5 i_item_sk->[cs_item_sk]
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------filter(ws_promo_sk IS NULL)
--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[date_dim]
----------------PhysicalProject
------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF5 i_item_sk->[cs_item_sk]
--------------------PhysicalDistribute
----------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[cs_sold_date_sk]
------------------------PhysicalProject
--------------------------filter(cs_bill_customer_sk IS NULL)
----------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF4 RF5
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------filter(cs_bill_customer_sk IS NULL)
------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF4 RF5
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[date_dim]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------PhysicalOlapScan[item]
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------PhysicalOlapScan[item]

Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,41 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
--PhysicalResultSink
----PhysicalTopN[MERGE_SORT]
------PhysicalTopN[LOCAL_SORT]
--------PhysicalProject
----------hashAgg[DISTINCT_GLOBAL]
------------PhysicalDistribute
--------------hashAgg[DISTINCT_LOCAL]
----------------hashAgg[GLOBAL]
------------------hashAgg[LOCAL]
--------------------PhysicalProject
----------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = web_returns.wr_order_number)) otherCondition=()
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN] hashCondition=((web_returns.wr_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF5 wr_order_number->[ws_order_number];RF6 wr_order_number->[ws_order_number,ws_order_number]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_returns]
------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF4 ws_order_number->[ws_order_number];RF7 ws_order_number->[ws_order_number,ws_order_number]
--------hashAgg[DISTINCT_GLOBAL]
----------PhysicalDistribute
------------hashAgg[DISTINCT_LOCAL]
--------------hashAgg[GLOBAL]
----------------hashAgg[LOCAL]
------------------PhysicalProject
--------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = web_returns.wr_order_number)) otherCondition=()
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_returns.wr_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF5 wr_order_number->[ws_order_number];RF6 wr_order_number->[ws_order_number,ws_order_number]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
--------------------------PhysicalDistribute
----------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_web_site_sk = web_site.web_site_sk)) otherCondition=() build RFs:RF3 web_site_sk->[ws_web_site_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_ship_date_sk]
--------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF1 ca_address_sk->[ws_ship_addr_sk]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
----------------------------------PhysicalDistribute
------------------------------------PhysicalProject
--------------------------------------filter((customer_address.ca_state = 'VA'))
----------------------------------------PhysicalOlapScan[customer_address]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_returns]
----------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF4 ws_order_number->[ws_order_number];RF7 ws_order_number->[ws_order_number,ws_order_number]
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
------------------------PhysicalDistribute
--------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_web_site_sk = web_site.web_site_sk)) otherCondition=() build RFs:RF3 web_site_sk->[ws_web_site_sk]
----------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_ship_date_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF1 ca_address_sk->[ws_ship_addr_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------filter((date_dim.d_date <= '2001-05-31') and (date_dim.d_date >= '2001-04-01'))
--------------------------------------PhysicalOlapScan[date_dim]
------------------------------------filter((customer_address.ca_state = 'VA'))
--------------------------------------PhysicalOlapScan[customer_address]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------filter((web_site.web_company_name = 'pri'))
------------------------------------PhysicalOlapScan[web_site]
----------------------------------filter((date_dim.d_date <= '2001-05-31') and (date_dim.d_date >= '2001-04-01'))
------------------------------------PhysicalOlapScan[date_dim]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter((web_site.web_company_name = 'pri'))
----------------------------------PhysicalOlapScan[web_site]

Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,25 @@ PhysicalResultSink
--------------------PhysicalDistribute
----------------------hashAgg[LOCAL]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 ss_sold_date_sk->[d_date_sk]
--------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
----------------------------PhysicalProject
------------------------------filter((date_dim.d_month_seq <= 1210) and (date_dim.d_month_seq >= 1199))
--------------------------------PhysicalOlapScan[date_dim] apply RFs: RF1
------------------------------filter(( not ss_sold_date_sk IS NULL))
--------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter(( not ss_sold_date_sk IS NULL))
----------------------------------PhysicalOlapScan[store_sales]
--------------------------------filter((date_dim.d_month_seq <= 1210) and (date_dim.d_month_seq >= 1199))
----------------------------------PhysicalOlapScan[date_dim]
----------------PhysicalProject
------------------hashAgg[GLOBAL]
--------------------PhysicalDistribute
----------------------hashAgg[LOCAL]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 cs_sold_date_sk->[d_date_sk]
--------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
----------------------------PhysicalProject
------------------------------filter((date_dim.d_month_seq <= 1210) and (date_dim.d_month_seq >= 1199))
--------------------------------PhysicalOlapScan[date_dim] apply RFs: RF0
------------------------------filter(( not cs_sold_date_sk IS NULL))
--------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter(( not cs_sold_date_sk IS NULL))
----------------------------------PhysicalOlapScan[catalog_sales]
--------------------------------filter((date_dim.d_month_seq <= 1210) and (date_dim.d_month_seq >= 1199))
----------------------------------PhysicalOlapScan[date_dim]

Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,41 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
--PhysicalResultSink
----PhysicalTopN[MERGE_SORT]
------PhysicalTopN[LOCAL_SORT]
--------PhysicalProject
----------hashAgg[DISTINCT_GLOBAL]
------------PhysicalDistribute
--------------hashAgg[DISTINCT_LOCAL]
----------------hashAgg[GLOBAL]
------------------hashAgg[LOCAL]
--------------------PhysicalProject
----------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = web_returns.wr_order_number)) otherCondition=()
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN] hashCondition=((web_returns.wr_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF5 wr_order_number->[ws_order_number];RF6 wr_order_number->[ws_order_number,ws_order_number]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_returns]
------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF4 ws_order_number->[ws_order_number];RF7 ws_order_number->[ws_order_number,ws_order_number]
--------hashAgg[DISTINCT_GLOBAL]
----------PhysicalDistribute
------------hashAgg[DISTINCT_LOCAL]
--------------hashAgg[GLOBAL]
----------------hashAgg[LOCAL]
------------------PhysicalProject
--------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = web_returns.wr_order_number)) otherCondition=()
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_returns.wr_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF5 wr_order_number->[ws_order_number];RF6 wr_order_number->[ws_order_number,ws_order_number]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
--------------------------PhysicalDistribute
----------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_web_site_sk = web_site.web_site_sk)) otherCondition=() build RFs:RF3 web_site_sk->[ws_web_site_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[ws_ship_addr_sk]
--------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_ship_date_sk]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
----------------------------------PhysicalDistribute
------------------------------------PhysicalProject
--------------------------------------filter((date_dim.d_date <= '1999-04-02') and (date_dim.d_date >= '1999-02-01'))
----------------------------------------PhysicalOlapScan[date_dim]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_returns]
----------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((ws1.ws_order_number = ws_wh.ws_order_number)) otherCondition=() build RFs:RF4 ws_order_number->[ws_order_number];RF7 ws_order_number->[ws_order_number,ws_order_number]
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
------------------------PhysicalDistribute
--------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_web_site_sk = web_site.web_site_sk)) otherCondition=() build RFs:RF3 web_site_sk->[ws_web_site_sk]
----------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[ws_ship_addr_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((ws1.ws_ship_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_ship_date_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------filter((customer_address.ca_state = 'NC'))
--------------------------------------PhysicalOlapScan[customer_address]
------------------------------------filter((date_dim.d_date <= '1999-04-02') and (date_dim.d_date >= '1999-02-01'))
--------------------------------------PhysicalOlapScan[date_dim]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------filter((web_site.web_company_name = 'pri'))
------------------------------------PhysicalOlapScan[web_site]
----------------------------------filter((customer_address.ca_state = 'NC'))
------------------------------------PhysicalOlapScan[customer_address]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter((web_site.web_company_name = 'pri'))
----------------------------------PhysicalOlapScan[web_site]

Loading

0 comments on commit 576a2b3

Please sign in to comment.