Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: AggregateMonotonicity #14271

Merged
merged 34 commits into main from feature/monotonic-sets
Jan 31, 2025
Merged
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
a2919b6
add monotonic function definitions for aggregate expressions
mertak-synnada Jan 16, 2025
14109e6
fix benchmark results
mertak-synnada Jan 16, 2025
b3d75ba
set prefer_existing_sort to true in sqllogictests
mertak-synnada Jan 17, 2025
549502e
set prefer_existing_sort to true in sqllogictests
mertak-synnada Jan 17, 2025
623e0c5
fix typo
mertak-synnada Jan 17, 2025
6a9d24e
Merge branch 'refs/heads/apache_main' into feature/monotonic-sets
mertak-synnada Jan 20, 2025
53ee3de
re-add test_utils.rs changes to the new file
mertak-synnada Jan 20, 2025
97d8951
clone input with Arc
mertak-synnada Jan 20, 2025
cc33031
Merge branch 'refs/heads/apache_main' into feature/monotonic-sets
mertak-synnada Jan 22, 2025
41d9430
Merge branch 'refs/heads/apache_main' into feature/monotonic-sets
mertak-synnada Jan 23, 2025
e988dcf
inject aggr expr indices
mertak-synnada Jan 23, 2025
906245e
remove redundant file
mertak-synnada Jan 23, 2025
475fe2d
add Sum monotonicity
mertak-synnada Jan 24, 2025
57e000e
fix sql logic tests
mertak-synnada Jan 24, 2025
ca57f46
fix sql logic tests
mertak-synnada Jan 24, 2025
6cf9644
Merge branch 'refs/heads/apache_main' into feature/monotonic-sets
mertak-synnada Jan 24, 2025
072e6ef
update docs
mertak-synnada Jan 24, 2025
7d62cb0
Merge branch 'apache_main' into feature/monotonic-sets
berkaysynnada Jan 28, 2025
491aabe
review part 1
berkaysynnada Jan 28, 2025
972c56f
fix the tests
berkaysynnada Jan 29, 2025
4b946b3
revert slt's
berkaysynnada Jan 29, 2025
481b5b4
simplify terms
berkaysynnada Jan 29, 2025
29af731
Update mod.rs
berkaysynnada Jan 29, 2025
1f02953
remove unnecessary computations
berkaysynnada Jan 29, 2025
79dd942
remove index calc
berkaysynnada Jan 29, 2025
247d5fe
Update mod.rs
berkaysynnada Jan 29, 2025
16bdac4
Apply suggestions from code review
ozankabak Jan 29, 2025
1875336
add slt
berkaysynnada Jan 29, 2025
ba7b94f
remove aggregate changes, tests already give expected results
berkaysynnada Jan 30, 2025
2152b7f
fix clippy
berkaysynnada Jan 30, 2025
7822613
remove one row sorts
berkaysynnada Jan 30, 2025
5e9b2db
Improve comments
ozankabak Jan 30, 2025
54d62d6
Use a short name for set monotonicity
ozankabak Jan 30, 2025
1146811
Merge branch 'main' into feature/monotonic-sets
ozankabak Jan 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
remove index calc
berkaysynnada committed Jan 29, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 79dd9422e5163e78c7e3e1941c6660f1213a2421
31 changes: 11 additions & 20 deletions datafusion/physical-plan/src/aggregates/mod.rs
Original file line number Diff line number Diff line change
@@ -375,7 +375,6 @@ pub struct AggregateExec {
/// Describes how the input is ordered relative to the group by columns
input_order_mode: InputOrderMode,
cache: PlanProperties,
aggr_expr_indices: Vec<usize>,
}

impl AggregateExec {
@@ -400,7 +399,6 @@ impl AggregateExec {
input: Arc::clone(&self.input),
schema: Arc::clone(&self.schema),
input_schema: Arc::clone(&self.input_schema),
aggr_expr_indices: self.aggr_expr_indices.clone(),
}
}

@@ -417,8 +415,7 @@ impl AggregateExec {
input: Arc<dyn ExecutionPlan>,
input_schema: SchemaRef,
) -> Result<Self> {
let (schema, aggr_exprs_indices) =
create_schema(&input.schema(), &group_by, &aggr_expr, mode)?;
let schema = create_schema(&input.schema(), &group_by, &aggr_expr, mode)?;

let schema = Arc::new(schema);
AggregateExec::try_new_with_schema(
@@ -429,7 +426,6 @@ impl AggregateExec {
input,
input_schema,
schema,
aggr_exprs_indices,
)
}

@@ -450,7 +446,6 @@ impl AggregateExec {
input: Arc<dyn ExecutionPlan>,
input_schema: SchemaRef,
schema: SchemaRef,
aggr_expr_indices: Vec<usize>,
) -> Result<Self> {
// Make sure arguments are consistent in size
if aggr_expr.len() != filter_expr.len() {
@@ -518,7 +513,6 @@ impl AggregateExec {
&mode,
&input_order_mode,
aggr_expr.clone(),
aggr_expr_indices.clone(),
);

Ok(AggregateExec {
@@ -534,7 +528,6 @@ impl AggregateExec {
limit: None,
input_order_mode,
cache,
aggr_expr_indices,
})
}

@@ -657,13 +650,15 @@ impl AggregateExec {
mode: &AggregateMode,
input_order_mode: &InputOrderMode,
aggr_exprs: Vec<Arc<AggregateFunctionExpr>>,
ozankabak marked this conversation as resolved.
Show resolved Hide resolved
aggr_expr_indices: Vec<usize>,
) -> PlanProperties {
// Construct equivalence properties:
let mut eq_properties = input
.equivalence_properties()
.project(group_expr_mapping, schema);
.project(group_expr_mapping, Arc::clone(&schema));

let schema_len = schema.fields.len();
let aggr_expr_indices =
((schema_len - aggr_exprs.len())..schema_len).collect::<Vec<_>>();
// if the aggregate function is set monotonic, add it into equivalence properties
for (i, aggr_expr) in aggr_exprs.iter().enumerate() {
let aggr_expr_index = aggr_expr_indices[i];
@@ -870,7 +865,6 @@ impl ExecutionPlan for AggregateExec {
Arc::clone(&children[0]),
Arc::clone(&self.input_schema),
Arc::clone(&self.schema),
self.aggr_expr_indices.clone(),
)?;
me.limit = self.limit;

@@ -947,8 +941,7 @@ fn create_schema(
group_by: &PhysicalGroupBy,
aggr_expr: &[Arc<AggregateFunctionExpr>],
mode: AggregateMode,
) -> Result<(Schema, Vec<usize>)> {
let mut aggr_exprs_indices = vec![];
) -> Result<Schema> {
let mut fields = Vec::with_capacity(group_by.num_output_exprs() + aggr_expr.len());
fields.extend(group_by.output_fields(input_schema)?);

@@ -957,7 +950,6 @@ fn create_schema(
// in partial mode, the fields of the accumulator's state
for expr in aggr_expr {
fields.extend(expr.state_fields()?.iter().cloned());
aggr_exprs_indices.push(fields.len() - 1);
}
}
AggregateMode::Final
@@ -966,15 +958,14 @@ fn create_schema(
| AggregateMode::SinglePartitioned => {
// in final mode, the field with the final result of the accumulator
for expr in aggr_expr {
fields.push(expr.field());
aggr_exprs_indices.push(fields.len() - 1);
fields.extend(expr.state_fields()?.iter().cloned())
}
}
}

Ok((
Schema::new_with_metadata(fields, input_schema.metadata().clone()),
aggr_exprs_indices,
Ok(Schema::new_with_metadata(
fields,
input_schema.metadata().clone(),
))
}

@@ -2810,7 +2801,7 @@ mod tests {
vec![false, false], // (a,b)
],
);
let (aggr_schema, _) = create_schema(
let aggr_schema = create_schema(
&input_schema,
&grouping_set,
&aggr_expr,
2 changes: 1 addition & 1 deletion datafusion/physical-plan/src/aggregates/row_hash.rs
Original file line number Diff line number Diff line change
@@ -506,7 +506,7 @@ impl GroupedHashAggregateStream {
// Therefore, when we spill these intermediate states or pass them to another
// aggregation operator, we must use a schema that includes both the group
// columns **and** the partial-state columns.
let (partial_agg_schema, _) = create_schema(
let partial_agg_schema = create_schema(
&agg.input().schema(),
&agg_group_by,
&aggregate_exprs,
6 changes: 2 additions & 4 deletions datafusion/physical-plan/src/windows/mod.rs
Original file line number Diff line number Diff line change
@@ -344,10 +344,8 @@ pub(crate) fn window_equivalence_properties(
.extend(input.equivalence_properties().clone());

let schema_len = schema.fields.len();
let window_expr_indices = (schema_len..(schema_len - window_exprs.len()))
.rev()
.collect::<Vec<_>>();

let window_expr_indices =
((schema_len - window_exprs.len())..schema_len).collect::<Vec<_>>();
for (i, expr) in window_exprs.iter().enumerate() {
if let Some(udf_window_expr) = expr.as_any().downcast_ref::<StandardWindowExpr>()
{
5 changes: 0 additions & 5 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
@@ -4963,9 +4963,6 @@ false
true
NULL

statement ok
set datafusion.optimizer.prefer_existing_sort = true;

#
# Add valid distinct case as aggregation plan test
#
@@ -4992,8 +4989,6 @@ physical_plan
11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
12)----------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], has_header=true

statement ok
set datafusion.optimizer.prefer_existing_sort = false;

#
# Push limit into distinct group-by aggregation tests
8 changes: 1 addition & 7 deletions datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part
Original file line number Diff line number Diff line change
@@ -16,9 +16,6 @@
# specific language governing permissions and limitations
# under the License.

statement ok
set datafusion.optimizer.prefer_existing_sort = true;

query TT
explain select
c_count,
@@ -75,7 +72,4 @@ physical_plan
17)------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
18)--------------------------CoalesceBatchesExec: target_batch_size=8192
19)----------------------------FilterExec: o_comment@2 NOT LIKE %special%requests%, projection=[o_orderkey@0, o_custkey@1]
20)------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_comment], has_header=false

statement ok
set datafusion.optimizer.prefer_existing_sort = false;
20)------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_comment], has_header=false