Feature: AggregateMonotonicity #14271

Open

wants to merge 33 commits into main

Changes from 28 of 33 commits
a2919b6 add monotonic function definitions for aggregate expressions (mertak-synnada, Jan 16, 2025)
14109e6 fix benchmark results (mertak-synnada, Jan 16, 2025)
b3d75ba set prefer_existing_sort to true in sqllogictests (mertak-synnada, Jan 17, 2025)
549502e set prefer_existing_sort to true in sqllogictests (mertak-synnada, Jan 17, 2025)
623e0c5 fix typo (mertak-synnada, Jan 17, 2025)
6a9d24e Merge branch 'refs/heads/apache_main' into feature/monotonic-sets (mertak-synnada, Jan 20, 2025)
53ee3de re-add test_utils.rs changes to the new file (mertak-synnada, Jan 20, 2025)
97d8951 clone input with Arc (mertak-synnada, Jan 20, 2025)
cc33031 Merge branch 'refs/heads/apache_main' into feature/monotonic-sets (mertak-synnada, Jan 22, 2025)
41d9430 Merge branch 'refs/heads/apache_main' into feature/monotonic-sets (mertak-synnada, Jan 23, 2025)
e988dcf inject aggr expr indices (mertak-synnada, Jan 23, 2025)
906245e remove redundant file (mertak-synnada, Jan 23, 2025)
475fe2d add Sum monotonicity (mertak-synnada, Jan 24, 2025)
57e000e fix sql logic tests (mertak-synnada, Jan 24, 2025)
ca57f46 fix sql logic tests (mertak-synnada, Jan 24, 2025)
6cf9644 Merge branch 'refs/heads/apache_main' into feature/monotonic-sets (mertak-synnada, Jan 24, 2025)
072e6ef update docs (mertak-synnada, Jan 24, 2025)
7d62cb0 Merge branch 'apache_main' into feature/monotonic-sets (berkaysynnada, Jan 28, 2025)
491aabe review part 1 (berkaysynnada, Jan 28, 2025)
972c56f fix the tests (berkaysynnada, Jan 29, 2025)
4b946b3 revert slt's (berkaysynnada, Jan 29, 2025)
481b5b4 simplify terms (berkaysynnada, Jan 29, 2025)
29af731 Update mod.rs (berkaysynnada, Jan 29, 2025)
1f02953 remove unnecessary computations (berkaysynnada, Jan 29, 2025)
79dd942 remove index calc (berkaysynnada, Jan 29, 2025)
247d5fe Update mod.rs (berkaysynnada, Jan 29, 2025)
16bdac4 Apply suggestions from code review (ozankabak, Jan 29, 2025)
1875336 add slt (berkaysynnada, Jan 29, 2025)
ba7b94f remove aggregate changes, tests already give expected results (berkaysynnada, Jan 30, 2025)
2152b7f fix clippy (berkaysynnada, Jan 30, 2025)
7822613 remove one row sorts (berkaysynnada, Jan 30, 2025)
5e9b2db Improve comments (ozankabak, Jan 30, 2025)
54d62d6 Use a short name for set monotonicity (ozankabak, Jan 30, 2025)
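
Taken together, the commits add a notion of "set monotonicity" for aggregate expressions and teach the sort-enforcement pass to exploit it. As a rough orientation for the diff below, here is a minimal sketch of the idea. The names (`SetMonotonicity`, `output_sort_is_redundant`) are assumptions inferred from the commit messages and test names, not the PR's actual API:

```rust
/// Sketch of the "set monotonicity" property the commits describe
/// ("add monotonic function definitions for aggregate expressions",
/// "Use a short name for set monotonicity"). Hypothetical names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SetMonotonicity {
    /// The aggregate's value never decreases as rows are added to its
    /// input set, e.g. COUNT or MAX.
    Increasing,
    /// The aggregate's value never increases as rows are added, e.g. MIN.
    Decreasing,
    /// No guarantee either way, e.g. AVG, so sorts on the output must stay.
    NotMonotonic,
}

/// For a window aggregate over an ever-growing frame (UNBOUNDED PRECEDING
/// to CURRENT ROW), the output column is ordered along the input, so a
/// sort on it is redundant when its direction matches the monotonicity.
fn output_sort_is_redundant(m: SetMonotonicity, descending: bool) -> bool {
    matches!(
        (m, descending),
        (SetMonotonicity::Increasing, false) | (SetMonotonicity::Decreasing, true)
    )
}

fn main() {
    // Mirrors `test_bounded_window_set_monotonic_no_partition`: an ascending
    // sort on a running COUNT is droppable...
    assert!(output_sort_is_redundant(SetMonotonicity::Increasing, false));
    // ...while `test_bounded_window_non_set_monotonic_sort` keeps the sort
    // on AVG, which gives no ordering guarantee.
    assert!(!output_sort_is_redundant(SetMonotonicity::NotMonotonic, false));
}
```

The tests added to enforce_sorting.rs below exercise exactly these cases, for both AggregateExec and BoundedWindowAggExec.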
345 changes: 340 additions & 5 deletions datafusion/core/tests/physical_optimizer/enforce_sorting.rs
@@ -18,12 +18,15 @@
 use std::sync::Arc;

 use crate::physical_optimizer::test_utils::{
-    aggregate_exec, bounded_window_exec, check_integrity, coalesce_batches_exec,
+    aggregate_exec, aggregate_exec_non_set_monotonic, aggregate_exec_set_monotonic,
+    bounded_window_exec, bounded_window_exec_non_set_monotonic,
+    bounded_window_exec_with_partition, check_integrity, coalesce_batches_exec,
     coalesce_partitions_exec, create_test_schema, create_test_schema2,
-    create_test_schema3, filter_exec, global_limit_exec, hash_join_exec, limit_exec,
-    local_limit_exec, memory_exec, parquet_exec, repartition_exec, sort_exec, sort_expr,
-    sort_expr_options, sort_merge_join_exec, sort_preserving_merge_exec,
-    spr_repartition_exec, stream_exec_ordered, union_exec, RequirementsTestExec,
+    create_test_schema3, create_test_schema4, filter_exec, global_limit_exec,
+    hash_join_exec, limit_exec, local_limit_exec, memory_exec, parquet_exec,
+    repartition_exec, sort_exec, sort_expr, sort_expr_options, sort_merge_join_exec,
+    sort_preserving_merge_exec, spr_repartition_exec, stream_exec_ordered, union_exec,
+    RequirementsTestExec,
 };

 use datafusion_physical_plan::displayable;
@@ -238,6 +241,338 @@ async fn test_remove_unnecessary_sort5() -> Result<()> {
Ok(())
}

#[test]
fn test_aggregate_set_monotonic_no_group() -> Result<()> {
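// Without a group-by, the aggregate emits a single row, so both the sort on
// `count` above it and the sort on `a` below it are removed.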
let schema = create_test_schema4()?;

let source = memory_exec(&schema);

let sort_exprs = vec![sort_expr("a", &schema)];
let sort = sort_exec(sort_exprs.clone(), source);

let aggregate = aggregate_exec_set_monotonic(sort, vec![]);

let sort_exprs = LexOrdering::new(vec![sort_expr("count", &aggregate.schema())]);
let physical_plan: Arc<dyn ExecutionPlan> =
Arc::new(SortExec::new(sort_exprs.clone(), aggregate)) as _;

let expected_input = [
"SortExec: expr=[count@0 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[], aggr=[count]",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];

let expected_optimized = [
"AggregateExec: mode=Single, gby=[], aggr=[count]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[test]
fn test_aggregate_set_monotonic_with_group() -> Result<()> {
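// With a group-by, ordering on `count` alone is not guaranteed across groups,
// so the plan is expected to stay unchanged.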
let schema = create_test_schema4()?;

let source = memory_exec(&schema);

let sort_exprs = vec![sort_expr("a", &schema)];
let sort = sort_exec(sort_exprs.clone(), source);

let aggregate =
aggregate_exec_set_monotonic(sort, vec![(col("a", &schema)?, "a".to_string())]);

let sort_exprs = LexOrdering::new(vec![sort_expr("count", &aggregate.schema())]);
let physical_plan: Arc<dyn ExecutionPlan> =
Arc::new(SortExec::new(sort_exprs.clone(), aggregate)) as _;

let expected_input = [
"SortExec: expr=[count@1 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];

let expected_optimized = [
"SortExec: expr=[count@1 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[test]
fn test_aggregate_set_monotonic_with_group_partial() -> Result<()> {
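// The sort on [a, count] is already satisfied by the aggregate's output
// (sorted by the group key, one row per group), so the top sort is removed.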
let schema = create_test_schema4()?;

let source = memory_exec(&schema);

let sort_exprs = vec![sort_expr("a", &schema)];
let sort = sort_exec(sort_exprs.clone(), source);

let aggregate =
aggregate_exec_set_monotonic(sort, vec![(col("a", &schema)?, "a".to_string())]);

let sort_exprs = LexOrdering::new(vec![
sort_expr("a", &schema),
sort_expr("count", &aggregate.schema()),
]);
let physical_plan: Arc<dyn ExecutionPlan> =
Arc::new(SortExec::new(sort_exprs.clone(), aggregate)) as _;

let expected_input = [
"SortExec: expr=[a@0 ASC, count@1 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];

let expected_optimized = [
"AggregateExec: mode=Single, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[test]
fn test_aggregate_non_set_monotonic() -> Result<()> {
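// AVG is not set-monotonic, so the sort on its output is kept; only the
// unnecessary sort below the aggregate is removed.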
let schema = create_test_schema4()?;
let source = memory_exec(&schema);
let sort_exprs = vec![sort_expr("a", &schema)];
let sort = sort_exec(sort_exprs.clone(), source);

let aggregate = aggregate_exec_non_set_monotonic(sort);
let sort_exprs = LexOrdering::new(vec![sort_expr("avg", &aggregate.schema())]);
let physical_plan: Arc<dyn ExecutionPlan> =
Arc::new(SortExec::new(sort_exprs.clone(), aggregate)) as _;

let expected_input = [
"SortExec: expr=[avg@0 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[], aggr=[avg]",
" SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];

let expected_optimized = [
"SortExec: expr=[avg@0 ASC], preserve_partitioning=[false]",
" AggregateExec: mode=Single, gby=[], aggr=[avg]",
" MemoryExec: partitions=1, partition_sizes=[0]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[tokio::test]
async fn test_bounded_window_set_monotonic_no_partition() -> Result<()> {
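// A running COUNT over a growing frame is non-decreasing, which satisfies
// the top sort; per the expected plan, both sorts are removed.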
let schema = create_test_schema()?;

let source = parquet_exec_sorted(&schema, vec![]);

let sort_exprs = vec![sort_expr_options(
"nullable_col",
&schema,
SortOptions {
descending: true,
nulls_first: false,
},
)];
let sort = sort_exec(sort_exprs.clone(), source);

let bounded_window = bounded_window_exec("nullable_col", vec![], sort);

let output_schema = bounded_window.schema();
let sort_exprs2 = vec![sort_expr_options(
"count",
&output_schema,
SortOptions {
descending: false,
nulls_first: false,
},
)];
let physical_plan = sort_exec(sort_exprs2.clone(), bounded_window);

let expected_input = [
"SortExec: expr=[count@2 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
let expected_optimized = [
"BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[tokio::test]
async fn test_bounded_plain_window_set_monotonic_with_partitions() -> Result<()> {
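// With PARTITION BY, `count` is only monotonic within each partition, so the
// sort on it stays; only the input sort flips from DESC to ASC in the
// expected plan.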
let schema = create_test_schema()?;

let source = parquet_exec_sorted(&schema, vec![]);

let sort_exprs = vec![sort_expr_options(
"nullable_col",
&schema,
SortOptions {
descending: true,
nulls_first: false,
},
)];
let sort = sort_exec(sort_exprs.clone(), source);

let partition_bys = &[col("nullable_col", &schema)?];
let bounded_window = bounded_window_exec_with_partition(
"non_nullable_col",
vec![],
partition_bys,
sort,
false,
);

let output_schema = bounded_window.schema();
let sort_exprs2 = vec![sort_expr_options(
"count",
&output_schema,
SortOptions {
descending: false,
nulls_first: false,
},
)];
let physical_plan = sort_exec(sort_exprs2.clone(), bounded_window);

let expected_input = [
"SortExec: expr=[count@2 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
let expected_optimized = [
"SortExec: expr=[count@2 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[tokio::test]
async fn test_bounded_plain_window_set_monotonic_with_partitions_partial() -> Result<()> {
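// The combined sort on [nullable_col DESC, count ASC] is not eliminated, but
// in the expected plan it swaps below the window operator.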
let schema = create_test_schema()?;

let source = parquet_exec_sorted(&schema, vec![]);

let sort_exprs = vec![sort_expr_options(
"nullable_col",
&schema,
SortOptions {
descending: true,
nulls_first: false,
},
)];
let sort = sort_exec(sort_exprs.clone(), source);

let partition_bys = &[col("nullable_col", &schema)?];
let bounded_window = bounded_window_exec_with_partition(
"non_nullable_col",
vec![],
partition_bys,
sort,
false,
);

let output_schema = bounded_window.schema();
let sort_exprs2 = vec![
sort_expr_options(
"nullable_col",
&output_schema,
SortOptions {
descending: true,
nulls_first: false,
},
),
sort_expr_options(
"count",
&output_schema,
SortOptions {
descending: false,
nulls_first: false,
},
),
];
let physical_plan = sort_exec(sort_exprs2.clone(), bounded_window);

let expected_input = [
"SortExec: expr=[nullable_col@0 DESC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
let expected_optimized = [
"BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[nullable_col@0 DESC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[tokio::test]
async fn test_bounded_window_non_set_monotonic_sort() -> Result<()> {
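// AVG over a growing frame gives no ordering guarantee, so the sort on `avg`
// stays; the sort below the window is removed because the source is already
// ordered by `a` DESC.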
let schema = create_test_schema4()?;
let sort_exprs = vec![sort_expr_options(
"a",
&schema,
SortOptions {
descending: true,
nulls_first: false,
},
)];
let source = parquet_exec_sorted(&schema, sort_exprs.clone());
let sort = sort_exec(sort_exprs.clone(), source);

let bounded_window =
bounded_window_exec_non_set_monotonic("a", sort_exprs.clone(), sort);
let output_schema = bounded_window.schema();
let sort_exprs2 = vec![sort_expr_options(
"avg",
&output_schema,
SortOptions {
descending: false,
nulls_first: false,
},
)];
let physical_plan = sort_exec(sort_exprs2.clone(), bounded_window);

let expected_input = [
"SortExec: expr=[avg@5 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST]",
];
let expected_optimized = [
"SortExec: expr=[avg@5 ASC NULLS LAST], preserve_partitioning=[false]",
" BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
" ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST]",
];
assert_optimized!(expected_input, expected_optimized, physical_plan, true);

Ok(())
}

#[tokio::test]
async fn test_do_not_remove_sort_with_limit() -> Result<()> {
let schema = create_test_schema()?;