From 82be1bdf331c2f8a62eb934a924cd4d5d8a00270 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 30 Dec 2024 21:37:56 +0800 Subject: [PATCH 01/20] refactor all --- be/src/pipeline/dependency.h | 34 +- .../pipeline/exec/analytic_sink_operator.cpp | 642 +++++++++++++++--- be/src/pipeline/exec/analytic_sink_operator.h | 130 +++- .../exec/analytic_source_operator.cpp | 573 +--------------- .../pipeline/exec/analytic_source_operator.h | 99 +-- 5 files changed, 713 insertions(+), 765 deletions(-) diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 1b811219c322e9..6989389535110c 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -564,26 +564,36 @@ struct BlockRowPos { } }; +struct BoundaryPose { + BlockRowPos start; + BlockRowPos end; + bool is_ended = false; +}; + struct AnalyticSharedState : public BasicSharedState { ENABLE_FACTORY_CREATOR(AnalyticSharedState) public: AnalyticSharedState() = default; - int64_t current_row_position = 0; - BlockRowPos partition_by_end; - int64_t input_total_rows = 0; - BlockRowPos all_block_end; - std::vector input_blocks; - bool input_eos = false; - BlockRowPos found_partition_end; - std::vector origin_cols; - std::vector input_block_first_row_positions; - std::vector> agg_input_columns; + // int64_t current_row_position = 0; + // BlockRowPos partition_by_end; + // int64_t input_total_rows = 0; + // BlockRowPos all_block_end; + // std::vector input_blocks; + // bool input_eos = false; + // BlockRowPos found_partition_end; + // std::vector origin_cols; + // std::vector input_block_first_row_positions; + // std::vector> agg_input_columns; + std::queue blocks_buffer; + std::mutex buffer_mutex; + bool sink_eos = false; + std::mutex sink_eos_lock; // TODO: maybe global? - std::vector partition_by_column_idxs; - std::vector ordey_by_column_idxs; + // std::vector partition_by_column_idxs; + // std::vector order_by_column_idxs; }; struct JoinSharedState : public BasicSharedState { diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 7cc25eef9446d6..16453c594c808b 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -18,6 +18,8 @@ #include "analytic_sink_operator.h" +#include + #include #include "pipeline/exec/operator.h" @@ -34,6 +36,58 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); + _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); + _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); + _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); + _agg_arena_pool = std::make_unique(); + auto& p = _parent->cast(); + if (!p._has_window) { //haven't set window, Unbounded: [unbounded preceding,unbounded following] + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; + } else if (p._has_range_window) { + // RANGE windows must have UNBOUNDED PRECEDING + // RANGE window end bound must be CURRENT ROW or UNBOUNDED FOLLOWING + if (!p._has_window_end) { //haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; + + } else { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_range; + } + } else { + //haven't set start and end, same as PARTITION + if (!p._has_window_start && !p._has_window_end) { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; + } else if (!p._has_window_start) { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_rows; + } else { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_sliding_rows; + } + + if (p._has_window_start) { //calculate start boundary + TAnalyticWindowBoundary b = p._window.window_start; + if (b.__isset.rows_offset_value) { //[offset , ] + _rows_start_offset = b.rows_offset_value; + if (b.type == TAnalyticWindowBoundaryType::PRECEDING) { + _rows_start_offset *= -1; //preceding--> negative + } //current_row 0 + } else { //following positive + DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW); //[current row, ] + _rows_start_offset = 0; + } + } + + if (p._has_window_end) { //calculate end boundary + TAnalyticWindowBoundary b = p._window.window_end; + if (b.__isset.rows_offset_value) { //[ , offset] + _rows_end_offset = b.rows_offset_value; + if (b.type == TAnalyticWindowBoundaryType::PRECEDING) { + _rows_end_offset *= -1; + } + } else { + DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW); //[ ,current row] + _rows_end_offset = 0; + } + } + } return Status::OK(); } @@ -42,70 +96,320 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); - _shared_state->partition_by_column_idxs.resize(p._partition_by_eq_expr_ctxs.size()); - _shared_state->ordey_by_column_idxs.resize(p._order_by_eq_expr_ctxs.size()); - - size_t agg_size = p._agg_expr_ctxs.size(); - _agg_expr_ctxs.resize(agg_size); - _shared_state->agg_input_columns.resize(agg_size); - for (int i = 0; i < agg_size; ++i) { - _shared_state->agg_input_columns[i].resize(p._num_agg_input[i]); + + _agg_functions_size = p._agg_functions_size; + _agg_expr_ctxs.resize(_agg_functions_size); + _agg_functions.resize(_agg_functions_size); + _agg_input_columns.resize(_agg_functions_size); + + for (int i = 0; i < _agg_functions_size; ++i) { + _agg_functions[i] = p._agg_functions[i]->clone(state, state->obj_pool()); + _agg_input_columns[i].resize(p._num_agg_input[i]); _agg_expr_ctxs[i].resize(p._agg_expr_ctxs[i].size()); for (int j = 0; j < p._agg_expr_ctxs[i].size(); ++j) { RETURN_IF_ERROR(p._agg_expr_ctxs[i][j]->clone(state, _agg_expr_ctxs[i][j])); - } - - for (size_t j = 0; j < _agg_expr_ctxs[i].size(); ++j) { - _shared_state->agg_input_columns[i][j] = - _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); + _agg_input_columns[i][j] = _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } } _partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); + _partition_by_column_idxs.resize(p._partition_by_eq_expr_ctxs.size()); for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); i++) { RETURN_IF_ERROR( p._partition_by_eq_expr_ctxs[i]->clone(state, _partition_by_eq_expr_ctxs[i])); } _order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); + _order_by_column_idxs.resize(p._order_by_eq_expr_ctxs.size()); for (size_t i = 0; i < _order_by_eq_expr_ctxs.size(); i++) { RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); } + _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, + p._align_aggregate_states); + _create_agg_status(); + return Status::OK(); +} + +Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { + do { + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + auto remain_size = _current_row_position - current_block_base_pos - batch_size; + + _init_result_columns(); + _get_partition_by_end(); + while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { + // return {_current_row_position + _rows_start_offset, _current_row_position + _rows_end_offset + 1}; + const bool is_n_following_frame = _rows_end_offset > 0; + auto range_start = _current_row_position + _rows_start_offset; + auto range_end = _current_row_position + _rows_end_offset + 1; + // For window clause like `ROWS BETWEEN N PRECEDING AND M FOLLOWING`, + // if the current chunk has not reach the partition boundary, it may need more data. + if (is_n_following_frame && !_partition_by_pose.is_ended && + range_end > _partition_by_pose.end.pos) { + return Status::OK(); + } + _reset_agg_status(); + _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, + range_start, range_end); + _insert_result_info(1); + _current_row_position++; + remain_size--; + } + if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { + _reset_state_for_next_partition(); + } + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + } while (_has_input_data()); + return Status::OK(); +} + +Status AnalyticSinkLocalState::_get_next_for_rows() { + do { + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + auto remain_size = _current_row_position - current_block_base_pos - batch_size; + + _init_result_columns(); + _get_partition_by_end(); + while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { + // return {_partition.start, _current_row_position + _rows_end_offset + 1}; + const bool is_n_following_frame = _rows_end_offset > 0; + auto current_row_end = _current_row_position + _rows_end_offset + 1; + // if the current chunk has not reach the partition boundary, it may need more data. + if (is_n_following_frame && !_partition_by_pose.is_ended && + current_row_end > _partition_by_pose.end.pos) { + return Status::OK(); + } + + if (is_n_following_frame && _current_row_position == _partition_by_pose.start.pos) { + _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, + _partition_by_pose.start.pos, current_row_end - 1); + } + _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, + current_row_end - 1, current_row_end); + _insert_result_info(1); + _current_row_position++; + remain_size--; + } + if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { + _reset_state_for_next_partition(); + } + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + } while (_has_input_data()); + return Status::OK(); +} + +Status AnalyticSinkLocalState::_get_next_for_partition() { + while (_has_input_data()) { + { + SCOPED_TIMER(_evaluation_timer); + _get_partition_by_end(); + if (!_partition_by_pose.is_ended) { + break; + } + _init_result_columns(); + if (_current_row_position == _partition_by_pose.start.pos) { + _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, + _partition_by_pose.start.pos, _partition_by_pose.end.pos); + } + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + + // the end pos maybe after multis blocks, but should output by batch size and should not exceed partition end + auto window_end_pos = _current_row_position + batch_size; + window_end_pos = std::min(window_end_pos, _partition_by_pose.end.pos); + + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = window_end_pos - current_block_base_pos; + // should not exceed block batch size + current_window_frame_width = std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; + + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + if (_current_row_position == _partition_by_pose.end.pos) { + _reset_state_for_next_partition(); + } + } + } + return Status::OK(); +} + +Status AnalyticSinkLocalState::_get_next_for_range() { + bool has_finish_current_partition = true; + while (_has_input_data()) { + if (has_finish_current_partition) { + _get_partition_by_end(); + } + _update_order_by_range(); + if (!_order_by_pose.is_ended) { + break; + } + + // maybe need break the loop + if (_current_row_position < _order_by_pose.end.pos) { + // real frame is [partition_start, order_by_end] + // but the real deal with frame is [order_by_start, order_by_end] + _execute_for_win_func(_order_by_pose.start.pos, _order_by_pose.end.pos, + _order_by_pose.start.pos, _order_by_pose.end.pos); + } + + while (_current_row_position < _order_by_pose.end.pos) { + _init_result_columns(); + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = _order_by_pose.end.pos - current_block_base_pos; + current_window_frame_width = std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; + + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + } + + if (_partition_by_pose.is_ended && _current_row_position == _order_by_pose.end.pos) { + has_finish_current_partition = true; + _reset_state_for_next_partition(); + } else { + has_finish_current_partition = false; + } + } + return Status::OK(); +} + +void AnalyticSinkLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, + int64_t frame_start, int64_t frame_end) { + SCOPED_TIMER(_execute_timer); + for (size_t i = 0; i < _agg_functions_size; ++i) { + std::vector agg_columns; + for (int j = 0; j < _agg_input_columns[i].size(); ++j) { + agg_columns.push_back(_agg_input_columns[i][j].get()); + } + _agg_functions[i]->function()->add_range_single_place( + partition_start, partition_end, frame_start, frame_end, + _fn_place_ptr + + _parent->cast()._offsets_of_aggregate_states[i], + agg_columns.data(), _agg_arena_pool.get()); + + // If the end is not greater than the start, the current window should be empty. + _current_window_empty = + std::min(frame_end, partition_end) <= std::max(frame_start, partition_start); + } +} + +void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { + SCOPED_TIMER(_get_result_timer); + const auto& offsets_of_aggregate_states = + _parent->cast()._offsets_of_aggregate_states; + for (size_t i = 0; i < _agg_functions_size; ++i) { + for (size_t j = 0; j < real_deal_with_width; ++j) { + if (!_agg_functions[i]->function()->get_return_type()->is_nullable() && + _result_window_columns[i]->is_nullable()) { + if (_current_window_empty) { + _result_window_columns[i]->insert_default(); + } else { + auto* dst = assert_cast( + _result_window_columns[i].get()); + dst->get_null_map_data().push_back(0); + _agg_functions[i]->insert_result_info( + _fn_place_ptr + offsets_of_aggregate_states[i], + &dst->get_nested_column()); + } + continue; + } + _agg_functions[i]->insert_result_info(_fn_place_ptr + offsets_of_aggregate_states[i], + _result_window_columns[i].get()); + } + } +} + +Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { + block->swap(std::move(_input_blocks[_output_block_index])); + // _blocks_memory_usage->add(-block->allocated_bytes()); + if (_input_col_ids.size() < block->columns()) { + block->erase_not_in(_input_col_ids); + } + + DCHECK(_parent->cast()._change_to_nullable_flags.size() == + _result_window_columns.size()); + for (size_t i = 0; i < _result_window_columns.size(); ++i) { + if (_parent->cast()._change_to_nullable_flags[i]) { + block->insert({make_nullable(std::move(_result_window_columns[i])), + make_nullable(_agg_functions[i]->data_type()), ""}); + } else { + block->insert( + {std::move(_result_window_columns[i]), _agg_functions[i]->data_type(), ""}); + } + } + + _output_block_index++; return Status::OK(); } -bool AnalyticSinkLocalState::_whether_need_next_partition(BlockRowPos& found_partition_end) { - auto& shared_state = *_shared_state; - if (shared_state.input_eos || - (shared_state.current_row_position < - shared_state.partition_by_end.pos)) { //now still have partition data - return false; +void AnalyticSinkLocalState::_init_result_columns() { + if (_current_row_position - _input_block_first_row_positions[_output_block_index] == 0) { + _result_window_columns.resize(_agg_functions_size); + for (size_t i = 0; i < _agg_functions_size; ++i) { + _result_window_columns[i] = + _agg_functions[i]->data_type()->create_column(); //return type + } } - if ((_partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || - (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS - return true; +} + +void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Block* block) { + size_t buffer_size = 0; + { + std::unique_lock lc(_shared_state->buffer_mutex); + _shared_state->blocks_buffer.push(std::move(*block)); + buffer_size = _shared_state->blocks_buffer.size(); } - if (!_partition_by_eq_expr_ctxs.empty() && - found_partition_end.pos == shared_state.all_block_end.pos && - !shared_state.input_eos) { //current partition data calculate done - return true; + if (buffer_size > 128) { + // buffer have enough data, could block the sink + _dependency->block(); } - return false; + // buffer have push data, could signal the source to read + _dependency->set_ready_to_read(); +} +void AnalyticSinkLocalState::_reset_state_for_next_partition() { + _partition_by_pose.start = _partition_by_pose.end; + _current_row_position = _partition_by_pose.start.pos; + _reset_agg_status(); } //_partition_by_columns,_order_by_columns save in blocks, so if need to calculate the boundary, may find in which blocks firstly BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockRowPos start, BlockRowPos end, bool need_check_first) { - auto& shared_state = *_shared_state; int64_t start_init_row_num = start.row_num; - vectorized::ColumnPtr start_column = - shared_state.input_blocks[start.block_num].get_by_position(idx).column; + vectorized::ColumnPtr start_column = _input_blocks[start.block_num].get_by_position(idx).column; vectorized::ColumnPtr start_next_block_column = start_column; DCHECK_LE(start.block_num, end.block_num); - DCHECK_LE(start.block_num, shared_state.input_blocks.size() - 1); + DCHECK_LE(start.block_num, _input_blocks.size() - 1); int64_t start_block_num = start.block_num; int64_t end_block_num = end.block_num; - int64_t mid_blcok_num = end.block_num; + int64_t mid_block_num = end.block_num; // To fix this problem: https://github.com/apache/doris/issues/15951 // in this case, the partition by column is last row of block, so it's pointed to a new block at row = 0, range is: [left, right) // From the perspective of order by column, the two values are exactly equal. @@ -113,27 +417,25 @@ BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockR if (need_check_first && end.block_num > 0 && end.row_num == 0) { end.block_num--; end_block_num--; - end.row_num = shared_state.input_blocks[end_block_num].rows(); + end.row_num = _input_blocks[end_block_num].rows(); } //binary search find in which block while (start_block_num < end_block_num) { - mid_blcok_num = (start_block_num + end_block_num + 1) >> 1; - start_next_block_column = - shared_state.input_blocks[mid_blcok_num].get_by_position(idx).column; + mid_block_num = (start_block_num + end_block_num + 1) >> 1; + start_next_block_column = _input_blocks[mid_block_num].get_by_position(idx).column; //Compares (*this)[n] and rhs[m], this: start[init_row] rhs: mid[0] if (start_column->compare_at(start_init_row_num, 0, *start_next_block_column, 1) == 0) { - start_block_num = mid_blcok_num; + start_block_num = mid_block_num; } else { - end_block_num = mid_blcok_num - 1; + end_block_num = mid_block_num - 1; } } - // have check the start.block_num: start_column[start_init_row_num] with mid_blcok_num start_next_block_column[0] + // have check the start.block_num: start_column[start_init_row_num] with mid_block_num start_next_block_column[0] // now next block must not be result, so need check with end_block_num: start_next_block_column[last_row] - if (end_block_num == mid_blcok_num - 1) { - start_next_block_column = - shared_state.input_blocks[end_block_num].get_by_position(idx).column; - int64_t block_size = shared_state.input_blocks[end_block_num].rows(); + if (end_block_num == mid_block_num - 1) { + start_next_block_column = _input_blocks[end_block_num].get_by_position(idx).column; + int64_t block_size = _input_blocks[end_block_num].rows(); if ((start_column->compare_at(start_init_row_num, block_size - 1, *start_next_block_column, 1) == 0)) { start.block_num = end_block_num + 1; @@ -147,11 +449,11 @@ BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockR if (start_block_num != start.block_num) { start_init_row_num = 0; start.block_num = start_block_num; - start_column = shared_state.input_blocks[start.block_num].get_by_position(idx).column; + start_column = _input_blocks[start.block_num].get_by_position(idx).column; } //binary search, set start and end pos int64_t start_pos = start_init_row_num; - int64_t end_pos = shared_state.input_blocks[start.block_num].rows(); + int64_t end_pos = _input_blocks[start.block_num].rows(); //if end_block_num haven't moved, only start_block_num go to the end block //so could use the end.row_num for binary search if (start.block_num == end.block_num) { @@ -169,32 +471,80 @@ BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockR return start; } -BlockRowPos AnalyticSinkLocalState::_get_partition_by_end() { - auto& shared_state = *_shared_state; - if (shared_state.current_row_position < - shared_state.partition_by_end.pos) { //still have data, return partition_by_end directly - return shared_state.partition_by_end; +void AnalyticSinkLocalState::_update_order_by_range() { + if (_order_by_pose.is_ended && _current_row_position < _order_by_pose.end.pos) { + return; + } + + if (_order_by_pose.is_ended) { + _order_by_pose.start = _order_by_pose.end; + } + _order_by_pose.end = _partition_by_pose.end; + + for (size_t i = 0; i < _order_by_exprs_size; ++i) { + _order_by_pose.end = _compare_row_to_find_end( + _order_by_column_idxs[i], _order_by_pose.start, _order_by_pose.end, true); + } + _order_by_pose.start.pos = _input_block_first_row_positions[_order_by_pose.start.block_num] + + _order_by_pose.start.row_num; + _order_by_pose.end.pos = _input_block_first_row_positions[_order_by_pose.end.block_num] + + _order_by_pose.end.row_num; + // `_order_by_end` will be assigned to `_order_by_start` next time, + // so make it a valid position. + if (_order_by_pose.end.row_num == _input_blocks[_order_by_pose.end.block_num].rows()) { + _order_by_pose.end.block_num++; + _order_by_pose.end.row_num = 0; + } + + if (_order_by_pose.end.pos < _partition_by_pose.end.pos) { + _order_by_pose.is_ended = true; + return; + } + DCHECK_EQ(_partition_by_pose.end.pos, _order_by_pose.end.pos); + if (_partition_by_pose.is_ended) { + _order_by_pose.is_ended = true; + return; } + _order_by_pose.is_ended = false; +} - if (_partition_by_eq_expr_ctxs.empty() || - (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end - return shared_state.all_block_end; +void AnalyticSinkLocalState::_get_partition_by_end() { + //still have data, return partition_by_end directly + if (_partition_by_pose.is_ended && _current_row_position < _partition_by_pose.end.pos) { + return; + } + //no partition_by, the all block is end + if (_partition_by_eq_expr_ctxs.empty() || (_input_total_rows == 0)) { + _partition_by_pose.end.block_num = _input_blocks.size() - 1; + _partition_by_pose.end.row_num = _input_blocks.back().rows(); + _partition_by_pose.end.pos = _input_total_rows; + _partition_by_pose.is_ended = _input_eos; + return; } - BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); - ++i) { //have partition_by, binary search the partiton end - cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], - shared_state.partition_by_end, cal_end); + BlockRowPos cal_end = _all_block_end; + //have partition_by, binary search the partition end + for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); ++i) { + cal_end = _compare_row_to_find_end(_partition_by_column_idxs[i], _partition_by_pose.end, + cal_end); } - cal_end.pos = shared_state.input_block_first_row_positions[cal_end.block_num] + cal_end.row_num; - return cal_end; + cal_end.pos = _input_block_first_row_positions[cal_end.block_num] + cal_end.row_num; + _partition_by_pose.end = cal_end; + if (_partition_by_pose.end.pos < _input_total_rows) { + _partition_by_pose.is_ended = true; + return; + } + DCHECK_EQ(_partition_by_pose.end.pos, _input_total_rows); + _partition_by_pose.is_ended = _input_eos; } AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, const DescriptorTbl& descs, bool require_bucket_distribution) : DataSinkOperatorX(operator_id, tnode.node_id), + _pool(pool), + _intermediate_tuple_id(tnode.analytic_node.intermediate_tuple_id), + _output_tuple_id(tnode.analytic_node.output_tuple_id), _buffered_tuple_id(tnode.analytic_node.__isset.buffered_tuple_id ? tnode.analytic_node.buffered_tuple_id : 0), @@ -202,20 +552,45 @@ AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, _require_bucket_distribution(require_bucket_distribution), _partition_exprs(tnode.__isset.distribute_expr_lists && require_bucket_distribution ? tnode.distribute_expr_lists[0] - : tnode.analytic_node.partition_exprs) { + : tnode.analytic_node.partition_exprs), + _window(tnode.analytic_node.window), + _has_window(tnode.analytic_node.__isset.window), + _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), + _has_window_start(tnode.analytic_node.window.__isset.window_start), + _has_window_end(tnode.analytic_node.window.__isset.window_end) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; + _fn_scope = AnalyticFnScope::PARTITION; + if (_has_window && _has_range_window) { + // haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] + if (_has_window_end) { + _fn_scope = AnalyticFnScope::RANGE; // range: [unbounded preceding,current row] + } + } else if (_has_window) { + if (_has_window_start || _has_window_end) { + // both not set, same as PARTITION + _fn_scope = AnalyticFnScope::ROWS; + } + } } Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); const TAnalyticNode& analytic_node = tnode.analytic_node; - size_t agg_size = analytic_node.analytic_functions.size(); - _agg_expr_ctxs.resize(agg_size); - _num_agg_input.resize(agg_size); - for (int i = 0; i < agg_size; ++i) { + _agg_functions_size = analytic_node.analytic_functions.size(); + _agg_expr_ctxs.resize(_agg_functions_size); + _num_agg_input.resize(_agg_functions_size); + for (int i = 0; i < _agg_functions_size; ++i) { const TExpr& desc = analytic_node.analytic_functions[i]; - _num_agg_input[i] = desc.nodes[0].num_children; + vectorized::AggFnEvaluator* evaluator = nullptr; + // Window function treats all NullableAggregateFunction as AlwaysNullable. + // Its behavior is same with executed without group by key. + // https://github.com/apache/doris/pull/40693 + RETURN_IF_ERROR(vectorized::AggFnEvaluator::create(_pool, desc, {}, /*without_key*/ true, + &evaluator)); + _agg_functions.emplace_back(evaluator); + int node_idx = 0; + _num_agg_input[i] = desc.nodes[0].num_children; for (int j = 0; j < desc.nodes[0].num_children; ++j) { ++node_idx; vectorized::VExprSPtr expr; @@ -230,7 +605,6 @@ Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) _partition_by_eq_expr_ctxs)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.order_by_exprs, _order_by_eq_expr_ctxs)); - _agg_functions_size = agg_size; return Status::OK(); } @@ -239,6 +613,17 @@ Status AnalyticSinkOperatorX::open(RuntimeState* state) { for (const auto& ctx : _agg_expr_ctxs) { RETURN_IF_ERROR(vectorized::VExpr::prepare(ctx, state, _child->row_desc())); } + _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); + _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); + for (size_t i = 0; i < _agg_functions_size; ++i) { + SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[i]; + SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[i]; + RETURN_IF_ERROR(_agg_functions[i]->prepare(state, _child->row_desc(), + intermediate_slot_desc, output_slot_desc)); + _agg_functions[i]->set_version(state->be_exec_version()); + _change_to_nullable_flags.push_back(output_slot_desc->is_nullable() && + !_agg_functions[i]->data_type()->is_nullable()); + } if (!_partition_by_eq_expr_ctxs.empty() || !_order_by_eq_expr_ctxs.empty()) { vector tuple_ids; tuple_ids.push_back(_child->row_desc().tuple_descriptors()[0]->id()); @@ -253,11 +638,34 @@ Status AnalyticSinkOperatorX::open(RuntimeState* state) { vectorized::VExpr::prepare(_order_by_eq_expr_ctxs, state, cmp_row_desc)); } } + RETURN_IF_ERROR(vectorized::VExpr::open(_partition_by_eq_expr_ctxs, state)); RETURN_IF_ERROR(vectorized::VExpr::open(_order_by_eq_expr_ctxs, state)); for (size_t i = 0; i < _agg_functions_size; ++i) { + RETURN_IF_ERROR(_agg_functions[i]->open(state)); RETURN_IF_ERROR(vectorized::VExpr::open(_agg_expr_ctxs[i], state)); } + + _offsets_of_aggregate_states.resize(_agg_functions_size); + for (size_t i = 0; i < _agg_functions_size; ++i) { + _offsets_of_aggregate_states[i] = _total_size_of_aggregate_states; + const auto& agg_function = _agg_functions[i]->function(); + // aggregate states are aligned based on maximum requirement + _align_aggregate_states = std::max(_align_aggregate_states, agg_function->align_of_data()); + _total_size_of_aggregate_states += agg_function->size_of_data(); + // If not the last aggregate_state, we need pad it so that next aggregate_state will be aligned. + if (i + 1 < _agg_functions_size) { + size_t alignment_of_next_state = _agg_functions[i + 1]->function()->align_of_data(); + if ((alignment_of_next_state & (alignment_of_next_state - 1)) != 0) { + return Status::RuntimeError("Logical error: align_of_data is not 2^N"); + } + /// Extend total_size to next alignment requirement + /// Add padding by rounding up 'total_size_of_aggregate_states' to be a multiplier of alignment_of_next_state. + _total_size_of_aggregate_states = + (_total_size_of_aggregate_states + alignment_of_next_state - 1) / + alignment_of_next_state * alignment_of_next_state; + } + } return Status::OK(); } @@ -266,37 +674,46 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); - local_state._shared_state->input_eos = eos; - if (local_state._shared_state->input_eos && input_block->rows() == 0) { - local_state._dependency->set_ready_to_read(); - local_state._dependency->block(); + local_state._input_eos = eos; + if (input_block->rows() > 0) { + RETURN_IF_ERROR(_add_input_block(state, input_block)); + RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); + } + if (local_state._input_eos) { + local_state._dependency->set_ready_to_read(); // ready for source to read + std::unique_lock lc(local_state._shared_state->sink_eos_lock); + local_state._shared_state->sink_eos = true; return Status::OK(); } + return Status::OK(); +} - local_state._shared_state->input_block_first_row_positions.emplace_back( - local_state._shared_state->input_total_rows); +Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, + vectorized::Block* input_block) { + auto& local_state = get_local_state(state); + local_state._input_block_first_row_positions.emplace_back(local_state._input_total_rows); size_t block_rows = input_block->rows(); - local_state._shared_state->input_total_rows += block_rows; - local_state._shared_state->all_block_end.block_num = - local_state._shared_state->input_blocks.size(); - local_state._shared_state->all_block_end.row_num = block_rows; - local_state._shared_state->all_block_end.pos = local_state._shared_state->input_total_rows; - - if (local_state._shared_state->origin_cols - .empty()) { //record origin columns, maybe be after this, could cast some column but no need to save + local_state._input_total_rows += block_rows; + + local_state._all_block_end.block_num = local_state._input_blocks.size(); + local_state._all_block_end.row_num = block_rows; + local_state._all_block_end.pos = local_state._input_total_rows; + + // record origin columns, maybe be after this, could cast some column but no need to output + if (local_state._input_col_ids.empty()) { for (int c = 0; c < input_block->columns(); ++c) { - local_state._shared_state->origin_cols.emplace_back(c); + local_state._input_col_ids.emplace_back(c); } } { SCOPED_TIMER(local_state._compute_agg_data_timer); - for (size_t i = 0; i < _agg_functions_size; - ++i) { //insert _agg_input_columns, execute calculate for its + //insert _agg_input_columns, execute calculate for its, and those columns maybe could remove have used data + for (size_t i = 0; i < _agg_functions_size; ++i) { for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { - RETURN_IF_ERROR(_insert_range_column( - input_block, local_state._agg_expr_ctxs[i][j], - local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + RETURN_IF_ERROR(_insert_range_column(input_block, local_state._agg_expr_ctxs[i][j], + local_state._agg_input_columns[i][j].get(), + block_rows)); } } } @@ -307,7 +724,7 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block RETURN_IF_ERROR(local_state._partition_by_eq_expr_ctxs[i]->execute(input_block, &result_col_id)); DCHECK_GE(result_col_id, 0); - local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + local_state._partition_by_column_idxs[i] = result_col_id; } } @@ -318,20 +735,12 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block RETURN_IF_ERROR( local_state._order_by_eq_expr_ctxs[i]->execute(input_block, &result_col_id)); DCHECK_GE(result_col_id, 0); - local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + local_state._order_by_column_idxs[i] = result_col_id; } } COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); - - //TODO: if need improvement, the is a tips to maintain a free queue, - //so the memory could reuse, no need to new/delete again; - local_state._shared_state->input_blocks.emplace_back(std::move(*input_block)); - { - SCOPED_TIMER(local_state._evaluation_timer); - local_state._shared_state->found_partition_end = local_state._get_partition_by_end(); - } - local_state._refresh_need_more_input(); + local_state._input_blocks.emplace_back(std::move(*input_block)); return Status::OK(); } @@ -346,6 +755,43 @@ Status AnalyticSinkOperatorX::_insert_range_column(vectorized::Block* block, return Status::OK(); } +void AnalyticSinkLocalState::_reset_agg_status() { + for (size_t i = 0; i < _agg_functions_size; ++i) { + _agg_functions[i]->reset( + _fn_place_ptr + + _parent->cast()._offsets_of_aggregate_states[i]); + } +} + +void AnalyticSinkLocalState::_create_agg_status() { + for (size_t i = 0; i < _agg_functions_size; ++i) { + try { + _agg_functions[i]->create( + _fn_place_ptr + + _parent->cast()._offsets_of_aggregate_states[i]); + } catch (...) { + for (int j = 0; j < i; ++j) { + _agg_functions[j]->destroy( + _fn_place_ptr + + _parent->cast()._offsets_of_aggregate_states[j]); + } + throw; + } + } + _agg_functions_created = true; +} + +void AnalyticSinkLocalState::_destroy_agg_status() { + if (UNLIKELY(_fn_place_ptr == nullptr || !_agg_functions_created)) { + return; + } + for (size_t i = 0; i < _agg_functions_size; ++i) { + _agg_functions[i]->destroy( + _fn_place_ptr + + _parent->cast()._offsets_of_aggregate_states[i]); + } +} + template class DataSinkOperatorX; } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 0ff7c4e4e047bd..9416a868baa4f0 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -27,6 +27,7 @@ namespace doris { #include "common/compile_check_begin.h" namespace pipeline { class AnalyticSinkOperatorX; +enum AnalyticFnScope { PARTITION, RANGE, ROWS }; class AnalyticSinkLocalState : public PipelineXSinkLocalState { ENABLE_FACTORY_CREATOR(AnalyticSinkLocalState); @@ -40,31 +41,97 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalStatefound_partition_end); - if (need_more_input) { - _dependency->set_block_to_read(); - _dependency->set_ready(); - } else { - _dependency->block(); - _dependency->set_ready_to_read(); - } - return need_more_input; - } - BlockRowPos _get_partition_by_end(); + // For window frame `ROWS|RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` + Status _get_next_for_partition(); + // For window frame `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` + Status _get_next_for_range(); + // 1. `ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` + // 2. `ROWS BETWEEN UNBOUNDED PRECEDING AND N PRECEDING` + // 3. `ROWS BETWEEN UNBOUNDED PRECEDING AND N FOLLOWING` + Status _get_next_for_rows(); + + // 1. `ROWS BETWEEN N PRECEDING AND M PRECEDING` or + // 2. `ROWS BETWEEN N FOLLOWING AND M FOLLOWING` or + // 3. `ROWS BETWEEN N PRECEDING AND M FOLLOWING` or + // 4. `ROWS BETWEEN N PRECEDING AND CURRENT ROW` or + // 5. `ROWS BETWEEN CURRENT ROW AND M FOLLOWING` + Status _get_next_for_sliding_rows(); + + void _execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, + int64_t frame_end); + void _insert_result_info(int64_t real_deal_with_width); + // void _insert_result_info(int64_t start, int64_t end); + Status output_current_block(vectorized::Block* block); + void _init_result_columns(); + + void _reset_agg_status(); + void _create_agg_status(); + void _destroy_agg_status(); + + void _update_order_by_range(); + void _get_partition_by_end(); BlockRowPos _compare_row_to_find_end(int64_t idx, BlockRowPos start, BlockRowPos end, bool need_check_first = false); - bool _whether_need_next_partition(BlockRowPos& found_partition_end); + bool _has_input_data() {return _output_block_index < _input_blocks.size();} + bool _check_need_block_task(); + void _refresh_buffer_and_dependency_state(vectorized::Block* block); + void _reset_state_for_next_partition(); + + std::vector _agg_expr_ctxs; + vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; + vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; + const size_t _partition_exprs_size = 0; + const size_t _order_by_exprs_size = 0; + + size_t _agg_functions_size = 0; + vectorized::AggregateDataPtr _fn_place_ptr = nullptr; + std::unique_ptr _agg_arena_pool = nullptr; + std::vector _agg_functions; + + using vectorized_get_next = Status (AnalyticSinkLocalState::*)(); + struct executor { + vectorized_get_next get_next_impl; + }; + executor _executor; + + bool _agg_functions_created = false; + bool _current_window_empty = false; + bool _next_partition = false; + int64_t _output_block_index = 0; + int64_t _window_end_position = 0; + std::vector _result_window_columns; + + int64_t _rows_start_offset = 0; + int64_t _rows_end_offset = 0; + std::vector _partition_by_column_idxs; + std::vector _order_by_column_idxs; + + // BlockRowPos _order_by_start; + // BlockRowPos _order_by_end; + // BlockRowPos _partition_by_start; + // BlockRowPos _partition_by_end; + + BoundaryPose _partition_by_pose; + BoundaryPose _order_by_pose; + int64_t _current_row_position = 0; + // BlockRowPos partition_by_end; + int64_t _input_total_rows = 0; + BlockRowPos _all_block_end; + std::vector _input_blocks; + bool _input_eos = false; + // BlockRowPos found_partition_end; + std::vector _input_col_ids; + std::vector _input_block_first_row_positions; + std::vector> _agg_input_columns; RuntimeProfile::Counter* _evaluation_timer = nullptr; RuntimeProfile::Counter* _compute_agg_data_timer = nullptr; RuntimeProfile::Counter* _compute_partition_by_timer = nullptr; RuntimeProfile::Counter* _compute_order_by_timer = nullptr; - - std::vector _agg_expr_ctxs; - vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; - vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; + RuntimeProfile::Counter* _execute_timer = nullptr; + RuntimeProfile::Counter* _get_next_timer = nullptr; + RuntimeProfile::Counter* _get_result_timer = nullptr; + // RuntimeProfile::HighWaterMarkCounter* _blocks_memory_usage = nullptr; }; class AnalyticSinkOperatorX final : public DataSinkOperatorX { @@ -94,23 +161,44 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorX _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; size_t _agg_functions_size = 0; + std::vector _num_agg_input; + std::vector _agg_functions; + TupleId _intermediate_tuple_id; + TupleId _output_tuple_id; + TupleDescriptor* _intermediate_tuple_desc = nullptr; + TupleDescriptor* _output_tuple_desc = nullptr; const TTupleId _buffered_tuple_id; - std::vector _num_agg_input; const bool _is_colocate; const bool _require_bucket_distribution; const std::vector _partition_exprs; + + AnalyticFnScope _fn_scope; + TAnalyticWindow _window; + bool _has_window; + bool _has_range_window; + bool _has_window_start; + bool _has_window_end; + + /// The offset of the n-th functions. + std::vector _offsets_of_aggregate_states; + /// The total size of the row from the functions. + size_t _total_size_of_aggregate_states = 0; + /// The max align size for functions + size_t _align_aggregate_states = 1; + std::vector _change_to_nullable_flags; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index fe0ab0b148e55a..06bb03b42637ae 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -27,148 +27,14 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" AnalyticLocalState::AnalyticLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState(state, parent), - _output_block_index(0), - _window_end_position(0), - _next_partition(false), - _rows_start_offset(0), - _rows_end_offset(0), - _fn_place_ptr(nullptr), - _agg_functions_size(0), - _agg_functions_created(false), - _agg_arena_pool(std::make_unique()) {} - -//_partition_by_columns,_order_by_columns save in blocks, so if need to calculate the boundary, may find in which blocks firstly -BlockRowPos AnalyticLocalState::_compare_row_to_find_end(int64_t idx, BlockRowPos start, - BlockRowPos end, bool need_check_first) { - auto& shared_state = *_shared_state; - int64_t start_init_row_num = start.row_num; - vectorized::ColumnPtr start_column = - shared_state.input_blocks[start.block_num].get_by_position(idx).column; - vectorized::ColumnPtr start_next_block_column = start_column; - - DCHECK_LE(start.block_num, end.block_num); - DCHECK_LE(start.block_num, shared_state.input_blocks.size() - 1); - int64_t start_block_num = start.block_num; - int64_t end_block_num = end.block_num; - int64_t mid_blcok_num = end.block_num; - // To fix this problem: https://github.com/apache/doris/issues/15951 - // in this case, the partition by column is last row of block, so it's pointed to a new block at row = 0, range is: [left, right) - // From the perspective of order by column, the two values are exactly equal. - // so the range will be get wrong because it's compare_at == 0 with next block at row = 0 - if (need_check_first && end.block_num > 0 && end.row_num == 0) { - end.block_num--; - end_block_num--; - end.row_num = shared_state.input_blocks[end_block_num].rows(); - } - //binary search find in which block - while (start_block_num < end_block_num) { - mid_blcok_num = (start_block_num + end_block_num + 1) >> 1; - start_next_block_column = - shared_state.input_blocks[mid_blcok_num].get_by_position(idx).column; - //Compares (*this)[n] and rhs[m], this: start[init_row] rhs: mid[0] - if (start_column->compare_at(start_init_row_num, 0, *start_next_block_column, 1) == 0) { - start_block_num = mid_blcok_num; - } else { - end_block_num = mid_blcok_num - 1; - } - } - - // have check the start.block_num: start_column[start_init_row_num] with mid_blcok_num start_next_block_column[0] - // now next block must not be result, so need check with end_block_num: start_next_block_column[last_row] - if (end_block_num == mid_blcok_num - 1) { - start_next_block_column = - shared_state.input_blocks[end_block_num].get_by_position(idx).column; - int64_t block_size = shared_state.input_blocks[end_block_num].rows(); - if ((start_column->compare_at(start_init_row_num, block_size - 1, *start_next_block_column, - 1) == 0)) { - start.block_num = end_block_num + 1; - start.row_num = 0; - return start; - } - } - - //check whether need get column again, maybe same as first init - // if the start_block_num have move to forword, so need update start block num and compare it from row_num=0 - if (start_block_num != start.block_num) { - start_init_row_num = 0; - start.block_num = start_block_num; - start_column = shared_state.input_blocks[start.block_num].get_by_position(idx).column; - } - //binary search, set start and end pos - int64_t start_pos = start_init_row_num; - int64_t end_pos = shared_state.input_blocks[start.block_num].rows(); - //if end_block_num haven't moved, only start_block_num go to the end block - //so could use the end.row_num for binary search - if (start.block_num == end.block_num) { - end_pos = end.row_num; - } - while (start_pos < end_pos) { - int64_t mid_pos = (start_pos + end_pos) >> 1; - if (start_column->compare_at(start_init_row_num, mid_pos, *start_column, 1)) { - end_pos = mid_pos; - } else { - start_pos = mid_pos + 1; - } - } - start.row_num = start_pos; //update row num, return the find end - return start; -} - -BlockRowPos AnalyticLocalState::_get_partition_by_end() { - auto& shared_state = *_shared_state; - if (shared_state.current_row_position < - shared_state.partition_by_end.pos) { //still have data, return partition_by_end directly - return shared_state.partition_by_end; - } - - const auto partition_exprs_size = - _parent->cast()._partition_exprs_size; - if (partition_exprs_size == 0 || - (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end - return shared_state.all_block_end; - } - - BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i = 0; i < partition_exprs_size; - ++i) { //have partition_by, binary search the partiton end - cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], - shared_state.partition_by_end, cal_end); - } - cal_end.pos = shared_state.input_block_first_row_positions[cal_end.block_num] + cal_end.row_num; - return cal_end; -} - -bool AnalyticLocalState::_whether_need_next_partition(BlockRowPos& found_partition_end) { - auto& shared_state = *_shared_state; - if (shared_state.input_eos || - (shared_state.current_row_position < - shared_state.partition_by_end.pos)) { //now still have partition data - return false; - } - const auto partition_exprs_size = - _parent->cast()._partition_exprs_size; - if ((partition_exprs_size == 0 && !shared_state.input_eos) || - (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS - return true; - } - if (partition_exprs_size != 0 && found_partition_end.pos == shared_state.all_block_end.pos && - !shared_state.input_eos) { //current partition data calculate done - return true; - } - return false; -} + : PipelineXLocalState(state, parent) +{} Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); - _blocks_memory_usage = - profile()->AddHighWaterMarkCounter("MemoryUsageBlocks", TUnit::BYTES, "", 1); - _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); - _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); - _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); return Status::OK(); } @@ -176,384 +42,52 @@ Status AnalyticLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(PipelineXLocalState::open(state)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); - - auto& p = _parent->cast(); - _agg_functions_size = p._agg_functions.size(); - - _agg_functions.resize(p._agg_functions.size()); - for (size_t i = 0; i < _agg_functions.size(); i++) { - _agg_functions[i] = p._agg_functions[i]->clone(state, state->obj_pool()); - } - - _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, - p._align_aggregate_states); - - if (!p._has_window) { //haven't set window, Unbounded: [unbounded preceding,unbounded following] - _executor.get_next = std::bind(&AnalyticLocalState::_get_next_for_partition, this, - std::placeholders::_1); - - } else if (p._has_range_window) { - if (!p._has_window_end) { //haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] - _executor.get_next = std::bind(&AnalyticLocalState::_get_next_for_partition, - this, std::placeholders::_1); - - } else { - _executor.get_next = std::bind(&AnalyticLocalState::_get_next_for_range, this, - std::placeholders::_1); - } - - } else { - if (!p._has_window_start && - !p._has_window_end) { //haven't set start and end, same as PARTITION - _executor.get_next = std::bind(&AnalyticLocalState::_get_next_for_partition, - this, std::placeholders::_1); - - } else { - if (p._has_window_start) { //calculate start boundary - TAnalyticWindowBoundary b = p._window.window_start; - if (b.__isset.rows_offset_value) { //[offset , ] - _rows_start_offset = b.rows_offset_value; - if (b.type == TAnalyticWindowBoundaryType::PRECEDING) { - _rows_start_offset *= -1; //preceding--> negative - } //current_row 0 - } else { //following positive - DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW); //[current row, ] - _rows_start_offset = 0; - } - } - - if (p._has_window_end) { //calculate end boundary - TAnalyticWindowBoundary b = p._window.window_end; - if (b.__isset.rows_offset_value) { //[ , offset] - _rows_end_offset = b.rows_offset_value; - if (b.type == TAnalyticWindowBoundaryType::PRECEDING) { - _rows_end_offset *= -1; - } - } else { - DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW); //[ ,current row] - _rows_end_offset = 0; - } - } - - _executor.get_next = std::bind(&AnalyticLocalState::_get_next_for_rows, this, - std::placeholders::_1); - } - } - _create_agg_status(); - return Status::OK(); -} - -void AnalyticLocalState::_reset_agg_status() { - for (size_t i = 0; i < _agg_functions_size; ++i) { - _agg_functions[i]->reset( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); - } -} - -void AnalyticLocalState::_create_agg_status() { - for (size_t i = 0; i < _agg_functions_size; ++i) { - try { - _agg_functions[i]->create( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); - } catch (...) { - for (int j = 0; j < i; ++j) { - _agg_functions[j]->destroy( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[j]); - } - throw; - } - } - _agg_functions_created = true; -} - -void AnalyticLocalState::_destroy_agg_status() { - if (UNLIKELY(_fn_place_ptr == nullptr || !_agg_functions_created)) { - return; - } - for (size_t i = 0; i < _agg_functions_size; ++i) { - _agg_functions[i]->destroy( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); - } -} - -void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, - int64_t frame_start, int64_t frame_end) { - SCOPED_TIMER(_execute_timer); - for (size_t i = 0; i < _agg_functions_size; ++i) { - std::vector agg_columns; - for (int j = 0; j < _shared_state->agg_input_columns[i].size(); ++j) { - agg_columns.push_back(_shared_state->agg_input_columns[i][j].get()); - } - _agg_functions[i]->function()->add_range_single_place( - partition_start, partition_end, frame_start, frame_end, - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i], - agg_columns.data(), _agg_arena_pool.get()); - - // If the end is not greater than the start, the current window should be empty. - _current_window_empty = - std::min(frame_end, partition_end) <= std::max(frame_start, partition_start); - } -} - -void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { - SCOPED_TIMER(_get_result_timer); - int64_t current_block_row_pos = - _shared_state->input_block_first_row_positions[_output_block_index]; - int64_t get_result_start = _shared_state->current_row_position - current_block_row_pos; - if (_parent->cast()._fn_scope == AnalyticFnScope::PARTITION) { - int64_t get_result_end = - std::min(_shared_state->current_row_position + current_block_rows, - _shared_state->partition_by_end.pos); - _window_end_position = - std::min(get_result_end - current_block_row_pos, current_block_rows); - _shared_state->current_row_position += (_window_end_position - get_result_start); - } else if (_parent->cast()._fn_scope == AnalyticFnScope::RANGE) { - _window_end_position = - std::min(_order_by_end.pos - current_block_row_pos, current_block_rows); - _shared_state->current_row_position += (_window_end_position - get_result_start); - } else { - _window_end_position++; - _shared_state->current_row_position++; - } - - const auto& offsets_of_aggregate_states = - _parent->cast()._offsets_of_aggregate_states; - for (size_t i = 0; i < _agg_functions_size; ++i) { - for (size_t j = get_result_start; j < _window_end_position; ++j) { - if (!_agg_functions[i]->function()->get_return_type()->is_nullable() && - _result_window_columns[i]->is_nullable()) { - if (_current_window_empty) { - _result_window_columns[i]->insert_default(); - } else { - auto* dst = assert_cast( - _result_window_columns[i].get()); - dst->get_null_map_data().push_back(0); - _agg_functions[i]->insert_result_info( - _fn_place_ptr + offsets_of_aggregate_states[i], - &dst->get_nested_column()); - } - continue; - } - _agg_functions[i]->insert_result_info(_fn_place_ptr + offsets_of_aggregate_states[i], - _result_window_columns[i].get()); - } - } -} - -Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { - SCOPED_TIMER(_get_next_timer); - while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && - _window_end_position < current_block_rows) { - int64_t range_start, range_end; - if (!_parent->cast()._window.__isset.window_start && - _parent->cast()._window.window_end.type == - TAnalyticWindowBoundaryType::CURRENT_ROW) { - // [preceding, current_row], [current_row, following] rewrite it's same - // as could reuse the previous calculate result, so don't call _reset_agg_status function - // going on calculate, add up data, no need to reset state - range_start = _shared_state->current_row_position; - range_end = _shared_state->current_row_position + 1; - } else { - _reset_agg_status(); - range_end = _shared_state->current_row_position + _rows_end_offset + 1; - //[preceding, offset] --unbound: [preceding, following] - if (!_parent->cast()._window.__isset.window_start) { - range_start = _partition_by_start.pos; - } else { - range_start = _shared_state->current_row_position + _rows_start_offset; - } - // Make sure range_start <= range_end - range_start = std::min(range_start, range_end); - } - _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, - range_start, range_end); - _insert_result_info(current_block_rows); - } return Status::OK(); } -Status AnalyticLocalState::_get_next_for_partition(size_t current_block_rows) { - SCOPED_TIMER(_get_next_timer); - if (_next_partition) { - _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _partition_by_start.pos, _shared_state->partition_by_end.pos); - } - _insert_result_info(current_block_rows); - return Status::OK(); -} - -Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { - SCOPED_TIMER(_get_next_timer); - while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && - _window_end_position < current_block_rows) { - if (_shared_state->current_row_position >= _order_by_end.pos) { - _update_order_by_range(); - _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _order_by_start.pos, _order_by_end.pos); - } - _insert_result_info(current_block_rows); - } - return Status::OK(); -} - -void AnalyticLocalState::_update_order_by_range() { - _order_by_start = _order_by_end; - _order_by_end = _shared_state->partition_by_end; - for (size_t i = 0; i < _parent->cast()._order_by_exprs_size; ++i) { - _order_by_end = _compare_row_to_find_end(_shared_state->ordey_by_column_idxs[i], - _order_by_start, _order_by_end, true); - } - _order_by_start.pos = - _shared_state->input_block_first_row_positions[_order_by_start.block_num] + - _order_by_start.row_num; - _order_by_end.pos = _shared_state->input_block_first_row_positions[_order_by_end.block_num] + - _order_by_end.row_num; - // `_order_by_end` will be assigned to `_order_by_start` next time, - // so make it a valid position. - if (_order_by_end.row_num == _shared_state->input_blocks[_order_by_end.block_num].rows()) { - _order_by_end.block_num++; - _order_by_end.row_num = 0; - } -} - -void AnalyticLocalState::init_result_columns() { - if (!_window_end_position) { - _result_window_columns.resize(_agg_functions_size); - for (size_t i = 0; i < _agg_functions_size; ++i) { - _result_window_columns[i] = - _agg_functions[i]->data_type()->create_column(); //return type - } - } -} - -//calculate pos have arrive partition end, so it's needed to init next partition, and update the boundary of partition -bool AnalyticLocalState::init_next_partition(BlockRowPos found_partition_end) { - if ((_shared_state->current_row_position >= _shared_state->partition_by_end.pos) && - ((_shared_state->partition_by_end.pos == 0) || - (_shared_state->partition_by_end.pos != found_partition_end.pos))) { - _partition_by_start = _shared_state->partition_by_end; - _shared_state->partition_by_end = found_partition_end; - _shared_state->current_row_position = _partition_by_start.pos; - _reset_agg_status(); - return true; - } - return false; -} - -Status AnalyticLocalState::output_current_block(vectorized::Block* block) { - block->swap(std::move(_shared_state->input_blocks[_output_block_index])); - _blocks_memory_usage->add(-block->allocated_bytes()); - if (_shared_state->origin_cols.size() < block->columns()) { - block->erase_not_in(_shared_state->origin_cols); - } - - DCHECK(_parent->cast()._change_to_nullable_flags.size() == - _result_window_columns.size()); - for (size_t i = 0; i < _result_window_columns.size(); ++i) { - if (_parent->cast()._change_to_nullable_flags[i]) { - block->insert({make_nullable(std::move(_result_window_columns[i])), - make_nullable(_agg_functions[i]->data_type()), ""}); - } else { - block->insert( - {std::move(_result_window_columns[i]), _agg_functions[i]->data_type(), ""}); - } - } - - _output_block_index++; - _window_end_position = 0; - - return Status::OK(); -} AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) - : OperatorX(pool, tnode, operator_id, descs), - _window(tnode.analytic_node.window), - _intermediate_tuple_id(tnode.analytic_node.intermediate_tuple_id), - _output_tuple_id(tnode.analytic_node.output_tuple_id), - _has_window(tnode.analytic_node.__isset.window), - _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), - _has_window_start(tnode.analytic_node.window.__isset.window_start), - _has_window_end(tnode.analytic_node.window.__isset.window_end), - _partition_exprs_size(tnode.analytic_node.partition_exprs.size()), - _order_by_exprs_size(tnode.analytic_node.order_by_exprs.size()) { + : OperatorX(pool, tnode, operator_id, descs) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; - _fn_scope = AnalyticFnScope::PARTITION; - if (tnode.analytic_node.__isset.window && - tnode.analytic_node.window.type == TAnalyticWindowType::RANGE) { - DCHECK(!_window.__isset.window_start) << "RANGE windows must have UNBOUNDED PRECEDING"; - DCHECK(!_window.__isset.window_end || - _window.window_end.type == TAnalyticWindowBoundaryType::CURRENT_ROW) - << "RANGE window end bound must be CURRENT ROW or UNBOUNDED FOLLOWING"; - - if (_window.__isset - .window_end) { //haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] - _fn_scope = AnalyticFnScope::RANGE; //range: [unbounded preceding,current row] - } - } else if (tnode.analytic_node.__isset.window) { - if (_window.__isset.window_start || _window.__isset.window_end) { - _fn_scope = AnalyticFnScope::ROWS; - } - } } -Status AnalyticSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { - RETURN_IF_ERROR(OperatorX::init(tnode, state)); - const TAnalyticNode& analytic_node = tnode.analytic_node; - size_t agg_size = analytic_node.analytic_functions.size(); - for (int i = 0; i < agg_size; ++i) { - vectorized::AggFnEvaluator* evaluator = nullptr; - // Window function treats all NullableAggregateFunction as AlwaysNullable. - // Its behavior is same with executed without group by key. - // https://github.com/apache/doris/pull/40693 - RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( - _pool, analytic_node.analytic_functions[i], {}, /*wihout_key*/ true, &evaluator)); - _agg_functions.emplace_back(evaluator); - } - return Status::OK(); -} - -Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, +Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* output_block, bool* eos) { + RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - if (local_state._shared_state->input_eos && - (local_state._output_block_index == local_state._shared_state->input_blocks.size() || - local_state._shared_state->input_total_rows == 0)) { - *eos = true; - return Status::OK(); - } - - while (!local_state._shared_state->input_eos || - local_state._output_block_index < local_state._shared_state->input_blocks.size()) { - { - SCOPED_TIMER(local_state._evaluation_timer); - local_state._shared_state->found_partition_end = local_state._get_partition_by_end(); - } - if (local_state._refresh_need_more_input()) { - return Status::OK(); - } - local_state._next_partition = - local_state.init_next_partition(local_state._shared_state->found_partition_end); - local_state.init_result_columns(); - size_t current_block_rows = - local_state._shared_state->input_blocks[local_state._output_block_index].rows(); - RETURN_IF_ERROR(local_state._executor.get_next(current_block_rows)); - if (local_state._window_end_position == current_block_rows) { - break; + SCOPED_TIMER(local_state._get_next_timer); + output_block->clear_column_data(); + { + std::lock_guard lock(local_state._shared_state->buffer_mutex); + if (!local_state._shared_state->blocks_buffer.empty()) { + local_state._shared_state->blocks_buffer.front().swap(*output_block); + local_state._shared_state->blocks_buffer.pop(); + //if buffer have no data and sink not eos, block reading and wait for signal again + RETURN_IF_ERROR(vectorized::VExprContext::filter_block( + local_state._conjuncts, output_block, output_block->columns())); + if (local_state._shared_state->blocks_buffer.empty() && + !local_state._shared_state->sink_eos) { + // add this mutex to check, as in some case maybe is doing block(), and the sink is doing set eos. + // so have to hold mutex to set block(), avoid to sink have set eos and set ready, but here set block() by mistake + std::unique_lock lc(local_state._shared_state->sink_eos_lock); + if (!local_state._shared_state->sink_eos) { + local_state._dependency->block(); // block self source + local_state._dependency->set_ready_to_write(); // ready for sink write + } + } + if (!output_block->empty()) { + local_state._num_rows_returned += output_block->rows(); + } + } else { + //iff buffer have no data and sink eos, set eos + std::unique_lock lc(local_state._shared_state->sink_eos_lock); + *eos = local_state._shared_state->sink_eos; } } - RETURN_IF_ERROR(local_state.output_current_block(block)); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); - local_state.reached_limit(block, eos); return Status::OK(); } @@ -564,52 +98,17 @@ Status AnalyticLocalState::close(RuntimeState* state) { return Status::OK(); } - _destroy_agg_status(); - _agg_arena_pool = nullptr; + // _destroy_agg_status(); + // _agg_arena_pool = nullptr; - std::vector tmp_result_window_columns; - _result_window_columns.swap(tmp_result_window_columns); + // std::vector tmp_result_window_columns; + // _result_window_columns.swap(tmp_result_window_columns); return PipelineXLocalState::close(state); } Status AnalyticSourceOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(OperatorX::open(state)); DCHECK(_child->row_desc().is_prefix_of(_row_descriptor)); - _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); - _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); - for (size_t i = 0; i < _agg_functions.size(); ++i) { - SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[i]; - SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[i]; - RETURN_IF_ERROR(_agg_functions[i]->prepare(state, _child->row_desc(), - intermediate_slot_desc, output_slot_desc)); - _agg_functions[i]->set_version(state->be_exec_version()); - _change_to_nullable_flags.push_back(output_slot_desc->is_nullable() && - !_agg_functions[i]->data_type()->is_nullable()); - } - - _offsets_of_aggregate_states.resize(_agg_functions.size()); - for (size_t i = 0; i < _agg_functions.size(); ++i) { - _offsets_of_aggregate_states[i] = _total_size_of_aggregate_states; - const auto& agg_function = _agg_functions[i]->function(); - // aggregate states are aligned based on maximum requirement - _align_aggregate_states = std::max(_align_aggregate_states, agg_function->align_of_data()); - _total_size_of_aggregate_states += agg_function->size_of_data(); - // If not the last aggregate_state, we need pad it so that next aggregate_state will be aligned. - if (i + 1 < _agg_functions.size()) { - size_t alignment_of_next_state = _agg_functions[i + 1]->function()->align_of_data(); - if ((alignment_of_next_state & (alignment_of_next_state - 1)) != 0) { - return Status::RuntimeError("Logical error: align_of_data is not 2^N"); - } - /// Extend total_size to next alignment requirement - /// Add padding by rounding up 'total_size_of_aggregate_states' to be a multiplier of alignment_of_next_state. - _total_size_of_aggregate_states = - (_total_size_of_aggregate_states + alignment_of_next_state - 1) / - alignment_of_next_state * alignment_of_next_state; - } - } - for (auto* agg_function : _agg_functions) { - RETURN_IF_ERROR(agg_function->open(state)); - } return Status::OK(); } diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 56c664cec6193b..415e63783c78fe 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -27,7 +27,7 @@ class RuntimeState; namespace pipeline { #include "common/compile_check_begin.h" -enum AnalyticFnScope { PARTITION, RANGE, ROWS }; + class AnalyticSourceOperatorX; class AnalyticLocalState final : public PipelineXLocalState { @@ -38,76 +38,9 @@ class AnalyticLocalState final : public PipelineXLocalState Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; Status close(RuntimeState* state) override; - - void init_result_columns(); - - Status output_current_block(vectorized::Block* block); - - bool init_next_partition(BlockRowPos found_partition_end); - private: - Status _get_next_for_rows(size_t rows); - Status _get_next_for_range(size_t rows); - Status _get_next_for_partition(size_t rows); - - void _execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, - int64_t frame_end); - void _insert_result_info(int64_t current_block_rows); - - void _update_order_by_range(); - bool _refresh_need_more_input() { - auto need_more_input = _whether_need_next_partition(_shared_state->found_partition_end); - if (need_more_input) { - _dependency->block(); - _dependency->set_ready_to_write(); - } else { - _dependency->set_block_to_write(); - _dependency->set_ready(); - } - return need_more_input; - } - BlockRowPos _get_partition_by_end(); - BlockRowPos _compare_row_to_find_end(int64_t idx, BlockRowPos start, BlockRowPos end, - bool need_check_first = false); - bool _whether_need_next_partition(BlockRowPos& found_partition_end); - - void _reset_agg_status(); - void _create_agg_status(); - void _destroy_agg_status(); - friend class AnalyticSourceOperatorX; - - int64_t _output_block_index; - int64_t _window_end_position; - bool _next_partition; - std::vector _result_window_columns; - - int64_t _rows_start_offset; - int64_t _rows_end_offset; - vectorized::AggregateDataPtr _fn_place_ptr; - size_t _agg_functions_size; - bool _agg_functions_created; - bool _current_window_empty = false; - - BlockRowPos _order_by_start; - BlockRowPos _order_by_end; - BlockRowPos _partition_by_start; - std::unique_ptr _agg_arena_pool; - std::vector _agg_functions; - - RuntimeProfile::Counter* _evaluation_timer = nullptr; - RuntimeProfile::Counter* _execute_timer = nullptr; RuntimeProfile::Counter* _get_next_timer = nullptr; - RuntimeProfile::Counter* _get_result_timer = nullptr; - RuntimeProfile::HighWaterMarkCounter* _blocks_memory_usage = nullptr; - - using vectorized_get_next = std::function; - - struct executor { - vectorized_get_next get_next; - }; - - executor _executor; }; class AnalyticSourceOperatorX final : public OperatorX { @@ -119,39 +52,11 @@ class AnalyticSourceOperatorX final : public OperatorX { bool is_source() const override { return true; } - Status init(const TPlanNode& tnode, RuntimeState* state) override; + // Status init(const TPlanNode& tnode, RuntimeState* state) override; Status open(RuntimeState* state) override; private: friend class AnalyticLocalState; - - TAnalyticWindow _window; - - TupleId _intermediate_tuple_id; - TupleId _output_tuple_id; - - bool _has_window; - bool _has_range_window; - bool _has_window_start; - bool _has_window_end; - - std::vector _agg_functions; - - AnalyticFnScope _fn_scope; - - TupleDescriptor* _intermediate_tuple_desc = nullptr; - TupleDescriptor* _output_tuple_desc = nullptr; - - /// The offset of the n-th functions. - std::vector _offsets_of_aggregate_states; - /// The total size of the row from the functions. - size_t _total_size_of_aggregate_states = 0; - /// The max align size for functions - size_t _align_aggregate_states = 1; - - std::vector _change_to_nullable_flags; - const size_t _partition_exprs_size; - const size_t _order_by_exprs_size; }; } // namespace pipeline From c07fd8ec7cc6781930874b519a95eacd6f20eb44 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Tue, 31 Dec 2024 10:41:30 +0800 Subject: [PATCH 02/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 15 ++++++++++ be/src/pipeline/exec/analytic_sink_operator.h | 3 +- .../exec/analytic_source_operator.cpp | 28 +------------------ .../pipeline/exec/analytic_source_operator.h | 6 +--- 4 files changed, 19 insertions(+), 33 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 16453c594c808b..8e8b145e1fe232 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -128,6 +128,21 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { return Status::OK(); } +Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_close_timer); + if (_closed) { + return Status::OK(); + } + + _destroy_agg_status(); + _agg_arena_pool = nullptr; + + std::vector tmp_result_window_columns; + _result_window_columns.swap(tmp_result_window_columns); + return PipelineXSinkLocalState::close(state, exec_status); +} + Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { do { auto batch_size = _input_blocks[_output_block_index].rows(); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 9416a868baa4f0..6dae2de3fe0703 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -38,6 +38,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState(state, parent) -{} + : PipelineXLocalState(state, parent) {} Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState::init(state, info)); @@ -38,22 +37,12 @@ Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { return Status::OK(); } -Status AnalyticLocalState::open(RuntimeState* state) { - RETURN_IF_ERROR(PipelineXLocalState::open(state)); - SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_open_timer); - return Status::OK(); -} - - AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) : OperatorX(pool, tnode, operator_id, descs) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; - } - Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* output_block, bool* eos) { RETURN_IF_CANCELLED(state); @@ -91,21 +80,6 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block return Status::OK(); } -Status AnalyticLocalState::close(RuntimeState* state) { - SCOPED_TIMER(exec_time_counter()); - SCOPED_TIMER(_close_timer); - if (_closed) { - return Status::OK(); - } - - // _destroy_agg_status(); - // _agg_arena_pool = nullptr; - - // std::vector tmp_result_window_columns; - // _result_window_columns.swap(tmp_result_window_columns); - return PipelineXLocalState::close(state); -} - Status AnalyticSourceOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(OperatorX::open(state)); DCHECK(_child->row_desc().is_prefix_of(_row_descriptor)); diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 415e63783c78fe..9d62759212b2cc 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -28,16 +28,13 @@ class RuntimeState; namespace pipeline { #include "common/compile_check_begin.h" - class AnalyticSourceOperatorX; class AnalyticLocalState final : public PipelineXLocalState { public: ENABLE_FACTORY_CREATOR(AnalyticLocalState); AnalyticLocalState(RuntimeState* state, OperatorXBase* parent); - Status init(RuntimeState* state, LocalStateInfo& info) override; - Status open(RuntimeState* state) override; - Status close(RuntimeState* state) override; + private: friend class AnalyticSourceOperatorX; RuntimeProfile::Counter* _get_next_timer = nullptr; @@ -52,7 +49,6 @@ class AnalyticSourceOperatorX final : public OperatorX { bool is_source() const override { return true; } - // Status init(const TPlanNode& tnode, RuntimeState* state) override; Status open(RuntimeState* state) override; private: From 92049e4c2c4946c68d5f04e2ddd7108f9fd51e8d Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Thu, 2 Jan 2025 15:19:04 +0800 Subject: [PATCH 03/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 58 ++++++++++++------- be/src/pipeline/exec/analytic_sink_operator.h | 4 +- .../aggregate_function_window.h | 12 ++-- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 8e8b145e1fe232..de4e0162d8d11d 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -111,15 +111,17 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _agg_input_columns[i][j] = _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } } - _partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); - _partition_by_column_idxs.resize(p._partition_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); i++) { + _partition_exprs_size = p._partition_by_eq_expr_ctxs.size(); + _partition_by_eq_expr_ctxs.resize(_partition_exprs_size); + _partition_by_column_idxs.resize(_partition_exprs_size); + for (size_t i = 0; i < _partition_exprs_size; i++) { RETURN_IF_ERROR( p._partition_by_eq_expr_ctxs[i]->clone(state, _partition_by_eq_expr_ctxs[i])); } - _order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); - _order_by_column_idxs.resize(p._order_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _order_by_eq_expr_ctxs.size(); i++) { + _order_by_exprs_size = p._order_by_eq_expr_ctxs.size(); + _order_by_eq_expr_ctxs.resize(_order_by_exprs_size); + _order_by_column_idxs.resize(_order_by_exprs_size); + for (size_t i = 0; i < _order_by_exprs_size; i++) { RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); } _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, @@ -144,11 +146,14 @@ Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { } Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { + if (!_has_input_data()) { + return Status::OK(); + } do { auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - auto remain_size = _current_row_position - current_block_base_pos - batch_size; - + // auto remain_size = _current_row_position - current_block_base_pos - batch_size; + auto remain_size = batch_size - (_current_row_position - current_block_base_pos); _init_result_columns(); _get_partition_by_end(); while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { @@ -183,11 +188,14 @@ Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { } Status AnalyticSinkLocalState::_get_next_for_rows() { + if (!_has_input_data()) { + return Status::OK(); + } do { auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - auto remain_size = _current_row_position - current_block_base_pos - batch_size; - + // auto remain_size = _current_row_position - current_block_base_pos - batch_size; + auto remain_size = batch_size - (_current_row_position - current_block_base_pos); _init_result_columns(); _get_partition_by_end(); while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { @@ -274,13 +282,14 @@ Status AnalyticSinkLocalState::_get_next_for_range() { if (!_order_by_pose.is_ended) { break; } - // maybe need break the loop if (_current_row_position < _order_by_pose.end.pos) { // real frame is [partition_start, order_by_end] // but the real deal with frame is [order_by_start, order_by_end] - _execute_for_win_func(_order_by_pose.start.pos, _order_by_pose.end.pos, - _order_by_pose.start.pos, _order_by_pose.end.pos); + _execute_for_win_func( + _partition_by_pose.start.pos, _partition_by_pose.end.pos, + // _execute_for_win_func(_order_by_pose.start.pos, _order_by_pose.end.pos, + _order_by_pose.start.pos, _order_by_pose.end.pos); } while (_current_row_position < _order_by_pose.end.pos) { @@ -295,7 +304,6 @@ Status AnalyticSinkLocalState::_get_next_for_range() { _insert_result_info(real_deal_with_width); _current_row_position += real_deal_with_width; - if (_current_row_position - current_block_base_pos >= batch_size) { vectorized::Block block; RETURN_IF_ERROR(output_current_block(&block)); @@ -303,7 +311,7 @@ Status AnalyticSinkLocalState::_get_next_for_range() { } } - if (_partition_by_pose.is_ended && _current_row_position == _order_by_pose.end.pos) { + if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { has_finish_current_partition = true; _reset_state_for_next_partition(); } else { @@ -369,6 +377,8 @@ Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { DCHECK(_parent->cast()._change_to_nullable_flags.size() == _result_window_columns.size()); for (size_t i = 0; i < _result_window_columns.size(); ++i) { + DCHECK(_result_window_columns[i]); + DCHECK(_agg_functions[i]); if (_parent->cast()._change_to_nullable_flags[i]) { block->insert({make_nullable(std::move(_result_window_columns[i])), make_nullable(_agg_functions[i]->data_type()), ""}); @@ -383,7 +393,7 @@ Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { } void AnalyticSinkLocalState::_init_result_columns() { - if (_current_row_position - _input_block_first_row_positions[_output_block_index] == 0) { + if (_current_row_position == _input_block_first_row_positions[_output_block_index]) { _result_window_columns.resize(_agg_functions_size); for (size_t i = 0; i < _agg_functions_size; ++i) { _result_window_columns[i] = @@ -417,6 +427,9 @@ BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockR BlockRowPos end, bool need_check_first) { int64_t start_init_row_num = start.row_num; + DCHECK_LT(start.block_num, _input_blocks.size()); + DCHECK_LT(idx, _input_blocks[start.block_num].columns()) + << _input_blocks[start.block_num].dump_structure(); vectorized::ColumnPtr start_column = _input_blocks[start.block_num].get_by_position(idx).column; vectorized::ColumnPtr start_next_block_column = start_column; @@ -690,21 +703,22 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); local_state._input_eos = eos; - if (input_block->rows() > 0) { - RETURN_IF_ERROR(_add_input_block(state, input_block)); - RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); - } + RETURN_IF_ERROR(_add_input_block(state, input_block)); + RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); + if (local_state._input_eos) { - local_state._dependency->set_ready_to_read(); // ready for source to read std::unique_lock lc(local_state._shared_state->sink_eos_lock); local_state._shared_state->sink_eos = true; - return Status::OK(); + local_state._dependency->set_ready_to_read(); // ready for source to read } return Status::OK(); } Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, vectorized::Block* input_block) { + if (input_block->rows() <= 0) { + return Status::OK(); + } auto& local_state = get_local_state(state); local_state._input_block_first_row_positions.emplace_back(local_state._input_total_rows); size_t block_rows = input_block->rows(); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 6dae2de3fe0703..c411b35878846e 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -81,8 +81,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; - const size_t _partition_exprs_size = 0; - const size_t _order_by_exprs_size = 0; + size_t _partition_exprs_size = 0; + size_t _order_by_exprs_size = 0; size_t _agg_functions_size = 0; vectorized::AggregateDataPtr _fn_place_ptr = nullptr; diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.h b/be/src/vec/aggregate_functions/aggregate_function_window.h index 5d449318b7d2f5..ca5b3bb07652bb 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.h +++ b/be/src/vec/aggregate_functions/aggregate_function_window.h @@ -86,8 +86,8 @@ class WindowFunctionRowNumber final struct RankData { int64_t rank = 0; - int64_t count = 0; - int64_t peer_group_start = 0; + int64_t count = 1; + int64_t peer_group_start = -1; }; class WindowFunctionRank final : public IAggregateFunctionDataHelper { @@ -131,7 +131,7 @@ class WindowFunctionRank final : public IAggregateFunctionDataHelper Date: Fri, 3 Jan 2025 15:56:22 +0800 Subject: [PATCH 04/20] update --- be/src/pipeline/dependency.h | 34 +- .../pipeline/exec/analytic_sink_operator.cpp | 454 ++++++++---------- be/src/pipeline/exec/analytic_sink_operator.h | 31 +- 3 files changed, 206 insertions(+), 313 deletions(-) diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 6989389535110c..ec51de9a1f83ce 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -549,24 +549,9 @@ struct MultiCastSharedState : public BasicSharedState { std::unique_ptr multi_cast_data_streamer; }; -struct BlockRowPos { - int64_t block_num {}; //the pos at which block - int64_t row_num {}; //the pos at which row - int64_t pos {}; //pos = all blocks size + row_num - std::string debug_string() const { - std::string res = "\t block_num: "; - res += std::to_string(block_num); - res += "\t row_num: "; - res += std::to_string(row_num); - res += "\t pos: "; - res += std::to_string(pos); - return res; - } -}; - struct BoundaryPose { - BlockRowPos start; - BlockRowPos end; + int64_t start = 0; + int64_t end = 0; bool is_ended = false; }; @@ -575,25 +560,10 @@ struct AnalyticSharedState : public BasicSharedState { public: AnalyticSharedState() = default; - - // int64_t current_row_position = 0; - // BlockRowPos partition_by_end; - // int64_t input_total_rows = 0; - // BlockRowPos all_block_end; - // std::vector input_blocks; - // bool input_eos = false; - // BlockRowPos found_partition_end; - // std::vector origin_cols; - // std::vector input_block_first_row_positions; - // std::vector> agg_input_columns; - std::queue blocks_buffer; std::mutex buffer_mutex; bool sink_eos = false; std::mutex sink_eos_lock; - // TODO: maybe global? - // std::vector partition_by_column_idxs; - // std::vector order_by_column_idxs; }; struct JoinSharedState : public BasicSharedState { diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index de4e0162d8d11d..33904ec446849d 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -20,6 +20,7 @@ #include +#include #include #include "pipeline/exec/operator.h" @@ -42,24 +43,31 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _agg_arena_pool = std::make_unique(); auto& p = _parent->cast(); if (!p._has_window) { //haven't set window, Unbounded: [unbounded preceding,unbounded following] + // For window frame `ROWS|RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; } else if (p._has_range_window) { - // RANGE windows must have UNBOUNDED PRECEDING - // RANGE window end bound must be CURRENT ROW or UNBOUNDED FOLLOWING - if (!p._has_window_end) { //haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] + if (!p._has_window_start && !p._has_window_end) { _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; - } else { - _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_range; + if (!p._has_window_start && + p._window.window_end.type == TAnalyticWindowBoundaryType::CURRENT_ROW) { + // For window frame `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_range; + } else { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_sliding_rows; + } } } else { - //haven't set start and end, same as PARTITION + // haven't set start and end, same as PARTITION if (!p._has_window_start && !p._has_window_end) { _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_partition; - } else if (!p._has_window_start) { - _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_rows; } else { - _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_sliding_rows; + if (!p._has_window_start && + p._window.window_end.type == TAnalyticWindowBoundaryType::CURRENT_ROW) { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_unbounded_rows; + } else { + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_sliding_rows; + } } if (p._has_window_start) { //calculate start boundary @@ -111,19 +119,25 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _agg_input_columns[i][j] = _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } } + _partition_exprs_size = p._partition_by_eq_expr_ctxs.size(); _partition_by_eq_expr_ctxs.resize(_partition_exprs_size); - _partition_by_column_idxs.resize(_partition_exprs_size); + _partition_by_columns.resize(_partition_exprs_size); for (size_t i = 0; i < _partition_exprs_size; i++) { RETURN_IF_ERROR( p._partition_by_eq_expr_ctxs[i]->clone(state, _partition_by_eq_expr_ctxs[i])); + _partition_by_columns[i] = + _partition_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); } + _order_by_exprs_size = p._order_by_eq_expr_ctxs.size(); _order_by_eq_expr_ctxs.resize(_order_by_exprs_size); - _order_by_column_idxs.resize(_order_by_exprs_size); + _order_by_columns.resize(_order_by_exprs_size); for (size_t i = 0; i < _order_by_exprs_size; i++) { RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); + _order_by_columns[i] = _order_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); } + _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, p._align_aggregate_states); _create_agg_status(); @@ -146,87 +160,88 @@ Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { } Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { - if (!_has_input_data()) { - return Status::OK(); - } - do { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - // auto remain_size = _current_row_position - current_block_base_pos - batch_size; - auto remain_size = batch_size - (_current_row_position - current_block_base_pos); - _init_result_columns(); - _get_partition_by_end(); - while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { - // return {_current_row_position + _rows_start_offset, _current_row_position + _rows_end_offset + 1}; - const bool is_n_following_frame = _rows_end_offset > 0; - auto range_start = _current_row_position + _rows_start_offset; - auto range_end = _current_row_position + _rows_end_offset + 1; - // For window clause like `ROWS BETWEEN N PRECEDING AND M FOLLOWING`, - // if the current chunk has not reach the partition boundary, it may need more data. - if (is_n_following_frame && !_partition_by_pose.is_ended && - range_end > _partition_by_pose.end.pos) { - return Status::OK(); + while (_has_input_data()) { + { + SCOPED_TIMER(_evaluation_timer); + _get_partition_by_end(); + if (!_partition_by_pose.is_ended) { + break; } - _reset_agg_status(); - _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, - range_start, range_end); - _insert_result_info(1); - _current_row_position++; - remain_size--; - } - if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { - _reset_state_for_next_partition(); - } + _init_result_columns(); + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); + while (_current_row_position < _partition_by_pose.end) { + int64_t current_row_start = 0; + int64_t current_row_end = _current_row_position + _rows_end_offset + 1; + + _reset_agg_status(); + if (!_parent->cast()._window.__isset.window_start) { + current_row_start = _partition_by_pose.start; + } else { + current_row_start = _current_row_position + _rows_start_offset; + } + // Make sure range_start <= range_end + current_row_start = std::min(current_row_start, current_row_end); + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + current_row_start, current_row_end); + _insert_result_info(1); + _current_row_position++; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; + } + } + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + if (_current_row_position == _partition_by_pose.end) { + _reset_state_for_next_partition(); + } } - } while (_has_input_data()); + } return Status::OK(); } -Status AnalyticSinkLocalState::_get_next_for_rows() { - if (!_has_input_data()) { - return Status::OK(); - } - do { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - // auto remain_size = _current_row_position - current_block_base_pos - batch_size; - auto remain_size = batch_size - (_current_row_position - current_block_base_pos); - _init_result_columns(); - _get_partition_by_end(); - while (_current_row_position < _partition_by_pose.end.pos && remain_size > 0) { - // return {_partition.start, _current_row_position + _rows_end_offset + 1}; - const bool is_n_following_frame = _rows_end_offset > 0; - auto current_row_end = _current_row_position + _rows_end_offset + 1; - // if the current chunk has not reach the partition boundary, it may need more data. - if (is_n_following_frame && !_partition_by_pose.is_ended && - current_row_end > _partition_by_pose.end.pos) { - return Status::OK(); +Status AnalyticSinkLocalState::_get_next_for_unbounded_rows() { + while (_has_input_data()) { + { + SCOPED_TIMER(_evaluation_timer); + _get_partition_by_end(); + if (!_partition_by_pose.is_ended) { + break; } + _init_result_columns(); + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - if (is_n_following_frame && _current_row_position == _partition_by_pose.start.pos) { - _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, - _partition_by_pose.start.pos, current_row_end - 1); + while (_current_row_position < _partition_by_pose.end) { + // [preceding, current_row], [current_row, following] rewrite it's same + // as could reuse the previous calculate result, so don't call _reset_agg_status function + // going on calculate, add up data, no need to reset state + int64_t current_row_start = _current_row_position; + int64_t current_row_end = _current_row_position + 1; + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + current_row_start, current_row_end); + _insert_result_info(1); + _current_row_position++; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; + } + } + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + if (_current_row_position == _partition_by_pose.end) { + _reset_state_for_next_partition(); } - _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, - current_row_end - 1, current_row_end); - _insert_result_info(1); - _current_row_position++; - remain_size--; - } - if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { - _reset_state_for_next_partition(); - } - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); } - } while (_has_input_data()); + } return Status::OK(); } @@ -239,16 +254,16 @@ Status AnalyticSinkLocalState::_get_next_for_partition() { break; } _init_result_columns(); - if (_current_row_position == _partition_by_pose.start.pos) { - _execute_for_win_func(_partition_by_pose.start.pos, _partition_by_pose.end.pos, - _partition_by_pose.start.pos, _partition_by_pose.end.pos); + if (_current_row_position == _partition_by_pose.start) { + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _partition_by_pose.start, _partition_by_pose.end); } auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; // the end pos maybe after multis blocks, but should output by batch size and should not exceed partition end auto window_end_pos = _current_row_position + batch_size; - window_end_pos = std::min(window_end_pos, _partition_by_pose.end.pos); + window_end_pos = std::min(window_end_pos, _partition_by_pose.end); auto previous_window_frame_width = _current_row_position - current_block_base_pos; auto current_window_frame_width = window_end_pos - current_block_base_pos; @@ -264,7 +279,7 @@ Status AnalyticSinkLocalState::_get_next_for_partition() { RETURN_IF_ERROR(output_current_block(&block)); _refresh_buffer_and_dependency_state(&block); } - if (_current_row_position == _partition_by_pose.end.pos) { + if (_current_row_position == _partition_by_pose.end) { _reset_state_for_next_partition(); } } @@ -273,49 +288,54 @@ Status AnalyticSinkLocalState::_get_next_for_partition() { } Status AnalyticSinkLocalState::_get_next_for_range() { - bool has_finish_current_partition = true; while (_has_input_data()) { - if (has_finish_current_partition) { + { + SCOPED_TIMER(_evaluation_timer); _get_partition_by_end(); - } - _update_order_by_range(); - if (!_order_by_pose.is_ended) { - break; - } - // maybe need break the loop - if (_current_row_position < _order_by_pose.end.pos) { - // real frame is [partition_start, order_by_end] - // but the real deal with frame is [order_by_start, order_by_end] - _execute_for_win_func( - _partition_by_pose.start.pos, _partition_by_pose.end.pos, - // _execute_for_win_func(_order_by_pose.start.pos, _order_by_pose.end.pos, - _order_by_pose.start.pos, _order_by_pose.end.pos); - } - - while (_current_row_position < _order_by_pose.end.pos) { + if (!_partition_by_pose.is_ended) { + break; + } _init_result_columns(); auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + LOG(INFO) << "asd _get_next_for_range: " << _current_row_position << " " << batch_size + << " " << current_block_base_pos; + LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " + << _partition_by_pose.start << " " << _partition_by_pose.end; + while (_current_row_position < _partition_by_pose.end) { + _update_order_by_range(); + if (_current_row_position == _order_by_pose.start) { + LOG(INFO) << "asd1: " << _partition_by_pose.start << " " + << _partition_by_pose.end << " " << _order_by_pose.start << " " + << _order_by_pose.end; + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _order_by_pose.start, _order_by_pose.end); + } + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; + current_window_frame_width = + std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = + current_window_frame_width - previous_window_frame_width; + + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + LOG(INFO) << "asd2: " << previous_window_frame_width << " " + << current_window_frame_width << " " << real_deal_with_width << " " + << _current_row_position; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; + } + } - auto previous_window_frame_width = _current_row_position - current_block_base_pos; - auto current_window_frame_width = _order_by_pose.end.pos - current_block_base_pos; - current_window_frame_width = std::min(current_window_frame_width, batch_size); - auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; - - _insert_result_info(real_deal_with_width); - _current_row_position += real_deal_with_width; if (_current_row_position - current_block_base_pos >= batch_size) { vectorized::Block block; RETURN_IF_ERROR(output_current_block(&block)); _refresh_buffer_and_dependency_state(&block); } - } - - if (_partition_by_pose.is_ended && _current_row_position == _partition_by_pose.end.pos) { - has_finish_current_partition = true; - _reset_state_for_next_partition(); - } else { - has_finish_current_partition = false; + if (_current_row_position == _partition_by_pose.end) { + _reset_state_for_next_partition(); + } } } return Status::OK(); @@ -336,6 +356,7 @@ void AnalyticSinkLocalState::_execute_for_win_func(int64_t partition_start, int6 agg_columns.data(), _agg_arena_pool.get()); // If the end is not greater than the start, the current window should be empty. + // _current_window_empty = false; _current_window_empty = std::min(frame_end, partition_end) <= std::max(frame_start, partition_start); } @@ -395,9 +416,9 @@ Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { void AnalyticSinkLocalState::_init_result_columns() { if (_current_row_position == _input_block_first_row_positions[_output_block_index]) { _result_window_columns.resize(_agg_functions_size); + // return type create result column for (size_t i = 0; i < _agg_functions_size; ++i) { - _result_window_columns[i] = - _agg_functions[i]->data_type()->create_column(); //return type + _result_window_columns[i] = _agg_functions[i]->data_type()->create_column(); } } } @@ -418,89 +439,13 @@ void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Bl } void AnalyticSinkLocalState::_reset_state_for_next_partition() { _partition_by_pose.start = _partition_by_pose.end; - _current_row_position = _partition_by_pose.start.pos; + _current_row_position = _partition_by_pose.start; _reset_agg_status(); } -//_partition_by_columns,_order_by_columns save in blocks, so if need to calculate the boundary, may find in which blocks firstly -BlockRowPos AnalyticSinkLocalState::_compare_row_to_find_end(int64_t idx, BlockRowPos start, - BlockRowPos end, - bool need_check_first) { - int64_t start_init_row_num = start.row_num; - DCHECK_LT(start.block_num, _input_blocks.size()); - DCHECK_LT(idx, _input_blocks[start.block_num].columns()) - << _input_blocks[start.block_num].dump_structure(); - vectorized::ColumnPtr start_column = _input_blocks[start.block_num].get_by_position(idx).column; - vectorized::ColumnPtr start_next_block_column = start_column; - - DCHECK_LE(start.block_num, end.block_num); - DCHECK_LE(start.block_num, _input_blocks.size() - 1); - int64_t start_block_num = start.block_num; - int64_t end_block_num = end.block_num; - int64_t mid_block_num = end.block_num; - // To fix this problem: https://github.com/apache/doris/issues/15951 - // in this case, the partition by column is last row of block, so it's pointed to a new block at row = 0, range is: [left, right) - // From the perspective of order by column, the two values are exactly equal. - // so the range will be get wrong because it's compare_at == 0 with next block at row = 0 - if (need_check_first && end.block_num > 0 && end.row_num == 0) { - end.block_num--; - end_block_num--; - end.row_num = _input_blocks[end_block_num].rows(); - } - //binary search find in which block - while (start_block_num < end_block_num) { - mid_block_num = (start_block_num + end_block_num + 1) >> 1; - start_next_block_column = _input_blocks[mid_block_num].get_by_position(idx).column; - //Compares (*this)[n] and rhs[m], this: start[init_row] rhs: mid[0] - if (start_column->compare_at(start_init_row_num, 0, *start_next_block_column, 1) == 0) { - start_block_num = mid_block_num; - } else { - end_block_num = mid_block_num - 1; - } - } - - // have check the start.block_num: start_column[start_init_row_num] with mid_block_num start_next_block_column[0] - // now next block must not be result, so need check with end_block_num: start_next_block_column[last_row] - if (end_block_num == mid_block_num - 1) { - start_next_block_column = _input_blocks[end_block_num].get_by_position(idx).column; - int64_t block_size = _input_blocks[end_block_num].rows(); - if ((start_column->compare_at(start_init_row_num, block_size - 1, *start_next_block_column, - 1) == 0)) { - start.block_num = end_block_num + 1; - start.row_num = 0; - return start; - } - } - - //check whether need get column again, maybe same as first init - // if the start_block_num have move to forword, so need update start block num and compare it from row_num=0 - if (start_block_num != start.block_num) { - start_init_row_num = 0; - start.block_num = start_block_num; - start_column = _input_blocks[start.block_num].get_by_position(idx).column; - } - //binary search, set start and end pos - int64_t start_pos = start_init_row_num; - int64_t end_pos = _input_blocks[start.block_num].rows(); - //if end_block_num haven't moved, only start_block_num go to the end block - //so could use the end.row_num for binary search - if (start.block_num == end.block_num) { - end_pos = end.row_num; - } - while (start_pos < end_pos) { - int64_t mid_pos = (start_pos + end_pos) >> 1; - if (start_column->compare_at(start_init_row_num, mid_pos, *start_column, 1)) { - end_pos = mid_pos; - } else { - start_pos = mid_pos + 1; - } - } - start.row_num = start_pos; //update row num, return the find end - return start; -} - void AnalyticSinkLocalState::_update_order_by_range() { - if (_order_by_pose.is_ended && _current_row_position < _order_by_pose.end.pos) { + // still have more data + if (_order_by_pose.is_ended && _current_row_position < _order_by_pose.end) { return; } @@ -509,26 +454,22 @@ void AnalyticSinkLocalState::_update_order_by_range() { } _order_by_pose.end = _partition_by_pose.end; - for (size_t i = 0; i < _order_by_exprs_size; ++i) { - _order_by_pose.end = _compare_row_to_find_end( - _order_by_column_idxs[i], _order_by_pose.start, _order_by_pose.end, true); - } - _order_by_pose.start.pos = _input_block_first_row_positions[_order_by_pose.start.block_num] + - _order_by_pose.start.row_num; - _order_by_pose.end.pos = _input_block_first_row_positions[_order_by_pose.end.block_num] + - _order_by_pose.end.row_num; - // `_order_by_end` will be assigned to `_order_by_start` next time, - // so make it a valid position. - if (_order_by_pose.end.row_num == _input_blocks[_order_by_pose.end.block_num].rows()) { - _order_by_pose.end.block_num++; - _order_by_pose.end.row_num = 0; + { + if (_order_by_pose.start < _order_by_pose.end) { + for (size_t i = 0; i < _order_by_exprs_size; ++i) { + _order_by_pose.end = + find_first_not_equal(_order_by_columns[i].get(), _order_by_pose.start, + _order_by_pose.start, _order_by_pose.end); + } + } } - if (_order_by_pose.end.pos < _partition_by_pose.end.pos) { + if (_order_by_pose.end < _partition_by_pose.end) { _order_by_pose.is_ended = true; + // here maybe find candidate ends; return; } - DCHECK_EQ(_partition_by_pose.end.pos, _order_by_pose.end.pos); + DCHECK_EQ(_partition_by_pose.end, _order_by_pose.end); if (_partition_by_pose.is_ended) { _order_by_pose.is_ended = true; return; @@ -538,34 +479,59 @@ void AnalyticSinkLocalState::_update_order_by_range() { void AnalyticSinkLocalState::_get_partition_by_end() { //still have data, return partition_by_end directly - if (_partition_by_pose.is_ended && _current_row_position < _partition_by_pose.end.pos) { + if (_partition_by_pose.is_ended && _current_row_position < _partition_by_pose.end) { return; } //no partition_by, the all block is end if (_partition_by_eq_expr_ctxs.empty() || (_input_total_rows == 0)) { - _partition_by_pose.end.block_num = _input_blocks.size() - 1; - _partition_by_pose.end.row_num = _input_blocks.back().rows(); - _partition_by_pose.end.pos = _input_total_rows; + _partition_by_pose.end = _input_total_rows; //maybe need check removed rows _partition_by_pose.is_ended = _input_eos; return; } - BlockRowPos cal_end = _all_block_end; - //have partition_by, binary search the partition end - for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); ++i) { - cal_end = _compare_row_to_find_end(_partition_by_column_idxs[i], _partition_by_pose.end, - cal_end); + const auto start = _partition_by_pose.end; + const auto target = (_partition_by_pose.end || _partition_by_pose.end == 0) + ? _partition_by_pose.end + : _partition_by_pose.end - 1; + DCHECK(_partition_exprs_size > 0); + const auto partition_column_rows = _partition_by_columns[0]->size(); + _partition_by_pose.end = partition_column_rows; + + { + if (start < _partition_by_pose.end) { + for (size_t i = 0; i < _partition_exprs_size; ++i) { + _partition_by_pose.end = find_first_not_equal( + _partition_by_columns[i].get(), target, start, _partition_by_pose.end); + } + } } - cal_end.pos = _input_block_first_row_positions[cal_end.block_num] + cal_end.row_num; - _partition_by_pose.end = cal_end; - if (_partition_by_pose.end.pos < _input_total_rows) { + + if (_partition_by_pose.end < partition_column_rows) { _partition_by_pose.is_ended = true; + // here maybe find candidate ends; return; } - DCHECK_EQ(_partition_by_pose.end.pos, _input_total_rows); + + DCHECK_EQ(_partition_by_pose.end, partition_column_rows); _partition_by_pose.is_ended = _input_eos; } +int64_t AnalyticSinkLocalState::find_first_not_equal(vectorized::IColumn* column, int64_t target, + int64_t start, int64_t end) { + while (start + 1 < end) { + int64_t mid = start + (end - start) / 2; + if (column->compare_at(target, mid, *column, 1) == 0) { + start = mid; + } else { + end = mid; + } + } + if (column->compare_at(target, end - 1, *column, 1) == 0) { + return end; + } + return end - 1; +} + AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, const DescriptorTbl& descs, bool require_bucket_distribution) @@ -587,18 +553,6 @@ AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, _has_window_start(tnode.analytic_node.window.__isset.window_start), _has_window_end(tnode.analytic_node.window.__isset.window_end) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; - _fn_scope = AnalyticFnScope::PARTITION; - if (_has_window && _has_range_window) { - // haven't set end, so same as PARTITION, [unbounded preceding, unbounded following] - if (_has_window_end) { - _fn_scope = AnalyticFnScope::RANGE; // range: [unbounded preceding,current row] - } - } else if (_has_window) { - if (_has_window_start || _has_window_end) { - // both not set, same as PARTITION - _fn_scope = AnalyticFnScope::ROWS; - } - } } Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { @@ -724,10 +678,6 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, size_t block_rows = input_block->rows(); local_state._input_total_rows += block_rows; - local_state._all_block_end.block_num = local_state._input_blocks.size(); - local_state._all_block_end.row_num = block_rows; - local_state._all_block_end.pos = local_state._input_total_rows; - // record origin columns, maybe be after this, could cast some column but no need to output if (local_state._input_col_ids.empty()) { for (int c = 0; c < input_block->columns(); ++c) { @@ -749,22 +699,18 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, { SCOPED_TIMER(local_state._compute_partition_by_timer); for (size_t i = 0; i < local_state._partition_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._partition_by_eq_expr_ctxs[i]->execute(input_block, - &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._partition_by_column_idxs[i] = result_col_id; + RETURN_IF_ERROR( + _insert_range_column(input_block, local_state._partition_by_eq_expr_ctxs[i], + local_state._partition_by_columns[i].get(), block_rows)); } } { SCOPED_TIMER(local_state._compute_order_by_timer); for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR( - local_state._order_by_eq_expr_ctxs[i]->execute(input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._order_by_column_idxs[i] = result_col_id; + RETURN_IF_ERROR(_insert_range_column(input_block, local_state._order_by_eq_expr_ctxs[i], + local_state._order_by_columns[i].get(), + block_rows)); } } diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index c411b35878846e..4096f6e1786561 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -42,26 +42,14 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; @@ -97,33 +85,22 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _result_window_columns; int64_t _rows_start_offset = 0; int64_t _rows_end_offset = 0; - std::vector _partition_by_column_idxs; - std::vector _order_by_column_idxs; - - // BlockRowPos _order_by_start; - // BlockRowPos _order_by_end; - // BlockRowPos _partition_by_start; - // BlockRowPos _partition_by_end; - BoundaryPose _partition_by_pose; BoundaryPose _order_by_pose; int64_t _current_row_position = 0; - // BlockRowPos partition_by_end; int64_t _input_total_rows = 0; - BlockRowPos _all_block_end; std::vector _input_blocks; bool _input_eos = false; - // BlockRowPos found_partition_end; std::vector _input_col_ids; std::vector _input_block_first_row_positions; std::vector> _agg_input_columns; + std::vector _partition_by_columns; + std::vector _order_by_columns; RuntimeProfile::Counter* _evaluation_timer = nullptr; RuntimeProfile::Counter* _compute_agg_data_timer = nullptr; From 82e15369b27263f2e2ff771485ec34fb5edb08f9 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 10:33:11 +0800 Subject: [PATCH 05/20] update --- be/src/pipeline/exec/analytic_sink_operator.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 4096f6e1786561..52651a8ba19520 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -59,8 +59,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState Date: Mon, 6 Jan 2025 10:39:25 +0800 Subject: [PATCH 06/20] update --- be/src/pipeline/exec/analytic_sink_operator.h | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 52651a8ba19520..1e16117050cbc4 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -162,7 +162,6 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorX _partition_exprs; - AnalyticFnScope _fn_scope; TAnalyticWindow _window; bool _has_window; bool _has_range_window; From 38ebb0319c62c885b2be15dc4b29f539dcea1391 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 14:14:13 +0800 Subject: [PATCH 07/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 103 +++++++++++++++--- be/src/pipeline/exec/analytic_sink_operator.h | 17 ++- 2 files changed, 101 insertions(+), 19 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 33904ec446849d..d7040bf20c2324 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -37,6 +37,7 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); + _compute_order_by_function_timer = ADD_TIMER(profile(), "ComputeOrderByFunctionTime"); _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); @@ -52,9 +53,9 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf if (!p._has_window_start && p._window.window_end.type == TAnalyticWindowBoundaryType::CURRENT_ROW) { // For window frame `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` - _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_range; + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_unbounded_range; } else { - _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_sliding_rows; + _executor.get_next_impl = &AnalyticSinkLocalState::_get_next_for_range_between; } } } else { @@ -138,6 +139,14 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _order_by_columns[i] = _order_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); } + // only support one order by column, so need two columns upper and lower bound + _range_result_columns.resize(2); + // should change the order by exprs to range column, IF FE have support range window + for (size_t i = 0; i < _order_by_exprs_size; i++) { + RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); + _range_result_columns[i] = _order_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); + } + _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, p._align_aggregate_states); _create_agg_status(); @@ -287,7 +296,61 @@ Status AnalyticSinkLocalState::_get_next_for_partition() { return Status::OK(); } -Status AnalyticSinkLocalState::_get_next_for_range() { +Status AnalyticSinkLocalState::_get_next_for_unbounded_range() { + while (_has_input_data()) { + { + SCOPED_TIMER(_evaluation_timer); + _get_partition_by_end(); + if (!_partition_by_pose.is_ended) { + break; + } + _init_result_columns(); + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + LOG(INFO) << "asd _get_next_for_unbounded_range: " << _current_row_position << " " + << batch_size << " " << current_block_base_pos; + LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " + << _partition_by_pose.start << " " << _partition_by_pose.end; + while (_current_row_position < _partition_by_pose.end) { + _update_order_by_range(); + if (_current_row_position == _order_by_pose.start) { + LOG(INFO) << "asd1: " << _partition_by_pose.start << " " + << _partition_by_pose.end << " " << _order_by_pose.start << " " + << _order_by_pose.end; + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _order_by_pose.start, _order_by_pose.end); + } + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; + current_window_frame_width = + std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = + current_window_frame_width - previous_window_frame_width; + + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + LOG(INFO) << "asd2: " << previous_window_frame_width << " " + << current_window_frame_width << " " << real_deal_with_width << " " + << _current_row_position; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; + } + } + + if (_current_row_position - current_block_base_pos >= batch_size) { + vectorized::Block block; + RETURN_IF_ERROR(output_current_block(&block)); + _refresh_buffer_and_dependency_state(&block); + } + if (_current_row_position == _partition_by_pose.end) { + _reset_state_for_next_partition(); + } + } + } + return Status::OK(); +} + +Status AnalyticSinkLocalState::_get_next_for_range_between() { while (_has_input_data()) { { SCOPED_TIMER(_evaluation_timer); @@ -298,8 +361,8 @@ Status AnalyticSinkLocalState::_get_next_for_range() { _init_result_columns(); auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - LOG(INFO) << "asd _get_next_for_range: " << _current_row_position << " " << batch_size - << " " << current_block_base_pos; + LOG(INFO) << "asd _get_next_for_unbounded_range: " << _current_row_position << " " + << batch_size << " " << current_block_base_pos; LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " << _partition_by_pose.start << " " << _partition_by_pose.end; while (_current_row_position < _partition_by_pose.end) { @@ -437,6 +500,7 @@ void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Bl // buffer have push data, could signal the source to read _dependency->set_ready_to_read(); } + void AnalyticSinkLocalState::_reset_state_for_next_partition() { _partition_by_pose.start = _partition_by_pose.end; _current_row_position = _partition_by_pose.start; @@ -457,9 +521,9 @@ void AnalyticSinkLocalState::_update_order_by_range() { { if (_order_by_pose.start < _order_by_pose.end) { for (size_t i = 0; i < _order_by_exprs_size; ++i) { - _order_by_pose.end = - find_first_not_equal(_order_by_columns[i].get(), _order_by_pose.start, - _order_by_pose.start, _order_by_pose.end); + _order_by_pose.end = find_first_not_equal( + _order_by_columns[i].get(), _order_by_columns[i].get(), + _order_by_pose.start, _order_by_pose.start, _order_by_pose.end); } } } @@ -501,7 +565,8 @@ void AnalyticSinkLocalState::_get_partition_by_end() { if (start < _partition_by_pose.end) { for (size_t i = 0; i < _partition_exprs_size; ++i) { _partition_by_pose.end = find_first_not_equal( - _partition_by_columns[i].get(), target, start, _partition_by_pose.end); + _partition_by_columns[i].get(), _partition_by_columns[i].get(), target, + start, _partition_by_pose.end); } } } @@ -516,17 +581,19 @@ void AnalyticSinkLocalState::_get_partition_by_end() { _partition_by_pose.is_ended = _input_eos; } -int64_t AnalyticSinkLocalState::find_first_not_equal(vectorized::IColumn* column, int64_t target, - int64_t start, int64_t end) { +// Compares (*this)[n] and rhs[m] +int64_t AnalyticSinkLocalState::find_first_not_equal(vectorized::IColumn* reference_column, + vectorized::IColumn* compared_column, + int64_t target, int64_t start, int64_t end) { while (start + 1 < end) { int64_t mid = start + (end - start) / 2; - if (column->compare_at(target, mid, *column, 1) == 0) { + if (reference_column->compare_at(target, mid, *compared_column, 1) == 0) { start = mid; } else { end = mid; } } - if (column->compare_at(target, end - 1, *column, 1) == 0) { + if (reference_column->compare_at(target, end - 1, *compared_column, 1) == 0) { return end; } return end - 1; @@ -714,6 +781,16 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, } } + { + SCOPED_TIMER(local_state._compute_order_by_function_timer); + // should change the order by exprs to range column, IF FE have support range window + for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { + RETURN_IF_ERROR(_insert_range_column(input_block, local_state._order_by_eq_expr_ctxs[i], + local_state._range_result_columns[i].get(), + block_rows)); + } + } + COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); local_state._input_blocks.emplace_back(std::move(*input_block)); return Status::OK(); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 1e16117050cbc4..15255b19d54d20 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -43,7 +43,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; + std::vector> _agg_input_columns; + std::vector _partition_by_columns; + std::vector _order_by_columns; + std::vector _range_result_columns; size_t _partition_exprs_size = 0; size_t _order_by_exprs_size = 0; size_t _agg_functions_size = 0; + bool _agg_functions_created = false; vectorized::AggregateDataPtr _fn_place_ptr = nullptr; std::unique_ptr _agg_arena_pool = nullptr; std::vector _agg_functions; @@ -82,7 +90,6 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _result_window_columns; @@ -97,14 +104,12 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _input_col_ids; std::vector _input_block_first_row_positions; - std::vector> _agg_input_columns; - std::vector _partition_by_columns; - std::vector _order_by_columns; RuntimeProfile::Counter* _evaluation_timer = nullptr; RuntimeProfile::Counter* _compute_agg_data_timer = nullptr; RuntimeProfile::Counter* _compute_partition_by_timer = nullptr; RuntimeProfile::Counter* _compute_order_by_timer = nullptr; + RuntimeProfile::Counter* _compute_order_by_function_timer = nullptr; RuntimeProfile::Counter* _execute_timer = nullptr; RuntimeProfile::Counter* _get_next_timer = nullptr; RuntimeProfile::Counter* _get_result_timer = nullptr; From f74d9795cdc58d6a5f607ff2e3dc3187b483c71a Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 15:25:07 +0800 Subject: [PATCH 08/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index d7040bf20c2324..ed3f0ea51fa3c7 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -366,26 +366,28 @@ Status AnalyticSinkLocalState::_get_next_for_range_between() { LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " << _partition_by_pose.start << " " << _partition_by_pose.end; while (_current_row_position < _partition_by_pose.end) { - _update_order_by_range(); - if (_current_row_position == _order_by_pose.start) { - LOG(INFO) << "asd1: " << _partition_by_pose.start << " " - << _partition_by_pose.end << " " << _order_by_pose.start << " " - << _order_by_pose.end; - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - _order_by_pose.start, _order_by_pose.end); + _reset_agg_status(); + if (!_parent->cast()._window.__isset.window_start) { + _order_by_pose.start = _partition_by_pose.start; + } else { + _order_by_pose.start = find_first_not_equal( + _range_result_columns[0].get(), _order_by_columns[0].get(), + _current_row_position, _order_by_pose.start, _partition_by_pose.end); } - auto previous_window_frame_width = _current_row_position - current_block_base_pos; - auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; - current_window_frame_width = - std::min(current_window_frame_width, batch_size); - auto real_deal_with_width = - current_window_frame_width - previous_window_frame_width; - _insert_result_info(real_deal_with_width); - _current_row_position += real_deal_with_width; - LOG(INFO) << "asd2: " << previous_window_frame_width << " " - << current_window_frame_width << " " << real_deal_with_width << " " - << _current_row_position; + if (!_parent->cast()._window.__isset.window_end) { + _order_by_pose.end = _partition_by_pose.end; + } else { + _order_by_pose.end = find_first_not_equal( + _range_result_columns[1].get(), _order_by_columns[0].get(), + _current_row_position, _order_by_pose.end, _partition_by_pose.end); + } + // Make sure range_start <= range_end + // current_row_start = std::min(current_row_start, current_row_end); + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _order_by_pose.start, _order_by_pose.end); + _insert_result_info(1); + _current_row_position++; if (_current_row_position - current_block_base_pos >= batch_size) { break; } @@ -398,6 +400,8 @@ Status AnalyticSinkLocalState::_get_next_for_range_between() { } if (_current_row_position == _partition_by_pose.end) { _reset_state_for_next_partition(); + _order_by_pose.start = _partition_by_pose.start; + _order_by_pose.end = _partition_by_pose.end; } } } @@ -554,7 +558,7 @@ void AnalyticSinkLocalState::_get_partition_by_end() { } const auto start = _partition_by_pose.end; - const auto target = (_partition_by_pose.end || _partition_by_pose.end == 0) + const auto target = (_partition_by_pose.is_ended || _partition_by_pose.end == 0) ? _partition_by_pose.end : _partition_by_pose.end - 1; DCHECK(_partition_exprs_size > 0); From cdd1bf1bb4e21a83831df5de49d182a5702e9c16 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 15:39:48 +0800 Subject: [PATCH 09/20] update3 --- be/src/pipeline/exec/analytic_sink_operator.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index ed3f0ea51fa3c7..a232fc64fe579f 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -140,10 +140,11 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { } // only support one order by column, so need two columns upper and lower bound - _range_result_columns.resize(2); + // _range_result_columns.resize(2); + _range_result_columns.resize(_order_by_exprs_size); // should change the order by exprs to range column, IF FE have support range window for (size_t i = 0; i < _order_by_exprs_size; i++) { - RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); + // RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); _range_result_columns[i] = _order_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); } From 0b15724b6841b68294040d10fb752f0861069d80 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 18:42:38 +0800 Subject: [PATCH 10/20] update statistics --- be/src/pipeline/dependency.h | 6 -- .../pipeline/exec/analytic_sink_operator.cpp | 62 ++++++++++++++++++- be/src/pipeline/exec/analytic_sink_operator.h | 51 ++++++++++++++- 3 files changed, 109 insertions(+), 10 deletions(-) diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index ec51de9a1f83ce..4c078094b98cd5 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -549,12 +549,6 @@ struct MultiCastSharedState : public BasicSharedState { std::unique_ptr multi_cast_data_streamer; }; -struct BoundaryPose { - int64_t start = 0; - int64_t end = 0; - bool is_ended = false; -}; - struct AnalyticSharedState : public BasicSharedState { ENABLE_FACTORY_CREATOR(AnalyticSharedState) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index a232fc64fe579f..6400f0de307beb 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -507,6 +507,8 @@ void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Bl } void AnalyticSinkLocalState::_reset_state_for_next_partition() { + _partition_statistics.update(_partition_by_pose.end - _partition_by_pose.start); + _peer_group_statistics.reset(); _partition_by_pose.start = _partition_by_pose.end; _current_row_position = _partition_by_pose.start; _reset_agg_status(); @@ -518,6 +520,19 @@ void AnalyticSinkLocalState::_update_order_by_range() { return; } + while (!_candidate_peer_group_ends.empty()) { + int64_t peek = _candidate_peer_group_ends.front(); + _candidate_peer_group_ends.pop(); + if (peek > _order_by_pose.end) { + _order_by_pose.start = _order_by_pose.end; + _order_by_pose.end = peek; + _order_by_pose.is_ended = true; + + _peer_group_statistics.update(_order_by_pose.end - _order_by_pose.start); + return; + } + } + if (_order_by_pose.is_ended) { _order_by_pose.start = _order_by_pose.end; } @@ -534,8 +549,9 @@ void AnalyticSinkLocalState::_update_order_by_range() { } if (_order_by_pose.end < _partition_by_pose.end) { + _peer_group_statistics.update(_order_by_pose.end - _order_by_pose.start); _order_by_pose.is_ended = true; - // here maybe find candidate ends; + _find_candidate_peer_group_ends(); return; } DCHECK_EQ(_partition_by_pose.end, _order_by_pose.end); @@ -558,6 +574,16 @@ void AnalyticSinkLocalState::_get_partition_by_end() { return; } + while (!_candidate_partition_ends.empty()) { + int64_t peek = _candidate_partition_ends.front(); + _candidate_partition_ends.pop(); + if (peek > _partition_by_pose.end) { + _partition_by_pose.end = peek; + _partition_by_pose.is_ended = true; + return; + } + } + const auto start = _partition_by_pose.end; const auto target = (_partition_by_pose.is_ended || _partition_by_pose.end == 0) ? _partition_by_pose.end @@ -586,6 +612,40 @@ void AnalyticSinkLocalState::_get_partition_by_end() { _partition_by_pose.is_ended = _input_eos; } +void AnalyticSinkLocalState::_find_candidate_partition_ends() { + if (!_partition_statistics.is_high_cardinality()) { + return; + } + + // SCOPED_TIMER(_partition_search_timer); + for (size_t i = _partition_by_pose.end + 1; i < _partition_by_columns[0]->size(); ++i) { + for (auto& column : _partition_by_columns) { + auto cmp = column->compare_at(i - 1, i, *column, 1); + if (cmp != 0) { + _candidate_partition_ends.push(i); + break; + } + } + } +} + +void AnalyticSinkLocalState::_find_candidate_peer_group_ends() { + if (!_peer_group_statistics.is_high_cardinality()) { + return; + } + + // SCOPED_TIMER(_peer_group_search_timer); + for (size_t i = _order_by_pose.end + 1; i < _partition_by_pose.end; ++i) { + for (auto& column : _order_by_columns) { + auto cmp = column->compare_at(i - 1, i, *column, 1); + if (cmp != 0) { + _candidate_peer_group_ends.push(i); + break; + } + } + } +} + // Compares (*this)[n] and rhs[m] int64_t AnalyticSinkLocalState::find_first_not_equal(vectorized::IColumn* reference_column, vectorized::IColumn* compared_column, diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 15255b19d54d20..6766521298b525 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -27,7 +27,46 @@ namespace doris { #include "common/compile_check_begin.h" namespace pipeline { class AnalyticSinkOperatorX; -enum AnalyticFnScope { PARTITION, RANGE, ROWS }; + +struct BoundaryPose { + int64_t start = 0; + int64_t end = 0; + bool is_ended = false; +}; + +class SegmentStatistics { +private: + // We will not perform loop search until processing enough segments + // segment cache partition or peer group + static constexpr int64_t MIN_SEGMENT_NUM = 16; + + // Overhead of binary search is O(N/S logN), where S denote the average size of segment + // Overhead of loop search is O(N) + // The default chunk_size is 4096, then logN turns out to be log(4096) = 12 + // Considering the error of estimation, we set the threshold to 8 + static constexpr int64_t AVERAGE_SIZE_THRESHOLD = 8; + +public: + void update(int64_t segment_size) { + _count++; + _cumulative_size += segment_size; + _average_size = _cumulative_size / _count; + } + + void reset() { + _count = 0; + _cumulative_size = 0; + _average_size = 0; + } + + bool is_high_cardinality() const { + return _count > MIN_SEGMENT_NUM && _average_size < AVERAGE_SIZE_THRESHOLD; + } + + int64_t _count = 0; + int64_t _cumulative_size = 0; + int64_t _average_size = 0; +}; class AnalyticSinkLocalState : public PipelineXSinkLocalState { ENABLE_FACTORY_CREATOR(AnalyticSinkLocalState); @@ -64,6 +103,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _range_result_columns; size_t _partition_exprs_size = 0; size_t _order_by_exprs_size = 0; + BoundaryPose _partition_by_pose; + BoundaryPose _order_by_pose; + SegmentStatistics _partition_statistics; + SegmentStatistics _peer_group_statistics; + std::queue _candidate_partition_ends; + std::queue _candidate_peer_group_ends; size_t _agg_functions_size = 0; bool _agg_functions_created = false; @@ -96,8 +143,6 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _input_blocks; From f863ac7b48e0482bc492888e0c2e34785d879a6a Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 6 Jan 2025 20:59:38 +0800 Subject: [PATCH 11/20] update2 --- .../pipeline/exec/analytic_sink_operator.cpp | 20 ++++++------- be/src/pipeline/exec/analytic_sink_operator.h | 29 +++++-------------- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 6400f0de307beb..d760cff5b57e6f 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -508,7 +508,7 @@ void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Bl void AnalyticSinkLocalState::_reset_state_for_next_partition() { _partition_statistics.update(_partition_by_pose.end - _partition_by_pose.start); - _peer_group_statistics.reset(); + _order_by_statistics.reset(); _partition_by_pose.start = _partition_by_pose.end; _current_row_position = _partition_by_pose.start; _reset_agg_status(); @@ -520,15 +520,15 @@ void AnalyticSinkLocalState::_update_order_by_range() { return; } - while (!_candidate_peer_group_ends.empty()) { - int64_t peek = _candidate_peer_group_ends.front(); - _candidate_peer_group_ends.pop(); + while (!_candidate_order_by_ends.empty()) { + int64_t peek = _candidate_order_by_ends.front(); + _candidate_order_by_ends.pop(); if (peek > _order_by_pose.end) { _order_by_pose.start = _order_by_pose.end; _order_by_pose.end = peek; _order_by_pose.is_ended = true; - _peer_group_statistics.update(_order_by_pose.end - _order_by_pose.start); + _order_by_statistics.update(_order_by_pose.end - _order_by_pose.start); return; } } @@ -549,9 +549,9 @@ void AnalyticSinkLocalState::_update_order_by_range() { } if (_order_by_pose.end < _partition_by_pose.end) { - _peer_group_statistics.update(_order_by_pose.end - _order_by_pose.start); + _order_by_statistics.update(_order_by_pose.end - _order_by_pose.start); _order_by_pose.is_ended = true; - _find_candidate_peer_group_ends(); + _find_candidate_order_by_ends(); return; } DCHECK_EQ(_partition_by_pose.end, _order_by_pose.end); @@ -629,8 +629,8 @@ void AnalyticSinkLocalState::_find_candidate_partition_ends() { } } -void AnalyticSinkLocalState::_find_candidate_peer_group_ends() { - if (!_peer_group_statistics.is_high_cardinality()) { +void AnalyticSinkLocalState::_find_candidate_order_by_ends() { + if (!_order_by_statistics.is_high_cardinality()) { return; } @@ -639,7 +639,7 @@ void AnalyticSinkLocalState::_find_candidate_peer_group_ends() { for (auto& column : _order_by_columns) { auto cmp = column->compare_at(i - 1, i, *column, 1); if (cmp != 0) { - _candidate_peer_group_ends.push(i); + _candidate_order_by_ends.push(i); break; } } diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 6766521298b525..2d211e4fb6ab42 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -34,22 +34,11 @@ struct BoundaryPose { bool is_ended = false; }; -class SegmentStatistics { -private: - // We will not perform loop search until processing enough segments - // segment cache partition or peer group - static constexpr int64_t MIN_SEGMENT_NUM = 16; - - // Overhead of binary search is O(N/S logN), where S denote the average size of segment - // Overhead of loop search is O(N) - // The default chunk_size is 4096, then logN turns out to be log(4096) = 12 - // Considering the error of estimation, we set the threshold to 8 - static constexpr int64_t AVERAGE_SIZE_THRESHOLD = 8; - +class PartitionStatistics { public: - void update(int64_t segment_size) { + void update(int64_t size) { _count++; - _cumulative_size += segment_size; + _cumulative_size += size; _average_size = _cumulative_size / _count; } @@ -59,9 +48,7 @@ class SegmentStatistics { _average_size = 0; } - bool is_high_cardinality() const { - return _count > MIN_SEGMENT_NUM && _average_size < AVERAGE_SIZE_THRESHOLD; - } + bool is_high_cardinality() const { return _count > 16 && _average_size < 8; } int64_t _count = 0; int64_t _cumulative_size = 0; @@ -104,7 +91,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _candidate_partition_ends; - std::queue _candidate_peer_group_ends; + std::queue _candidate_order_by_ends; size_t _agg_functions_size = 0; bool _agg_functions_created = false; From 2307ee1ae18efac297bf8130956d134c3ff78510 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Tue, 7 Jan 2025 19:22:40 +0800 Subject: [PATCH 12/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 310 +++++++----------- be/src/pipeline/exec/analytic_sink_operator.h | 6 +- 2 files changed, 121 insertions(+), 195 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index d760cff5b57e6f..46163fc2f14614 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -41,6 +41,10 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); + _partition_search_timer = ADD_TIMER(profile(), "PartitionSearchTime"); + _order_search_timer = ADD_TIMER(profile(), "OrderSearchTime"); + _blocks_memory_usage = + profile()->AddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); _agg_arena_pool = std::make_unique(); auto& p = _parent->cast(); if (!p._has_window) { //haven't set window, Unbounded: [unbounded preceding,unbounded following] @@ -170,191 +174,139 @@ Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { } Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { - while (_has_input_data()) { - { - SCOPED_TIMER(_evaluation_timer); - _get_partition_by_end(); - if (!_partition_by_pose.is_ended) { - break; - } - _init_result_columns(); - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - - while (_current_row_position < _partition_by_pose.end) { - int64_t current_row_start = 0; - int64_t current_row_end = _current_row_position + _rows_end_offset + 1; + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - _reset_agg_status(); - if (!_parent->cast()._window.__isset.window_start) { - current_row_start = _partition_by_pose.start; - } else { - current_row_start = _current_row_position + _rows_start_offset; - } - // Make sure range_start <= range_end - current_row_start = std::min(current_row_start, current_row_end); - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - current_row_start, current_row_end); - _insert_result_info(1); - _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; - } - } + while (_current_row_position < _partition_by_pose.end) { + int64_t current_row_start = 0; + int64_t current_row_end = _current_row_position + _rows_end_offset + 1; - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); - } - if (_current_row_position == _partition_by_pose.end) { - _reset_state_for_next_partition(); - } + _reset_agg_status(); + if (!_parent->cast()._window.__isset.window_start) { + current_row_start = _partition_by_pose.start; + } else { + current_row_start = _current_row_position + _rows_start_offset; + } + // Make sure range_start <= range_end + current_row_start = std::min(current_row_start, current_row_end); + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, current_row_start, + current_row_end); + _insert_result_info(1); + _current_row_position++; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; } } return Status::OK(); } Status AnalyticSinkLocalState::_get_next_for_unbounded_rows() { - while (_has_input_data()) { - { - SCOPED_TIMER(_evaluation_timer); - _get_partition_by_end(); - if (!_partition_by_pose.is_ended) { - break; - } - _init_result_columns(); - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - - while (_current_row_position < _partition_by_pose.end) { - // [preceding, current_row], [current_row, following] rewrite it's same - // as could reuse the previous calculate result, so don't call _reset_agg_status function - // going on calculate, add up data, no need to reset state - int64_t current_row_start = _current_row_position; - int64_t current_row_end = _current_row_position + 1; - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - current_row_start, current_row_end); - _insert_result_info(1); - _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; - } - } - - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); - } - if (_current_row_position == _partition_by_pose.end) { - _reset_state_for_next_partition(); - } + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + + while (_current_row_position < _partition_by_pose.end) { + // [preceding, current_row], [current_row, following] rewrite it's same + // as could reuse the previous calculate result, so don't call _reset_agg_status function + // going on calculate, add up data, no need to reset state + int64_t current_row_start = _current_row_position; + int64_t current_row_end = _current_row_position + 1; + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, current_row_start, + current_row_end); + _insert_result_info(1); + _current_row_position++; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; } } return Status::OK(); } Status AnalyticSinkLocalState::_get_next_for_partition() { - while (_has_input_data()) { - { - SCOPED_TIMER(_evaluation_timer); - _get_partition_by_end(); - if (!_partition_by_pose.is_ended) { - break; - } - _init_result_columns(); - if (_current_row_position == _partition_by_pose.start) { - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - _partition_by_pose.start, _partition_by_pose.end); - } - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + if (_current_row_position == _partition_by_pose.start) { + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _partition_by_pose.start, _partition_by_pose.end); + } + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - // the end pos maybe after multis blocks, but should output by batch size and should not exceed partition end - auto window_end_pos = _current_row_position + batch_size; - window_end_pos = std::min(window_end_pos, _partition_by_pose.end); + // the end pos maybe after multis blocks, but should output by batch size and should not exceed partition end + auto window_end_pos = _current_row_position + batch_size; + window_end_pos = std::min(window_end_pos, _partition_by_pose.end); - auto previous_window_frame_width = _current_row_position - current_block_base_pos; - auto current_window_frame_width = window_end_pos - current_block_base_pos; - // should not exceed block batch size - current_window_frame_width = std::min(current_window_frame_width, batch_size); - auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = window_end_pos - current_block_base_pos; + // should not exceed block batch size + current_window_frame_width = std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; - _insert_result_info(real_deal_with_width); - _current_row_position += real_deal_with_width; + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + return Status::OK(); +} - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); - } - if (_current_row_position == _partition_by_pose.end) { - _reset_state_for_next_partition(); - } +Status AnalyticSinkLocalState::_get_next_for_unbounded_range() { + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + while (_current_row_position < _partition_by_pose.end) { + _update_order_by_range(); + if (_current_row_position == _order_by_pose.start) { + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _order_by_pose.start, _order_by_pose.end); + } + auto previous_window_frame_width = _current_row_position - current_block_base_pos; + auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; + current_window_frame_width = std::min(current_window_frame_width, batch_size); + auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; + + _insert_result_info(real_deal_with_width); + _current_row_position += real_deal_with_width; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; } } return Status::OK(); } -Status AnalyticSinkLocalState::_get_next_for_unbounded_range() { - while (_has_input_data()) { - { - SCOPED_TIMER(_evaluation_timer); - _get_partition_by_end(); - if (!_partition_by_pose.is_ended) { - break; - } - _init_result_columns(); - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - LOG(INFO) << "asd _get_next_for_unbounded_range: " << _current_row_position << " " - << batch_size << " " << current_block_base_pos; - LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " - << _partition_by_pose.start << " " << _partition_by_pose.end; - while (_current_row_position < _partition_by_pose.end) { - _update_order_by_range(); - if (_current_row_position == _order_by_pose.start) { - LOG(INFO) << "asd1: " << _partition_by_pose.start << " " - << _partition_by_pose.end << " " << _order_by_pose.start << " " - << _order_by_pose.end; - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - _order_by_pose.start, _order_by_pose.end); - } - auto previous_window_frame_width = _current_row_position - current_block_base_pos; - auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; - current_window_frame_width = - std::min(current_window_frame_width, batch_size); - auto real_deal_with_width = - current_window_frame_width - previous_window_frame_width; - - _insert_result_info(real_deal_with_width); - _current_row_position += real_deal_with_width; - LOG(INFO) << "asd2: " << previous_window_frame_width << " " - << current_window_frame_width << " " << real_deal_with_width << " " - << _current_row_position; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; - } - } +Status AnalyticSinkLocalState::_get_next_for_range_between() { + auto batch_size = _input_blocks[_output_block_index].rows(); + auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + while (_current_row_position < _partition_by_pose.end) { + _reset_agg_status(); + if (!_parent->cast()._window.__isset.window_start) { + _order_by_pose.start = _partition_by_pose.start; + } else { + _order_by_pose.start = find_first_not_equal( + _range_result_columns[0].get(), _order_by_columns[0].get(), + _current_row_position, _order_by_pose.start, _partition_by_pose.end); + } - if (_current_row_position - current_block_base_pos >= batch_size) { - vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); - _refresh_buffer_and_dependency_state(&block); - } - if (_current_row_position == _partition_by_pose.end) { - _reset_state_for_next_partition(); - } + if (!_parent->cast()._window.__isset.window_end) { + _order_by_pose.end = _partition_by_pose.end; + } else { + _order_by_pose.end = find_first_not_equal( + _range_result_columns[1].get(), _order_by_columns[0].get(), + _current_row_position, _order_by_pose.end, _partition_by_pose.end); + } + // Make sure range_start <= range_end + // current_row_start = std::min(current_row_start, current_row_end); + _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _order_by_pose.start, _order_by_pose.end); + _insert_result_info(1); + _current_row_position++; + if (_current_row_position - current_block_base_pos >= batch_size) { + break; } } + if (_current_row_position == _partition_by_pose.end) { + _order_by_pose.start = _partition_by_pose.end; // update to next partition pos + _order_by_pose.end = _partition_by_pose.end; + } return Status::OK(); } -Status AnalyticSinkLocalState::_get_next_for_range_between() { - while (_has_input_data()) { +Status AnalyticSinkLocalState::_execute_impl() { + while (_output_block_index < _input_blocks.size()) { { - SCOPED_TIMER(_evaluation_timer); _get_partition_by_end(); if (!_partition_by_pose.is_ended) { break; @@ -362,37 +314,8 @@ Status AnalyticSinkLocalState::_get_next_for_range_between() { _init_result_columns(); auto batch_size = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - LOG(INFO) << "asd _get_next_for_unbounded_range: " << _current_row_position << " " - << batch_size << " " << current_block_base_pos; - LOG(INFO) << _order_by_pose.start << " " << _order_by_pose.end << " " - << _partition_by_pose.start << " " << _partition_by_pose.end; - while (_current_row_position < _partition_by_pose.end) { - _reset_agg_status(); - if (!_parent->cast()._window.__isset.window_start) { - _order_by_pose.start = _partition_by_pose.start; - } else { - _order_by_pose.start = find_first_not_equal( - _range_result_columns[0].get(), _order_by_columns[0].get(), - _current_row_position, _order_by_pose.start, _partition_by_pose.end); - } - if (!_parent->cast()._window.__isset.window_end) { - _order_by_pose.end = _partition_by_pose.end; - } else { - _order_by_pose.end = find_first_not_equal( - _range_result_columns[1].get(), _order_by_columns[0].get(), - _current_row_position, _order_by_pose.end, _partition_by_pose.end); - } - // Make sure range_start <= range_end - // current_row_start = std::min(current_row_start, current_row_end); - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, - _order_by_pose.start, _order_by_pose.end); - _insert_result_info(1); - _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; - } - } + RETURN_IF_ERROR((this->*_executor.get_next_impl)()); if (_current_row_position - current_block_base_pos >= batch_size) { vectorized::Block block; @@ -401,8 +324,6 @@ Status AnalyticSinkLocalState::_get_next_for_range_between() { } if (_current_row_position == _partition_by_pose.end) { _reset_state_for_next_partition(); - _order_by_pose.start = _partition_by_pose.start; - _order_by_pose.end = _partition_by_pose.end; } } } @@ -458,7 +379,7 @@ void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { block->swap(std::move(_input_blocks[_output_block_index])); - // _blocks_memory_usage->add(-block->allocated_bytes()); + _blocks_memory_usage->add(-block->allocated_bytes()); if (_input_col_ids.size() < block->columns()) { block->erase_not_in(_input_col_ids); } @@ -519,7 +440,7 @@ void AnalyticSinkLocalState::_update_order_by_range() { if (_order_by_pose.is_ended && _current_row_position < _order_by_pose.end) { return; } - + SCOPED_TIMER(_order_search_timer); while (!_candidate_order_by_ends.empty()) { int64_t peek = _candidate_order_by_ends.front(); _candidate_order_by_ends.pop(); @@ -573,7 +494,7 @@ void AnalyticSinkLocalState::_get_partition_by_end() { _partition_by_pose.is_ended = _input_eos; return; } - + SCOPED_TIMER(_partition_search_timer); while (!_candidate_partition_ends.empty()) { int64_t peek = _candidate_partition_ends.front(); _candidate_partition_ends.pop(); @@ -617,7 +538,7 @@ void AnalyticSinkLocalState::_find_candidate_partition_ends() { return; } - // SCOPED_TIMER(_partition_search_timer); + SCOPED_TIMER(_partition_search_timer); for (size_t i = _partition_by_pose.end + 1; i < _partition_by_columns[0]->size(); ++i) { for (auto& column : _partition_by_columns) { auto cmp = column->compare_at(i - 1, i, *column, 1); @@ -634,7 +555,7 @@ void AnalyticSinkLocalState::_find_candidate_order_by_ends() { return; } - // SCOPED_TIMER(_peer_group_search_timer); + SCOPED_TIMER(_order_search_timer); for (size_t i = _order_by_pose.end + 1; i < _partition_by_pose.end; ++i) { for (auto& column : _order_by_columns) { auto cmp = column->compare_at(i - 1, i, *column, 1); @@ -790,7 +711,10 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); local_state._input_eos = eos; RETURN_IF_ERROR(_add_input_block(state, input_block)); - RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); + { + SCOPED_TIMER(local_state._evaluation_timer); + RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); + } if (local_state._input_eos) { std::unique_lock lc(local_state._shared_state->sink_eos_lock); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 2d211e4fb6ab42..9da983833cc7bb 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -68,6 +68,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState { From 8d6557508542f97e2328aa6677c53c7cf4b4ef89 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Wed, 8 Jan 2025 10:54:13 +0800 Subject: [PATCH 13/20] update --- be/src/pipeline/exec/analytic_sink_operator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 46163fc2f14614..261a41f05e68f4 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -713,7 +713,7 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block RETURN_IF_ERROR(_add_input_block(state, input_block)); { SCOPED_TIMER(local_state._evaluation_timer); - RETURN_IF_ERROR((local_state.*(local_state._executor.get_next_impl))()); + RETURN_IF_ERROR(local_state._execute_impl()); } if (local_state._input_eos) { From 3dd956c7998f991d185a17bf9f6b03d2b655bfda Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Thu, 9 Jan 2025 17:27:59 +0800 Subject: [PATCH 14/20] remove timer --- .../pipeline/exec/analytic_sink_operator.cpp | 151 ++++++++---------- be/src/pipeline/exec/analytic_sink_operator.h | 36 ++--- 2 files changed, 84 insertions(+), 103 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 261a41f05e68f4..b05d56d351f90b 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -33,14 +33,11 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); - _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); _compute_order_by_function_timer = ADD_TIMER(profile(), "ComputeOrderByFunctionTime"); - _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); - _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); - _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); _partition_search_timer = ADD_TIMER(profile(), "PartitionSearchTime"); _order_search_timer = ADD_TIMER(profile(), "OrderSearchTime"); _blocks_memory_usage = @@ -114,6 +111,8 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _agg_expr_ctxs.resize(_agg_functions_size); _agg_functions.resize(_agg_functions_size); _agg_input_columns.resize(_agg_functions_size); + _offsets_of_aggregate_states.resize(_agg_functions_size); + _result_column_nullable_flags.resize(_agg_functions_size); for (int i = 0; i < _agg_functions_size; ++i) { _agg_functions[i] = p._agg_functions[i]->clone(state, state->obj_pool()); @@ -123,6 +122,10 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(p._agg_expr_ctxs[i][j]->clone(state, _agg_expr_ctxs[i][j])); _agg_input_columns[i][j] = _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } + _offsets_of_aggregate_states[i] = p._offsets_of_aggregate_states[i]; + _result_column_nullable_flags[i] = + !_agg_functions[i]->function()->get_return_type()->is_nullable() && + _agg_functions[i]->data_type()->is_nullable(); } _partition_exprs_size = p._partition_by_eq_expr_ctxs.size(); @@ -173,10 +176,8 @@ Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { return PipelineXSinkLocalState::close(state, exec_status); } -Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - +bool AnalyticSinkLocalState::_get_next_for_sliding_rows(int64_t batch_rows, + int64_t current_block_base_pos) { while (_current_row_position < _partition_by_pose.end) { int64_t current_row_start = 0; int64_t current_row_end = _current_row_position + _rows_end_offset + 1; @@ -189,87 +190,80 @@ Status AnalyticSinkLocalState::_get_next_for_sliding_rows() { } // Make sure range_start <= range_end current_row_start = std::min(current_row_start, current_row_end); - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, current_row_start, + _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, current_row_start, current_row_end); _insert_result_info(1); _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; + if (_current_row_position - current_block_base_pos >= batch_rows) { + return true; } } - return Status::OK(); + return false; } -Status AnalyticSinkLocalState::_get_next_for_unbounded_rows() { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; - +bool AnalyticSinkLocalState::_get_next_for_unbounded_rows(int64_t batch_rows, + int64_t current_block_base_pos) { while (_current_row_position < _partition_by_pose.end) { // [preceding, current_row], [current_row, following] rewrite it's same // as could reuse the previous calculate result, so don't call _reset_agg_status function // going on calculate, add up data, no need to reset state - int64_t current_row_start = _current_row_position; - int64_t current_row_end = _current_row_position + 1; - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, current_row_start, - current_row_end); + _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, + _current_row_position, _current_row_position + 1); _insert_result_info(1); _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; + if (_current_row_position - current_block_base_pos >= batch_rows) { + return true; } } - return Status::OK(); + return false; } -Status AnalyticSinkLocalState::_get_next_for_partition() { +bool AnalyticSinkLocalState::_get_next_for_partition(int64_t batch_rows, + int64_t current_block_base_pos) { if (_current_row_position == _partition_by_pose.start) { - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, _partition_by_pose.start, _partition_by_pose.end); } - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; // the end pos maybe after multis blocks, but should output by batch size and should not exceed partition end - auto window_end_pos = _current_row_position + batch_size; + auto window_end_pos = _current_row_position + batch_rows; window_end_pos = std::min(window_end_pos, _partition_by_pose.end); auto previous_window_frame_width = _current_row_position - current_block_base_pos; auto current_window_frame_width = window_end_pos - current_block_base_pos; // should not exceed block batch size - current_window_frame_width = std::min(current_window_frame_width, batch_size); + current_window_frame_width = std::min(current_window_frame_width, batch_rows); auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; _insert_result_info(real_deal_with_width); _current_row_position += real_deal_with_width; - return Status::OK(); + return _current_row_position - current_block_base_pos >= batch_rows; } -Status AnalyticSinkLocalState::_get_next_for_unbounded_range() { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; +bool AnalyticSinkLocalState::_get_next_for_unbounded_range(int64_t batch_rows, + int64_t current_block_base_pos) { while (_current_row_position < _partition_by_pose.end) { _update_order_by_range(); if (_current_row_position == _order_by_pose.start) { - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, _order_by_pose.start, _order_by_pose.end); } auto previous_window_frame_width = _current_row_position - current_block_base_pos; auto current_window_frame_width = _order_by_pose.end - current_block_base_pos; - current_window_frame_width = std::min(current_window_frame_width, batch_size); + current_window_frame_width = std::min(current_window_frame_width, batch_rows); auto real_deal_with_width = current_window_frame_width - previous_window_frame_width; _insert_result_info(real_deal_with_width); _current_row_position += real_deal_with_width; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; + if (_current_row_position - current_block_base_pos >= batch_rows) { + return true; } } - return Status::OK(); + return false; } -Status AnalyticSinkLocalState::_get_next_for_range_between() { - auto batch_size = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; +bool AnalyticSinkLocalState::_get_next_for_range_between(int64_t batch_rows, + int64_t current_block_base_pos) { while (_current_row_position < _partition_by_pose.end) { _reset_agg_status(); if (!_parent->cast()._window.__isset.window_start) { @@ -289,19 +283,19 @@ Status AnalyticSinkLocalState::_get_next_for_range_between() { } // Make sure range_start <= range_end // current_row_start = std::min(current_row_start, current_row_end); - _execute_for_win_func(_partition_by_pose.start, _partition_by_pose.end, + _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, _order_by_pose.start, _order_by_pose.end); _insert_result_info(1); _current_row_position++; - if (_current_row_position - current_block_base_pos >= batch_size) { - break; + if (_current_row_position - current_block_base_pos >= batch_rows) { + return true; } } if (_current_row_position == _partition_by_pose.end) { _order_by_pose.start = _partition_by_pose.end; // update to next partition pos _order_by_pose.end = _partition_by_pose.end; } - return Status::OK(); + return false; } Status AnalyticSinkLocalState::_execute_impl() { @@ -312,14 +306,19 @@ Status AnalyticSinkLocalState::_execute_impl() { break; } _init_result_columns(); - auto batch_size = _input_blocks[_output_block_index].rows(); + auto batch_rows = _input_blocks[_output_block_index].rows(); auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + bool should_output = false; - RETURN_IF_ERROR((this->*_executor.get_next_impl)()); + { + SCOPED_TIMER(_evaluation_timer); + should_output = + (this->*_executor.get_next_impl)(batch_rows, current_block_base_pos); + } - if (_current_row_position - current_block_base_pos >= batch_size) { + if (should_output) { vectorized::Block block; - RETURN_IF_ERROR(output_current_block(&block)); + _output_current_block(&block); _refresh_buffer_and_dependency_state(&block); } if (_current_row_position == _partition_by_pose.end) { @@ -330,9 +329,9 @@ Status AnalyticSinkLocalState::_execute_impl() { return Status::OK(); } -void AnalyticSinkLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, +void AnalyticSinkLocalState::_execute_for_function(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { - SCOPED_TIMER(_execute_timer); + // here is the core function, should not add timer for (size_t i = 0; i < _agg_functions_size; ++i) { std::vector agg_columns; for (int j = 0; j < _agg_input_columns[i].size(); ++j) { @@ -340,9 +339,8 @@ void AnalyticSinkLocalState::_execute_for_win_func(int64_t partition_start, int6 } _agg_functions[i]->function()->add_range_single_place( partition_start, partition_end, frame_start, frame_end, - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i], - agg_columns.data(), _agg_arena_pool.get()); + _fn_place_ptr + _offsets_of_aggregate_states[i], agg_columns.data(), + _agg_arena_pool.get()); // If the end is not greater than the start, the current window should be empty. // _current_window_empty = false; @@ -352,13 +350,10 @@ void AnalyticSinkLocalState::_execute_for_win_func(int64_t partition_start, int6 } void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { - SCOPED_TIMER(_get_result_timer); - const auto& offsets_of_aggregate_states = - _parent->cast()._offsets_of_aggregate_states; + // here is the core function, should not add timer for (size_t i = 0; i < _agg_functions_size; ++i) { for (size_t j = 0; j < real_deal_with_width; ++j) { - if (!_agg_functions[i]->function()->get_return_type()->is_nullable() && - _result_window_columns[i]->is_nullable()) { + if (_result_column_nullable_flags[i]) { if (_current_window_empty) { _result_window_columns[i]->insert_default(); } else { @@ -366,18 +361,18 @@ void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { _result_window_columns[i].get()); dst->get_null_map_data().push_back(0); _agg_functions[i]->insert_result_info( - _fn_place_ptr + offsets_of_aggregate_states[i], + _fn_place_ptr + _offsets_of_aggregate_states[i], &dst->get_nested_column()); } continue; } - _agg_functions[i]->insert_result_info(_fn_place_ptr + offsets_of_aggregate_states[i], + _agg_functions[i]->insert_result_info(_fn_place_ptr + _offsets_of_aggregate_states[i], _result_window_columns[i].get()); } } } -Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { +void AnalyticSinkLocalState::_output_current_block(vectorized::Block* block) { block->swap(std::move(_input_blocks[_output_block_index])); _blocks_memory_usage->add(-block->allocated_bytes()); if (_input_col_ids.size() < block->columns()) { @@ -399,7 +394,6 @@ Status AnalyticSinkLocalState::output_current_block(vectorized::Block* block) { } _output_block_index++; - return Status::OK(); } void AnalyticSinkLocalState::_init_result_columns() { @@ -448,7 +442,6 @@ void AnalyticSinkLocalState::_update_order_by_range() { _order_by_pose.start = _order_by_pose.end; _order_by_pose.end = peek; _order_by_pose.is_ended = true; - _order_by_statistics.update(_order_by_pose.end - _order_by_pose.start); return; } @@ -525,7 +518,7 @@ void AnalyticSinkLocalState::_get_partition_by_end() { if (_partition_by_pose.end < partition_column_rows) { _partition_by_pose.is_ended = true; - // here maybe find candidate ends; + _find_candidate_partition_ends(); return; } @@ -650,14 +643,15 @@ Status AnalyticSinkOperatorX::open(RuntimeState* state) { } _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); + _change_to_nullable_flags.resize(_agg_functions_size); for (size_t i = 0; i < _agg_functions_size; ++i) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[i]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[i]; RETURN_IF_ERROR(_agg_functions[i]->prepare(state, _child->row_desc(), intermediate_slot_desc, output_slot_desc)); _agg_functions[i]->set_version(state->be_exec_version()); - _change_to_nullable_flags.push_back(output_slot_desc->is_nullable() && - !_agg_functions[i]->data_type()->is_nullable()); + _change_to_nullable_flags[i] = + output_slot_desc->is_nullable() && (!_agg_functions[i]->data_type()->is_nullable()); } if (!_partition_by_eq_expr_ctxs.empty() || !_order_by_eq_expr_ctxs.empty()) { vector tuple_ids; @@ -711,11 +705,7 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); local_state._input_eos = eos; RETURN_IF_ERROR(_add_input_block(state, input_block)); - { - SCOPED_TIMER(local_state._evaluation_timer); - RETURN_IF_ERROR(local_state._execute_impl()); - } - + RETURN_IF_ERROR(local_state._execute_impl()); if (local_state._input_eos) { std::unique_lock lc(local_state._shared_state->sink_eos_lock); local_state._shared_state->sink_eos = true; @@ -781,6 +771,7 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, } COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); + COUNTER_UPDATE(local_state._blocks_memory_usage, input_block->allocated_bytes()); local_state._input_blocks.emplace_back(std::move(*input_block)); return Status::OK(); } @@ -798,23 +789,17 @@ Status AnalyticSinkOperatorX::_insert_range_column(vectorized::Block* block, void AnalyticSinkLocalState::_reset_agg_status() { for (size_t i = 0; i < _agg_functions_size; ++i) { - _agg_functions[i]->reset( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); + _agg_functions[i]->reset(_fn_place_ptr + _offsets_of_aggregate_states[i]); } } void AnalyticSinkLocalState::_create_agg_status() { for (size_t i = 0; i < _agg_functions_size; ++i) { try { - _agg_functions[i]->create( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); + _agg_functions[i]->create(_fn_place_ptr + _offsets_of_aggregate_states[i]); } catch (...) { for (int j = 0; j < i; ++j) { - _agg_functions[j]->destroy( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[j]); + _agg_functions[j]->destroy(_fn_place_ptr + _offsets_of_aggregate_states[j]); } throw; } @@ -827,9 +812,7 @@ void AnalyticSinkLocalState::_destroy_agg_status() { return; } for (size_t i = 0; i < _agg_functions_size; ++i) { - _agg_functions[i]->destroy( - _fn_place_ptr + - _parent->cast()._offsets_of_aggregate_states[i]); + _agg_functions[i]->destroy(_fn_place_ptr + _offsets_of_aggregate_states[i]); } } diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 9da983833cc7bb..56f0d8881197b0 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -69,28 +69,27 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_arena_pool = nullptr; std::vector _agg_functions; + std::vector _offsets_of_aggregate_states; + std::vector _result_column_nullable_flags; - using vectorized_get_next = Status (AnalyticSinkLocalState::*)(); + using vectorized_get_next = bool (AnalyticSinkLocalState::*)(int64_t, int64_t); struct executor { vectorized_get_next get_next_impl; }; executor _executor; bool _current_window_empty = false; + int64_t _current_row_position = 0; int64_t _output_block_index = 0; std::vector _result_window_columns; int64_t _rows_start_offset = 0; int64_t _rows_end_offset = 0; - int64_t _current_row_position = 0; int64_t _input_total_rows = 0; - std::vector _input_blocks; bool _input_eos = false; std::vector _input_col_ids; + std::vector _input_blocks; std::vector _input_block_first_row_positions; RuntimeProfile::Counter* _evaluation_timer = nullptr; @@ -142,9 +143,6 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState Date: Fri, 10 Jan 2025 15:57:08 +0800 Subject: [PATCH 15/20] update remove --- .../pipeline/exec/analytic_sink_operator.cpp | 102 ++++++++++++++---- be/src/pipeline/exec/analytic_sink_operator.h | 22 ++-- 2 files changed, 98 insertions(+), 26 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index b05d56d351f90b..594a1256aa4c96 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -20,6 +20,7 @@ #include +#include #include #include @@ -40,6 +41,9 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _compute_order_by_function_timer = ADD_TIMER(profile(), "ComputeOrderByFunctionTime"); _partition_search_timer = ADD_TIMER(profile(), "PartitionSearchTime"); _order_search_timer = ADD_TIMER(profile(), "OrderSearchTime"); + _remove_rows_timer = ADD_TIMER(profile(), "RemoveRowsTime"); + _remove_rows = ADD_COUNTER(profile(), "RemoveRows", TUnit::UNIT); + _remove_count = ADD_COUNTER(profile(), "RemoveCount", TUnit::UNIT); _blocks_memory_usage = profile()->AddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); _agg_arena_pool = std::make_unique(); @@ -307,7 +311,8 @@ Status AnalyticSinkLocalState::_execute_impl() { } _init_result_columns(); auto batch_rows = _input_blocks[_output_block_index].rows(); - auto current_block_base_pos = _input_block_first_row_positions[_output_block_index]; + auto current_block_base_pos = + _input_block_first_row_positions[_output_block_index] - _have_removed_rows; bool should_output = false; { @@ -397,7 +402,8 @@ void AnalyticSinkLocalState::_output_current_block(vectorized::Block* block) { } void AnalyticSinkLocalState::_init_result_columns() { - if (_current_row_position == _input_block_first_row_positions[_output_block_index]) { + if (_current_row_position + _have_removed_rows == + _input_block_first_row_positions[_output_block_index]) { _result_window_columns.resize(_agg_functions_size); // return type create result column for (size_t i = 0; i < _agg_functions_size; ++i) { @@ -422,8 +428,8 @@ void AnalyticSinkLocalState::_refresh_buffer_and_dependency_state(vectorized::Bl } void AnalyticSinkLocalState::_reset_state_for_next_partition() { - _partition_statistics.update(_partition_by_pose.end - _partition_by_pose.start); - _order_by_statistics.reset(); + _partition_column_statistics.update(_partition_by_pose.end - _partition_by_pose.start); + _order_by_column_statistics.reset(); _partition_by_pose.start = _partition_by_pose.end; _current_row_position = _partition_by_pose.start; _reset_agg_status(); @@ -435,14 +441,14 @@ void AnalyticSinkLocalState::_update_order_by_range() { return; } SCOPED_TIMER(_order_search_timer); - while (!_candidate_order_by_ends.empty()) { - int64_t peek = _candidate_order_by_ends.front(); - _candidate_order_by_ends.pop(); + while (!_next_order_by_ends.empty()) { + int64_t peek = _next_order_by_ends.front(); + _next_order_by_ends.pop(); if (peek > _order_by_pose.end) { _order_by_pose.start = _order_by_pose.end; _order_by_pose.end = peek; _order_by_pose.is_ended = true; - _order_by_statistics.update(_order_by_pose.end - _order_by_pose.start); + _order_by_column_statistics.update(_order_by_pose.end - _order_by_pose.start); return; } } @@ -463,9 +469,9 @@ void AnalyticSinkLocalState::_update_order_by_range() { } if (_order_by_pose.end < _partition_by_pose.end) { - _order_by_statistics.update(_order_by_pose.end - _order_by_pose.start); + _order_by_column_statistics.update(_order_by_pose.end - _order_by_pose.start); _order_by_pose.is_ended = true; - _find_candidate_order_by_ends(); + _find_next_order_by_ends(); return; } DCHECK_EQ(_partition_by_pose.end, _order_by_pose.end); @@ -488,9 +494,9 @@ void AnalyticSinkLocalState::_get_partition_by_end() { return; } SCOPED_TIMER(_partition_search_timer); - while (!_candidate_partition_ends.empty()) { - int64_t peek = _candidate_partition_ends.front(); - _candidate_partition_ends.pop(); + while (!_next_partition_ends.empty()) { + int64_t peek = _next_partition_ends.front(); + _next_partition_ends.pop(); if (peek > _partition_by_pose.end) { _partition_by_pose.end = peek; _partition_by_pose.is_ended = true; @@ -518,7 +524,7 @@ void AnalyticSinkLocalState::_get_partition_by_end() { if (_partition_by_pose.end < partition_column_rows) { _partition_by_pose.is_ended = true; - _find_candidate_partition_ends(); + _find_next_partition_ends(); return; } @@ -526,8 +532,8 @@ void AnalyticSinkLocalState::_get_partition_by_end() { _partition_by_pose.is_ended = _input_eos; } -void AnalyticSinkLocalState::_find_candidate_partition_ends() { - if (!_partition_statistics.is_high_cardinality()) { +void AnalyticSinkLocalState::_find_next_partition_ends() { + if (!_partition_column_statistics.is_high_cardinality()) { return; } @@ -536,15 +542,15 @@ void AnalyticSinkLocalState::_find_candidate_partition_ends() { for (auto& column : _partition_by_columns) { auto cmp = column->compare_at(i - 1, i, *column, 1); if (cmp != 0) { - _candidate_partition_ends.push(i); + _next_partition_ends.push(i); break; } } } } -void AnalyticSinkLocalState::_find_candidate_order_by_ends() { - if (!_order_by_statistics.is_high_cardinality()) { +void AnalyticSinkLocalState::_find_next_order_by_ends() { + if (!_order_by_column_statistics.is_high_cardinality()) { return; } @@ -553,7 +559,7 @@ void AnalyticSinkLocalState::_find_candidate_order_by_ends() { for (auto& column : _order_by_columns) { auto cmp = column->compare_at(i - 1, i, *column, 1); if (cmp != 0) { - _candidate_order_by_ends.push(i); + _next_order_by_ends.push(i); break; } } @@ -704,6 +710,7 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); local_state._input_eos = eos; + local_state._remove_unused_rows(); RETURN_IF_ERROR(_add_input_block(state, input_block)); RETURN_IF_ERROR(local_state._execute_impl()); if (local_state._input_eos) { @@ -776,6 +783,61 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, return Status::OK(); } +void AnalyticSinkLocalState::_remove_unused_rows() { + const size_t block_num = 256; + if (_removed_block_index + block_num + 1 >= _input_block_first_row_positions.size()) { + return; + } + const int64_t unused_rows_pos = + _input_block_first_row_positions[_removed_block_index + block_num]; + + if (_have_removed_rows + _partition_by_pose.start <= unused_rows_pos) { + return; + } + + const int64_t remove_rows = unused_rows_pos - _have_removed_rows; + auto left_rows = _input_total_rows - _have_removed_rows - remove_rows; + { + SCOPED_TIMER(_remove_rows_timer); + for (size_t i = 0; i < _agg_functions_size; i++) { + for (size_t j = 0; j < _agg_expr_ctxs[i].size(); j++) { + _agg_input_columns[i][j] = + _agg_input_columns[i][j]->cut(remove_rows, left_rows)->assume_mutable(); + } + } + for (size_t i = 0; i < _partition_exprs_size; i++) { + _partition_by_columns[i] = + _partition_by_columns[i]->cut(remove_rows, left_rows)->assume_mutable(); + } + for (size_t i = 0; i < _order_by_exprs_size; i++) { + _order_by_columns[i] = + _order_by_columns[i]->cut(remove_rows, left_rows)->assume_mutable(); + } + } + COUNTER_UPDATE(_remove_count, 1); + COUNTER_UPDATE(_remove_rows, remove_rows); + _current_row_position -= remove_rows; + _partition_by_pose.remove_unused_rows(remove_rows); + _order_by_pose.remove_unused_rows(remove_rows); + int64_t candidate_partition_end_size = _next_partition_ends.size(); + while (--candidate_partition_end_size >= 0) { + auto peek = _next_partition_ends.front(); + _next_partition_ends.pop(); + _next_partition_ends.push(peek - remove_rows); + } + int64_t candidate_peer_group_end_size = _next_order_by_ends.size(); + while (--candidate_peer_group_end_size >= 0) { + auto peek = _next_order_by_ends.front(); + _next_order_by_ends.pop(); + _next_order_by_ends.push(peek - remove_rows); + } + _removed_block_index += block_num; + _have_removed_rows += remove_rows; + + DCHECK_GE(_current_row_position, 0); + DCHECK_GE(_partition_by_pose.end, 0); +} + Status AnalyticSinkOperatorX::_insert_range_column(vectorized::Block* block, const vectorized::VExprContextSPtr& expr, vectorized::IColumn* dst_column, size_t length) { diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 56f0d8881197b0..a9b51c8cfd735d 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -32,6 +32,10 @@ struct BoundaryPose { int64_t start = 0; int64_t end = 0; bool is_ended = false; + void remove_unused_rows(int64_t cnt) { + start -= cnt; + end -= cnt; + } }; class PartitionStatistics { @@ -86,11 +90,12 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _candidate_partition_ends; - std::queue _candidate_order_by_ends; + PartitionStatistics _partition_column_statistics; + PartitionStatistics _order_by_column_statistics; + std::queue _next_partition_ends; + std::queue _next_order_by_ends; size_t _agg_functions_size = 0; bool _agg_functions_created = false; @@ -137,6 +142,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _input_col_ids; std::vector _input_blocks; std::vector _input_block_first_row_positions; + int64_t _removed_block_index = 0; + int64_t _have_removed_rows = 0; RuntimeProfile::Counter* _evaluation_timer = nullptr; RuntimeProfile::Counter* _compute_agg_data_timer = nullptr; @@ -145,6 +152,9 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState Date: Fri, 10 Jan 2025 16:22:31 +0800 Subject: [PATCH 16/20] update2 --- be/src/pipeline/exec/analytic_source_operator.cpp | 8 +++++++- be/src/pipeline/exec/analytic_source_operator.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index e77e20ff8df085..f77024142842d7 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -17,6 +17,7 @@ #include "analytic_source_operator.h" +#include #include #include "pipeline/exec/operator.h" @@ -34,6 +35,7 @@ Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); + _filtered_rows_counter = ADD_COUNTER(profile(), "FilteredRows", TUnit::UNIT); return Status::OK(); } @@ -55,6 +57,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block if (!local_state._shared_state->blocks_buffer.empty()) { local_state._shared_state->blocks_buffer.front().swap(*output_block); local_state._shared_state->blocks_buffer.pop(); + auto output_rows = output_block->rows(); //if buffer have no data and sink not eos, block reading and wait for signal again RETURN_IF_ERROR(vectorized::VExprContext::filter_block( local_state._conjuncts, output_block, output_block->columns())); @@ -69,7 +72,9 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block } } if (!output_block->empty()) { - local_state._num_rows_returned += output_block->rows(); + auto return_rows = output_block->rows(); + local_state._num_rows_returned += return_rows; + COUNTER_UPDATE(local_state._filtered_rows_counter, output_rows - return_rows); } } else { //iff buffer have no data and sink eos, set eos @@ -77,6 +82,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block *eos = local_state._shared_state->sink_eos; } } + local_state.reached_limit(output_block, eos); return Status::OK(); } diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 9d62759212b2cc..be1fdb2c9e5db5 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -38,6 +38,7 @@ class AnalyticLocalState final : public PipelineXLocalState private: friend class AnalyticSourceOperatorX; RuntimeProfile::Counter* _get_next_timer = nullptr; + RuntimeProfile::Counter* _filtered_rows_counter = nullptr; }; class AnalyticSourceOperatorX final : public OperatorX { From 0d6fffffef938a8b11160cfdbb5b7f371748c01e Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 13 Jan 2025 13:16:12 +0800 Subject: [PATCH 17/20] update limit --- be/src/pipeline/exec/analytic_source_operator.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index f77024142842d7..ce6f0d1d1074ae 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -52,12 +52,13 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block SCOPED_TIMER(local_state.exec_time_counter()); SCOPED_TIMER(local_state._get_next_timer); output_block->clear_column_data(); + size_t output_rows = 0; { std::lock_guard lock(local_state._shared_state->buffer_mutex); if (!local_state._shared_state->blocks_buffer.empty()) { local_state._shared_state->blocks_buffer.front().swap(*output_block); local_state._shared_state->blocks_buffer.pop(); - auto output_rows = output_block->rows(); + output_rows = output_block->rows(); //if buffer have no data and sink not eos, block reading and wait for signal again RETURN_IF_ERROR(vectorized::VExprContext::filter_block( local_state._conjuncts, output_block, output_block->columns())); @@ -71,11 +72,6 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state._dependency->set_ready_to_write(); // ready for sink write } } - if (!output_block->empty()) { - auto return_rows = output_block->rows(); - local_state._num_rows_returned += return_rows; - COUNTER_UPDATE(local_state._filtered_rows_counter, output_rows - return_rows); - } } else { //iff buffer have no data and sink eos, set eos std::unique_lock lc(local_state._shared_state->sink_eos_lock); @@ -83,6 +79,11 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block } } local_state.reached_limit(output_block, eos); + if (!output_block->empty()) { + auto return_rows = output_block->rows(); + local_state._num_rows_returned += return_rows; + COUNTER_UPDATE(local_state._filtered_rows_counter, output_rows - return_rows); + } return Status::OK(); } From ede572047dfdab59feeb13ef7f10e4c537028b9e Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 13 Jan 2025 17:13:09 +0800 Subject: [PATCH 18/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 27 +++++++++++-------- be/src/pipeline/exec/analytic_sink_operator.h | 5 ++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 594a1256aa4c96..75da53da62ad9e 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -175,8 +175,11 @@ Status AnalyticSinkLocalState::close(RuntimeState* state, Status exec_status) { _destroy_agg_status(); _agg_arena_pool = nullptr; - std::vector tmp_result_window_columns; - _result_window_columns.swap(tmp_result_window_columns); + _result_window_columns.clear(); + _agg_input_columns.clear(); + _partition_by_columns.clear(); + _order_by_columns.clear(); + _range_result_columns.clear(); return PipelineXSinkLocalState::close(state, exec_status); } @@ -337,6 +340,12 @@ Status AnalyticSinkLocalState::_execute_impl() { void AnalyticSinkLocalState::_execute_for_function(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { // here is the core function, should not add timer + // If the end is not greater than the start, the current window should be empty. + _current_window_empty = + std::min(frame_end, partition_end) <= std::max(frame_start, partition_start); + if (_current_window_empty) { + return; + } for (size_t i = 0; i < _agg_functions_size; ++i) { std::vector agg_columns; for (int j = 0; j < _agg_input_columns[i].size(); ++j) { @@ -346,11 +355,6 @@ void AnalyticSinkLocalState::_execute_for_function(int64_t partition_start, int6 partition_start, partition_end, frame_start, frame_end, _fn_place_ptr + _offsets_of_aggregate_states[i], agg_columns.data(), _agg_arena_pool.get()); - - // If the end is not greater than the start, the current window should be empty. - // _current_window_empty = false; - _current_window_empty = - std::min(frame_end, partition_end) <= std::max(frame_start, partition_start); } } @@ -369,10 +373,11 @@ void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { _fn_place_ptr + _offsets_of_aggregate_states[i], &dst->get_nested_column()); } - continue; + } else { + _agg_functions[i]->insert_result_info( + _fn_place_ptr + _offsets_of_aggregate_states[i], + _result_window_columns[i].get()); } - _agg_functions[i]->insert_result_info(_fn_place_ptr + _offsets_of_aggregate_states[i], - _result_window_columns[i].get()); } } } @@ -489,7 +494,7 @@ void AnalyticSinkLocalState::_get_partition_by_end() { } //no partition_by, the all block is end if (_partition_by_eq_expr_ctxs.empty() || (_input_total_rows == 0)) { - _partition_by_pose.end = _input_total_rows; //maybe need check removed rows + _partition_by_pose.end = _input_total_rows - _have_removed_rows; _partition_by_pose.is_ended = _input_eos; return; } diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index a9b51c8cfd735d..2e208a7c2fc5ae 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -73,10 +73,15 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState Date: Mon, 13 Jan 2025 19:42:09 +0800 Subject: [PATCH 19/20] update --- .../pipeline/exec/analytic_sink_operator.cpp | 18 ++----- be/src/pipeline/exec/analytic_sink_operator.h | 1 - .../window_functions/test_column_boundary.out | 4 ++ .../test_column_boundary.groovy | 53 +++++++++++++++++++ 4 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 regression-test/data/query_p0/sql_functions/window_functions/test_column_boundary.out create mode 100644 regression-test/suites/query_p0/sql_functions/window_functions/test_column_boundary.groovy diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 75da53da62ad9e..ec6900e278c320 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -195,6 +195,7 @@ bool AnalyticSinkLocalState::_get_next_for_sliding_rows(int64_t batch_rows, } else { current_row_start = _current_row_position + _rows_start_offset; } + // Eg: rows between unbounded preceding and 10 preceding // Make sure range_start <= range_end current_row_start = std::min(current_row_start, current_row_end); _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, current_row_start, @@ -288,8 +289,6 @@ bool AnalyticSinkLocalState::_get_next_for_range_between(int64_t batch_rows, _range_result_columns[1].get(), _order_by_columns[0].get(), _current_row_position, _order_by_pose.end, _partition_by_pose.end); } - // Make sure range_start <= range_end - // current_row_start = std::min(current_row_start, current_row_end); _execute_for_function(_partition_by_pose.start, _partition_by_pose.end, _order_by_pose.start, _order_by_pose.end); _insert_result_info(1); @@ -385,10 +384,6 @@ void AnalyticSinkLocalState::_insert_result_info(int64_t real_deal_with_width) { void AnalyticSinkLocalState::_output_current_block(vectorized::Block* block) { block->swap(std::move(_input_blocks[_output_block_index])); _blocks_memory_usage->add(-block->allocated_bytes()); - if (_input_col_ids.size() < block->columns()) { - block->erase_not_in(_input_col_ids); - } - DCHECK(_parent->cast()._change_to_nullable_flags.size() == _result_window_columns.size()); for (size_t i = 0; i < _result_window_columns.size(); ++i) { @@ -644,6 +639,8 @@ Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) _partition_by_eq_expr_ctxs)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.order_by_exprs, _order_by_eq_expr_ctxs)); + // RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.range_between_offset_exprs, + // _order_by_eq_expr_ctxs)); return Status::OK(); } @@ -737,12 +734,7 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, local_state._input_total_rows += block_rows; // record origin columns, maybe be after this, could cast some column but no need to output - if (local_state._input_col_ids.empty()) { - for (int c = 0; c < input_block->columns(); ++c) { - local_state._input_col_ids.emplace_back(c); - } - } - + auto column_to_keep = input_block->columns(); { SCOPED_TIMER(local_state._compute_agg_data_timer); //insert _agg_input_columns, execute calculate for its, and those columns maybe could remove have used data @@ -781,7 +773,7 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, block_rows)); } } - + vectorized::Block::erase_useless_column(input_block, column_to_keep); COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); COUNTER_UPDATE(local_state._blocks_memory_usage, input_block->allocated_bytes()); local_state._input_blocks.emplace_back(std::move(*input_block)); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 2e208a7c2fc5ae..1a6e14a3c7d530 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -144,7 +144,6 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _input_col_ids; std::vector _input_blocks; std::vector _input_block_first_row_positions; int64_t _removed_block_index = 0; diff --git a/regression-test/data/query_p0/sql_functions/window_functions/test_column_boundary.out b/regression-test/data/query_p0/sql_functions/window_functions/test_column_boundary.out new file mode 100644 index 00000000000000..ac07866762434b --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/window_functions/test_column_boundary.out @@ -0,0 +1,4 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql_1 -- +512000000 + diff --git a/regression-test/suites/query_p0/sql_functions/window_functions/test_column_boundary.groovy b/regression-test/suites/query_p0/sql_functions/window_functions/test_column_boundary.groovy new file mode 100644 index 00000000000000..398d0a73e19e6a --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/window_functions/test_column_boundary.groovy @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_column_boundary") { + sql """ DROP TABLE IF EXISTS test_column_boundary """ + sql """ + CREATE TABLE IF NOT EXISTS test_column_boundary ( + u_id int NULL COMMENT "", + u_city varchar(20) NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`u_id`, `u_city`) + DISTRIBUTED BY HASH(`u_id`, `u_city`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "in_memory" = "false", + "storage_format" = "V2" + ); + """ + + sql """ insert into test_column_boundary select number, number + random() from numbers("number" = "1000000"); """ + Integer count = 0; + Integer maxCount = 9; + while (count < maxCount) { + sql """ insert into test_column_boundary select * from test_column_boundary;""" + count++ + sleep(100); + } + sql """ set parallel_pipeline_task_num = 1; """ + + qt_sql_1 """ select count() from test_column_boundary; """ + test { + // column size is too large + sql """ select count() over(partition by u_city) from test_column_boundary; """ + exception "string column length is too large" + } + sql """ DROP TABLE IF EXISTS test_column_boundary """ +} + + From c73ecb743868eb4599cd8a797914723b742e415d Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 13 Jan 2025 20:02:59 +0800 Subject: [PATCH 20/20] thrift --- .../pipeline/exec/analytic_sink_operator.cpp | 38 ++++++++++--------- be/src/pipeline/exec/analytic_sink_operator.h | 4 +- gensrc/thrift/PlanNodes.thrift | 2 + 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index ec6900e278c320..72213e74406845 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -38,7 +38,7 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); - _compute_order_by_function_timer = ADD_TIMER(profile(), "ComputeOrderByFunctionTime"); + _compute_range_between_function_timer = ADD_TIMER(profile(), "ComputeOrderByFunctionTime"); _partition_search_timer = ADD_TIMER(profile(), "PartitionSearchTime"); _order_search_timer = ADD_TIMER(profile(), "OrderSearchTime"); _remove_rows_timer = ADD_TIMER(profile(), "RemoveRowsTime"); @@ -151,12 +151,12 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { } // only support one order by column, so need two columns upper and lower bound - // _range_result_columns.resize(2); - _range_result_columns.resize(_order_by_exprs_size); - // should change the order by exprs to range column, IF FE have support range window - for (size_t i = 0; i < _order_by_exprs_size; i++) { - // RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); - _range_result_columns[i] = _order_by_eq_expr_ctxs[i]->root()->data_type()->create_column(); + _range_result_columns.resize(_range_between_expr_ctxs.size()); + _range_between_expr_ctxs = p._range_between_expr_ctxs; + for (size_t i = 0; i < _range_between_expr_ctxs.size(); i++) { + RETURN_IF_ERROR(p._range_between_expr_ctxs[i]->clone(state, _range_between_expr_ctxs[i])); + _range_result_columns[i] = + _range_between_expr_ctxs[i]->root()->data_type()->create_column(); } _fn_place_ptr = _agg_arena_pool->aligned_alloc(p._total_size_of_aggregate_states, @@ -639,8 +639,8 @@ Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) _partition_by_eq_expr_ctxs)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.order_by_exprs, _order_by_eq_expr_ctxs)); - // RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.range_between_offset_exprs, - // _order_by_eq_expr_ctxs)); + RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(analytic_node.range_between_offset_exprs, + _order_by_eq_expr_ctxs)); return Status::OK(); } @@ -675,7 +675,12 @@ Status AnalyticSinkOperatorX::open(RuntimeState* state) { vectorized::VExpr::prepare(_order_by_eq_expr_ctxs, state, cmp_row_desc)); } } - + if (!_range_between_expr_ctxs.empty()) { + DCHECK(_range_between_expr_ctxs.size() == 2); + RETURN_IF_ERROR( + vectorized::VExpr::prepare(_range_between_expr_ctxs, state, _child->row_desc())); + } + RETURN_IF_ERROR(vectorized::VExpr::open(_range_between_expr_ctxs, state)); RETURN_IF_ERROR(vectorized::VExpr::open(_partition_by_eq_expr_ctxs, state)); RETURN_IF_ERROR(vectorized::VExpr::open(_order_by_eq_expr_ctxs, state)); for (size_t i = 0; i < _agg_functions_size; ++i) { @@ -754,7 +759,6 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, local_state._partition_by_columns[i].get(), block_rows)); } } - { SCOPED_TIMER(local_state._compute_order_by_timer); for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { @@ -763,14 +767,12 @@ Status AnalyticSinkOperatorX::_add_input_block(doris::RuntimeState* state, block_rows)); } } - { - SCOPED_TIMER(local_state._compute_order_by_function_timer); - // should change the order by exprs to range column, IF FE have support range window - for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { - RETURN_IF_ERROR(_insert_range_column(input_block, local_state._order_by_eq_expr_ctxs[i], - local_state._range_result_columns[i].get(), - block_rows)); + SCOPED_TIMER(local_state._compute_range_between_function_timer); + for (size_t i = 0; i < local_state._range_between_expr_ctxs.size(); ++i) { + RETURN_IF_ERROR( + _insert_range_column(input_block, local_state._range_between_expr_ctxs[i], + local_state._range_result_columns[i].get(), block_rows)); } } vectorized::Block::erase_useless_column(input_block, column_to_keep); diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index 1a6e14a3c7d530..5530a4f4de563e 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -108,6 +108,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; + vectorized::VExprContextSPtrs _range_between_expr_ctxs; std::vector> _agg_input_columns; std::vector _partition_by_columns; std::vector _order_by_columns; @@ -153,7 +154,7 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; + vectorized::VExprContextSPtrs _range_between_expr_ctxs; size_t _agg_functions_size = 0; std::vector _num_agg_input; diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 9e7c90908f4329..78cf4f83d1d882 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -1103,6 +1103,8 @@ struct TAnalyticNode { 9: optional Exprs.TExpr order_by_eq 10: optional bool is_colocate + + 11: optional list range_between_offset_exprs } struct TMergeNode {