Merge branch 'master' into scd-cache
eldenmoon authored Feb 25, 2025
2 parents b346813 + 2941d27 commit 7d7d9ba
Showing 1,578 changed files with 55,946 additions and 12,658 deletions.
4 changes: 1 addition & 3 deletions .asf.yaml
@@ -88,8 +88,7 @@ github:
- P0 Regression (Doris Regression)
- External Regression (Doris External Regression)
- cloud_p0 (Doris Cloud Regression)

required_pull_request_reviews:
required_pull_request_reviews:
require_code_owner_reviews: true
required_approving_review_count: 1
dismiss_stale_reviews: true
@@ -147,7 +146,6 @@ github:
required_approving_review_count: 1

collaborators:
- LemonLiTree
- Yukang-Lian
- TangSiyang2001
- freemandealer
1 change: 0 additions & 1 deletion .clang-tidy
@@ -25,7 +25,6 @@ Checks: |
-readability-named-parameter,
-readability-avoid-const-params-in-decls,
-readability-convert-member-functions-to-static,
portability-simd-intrinsics,
performance-type-promotion-in-math-fn,
performance-faster-string-find,
performance-inefficient-algorithm,
12 changes: 12 additions & 0 deletions be/CMakeLists.txt
@@ -92,6 +92,7 @@ message(STATUS "USE_MEM_TRACKER is ${USE_MEM_TRACKER}")
message(STATUS "USE_JEMALLOC is ${USE_JEMALLOC}")
message(STATUS "USE_UNWIND is ${USE_UNWIND}")
message(STATUS "ENABLE_PCH is ${ENABLE_PCH}")
message(STATUS "USE_AVX2 is ${USE_AVX2}")

# set CMAKE_BUILD_TYPE
if (NOT CMAKE_BUILD_TYPE)
@@ -225,6 +226,10 @@ install(DIRECTORY
${SRC_DIR}/clucene/src/contribs-lib/CLucene/analysis/jieba/dict
DESTINATION ${OUTPUT_DIR})

install(DIRECTORY
${BASE_DIR}/dict/icu/uax29
DESTINATION ${OUTPUT_DIR}/dict/icu)

# Check if functions are supported on this platform. All flags will be generated
# in gensrc/build/common/env_config.h.
# You can check functions here which depend on the platform. Don't forget to add this
@@ -328,13 +333,17 @@ endif()
# simd for architectures
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
add_compile_options(-msse4.2)
add_definitions(-DLIBDIVIDE_SSE2)
if (USE_AVX2)
add_compile_options(-mavx2)
add_definitions(-DUSE_AVX2)
add_definitions(-DLIBDIVIDE_AVX2)
endif()
endif()

if (ARCH_ARM)
add_compile_options(-march=armv8-a+crc)
add_definitions(-DLIBDIVIDE_NEON)
endif()
#

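The SIMD hunk above wires USE_AVX2 through to both the compiler (-mavx2) and the preprocessor (-DUSE_AVX2, -DLIBDIVIDE_AVX2), with -DLIBDIVIDE_NEON added for ARM builds. As a rough, hedged illustration of how such a define is typically consumed in a source file (this is not code from the commit; sum_i32 is a hypothetical helper):

#include <cstddef>
#include <cstdint>

#if defined(USE_AVX2)
#include <immintrin.h>
#endif

// Hypothetical helper: sum 32-bit integers, taking the AVX2 path only when
// the build enabled it (-mavx2 -DUSE_AVX2) and falling back to scalar code.
inline int64_t sum_i32(const int32_t* data, size_t n) {
#if defined(USE_AVX2)
    __m256i acc = _mm256_setzero_si256();
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
        acc = _mm256_add_epi32(acc, v);
    }
    alignas(32) int32_t lanes[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), acc);
    int64_t sum = 0;
    for (int32_t lane : lanes) sum += lane;
    for (; i < n; ++i) sum += data[i]; // scalar tail
    return sum;
#else
    int64_t sum = 0;
    for (size_t i = 0; i < n; ++i) sum += data[i];
    return sum;
#endif
}
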
@@ -536,6 +545,9 @@ if ((ARCH_AMD64 OR ARCH_AARCH64) AND OS_LINUX)
hadoop_hdfs
)
add_definitions(-DUSE_HADOOP_HDFS)
# USE_DORIS_HADOOP_HDFS means the hadoop deps come from doris-thirdparty.
# The hadoop deps from doris-thirdparty contain some modifications relative to the standard hadoop, such as the log interface.
add_definitions(-DUSE_DORIS_HADOOP_HDFS)
else()
add_library(hdfs3 STATIC IMPORTED)
set_target_properties(hdfs3 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libhdfs3.a)
4 changes: 4 additions & 0 deletions be/cmake/thirdparty.cmake
@@ -171,3 +171,7 @@ endif()
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
add_thirdparty(deflate)
endif()

add_thirdparty(icuuc LIB64)
add_thirdparty(icui18n LIB64)
add_thirdparty(icudata LIB64)
152 changes: 152 additions & 0 deletions be/dict/icu/uax29/Default.txt
@@ -0,0 +1,152 @@
# Character class definitions from TR 29

!!chain;
!!quoted_literals_only;


#
# Character Class Definitions.
#

$Han = [:Han:];

$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}-$Han];
$ZWJ = [\p{Word_Break = ZWJ}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$WSegSpace = [\p{Word_Break = WSegSpace}];
$Extended_Pict = [\p{Extended_Pictographic}];

$Hiragana = [:Hiragana:];
$Ideographic = [\p{Ideographic}];


# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.

$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$KanaKanji $HangulSyllable];
$dictionary = [$ComplexContext $dictionaryCJK];

# TODO: check if handling of katakana in dictionary makes rules incorrect/void

# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


## -------------------------------------------------

# Rule 3 - CR x LF
#
$CR $LF;

# Rule 3c Do not break within emoji zwj sequences.
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ $Extended_Pict;

# Rule 3d - Keep horizontal whitespace together.
#
$WSegSpace $WSegSpace;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text.

$ExFm = [$Extend $Format $ZWJ];

^$ExFm+; # This rule fires only when there are format or extend characters at the
# start of text, or immediately following another boundary. It groups them, in
# the event there are more than one.

[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
# with no special rule status value.

$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
$HangulSyllable {200};
$Hebrew_Letter $ExFm* {200};
$Katakana $ExFm* {300}; # note: these status values override those from rule 5
$Hiragana $ExFm* {300}; # by virtue of being numerically larger.
$Ideographic $ExFm* {400}; #

#
# rule 5
# Do not break between most letters.
#
($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);

# rule 6 and 7
($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};

# rule 7a
$Hebrew_Letter $ExFm* $Single_Quote {200};

# rule 7b and 7c
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;

# rule 8

$Numeric $ExFm* $Numeric;

# rule 9

($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;

# rule 10

$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);

# rule 11 and 12

$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;

# rule 13
# to be consistent with $KanaKanji $KanaKanji, changed
# from 300 to 400.
# See also TestRuleStatus in intltest/rbbiapts.cpp
$Katakana $ExFm* $Katakana {300};

# rule 13a/b

$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
$Katakana $ExFm* $ExtendNumLet {300}; # (13a)
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)

$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
$ExtendNumLet $ExFm* $Katakana {300}; # (13b)

# rules 15 - 17
# Pairs of Regional Indicators stay together.
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_Indicator $ExFm* $Regional_Indicator;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};

# Rule 999
# Match a single code point if no other rule applies.
.;
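
This file (and MyanmarSyllable.txt below) contains ICU RBBI word-break rules; the CMake changes earlier install them under dict/icu/uax29 and add the icuuc/icui18n/icudata libraries that compile them at runtime. The {100}/{200}/{300}/{400} tags are rule status values a tokenizer can read back to classify each token (numeric, alphabetic/Hangul, kana, ideographic). A minimal sketch, not Doris code, of how such a rule file is typically fed to ICU; the file path in the example is assumed:

#include <unicode/rbbi.h>
#include <unicode/unistr.h>
#include <unicode/utypes.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

// Minimal sketch (not code from this commit): compile a break iterator from a
// rule file such as dict/icu/uax29/Default.txt and walk word boundaries.
int main() {
    std::ifstream in("dict/icu/uax29/Default.txt"); // assumed install path
    std::stringstream buf;
    buf << in.rdbuf();
    icu::UnicodeString rules = icu::UnicodeString::fromUTF8(buf.str());

    UParseError parse_err;
    UErrorCode status = U_ZERO_ERROR;
    icu::RuleBasedBreakIterator it(rules, parse_err, status);
    if (U_FAILURE(status)) {
        std::cerr << "rule compile failed: " << u_errorName(status) << "\n";
        return 1;
    }

    icu::UnicodeString text = icu::UnicodeString::fromUTF8("ICU 分词 example 123");
    it.setText(text);

    int32_t start = it.first();
    for (int32_t end = it.next(); end != icu::BreakIterator::DONE;
         start = end, end = it.next()) {
        icu::UnicodeString token;
        text.extractBetween(start, end, token);
        std::string utf8;
        token.toUTF8String(utf8);
        // getRuleStatus() reports the {100}/{200}/{300}/{400} tag of the rule
        // that matched, which is how callers tell numbers, letters, kana and
        // ideographs apart.
        std::cout << utf8 << " -> status " << it.getRuleStatus() << "\n";
    }
    return 0;
}
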
35 changes: 35 additions & 0 deletions be/dict/icu/uax29/MyanmarSyllable.txt
@@ -0,0 +1,35 @@
# Parses Myanmar text, with syllable as token.

$Consonant = [:Indic_Syllabic_Category = Consonant:];
$ConsonantPlaceholder = [:Indic_Syllabic_Category = Consonant_Placeholder:];
$VowelIndependent = [:Indic_Syllabic_Category = Vowel_Independent:];
$Virama = [:Indic_Syllabic_Category = Invisible_Stacker:];
$Asat = [:Indic_Syllabic_Category = Pure_Killer:];
# for our purposes, $Cons means 'base'
$Cons = $Consonant | $ConsonantPlaceholder | $VowelIndependent;
$WordJoin = [:Line_Break=Word_Joiner:];

#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

$ConsEx = $Cons ($Extend | $Format)*;
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;

!!forward;
$MyanmarJoinedSyllableEx {200};

# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
18 changes: 11 additions & 7 deletions be/src/agent/workload_group_listener.cpp
@@ -17,6 +17,8 @@

#include "agent/workload_group_listener.h"

#include <thrift/protocol/TDebugProtocol.h>

#include "runtime/exec_env.h"
#include "runtime/workload_group/workload_group.h"
#include "runtime/workload_group/workload_group_manager.h"
@@ -33,6 +35,8 @@ void WorkloadGroupListener::handle_topic_info(const std::vector<TopicInfo>& topi
if (!topic_info.__isset.workload_group_info) {
continue;
}
VLOG_DEBUG << "Received publish workload group info request: "
<< apache::thrift::ThriftDebugString(topic_info).c_str();
is_set_workload_group_info = true;

// 1 parse topic info to group info
@@ -65,13 +69,13 @@ void WorkloadGroupListener::handle_topic_info(const std::vector<TopicInfo>& topi
// 5 upsert io throttle
wg->upsert_scan_io_throttle(&workload_group_info);

LOG(INFO) << "[topic_publish_wg]update workload group finish, wg info="
<< wg->debug_string() << ", enable_cpu_hard_limit="
<< (_exec_env->workload_group_mgr()->enable_cpu_hard_limit() ? "true" : "false")
<< ", cgroup cpu_shares=" << workload_group_info.cgroup_cpu_shares
<< ", cgroup cpu_hard_limit=" << workload_group_info.cgroup_cpu_hard_limit
<< ", cgroup home path=" << config::doris_cgroup_cpu_path
<< ", list size=" << list_size << ", thread info=" << wg->thread_debug_info();
VLOG_DEBUG << "[topic_publish_wg]update workload group finish, wg info="
<< wg->debug_string() << ", enable_cpu_hard_limit="
<< (_exec_env->workload_group_mgr()->enable_cpu_hard_limit() ? "true" : "false")
<< ", cgroup cpu_shares=" << workload_group_info.cgroup_cpu_shares
<< ", cgroup cpu_hard_limit=" << workload_group_info.cgroup_cpu_hard_limit
<< ", cgroup home path=" << config::doris_cgroup_cpu_path
<< ", list size=" << list_size << ", thread info=" << wg->thread_debug_info();
}

// NOTE(wb) when is_set_workload_group_info=false, it means FE sent an empty workload group list
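
The listener change above pulls in thrift/protocol/TDebugProtocol.h and demotes the per-update log from LOG(INFO) to VLOG_DEBUG, so the fairly expensive ThriftDebugString dump is only produced when verbose logging is switched on. A small hedged sketch of the same pattern using plain glog VLOG (log_request_verbose is a hypothetical helper, not part of this commit):

#include <glog/logging.h>
#include <thrift/protocol/TDebugProtocol.h>

#include <string>

// Hypothetical helper: dump any thrift-generated struct at a verbose log
// level. glog only evaluates the streamed expressions when VLOG_IS_ON(3),
// so ThriftDebugString is not called on the normal (non-verbose) path.
template <typename TThriftStruct>
void log_request_verbose(const std::string& what, const TThriftStruct& req) {
    VLOG(3) << what << ": " << apache::thrift::ThriftDebugString(req);
}
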
19 changes: 6 additions & 13 deletions be/src/cloud/cloud_base_compaction.cpp
@@ -38,12 +38,7 @@ bvar::Adder<uint64_t> base_output_size("base_compaction", "output_size");

CloudBaseCompaction::CloudBaseCompaction(CloudStorageEngine& engine, CloudTabletSPtr tablet)
: CloudCompactionMixin(engine, tablet,
"BaseCompaction:" + std::to_string(tablet->tablet_id())) {
auto uuid = UUIDGenerator::instance()->next_uuid();
std::stringstream ss;
ss << uuid;
_uuid = ss.str();
}
"BaseCompaction:" + std::to_string(tablet->tablet_id())) {}

CloudBaseCompaction::~CloudBaseCompaction() = default;

Expand Down Expand Up @@ -330,8 +325,7 @@ Status CloudBaseCompaction::modify_rowsets() {
DeleteBitmapPtr output_rowset_delete_bitmap = nullptr;
if (_tablet->keys_type() == KeysType::UNIQUE_KEYS &&
_tablet->enable_unique_key_merge_on_write()) {
int64_t initiator = HashUtil::hash64(_uuid.data(), _uuid.size(), 0) &
std::numeric_limits<int64_t>::max();
int64_t initiator = this->initiator();
RETURN_IF_ERROR(cloud_tablet()->calc_delete_bitmap_for_compaction(
_input_rowsets, _output_rowset, *_rowid_conversion, compaction_type(),
_stats.merged_rows, _stats.filtered_rows, initiator, output_rowset_delete_bitmap,
@@ -403,8 +397,8 @@ Status CloudBaseCompaction::modify_rowsets() {
return Status::OK();
}

void CloudBaseCompaction::garbage_collection() {
CloudCompactionMixin::garbage_collection();
Status CloudBaseCompaction::garbage_collection() {
RETURN_IF_ERROR(CloudCompactionMixin::garbage_collection());
cloud::TabletJobInfoPB job;
auto idx = job.mutable_idx();
idx->set_tablet_id(_tablet->tablet_id());
@@ -418,9 +412,7 @@ void CloudBaseCompaction::garbage_collection() {
compaction_job->set_type(cloud::TabletCompactionJobPB::BASE);
if (_tablet->keys_type() == KeysType::UNIQUE_KEYS &&
_tablet->enable_unique_key_merge_on_write()) {
int64_t initiator = HashUtil::hash64(_uuid.data(), _uuid.size(), 0) &
std::numeric_limits<int64_t>::max();
compaction_job->set_delete_bitmap_lock_initiator(initiator);
compaction_job->set_delete_bitmap_lock_initiator(this->initiator());
}
auto st = _engine.meta_mgr().abort_tablet_job(job);
if (!st.ok()) {
@@ -429,6 +421,7 @@ void CloudBaseCompaction::garbage_collection() {
.tag("tablet_id", _tablet->tablet_id())
.error(st);
}
return st;
}

void CloudBaseCompaction::do_lease() {
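
The compaction change above drops the per-class _uuid member and the duplicated HashUtil::hash64 computation in favour of a shared initiator() accessor on the mixin. A hedged reconstruction of what such a helper presumably looks like, based only on the removed lines; the hash function here is a self-contained stand-in, not Doris' HashUtil:

#include <cstdint>
#include <limits>
#include <string>

// Sketch only: centralize the "initiator" id used to take the delete-bitmap
// lock, instead of re-hashing _uuid in every compaction subclass.
class CloudCompactionMixinSketch {
public:
    int64_t initiator() const {
        // Masking with int64_t max clears the sign bit, mirroring the removed
        // HashUtil::hash64(_uuid.data(), _uuid.size(), 0) & numeric_limits<int64_t>::max().
        return static_cast<int64_t>(fnv1a_64(_uuid)) & std::numeric_limits<int64_t>::max();
    }

private:
    // Stand-in 64-bit hash (FNV-1a) so the sketch compiles on its own.
    static uint64_t fnv1a_64(const std::string& s) {
        uint64_t h = 1469598103934665603ULL;
        for (unsigned char c : s) {
            h ^= c;
            h *= 1099511628211ULL;
        }
        return h;
    }

    std::string _uuid = "generated-per-compaction"; // assumed to hold a UUID string
};
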
3 changes: 1 addition & 2 deletions be/src/cloud/cloud_base_compaction.h
@@ -42,15 +42,14 @@ class CloudBaseCompaction : public CloudCompactionMixin {

Status modify_rowsets() override;

void garbage_collection() override;
Status garbage_collection() override;

void _filter_input_rowset();

void build_basic_info();

ReaderType compaction_type() const override { return ReaderType::READER_BASE_COMPACTION; }

std::string _uuid;
int64_t _input_segments = 0;
int64_t _base_compaction_cnt = 0;
int64_t _cumulative_compaction_cnt = 0;
