Skip to content

Commit

Permalink
Merge pull request #42 from myscale/sync/myscaledb-oss
Browse files Browse the repository at this point in the history
MyscaleDB 1.7.1 Release
  • Loading branch information
feixue1121 authored Sep 10, 2024
2 parents 8ecffb4 + 869425e commit 7dabf35
Show file tree
Hide file tree
Showing 49 changed files with 2,442 additions and 964 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ The simplest way to use MyScaleDB is to create an instance on MyScale Cloud serv
To quickly get a MyScaleDB instance up and running, simply pull and run the latest Docker image:

```bash
docker run --name myscaledb --net=host myscale/myscaledb:1.6.4
docker run --name myscaledb --net=host myscale/myscaledb:1.7.1
```

>Note: MyScale's default configuration only allows access from localhost. When starting the container with `docker run`, you need to specify `--net=host` to access services deployed in Docker on the current node.
Expand Down Expand Up @@ -114,7 +114,7 @@ version: '3.7'

services:
myscaledb:
image: myscale/myscaledb:1.6.4
image: myscale/myscaledb:1.7.1
tty: true
ports:
- '8123:8123'
Expand Down
8 changes: 4 additions & 4 deletions cmake/autogenerated_myscale_versions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# NOTE: has nothing in common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(MYSCALE_VERSION_MAJOR 1)
SET(MYSCALE_VERSION_MINOR 6)
SET(MYSCALE_VERSION_PATCH 4)
SET(MYSCALE_VERSION_DESCRIBE myscale-v1.6.4)
SET(MYSCALE_VERSION_STRING 1.6.4)
SET(MYSCALE_VERSION_MINOR 7)
SET(MYSCALE_VERSION_PATCH 1)
SET(MYSCALE_VERSION_DESCRIBE myscale-v1.7.1)
SET(MYSCALE_VERSION_STRING 1.7.1)
# end of autochange

2 changes: 1 addition & 1 deletion contrib/search-index
2 changes: 1 addition & 1 deletion rust/supercrate/libs/tantivy_search
Submodule tantivy_search updated 41 files
+3 −3 include/tantivy_search_cbindgen.h
+99 −22 include/tantivy_search_cxx.h
+1 −7 src/common/constants.rs
+11 −28 src/common/errors.rs
+1 −1 src/common/mod.rs
+393 −204 src/common/test_util.rs
+97 −62 src/index/api/api_index.rs
+6 −1 src/index/api/mod.rs
+1 −1 src/index/bridge/index_writer_bridge_cache.rs
+5 −2 src/index/bridge/mod.rs
+2 −2 src/index/implements/index_manager.rs
+68 −30 src/lib.rs
+5 −5 src/logger/ffi_logger.rs
+66 −46 src/search/api/api_clickhouse.rs
+39 −20 src/search/api/api_common.rs
+80 −48 src/search/api/api_myscale.rs
+23 −3 src/search/api/mod.rs
+81 −2 src/search/collector/top_dos_with_bitmap_collector.rs
+1 −1 src/search/implements/api_clickhouse/api_query_sentence_bitmap.rs
+1 −1 src/search/implements/api_clickhouse/api_query_term_bitmap.rs
+1 −1 src/search/implements/api_clickhouse/api_query_terms_bitmap.rs
+1 −1 src/search/implements/api_clickhouse/api_regex_term_bitmap.rs
+70 −22 src/search/implements/api_myscale/api_bm25_nlq_search.rs
+91 −0 src/search/implements/api_myscale/api_bm25_standard_search.rs
+5 −7 src/search/implements/api_myscale/api_get_total_num_tokens.rs
+6 −11 src/search/implements/api_myscale/bm25_inner_search.rs
+0 −1 src/search/implements/mod.rs
+18 −19 src/search/implements/strategy/bm25_standard_query.rs
+1 −1 src/search/implements/strategy/regex_query.rs
+0 −149 src/search/implements/tests/api_clickhouse_test.rs
+0 −62 src/search/implements/tests/api_common_test.rs
+0 −365 src/search/implements/tests/api_myscale_test.rs
+0 −3 src/search/implements/tests/mod.rs
+0 −32 src/search/utils/index_searcher_utils.rs
+3 −2 src/search/utils/mod.rs
+9 −9 src/tokenizer/parser.rs
+94 −0 src/utils/api_utils.rs
+19 −8 src/utils/ffi_utils.rs
+0 −3 src/utils/index_utils.rs
+0 −104 src/utils/index_utils_dep.rs
+3 −1 src/utils/mod.rs
1 change: 1 addition & 0 deletions src/Core/Settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,7 @@ class IColumn;
M(UInt64, hybrid_search_top_k_multiple_base, 3, "Default multiple base on top k for num_candidates in hybrid search", 0) \
M(Bool, optimize_prefilter_in_search, true, "Enable prewhere optimization for vector or text search if some partition columns in prewhere condition.", 0) \
M(UInt64, max_search_result_window, 10000, "The maximum value of n + m in limit clause for pagination in vector/text/hybrid search", 0) \
M(Bool, dfs_query_then_fetch, false, "Enable Distributed Frequency Search (DFS) query to gather global statistical info for accurate BM25 calculation.", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

Expand Down
3 changes: 2 additions & 1 deletion src/Databases/DatabasesCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n
}
// clean stores
// TODO: TantivyIndexStoreFactory needs refining; its remove func only handles a data part's relative path.
TantivyIndexStoreFactory::instance().remove(table_relative_path);
auto index_names = res->getInMemoryMetadataPtr()->getSecondaryIndices().getAllRegisteredNames();
TantivyIndexStoreFactory::instance().remove(table_relative_path, index_names);
}
}
#endif
Expand Down
15 changes: 7 additions & 8 deletions src/Interpreters/ExpressionAnalyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,9 @@ void ExpressionAnalyzer::analyzeTextSearch(ActionsDAGPtr & temp_actions)
}
}

/// text search cannot be performed when no fts index exists
if (has_text_search)
/// Text search cannot be performed when no fts index exists
/// Skip the fts index check when table is distributed.
if (!syntax->is_remote_storage && has_text_search)
checkTantivyIndex(syntax->storage_snapshot, text_search_info->text_column_name);
}

Expand All @@ -615,8 +616,8 @@ void ExpressionAnalyzer::analyzeHybridSearch(ActionsDAGPtr & temp_actions)

if (has_hybrid_search && hybrid_search_info)
{
/// check fts index
if (hybrid_search_info->text_search_info)
/// Skip the fts index check when table is distributed.
if (!syntax->is_remote_storage && hybrid_search_info->text_search_info)
checkTantivyIndex(syntax->storage_snapshot, hybrid_search_info->text_search_info->text_column_name);

/// Get vector search type and dim from metadata, check parameters in vector scan and add to vector_parameters
Expand Down Expand Up @@ -1095,7 +1096,7 @@ TextSearchInfoPtr ExpressionAnalyzer::commonMakeTextSearchInfo(
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter {} for TextSearch", param_key);
}

return std::make_shared<TextSearchInfo>(text_column_name, query_text_value, function_col_name, topk, syntax->direction, text_operator, enable_natural_language_query);
return std::make_shared<TextSearchInfo>(text_column_name, query_text_value, function_col_name, topk, text_operator, enable_natural_language_query);
}

bool ExpressionAnalyzer::makeTextSearchInfo(ActionsDAGPtr & actions)
Expand All @@ -1110,7 +1111,6 @@ bool ExpressionAnalyzer::makeTextSearchInfo(ActionsDAGPtr & actions)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong argument number in TextSearch function: expected 2, got {}", arguments.size());
}


Array parameters = (node->parameters) ? getAggregateFunctionParametersArray(node->parameters, "", getContext()) : Array();

/// Only need actions for the second argument, the first argument is used for search index.
Expand Down Expand Up @@ -1253,11 +1253,10 @@ bool ExpressionAnalyzer::makeHybridSearchInfo(ActionsDAGPtr & actions)
"Wrong HybridSearch parameter for Relative Score Fusion(RSF), valid value is in interval [0.0f, 1.0f]");
}

auto metric = Search::getMetricType(syntax->vector_scan_metric_type, vector_scan_descriptions[0].vector_search_type);
hybrid_search_info = std::make_shared<HybridSearchInfo>(
std::make_shared<VectorScanInfo>(vector_scan_descriptions),
tmp_text_search_info,
function_column_name, static_cast<int>(syntax->limit_length), hybrid_fusion_type, hybrid_fusion_weight, metric);
function_column_name, static_cast<int>(syntax->limit_length), hybrid_fusion_type, hybrid_fusion_weight);
}
else if (isRankFusion(hybrid_fusion_type))
{
Expand Down
37 changes: 36 additions & 1 deletion src/Interpreters/InterpreterSelectQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@
#include "config_version.h"
#include <Interpreters/Context.h>

#if USE_TANTIVY_SEARCH
# include <VectorIndex/Utils/CommonUtils.h>
#endif

namespace DB
{
Expand Down Expand Up @@ -641,14 +644,29 @@ InterpreterSelectQuery::InterpreterSelectQuery(
current_info.query = query_ptr;
current_info.syntax_analyzer_result = syntax_analyzer_result;

/// Support full text search table function
NameSet table_columns;
bool from_table_function = false;
if (storage->getName() == "FullTextSearch")
{
from_table_function = true;
table_columns = {collections::map<std::unordered_set>(
metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })};
table_columns.erase(SCORE_COLUMN_NAME);

current_info.has_hybrid_search = true;
}
else if (syntax_analyzer_result && !syntax_analyzer_result->hybrid_search_funcs.empty())
current_info.has_hybrid_search = true;

Names queried_columns = syntax_analyzer_result->requiredSourceColumns();
const auto & supported_prewhere_columns = storage->supportedPrewhereColumns();

MergeTreeWhereOptimizer where_optimizer{
std::move(column_compressed_sizes),
metadata_snapshot,
queried_columns,
supported_prewhere_columns,
from_table_function ? table_columns : supported_prewhere_columns,
log};

where_optimizer.optimize(current_info, context);
Expand Down Expand Up @@ -762,6 +780,23 @@ InterpreterSelectQuery::InterpreterSelectQuery(

analyze(shouldMoveToPrewhere());

#if USE_TANTIVY_SEARCH
if (!options.only_analyze && storage && query_analyzer->getAnalyzedData().text_search_info && context->getSettingsRef().dfs_query_then_fetch)
{
/// Collect global statistics information of all shards used in BM25 calculation when text search is distributed
if (auto distributed_storage = std::dynamic_pointer_cast<StorageDistributed>(storage))
{
collectStatisticForBM25Calculation(
context,
distributed_storage->getClusterName(),
distributed_storage->getRemoteDatabaseName(),
distributed_storage->getRemoteTableName(),
query_analyzer->getAnalyzedData().text_search_info->text_column_name,
query_analyzer->getAnalyzedData().text_search_info->query_text);
}
}
#endif

bool need_analyze_again = false;
bool can_analyze_again = false;
if (context->hasQueryContext())
Expand Down
Loading

0 comments on commit 7dabf35

Please sign in to comment.