From bfa0f59cd9054603da5158b604c2395a69b5c6cd Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 23 Sep 2024 22:51:26 +0800 Subject: [PATCH 1/4] tmp --- .../update/update-of-unique-model.md | 268 +++++++++++++++++- 1 file changed, 267 insertions(+), 1 deletion(-) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md index 19f1725a7c17d..072ccf3f2b2a4 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md @@ -153,4 +153,270 @@ INSERT INTO order_tbl (order_id, order_status) values (1,'待发货'); 目前,同一批次数据写入任务(无论是导入任务还是`INSERT INTO`)的所有行只能更新相同的列,如果需要更新不同列的数据,则需要分不同的批次进行写入。 -在未来版本中,将支持灵活的列更新,用户可以在同一批导入中,每一行更新不同的列。 \ No newline at end of file + +## 灵活列更新 + +在 x.x.x 版本之前,doris 支持的部分列更新功能限制了一次导入中每一行必须更新相同的列,从 x.x.x 版本开始,doris 支持一种更加灵活的更新方式,它使得一次导入中的每一行可以更新不同的列。 + +:::caution +注意 + +1. 灵活列更新这一功能从 x.x.x 版本开始支持 +2. 目前只有 stream load 这一种导入方式以及使用 stream load 作为其导入方式的工具(如 doris-flink-connector)支持灵活列更新功能 +3. 
在使用灵活列更新时导入文件必须为 json 格式的数据 +::: + +### 适用场景 + +在使用 CDC 的方式将某个数据系统的数据实时同步到 Doris 中时,源端系统输出的记录可能并不是完整的行数据,而是只有主键和被更新的列的数据。在这种情况下,某个时间窗口内的一批数据中每一行更新的列可能都是不同的。此时,可以使用灵活列更新的方式来将数据导入到 Doris 中。 + +### 使用方式 + +**存量表开启灵活列更新功能** + +对于在旧版本 Doris 中已经建好的存量 Merge-On-Write 表,在升级 Doris 之后如果想要使用灵活列更新的功能,可以使用 `ALTER TALBE db1.tbl1 ENABLE FEATURE "UPDATE_FLEXIBLE_COLUMNS";` 来开启这一功能。执行完上述语句后使用 `show create table db1.tbl1` 的结果中如果包含 `"enable_unique_key_skip_bitmap_column" = "true"` 则表示功能开启成功。注意,使用这一方式之前需要确保目标表已经开启了 light-schema-change 的功能。 + +**新建表使用灵活列更新功能** + +对于新建的表,如果需要使用灵活列更新功能,建表时需要指定如下表属性,以开启 Merge-on-Write 实现,开启 light-schema-change,同时使得表具有灵活列更新所需要的 `bitmap` 隐藏列。(这3个表属性的默认值均为 `true`) + +```Plain +"enable_unique_key_merge_on_write" = "true" +"enable_unique_key_skip_bitmap_column" = "true" +"enable_light_schema_change" = "true" +``` + +**StreamLoad** + +在使用 Stream Load 导入时添加如下 header + +```Plain +unique_key_update_mode:UPDATE_FLEXIBLE_COLUMNS +``` + +**Flink Doris Connector** + +如果使用 Flink Doris Connector, 需要添加如下配置: + +```Plain +'sink.properties.unique_key_update_mode' = 'UPDATE_FLEXIBLE_COLUMNS' +``` + +### 示例 + +假设有如下表 + +```sql +CREATE TABLE t1 ( + `k` int(11) NULL, + `v1` BIGINT NULL, + `v2` BIGINT NULL DEFAULT "9876", + `v3` BIGINT NOT NULL, + `v4` BIGINT NOT NULL DEFAULT "1234", + `v5` BIGINT NULL +) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES( +"replication_num" = "3", +"enable_unique_key_merge_on_write" = "true", +"enable_light_schema_change" = "true", +"enable_unique_key_skip_bitmap_column" = "true"); +``` + +表中有如下原始数据 + +```sql +MySQL root@127.1:d1> select * from t1; ++---+----+----+----+----+----+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+----+----+----+----+----+ +| 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 1 | +| 2 | 2 | 2 | 2 | 2 | 2 | +| 3 | 3 | 3 | 3 | 3 | 3 | +| 4 | 4 | 4 | 4 | 4 | 4 | +| 5 | 5 | 5 | 5 | 5 | 5 | ++---+----+----+----+----+----+ +``` + +现在通过灵活列更新导入来更新其中的一些行的字段 + +```shell +$ cat test1.json +``` +```json +{"k": 0, 
"__DORIS_DELETE_SIGN__": 1} +{"k": 1, "v1": 10} +{"k": 2, "v2": 20, "v5": 25} +{"k": 3, "v3": 30} +{"k": 4, "v4": 20, "v1": 43, "v3": 99} +{"k": 5, "v5": null} +{"k": 6, "v1": 999, "v3": 777} +{"k": 2, "v4": 222} +{"k": 1, "v2": 111, "v3": 111} +``` +```shell +curl --location-trusted -u root: \ +-H "strict_mode:false" \ +-H "format:json" \ +-H "read_json_by_line:true" \ +-H "unique_key_update_mode:UPDATE_FLEXIBLE_COLUMNS" \ +-T test1.json \ +-XPUT http://:/api/d1/t1/_stream_load +``` + +更新后表中的数据如下: + +```sql +MySQL root@127.1:d1> select * from t1; ++---+-----+------+-----+------+--------+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+-----+------+-----+------+--------+ +| 1 | 10 | 111 | 111 | 1 | 1 | +| 2 | 2 | 20 | 2 | 222 | 25 | +| 3 | 3 | 3 | 30 | 3 | 3 | +| 4 | 43 | 4 | 99 | 20 | 4 | +| 5 | 5 | 5 | 5 | 5 | | +| 6 | 999 | 9876 | 777 | 1234 | | ++---+-----+------+-----+------+--------+ +``` + +### 限制与注意事项 + +1. 和之前的部分列更新相同,灵活列更新要求导入的每一行数据需要包括所有的 Key 列,不满足这一要求的行数据将被过滤掉,同时计入 `filter rows` 的计数中,如果 `filtered rows` 的数量超过了本次导入 `max_filter_ratio` 所能容忍的上限,则整个导入将会失败。同时,被过滤的数据会在 error log 留下一条日志。 + +2. 灵活列更新导入中每一个 json 对象中的键值对只有当它的 Key 和目标表中某一列的列名一致时才是有效的,不满足这一要求的键值对将被忽略 。同时,Key 为 `__DORIS_VERSION_COL__`/`__DORIS_ROW_STORE_COL__`/`__DORIS_SKIP_BITMAP_COL__` 的键值对也将被忽略。 + +3. 
当目标表的表属性中设置了 `function_column.sequence_type` 这一属性时,灵活列更新的导入可以通过在 json 对象中包括 Key 为 `__DORIS_SEQUENCE_COL__` 的键值对来指定目标表中 `__DORIS_SEQUENCE_COL__` 列的值。对于不指定 `__DORIS_SEQUENCE_COL__` 列的值的行,如果这一行的 Key 在原表中存在,则这一行 `__DORIS_SEQUENCE_COL__` 列的值将被填充为旧行中对应的值,否则该列的值将被填充为 `null` 值 + +例如,对于下表: +```sql +CREATE TABLE t2 ( + `k` int(11) NULL, + `v1` BIGINT NULL, + `v2` BIGINT NULL DEFAULT "9876", + `v3` BIGINT NOT NULL, + `v4` BIGINT NOT NULL DEFAULT "1234", + `v5` BIGINT NULL +) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES( +"replication_num" = "3", +"enable_unique_key_merge_on_write" = "true", +"enable_light_schema_change" = "true", +"enable_unique_key_skip_bitmap_column" = "true", +"function_column.sequence_type" = "int"); +``` + +表中有如下原始数据: +```sql ++---+----+----+----+----+----+----------------------+ +| k | v1 | v2 | v3 | v4 | v5 |__DORIS_SEQUENCE_COL__| ++---+----+----+----+----+----+----------------------+ +| 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 10 | 10 | +| 2 | 2 | 2 | 2 | 2 | 20 | 20 | +| 3 | 3 | 3 | 3 | 3 | 30 | 30 | +| 4 | 4 | 4 | 4 | 4 | 40 | 40 | +| 5 | 5 | 5 | 5 | 5 | 50 | 50 | ++---+----+----+----+----+----+----------------------+ +``` + +通过灵活列更新导入如下数据: +```json +{"k": 1, "v1": 111, "v5": 9, "__DORIS_SEQUENCE_COL__": 9} +{"k": 2, "v2": 222, "v5": 25, "__DORIS_SEQUENCE_COL__": 25} +{"k": 3, "v3": 333} +{"k": 4, "v4": 444, "v5": 50, "v1": 411, "v3": 433, "v2": null, "__DORIS_SEQUENCE_COL__": 50} +{"k": 5, "v5": null, "__DORIS_SEQUENCE_COL__": null} +{"k": 6, "v1": 611, "v3": 633} +{"k": 7, "v3": 733, "v5": 300, "__DORIS_SEQUENCE_COL__": 300} +``` + +最终表中的数据如下: +```sql ++---+--------+--------+-----+------+--------+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+--------+--------+-----+------+--------+ +| 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 1 | +| 5 | 5 | 5 | 5 | 5 | 5 | +| 2 | 2 | 222 | 2 | 2 | 25 | +| 3 | 3 | 3 | 333 | 3 | 3 | +| 4 | 411 | | 433 | 444 | 50 | +| 6 | 611 | 9876 | 633 | 1234 | | +| 7 | | 9876 | 733 | 1234 | 300 | 
++---+--------+--------+-----+------+--------+ +``` + +4. 当目标表的表属性中设置了 `function_column.sequence_col` 这一属性时,灵活列更新导入数据的 json 对象中 Key 为 `__DORIS_SEQUENCE_COL__` 的键值对将被忽略,导入中某一行 `__DORIS_SEQUENCE_COL__` 列的值将与这一行中表属性 `function_column.sequence_col` 所指定的列最终的值完全一致。 + +例如,对于下表: +```sql +CREATE TABLE t3 ( + `k` int(11) NULL, + `v1` BIGINT NULL, + `v2` BIGINT NULL DEFAULT "9876", + `v3` BIGINT NOT NULL, + `v4` BIGINT NOT NULL DEFAULT "1234", + `v5` BIGINT NULL DEFAULT "31" +) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES( +"replication_num" = "3", +"enable_unique_key_merge_on_write" = "true", +"enable_light_schema_change" = "true", +"enable_unique_key_skip_bitmap_column" = "true", +"function_column.sequence_col" = "v5"); +``` + +表中有如下原始数据: +```sql ++---+----+----+----+----+----+----------------------+ +| k | v1 | v2 | v3 | v4 | v5 |__DORIS_SEQUENCE_COL__| ++---+----+----+----+----+----+----------------------+ +| 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 10 | 10 | +| 2 | 2 | 2 | 2 | 2 | 20 | 20 | +| 3 | 3 | 3 | 3 | 3 | 30 | 30 | +| 4 | 4 | 4 | 4 | 4 | 40 | 40 | +| 5 | 5 | 5 | 5 | 5 | 50 | 50 | ++---+----+----+----+----+----+----------------------+ +``` + +通过灵活列更新导入如下数据: +```json +{"k": 1, "v1": 111, "v5": 9} +{"k": 2, "v2": 222, "v5": 25} +{"k": 3, "v3": 333} +{"k": 4, "v4": 444, "v5": 50, "v1": 411, "v3": 433, "v2": null} +{"k": 5, "v5": null} +{"k": 6, "v1": 611, "v3": 633} +{"k": 7, "v3": 733, "v5": 300} +``` + +最终表中的数据如下: +```sql ++---+--------+--------+-----+------+-----+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+--------+--------+-----+------+-----+ +| 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 10 | +| 5 | 5 | 5 | 5 | 5 | 50 | +| 2 | 2 | 222 | 2 | 2 | 25 | +| 3 | 3 | 3 | 333 | 3 | 30 | +| 4 | 411 | | 433 | 444 | 50 | +| 6 | 611 | 9876 | 633 | 1234 | 31 | +| 7 | | 9876 | 733 | 1234 | 300 | ++---+--------+--------+-----+------+-----+ +``` + +5. 
使用灵活列更新时不能指定或开启如下一些导入参数:
 - 不能指定 `merge_type` 参数
 - 不能指定 `delete` 参数
 - 不能开启 `fuzzy_parse` 参数
 - 不能指定 `columns` 参数
 - 不能指定 `jsonpaths` 参数
 - 不能指定 `hidden_columns` 参数
 - 不能指定 `function_column.sequence_col` 参数

6. 不支持在有 Variant 列的表上进行灵活列更新。

7. 不支持在有同步物化视图的表上进行灵活列更新。

From 21d7beca1f4978bff6e2e5cf2ba8c7f5f578578c Mon Sep 17 00:00:00 2001
From: bobhan1 
Date: Thu, 26 Sep 2024 18:28:25 +0800
Subject: [PATCH 2/4] update docs

---
 .../data-operate/import/import-way/stream-load-manual.md | 5 +++--
 .../current/data-operate/update/update-of-unique-model.md | 6 +++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/import/import-way/stream-load-manual.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/import/import-way/stream-load-manual.md
index 218cfb793e3ec..e390745e94d85 100644
--- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/import/import-way/stream-load-manual.md
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/import/import-way/stream-load-manual.md
@@ -341,6 +341,7 @@ Stream Load 操作支持 HTTP 分块导入(HTTP chunked)与 HTTP 非分块
 | enclose | 指定包围符。当 CSV 数据字段中含有行分隔符或列分隔符时,为防止意外截断,可指定单字节字符作为包围符起到保护作用。例如列分隔符为 ",",包围符为 "'",数据为 "a,'b,c'",则 "b,c" 会被解析为一个字段。注意:当 enclose 设置为`"`时,trim_double_quotes 一定要设置为 true。 |
 | escape | 指定转义符。用于转义在字段中出现的与包围符相同的字符。例如数据为 "a,'b,'c'",包围符为 "'",希望 "b,'c 被作为一个字段解析,则需要指定单字节转义符,例如"\",将数据修改为 "a,'b,\'c'"。 |
 | memtable_on_sink_node | 导入数据的时候是否开启 MemTable 前移,默认为 false。 |
+| unique_key_update_mode | Unique 表上的更新模式,目前仅对 Merge-On-Write Unique 表有效,一共支持三种类型 `UPSERT`, `UPDATE_FIXED_COLUMNS`, `UPDATE_FLEXIBLE_COLUMNS`。 `UPSERT`: 表示以 upsert 语义导入数据; `UPDATE_FIXED_COLUMNS`: 表示以[部分列更新](../../../data-operate/update/update-of-unique-model)的方式导入数据; `UPDATE_FLEXIBLE_COLUMNS`: 表示以[灵活部分列更新](../../../data-operate/update/update-of-unique-model)的方式导入数据|

 ### 导入返回值

@@ -1070,9 +1071,9 @@ Doris 可以在导入语句中支持非常丰富的列转换和过滤操作。

 `strict_mode` 
属性用于设置导入任务是否运行在严格模式下。该属性会对列映射、转换和过滤的结果产生影响,它同时也将控制部分列更新的行为。关于严格模式的具体说明,可参阅 [严格模式](../../../data-operate/import/load-strict-mode) 文档。 -### 导入时进行部分列更新 +### 导入时进行部分列更新/灵活部分列更新 -关于导入时,如何表达部分列更新,可以参考 [数据操作/数据更新](../../../data-operate/update/unique-update) 文档 +关于导入时,如何表达部分列更新,可以参考 [数据更新/主键模型的导入更新](../../../data-operate/update/update-of-unique-model) 文档 ## 更多帮助 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md index 072ccf3f2b2a4..7e46af4c56d4b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md @@ -154,7 +154,7 @@ INSERT INTO order_tbl (order_id, order_status) values (1,'待发货'); 目前,同一批次数据写入任务(无论是导入任务还是`INSERT INTO`)的所有行只能更新相同的列,如果需要更新不同列的数据,则需要分不同的批次进行写入。 -## 灵活列更新 +## 灵活部分列更新 在 x.x.x 版本之前,doris 支持的部分列更新功能限制了一次导入中每一行必须更新相同的列,从 x.x.x 版本开始,doris 支持一种更加灵活的更新方式,它使得一次导入中的每一行可以更新不同的列。 @@ -416,6 +416,10 @@ PROPERTIES( - 不能指定 `jsonpaths` 参数 - 不能指定 `hidden_columns` 参数 - 不能指定 `function_column.sequence_col` 参数 + - 不能指定 `sql` 参数 + - 不能开启 `memtable_on_sink_node` 前移 + - 不能指定 `group_commit` 参数 + - 不能指定 `where` 参数 6. 
不支持在有 Variant 列的表上进行灵活列更新。 From b7cf383a90c923925644c3cee4271703d8b2d681 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 26 Sep 2024 19:59:07 +0800 Subject: [PATCH 3/4] add English doc --- .../import/import-way/stream-load-manual.md | 3 +- .../update/update-of-unique-model.md | 279 +++++++++++++++++- 2 files changed, 280 insertions(+), 2 deletions(-) diff --git a/docs/data-operate/import/import-way/stream-load-manual.md b/docs/data-operate/import/import-way/stream-load-manual.md index a5fc94fcb3bc3..d5769a4293abf 100644 --- a/docs/data-operate/import/import-way/stream-load-manual.md +++ b/docs/data-operate/import/import-way/stream-load-manual.md @@ -336,6 +336,7 @@ Determines whether to enable the Pipeline engine to execute Streamload tasks. Se | enclose | Specify the enclosure character. When a CSV data field contains a row delimiter or column delimiter, to prevent unexpected truncation, you can specify a single-byte character as the enclosure for protection. For example, if the column delimiter is "," and the enclosure is "'", the data "a,'b,c'" will have "b,c" parsed as a single field. Note: When the enclosure is set to a double quote ("), make sure to set `trim_double_quotes` to true. | | escape | Specify the escape character. It is used to escape characters that are the same as the enclosure character within a field. For example, if the data is "a,'b,'c'", and the enclosure is "'", and you want "b,'c" to be parsed as a single field, you need to specify a single-byte escape character, such as "", and modify the data to "a,'b','c'". | | memtable_on_sink_node | Whether to enable MemTable on DataSink node when loading data, default is false. | +|unique_key_update_mode | The update modes on Unique tables, currently are only effective for Merge-On-Write Unique tables. Supporting three types: `UPSERT`, `UPDATE_FIXED_COLUMNS`, and `UPDATE_FLEXIBLE_COLUMNS`. 
`UPSERT`: Indicates that data is loaded with upsert semantics; `UPDATE_FIXED_COLUMNS`: Indicates that data is loaded through partial updates; `UPDATE_FLEXIBLE_COLUMNS`: Indicates that data is loaded through flexible partial updates.| ### Load return value @@ -1064,7 +1065,7 @@ Doris supports a very rich set of column transformations and filtering operation The strict_mode attribute is used to set whether the import task runs in strict mode. This attribute affects the results of column mapping, transformation, and filtering, and it also controls the behavior of partial column updates. For specific instructions on strict mode, please refer to the [Strict Mode](../../../data-operate/import/load-strict-mode) documentation. -### Perform partial column updates during import +### Perform partial column updates/flexible partial update during import For how to express partial column updates during import, please refer to the Data Manipulation/Data Update documentation. diff --git a/docs/data-operate/update/update-of-unique-model.md b/docs/data-operate/update/update-of-unique-model.md index d5090219e20a7..8e5f721682edd 100644 --- a/docs/data-operate/update/update-of-unique-model.md +++ b/docs/data-operate/update/update-of-unique-model.md @@ -155,4 +155,281 @@ Suggestions for improving load performance: Now, all rows in a batch write task (whether it is an load task or `INSERT INTO`) can only update the same columns. If you need to update different columns, you will need to perform separate batch writes. -In the future, flexible column updates will be supported, allowing users to update different columns for each row within the same batch load. \ No newline at end of file +## Flexible Partial Column Updates + +Before version x.x.x, Doris's partial update feature required that every row in an import update the same columns. Starting from version x.x.x, Doris supports a more flexible partial update method that allows each row in a single import to update different columns. 
+ +:::caution +Note + +1. The flexible partial update feature is supported since version x.x.x. +2. Currently, only the Stream Load import method and tools using Stream Load (e.g. Doris-Flink-Connector) support this feature. +3. The import file must be in JSON format when using flexible column updates. +::: + +### Applicable Scenarios + +When using CDC to synchronize data from a database system to Doris in real-time, the records output by the source system may not contain complete row data, but only the values of the primary keys and the updated columns. In such cases, the columns updated in a batch of data within a time window may differ. Flexible column updates can be used to import data into Doris. + +### Usage + +**Enabling Flexible Column Updates for Existing Tables** + +For existing Merge-On-Write tables created in old versions of Doris, after upgrading, you can enable flexible partial updates using the command: `ALTER TABLE db1.tbl1 ENABLE FEATURE "UPDATE_FLEXIBLE_COLUMNS";`. After executing this command, if the result of `show create table db1.tbl1` includes `"enable_unique_key_skip_bitmap_column" = "true"`, the feature has been successfully enabled. Ensure that the target table has the light-schema-change feature enabled beforehand. 
+ +**Using Flexible Column Updates for New Tables** + +For new tables, to use the flexible column update feature, specify the following table properties when creating the table to enable Merge-on-Write, light-schema-change, and include the required hidden bitmap column for flexible column updates (the default value for these properties is `true`): + +```Plain +"enable_unique_key_merge_on_write" = "true" +"enable_unique_key_skip_bitmap_column" = "true" +"enable_light_schema_change" = "true" +``` + +**StreamLoad** + +When using Stream Load, add the following header: + +```Plain +unique_key_update_mode:UPDATE_FLEXIBLE_COLUMNS +``` + +**Flink Doris Connector** + +If using the Flink Doris Connector, add the following configuration: + +```Plain +'sink.properties.unique_key_update_mode' = 'UPDATE_FLEXIBLE_COLUMNS' +``` + +### Example + +Assuming the following table: + +```sql +CREATE TABLE t1 ( + `k` int(11) NULL, + `v1` BIGINT NULL, + `v2` BIGINT NULL DEFAULT "9876", + `v3` BIGINT NOT NULL, + `v4` BIGINT NOT NULL DEFAULT "1234", + `v5` BIGINT NULL +) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES( +"replication_num" = "3", +"enable_unique_key_merge_on_write" = "true", +"enable_light_schema_change" = "true", +"enable_unique_key_skip_bitmap_column" = "true"); +``` + +The original data in the table is: + +```sql +MySQL root@127.1:d1> select * from t1; ++---+----+----+----+----+----+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+----+----+----+----+----+ +| 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 1 | +| 2 | 2 | 2 | 2 | 2 | 2 | +| 3 | 3 | 3 | 3 | 3 | 3 | +| 4 | 4 | 4 | 4 | 4 | 4 | +| 5 | 5 | 5 | 5 | 5 | 5 | ++---+----+----+----+----+----+ +``` + +Now, updating some fields using flexible column updates: + +```shell +$ cat test1.json +``` +```json +{"k": 0, "__DORIS_DELETE_SIGN__": 1} +{"k": 1, "v1": 10} +{"k": 2, "v2": 20, "v5": 25} +{"k": 3, "v3": 30} +{"k": 4, "v4": 20, "v1": 43, "v3": 99} +{"k": 5, "v5": null} +{"k": 6, "v1": 999, "v3": 777} +{"k": 2, "v4": 
222} +{"k": 1, "v2": 111, "v3": 111} +``` +```shell +curl --location-trusted -u root: \ +-H "strict_mode:false" \ +-H "format:json" \ +-H "read_json_by_line:true" \ +-H "unique_key_update_mode:UPDATE_FLEXIBLE_COLUMNS" \ +-T test1.json \ +-XPUT http://:/api/d1/t1/_stream_load +``` + +After the update, the data in the table is: + +```sql +MySQL root@127.1:d1> select * from t1; ++---+-----+------+-----+------+--------+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+-----+------+-----+------+--------+ +| 1 | 10 | 111 | 111 | 1 | 1 | +| 2 | 2 | 20 | 2 | 222 | 25 | +| 3 | 3 | 3 | 30 | 3 | 3 | +| 4 | 43 | 4 | 99 | 20 | 4 | +| 5 | 5 | 5 | 5 | 5 | | +| 6 | 999 | 9876 | 777 | 1234 | | ++---+-----+------+-----+------+--------+ +``` + +### Limitations and Considerations + +1. Similar to previous partial updates, flexible column updates require that each row of imported data include all key columns. Rows not meeting this requirement will be filtered out and counted in filter rows. If the number of filtered rows exceeds the `max_filter_ratio` threshold for this import, the entire import will fail, and filtered data will generate an error log. + +2. In flexible partial update loads, key-value pairs in each JSON object are only valid if the key matches a column name in the target table. Key-value pairs that do not meet this requirement will be ignored. Pairs with keys `__DORIS_VERSION_COL__`, `__DORIS_ROW_STORE_COL__`, or `__DORIS_SKIP_BITMAP_COL__` will also be ignored. + +3. If the table properties of the target table include `function_column.sequence_type`, the import can specify the value for the `__DORIS_SEQUENCE_COL__` column by including a key-value pair in the JSON object with key `__DORIS_SEQUENCE_COL__`. For rows that do not specify a value for the `__DORIS_SEQUENCE_COL__`, if the key exists in the original table, the value will be filled from the old row; otherwise, it will be set to null. 
+ +For example, for the following table: + +```sql +CREATE TABLE t2 ( + `k` int(11) NULL, + `v1` BIGINT NULL, + `v2` BIGINT NULL DEFAULT "9876", + `v3` BIGINT NOT NULL, + `v4` BIGINT NOT NULL DEFAULT "1234", + `v5` BIGINT NULL +) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES( +"replication_num" = "3", +"enable_unique_key_merge_on_write" = "true", +"enable_light_schema_change" = "true", +"enable_unique_key_skip_bitmap_column" = "true", +"function_column.sequence_type" = "int"); +``` + +The original data in the table is: + +```sql ++---+----+----+----+----+----+----------------------+ +| k | v1 | v2 | v3 | v4 | v5 |__DORIS_SEQUENCE_COL__| ++---+----+----+----+----+----+----------------------+ +| 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 10 | 10 | +| 2 | 2 | 2 | 2 | 2 | 20 | 20 | +| 3 | 3 | 3 | 3 | 3 | 30 | 30 | +| 4 | 4 | 4 | 4 | 4 | 40 | 40 | +| 5 | 5 | 5 | 5 | 5 | 50 | 50 | ++---+----+----+----+----+----+----------------------+ +``` + +Importing data using flexible partial column updates: + +```json +{"k": 1, "v1": 111, "v5": 9, "__DORIS_SEQUENCE_COL__": 9} +{"k": 2, "v2": 222, "v5": 25, "__DORIS_SEQUENCE_COL__": 25} +{"k": 3, "v3": 333} +{"k": 4, "v4": 444, "v5": 50, "v1": 411, "v3": 433, "v2": null, "__DORIS_SEQUENCE_COL__": 50} +{"k": 5, "v5": null, "__DORIS_SEQUENCE_COL__": null} +{"k": 6, "v1": 611, "v3": 633} +{"k": 7, "v3": 733, "v5": 300, "__DORIS_SEQUENCE_COL__": 300} +``` + +The final data in the table is as follows: + +```sql ++---+--------+--------+-----+------+--------+ +| k | v1 | v2 | v3 | v4 | v5 | ++---+--------+--------+-----+------+--------+ +| 0 | 0 | 0 | 0 | 0 | 0 | +| 1 | 1 | 1 | 1 | 1 | 1 | +| 5 | 5 | 5 | 5 | 5 | 5 | +| 2 | 2 | 222 | 2 | 2 | 25 | +| 3 | 3 | 3 | 333 | 3 | 3 | +| 4 | 411 | | 433 | 444 | 50 | +| 6 | 611 | 9876 | 633 | 1234 | | +| 7 | | 9876 | 733 | 1234 | 300 | ++---+--------+--------+-----+------+--------+ +``` + +4. 
When the table property `function_column.sequence_col` is set in the target table, the key-value pair in the JSON object for flexible column updates with the key `__DORIS_SEQUENCE_COL__` will be ignored. The value of the `__DORIS_SEQUENCE_COL__` column in a specific row during import will be the same as the final value of the column specified by the table property `function_column.sequence_col` for that row.

For example, for the following table:

```sql
CREATE TABLE t3 (
    `k` int(11) NULL,
    `v1` BIGINT NULL,
    `v2` BIGINT NULL DEFAULT "9876",
    `v3` BIGINT NOT NULL,
    `v4` BIGINT NOT NULL DEFAULT "1234",
    `v5` BIGINT NULL DEFAULT "31"
) UNIQUE KEY(`k`) DISTRIBUTED BY HASH(`k`) BUCKETS 1
PROPERTIES(
"replication_num" = "3",
"enable_unique_key_merge_on_write" = "true",
"enable_light_schema_change" = "true",
"enable_unique_key_skip_bitmap_column" = "true",
"function_column.sequence_col" = "v5");
```

The original data in the table is as follows:

```sql
+---+----+----+----+----+----+----------------------+
| k | v1 | v2 | v3 | v4 | v5 |__DORIS_SEQUENCE_COL__|
+---+----+----+----+----+----+----------------------+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 10 | 10 |
| 2 | 2 | 2 | 2 | 2 | 20 | 20 |
| 3 | 3 | 3 | 3 | 3 | 30 | 30 |
| 4 | 4 | 4 | 4 | 4 | 40 | 40 |
| 5 | 5 | 5 | 5 | 5 | 50 | 50 |
+---+----+----+----+----+----+----------------------+
```

Using flexible partial updates, the following data is imported:

```json
{"k": 1, "v1": 111, "v5": 9}
{"k": 2, "v2": 222, "v5": 25}
{"k": 3, "v3": 333}
{"k": 4, "v4": 444, "v5": 50, "v1": 411, "v3": 433, "v2": null}
{"k": 5, "v5": null}
{"k": 6, "v1": 611, "v3": 633}
{"k": 7, "v3": 733, "v5": 300}
```

The final data in the table is as follows:

```sql
+---+--------+--------+-----+------+-----+
| k | v1 | v2 | v3 | v4 | v5 |
+---+--------+--------+-----+------+-----+
| 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 10 |
| 5 | 5 | 5 | 5 | 5 | 50 |
| 2 | 2 | 222 | 
2 | 2 | 25 | +| 3 | 3 | 3 | 333 | 3 | 30 | +| 4 | 411 | | 433 | 444 | 50 | +| 6 | 611 | 9876 | 633 | 1234 | 31 | +| 7 | | 9876 | 733 | 1234 | 300 | ++---+--------+--------+-----+------+-----+ +``` + +5. When using flexible partial updates, the following import parameters cannot be specified or enabled: + - The `merge_type` parameter cannot be specified. + - The `delete` parameter cannot be specified. + - The `fuzzy_parse` parameter cannot be enabled. + - The `columns` parameter cannot be specified. + - The `jsonpaths` parameter cannot be specified. + - The `hidden_columns` parameter cannot be specified. + - The `function_column.sequence_col` parameter cannot be specified. + - The `sql` parameter cannot be specified. + - The `memtable_on_sink_node` option cannot be enabled. + - The `group_commit` parameter cannot be specified. + - The `where` parameter cannot be specified. + +6. Flexible partial updates are not supported on tables with Variant columns. + +7. Flexible partial updates are not supported on tables with synchronous materialized views. 
From 491f21523768ffda1e77cdfe5b2ab2119cd08fbb Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Sun, 29 Sep 2024 10:50:34 +0800 Subject: [PATCH 4/4] fix --- docs/data-operate/update/update-of-unique-model.md | 4 ++-- .../current/data-operate/update/update-of-unique-model.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/data-operate/update/update-of-unique-model.md b/docs/data-operate/update/update-of-unique-model.md index 8e5f721682edd..9287e03c693be 100644 --- a/docs/data-operate/update/update-of-unique-model.md +++ b/docs/data-operate/update/update-of-unique-model.md @@ -179,12 +179,12 @@ For existing Merge-On-Write tables created in old versions of Doris, after upgra **Using Flexible Column Updates for New Tables** -For new tables, to use the flexible column update feature, specify the following table properties when creating the table to enable Merge-on-Write, light-schema-change, and include the required hidden bitmap column for flexible column updates (the default value for these properties is `true`): +For new tables, to use the flexible column update feature, specify the following table properties when creating the table to enable Merge-on-Write, light-schema-change, and include the required hidden bitmap column for flexible column updates: ```Plain +"enable_light_schema_change" = "true" "enable_unique_key_merge_on_write" = "true" "enable_unique_key_skip_bitmap_column" = "true" -"enable_light_schema_change" = "true" ``` **StreamLoad** diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md index 7e46af4c56d4b..f6c45e9767a2d 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/data-operate/update/update-of-unique-model.md @@ -178,12 +178,12 @@ INSERT INTO order_tbl (order_id, 
order_status) values (1,'待发货'); **新建表使用灵活列更新功能** -对于新建的表,如果需要使用灵活列更新功能,建表时需要指定如下表属性,以开启 Merge-on-Write 实现,开启 light-schema-change,同时使得表具有灵活列更新所需要的 `bitmap` 隐藏列。(这3个表属性的默认值均为 `true`) +对于新建的表,如果需要使用灵活列更新功能,建表时需要指定如下表属性,以开启 Merge-on-Write 实现,开启 light-schema-change,同时使得表具有灵活列更新所需要的 `bitmap` 隐藏列。 ```Plain +"enable_light_schema_change" = "true" "enable_unique_key_merge_on_write" = "true" "enable_unique_key_skip_bitmap_column" = "true" -"enable_light_schema_change" = "true" ``` **StreamLoad**