diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 6d88e41f6c..f8c2a283f1 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -13,33 +13,73 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: build_docs - -# execute this docs build workflow automatically when new push happens in any branch +name: "Build documentation" + on: push: paths: - 'docs/**' - + branches: + - master + - release-* + schedule: + - cron: '0 0 * * *' # Deploy every day + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: - - build_docs_job: + build-documentation: + if: github.repository == 'apache/flink-cdc' runs-on: ubuntu-latest - container: debian:buster-slim - + strategy: + max-parallel: 1 + matrix: + branch: + - master + - release-3.0 + steps: - - - name: Prereqs - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - apt-get update - apt-get install -y git - git clone "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" . - shell: bash - - - name: Execute script to build our documentation and update pages - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: "docs/build_docs.sh" - shell: bash + - uses: actions/checkout@v3 + with: + ref: ${{ matrix.branch }} + + - name: Set branch environment variable + run: | + currentBranch=$(git branch --show-current) + + echo "flink_branch=${currentBranch}" >> ${GITHUB_ENV} + + if [ "${currentBranch}" = "master" ]; then + echo "flink_alias=release-3.1" >> ${GITHUB_ENV} + elif [ "${currentBranch}" = "release-3.0" ]; then + echo "flink_alias=stable" >> ${GITHUB_ENV} + fi + + - name: Build documentation + run: | + docker run --rm --volume "$PWD:/root/flink-cdc" chesnay/flink-ci:java_8_11_17_21_maven_386 bash -c "cd /root/flink-cdc && chmod +x ./.github/workflows/docs.sh && ./.github/workflows/docs.sh" + + - name: Upload documentation + uses: burnett01/rsync-deployments@5.2 + with: + switches: --archive --compress + path: docs/target/ + remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/flink/flink-cdc-docs-${{ env.flink_branch }}/ + remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} + remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} + remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} + remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} + + - name: Upload documentation alias + if: env.flink_alias != '' + uses: burnett01/rsync-deployments@5.2 + with: + switches: --archive --compress + path: docs/target/ + remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/flink/flink-cdc-docs-${{ env.flink_alias }}/ + remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} + remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} + remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} + remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} diff --git a/.github/workflows/docs.sh b/.github/workflows/docs.sh new file mode 100644 index 0000000000..4babee42ab --- /dev/null +++ b/.github/workflows/docs.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +set -e + +mvn --version +java -version +javadoc -J-version + +# workaround for a git security patch +git config --global --add safe.directory /root/flink-cdc +git submodule update --init --recursive + +HUGO_REPO=https://github.com/gohugoio/hugo/releases/download/v0.80.0/hugo_extended_0.80.0_Linux-64bit.tar.gz +HUGO_ARTIFACT=hugo_extended_0.80.0_Linux-64bit.tar.gz +if ! curl --fail -OL $HUGO_REPO ; then + echo "Failed to download Hugo binary" + exit 1 +fi +tar -zxvf $HUGO_ARTIFACT + +# Build the docs +hugo --source docs + +# generate docs into docs/target +hugo -v --source docs --destination target +if [ $? -ne 0 ]; then + echo "Error building the docs" + exit 1 +fi + +# build Flink; required for Javadoc step +mvn clean install -B -DskipTests -Dfast + +# build java/scala docs +mkdir -p docs/target/api +mvn javadoc:aggregate -B \ + -DadditionalJOption="-Xdoclint:none --allow-script-in-comments" \ + -Dmaven.javadoc.failOnError=false \ + -Dcheckstyle.skip=true \ + -Dspotless.check.skip=true \ + -Denforcer.skip=true \ + -Dheader="
Back to Flink Website
" +mv target/site/apidocs docs/target/api/java diff --git a/.github/workflows/flink_cdc.yml b/.github/workflows/flink_cdc.yml index 29ca6cd551..fe2793af2b 100644 --- a/.github/workflows/flink_cdc.yml +++ b/.github/workflows/flink_cdc.yml @@ -19,10 +19,16 @@ on: branches: - master - release-* + paths-ignore: + - 'docs/**' + - 'README.md' pull_request: branches: - master - release-* + paths-ignore: + - 'docs/**' + - 'README.md' concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..6cb7e5ce65 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/themes/book"] + path = docs/themes/book + url = https://github.com/alex-shpak/hugo-book diff --git a/.idea/vcs.xml b/.idea/vcs.xml index e774de2aff..818d136a83 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -20,5 +20,6 @@ + diff --git a/README.md b/README.md index ba7a61638d..dd7841e6ef 100644 --- a/README.md +++ b/README.md @@ -7,17 +7,17 @@ This README is meant as a brief walkthrough on the core features of CDC Connecto ## Supported (Tested) Databases -| Connector | Database | Driver | -|------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| -| [mongodb-cdc](docs/content/connectors/mongodb-cdc.md) |
  • [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0, 6.0 | MongoDB Driver: 4.3.4 | -| [mysql-cdc](docs/content/connectors/mysql-cdc.md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 | -| [oceanbase-cdc](/docs/content/connectors/oceanbase-cdc.md) |
  • [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x
  • [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x | -| [oracle-cdc](docs/content/connectors/oracle-cdc.md) |
  • [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 | -| [postgres-cdc](docs/content/connectors/postgres-cdc.md) |
  • [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 | -| [sqlserver-cdc](docs/content/connectors/sqlserver-cdc.md) |
  • [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 | -| [tidb-cdc](docs/content/connectors/tidb-cdc.md) |
  • [TiDB](https://www.pingcap.com): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 | -| [Db2-cdc](docs/content/connectors/db2-cdc.md) |
  • [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 | -| [Vitess-cdc](docs/content/connectors/vitess-cdc.md) |
  • [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 | +| Connector | Database | Driver | +|-------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| +| [mongodb-cdc](docs/content/docs/connectors/cdc-connectors/mongodb-cdc.md) |
  • [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0, 6.0 | MongoDB Driver: 4.3.4 | +| [mysql-cdc](docs/content/docs/connectors/cdc-connectors/mysql-cdc.md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 | +| [oceanbase-cdc](docs/content/docs/connectors/cdc-connectors/oceanbase-cdc.md) |
  • [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x
  • [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x | +| [oracle-cdc](docs/content/docs/connectors/cdc-connectors/oracle-cdc.md) |
  • [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 | +| [postgres-cdc](docs/content/docs/connectors/cdc-connectors/postgres-cdc.md) |
  • [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 | +| [sqlserver-cdc](docs/content/docs/connectors/cdc-connectors/sqlserver-cdc.md) |
  • [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 | +| [tidb-cdc](docs/content/docs/connectors/cdc-connectors/tidb-cdc.md) |
  • [TiDB](https://www.pingcap.com): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 | +| [Db2-cdc](docs/content/docs/connectors/cdc-connectors/db2-cdc.md) |
  • [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 | +| [Vitess-cdc](docs/content/docs/connectors/cdc-connectors/vitess-cdc.md) |
  • [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 | ## Features @@ -106,10 +106,10 @@ Include following Maven dependency (available through Maven Central): ``` - com.ververica + org.apache.flink flink-connector-mysql-cdc - + 2.5-SNAPSHOT ``` diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000..6a6c3e4960 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,10 @@ +.bundle/ +.jekyll-metadata +.jekyll-cache/ +.rubydeps/ +ruby2/.bundle/ +ruby2/.rubydeps/ +public/ +resources/ +.hugo_build.lock +.DS_Store diff --git a/docs/Dockerfile b/docs/Dockerfile deleted file mode 100644 index 29ad7a68fc..0000000000 --- a/docs/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM python:3.7-slim -RUN apt-get update -RUN apt-get -y install git -RUN pip3 install -U sphinx==4.1.1 myst-parser==0.15.2 pygments==2.10.0 sphinx-rtd-theme==0.5.2 sphinx-autobuild==2021.3.14 gitpython==3.1.18 pyyaml==6.0 -EXPOSE 8001 -CMD ["sphinx-autobuild", "--host", "0.0.0.0", "--port", "8001", "/home/flink-cdc/docs", "/home/flink-cdc/docs/_build/html"] \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 298ea9e213..0000000000 --- a/docs/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index d12f6534db..11bbad8d85 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,36 +1,269 @@ - +# Requirements -This README gives an overview of how to build the documentation of Flink CDC. +### Build the documentation and serve it locally -### Build the site locally -Make sure you have installed [Docker](https://docs.docker.com/engine/install/) and started it on you local environment. +The Flink documentation uses [Hugo](https://gohugo.io/getting-started/installing/) to generate HTML files. More specifically, it uses the *extended version* of Hugo with Sass/SCSS support. -From the directory of this module (`docs`), use the following command to start the site. +To build the documentation, you can install Hugo locally or use a Docker image. + +Both methods require you to execute commands in the directory of this module (`docs/`). The built site is served at http://localhost:1313/. + +#### Using Hugo Docker image: ```sh -./docs_site.sh start +$ git submodule update --init --recursive +$ docker pull jakejarvis/hugo-extended:latest +$ docker run -v $(pwd):/src -p 1313:1313 jakejarvis/hugo-extended:latest server --buildDrafts --buildFuture --bind 0.0.0.0 ``` -Then the site will run and can be viewed at http://localhost:8001, any update on the `docs` will be shown in the site without restarting. -Of course, you can use the following command to stop the site. +## Include externally hosted documentation -```sh -./docs_site.sh stop +With the ongoing efforts to move Flink's connectors from this repository to individual, dedicated +repositories, this also requires the documentation to be hosted outside this repo. 
However, +we still want to serve all documentation as a whole on the Flink documentation website. + +Adding new externally hosted documentation requires the following steps to be taken: + +1. (If necessary) Move the existing documentation to the new repository + +2. In the Flink repository, edit the `docs/setup_docs.sh` file and add a reference to your now +externally hosted documentation. The reference will look like `integrate_connector_docs `. + +Replace with the name of your connector, e.g., `elasticsearch` for `flink-connector-elasticsearch`. + +## Generate configuration tables + +Configuration descriptions are auto generated from code. To trigger the generation you need to run in the project root: + +``` +mvn -Pgenerate-config-docs install -Dfast -DskipTests +``` + +The resulting html files will be written to `layouts/shortcodes/generated`. Tables are regenerated each time the command is invoked. +These tables can be directly included into the documentation: + +``` +{{< generated/file_name >}} +``` + +# Contribute + +## Markdown + +The documentation pages are written in [Markdown](http://daringfireball.net/projects/markdown/syntax). It is possible to use [GitHub flavored syntax](http://github.github.com/github-flavored-markdown) and intermix plain html. + +## Front matter + +In addition to Markdown, every page contains a Jekyll front matter, which specifies the title of the page and the layout to use. The title is used as the top-level heading for the page. The default layout is `plain` (found in `_layouts`). + + --- + title: "Title of the Page" + --- + + --- + title: "Title of the Page" <-- Title rendered in the side nave + weight: 1 <-- Weight controls the ordering of pages in the side nav. + type: docs <-- required + aliases: <-- Alias to setup redirect from removed page to this one + - /alias/to/removed/page.html + --- + +## Structure + +### Page + +#### Headings + +All documents are structured with headings. From these headings, you can automatically generate a page table of contents (see below). + +``` +# Level-1 Heading <- Used for the title of the page +## Level-2 Heading <- Start with this one for content +### Level-3 heading +#### Level-4 heading +##### Level-5 heading +``` + +Please stick to the "logical order" when using the headlines, e.g. start with level-2 headings and use level-3 headings for subsections, etc. Don't use a different ordering, because you don't like how a headline looks. + +#### Table of Contents + +Table of contents are added automatically to every page, based on heading levels 2 - 4. +The ToC can be omitted by adding the following to the front matter of the page: + + --- + bookToc: false + --- + +### ShortCodes + +Flink uses [shortcodes](https://gohugo.io/content-management/shortcodes/) to add custom functionality +to its documentation markdown. The following are available for use: + +#### Flink Artifact + + {{< artifact flink-streaming-scala withScalaVersion >}} + +This will be replaced by the maven artifact for flink-streaming-scala that users should copy into their pom.xml file. It will render out to: + +```xml + + org.apache.flink + flink-streaming-scala_2.12 + + +``` + +It includes a number of optional flags: + +* withScalaVersion: Includes the scala version suffix to the artifact id +* withTestScope: Includes `test` to the module. Useful for marking test dependencies. +* withTestClassifier: Includes `tests`. Useful when users should be pulling in Flink tests dependencies. This is mostly for the test harnesses and probably not what you want. 
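As a sketch of how these flags compose (based only on the flag descriptions above; the exact output is produced by the shortcode itself), `{{< artifact flink-streaming-scala withScalaVersion withTestScope >}}` would be expected to render roughly as:

```xml
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.12</artifactId>
    <!-- the version is interpolated by the shortcode -->
    <version><!-- current Flink version --></version>
    <!-- withTestScope adds the test scope -->
    <scope>test</scope>
</dependency>
```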
+ +You can also use the shortcodes (with same flags) instead: + +* `artifact_gradle` to show the Gradle syntax +* `artifact_tabs` to create a tabbed view, showing both Maven and Gradle syntax + +#### Flink Connector Artifact + + {{< connector_artifact flink-connector-elasticsearch 3.0.0 >}} + +This will be replaced by the maven artifact for flink-connector-elasticsearch that users should copy into their pom.xml file. It will render out to: + +```xml + + org.apache.flink + flink-connector-elasticsearch + 3.0.0 + ``` + +#### Back to Top + + {{< top >}} + +This will be replaced by a back to top link. It is recommended to use these links at least at the end of each level-2 section. + +#### Info Hints + + {{< hint info >}} + Some interesting information + {{< /hint >}} + +The hint will be rendered in a blue box. This hint is useful when providing +additional information for the user that does not fit into the flow of the documentation. + +#### Info Warning + + {{< hint warning >}} + Something to watch out for. + {{< /hint >}} + +The hint will be rendered in a yellow box. This hint is useful when highlighting +information users should watch out for to prevent errors. + +#### Info Danger + + {{< hint danger >}} + Something to avoid + {{< /hint >}} + +The hint will be rendered in a red box. This hint is useful when highlighting +information users need to know to avoid data loss or to point out broken +functionality. + +#### Label + + {{< label "My Label" >}} + +The label will be rendered in an inlined blue box. This is useful for labeling functionality +such as whether a SQL feature works for only batch or streaming execution. + +#### Flink version + + {{< version >}} + +Interpolates the current Flink version + +#### Scala Version + + {{< scala_version >}} + +Interpolates the default scala version + +#### Stable + + {{< stable >}} + Some content + {{< /stable >}} + +This shortcode will only render its content if the site is marked as stable. + +#### Unstable + + {{< unstable >}} + Some content + {{< /unstable >}} + +This shortcode will only render its content if the site is marked as unstable. + +#### Query State Warning + + {{< query_state_warning >}} + +Will render a warning the current SQL feature may have unbounded state requirements. + +#### tab + + {{< tabs "sometab" >}} + {{< tab "Java" >}} + ```java + System.out.println("Hello World!"); + ``` + {{< /tab >}} + {{< tab "Scala" >}} + ```scala + println("Hello World!"); + ``` + {< /tab >}} + {{< /tabs }} + +Prints the content in tabs. IMPORTANT: The label in the outermost "tabs" shortcode must +be unique for the page. + +#### Github Repo + + {{< github_repo >}} + +Renders a link to the apache flink repo. + +#### Github Link + + {{< gh_link file="/some/file.java" name="Some file" >}} + +Renders a link to a file in the Apache Flink repo with a given name. + +#### JavaDocs Link + {{< javadoc file="some/file" name="Some file" >}} + +Renders a link to a file in the Apache Flink Java Documentation. + +#### PythonDocs Link + {< pythondoc file="some/file" name="Some file" >}} + +Renders a link to a file in the Apache Flink Python Documentation. + +#### FlinkDownloads Link + +``` +{{< downloads >}} +``` + +Renders a link to the apache flink download page. diff --git a/docs/_static/button.js b/docs/_static/button.js deleted file mode 100644 index a036784ae1..0000000000 --- a/docs/_static/button.js +++ /dev/null @@ -1,6 +0,0 @@ -/*! 
- * github-buttons v2.19.1 - * (c) 2021 なつき - * @license BSD-2-Clause - */ -!function(){"use strict";var e=window.document,o=e.location,t=window.Math,r=window.HTMLElement,a=window.XMLHttpRequest,n="github-button",i="https://buttons.github.io/buttons.html",l="github.com",c=a&&"prototype"in a&&"withCredentials"in a.prototype,d=c&&r&&"attachShadow"in r.prototype&&!("prototype"in r.prototype.attachShadow),s=function(e,o){for(var t=0,r=e.length;t'}}},download:{heights:{16:{width:16,path:''}}},eye:{heights:{16:{width:16,path:''}}},heart:{heights:{16:{width:16,path:''}}},"issue-opened":{heights:{16:{width:16,path:''}}},"mark-github":{heights:{16:{width:16,path:''}}},package:{heights:{16:{width:16,path:''}}},play:{heights:{16:{width:16,path:''}}},"repo-forked":{heights:{16:{width:16,path:''}}},"repo-template":{heights:{16:{width:16,path:''}}},star:{heights:{16:{width:16,path:''}}}},C=function(e,o){e=p(e).replace(/^octicon-/,""),f(y,e)||(e="mark-github");var t=o>=24&&24 in y[e].heights?24:16,r=y[e].heights[t];return'"},F={},M=function(e,o){var t=F[e]||(F[e]=[]);if(!(t.push(o)>1)){var r=u((function(){for(delete F[e];o=t.shift();)o.apply(null,arguments)}));if(c){var n=new a;m(n,"abort",r),m(n,"error",r),m(n,"load",(function(){var e;try{e=JSON.parse(this.responseText)}catch(e){return void r(e)}r(200!==this.status,e)})),n.open("GET",e),n.send()}else{var i=this||window;i._=function(e){i._=null,r(200!==e.meta.status,e.data)};var l=g(i.document)("script",{async:!0,src:e+(-1!==e.indexOf("?")?"&":"?")+"callback=_"}),d=function(){i._&&i._({meta:{}})};m(l,"load",d),m(l,"error",d),k(l,/de|m/,d),i.document.getElementsByTagName("head")[0].appendChild(l)}}},A=function(e,o,t){var r=g(e.ownerDocument),a=e.appendChild(r("style",{type:"text/css"})),n="body{margin:0}a{text-decoration:none;outline:0}.widget{margin-left:20px;margin-top:20px;display:inline-block;overflow:hidden;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif;font-size:0;line-height:0;white-space:nowrap}.btn,.social-count{position:relative;display:inline-block;display:inline-flex;height:14px;padding:2px 5px;font-size:11px;font-weight:600;line-height:14px;vertical-align:bottom;cursor:pointer;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-repeat:repeat-x;background-position:-1px -1px;background-size:110% 110%;border:1px solid}.btn{border-radius:.25em}.btn:not(:last-child){border-radius:.25em 0 0 .25em}.social-count{border-left:0;border-radius:0 .25em .25em 0}.widget-lg .btn,.widget-lg .social-count{height:16px;padding:5px 10px;font-size:12px;line-height:16px}.octicon{display:inline-block;vertical-align:text-top;fill:currentColor;overflow:visible}"+function(e){if(null==e)return x.light;if(f(x,e))return x[e];var o=b(e,";",":",(function(e){return e.replace(/^[ \t\n\f\r]+|[ \t\n\f\r]+$/g,"")}));return x[f(x,o["no-preference"])?o["no-preference"]:"light"]+z("light",o.light)+z("dark",o.dark)}(o["data-color-scheme"]);a.styleSheet?a.styleSheet.cssText=n:a.appendChild(e.ownerDocument.createTextNode(n));var i="large"===p(o["data-size"]),c=r("a",{className:"btn",href:o.href,rel:"noopener",target:"_blank",title:o.title||void 0,"aria-label":o["aria-label"]||void 0,innerHTML:C(o["data-icon"],i?16:14)+" "},[r("span",{},[o["data-text"]||""])]),d=e.appendChild(r("div",{className:"widget"+(i?" 
widget-lg":"")},[c])),s=c.hostname.replace(/\.$/,"");if(("."+s).substring(s.length-l.length)!=="."+l)return c.removeAttribute("href"),void t(d);var h=(" /"+c.pathname).split(/\/+/);if(((s===l||s==="gist."+l)&&"archive"===h[3]||s===l&&"releases"===h[3]&&("download"===h[4]||"latest"===h[4]&&"download"===h[5])||s==="codeload."+l)&&(c.target="_top"),"true"===p(o["data-show-count"])&&s===l&&"marketplace"!==h[1]&&"sponsors"!==h[1]&&"orgs"!==h[1]&&"users"!==h[1]&&"-"!==h[1]){var u,m;if(!h[2]&&h[1])m="followers",u="?tab=followers";else if(!h[3]&&h[2])m="stargazers_count",u="/stargazers";else if(h[4]||"subscription"!==h[3])if(h[4]||"fork"!==h[3]){if("issues"!==h[3])return void t(d);m="open_issues_count",u="/issues"}else m="forks_count",u="/network/members";else m="subscribers_count",u="/watchers";var v=h[2]?"/repos/"+h[1]+"/"+h[2]:"/users/"+h[1];M.call(this,"https://api.github.com"+v,(function(e,o){if(!e){var a=o[m];d.appendChild(r("a",{className:"social-count",href:o.html_url+u,rel:"noopener",target:"_blank","aria-label":a+" "+m.replace(/_count$/,"").replace("_"," ").slice(0,a<2?-1:void 0)+" on GitHub"},[(""+a).replace(/\B(?=(\d{3})+(?!\d))/g,",")]))}t(d)}))}else t(d)},L=window.devicePixelRatio||1,G=function(e){return(L>1?t.ceil(t.round(e*L)/L*2)/2:t.ceil(e))||0},E=function(e,o){e.style.width=o[0]+"px",e.style.height=o[1]+"px"},T=function(o,r){if(null!=o&&null!=r)if(o.getAttribute&&(o=function(e){var o={href:e.href,title:e.title,"aria-label":e.getAttribute("aria-label")};return s(["icon","color-scheme","text","size","show-count"],(function(t){var r="data-"+t;o[r]=e.getAttribute(r)})),null==o["data-text"]&&(o["data-text"]=e.textContent||e.innerText),o}(o)),d){var a=h("span");A(a.attachShadow({mode:"closed"}),o,(function(){r(a)}))}else{var n=h("iframe",{src:"javascript:0",title:o.title||void 0,allowtransparency:!0,scrolling:"no",frameBorder:0});E(n,[0,0]),n.style.border="none";var l=function(){var a,c=n.contentWindow;try{a=c.document.body}catch(o){return void e.body.appendChild(n.parentNode.removeChild(n))}v(n,"load",l),A.call(c,a,o,(function(e){var a=function(e){var o=e.offsetWidth,r=e.offsetHeight;if(e.getBoundingClientRect){var a=e.getBoundingClientRect();o=t.max(o,G(a.width)),r=t.max(r,G(a.height))}return[o,r]}(e);n.parentNode.removeChild(n),w(n,"load",(function(){E(n,a)})),n.src=i+"#"+(n.name=function(e,o,t,r){null==o&&(o="&"),null==t&&(t="="),null==r&&(r=window.encodeURIComponent);var a=[];for(var n in e){var i=e[n];null!=i&&a.push(r(n)+t+r(i))}return a.join(o)}(o)),r(n)}))};m(n,"load",l),e.body.appendChild(n)}};o.protocol+"//"+o.host+o.pathname===i?A(e.body,b(window.name||o.hash.replace(/^#/,"")),(function(){})):function(o){if("complete"===e.readyState||"loading"!==e.readyState&&!e.documentElement.doScroll)setTimeout(o);else if(e.addEventListener){var t=u(o);w(e,"DOMContentLoaded",t),w(window,"load",t)}else k(e,/m/,o)}((function(){var o,t=e.querySelectorAll?e.querySelectorAll("a."+n):(o=[],s(e.getElementsByTagName("a"),(function(e){-1!==(" "+e.className+" ").replace(/[ \t\n\f\r]+/g," ").indexOf(" github-button ")&&o.push(e)})),o);s(t,(function(e){T(e,(function(o){e.parentNode.replaceChild(o,e)}))}))}))}(); \ No newline at end of file diff --git a/docs/_static/fig/contribute_guidance/check_branch.png b/docs/_static/fig/contribute_guidance/check_branch.png deleted file mode 100644 index 191e3964f4..0000000000 Binary files a/docs/_static/fig/contribute_guidance/check_branch.png and /dev/null differ diff --git a/docs/_static/fig/contribute_guidance/fork.png 
b/docs/_static/fig/contribute_guidance/fork.png deleted file mode 100644 index dce33077ad..0000000000 Binary files a/docs/_static/fig/contribute_guidance/fork.png and /dev/null differ diff --git a/docs/_static/fig/contribute_guidance/open_pr.png b/docs/_static/fig/contribute_guidance/open_pr.png deleted file mode 100644 index 776e6e71d5..0000000000 Binary files a/docs/_static/fig/contribute_guidance/open_pr.png and /dev/null differ diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css deleted file mode 100644 index abce4cf437..0000000000 --- a/docs/_static/theme_overrides.css +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* override table width restrictions */ -.wy-table-responsive table td, .wy-table-responsive table th { - white-space: normal; -} - -.wy-table-responsive { - margin-bottom: 24px; - max-width: 100%; - overflow: visible; -} - -/* override style of li under ul */ -.wy-nav-content ul li { - list-style: disc; - margin-left: 36px; -} - -.wy-nav-content ul li p { - margin: 0 0 8px; -} - -/* override max-width of content */ -.wy-nav-content { - max-width: 80%; -} diff --git a/docs/_templates/breadcrumbs.html b/docs/_templates/breadcrumbs.html deleted file mode 100644 index 55d203691a..0000000000 --- a/docs/_templates/breadcrumbs.html +++ /dev/null @@ -1,51 +0,0 @@ - - - -{%- extends "sphinx_rtd_theme/breadcrumbs.html" %} - -{% if page_source_suffix %} - {% set suffix = page_source_suffix %} -{% else %} - {% set suffix = source_suffix %} -{% endif %} - -{% if meta is defined and meta is not none %} - {% set check_meta = True %} -{% else %} - {% set check_meta = False %} -{% endif %} - -{% if check_meta and 'github_url' in meta %} - {% set display_github = True %} -{% endif %} - - -
-{% block breadcrumbs_aside %}
-  {% if pagename != "search" %}
-    {% if display_github %}
-      {{ _('Edit on GitHub') }}
-    {% endif %}
-  {% endif %}
-{% endblock %}
    diff --git a/docs/_templates/versions.html b/docs/_templates/versions.html deleted file mode 100644 index acf436a171..0000000000 --- a/docs/_templates/versions.html +++ /dev/null @@ -1,59 +0,0 @@ - - -{% if READTHEDOCS or display_lower_left %} -{# Add rst-badge after rst-versions for small badge style. #} -
-  Read the Docs
-  version: {{ current_version }}
-  {% if versions %}
-    {{ _('Versions') }}
-    {% for slug, url in versions %}
-      {% if slug == current_version %}{% endif %}
-      {{ slug }}
-      {% if slug == current_version %}{% endif %}
-    {% endfor %}
-  {% endif %}
-  {% if READTHEDOCS %}
-    {{ _('On Read the Docs') }}
-    {{ _('Project Home') }}
-    {{ _('Builds') }}
-  {% endif %}
-  {% trans %}Free document hosting provided by Read the Docs.{% endtrans %}
    -{% endif %} - - diff --git a/docs/assets/_custom.scss b/docs/assets/_custom.scss new file mode 100644 index 0000000000..33ccbb4060 --- /dev/null +++ b/docs/assets/_custom.scss @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +@import "github"; + +.link { + padding-bottom: 5px; +} + +.appetizer { + color: #FBB142; +} + +.maindish { + color: #7E4F89; +} + +.dessert { + color: #E6526F; +} + +.book-menu nav { + background: #f8f8f8; +} + +.book-page { + padding: 2rem 2rem; +} + +.book-search input { + background: white; +} + +.markdown a { + text-decoration: none; + color: #05b; +} + +.markdown a:visited { + text-decoration: none; + color: #05b; +} + +.markdown { + line-height: 1.43; + + h1, + h2, + h3, + h4, + h5, + h6 { + font-weight: 500; + padding-top: 0; + margin-top: 1em; + } +} + +body { + letter-spacing: normal; + -webkit-font-smoothing: auto; +} + +aside nav ul { + li { + margin: 0.5em 0; + } +} + +.book-search { + border: 2px solid #ebebeb; +} + +@media screen and (max-width: 768px) { + .toc { + display: none; + } +} + +aside.book-menu nav { + a:hover { + font-weight: bold; + opacity: 1.0; + } + + a.active { + font-weight: bold; + color: var(--body-font-color); + } +} + +aside.book-menu > li { + padding: 10px 5px 5px 5px; +} + +aside.book-toc { + h3 { + margin-top: 0; + padding-top: 0; + font-size: 1.2em; + } +} + +html { + line-height: 1.43; +} + +h1, h2, h3, h4, h5, h6 { + line-height: 1.1; +} + +h1, h2, h3 { + margin-top: 20px; + margin-bottom: 10px; +} + +h2, h3, h4 { + padding-top: 1em; +} + +h1 { + font-size: 36px; +} + +h2 { + font-size: 30px; + border-bottom: 1px solid #e5e5e5; +} + +h3 { + font-size: 24px; +} + +h4 { + font-size: 18px; +} + +.markdown code { + background: white; + padding: 0; + border-radius: 0; +} + +pre.chroma code { + line-height: 1.43; +} + +.book-languages { + border: 2px solid black; +} + +.menu-break { + opacity: 0.1; +} + +#book-search-results { + padding: 2px; + background-color: white; +} + +.label { + display: inline; + padding: .2em .6em .3em; + font-size: 75%; + font-weight: 700; + line-height: 1; + color: #fff; + text-align: center; + white-space: nowrap; + vertical-align: baseline; + border-radius: .25em; + background-color: #337ab7; +} + +.expand-toc { + position: fixed; + top: 2em; + right: 5em; + display: none; +} + +.container { + max-width: 90rem; +} + +#book-search-input:focus { + outline: none; +} + +.rest-api h5 { + margin-top: .5em; + margin-bottom: .5em; + font-size: 1em; +} + +.rest-api tbody { + display: table; + width: 100%; + background: white; +} + +.rest-api td { + background: white; +} + +.rest-api .book-expand label { + padding: 0rem 0rem; + background: white; +} + +.rest-api .book-expand { + background: white; +} + +.rest-api .book-expand 
.book-expand-head { + background: white; +} + +.configuration td { + background: white; +} + +.markdown table tr:nth-child(2n) { + background: white; +} \ No newline at end of file diff --git a/docs/assets/_fonts.scss b/docs/assets/_fonts.scss new file mode 100644 index 0000000000..dc57189cf0 --- /dev/null +++ b/docs/assets/_fonts.scss @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +body { + font-family: "Helvetica Neue",Helvetica,Arial,sans-serif; + font-size: 14px; +} + +code { + font-family: "Menlo", "Lucida Console", monospace; +} \ No newline at end of file diff --git a/docs/assets/github.css b/docs/assets/github.css new file mode 100644 index 0000000000..25600e34e7 --- /dev/null +++ b/docs/assets/github.css @@ -0,0 +1,87 @@ +/** + * Syntax highlighting generated via + * hugo gen chromastyles --style=github > chroma.css + */ + +/* Background */ .chroma { background-color: #ffffff } +/* Other */ .chroma .x { } +/* Error */ .chroma .err { color: #a61717; background-color: #e3d2d2 } +/* LineTableTD */ .chroma .lntd { vertical-align: top; padding: 0; margin: 0; border: 0; } +/* LineTable */ .chroma .lntable { border-spacing: 0; padding: 0; margin: 0; border: 0; width: auto; overflow: auto; display: block; } +/* LineHighlight */ .chroma .hl { display: block; width: 100%;background-color: #ffffcc } +/* LineNumbersTable */ .chroma .lnt { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f } +/* LineNumbers */ .chroma .ln { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f } +/* Keyword */ .chroma .k { color: #000000; font-weight: bold } +/* KeywordConstant */ .chroma .kc { color: #000000; font-weight: bold } +/* KeywordDeclaration */ .chroma .kd { color: #000000; font-weight: bold } +/* KeywordNamespace */ .chroma .kn { color: #000000; font-weight: bold } +/* KeywordPseudo */ .chroma .kp { color: #000000; font-weight: bold } +/* KeywordReserved */ .chroma .kr { color: #000000; font-weight: bold } +/* KeywordType */ .chroma .kt { color: #445588; font-weight: bold } +/* Name */ .chroma .n { } +/* NameAttribute */ .chroma .na { color: #008080 } +/* NameBuiltin */ .chroma .nb { color: #0086b3 } +/* NameBuiltinPseudo */ .chroma .bp { color: #999999 } +/* NameClass */ .chroma .nc { color: #445588; font-weight: bold } +/* NameConstant */ .chroma .no { color: #008080 } +/* NameDecorator */ .chroma .nd { color: #3c5d5d; font-weight: bold } +/* NameEntity */ .chroma .ni { color: #800080 } +/* NameException */ .chroma .ne { color: #990000; font-weight: bold } +/* NameFunction */ .chroma .nf { color: #990000; font-weight: bold } +/* NameFunctionMagic */ .chroma .fm { } +/* NameLabel */ .chroma .nl { color: #990000; font-weight: bold } +/* NameNamespace */ .chroma .nn { color: #555555 } +/* NameOther */ .chroma .nx { 
} +/* NameProperty */ .chroma .py { } +/* NameTag */ .chroma .nt { color: #000080 } +/* NameVariable */ .chroma .nv { color: #008080 } +/* NameVariableClass */ .chroma .vc { color: #008080 } +/* NameVariableGlobal */ .chroma .vg { color: #008080 } +/* NameVariableInstance */ .chroma .vi { color: #008080 } +/* NameVariableMagic */ .chroma .vm { } +/* Literal */ .chroma .l { } +/* LiteralDate */ .chroma .ld { } +/* LiteralString */ .chroma .s { color: #dd1144 } +/* LiteralStringAffix */ .chroma .sa { color: #dd1144 } +/* LiteralStringBacktick */ .chroma .sb { color: #dd1144 } +/* LiteralStringChar */ .chroma .sc { color: #dd1144 } +/* LiteralStringDelimiter */ .chroma .dl { color: #dd1144 } +/* LiteralStringDoc */ .chroma .sd { color: #dd1144 } +/* LiteralStringDouble */ .chroma .s2 { color: #dd1144 } +/* LiteralStringEscape */ .chroma .se { color: #dd1144 } +/* LiteralStringHeredoc */ .chroma .sh { color: #dd1144 } +/* LiteralStringInterpol */ .chroma .si { color: #dd1144 } +/* LiteralStringOther */ .chroma .sx { color: #dd1144 } +/* LiteralStringRegex */ .chroma .sr { color: #009926 } +/* LiteralStringSingle */ .chroma .s1 { color: #dd1144 } +/* LiteralStringSymbol */ .chroma .ss { color: #990073 } +/* LiteralNumber */ .chroma .m { color: #009999 } +/* LiteralNumberBin */ .chroma .mb { color: #009999 } +/* LiteralNumberFloat */ .chroma .mf { color: #009999 } +/* LiteralNumberHex */ .chroma .mh { color: #009999 } +/* LiteralNumberInteger */ .chroma .mi { color: #009999 } +/* LiteralNumberIntegerLong */ .chroma .il { color: #009999 } +/* LiteralNumberOct */ .chroma .mo { color: #009999 } +/* Operator */ .chroma .o { color: #000000; font-weight: bold } +/* OperatorWord */ .chroma .ow { color: #000000; font-weight: bold } +/* Punctuation */ .chroma .p { } +/* Comment */ .chroma .c { color: #999988; font-style: italic } +/* CommentHashbang */ .chroma .ch { color: #999988; font-style: italic } +/* CommentMultiline */ .chroma .cm { color: #999988; font-style: italic } +/* CommentSingle */ .chroma .c1 { color: #999988; font-style: italic } +/* CommentSpecial */ .chroma .cs { color: #999999; font-weight: bold; font-style: italic } +/* CommentPreproc */ .chroma .cp { color: #999999; font-weight: bold; font-style: italic } +/* CommentPreprocFile */ .chroma .cpf { color: #999999; font-weight: bold; font-style: italic } +/* Generic */ .chroma .g { } +/* GenericDeleted */ .chroma .gd { color: #000000; background-color: #ffdddd } +/* GenericEmph */ .chroma .ge { color: #000000; font-style: italic } +/* GenericError */ .chroma .gr { color: #aa0000 } +/* GenericHeading */ .chroma .gh { color: #999999 } +/* GenericInserted */ .chroma .gi { color: #000000; background-color: #ddffdd } +/* GenericOutput */ .chroma .go { color: #888888 } +/* GenericPrompt */ .chroma .gp { color: #555555 } +/* GenericStrong */ .chroma .gs { font-weight: bold } +/* GenericSubheading */ .chroma .gu { color: #aaaaaa } +/* GenericTraceback */ .chroma .gt { color: #aa0000 } +/* GenericUnderline */ .chroma .gl { text-decoration: underline } +/* TextWhitespace */ .chroma .w { color: #bbbbbb } diff --git a/docs/assets/search-data.js b/docs/assets/search-data.js new file mode 100644 index 0000000000..620fc380cf --- /dev/null +++ b/docs/assets/search-data.js @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +(function () { + const indexCfg = {{ with i18n "bookSearchConfig" }} + {{ . }}; + {{ else }} + {}; + {{ end }} + + indexCfg.doc = { + id: 'id', + field: ['title', 'content'], + store: ['title', 'href', 'section'], + }; + + const index = FlexSearch.create('balance', indexCfg); + window.bookSearchIndex = index; + + {{- $pages := where .Site.Pages "Kind" "in" (slice "page" "section") -}} + {{- $pages = where $pages "Params.booksearchexclude" "!=" true -}} + {{- $pages = where $pages "Content" "not in" (slice nil "") -}} + + {{ range $index, $page := $pages }} + index.add({ + 'id': {{ $index }}, + 'href': '{{ $page.RelPermalink }}', + 'title': {{ (partial "docs/simple-title" $page) | jsonify }}, + 'section': {{ (partial "docs/simple-title" $page.Parent) | jsonify }}, + 'content': {{ $page.Plain | jsonify }} + }); + {{- end -}} +})(); diff --git a/docs/build_docs.sh b/docs/build_docs.sh deleted file mode 100755 index 170af68386..0000000000 --- a/docs/build_docs.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -set -x - -# step-1: install dependencies -apt-get update -apt-get -y install git rsync python3-pip python3-git python3-stemmer python3-virtualenv python3-setuptools -python3 -m pip install -U sphinx==4.1.1 myst-parser==0.15.2 pygments==2.10.0 sphinx-rtd-theme==0.5.2 pyyaml==6.0 - -export REPO_NAME="${GITHUB_REPOSITORY##*/}" - -git config --global --add safe.directory /__w/${REPO_NAME}/${REPO_NAME} -export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct) -temp_docs_root=`mktemp -d` - -ls - -# step-1.5: copy main site content to temp dir -# this must be done before `make -C docs clean` otherwise the contents will be removed -rsync -avz "docs/site/" "${temp_docs_root}/" - -# step-2: build sites for all branches(for multiple versioned docs), excludes 'HEAD' and 'gh-pages' -make -C docs clean -branches="`git for-each-ref '--format=%(refname:lstrip=-1)' refs/remotes/origin/ | grep -viE '^(HEAD|gh-pages|release-1.0|release-1.1|release-1.2|release-1.3)$'| grep -iE '^(release-|master)'`" -for current_branch in ${branches}; do - export current_version=${current_branch} - git checkout ${current_branch} - - # skip the branch that has no docs - if [ ! -e 'docs/conf.py' ]; then - echo -e "\tINFO: Couldn't find 'docs/conf.py' for branch: ${current_branch}, just skip this branch" - continue - fi - echo "INFO: Building sites for branch: ${current_branch}" - sphinx-build -b html docs/ docs/_build/html/${current_branch} - - # copy the build content to temp dir - rsync -av "docs/_build/html/" "${temp_docs_root}/" - -done - -git checkout master -git config --global user.name "${GITHUB_ACTOR}" -git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" - -# step-3: push build sites to gh-pages branch -pushd "${temp_docs_root}" -git init -git remote add deploy "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" -git checkout -b gh-pages - -touch .nojekyll - -git add . -git commit -m "Generated docs from commit ${GITHUB_SHA}" -git push deploy gh-pages --force - -# pop back and exit -popd -exit 0 diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 33a6812d61..0000000000 --- a/docs/conf.py +++ /dev/null @@ -1,135 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Maven -# Build your Java project and run tests with Apache Maven. -# Add steps that analyze code, save build artifacts, deploy, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/java - -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. 
For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- -project = 'CDC Connectors for Apache Flink®' -copyright = '2022, Ververica GmbH; Apache Flink, Flink®, Apache®, the squirrel logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation' -author = 'ververica' - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx_rtd_theme', - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'myst_parser', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -html_favicon = '_static/fig/favicon.png' - -import myst_parser - -source_parsers = { - '.md': myst_parser -} -source_suffix = ['.md'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] - -html_context = { - 'css_files': [ - '_static/theme_overrides.css', # overrides for wide tables in RTD theme - ], -} - -try: - html_context -except NameError: - html_context = dict() -html_context['display_lower_left'] = True - -if 'REPO_NAME' in os.environ: - REPO_NAME = os.environ['REPO_NAME'] -else: - REPO_NAME = '' - -from git import Repo -repo = Repo( search_parent_directories=True ) -remote_refs = repo.remote().refs - -if 'current_version' in os.environ: - current_version = os.environ['current_version'] -else: - current_version = repo.active_branch.name - -html_context['current_version'] = current_version -html_context['version'] = current_version -html_context['github_version'] = current_version - -html_context['versions'] = list() -branches = [branch.name for branch in remote_refs] -for branch in branches: - if 'origin/' in branch and ('master' in branch or 'release-' in branch)\ - and 'HEAD' not in branch and 'gh-pages' not in branch \ - and 'release-1.0' not in branch and 'release-1.1' not in branch\ - and 'release-1.2' not in branch and 'release-1.3' not in branch: - version = branch[7:] - html_context['versions'].append( (version, '/' +REPO_NAME+ '/' +version+ '/') ) - -html_context['display_github'] = True -html_context['github_user'] = 'ververica' -html_context['github_repo'] = 'flink-cdc-connectors' diff --git a/docs/config.toml b/docs/config.toml new file mode 100644 index 0000000000..44a5b33100 --- /dev/null +++ b/docs/config.toml @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +baseURL = '//nightlies.apache.org/flink/flink-cdc-docs-master' +languageCode = 'en-us' +title = 'Apache Flink CDC' +enableGitInfo = false +theme = "book" +pygmentsUseClasses = true + +[params] + # Flag whether this is a stable version or not. + # Used for the quickstart page. + IsStable = false + + # Flag to indicate whether an outdated warning should be shown. + ShowOutDatedWarning = false + + # This is the version referenced in the docs. Please only use these variables + # to reference a specific Flink version, because this is the only place where + # we change the version for the complete docs when forking of a release branch + # etc. + # The full version string as referenced in Maven (e.g. 1.2.1) + Version = "3.1-SNAPSHOT" + + # For stable releases, leave the bugfix version out (e.g. 1.2). 
For snapshot + # release this should be the same as the regular version + VersionTitle = "3.1-SNAPSHOT" + + # The branch for this version of Apache Flink CDC + Branch = "master" + + # The GitHub repository for Apache Flink CDC + Repo = "//github.com/apache/flink-cdc" + + GithubRepo = "https://github.com/apache/flink-cdc.git" + + ProjectHomepage = "//flink.apache.org" + + # External links at the bottom + # of the menu + MenuLinks = [ + ["Project Homepage", "//flink.apache.org"], + ["JavaDocs", "//nightlies.apache.org/flink/flink-cdc-docs-master/api/java/"], + ] + + PreviousDocs = [ + ["3.0", "https://nightlies.apache.org/flink-cdc/flink-cdc-docs-release-3.0"], + ] + +[markup] +[markup.goldmark.renderer] + unsafe = true + +[languages] +[languages.en] +languageName = 'English' +contentDir = 'content' +weight = 1 + +[languages.zh] +languageName = '中文版' +contentDir = 'content.zh' +weight = 2 + +[module] +[[module.imports.mounts]] +source = 'content' +target = 'content' +lang = 'en' +[[module.imports.mounts]] +source = 'content.zh' +target = 'content' +lang = 'zh' +[[module.imports.mounts]] +source = 'layouts' +target = 'layouts' +[[module.imports.mounts]] +source = 'data' +target = 'data' \ No newline at end of file diff --git a/docs/content.zh/_index.md b/docs/content.zh/_index.md new file mode 100644 index 0000000000..e0f2e94530 --- /dev/null +++ b/docs/content.zh/_index.md @@ -0,0 +1,58 @@ +--- +title: Apache Flink CDC +type: docs +bookToc: false +--- + + +#### + +
+  Flink CDC: Change Data Capture Solution Of Apache Flink
+  Set of source connectors for Apache Flink® directly ingesting changes coming from different databases using Change Data Capture (CDC).
    + +Flink CDC integrates Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium). + +{{< img src="/fig/cdc-flow.png" alt="Stateful Functions" width="50%" >}} + +Flink CDC supports ingesting snapshot data and real time changes from databases to Flink® and then transform and sink to various downstream systems. + +{{< columns >}} +## Try Flink CDC + +If you’re interested in playing around with Flink CDC, check out our [quick +start]({{< ref "docs/try-flink-cdc" >}}). It provides multiple examples to submit and execute a Flink CDC job on a Flink cluster. + +<---> + +## Get Help with Flink CDC + +If you get stuck, check out our [community support +resources](https://flink.apache.org/community.html). In particular, Apache +Flink’s user mailing list is consistently ranked as one of the most active of +any Apache project, and is a great way to get help quickly. + +{{< /columns >}} + +Flink CDC is developed under the umbrella of [Apache +Flink](https://flink.apache.org/). diff --git a/docs/content.zh/docs/connectors/_index.md b/docs/content.zh/docs/connectors/_index.md new file mode 100644 index 0000000000..95f83ece66 --- /dev/null +++ b/docs/content.zh/docs/connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: Connectors +icon: +bold: true +bookCollapseSection: true +weight: 3 +--- + diff --git a/docs/content/formats/index.md b/docs/content.zh/docs/connectors/cdc-connectors/_index.md similarity index 92% rename from docs/content/formats/index.md rename to docs/content.zh/docs/connectors/cdc-connectors/_index.md index 4146a2bf41..64aa8234bf 100644 --- a/docs/content/formats/index.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/_index.md @@ -1,3 +1,8 @@ +--- +title: CDC Connectors +bookCollapseSection: true +weight: 2 +--- - -# Formats - -```{toctree} -:maxdepth: 2 - -changelog-json -``` \ No newline at end of file diff --git a/docs/content/connectors/db2-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/db2-cdc.md similarity index 88% rename from docs/content/connectors/db2-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/db2-cdc.md index fbb3ae3655..12fc36e5d5 100644 --- a/docs/content/connectors/db2-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/db2-cdc.md @@ -1,3 +1,10 @@ +--- +title: "Db2 CDC Connector" +weight: 9 +type: docs +aliases: +- /connectors/cdc-connectors/db2-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-db2-cdc >}} ### SQL Client JAR @@ -55,7 +55,7 @@ put it under `/lib/`. **Note:** flink-sql-connector-db2-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as -[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-connector-db2-cdc), +[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-db2-cdc), the released version will be available in the Maven central warehouse. Setup Db2 server @@ -256,8 +256,6 @@ public class Db2SourceExample { } ``` -**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization. 
- Data Type Mapping ---------------- @@ -380,7 +378,4 @@ Data Type Mapping -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) +{{< top >}} diff --git a/docs/content/connectors/mongodb-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/mongodb-cdc.md similarity index 96% rename from docs/content/connectors/mongodb-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/mongodb-cdc.md index 1158b6cc46..3c560f16e2 100644 --- a/docs/content/connectors/mongodb-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/mongodb-cdc.md @@ -1,3 +1,10 @@ +--- +title: "MongoDB CDC Connector" +weight: 2 +type: docs +aliases: +- /connectors/cdc-connectors/mongodb-cdc.html +--- - 3.0-SNAPSHOT - -``` + +{{< artifact flink-connector-mongodb-cdc >}} ### SQL Client JAR ```Download link is available only for stable releases.``` -Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. +Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse. Setup MongoDB ---------------- @@ -689,7 +690,4 @@ Reference - [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/) - [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/) -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) \ No newline at end of file +{{< top >}} diff --git a/docs/content/connectors/mysql-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/mysql-cdc.md similarity index 98% rename from docs/content/connectors/mysql-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/mysql-cdc.md index a2b95a8af3..caaae51d36 100644 --- a/docs/content/connectors/mysql-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/mysql-cdc.md @@ -1,3 +1,10 @@ +--- +title: "MySQL CDC Connector" +weight: 7 +type: docs +aliases: +- /connectors/cdc-connectors/mysql-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-mysql-cdc >}} ### SQL Client JAR @@ -50,7 +50,7 @@ In order to setup the MySQL CDC connector, the following table provides dependen Download flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar and put it under `/lib/`. 
-**Note:** flink-sql-connector-mysql-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mysql-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-connector-mysql-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-mysql-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mysql-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc), the released version will be available in the Maven central warehouse. Setup MySQL server ---------------- @@ -697,8 +697,6 @@ public class MySqlSourceExample { } ``` -**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization. - ### Scan Newly Added Tables Scan Newly Added Tables feature enables you add new tables to monitor for existing running pipeline, the newly added tables will read theirs snapshot data firstly and then read their changelog automatically. @@ -1107,7 +1105,4 @@ The example for different spatial data types mapping is as follows: -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) +{{< top >}} diff --git a/docs/content/connectors/oceanbase-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/oceanbase-cdc.md similarity index 97% rename from docs/content/connectors/oceanbase-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/oceanbase-cdc.md index 864f5f44ce..1e7887e8a0 100644 --- a/docs/content/connectors/oceanbase-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/oceanbase-cdc.md @@ -1,3 +1,10 @@ +--- +title: "OceanBase CDC Connector" +weight: 4 +type: docs +aliases: +- /connectors/cdc-connectors/oceanbase-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-oceanbase-cdc >}} If you want to use OceanBase JDBC driver to connect to the enterprise edition database, you should also include the following dependency in your class path. @@ -49,9 +49,9 @@ If you want to use OceanBase JDBC driver to connect to the enterprise edition da ```Download link is available only for stable releases.``` -Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. +Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**Note:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oceanbase-cdc), the released version will be available in the Maven central warehouse. 
+**Note:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oceanbase-cdc), the released version will be available in the Maven central warehouse. For JDBC driver, the cdc jar above already contains MySQL JDBC driver 5.1.47, which is our recommended version. Due to the license issue, we can not include the OceanBase JDBC driver in the cdc jar. If you need to use it, you can download it from [here](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar) and put it under `/lib/`, you also need to set the start option `jdbc.driver` to `com.oceanbase.jdbc.Driver`. @@ -785,4 +785,6 @@ Data Type Mapping - \ No newline at end of file + + +{{< top >}} diff --git a/docs/content/connectors/oracle-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/oracle-cdc.md similarity index 96% rename from docs/content/connectors/oracle-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/oracle-cdc.md index ddd4881df3..76c24f1b59 100644 --- a/docs/content/connectors/oracle-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/oracle-cdc.md @@ -1,3 +1,10 @@ +--- +title: "Oracle CDC Connector" +weight: 5 +type: docs +aliases: +- /connectors/cdc-connectors/oracle-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-oracle-cdc >}} ### SQL Client JAR **Download link is available only for stable releases.** -Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. +Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**Note:** flink-sql-connector-oracle-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oracle-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-oracle-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oracle-cdc), the released version will be available in the Maven central warehouse. Setup Oracle ---------------- @@ -588,8 +588,6 @@ public class OracleSourceExample { } ``` -**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization. - Data Type Mapping ----------------
    @@ -700,7 +698,4 @@ Data Type Mapping
    -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) \ No newline at end of file +{{< top >}} diff --git a/docs/content/overview/cdc-connectors.md b/docs/content.zh/docs/connectors/cdc-connectors/overview.md similarity index 61% rename from docs/content/overview/cdc-connectors.md rename to docs/content.zh/docs/connectors/cdc-connectors/overview.md index b209d5afd4..56ff59a615 100644 --- a/docs/content/overview/cdc-connectors.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/overview.md @@ -1,3 +1,10 @@ +--- +title: "Overview" +weight: 1 +type: docs +aliases: +- /connectors/cdc-connectors/ +--- flink-connector-mysql-cdc - + 3.0-SNAPSHOT ``` @@ -297,97 +304,4 @@ you can construct `JsonDebeziumDeserializationSchema` as following: new JsonDebeziumDeserializationSchema(true, customConverterConfigs); ``` -## Building from source - -Prerequisites: -- git -- Maven -- At least Java 8 - -``` -git clone https://github.com/ververica/flink-cdc-connectors.git -cd flink-cdc-connectors -mvn clean install -DskipTests -``` - -The dependencies are now available in your local `.m2` repository. - -### Code Contribute - -1. Left comment under the issue that you want to take -2. Fork Flink CDC project to your GitHub repositories - ![fork](/_static/fig/contribute_guidance/fork.png "fork") -3. Clone and compile your Flink CDC project - ```bash - git clone https://github.com/your_name/flink-cdc-connectors.git - cd flink-cdc-connectors - mvn clean install -DskipTests - ``` -4. Check to a new branch and start your work - ```bash - git checkout -b my_feature - -- develop and commit - ``` - ![check_branch](/_static/fig/contribute_guidance/check_branch.png "check_branch") -5. Push your branch to your github - ```bash - git push origin my_feature - ``` -6. Open a PR to https://github.com/ververica/flink-cdc-connectors - ![open_pr](/_static/fig/contribute_guidance/open_pr.png "open_pr") - -### Code Style - -#### Code Formatting - -You need to install the google-java-format plugin. Spotless together with google-java-format is used to format the codes. - -It is recommended to automatically format your code by applying the following settings: - -1. Go to "Settings" → "Other Settings" → "google-java-format Settings". -2. Tick the checkbox to enable the plugin. -3. Change the code style to "Android Open Source Project (AOSP) style". -4. Go to "Settings" → "Tools" → "Actions on Save". -5. Under "Formatting Actions", select "Optimize imports" and "Reformat file". -6. From the "All file types list" next to "Reformat code", select "Java". - -For earlier IntelliJ IDEA versions, the step 4 to 7 will be changed as follows. - -- 4.Go to "Settings" → "Other Settings" → "Save Actions". -- 5.Under "General", enable your preferred settings for when to format the code, e.g. "Activate save actions on save". -- 6.Under "Formatting Actions", select "Optimize imports" and "Reformat file". -- 7.Under "File Path Inclusions", add an entry for `.*\.java` to avoid formatting other file types. - Then the whole project could be formatted by command `mvn spotless:apply`. - -#### Checkstyle - -Checkstyle is used to enforce static coding guidelines. - -1. Go to "Settings" → "Tools" → "Checkstyle". -2. Set "Scan Scope" to "Only Java sources (including tests)". -3. For "Checkstyle Version" select "8.14". -4. Under "Configuration File" click the "+" icon to add a new configuration. -5. Set "Description" to "Flink cdc". -6. 
Select "Use a local Checkstyle file" and link it to the file `tools/maven/checkstyle.xml` which is located within your cloned repository. -7. Select "Store relative to project location" and click "Next". -8. Configure the property `checkstyle.suppressions.file` with the value `suppressions.xml` and click "Next". -9. Click "Finish". -10. Select "Flink cdc" as the only active configuration file and click "Apply". - -You can now import the Checkstyle configuration for the Java code formatter. - -1. Go to "Settings" → "Editor" → "Code Style" → "Java". -2. Click the gear icon next to "Scheme" and select "Import Scheme" → "Checkstyle Configuration". -3. Navigate to and select `tools/maven/checkstyle.xml` located within your cloned repository. - -Then you could click "View" → "Tool Windows" → "Checkstyle" and find the "Check Module" button in the opened tool window to validate checkstyle. Or you can use the command `mvn clean compile checkstyle:checkstyle` to validate. - -### Documentation Contribute - -Flink cdc documentations locates at `docs/content`. - -The contribution step is the same as the code contribution. We use markdown as the source code of the document. - -## License - -The code in this repository is licensed under the [Apache Software License 2](https://github.com/ververica/flink-cdc-connectors/blob/master/LICENSE). +{{< top >}} diff --git a/docs/content/connectors/postgres-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/postgres-cdc.md similarity index 96% rename from docs/content/connectors/postgres-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/postgres-cdc.md index 76bdb1073b..d1504b6aae 100644 --- a/docs/content/connectors/postgres-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/postgres-cdc.md @@ -1,3 +1,10 @@ +--- +title: "Postgres CDC Connector" +weight: 6 +type: docs +aliases: +- /connectors/cdc-connectors/postgres-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-postgres-cdc >}} ### SQL Client JAR @@ -43,7 +43,7 @@ In order to setup the Postgres CDC connector, the following table provides depen Download flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar and put it under `/lib/`. -**Note:** flink-sql-connector-postgres-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-postgres-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-postgres-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-postgres-cdc), the released version will be available in the Maven central warehouse. How to create a Postgres CDC table ---------------- @@ -521,7 +521,6 @@ public class PostgreSQLSourceExample { } } ``` -**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization. 
Data Type Mapping ---------------- @@ -618,7 +617,4 @@ Data Type Mapping -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) +{{< top >}} diff --git a/docs/content/connectors/sqlserver-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/sqlserver-cdc.md similarity index 96% rename from docs/content/connectors/sqlserver-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/sqlserver-cdc.md index c07b6c68d8..68553196ca 100644 --- a/docs/content/connectors/sqlserver-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/sqlserver-cdc.md @@ -1,3 +1,10 @@ +--- +title: "SQLServer CDC Connector" +weight: 7 +type: docs +aliases: +- /connectors/cdc-connectors/sqlserver-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-sqlserver-cdc >}} ### SQL Client JAR ```Download link is available only for stable releases.``` -Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. +Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**Note:** flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-sqlserver-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-sqlserver-cdc), the released version will be available in the Maven central warehouse. Setup SQLServer Database ---------------- @@ -230,7 +230,7 @@ Connector Options so it does not need to be explicitly configured 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true' - + scan.incremental.snapshot.chunk.key-column optional (none) @@ -408,7 +408,6 @@ public class SqlServerIncrementalSourceExample { } } ``` -**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization. 
Data Type Mapping ---------------- @@ -504,3 +503,5 @@ Data Type Mapping + +{{< top >}} diff --git a/docs/content/connectors/tidb-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/tidb-cdc.md similarity index 95% rename from docs/content/connectors/tidb-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/tidb-cdc.md index 6f51b6b540..46662858c7 100644 --- a/docs/content/connectors/tidb-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/tidb-cdc.md @@ -1,3 +1,10 @@ +--- +title: "TiDB CDC Connector" +weight: 8 +type: docs +aliases: +- /connectors/cdc-connectors/tidb-cdc.html +--- - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-tidb-cdc >}} ### SQL Client JAR ```Download link is available only for stable releases.``` -Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. +Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**Note:** flink-sql-connector-tidb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-tidb-cdc), the released version will be available in the Maven central warehouse. +**Note:** flink-sql-connector-tidb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-tidb-cdc), the released version will be available in the Maven central warehouse. 
How to create a TiDB CDC table ---------------- @@ -492,3 +492,5 @@ Data Type Mapping + +{{< top >}} diff --git a/docs/content/connectors/vitess-cdc.md b/docs/content.zh/docs/connectors/cdc-connectors/vitess-cdc.md similarity index 96% rename from docs/content/connectors/vitess-cdc.md rename to docs/content.zh/docs/connectors/cdc-connectors/vitess-cdc.md index eb00195935..c722adc975 100644 --- a/docs/content/connectors/vitess-cdc.md +++ b/docs/content.zh/docs/connectors/cdc-connectors/vitess-cdc.md @@ -1,3 +1,10 @@ +--- +title: "Vitess CDC Connector" +weight: 10 +type: docs +aliases: +- /connectors/cdc-connectors/vitess-cdc.html +--- - -# Overview - -```{toctree} -:maxdepth: 2 -:caption: Contents -cdc-connectors -cdc-pipeline -``` \ No newline at end of file diff --git a/docs/content/pipelines/doris-pipeline.md b/docs/content.zh/docs/connectors/pipeline-connectors/doris-pipeline.md similarity index 97% rename from docs/content/pipelines/doris-pipeline.md rename to docs/content.zh/docs/connectors/pipeline-connectors/doris-pipeline.md index 0daea1bafe..94b788636f 100644 --- a/docs/content/pipelines/doris-pipeline.md +++ b/docs/content.zh/docs/connectors/pipeline-connectors/doris-pipeline.md @@ -1,3 +1,10 @@ +--- +title: "Doris Pipeline Connector" +weight: 2 +type: docs +aliases: +- /pipelines/doris-pipeline.html +--- + +# Pipeline Connectors Of CDC Streaming ELT Framework + +## Supported Connectors + +| Connector | Database | +|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [doris-pipeline](doris-pipeline.md) |
• [Doris](https://doris.apache.org/): 1.2.x, 2.x.x |
+| [mysql-pipeline](mysql-pipeline.md)         | • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <br> • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <br> • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <br> • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <br> • [MariaDB](https://mariadb.org): 10.x <br> • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 |
+| [starrocks-pipeline](starrocks-pipeline.md) |
  • [StarRocks](https://www.starrocks.io/): 2.x, 3.x | + +## Supported Flink Versions +The following table shows the version mapping between Flink® CDC Pipeline and Flink®: + +| Flink® CDC Version | Flink® Version | +|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| 3.0.* | 1.14.\*, 1.15.\*, 1.16.\*, 1.17.\*, 1.18.\* | + +{{< top >}} diff --git a/docs/content/pipelines/starrocks-pipeline.md b/docs/content.zh/docs/connectors/pipeline-connectors/starrocks-pipeline.md similarity index 98% rename from docs/content/pipelines/starrocks-pipeline.md rename to docs/content.zh/docs/connectors/pipeline-connectors/starrocks-pipeline.md index 39434d221e..87eb72aef0 100644 --- a/docs/content/pipelines/starrocks-pipeline.md +++ b/docs/content.zh/docs/connectors/pipeline-connectors/starrocks-pipeline.md @@ -1,3 +1,10 @@ +--- +title: "StarRocks Pipeline Connector" +weight: 4 +type: docs +aliases: +- /pipelines/starrocks-pipeline.html +--- diff --git a/docs/content/quickstart/build-real-time-data-lake-tutorial.md b/docs/content.zh/docs/development/build-real-time-data-lake-tutorial.md similarity index 92% rename from docs/content/quickstart/build-real-time-data-lake-tutorial.md rename to docs/content.zh/docs/development/build-real-time-data-lake-tutorial.md index 71b4295288..f468f9e6df 100644 --- a/docs/content/quickstart/build-real-time-data-lake-tutorial.md +++ b/docs/content.zh/docs/development/build-real-time-data-lake-tutorial.md @@ -1,3 +1,12 @@ +--- +title: "Building a Real-time Data Lake with Flink CDC" +weight: 999 +type: docs +aliases: +- /development/build-real-time-data-lake-tutorial.html + +--- + -# DataStream api package guidance +# DataStream Api Package Guidance This guide provides a simple pom example of mysql cdc DataStream api @@ -34,7 +42,7 @@ flink 1.17.2 flink mysql cdc 2.4.2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - com.ververica + org.apache.flink FlinkCDCTest 1.0-SNAPSHOT @@ -113,7 +121,7 @@ flink 1.17.2 flink mysql cdc 2.4.2 30.1.1-jre-16.1 - com.ververica + org.apache.flink flink-connector-mysql-cdc 2.4.2 @@ -164,8 +172,8 @@ flink 1.17.2 flink mysql cdc 2.4.2 io.debezium:debezium-core io.debezium:debezium-ddl-parser io.debezium:debezium-connector-mysql - com.ververica:flink-connector-debezium - com.ververica:flink-connector-mysql-cdc + org.apache.flink:flink-connector-debezium + org.apache.flink:flink-connector-mysql-cdc org.antlr:antlr4-runtime org.apache.kafka:* mysql:mysql-connector-java @@ -228,7 +236,7 @@ flink 1.17.2 flink mysql cdc 2.4.2 ## code example ```java -package com.ververica.flink.cdc; +package org.apache.flink.flink.cdc; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -265,3 +273,4 @@ public class CdcTest { ``` +{{< top >}} diff --git a/docs/content.zh/docs/faq/_index.md b/docs/content.zh/docs/faq/_index.md new file mode 100644 index 0000000000..1a6cd62e5d --- /dev/null +++ b/docs/content.zh/docs/faq/_index.md @@ -0,0 +1,25 @@ +--- +title: "FAQ" +icon: +bold: true +bookCollapseSection: true +weight: 4 +--- + diff --git a/docs/content.zh/docs/faq/faq.md b/docs/content.zh/docs/faq/faq.md new file mode 100644 index 0000000000..3077694015 --- /dev/null +++ 
b/docs/content.zh/docs/faq/faq.md
@@ -0,0 +1,330 @@
+---
+title: "FAQ"
+weight: 1
+type: docs
+aliases:
+- /faq/faq.html
+---
+
+## General FAQ
+
+### Q1: Why can't I download flink-sql-connector-mysql-cdc-2.2-SNAPSHOT.jar? Why does the Maven repository not host xxx-SNAPSHOT versions?
+
+As in mainstream Maven version management, a SNAPSHOT version corresponds to the code on the development branch, so users have to download the source code and build the jar themselves. Users should instead use released versions, such as flink-sql-connector-mysql-cdc-2.1.0.jar, which are available in the Maven central repository.
+
+### Q2: When should I use flink-sql-connector-xxx.jar, and when flink-connector-xxx.jar? What is the difference between the two?
+
+The dependency management of each connector in the Flink CDC project is consistent with that in the Flink project. flink-sql-connector-xxx is a fat jar: in addition to the connector code, it shades all of the connector's third-party dependencies and is meant for SQL jobs — users only need to put the fat jar into the flink/lib directory. flink-connector-xxx contains only the connector code without its dependencies and is meant for DataStream jobs — users have to manage the third-party dependencies themselves and exclude or shade conflicting dependencies on their own.
+
+### Q3: Why was the group ID changed from com.alibaba.ververica to org.apache.flink? Why can't the 2.x versions be found under the old path in the Maven repository?
+
+The Flink CDC project changed its group ID from com.alibaba.ververica to org.apache.flink starting with version 2.0.0, to make the project more community-neutral and easier for developers from different companies to build on. Therefore, look up the 2.x packages in the Maven repository under the path /org/apache/flink.
+
+## MySQL CDC FAQ
+
+### Q1: I use CDC 2.x and can only read the full snapshot data, but no binlog data. What is wrong?
+
+CDC 2.0 supports a lock-free algorithm and concurrent reading. To guarantee the order of full data + incremental data, it relies on Flink's checkpoint mechanism, so the job needs checkpointing to be enabled.
+
+Configuration in a SQL job:
+
+```sql
+Flink SQL> SET 'execution.checkpointing.interval' = '3s';
+```
+
+Configuration in a DataStream job:
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+env.enableCheckpointing(3000);
+```
+
+### Q2: Using the MySQL CDC DataStream API, the timestamp fields read in the incremental phase are off by 8 hours. What is wrong?
+
+When parsing timestamp fields in binlog data, CDC uses the server time zone configured in the job, which must match the time zone of the MySQL server. If the configured time zone is not consistent with the MySQL server's time zone, this problem occurs.
+
+In addition, if a custom deserializer is used in the DataStream job (for example, MyDeserializer implements DebeziumDeserializationSchema), the custom deserializer needs to parse timestamp types the same way RowDataDebeziumDeserializeSchema does and use the given server time zone information.
+
+```java
+private TimestampData convertToTimestamp(Object dbzObj, Schema schema) {
+    if (dbzObj instanceof Long) {
+        switch (schema.name()) {
+            case Timestamp.SCHEMA_NAME:
+                return TimestampData.fromEpochMillis((Long) dbzObj);
+            case MicroTimestamp.SCHEMA_NAME:
+                long micro = (long) dbzObj;
+                return TimestampData.fromEpochMillis(micro / 1000, (int) (micro % 1000 * 1000));
+            case NanoTimestamp.SCHEMA_NAME:
+                long nano = (long) dbzObj;
+                return TimestampData.fromEpochMillis(nano / 1000_000, (int) (nano % 1000_000));
+        }
+    }
+    LocalDateTime localDateTime = TemporalConversions.toLocalDateTime(dbzObj, serverTimeZone);
+    return TimestampData.fromLocalDateTime(localDateTime);
+}
+```
+
+### Q3: Does MySQL CDC support reading from a replica (slave) database? How should the replica be configured?
+
+Yes. The replica needs to be configured with log-slave-updates = 1 so that it also writes the data synchronized from the primary instance into its own binlog. If the primary database has GTID mode enabled, the replica needs it enabled as well:
+
+```
+log-slave-updates = 1
+gtid_mode = on
+enforce_gtid_consistency = on
+```
+
+### Q4: I want to synchronize sharded databases and tables. How should I configure them?
+
+In the WITH options of a MySQL CDC table, both table-name and database-name support regular expressions. For example, 'table-name' = 'user_.*' matches the tables user_1, user_2 and user_a.
+
+Note that matching any characters is written as '.*', not '*': the dot stands for any single character and the asterisk for zero or more repetitions. The same applies to database-name. Also note that the sharded tables should share the same schema.
+
+### Q5: I want to skip the snapshot reading phase and only read binlog data. How do I configure that?
+
+In the WITH options of the MySQL CDC table, set
+
+```
+'scan.startup.mode' = 'latest-offset'
+```
+
+### Q6: I want to get DDL events from the database. What should I do? Is there a demo?
+
+Flink CDC provides the DataStream API source `MySqlSource` since version 2.1. Users can configure includeSchemaChanges to indicate whether DDL events are required, and then write their own code to process the obtained DDL events.
+
+```java
+    public void consumingAllEvents() throws Exception {
+        inventoryDatabase.createAndInitialize();
+        MySqlSource<String> mySqlSource =
+                MySqlSource.<String>builder()
+                        .hostname(MYSQL_CONTAINER.getHost())
+                        .port(MYSQL_CONTAINER.getDatabasePort())
+                        .databaseList(inventoryDatabase.getDatabaseName())
+                        .tableList(inventoryDatabase.getDatabaseName() + ".products")
+                        .username(inventoryDatabase.getUsername())
+                        .password(inventoryDatabase.getPassword())
+                        .serverId("5401-5404")
+                        .deserializer(new JsonDebeziumDeserializationSchema())
+                        .includeSchemaChanges(true) // Configure here and output DDL events
+                        .build();
+        ... // Other processing logic
+    }
+```
+
+### Q7: How do I synchronize a whole MySQL database? Does Flink CDC support it?
+
+The DataStream API shown in Q6 already lets users obtain both DDL change events and data change events. On top of that, users need to develop their own DataStream job according to their business logic and downstream storage.
+
+### Q8: In the same MySQL instance, the tables of one database cannot synchronize incremental data while the other databases work fine. Why?
+
+Users can check Binlog_Ignore_DB and Binlog_Do_DB with the `show master status` command:
+
+```mysql
+mysql> show master status;
++------------------+----------+--------------+------------------+----------------------+
+| File             | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set    |
++------------------+----------+--------------+------------------+----------------------+
+| mysql-bin.000006 | 4594     |              |                  | xxx:1-15             |
++------------------+----------+--------------+------------------+----------------------+
+```
+
+### Q9: The job reports the error "The connector is trying to read binlog starting at GTIDs xxx and binlog file 'binlog.000064', pos=89887992, skipping 4 events plus 1 rows, but this is no longer available on the server. Reconfigure the connector to use a snapshot when needed." What should I do?
+
+This error occurs because the binlog file the job is reading has already been cleaned up on the MySQL server, usually because the binlog retention time on the server is too short. You can increase this value, for example to 7 days:
+
+```mysql
+mysql> show variables like 'expire_logs_days';
+mysql> set global expire_logs_days=7;
+```
+
+It can also happen when the Flink CDC job consumes the binlog too slowly; in that case, allocate sufficient resources to the job.
+
+### Q10: The job reports the error "ConnectException: A slave with the same server_uuid/server_id as this slave has connected to the master." What should I do?
+
+This error occurs because the server id used in the job conflicts with the server id used by another job or another synchronization tool. Server ids must be globally unique; a server id is an int-type integer. In CDC 2.x, each parallel instance of the source needs its own server id, so the ids should be planned accordingly. For example, if the source of the job runs with a parallelism of 4, you can configure 'server-id' = '5001-5004' so that the server id of each source task does not conflict.
+
+### Q11: The job reports the error "ConnectException: Received DML '…' for processing, binlog probably contains events generated with statement or mixed based replication format." What should I do?
+
+This error occurs because the MySQL server is not configured correctly: the binlog format must be ROW. You can check it with the following command:
+
+```mysql
+mysql> show variables like '%binlog_format%';
+```
+
+### Q12: The job reports the error "MySQL 8.0 Public Key Retrieval is not allowed". What should I do?
+
+This happens because the configured MySQL user uses sha256 password authentication, which requires TLS or a similar protocol to transmit the password. A simple workaround is to allow the MySQL user to use native password access:
+
+```mysql
+mysql> ALTER USER 'username'@'localhost' IDENTIFIED WITH mysql_native_password BY 'password';
+mysql> FLUSH PRIVILEGES;
+```
+
+### Q13: The job reports the error "EventDataDeserializationException: Failed to deserialize data of EventHeaderV4 .... Caused by: java.net.SocketException: Connection reset". What should I do?
+
+This problem is usually caused by the network. First check the network between the Flink cluster and the database, then increase the network-related parameters of the MySQL server:
+
+```mysql
+mysql> set global slave_net_timeout = 120;
+mysql> set global thread_pool_idle_timeout = 120;
+```
+
+Or try the following Flink configuration:
+
+```
+execution.checkpointing.interval=10min
+execution.checkpointing.tolerable-failed-checkpoints=100
+restart-strategy=fixed-delay
+restart-strategy.fixed-delay.attempts=2147483647
+restart-strategy.fixed-delay.delay= 30s
+```
+
+If the job suffers from heavy backpressure, this problem can also happen; in that case, resolve the backpressure in the job first.
+
+### Q14: The job reports the error "The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires." What should I do?
+
+This problem occurs because the snapshot (full) phase of the job reads too slowly: by the time the snapshot phase finishes, the GTID position recorded at its beginning has already been purged by MySQL. You can either increase the retention time of binlog files on the MySQL server or increase the parallelism of the source to make the snapshot phase finish faster.
+
+### Q15: How do I configure the `tableList` option when building a MySQL CDC source in the DataStream API?
+
+In the DataStream API, the `tableList` option requires table names qualified with the database name rather than bare table names. For the MySQL CDC source, a `tableList` value looks like 'my_db.my_table'.
+
+## Postgres CDC FAQ
+
+### Q1: The disk utilization of the PG server is high. Why is the WAL not released?
+
+Flink Postgres CDC only updates the LSN in the Postgres replication slot when a checkpoint completes. Therefore, if you find that the disk utilization is high, first confirm that checkpointing is enabled.
+
+### Q2: Flink Postgres CDC returns null for decimal types exceeding the maximum precision (38, 18) when synchronizing Postgres
+
+In Flink, if the precision of the received data is greater than the precision of the type declared in Flink, the data is treated as null. You can configure 'debezium.decimal.handling.mode' = 'string' to read such data as strings.
+
+### Q3: Flink Postgres CDC reports that TOAST data is not transmitted. What is the reason?
+
+First make sure the replica identity is FULL. TOAST data is relatively large, and to save WAL size, the wal2json plugin does not include TOAST data in an updated record if the TOAST data has not changed. To avoid this problem, you can set 'debezium.schema.refresh.mode' = 'columns_diff_exclude_unchanged_toast'.
+
+### Q4: The job reports the error "replication slot "XXXX" is active". What should I do?
+
+Currently, Flink Postgres CDC does not release the replication slot after the job exits. There are two ways to solve this problem:
+
+- Go to Postgres and manually drop the slot:
+
+```
+select pg_drop_replication_slot('rep_slot');
+    ERROR:  replication slot "rep_slot" is active for PID 162564
+select pg_terminate_backend(162564); select pg_drop_replication_slot('rep_slot');
+```
+
+- Add 'debezium.slot.drop.on.stop' = 'true' to the WITH options of the Postgres source so that the slot is dropped automatically when the job stops.
+
+### Q5: The job receives dirty data, such as illegal dates. Is there a parameter to filter it out?
+
+Yes. You can add 'debezium.event.deserialization.failure.handling.mode' = 'warn' to the WITH options of the Flink CDC table to skip dirty data and print it to the WARN log, or configure 'debezium.event.deserialization.failure.handling.mode' = 'ignore' to skip dirty data without logging it.
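To make Q5 concrete, here is a minimal, hypothetical Table API sketch showing where the option goes in a Postgres CDC table declaration; the table schema, host, credentials and slot name below are placeholders and not taken from this FAQ:

```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class PostgresDirtyDataSkipExample {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // Declare a Postgres CDC source table; the Debezium option from Q5 is passed
        // through the table options with the 'debezium.' prefix.
        tEnv.executeSql(
                "CREATE TABLE shipments (\n"
                        + "  shipment_id INT,\n"
                        + "  order_id INT,\n"
                        + "  origin STRING\n"
                        + ") WITH (\n"
                        + "  'connector' = 'postgres-cdc',\n"
                        + "  'hostname' = 'localhost',\n"
                        + "  'port' = '5432',\n"
                        + "  'username' = 'postgres',\n"
                        + "  'password' = 'postgres',\n"
                        + "  'database-name' = 'postgres',\n"
                        + "  'schema-name' = 'public',\n"
                        + "  'table-name' = 'shipments',\n"
                        + "  'slot.name' = 'flink',\n"
                        // Skip records that cannot be deserialized and log them at WARN level.
                        + "  'debezium.event.deserialization.failure.handling.mode' = 'warn'\n"
                        + ")");

        // The table can then be queried as usual, e.g.:
        // tEnv.executeSql("SELECT * FROM shipments").print();
    }
}
```

Replacing 'warn' with 'ignore' drops the dirty records silently instead of logging them, as described above.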
+
+### Q6: How do I configure the `tableList` option when building a Postgres CDC source in the DataStream API?
+
+In the DataStream API, the `tableList` option requires table names qualified with the schema name rather than bare table names. For the Postgres CDC source, a `tableList` value looks like 'my_schema.my_table'.
+
+## MongoDB CDC FAQ
+
+### Q1: Does MongoDB CDC support full + incremental reading and incremental-only reading?
+
+Yes. The default is full + incremental reading; set the 'copy.existing' = 'false' option for incremental-only reading.
+
+### Q2: Does MongoDB CDC support recovering from a checkpoint? How does it work?
+
+Yes. The checkpoint records the resume token of the change stream, and during recovery the change stream is restored from that resume token. The resume token corresponds to a position in `oplog.rs` (the MongoDB change log collection), which is a capped collection. If the record corresponding to the resume token no longer exists in `oplog.rs`, an invalid resume token exception may occur. In this case, you can set an appropriate size for `oplog.rs` so that its retention time is not too short; see https://docs.mongodb.com/manual/tutorial/change-oplog-size/. In addition, the resume token is refreshed by newly arriving change records and heartbeat records.
+
+### Q3: Does MongoDB CDC support outputting -U (update_before) messages?
+
+MongoDB's native `oplog.rs` only contains insert, update, replace and delete operation types and does not keep the pre-update image, so it cannot output -U messages and can only realize upsert semantics in Flink. When MongoDBTableSource is used, the Flink planner automatically applies the ChangelogNormalize optimization, fills in the missing -U messages, and outputs complete +I, -U, +U and -D messages. The cost of ChangelogNormalize is that the operator keeps state for all previous keys. If a DataStream job uses MongoDBSource directly, without the planner's optimization, ChangelogNormalize is not applied automatically, so -U messages cannot be obtained directly. To obtain the pre-update image you have to manage the state yourself; if you do not want to do that, you can convert the MongoDBTableSource to a changelog stream or retract stream and let the Flink planner's optimization supply the pre-update image, for example:
+
+```
+    tEnv.executeSql("CREATE TABLE orders ( ... ) WITH ( 'connector'='mongodb-cdc',... )");
+
+    Table table = tEnv.from("orders")
+            .select($("*"));
+
+    tEnv.toChangelogStream(table)
+            .print()
+            .setParallelism(1);
+
+    env.execute();
+```
+
+### Q4: Does MongoDB CDC support subscribing to multiple collections?
+
+Only all collections of a whole database can be subscribed; filtering a subset of collections is not supported. For example, if 'database' is set to 'mgdb' and 'collection' is an empty string, all collections under the 'mgdb' database are subscribed.
+
+### Q5: Does MongoDB CDC support reading with multiple parallel source tasks?
+
+Not yet.
+
+### Q6: Which MongoDB versions are supported by MongoDB CDC?
+
+MongoDB CDC is implemented on top of the change stream feature, which was introduced in MongoDB 3.6, so in theory it supports 3.6 and above; running version >= 4.0 is recommended. On versions lower than 3.6, the error "unrecognized pipeline stage name: '$changeStream'" occurs.
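As a rough sketch of the incremental-only mode mentioned in Q1 above (hosts, credentials, database and collection names below are placeholders, not from this FAQ), a MongoDB CDC table could be declared like this:

```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class MongoDBIncrementalOnlyExample {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // MongoDB CDC table that skips copying existing documents and only reads
        // incremental change stream records ('copy.existing' = 'false', see Q1).
        tEnv.executeSql(
                "CREATE TABLE orders (\n"
                        + "  _id STRING,\n"
                        + "  order_id INT,\n"
                        + "  PRIMARY KEY (_id) NOT ENFORCED\n"
                        + ") WITH (\n"
                        + "  'connector' = 'mongodb-cdc',\n"
                        + "  'hosts' = 'localhost:27017',\n"
                        + "  'username' = 'flinkuser',\n"
                        + "  'password' = 'flinkpw',\n"
                        + "  'database' = 'mgdb',\n"
                        + "  'collection' = 'orders',\n"
                        + "  'copy.existing' = 'false'\n"
                        + ")");
    }
}
```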
+
+### Q7: Which MongoDB deployment modes are supported by MongoDB CDC?
+
+Change streams require MongoDB to run as a replica set or a sharded cluster. For local tests, a standalone replica set initialized with rs.initiate() can be used.
+
+In standalone mode, the error "The $changeStream stage is only supported on replica sets" occurs.
+
+### Q8: MongoDB CDC reports that the user name and password are incorrect, but other components can connect with these credentials. What is the reason?
+
+If the user was created in a database other than the one being connected to, add 'connection.options' = 'authSource=<the database where the user was created>' to the WITH options.
+
+### Q9: Does MongoDB CDC support Debezium-related parameters?
+
+No. The MongoDB CDC connector is developed independently within the Flink CDC project and does not rely on the Debezium project.
+
+### Q10: In the MongoDB CDC full reading phase, can the job resume from a checkpoint after a failure?
+
+In the full reading phase, MongoDB CDC does not perform checkpoints until the phase is completed. If the job fails during the full reading phase, MongoDB CDC reads the snapshot data again.
+
+## Oracle CDC FAQ
+
+### Q1: Oracle CDC's archive logs grow rapidly and reading the logs is slow. What can I do?
+
+The online catalog mining mode can be used; it does not write the data dictionary to the redo log, but it cannot process DDL statements. The default strategy reads the logs slowly in production because it writes the data dictionary information to the redo log, which greatly increases the log volume. You can add the Debezium configuration items 'log.mining.strategy' = 'online_catalog' and 'log.mining.continuous.mine' = 'true'. If you use the SQL API, prefix each configuration item with 'debezium.', namely:
+
+```
+'debezium.log.mining.strategy' = 'online_catalog',
+'debezium.log.mining.continuous.mine' = 'true'
+```
+
+### Q2: The job fails with "Caused by: io.debezium.DebeziumException: Supplemental logging not configured for table xxx. Use command: ALTER TABLE xxx ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS". What should I do?
+
+For Oracle 11, Debezium sets tableIdCaseInsensitive to true by default, which causes table names to be converted to lowercase. As a result, the supplemental logging configuration of the table cannot be found in Oracle, leading to the false alarm "Supplemental logging not configured for table".
+
+With the DataStream API, add the Debezium configuration item 'database.tablename.case.insensitive' = 'false'. With the SQL API, add 'debezium.database.tablename.case.insensitive' = 'false' to the table's options.
+
+### Q3: How does Oracle CDC switch to XStream?
+
+Add the configuration item 'database.connection.adapter' = 'xstream', or 'debezium.database.connection.adapter' = 'xstream' if you are using the SQL API.
+
+### Q4: What are the database name and schema name for Oracle CDC?
+
+The database name is the name of the database instance, i.e. the SID of Oracle. The schema name is the schema the table belongs to. Generally speaking, a user corresponds to a schema: the user's schema name equals the user name and serves as the user's default schema. Therefore, the schema name is usually the name of the user that created the table; but if a schema was specified when creating the table, that schema is the schema name. For example, if a table is successfully created with create table aaaa.testtable(xxxx), then aaaa is the schema name.
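A hedged sketch of how the options from Q1–Q3 above can be passed through the SQL API; the connection settings, schema and table below are placeholders, and only the 'debezium.*' keys come from this FAQ:

```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class OracleCdcDebeziumOptionsExample {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // Oracle CDC table; Debezium options are forwarded with the 'debezium.' prefix.
        tEnv.executeSql(
                "CREATE TABLE products (\n"
                        + "  ID INT,\n"
                        + "  NAME STRING,\n"
                        + "  PRIMARY KEY (ID) NOT ENFORCED\n"
                        + ") WITH (\n"
                        + "  'connector' = 'oracle-cdc',\n"
                        + "  'hostname' = 'localhost',\n"
                        + "  'port' = '1521',\n"
                        + "  'username' = 'flinkuser',\n"
                        + "  'password' = 'flinkpw',\n"
                        + "  'database-name' = 'ORCLCDB',\n"
                        + "  'schema-name' = 'FLINKUSER',\n"
                        + "  'table-name' = 'PRODUCTS',\n"
                        // Use online catalog mining to limit archive log growth (Q1).
                        + "  'debezium.log.mining.strategy' = 'online_catalog',\n"
                        + "  'debezium.log.mining.continuous.mine' = 'true',\n"
                        // Avoid the supplemental-logging false alarm on Oracle 11 (Q2).
                        + "  'debezium.database.tablename.case.insensitive' = 'false'\n"
                        + ")");
    }
}
```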
diff --git a/docs/content.zh/docs/try-flink-cdc/_index.md b/docs/content.zh/docs/try-flink-cdc/_index.md new file mode 100644 index 0000000000..b752c6f1b8 --- /dev/null +++ b/docs/content.zh/docs/try-flink-cdc/_index.md @@ -0,0 +1,25 @@ +--- +title: "Try Flink CDC" +icon: +bold: true +bookCollapseSection: true +weight: 1 +--- + diff --git a/docs/content.zh/docs/try-flink-cdc/cdc-connectors/_index.md b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/_index.md new file mode 100644 index 0000000000..0c566a8a8a --- /dev/null +++ b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: CDC Connectors +bookCollapseSection: true +weight: 2 +aliases: + - /try-flink-cdc/cdc-connectors/ +--- + diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/build-real-time-data-lake-tutorial-zh.md" b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md similarity index 50% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/build-real-time-data-lake-tutorial-zh.md" rename to docs/content.zh/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md index 707cc68d46..f468f9e6df 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/build-real-time-data-lake-tutorial-zh.md" +++ b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md @@ -1,3 +1,12 @@ +--- +title: "Building a Real-time Data Lake with Flink CDC" +weight: 999 +type: docs +aliases: +- /development/build-real-time-data-lake-tutorial.html + +--- + -# 基于 Flink CDC 同步 MySQL 分库分表构建实时数据湖 +# Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake -在 OLTP 系统中,为了解决单表数据量大的问题,通常采用分库分表的方式将单个大表进行拆分以提高系统的吞吐量。 -但是为了方便数据分析,通常需要将分库分表拆分出的表在同步到数据仓库、数据湖时,再合并成一个大表。 +For OLTP databases, to deal with a huge number of data in a single table, we usually do database and table sharding to get better throughput. +But sometimes, for convenient analysis, we need to merge them into one table when loading them to data warehouse or data lake. -这篇教程将展示如何使用 Flink CDC 构建实时数据湖来应对这种场景,本教程的演示基于 Docker,只涉及 SQL,无需一行 Java/Scala 代码,也无需安装 IDE,你可以很方便地在自己的电脑上完成本教程的全部内容。 +This tutorial will show how to use Flink CDC to build a real-time data lake for such a scenario. +You can walk through the tutorial easily in the docker environment. The entire process uses standard SQL syntax without a single line of Java/Scala code or IDE installation. -接下来将以数据从 MySQL 同步到 [Iceberg](https://iceberg.apache.org/) 为例展示整个流程,架构图如下所示: +The following sections will take the pipeline from MySQL to [Iceberg](https://iceberg.apache.org/) as an example. The overview of the architecture is as follows: -![Architecture of Real-Time Data Lake](/_static/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png "architecture of real-time data lake") +{{< img src="/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png" alt="Real-time data lake with Flink CDC" >}} -你也可以使用不同的 source 比如 Oracle/Postgres 和 sink 比如 Hudi 来构建自己的 ETL 流程。 +You can also use other data sources like Oracle/Postgres and sinks like Hudi to build your own pipeline. -## 准备阶段 -准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。 +## Preparation +Prepare a Linux or MacOS computer with Docker installed. 
-### 下载所需要的依赖包 -**下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译** -- flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar +## Preparing JAR package required +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.** +- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar - [flink-shaded-hadoop-2-uber-2.7.5-10.0.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.7.5-10.0/flink-shaded-hadoop-2-uber-2.7.5-10.0.jar) - [iceberg-flink-runtime-1.16-1.3.1.jar](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/1.3.1/iceberg-flink-runtime-1.16-1.3.1.jar) -### 准备教程所需要的组件 -接下来的教程将以 `docker-compose` 的方式准备所需要的组件。 -1. 使用下面的内容创建一个 `Dockerfile` 文件: +### Starting components required +The components required in this tutorial are all managed in containers, so we will use `docker-compose` to start them. + +1. Create `Dockerfile` file using following contents: ```dockerfile FROM flink:1.16.0-scala_2.12 # Place the downloaded jar packages in the lib directory at the same level. @@ -51,8 +62,7 @@ under the License. RUN apt-get update && apt-get install tree ``` -2. 使用下面的内容创建一个`docker-compose.yml` 文件: - +2. Create `docker-compose.yml` file using following contents: ```yml version: '2.1' services: @@ -113,33 +123,33 @@ under the License. device: "tmpfs" ``` - 该 Docker Compose 中包含的容器有: - - SQL-Client: Flink SQL Client, 用来提交 SQL 查询和查看 SQL 的执行结果 - - Flink Cluster:包含 Flink JobManager 和 Flink TaskManager,用来执行 Flink SQL - - MySQL:作为分库分表的数据源,存储本教程的 `user` 表 + The Docker Compose environment consists of the following containers: + - SQL-Client: Flink SQL Client, used to submit queries and visualize their results. + - Flink Cluster: a Flink JobManager and a Flink TaskManager container to execute queries. + - MySQL: mainly used as a data source to store the sharding table. -3. 在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: +3. To start all containers, run the following command in the directory that contains the `docker-compose.yml` file: ```shell docker-compose up -d ``` - 该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 `docker ps` 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:8081/](http://localhost:8081//) 来查看 Flink 是否运行正常。 - -***注意:*** -* 本教程接下来用到的容器相关的命令都需要在 `docker-compose.yml` 所在目录下执行 -* 如果你想要在自己的 Flink 环境运行本教程,需要下载下面列出的包并且把它们放在 Flink 所在目录的 lib 目录下,即 FLINK_HOME/lib/ + This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run `docker ps` to check whether these containers are running properly. + We can also visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally. +***Note:*** +* If you want to run with your own Flink environment, remember to download the jar packages and then put them to `FLINK_HOME/lib/`. +* All the following commands involving `docker-compose` should be executed in the directory of the `docker-compose.yml` file. -![Flink UI](/_static/fig/real-time-data-lake-tutorial/flink-ui.png "Flink UI") +{{< img src="/fig/real-time-data-lake-tutorial/flink-ui.png" alt="Flink UI" >}} -### 准备数据 -1. 进入 MySQL 容器中 +### Preparing data in databases +1. Enter mysql's container: ```shell docker-compose exec mysql mysql -uroot -p123456 ``` -2. 创建数据和表,并填充数据 - - 创建两个不同的数据库,并在每个数据库中创建两个表,作为 `user` 表分库分表下拆分出的表。 +2. Create databases/tables and populate data: + + Create a logical sharding table `user` sharded in different databases and tables physically. 
```sql CREATE DATABASE db_1; USE db_1; @@ -183,28 +193,30 @@ under the License. INSERT INTO user_2 VALUES (220,"user_220","Shanghai","123567891234","user_220@foo.com"); ``` -## 在 Flink SQL CLI 中使用 Flink DDL 创建表 -首先,使用如下的命令进入 Flink SQL CLI 容器中: +## Creating tables using Flink DDL in Flink SQL CLI +First, use the following command to enter the Flink SQL CLI Container: ```shell docker-compose run sql-client ``` -我们可以看到如下界面: -![Flink SQL Client](/_static/fig/real-time-data-lake-tutorial/flink-sql-client.png "Flink SQL Client" ) +We should see the welcome screen of the CLI client: + +{{< img src="/fig/real-time-data-lake-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}} + +Then do the following steps in Flink SQL CLI: -然后,进行如下步骤: -1. 开启 checkpoint,每隔3秒做一次 checkpoint +1. Enable checkpoints every 3 seconds - Checkpoint 默认是不开启的,我们需要开启 Checkpoint 来让 Iceberg 可以提交事务。 - 并且,mysql-cdc 在 binlog 读取阶段开始前,需要等待一个完整的 checkpoint 来避免 binlog 记录乱序的情况。 + Checkpoint is disabled by default, we need to enable it to commit Iceberg transactions. + Besides, the beginning of mysql-cdc binlog phase also requires waiting a complete checkpoint to avoid disorder of binlog records. ```sql -- Flink SQL Flink SQL> SET execution.checkpointing.interval = 3s; ``` -2. 创建 MySQL 分库分表 source 表 - - 创建 source 表 `user_source` 来捕获MySQL中所有 `user` 表的数据,在表的配置项 `database-name` , `table-name` 使用正则表达式来匹配这些表。 - 并且,`user_source` 表也定义了 metadata 列来区分数据是来自哪个数据库和表。 +2. Create MySQL sharding source table + + Create a source table that captures the data from the logical sharding table `user`. Here, we use regex to match all the physical tables. + Besides, the table defines metadata column to identify which database/table the record comes from. ```sql -- Flink SQL Flink SQL> CREATE TABLE user_source ( @@ -226,10 +238,10 @@ docker-compose run sql-client 'table-name' = 'user_[0-9]+' ); ``` -3. 创建 Iceberg sink 表 - - 创建 sink 表 `all_users_sink`,用来将数据加载至 Iceberg 中。 - 在这个 sink 表,考虑到不同的 MySQL 数据库表的 `id` 字段的值可能相同,我们定义了复合主键 (`database_name`, `table_name`, `id`)。 +3. Create Iceberg sink table + + Create a sink table `all_users_sink` used to load data to Iceberg. + We define `database_name`, `table_name` and `id` as a combined primary key, because `id` maybe not unique across different databases and tables. ```sql -- Flink SQL Flink SQL> CREATE TABLE all_users_sink ( @@ -249,67 +261,71 @@ docker-compose run sql-client 'format-version'='2' ); ``` - -## 流式写入 Iceberg -1. 使用下面的 Flink SQL 语句将数据从 MySQL 写入 Iceberg 中 +## Streaming to Iceberg +1. Streaming write data from MySQL to Iceberg using the following Flink SQL: ```sql -- Flink SQL Flink SQL> INSERT INTO all_users_sink select * from user_source; ``` - 上述命令将会启动一个流式作业,源源不断将 MySQL 数据库中的全量和增量数据同步到 Iceberg 中。 - 在 [Flink UI](http://localhost:8081/#/job/running) 上可以看到这个运行的作业: - - ![CDC to Iceberg Running Job](/_static/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png "CDC to Iceberg Running Job") - - 然后我们就可以使用如下的命令看到 Iceberg 中的写入的文件: + It will start a streaming job which will synchronize historical and incremental data from MySQL to Iceberg continuously. 
+ The running job can be found in [Flink UI](http://localhost:8081/#/job/running), and it looks like: + + + {{< img src="/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png" alt="CDC to Iceberg Running Job" >}} + + Then, we can use the following command to see the files written to Iceberg: ```shell docker-compose exec sql-client tree /tmp/iceberg/warehouse/default_database/ ``` - 如下所示: + It should look like: - ![Files in Iceberg](/_static/fig/real-time-data-lake-tutorial/files-in-iceberg.png "Files in Iceberg") - - 在你的运行环境中,实际的文件可能与上面的截图不相同,但是整体的目录结构应该相似。 + {{< img src="/fig/real-time-data-lake-tutorial/files-in-iceberg.png" alt="Files in Iceberg" >}} + + The actual files may differ in your environment, but the structure of the directory should be similar. -2. 使用下面的 Flink SQL 语句查询表 `all_users_sink` 中的数据 +2. Use the following Flink SQL to query the data written to `all_users_sink`: ```sql -- Flink SQL Flink SQL> SELECT * FROM all_users_sink; ``` - 在 Flink SQL CLI 中我们可以看到如下查询结果: + We can see the data queried in the Flink SQL CLI: - ![Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/data_in_iceberg.png "Data in Iceberg") - -3. 修改 MySQL 中表的数据,Iceberg 中的表 `all_users_sink` 中的数据也将实时更新: + {{< img src="/fig/real-time-data-lake-tutorial/data_in_iceberg.png" alt="Data in Iceberg" >}} + +3. Make some changes in the MySQL databases, and then the data in Iceberg table `all_users_sink` will also change in real time. - (3.1) 在 `db_1.user_1` 表中插入新的一行 + (3.1) Insert a new user in table `db_1.user_1` ```sql --- db_1 INSERT INTO db_1.user_1 VALUES (111,"user_111","Shanghai","123567891234","user_111@foo.com"); ``` - - (3.2) 更新 `db_1.user_2` 表的数据 + + (3.2) Update a user in table `db_1.user_2` ```sql --- db_1 UPDATE db_1.user_2 SET address='Beijing' WHERE id=120; ``` - - (3.3) 在 `db_2.user_2` 表中删除一行 + + (3.3) Delete a user in table `db_2.user_2` ```sql --- db_2 DELETE FROM db_2.user_2 WHERE id=220; ``` - 每执行一步,我们就可以在 Flink Client CLI 中使用 `SELECT * FROM all_users_sink` 查询表 `all_users_sink` 来看到数据的变化。 - - 最后的查询结果如下所示: - ![Final Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png "Final Data in Iceberg") - - 从 Iceberg 的最新结果中可以看到新增了`(db_1, user_1, 111)`的记录,`(db_1, user_2, 120)`的地址更新成了 `Beijing`,且`(db_2, user_2, 220)`的记录被删除了,与我们在 MySQL 做的数据更新完全一致。 + After executing each step, we can query the table `all_users_sink` using `SELECT * FROM all_users_sink` in Flink SQL CLI to see the changes. + + The final query result is as follows: + + {{< img src="/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png" alt="Final Data in Iceberg" >}} -## 环境清理 -本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: + From the latest result in Iceberg, we can see that there is a new record of `(db_1, user_1, 111)`, and the address of `(db_1, user_2, 120)` has been updated to `Beijing`. + Besides, the record of `(db_2, user_2, 220)` has been deleted. The result is exactly the same with the changes we did in MySQL. 
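+
+   As an extra sanity check, we can also aggregate the synchronized rows per source shard. The following query is only a sketch; it assumes the `database_name`, `table_name` and `id` columns declared in the `all_users_sink` DDL above.
+
+   ```sql
+   -- Flink SQL: count the rows synchronized from each sharding table
+   Flink SQL> SELECT database_name, table_name, COUNT(id) AS record_cnt
+              FROM all_users_sink
+              GROUP BY database_name, table_name;
+   ```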
+ +## Clean up +After finishing the tutorial, run the following command in the directory of `docker-compose.yml` to stop all containers: ```shell docker-compose down -``` \ No newline at end of file +``` + +{{< top >}} diff --git a/docs/content/quickstart/db2-tutorial.md b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md similarity index 95% rename from docs/content/quickstart/db2-tutorial.md rename to docs/content.zh/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md index 5192360609..31150fdab0 100644 --- a/docs/content/quickstart/db2-tutorial.md +++ b/docs/content.zh/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "Db2 Tutorial" +weight: 8 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/db2-tutorial.html +--- diff --git a/docs/content/quickstart/mysql-doris-pipeline-tutorial.md b/docs/content.zh/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md similarity index 91% rename from docs/content/quickstart/mysql-doris-pipeline-tutorial.md rename to docs/content.zh/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md index 27c28e84d8..a9913adc04 100644 --- a/docs/content/quickstart/mysql-doris-pipeline-tutorial.md +++ b/docs/content.zh/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "MySQL to Doris" +weight: 1 +type: docs +aliases: +- /try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.html +--- -# Pipeline Connectors +# Versions -```{toctree} -:maxdepth: 2 +An appendix of hosted documentation for all versions of Apache Flink CDC. -mysql-pipeline -mysql-pipeline(ZH) -doris-pipeline -doris-pipeline(ZH) -starrocks-pipeline -starrocks-pipeline(ZH) -``` +{{< all_versions >}} diff --git a/docs/content/_index.md b/docs/content/_index.md new file mode 100644 index 0000000000..e0f2e94530 --- /dev/null +++ b/docs/content/_index.md @@ -0,0 +1,58 @@ +--- +title: Apache Flink CDC +type: docs +bookToc: false +--- + + +#### + +
+Flink CDC: Change Data Capture Solution Of Apache Flink
+
+Set of source connectors for Apache Flink® directly ingesting changes coming from different databases using Change Data Capture (CDC).
+
    + +Flink CDC integrates Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium). + +{{< img src="/fig/cdc-flow.png" alt="Stateful Functions" width="50%" >}} + +Flink CDC supports ingesting snapshot data and real time changes from databases to Flink® and then transform and sink to various downstream systems. + +{{< columns >}} +## Try Flink CDC + +If you’re interested in playing around with Flink CDC, check out our [quick +start]({{< ref "docs/try-flink-cdc" >}}). It provides multiple examples to submit and execute a Flink CDC job on a Flink cluster. + +<---> + +## Get Help with Flink CDC + +If you get stuck, check out our [community support +resources](https://flink.apache.org/community.html). In particular, Apache +Flink’s user mailing list is consistently ranked as one of the most active of +any Apache project, and is a great way to get help quickly. + +{{< /columns >}} + +Flink CDC is developed under the umbrella of [Apache +Flink](https://flink.apache.org/). diff --git a/docs/content/connectors/mongodb-cdc(ZH).md b/docs/content/connectors/mongodb-cdc(ZH).md deleted file mode 100644 index b341e62cd6..0000000000 --- a/docs/content/connectors/mongodb-cdc(ZH).md +++ /dev/null @@ -1,685 +0,0 @@ - - -# MongoDB CDC 连接器 - -MongoDB CDC 连接器允许从 MongoDB 读取快照数据和增量数据。 本文档描述了如何设置 MongoDB CDC 连接器以针对 MongoDB 运行 SQL 查询。 - -依赖 ------------- - -为了设置 MongoDB CDC 连接器, 下表提供了使用构建自动化工具(如 Maven 或 SBT )和带有 SQLJar 捆绑包的 SQLClient 的两个项目的依赖关系信息。 - -### Maven dependency -``` - - com.ververica - flink-connector-mongodb-cdc - - 3.0-SNAPSHOT - -``` - -### SQL Client JAR - -```下载链接仅适用于稳定版本。``` - -下载 [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) 把它放在 `/lib/`. - -**注意:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT 版本是与开发分支相对应的代码。 用户需要下载源代码并编译相应的jar。 用户应使用已发布的版本,例如 [flink-sql-connector-mongodb-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-mongodb-cdc), 发布的版本将在 Maven 中央仓库中提供。 - -设置 MongoDB ----------------- - -### 可用性 -- MongoDB 版本 - - MongoDB 版本 >= 3.6
    - 我们使用 [更改流](https://docs.mongodb.com/manual/changeStreams/) 功能(3.6 版中新增),以捕获更改数据。 - -- 集群部署 - - [副本集](https://docs.mongodb.com/manual/replication/) 或者 [分片集群](https://docs.mongodb.com/manual/sharding/) 是必需的。 - -- 存储引擎 - - [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) 存储引擎是必需的。 - -- [副本集协议版本](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) - - 副本集协议版本 1 [(pv1)](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) 是必需的。
    - 从 4.0 版本开始,MongoDB 只支持pv1。 pv1 是使用 MongoDB 3.2 或更高版本创建的所有新副本集的默认值。 - -- 权限 - - `changeStream` and `read` 是 MongoDB Kafka Connector 必需权限。 - - 你可以使用以下示例进行简单的授权。
    - 有关更详细的授权, 请参照 [MongoDB 数据库用户角色](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles). - - ```javascript - use admin; - db.createRole( - { - role: "flinkrole", - privileges: [{ - // 所有数据库中所有非系统集合的 grant 权限 - resource: { db: "", collection: "" }, - actions: [ - "splitVector", - "listDatabases", - "listCollections", - "collStats", - "find", - "changeStream" ] - }], - roles: [ - // 阅读 config.collections 和 config.chunks - // 用于分片集群快照拆分。 - { role: 'read', db: 'config' } - ] - } - ); - - db.createUser( - { - user: 'flinkuser', - pwd: 'flinkpw', - roles: [ - { role: 'flinkrole', db: 'admin' } - ] - } - ); - ``` - - -如何创建 MongoDB CDC 表 ----------------- - -MongoDB CDC 表可以定义如下: - -```sql --- 在 Flink SQL 中注册 MongoDB 表 `products` -CREATE TABLE products ( - _id STRING, // 必须声明 - name STRING, - weight DECIMAL(10,3), - tags ARRAY, -- array - price ROW, -- 嵌入式文档 - suppliers ARRAY>, -- 嵌入式文档 - PRIMARY KEY(_id) NOT ENFORCED -) WITH ( - 'connector' = 'mongodb-cdc', - 'hosts' = 'localhost:27017,localhost:27018,localhost:27019', - 'username' = 'flinkuser', - 'password' = 'flinkpw', - 'database' = 'inventory', - 'collection' = 'products' -); - --- 从 `products` 集合中读取快照和更改事件 -SELECT * FROM products; -``` - -**请注意** - -MongoDB 的更改事件记录在消息之前没有更新。因此,我们只能将其转换为 Flink 的 UPSERT 更改日志流。 -upstart 流需要一个唯一的密钥,所以我们必须声明 `_id` 作为主键。 -我们不能将其他列声明为主键, 因为删除操作不包含除 `_id` 和 `sharding key` 之外的键和值。 - -连接器选项 ----------------- - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)String指定要使用的连接器,此处应为 mongodb-cdc.
    schemeoptionalmongodbString指定 MongoDB 连接协议。 eg. mongodb or mongodb+srv.
    hostsrequired(none)StringMongoDB 服务器的主机名和端口对的逗号分隔列表。
    - eg. localhost:27017,localhost:27018 -
    usernameoptional(none)String连接到 MongoDB 时要使用的数据库用户的名称。
    - 只有当 MongoDB 配置为使用身份验证时,才需要这样做。 -
    passwordoptional(none)String连接到 MongoDB 时要使用的密码。
    - 只有当 MongoDB 配置为使用身份验证时,才需要这样做。 -
    databaseoptional(none)String要监视更改的数据库的名称。 如果未设置,则将捕获所有数据库。
    - 该数据库还支持正则表达式来监视与正则表达式匹配的多个数据库。
    collectionoptional(none)String数据库中要监视更改的集合的名称。 如果未设置,则将捕获所有集合。
    - 该集合还支持正则表达式来监视与完全限定的集合标识符匹配的多个集合。
    connection.optionsoptional(none)StringMongoDB连接选项。 例如:
    - replicaSet=test&connectTimeoutMS=300000 -
    scan.startup.modeoptionalinitialString MongoDB CDC 消费者可选的启动模式, - 合法的模式为 "initial","latest-offset" 和 "timestamp"。 - 请查阅 启动模式 章节了解更多详细信息。
    scan.startup.timestamp-millisoptional(none)Long起始毫秒数, 仅适用于 'timestamp' 启动模式.
    batch.sizeoptional1024IntegerCursor 批次大小。
    poll.max.batch.sizeoptional1024Integer轮询新数据时,单个批处理中要包含的更改流文档的最大数量。
    poll.await.time.msoptional1000Integer在更改流上检查新结果之前等待的时间。
    heartbeat.interval.msoptional0Integer心跳间隔(毫秒)。使用 0 禁用。
    scan.full-changelogoptionalfalseBoolean是否尝试使用 MongoDB 前像/后像产生完整事件流。请查阅 完整事件流 章节了解更多详细信息。该功能仅支持 MongoDB 6.0 之后的版本。
    scan.incremental.snapshot.enabledoptionalfalseBoolean是否启用增量快照。增量快照功能仅支持 MongoDB 4.0 之后的版本。
    scan.incremental.snapshot.chunk.size.mboptional64Integer增量快照的区块大小 mb。
    scan.incremental.snapshot.chunk.samplesoptional20Integer采样分片策略,每个chunk采样的数据条数。
    scan.incremental.close-idle-reader.enabledoptionalfalseBoolean是否在快照结束后关闭空闲的 Reader。 此特性需要 flink 版本大于等于 1.14 并且 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 需要设置为 true。
    - 若 flink 版本大于等于 1.15,'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 默认值变更为 true,可以不用显式配置 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = true。
    scan.cursor.no-timeoutoptionaltrueBooleanMongoDB 服务端通常会将空闲时间超过 10 分钟的 cursor 关闭,来节省内存开销。将这个参数设置为 true 可以防止 cursor 因为读取时间过长或者背压导致的空闲而关闭。仅在增量快照模式下生效。
    -
    - -注意: `heartbeat.interval.ms` 强烈建议设置一个大于 0 的适当值 **如果集合更改缓慢**. -当我们从检查点或保存点恢复 Flink 作业时,心跳事件可以向前推送 `resumeToken`,以避免 `resumeToken` 过期。 - -可用元数据 ----------------- - -以下格式元数据可以在表定义中公开为只读(VIRTUAL)列。 - - - - - - - - - - - - - - - - - - - - - - - - - - -
    KeyDataTypeDescription
    database_nameSTRING NOT NULL包含该行的数据库的名称。
    collection_nameSTRING NOT NULL包含该行的集合的名称。
    op_tsTIMESTAMP_LTZ(3) NOT NULL它指示在数据库中进行更改的时间。
    如果记录是从表的快照而不是改变流中读取的,该值将始终为0。
    - -扩展的 CREATE TABLE 示例演示了用于公开这些元数据字段的语法: -```sql -CREATE TABLE products ( - db_name STRING METADATA FROM 'database_name' VIRTUAL, - collection_name STRING METADATA FROM 'collection_name' VIRTUAL, - operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, - _id STRING, // 必须声明 - name STRING, - weight DECIMAL(10,3), - tags ARRAY, -- array - price ROW, -- 嵌入式文档 - suppliers ARRAY>, -- 嵌入式文档 - PRIMARY KEY(_id) NOT ENFORCED -) WITH ( - 'connector' = 'mongodb-cdc', - 'hosts' = 'localhost:27017,localhost:27018,localhost:27019', - 'username' = 'flinkuser', - 'password' = 'flinkpw', - 'database' = 'inventory', - 'collection' = 'products' -); -``` - -特性 --------- - -### 精确一次处理 - -MongoDB CDC 连接器是一个 Flink Source 连接器,它将首先读取数据库快照,然后在处理**甚至失败时继续读取带有**的更改流事件。 - -### 启动模式 - -配置选项```scan.startup.mode```指定 MySQL CDC 使用者的启动模式。有效枚举包括: - -- `initial` (默认):在第一次启动时对受监视的数据库表执行初始快照,并继续读取最新的 oplog。 -- `latest-offset`:首次启动时,从不对受监视的数据库表执行快照, 连接器仅从 oplog 的结尾处开始读取,这意味着连接器只能读取在连接器启动之后的数据更改。 -- `timestamp`:跳过快照阶段,从指定的时间戳开始读取 oplog 事件。 - -例如使用 DataStream API: -```java -MongoDBSource.builder() - .startupOptions(StartupOptions.latest()) // Start from latest offset - .startupOptions(StartupOptions.timestamp(1667232000000L) // Start from timestamp - .build() -``` - -and with SQL: - -```SQL -CREATE TABLE mongodb_source (...) WITH ( - 'connector' = 'mongodb-cdc', - 'scan.startup.mode' = 'latest-offset', -- 从最晚位点启动 - ... - 'scan.startup.mode' = 'timestamp', -- 指定时间戳启动模式 - 'scan.startup.timestamp-millis' = '1667232000000' -- 启动毫秒时间 - ... -) -``` - -### 更改流 - -我们将 [MongoDB's official Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) 从 MongoDB 中读取快照或更改事件,并通过 Debezium 的 `EmbeddedEngine` 进行驱动。 - -Debezium 的 `EmbeddedEngine` 提供了一种在应用程序进程中运行单个 Kafka Connect `SourceConnector` 的机制,并且它可以正确地驱动任何标准的 Kafka Connect `SourceConnector`,即使它不是由 Debezium 提供的。 - -我们选择 **MongoDB 的官方 Kafka连接器**,而不是 **Debezium 的MongoDB 连接器**,因为它们使用了不同的更改数据捕获机制。 - -- 对于 Debezium 的 MongoDB 连接器,它读取每个复制集主节点的 `oplog.rs` 集合。 -- 对于 MongoDB 的 Kafka 连接器,它订阅了 MongoDB 的 `更改流`。 - -MongoDB 的`oplog.rs` 集合没有在状态之前保持更改记录的更新, 因此,很难通过单个 `oplog.rs` 记录提取完整的文档状态,并将其转换为 Flink 接受的更改日志流(Insert Only,Upsert,All)。 -此外,MongoDB 5(2021 7月发布)改变了 oplog 格式,因此当前的 Debezium 连接器不能与其一起使用。 - -**Change Stream**是 MongoDB 3.6 为副本集和分片集群提供的一项新功能,它允许应用程序访问实时数据更改,而不会带来跟踪操作日志的复杂性和风险。
    -应用程序可以使用更改流来订阅单个集合上的所有数据更改, 数据库或整个部署,并立即对其做出反应。 - -**查找更新操作的完整文档**是**变更流**提供的一项功能,它可以配置变更流以返回更新文档的最新多数提交版本。由于该功能,我们可以轻松收集最新的完整文档,并将更改日志转换为 Flink 的**Upsert Changelog Stream**。 - -顺便说一句,[DBZ-435](https://issues.redhat.com/browse/DBZ-435)提到的Debezium的MongoDB变更流探索,正在制定路线图。
    -如果完成了,我们可以考虑集成两种源连接器供用户选择。 - -### DataStream Source - -MongoDB CDC 连接器也可以是一个数据流源。 你可以创建 SourceFunction,如下所示: - -```java -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; -import org.apache.flink.cdc.connectors.mongodb.MongoDBSource; - -public class MongoDBSourceExample { - public static void main(String[] args) throws Exception { - SourceFunction sourceFunction = MongoDBSource.builder() - .hosts("localhost:27017") - .username("flink") - .password("flinkpw") - .databaseList("inventory") // 设置捕获的数据库,支持正则表达式 - .collectionList("inventory.products", "inventory.orders") //设置捕获的集合,支持正则表达式 - .deserializer(new JsonDebeziumDeserializationSchema()) - .build(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - env.addSource(sourceFunction) - .print().setParallelism(1); // 对 sink 使用并行度 1 以保持消息顺序 - - env.execute(); - } -} -``` - -MongoDB CDC 增量连接器(2.3.0 之后)可以使用,如下所示: -```java -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.cdc.connectors.mongodb.source.MongoDBSource; -import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; - -public class MongoDBIncrementalSourceExample { - public static void main(String[] args) throws Exception { - MongoDBSource mongoSource = - MongoDBSource.builder() - .hosts("localhost:27017") - .databaseList("inventory") // 设置捕获的数据库,支持正则表达式 - .collectionList("inventory.products", "inventory.orders") //设置捕获的集合,支持正则表达式 - .username("flink") - .password("flinkpw") - .deserializer(new JsonDebeziumDeserializationSchema()) - .build(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // 启用检查点 - env.enableCheckpointing(3000); - // 将 source 并行度设置为 2 - env.fromSource(mongoSource, WatermarkStrategy.noWatermarks(), "MongoDBIncrementalSource") - .setParallelism(2) - .print() - .setParallelism(1); - - env.execute("Print MongoDB Snapshot + Change Stream"); - } -} -``` - -**注意:** -- 如果使用数据库正则表达式,则需要 `readAnyDatabase` 角色。 -- 增量快照功能仅支持 MongoDB 4.0 之后的版本。 - -### 完整事件流 - -MongoDB 6.0 及以上版本支持在输出的更改流事件中携带对应更改前及更改后的文档版本(分别称为前像和后像)。 - -- 前像(Pre-image)是被该变更替换、更新或删除的文档。插入事件不存在对应的前像。 - -- 后像(Post-image)是该变更插入、替换或更新的文档。删除事件不存在对应的后像。 - -MongoDB CDC 能够借助上述前像和后像信息,产生完整的、包含 Insert、Update Before、Update After、Delete 数据行的事件流,从而避免下游 Flink 增加额外的 `ChangelogNormalize` 节点。 - -为了启用这一功能,您需要确保: - -- MongoDB 数据库版本不低于 6.0; -- 在数据库层面启用前像/后像记录功能: -```javascript -db.runCommand({ - setClusterParameter: { - changeStreamOptions: { - preAndPostImages: { - expireAfterSeconds: 'off' // 自定义前像后像的过期时间 - } - } - } -}) -``` -- 为需要监控的集合开启前像/后像记录功能: -```javascript -db.runCommand({ - collMod: "<< 集合名称 >>", - changeStreamPreAndPostImages: { - enabled: true - } -}) -``` -- 打开 MongoDB CDC 的 `scan.full-changelog` 开关: - -```java -MongoDBSource.builder() - .scanFullChangelog(true) - ... - .build() -``` - -或者使用 Flink SQL: - -```SQL -CREATE TABLE mongodb_source (...) WITH ( - 'connector' = 'mongodb-cdc', - 'scan.full-changelog' = 'true', - ... 
-) -``` - -数据类型映射 ----------------- -[BSON](https://docs.mongodb.com/manual/reference/bson-types/) **二进制 JSON**的缩写是一种类似 JSON 格式的二进制编码序列,用于在 MongoDB 中存储文档和进行远程过程调用。 - -[Flink SQL Data Type](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/types/) 类似于 SQL 标准的数据类型术语,该术语描述了表生态系统中值的逻辑类型。它可以用于声明操作的输入和/或输出类型。 - -为了使 Flink SQL 能够处理来自异构数据源的数据,异构数据源的数据类型需要统一转换为 Flink SQL 数据类型。 - -以下是 BSON 类型和 Flink SQL 类型的映射。 - - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    BSON typeFlink SQL type
    TINYINT
    SMALLINT
    - Int
    -
    INT
    LongBIGINT
    FLOAT
    DoubleDOUBLE
    Decimal128DECIMAL(p, s)
    BooleanBOOLEAN
    Date
    Timestamp
    DATE
    Date
    Timestamp
    TIME
    DateTIMESTAMP(3)
    TIMESTAMP_LTZ(3)
    TimestampTIMESTAMP(0)
    TIMESTAMP_LTZ(0) -
    - String
    - ObjectId
    - UUID
    - Symbol
    - MD5
    - JavaScript
    - Regex
    STRING
    BinDataBYTES
    ObjectROW
    ArrayARRAY
    DBPointerROW<$ref STRING, $id STRING>
    - GeoJSON - - Point : ROW<type STRING, coordinates ARRAY<DOUBLE>>
    - Line : ROW<type STRING, coordinates ARRAY<ARRAY< DOUBLE>>>
    - ... -
    -
    - - -参考 --------- - -- [MongoDB Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) -- [Change Streams](https://docs.mongodb.com/manual/changeStreams/) -- [Replication](https://docs.mongodb.com/manual/replication/) -- [Sharding](https://docs.mongodb.com/manual/sharding/) -- [Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles) -- [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) -- [Replica set protocol](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) -- [Connection String Options](https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options) -- [Document Pre- and Post-Images](https://www.mongodb.com/docs/v6.0/changeStreams/#change-streams-with-document-pre--and-post-images) -- [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/) -- [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/types/) - -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) \ No newline at end of file diff --git a/docs/content/connectors/mysql-cdc(ZH).md b/docs/content/connectors/mysql-cdc(ZH).md deleted file mode 100644 index 778cd4b786..0000000000 --- a/docs/content/connectors/mysql-cdc(ZH).md +++ /dev/null @@ -1,1099 +0,0 @@ - - -# MySQL CDC 连接器 - -MySQL CDC 连接器允许从 MySQL 数据库读取快照数据和增量数据。本文描述了如何设置 MySQL CDC 连接器来对 MySQL 数据库运行 SQL 查询。 - -## 支持的数据库 - -| Connector | Database | Driver | -|-----------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| [mysql-cdc](mysql-cdc(ZH).md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.27 | - -依赖 ------------- - -为了设置 MySQL CDC 连接器,下表提供了使用构建自动化工具(如 Maven 或 SBT )和带有 SQL JAR 包的 SQL 客户端的两个项目的依赖关系信息。 - -### Maven dependency - -``` - - com.ververica - flink-connector-mysql-cdc - - 3.0-SNAPSHOT - -``` - -### SQL Client JAR - -```下载链接仅在已发布版本可用,请在文档网站左下角选择浏览已发布的版本。``` - -下载 flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar 到 `/lib/` 目录下。 - -**注意:** flink-sql-connector-mysql-cdc-XXX-SNAPSHOT 版本是开发分支`release-XXX`对应的快照版本,快照版本用户需要下载源代码并编译相应的 jar。用户应使用已经发布的版本,例如 [flink-sql-connector-mysql-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-mysql-cdc) 当前已发布的所有版本都可以在 Maven 中央仓库获取。 - -配置 MySQL 服务器 ----------------- - -你必须定义一个 MySQL 用户,该用户对 MySQL CDC 连接器监视的所有数据库都应该具有所需的权限。 - -1. 创建 MySQL 用户: - -```sql -mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password'; -``` - -2. 向用户授予所需的权限: - -```sql -mysql> GRANT SELECT, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password'; -``` -**注意:** 在 `scan.incremental.snapshot.enabled` 参数已启用时(默认情况下已启用)时,不再需要授予 reload 权限。 - -3. 刷新用户权限: - -```sql -mysql> FLUSH PRIVILEGES; -``` - -查看更多用户权限问题请参考 [权限说明](https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-creating-user). - - -注意事项 ----------------- - -### 为每个 Reader 设置不同的 Server id - -每个用于读取 binlog 的 MySQL 数据库客户端都应该有一个唯一的 id,称为 Server id。 MySQL 服务器将使用此 id 来维护网络连接和 binlog 位置。 因此,如果不同的作业共享相同的 Server id, 则可能导致从错误的 binlog 位置读取数据。 -因此,建议通过为每个 Reader 设置不同的 Server id [SQL Hints](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/sql/hints.html), -假设 Source 并行度为 4, 我们可以使用 `SELECT * FROM source_table /*+ OPTIONS('server-id'='5401-5404') */ ;` 来为 4 个 Source readers 中的每一个分配唯一的 Server id。 - - -### 设置 MySQL 会话超时时间 - -当为大型数据库创建初始一致快照时,你建立的连接可能会在读取表时碰到超时问题。你可以通过在 MySQL 侧配置 interactive_timeout 和 wait_timeout 来缓解此类问题。 -- `interactive_timeout`: 服务器在关闭交互连接之前等待活动的秒数。 更多信息请参考 [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout). -- `wait_timeout`: 服务器在关闭非交互连接之前等待活动的秒数。 更多信息请参考 [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout). - - -如何创建 MySQL CDC 表 ----------------- - -MySQL CDC 表可以定义如下: - -```sql --- 每 3 秒做一次 checkpoint,用于测试,生产配置建议5到10分钟 -Flink SQL> SET 'execution.checkpointing.interval' = '3s'; - --- 在 Flink SQL中注册 MySQL 表 'orders' -Flink SQL> CREATE TABLE orders ( - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - PRIMARY KEY(order_id) NOT ENFORCED - ) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'orders'); - --- 从订单表读取全量数据(快照)和增量数据(binlog) -Flink SQL> SELECT * FROM orders; -``` - -连接器选项 ----------------- - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)String指定要使用的连接器, 这里应该是 'mysql-cdc'.
    hostnamerequired(none)String MySQL 数据库服务器的 IP 地址或主机名。
    usernamerequired(none)String连接到 MySQL 数据库服务器时要使用的 MySQL 用户的名称。
    passwordrequired(none)String连接 MySQL 数据库服务器时使用的密码。
    database-namerequired(none)String要监视的 MySQL 服务器的数据库名称。数据库名称还支持正则表达式,以监视多个与正则表达式匹配的表。
    table-namerequired(none)String需要监视的 MySQL 数据库的表名。表名支持正则表达式,以监视满足正则表达式的多个表。注意:MySQL CDC 连接器在正则匹配表名时,会把用户填写的 database-name, table-name 通过字符串 `\\.` 连接成一个全路径的正则表达式,然后使用该正则表达式和 MySQL 数据库中表的全限定名进行正则匹配。
    portoptional3306Integer MySQL 数据库服务器的整数端口号。
    server-idoptional(none)String读取数据使用的 server id,server id 可以是个整数或者一个整数范围,比如 '5400' 或 '5400-5408', - 建议在 'scan.incremental.snapshot.enabled' 参数为启用时,配置成整数范围。因为在当前 MySQL 集群中运行的所有 slave 节点,标记每个 salve 节点的 id 都必须是唯一的。 所以当连接器加入 MySQL 集群作为另一个 slave 节点(并且具有唯一 id 的情况下),它就可以读取 binlog。 默认情况下,连接器会在 5400 和 6400 之间生成一个随机数,但是我们建议用户明确指定 Server id。 -
    scan.incremental.snapshot.enabledoptionaltrueBoolean增量快照是一种读取表快照的新机制,与旧的快照机制相比, - 增量快照有许多优点,包括: - (1)在快照读取期间,Source 支持并发读取, - (2)在快照读取期间,Source 支持进行 chunk 粒度的 checkpoint, - (3)在快照读取之前,Source 不需要数据库锁权限。 - 如果希望 Source 并行运行,则每个并行 Readers 都应该具有唯一的 Server id,所以 - Server id 必须是类似 `5400-6400` 的范围,并且该范围必须大于并行度。 - 请查阅 增量快照读取 章节了解更多详细信息。 -
    scan.incremental.snapshot.chunk.sizeoptional8096Integer表快照的块大小(行数),读取表的快照时,捕获的表被拆分为多个块。
    scan.snapshot.fetch.sizeoptional1024Integer读取表快照时每次读取数据的最大条数。
    scan.startup.modeoptionalinitialString MySQL CDC 消费者可选的启动模式, - 合法的模式为 "initial","earliest-offset","latest-offset","specific-offset" 和 "timestamp"。 - 请查阅 启动模式 章节了解更多详细信息。
    scan.startup.specific-offset.fileoptional(none)String在 "specific-offset" 启动模式下,启动位点的 binlog 文件名。
    scan.startup.specific-offset.posoptional(none)Long在 "specific-offset" 启动模式下,启动位点的 binlog 文件位置。
    scan.startup.specific-offset.gtid-setoptional(none)String在 "specific-offset" 启动模式下,启动位点的 GTID 集合。
    scan.startup.specific-offset.skip-eventsoptional(none)Long在指定的启动位点后需要跳过的事件数量。
    scan.startup.specific-offset.skip-rowsoptional(none)Long在指定的启动位点后需要跳过的数据行数量。
    server-time-zoneoptional(none)String数据库服务器中的会话时区, 例如: "Asia/Shanghai". - 它控制 MYSQL 中的时间戳类型如何转换为字符串。 - 更多请参考 这里. - 如果没有设置,则使用ZoneId.systemDefault()来确定服务器时区。 -
    debezium.min.row. - count.to.stream.resultoptional1000Integer - 在快照操作期间,连接器将查询每个包含的表,以生成该表中所有行的读取事件。 此参数确定 MySQL 连接是否将表的所有结果拉入内存(速度很快,但需要大量内存), 或者结果是否需要流式传输(传输速度可能较慢,但适用于非常大的表)。 该值指定了在连接器对结果进行流式处理之前,表必须包含的最小行数,默认值为1000。将此参数设置为`0`以跳过所有表大小检查,并始终在快照期间对所有结果进行流式处理。
    connect.timeoutoptional30sDuration连接器在尝试连接到 MySQL 数据库服务器后超时前应等待的最长时间。
    connect.max-retriesoptional3Integer连接器应重试以建立 MySQL 数据库服务器连接的最大重试次数。
    connection.pool.sizeoptional20Integer连接池大小。
    jdbc.properties.*optional20String传递自定义 JDBC URL 属性的选项。用户可以传递自定义属性,如 'jdbc.properties.useSSL' = 'false'.
    heartbeat.intervaloptional30sDuration用于跟踪最新可用 binlog 偏移的发送心跳事件的间隔。
    debezium.*optional(none)String将 Debezium 的属性传递给 Debezium 嵌入式引擎,该引擎用于从 MySQL 服务器捕获数据更改。 - 例如: 'debezium.snapshot.mode' = 'never'. - 查看更多关于 Debezium 的 MySQL 连接器属性
    scan.incremental.close-idle-reader.enabledoptionalfalseBoolean是否在快照结束后关闭空闲的 Reader。 此特性需要 flink 版本大于等于 1.14 并且 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 需要设置为 true。
    - 若 flink 版本大于等于 1.15,'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 默认值变更为 true,可以不用显式配置 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = true。
    debezium.binary.handling.modeoptional(none)Stringdebezium.binary.handling.mode 参数可以设置为以下值: - none:不进行任何处理,直接将二进制数据类型作为字节数组(byte array)传输。 - base64:将二进制数据类型转换为 Base64 编码的字符串,然后传输。 - hex:将二进制数据类型转换为十六进制编码的字符串,然后传输。 - 默认值为 none。根据您的需求和数据类型,您可以选择合适的处理模式。如果您的数据库中包含大量二进制数据类型,建议使用 base64 或 hex 模式,以便在传输过程中更容易处理。 -
    -
    - -支持的元数据 ----------------- - -下表中的元数据可以在 DDL 中作为只读(虚拟)meta 列声明。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    KeyDataTypeDescription
    table_nameSTRING NOT NULL当前记录所属的表名称。
    database_nameSTRING NOT NULL当前记录所属的库名称。
    op_tsTIMESTAMP_LTZ(3) NOT NULL当前记录表在数据库中更新的时间。
    如果从表的快照而不是 binlog 读取记录,该值将始终为0。
    row_kindSTRING NOT NULL当前记录对应的 changelog 类型。注意:当 Source 算子选择为每条记录输出 row_kind 字段后,下游 SQL 算子在处理消息撤回时会因为这个字段不同而比对失败, -建议只在简单的同步作业中引用该元数据列。
    '+I' 表示 INSERT 数据,'-D' 表示 DELETE 数据,'-U' 表示 UPDATE_BEFORE 数据,'+U' 表示 UPDATE_AFTER 数据。 -
    - -下述创建表示例展示元数据列的用法: - -```sql -CREATE TABLE products -( - db_name STRING METADATA FROM 'database_name' VIRTUAL, - table_name STRING METADATA FROM 'table_name' VIRTUAL, - operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, - operation STRING METADATA FROM 'row_kind' VIRTUAL, - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - PRIMARY KEY (order_id) NOT ENFORCED -) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'orders' - ); -``` - -下述创建表示例展示使用正则表达式匹配多张库表的用法: - -```sql -CREATE TABLE products -( - db_name STRING METADATA FROM 'database_name' VIRTUAL, - table_name STRING METADATA FROM 'table_name' VIRTUAL, - operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, - operation STRING METADATA FROM 'row_kind' VIRTUAL, - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - PRIMARY KEY (order_id) NOT ENFORCED -) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = '(^(test).*|^(tpc).*|txc|.*[p$]|t{2})', - 'table-name' = '(t[5-8]|tt)' - ); -``` - - - - - - - - - - - - - - - - - - - - - - - - - -
    匹配示例表达式描述
    前缀匹配^(test).*匹配前缀为test的数据库名或表名,例如test1、test2等。
    后缀匹配.*[p$]匹配后缀为p的数据库名或表名,例如cdcp、edcp等。
    特定匹配txc匹配具体的数据库名或表名。
    - -进行库表匹配时,会使用正则表达式 `database-name\\.table-name` 来与MySQL表的全限定名做匹配,所以该例子使用 `(^(test).*|^(tpc).*|txc|.*[p$]|t{2})\\.(t[5-8]|tt)`,可以匹配到表 txc.tt、test2.test5。 - -支持的特性 --------- - -### 增量快照读取 - -增量快照读取是一种读取表快照的新机制。与旧的快照机制相比,增量快照具有许多优点,包括: -* (1)在快照读取期间,Source 支持并发读取, -* (2)在快照读取期间,Source 支持进行 chunk 粒度的 checkpoint, -* (3)在快照读取之前,Source 不需要数据库锁权限。 - -如果希望 source 并行运行,则每个并行 reader 都应该具有唯一的 server id,因此`server id`的范围必须类似于 `5400-6400`, -且范围必须大于并行度。在增量快照读取过程中,MySQL CDC Source 首先通过表的主键将表划分成多个块(chunk), -然后 MySQL CDC Source 将多个块分配给多个 reader 以并行读取表的数据。 - -#### 并发读取 - -增量快照读取提供了并行读取快照数据的能力。 -你可以通过设置作业并行度的方式来控制 Source 的并行度 `parallelism.default`. For example, in SQL CLI: - -```sql -Flink SQL> SET 'parallelism.default' = 8; -``` - -#### 全量阶段支持 checkpoint - -增量快照读取提供了在区块级别执行检查点的能力。它使用新的快照读取机制解决了以前版本中的检查点超时问题。 - -#### 无锁算法 - -MySQL CDC source 使用 增量快照算法, 避免了数据库锁的使用,因此不需要 “RELOAD” 权限。 - -#### MySQL高可用性支持 - -```mysql cdc``` 连接器通过使用 [GTID](https://dev.mysql.com/doc/refman/5.7/en/replication-gtids-concepts.html) 提供 MySQL 高可用集群的高可用性信息。为了获得高可用性, MySQL集群需要启用 GTID 模式,MySQL 配置文件中的 GTID 模式应该包含以下设置: - -```yaml -gtid_mode = on -enforce_gtid_consistency = on -``` - -如果监控的MySQL服务器地址包含从实例,则需要对MySQL配置文件设置以下设置。设置 ```log slave updates=1``` 允许从实例也将从主实例同步的数据写入其binlog, 这确保了```mysql cdc```连接器可以使用从实例中的全部数据。 - -```yaml -gtid_mode = on -enforce_gtid_consistency = on -log-slave-updates = 1 -``` - -MySQL 集群中你监控的服务器出现故障后, 你只需将受监视的服务器地址更改为其他可用服务器,然后从最新的检查点/保存点重新启动作业, 作业将从 checkpoint/savepoint 恢复,不会丢失任何记录。 - -建议为 MySQL 集群配置 DNS(域名服务)或 VIP(虚拟 IP 地址), 使用```mysql cdc```连接器的 DNS 或 VIP 地址, DNS或VIP将自动将网络请求路由到活动MySQL服务器。 这样,你就不再需要修改地址和重新启动管道。 - -#### MySQL心跳事件支持 - -如果表不经常更新,则 binlog 文件或 GTID 集可能已在其最后提交的 binlog 位置被清理。 -在这种情况下,CDC 作业可能会重新启动失败。因此心跳事件将帮助更新 binlog 位置。 默认情况下,MySQL CDC Source 启用心跳事件,间隔设置为30秒。 可以使用表选项```heartbeat```指定间隔。或将选项设置为```0s```以禁用心跳事件。 - -#### 增量快照读取的工作原理 - -当 MySQL CDC Source 启动时,它并行读取表的快照,然后以单并行度的方式读取表的 binlog。 - -在快照阶段,根据表的主键和表行的大小将快照切割成多个快照块。 -快照块被分配给多个快照读取器。每个快照读取器使用 [区块读取算法](#snapshot-chunk-reading) 并将读取的数据发送到下游。 -Source 会管理块的进程状态(完成或未完成),因此快照阶段的 Source 可以支持块级别的 checkpoint。 -如果发生故障,可以恢复 Source 并继续从最后完成的块中读取块。 - -所有快照块完成后,Source 将继续在单个任务中读取 binlog。 -为了保证快照记录和 binlog 记录的全局数据顺序,binlog reader 将开始读取数据直到快照块完成后并有一个完整的 checkpoint,以确保所有快照数据已被下游消费。 -binlog reader 在状态中跟踪所使用的 binlog 位置,因此 binlog 阶段的 Source 可以支持行级别的 checkpoint。 - -Flink 定期为 Source 执行 checkpoint,在故障转移的情况下,作业将重新启动并从最后一个成功的 checkpoint 状态恢复,并保证只执行一次语义。 - -##### 全量阶段分片算法 - -在执行增量快照读取时,MySQL CDC source 需要一个用于分片的的算法。 -MySQL CDC Source 使用主键列将表划分为多个分片(chunk)。 默认情况下,MySQL CDC source 会识别表的主键列,并使用主键中的第一列作为用作分片列。 -如果表中没有主键, 增量快照读取将失败,你可以禁用 `scan.incremental.snapshot.enabled` 来回退到旧的快照读取机制。 - -对于数值和自动增量拆分列,MySQL CDC Source 按固定步长高效地拆分块。 -例如,如果你有一个主键列为`id`的表,它是自动增量 BIGINT 类型,最小值为`0`,最大值为`100`, -和表选项 `scan.incremental.snapshot.chunk.size` 大小 `value`为`25`,表将被拆分为以下块: - -``` - (-∞, 25), - [25, 50), - [50, 75), - [75, 100), - [100, +∞) -``` - -对于其他主键列类型, MySQL CDC Source 将以下形式执行语句: `SELECT MAX(STR_ID) AS chunk_high FROM (SELECT * FROM TestTable WHERE STR_ID > 'uuid-001' limit 25)` 来获得每个区块的低值和高值, -分割块集如下所示: - - ``` - (-∞, 'uuid-001'), - ['uuid-001', 'uuid-009'), - ['uuid-009', 'uuid-abc'), - ['uuid-abc', 'uuid-def'), - [uuid-def, +∞). 
-``` - -##### Chunk 读取算法 - -对于上面的示例`MyTable`,如果 MySQL CDC Source 并行度设置为 4,MySQL CDC Source 将在每一个 executes 运行 4 个 Readers **通过偏移信号算法** -获取快照区块的最终一致输出。 **偏移信号算法**简单描述如下: - -* (1) 将当前 binlog 位置记录为`LOW`偏移量 -* (2) 通过执行语句读取并缓冲快照区块记录 `SELECT * FROM MyTable WHERE id > chunk_low AND id <= chunk_high` -* (3) 将当前 binlog 位置记录为`HIGH`偏移量 -* (4) 从`LOW`偏移量到`HIGH`偏移量读取属于快照区块的 binlog 记录 -* (5) 将读取的 binlog 记录向上插入缓冲区块记录,并发出缓冲区中的所有记录作为快照区块的最终输出(全部作为插入记录) -* (6) 继续读取并发出属于 *单个 binlog reader* 中`HIGH`偏移量之后的区块的 binlog 记录。 - -该算法的是基于 [DBLog Paper](https://arxiv.org/pdf/2010.12597v1.pdf) 并结合 Flink 的一个变种, 请参考它了解更多详细信息。 - -**注意:** 如果主键的实际值在其范围内分布不均匀,则在增量快照读取时可能会导致任务不平衡。 - -### Exactly-Once 处理 - -MySQL CDC 连接器是一个 Flink Source 连接器,它将首先读取表快照块,然后继续读取 binlog, -无论是在快照阶段还是读取 binlog 阶段,MySQL CDC 连接器都会在处理时**准确读取数据**,即使任务出现了故障。 - -### 启动模式 - -配置选项```scan.startup.mode```指定 MySQL CDC 使用者的启动模式。有效枚举包括: - -- `initial` (默认):在第一次启动时对受监视的数据库表执行初始快照,并继续读取最新的 binlog。 -- `earliest-offset`:跳过快照阶段,从可读取的最早 binlog 位点开始读取 -- `latest-offset`:首次启动时,从不对受监视的数据库表执行快照, 连接器仅从 binlog 的结尾处开始读取,这意味着连接器只能读取在连接器启动之后的数据更改。 -- `specific-offset`:跳过快照阶段,从指定的 binlog 位点开始读取。位点可通过 binlog 文件名和位置指定,或者在 GTID 在集群上启用时通过 GTID 集合指定。 -- `timestamp`:跳过快照阶段,从指定的时间戳开始读取 binlog 事件。 - -例如使用 DataStream API: -```java -MySQLSource.builder() - .startupOptions(StartupOptions.earliest()) // 从最早位点启动 - .startupOptions(StartupOptions.latest()) // 从最晚位点启动 - .startupOptions(StartupOptions.specificOffset("mysql-bin.000003", 4L) // 从指定 binlog 文件名和位置启动 - .startupOptions(StartupOptions.specificOffset("24DA167-0C0C-11E8-8442-00059A3C7B00:1-19")) // 从 GTID 集合启动 - .startupOptions(StartupOptions.timestamp(1667232000000L) // 从时间戳启动 - ... - .build() -``` - -使用 SQL: - -```SQL -CREATE TABLE mysql_source (...) WITH ( - 'connector' = 'mysql-cdc', - 'scan.startup.mode' = 'earliest-offset', -- 从最早位点启动 - 'scan.startup.mode' = 'latest-offset', -- 从最晚位点启动 - 'scan.startup.mode' = 'specific-offset', -- 从特定位点启动 - 'scan.startup.mode' = 'timestamp', -- 从特定位点启动 - 'scan.startup.specific-offset.file' = 'mysql-bin.000003', -- 在特定位点启动模式下指定 binlog 文件名 - 'scan.startup.specific-offset.pos' = '4', -- 在特定位点启动模式下指定 binlog 位置 - 'scan.startup.specific-offset.gtid-set' = '24DA167-0C0C-11E8-8442-00059A3C7B00:1-19', -- 在特定位点启动模式下指定 GTID 集合 - 'scan.startup.timestamp-millis' = '1667232000000' -- 在时间戳启动模式下指定启动时间戳 - ... -) -``` - -**注意**: -1. MySQL source 会在 checkpoint 时将当前位点以 INFO 级别打印到日志中,日志前缀为 "Binlog offset on checkpoint {checkpoint-id}"。 -该日志可以帮助将作业从某个 checkpoint 的位点开始启动的场景。 -2. 如果捕获变更的表曾经发生过表结构变化,从最早位点、特定位点或时间戳启动可能会发生错误,因为 Debezium 读取器会在内部保存当前的最新表结构,结构不匹配的早期数据无法被正确解析。 - - -### DataStream Source - -```java -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; -import org.apache.flink.cdc.connectors.mysql.source.MySqlSource; - -public class MySqlSourceExample { - public static void main(String[] args) throws Exception { - MySqlSource mySqlSource = MySqlSource.builder() - .hostname("yourHostname") - .port(yourPort) - .databaseList("yourDatabaseName") // 设置捕获的数据库, 如果需要同步整个数据库,请将 tableList 设置为 ".*". 
- .tableList("yourDatabaseName.yourTableName") // 设置捕获的表 - .username("yourUsername") - .password("yourPassword") - .deserializer(new JsonDebeziumDeserializationSchema()) // 将 SourceRecord 转换为 JSON 字符串 - .build(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - // 设置 3s 的 checkpoint 间隔 - env.enableCheckpointing(3000); - - env - .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source") - // 设置 source 节点的并行度为 4 - .setParallelism(4) - .print().setParallelism(1); // 设置 sink 节点并行度为 1 - - env.execute("Print MySQL Snapshot + Binlog"); - } -} -``` - -**注意:** 请参考 [Deserialization](../about.html#deserialization) 有关 JSON 反序列化的更多详细信息。 - -### 动态加表 - -扫描新添加的表功能使你可以添加新表到正在运行的作业中,新添加的表将首先读取其快照数据,然后自动读取其变更日志。 - -想象一下这个场景:一开始, Flink 作业监控表 `[product, user, address]`, 但几天后,我们希望这个作业还可以监控表 `[order, custom]`,这些表包含历史数据,我们需要作业仍然可以复用作业的已有状态,动态加表功能可以优雅地解决此问题。 - -以下操作显示了如何启用此功能来解决上述场景。 使用现有的 Flink CDC Source 作业,如下: - -```java - MySqlSource mySqlSource = MySqlSource.builder() - .hostname("yourHostname") - .port(yourPort) - .scanNewlyAddedTableEnabled(true) // 启用扫描新添加的表功能 - .databaseList("db") // 设置捕获的数据库 - .tableList("db.product, db.user, db.address") // 设置捕获的表 [product, user, address] - .username("yourUsername") - .password("yourPassword") - .deserializer(new JsonDebeziumDeserializationSchema()) // 将 SourceRecord 转换为 JSON 字符串 - .build(); - // 你的业务代码 -``` - -如果我们想添加新表 `[order, custom]` 对于现有的 Flink 作业,只需更新 `tableList()` 将新增表 `[order, custom]` 加入并从已有的 savepoint 恢复作业。 - -_Step 1_: 使用 savepoint 停止现有的 Flink 作业。 -```shell -$ ./bin/flink stop $Existing_Flink_JOB_ID -``` -```shell -Suspending job "cca7bc1061d61cf15238e92312c2fc20" with a savepoint. -Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab -``` -_Step 2_: 更新现有 Flink 作业的表列表选项。 -1. 更新 `tableList()` 参数. -2. 编译更新后的作业,示例如下: -```java - MySqlSource mySqlSource = MySqlSource.builder() - .hostname("yourHostname") - .port(yourPort) - .scanNewlyAddedTableEnabled(true) - .databaseList("db") - .tableList("db.product, db.user, db.address, db.order, db.custom") // 设置捕获的表 [product, user, address ,order, custom] - .username("yourUsername") - .password("yourPassword") - .deserializer(new JsonDebeziumDeserializationSchema()) // 将 SourceRecord 转换为 JSON 字符串 - .build(); - // 你的业务代码 -``` -_Step 3_: 从 savepoint 还原更新后的 Flink 作业。 -```shell -$ ./bin/flink run \ - --detached \ - --fromSavepoint /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \ - ./FlinkCDCExample.jar -``` -**注意:** 请参考文档 [Restore the job from previous savepoint](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/deployment/cli/#command-line-interface) 了解更多详细信息。 - -### 关于无主键表 - -从2.4.0 版本开始支持无主键表,使用无主键表必须设置 `scan.incremental.snapshot.chunk.key-column`,且只能选择非空类型的一个字段。 - -在使用无主键表时,需要注意以下两种情况。 - -1. 配置 `scan.incremental.snapshot.chunk.key-column` 时,如果表中存在索引,请尽量使用索引中的列来加快 select 速度。 -2. 
无主键表的处理语义由 `scan.incremental.snapshot.chunk.key-column` 指定的列的行为决定: - * 如果指定的列不存在更新操作,此时可以保证 Exactly once 语义。 - * 如果指定的列存在更新操作,此时只能保证 At least once 语义。但可以结合下游,通过指定下游主键,结合幂等性操作来保证数据的正确性。 - -### 关于二进制类型数据转换为base64编码数据 - -```sql -CREATE TABLE products ( - db_name STRING METADATA FROM 'database_name' VIRTUAL, - table_name STRING METADATA FROM 'table_name' VIRTUAL, - operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - binary_data STRING, - PRIMARY KEY(order_id) NOT ENFORCED -) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = 'localhost', - 'port' = '3306', - 'username' = 'root', - 'password' = '123456', - 'database-name' = 'test_db', - 'table-name' = 'test_tb', - 'debezium.binary.handling.mode' = 'base64' -); -``` - -`binary_data`字段, 在数据库中的类型是VARBINARY(N),我们在有些场景需要将二进制数据转换为base64编码的字符串数据,可以通过添加参数'debezium.binary.handling.mode' = 'base64'来开启这个功能, -添加此参数的情况下,我们就可以在flink sql中将该字段类型映射为`STRING`,从而获取base64编码的字符串数据。 - -数据类型映射 ----------------- - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    MySQL typeFlink SQL typeNOTE
    TINYINTTINYINT
    - SMALLINT
    - TINYINT UNSIGNED
    - TINYINT UNSIGNED ZEROFILL -
    SMALLINT
    - INT
    - MEDIUMINT
    - SMALLINT UNSIGNED
    - SMALLINT UNSIGNED ZEROFILL -
    INT
    - BIGINT
    - INT UNSIGNED
    - INT UNSIGNED ZEROFILL
    - MEDIUMINT UNSIGNED
    - MEDIUMINT UNSIGNED ZEROFILL -
    BIGINT
    - BIGINT UNSIGNED
    - BIGINT UNSIGNED ZEROFILL
    - SERIAL -
    DECIMAL(20, 0)
    - FLOAT
    - FLOAT UNSIGNED
    - FLOAT UNSIGNED ZEROFILL -
    FLOAT
    - REAL
    - REAL UNSIGNED
    - REAL UNSIGNED ZEROFILL
    - DOUBLE
    - DOUBLE UNSIGNED
    - DOUBLE UNSIGNED ZEROFILL
    - DOUBLE PRECISION
    - DOUBLE PRECISION UNSIGNED
    - DOUBLE PRECISION UNSIGNED ZEROFILL -
    DOUBLE
    - NUMERIC(p, s)
    - NUMERIC(p, s) UNSIGNED
    - NUMERIC(p, s) UNSIGNED ZEROFILL
    - DECIMAL(p, s)
    - DECIMAL(p, s) UNSIGNED
    - DECIMAL(p, s) UNSIGNED ZEROFILL
    - FIXED(p, s)
    - FIXED(p, s) UNSIGNED
    - FIXED(p, s) UNSIGNED ZEROFILL
    - where p <= 38
    -
    DECIMAL(p, s)
    - NUMERIC(p, s)
    - NUMERIC(p, s) UNSIGNED
    - NUMERIC(p, s) UNSIGNED ZEROFILL
    - DECIMAL(p, s)
    - DECIMAL(p, s) UNSIGNED
    - DECIMAL(p, s) UNSIGNED ZEROFILL
    - FIXED(p, s)
    - FIXED(p, s) UNSIGNED
    - FIXED(p, s) UNSIGNED ZEROFILL
    - where 38 < p <= 65
    -
    STRING在 MySQL 中,十进制数据类型的精度高达 65,但在 Flink 中,十进制数据类型的精度仅限于 38。所以,如果定义精度大于 38 的十进制列,则应将其映射到字符串以避免精度损失。
    - BOOLEAN
    - TINYINT(1)
    - BIT(1) -
    BOOLEAN
    DATEDATE
    TIME [(p)]TIME [(p)]
    TIMESTAMP [(p)]
    - DATETIME [(p)] -
    TIMESTAMP [(p)] -
    - CHAR(n) - CHAR(n)
    - VARCHAR(n) - VARCHAR(n)
    - BIT(n) - BINARY(⌈(n + 7) / 8⌉)
    - BINARY(n) - BINARY(n)
    - VARBINARY(N) - VARBINARY(N)
    - TINYTEXT
    - TEXT
    - MEDIUMTEXT
    - LONGTEXT
    -
    STRING
    - TINYBLOB
    - BLOB
    - MEDIUMBLOB
    - LONGBLOB
    -
    BYTES目前,对于 MySQL 中的 BLOB 数据类型,仅支持长度不大于 2147483647(2**31-1)的 blob。
    - YEAR - INT
    - ENUM - STRING
    - JSON - STRING JSON 数据类型将在 Flink 中转换为 JSON 格式的字符串。
    - SET - ARRAY<STRING>因为 MySQL 中的 SET 数据类型是一个字符串对象,可以有零个或多个值 - 它应该始终映射到字符串数组。 -
    - GEOMETRY
    - POINT
    - LINESTRING
    - POLYGON
    - MULTIPOINT
    - MULTILINESTRING
    - MULTIPOLYGON
    - GEOMETRYCOLLECTION
    -
    - STRING - - MySQL 中的空间数据类型将转换为具有固定 Json 格式的字符串。 - 请参考 MySQL 空间数据类型映射 章节了解更多详细信息。 -
    -
    - -### 空间数据类型映射 - -MySQL中除`GEOMETRYCOLLECTION`之外的空间数据类型都会转换为 Json 字符串,格式固定,如:
    -```json -{"srid": 0 , "type": "xxx", "coordinates": [0, 0]} -``` -字段`srid`标识定义几何体的 SRS,如果未指定 SRID,则 SRID 0 是新几何体值的默认值。 -由于 MySQL 8+ 在定义空间数据类型时只支持特定的 SRID,因此在版本较低的MySQL中,字段`srid`将始终为 0。 - -字段`type`标识空间数据类型,例如`POINT`/`LINESTRING`/`POLYGON`。 - -字段`coordinates`表示空间数据的`坐标`。 - -对于`GEOMETRYCOLLECTION`,它将转换为 Json 字符串,格式固定,如:
    -```json -{"srid": 0 , "type": "GeometryCollection", "geometries": [{"type":"Point","coordinates":[10,10]}]} -``` - -`Geometrics`字段是一个包含所有空间数据的数组。 - -不同空间数据类型映射的示例如下: -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Spatial data in MySQLJson String converted in Flink
    POINT(1 1){"coordinates":[1,1],"type":"Point","srid":0}
    LINESTRING(3 0, 3 3, 3 5){"coordinates":[[3,0],[3,3],[3,5]],"type":"LineString","srid":0}
    POLYGON((1 1, 2 1, 2 2, 1 2, 1 1)){"coordinates":[[[1,1],[2,1],[2,2],[1,2],[1,1]]],"type":"Polygon","srid":0}
    MULTIPOINT((1 1),(2 2)){"coordinates":[[1,1],[2,2]],"type":"MultiPoint","srid":0}
    MultiLineString((1 1,2 2,3 3),(4 4,5 5)){"coordinates":[[[1,1],[2,2],[3,3]],[[4,4],[5,5]]],"type":"MultiLineString","srid":0}
    MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((5 5, 7 5, 7 7, 5 7, 5 5))){"coordinates":[[[[0,0],[10,0],[10,10],[0,10],[0,0]]],[[[5,5],[7,5],[7,7],[5,7],[5,5]]]],"type":"MultiPolygon","srid":0}
    GEOMETRYCOLLECTION(POINT(10 10), POINT(30 30), LINESTRING(15 15, 20 20)){"geometries":[{"type":"Point","coordinates":[10,10]},{"type":"Point","coordinates":[30,30]},{"type":"LineString","coordinates":[[15,15],[20,20]]}],"type":"GeometryCollection","srid":0}
    -
    - -常见问题 --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) diff --git a/docs/content/docs/connectors/_index.md b/docs/content/docs/connectors/_index.md new file mode 100644 index 0000000000..95f83ece66 --- /dev/null +++ b/docs/content/docs/connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: Connectors +icon: +bold: true +bookCollapseSection: true +weight: 3 +--- + diff --git a/docs/content/downloads.md b/docs/content/docs/connectors/cdc-connectors/_index.md similarity index 88% rename from docs/content/downloads.md rename to docs/content/docs/connectors/cdc-connectors/_index.md index 7697b1b791..64aa8234bf 100644 --- a/docs/content/downloads.md +++ b/docs/content/docs/connectors/cdc-connectors/_index.md @@ -1,3 +1,8 @@ +--- +title: CDC Connectors +bookCollapseSection: true +weight: 2 +--- - -# Downloads - -Please see [Releases History](https://github.com/ververica/flink-cdc-connectors/releases) diff --git a/docs/content/docs/connectors/cdc-connectors/db2-cdc.md b/docs/content/docs/connectors/cdc-connectors/db2-cdc.md new file mode 100644 index 0000000000..12fc36e5d5 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/db2-cdc.md @@ -0,0 +1,381 @@ +--- +title: "Db2 CDC Connector" +weight: 9 +type: docs +aliases: +- /connectors/cdc-connectors/db2-cdc.html +--- + + +# Db2 CDC Connector + +The Db2 CDC connector allows for reading snapshot data and incremental data from Db2 database. This document +describes how to setup the db2 CDC connector to run SQL queries against Db2 databases. + + +## Supported Databases + +| Connector | Database | Driver | +|-----------------------|----------------------------------------------------|----------------------| +| [Db2-cdc](db2-cdc.md) |
  • [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 | + +Dependencies +------------ + +In order to set up the Db2 CDC connector, the following table provides dependency information for both projects +using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-db2-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download flink-sql-connector-db2-cdc-3.0-SNAPSHOT.jar and +put it under `/lib/`. + +**Note:** flink-sql-connector-db2-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users +need to download the source code and compile the corresponding jar. Users should use the released version, such as +[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-db2-cdc), +the released version will be available in the Maven central warehouse. + +Setup Db2 server +---------------- + +Follow the steps in the [Debezium Db2 Connector](https://debezium.io/documentation/reference/1.9/connectors/db2.html#setting-up-db2). + + +Notes +---------------- + +### Not support BOOLEAN type in SQL Replication on Db2 + +Only snapshots can be taken from tables with BOOLEAN type columns. Currently, SQL Replication on Db2 does not support BOOLEAN, so Debezium can not perform CDC on those tables. +Consider using another type to replace BOOLEAN type. + + +How to create a Db2 CDC table +---------------- + +The Db2 CDC table can be defined as following: + +```sql +-- checkpoint every 3 seconds +Flink SQL> SET 'execution.checkpointing.interval' = '3s'; + +-- register a Db2 table 'products' in Flink SQL +Flink SQL> CREATE TABLE products ( + ID INT NOT NULL, + NAME STRING, + DESCRIPTION STRING, + WEIGHT DECIMAL(10,3) + ) WITH ( + 'connector' = 'db2-cdc', + 'hostname' = 'localhost', + 'port' = '50000', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'mydb', + 'schema-name' = 'myschema', + 'table-name' = 'products'); + +-- read snapshot and binlogs from products table +Flink SQL> SELECT * FROM products; +``` + +Connector Options +---------------- + +
+| Option            | Required | Default | Type    | Description |
+|-------------------|----------|---------|---------|-------------|
+| connector         | required | (none)  | String  | Specify what connector to use, here should be `'db2-cdc'`. |
+| hostname          | required | (none)  | String  | IP address or hostname of the Db2 database server. |
+| username          | required | (none)  | String  | Name of the Db2 database user to use when connecting to the Db2 database server. |
+| password          | required | (none)  | String  | Password to use when connecting to the Db2 database server. |
+| database-name     | required | (none)  | String  | Database name of the Db2 server to monitor. |
+| schema-name       | required | (none)  | String  | Schema name of the Db2 database to monitor. |
+| table-name        | required | (none)  | String  | Table name of the Db2 database to monitor. |
+| port              | optional | 50000   | Integer | Integer port number of the Db2 database server. |
+| scan.startup.mode | optional | initial | String  | Optional startup mode for the Db2 CDC consumer. Valid enumerations are "initial" and "latest-offset". Please see the Startup Reading Position section for more detailed information. |
+| server-time-zone  | optional | (none)  | String  | The session time zone in the database server, e.g. "Asia/Shanghai". It controls how the TIMESTAMP type in Db2 is converted to STRING. If not set, then ZoneId.systemDefault() is used to determine the server time zone. |
+| debezium.*        | optional | (none)  | String  | Pass-through Debezium's properties to the Debezium Embedded Engine which is used to capture data changes from the Db2 server. For example: `'debezium.snapshot.mode' = 'never'`. See more about Debezium's Db2 Connector properties. |
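+
+For example, a source table that combines the required connection options with a couple of the optional ones above could be declared as follows. This is only a sketch: the connection values mirror the earlier `products` example, and the optional settings are chosen purely for illustration.
+
+```sql
+-- A sketch of a Db2 CDC source that also sets 'scan.startup.mode' and 'server-time-zone'.
+-- Hostname, credentials and table identifiers are placeholders.
+CREATE TABLE products_latest (
+    ID INT NOT NULL,
+    NAME STRING,
+    DESCRIPTION STRING,
+    WEIGHT DECIMAL(10,3)
+) WITH (
+    'connector' = 'db2-cdc',
+    'hostname' = 'localhost',
+    'port' = '50000',
+    'username' = 'root',
+    'password' = '123456',
+    'database-name' = 'mydb',
+    'schema-name' = 'myschema',
+    'table-name' = 'products',
+    'scan.startup.mode' = 'latest-offset',  -- only read changes that happen after the job starts
+    'server-time-zone' = 'Asia/Shanghai'    -- controls how Db2 TIMESTAMP values are converted to STRING
+);
+```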
    + +Features +-------- +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for DB2 CDC consumer. The valid enumerations are: + +- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest binlog. +- `latest-offset`: Never to perform snapshot on the monitored database tables upon first startup, just read from + the end of the binlog which means only have the changes since the connector was started. + +_Note: the mechanism of `scan.startup.mode` option relying on Debezium's `snapshot.mode` configuration. So please do not using them together. If you speicifying both `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, it may make `scan.startup.mode` doesn't work._ + +### DataStream Source + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; + +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; + +public class Db2SourceExample { + public static void main(String[] args) throws Exception { + SourceFunction db2Source = + Db2Source.builder() + .hostname("yourHostname") + .port(50000) + .database("yourDatabaseName") // set captured database + .tableList("yourSchemaName.yourTableName") // set captured table + .username("yourUsername") + .password("yourPassword") + .deserializer( + new JsonDebeziumDeserializationSchema()) // converts SourceRecord to + // JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + // enable checkpoint + env.enableCheckpointing(3000); + + env.addSource(db2Source) + .print() + .setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute("Print Db2 Snapshot + Change Stream"); + } +} +``` + +Data Type Mapping +---------------- + +
+| Db2 type                        | Flink SQL type  | NOTE |
+|---------------------------------|-----------------|------|
+| SMALLINT                        | SMALLINT        |      |
+| INTEGER                         | INT             |      |
+| BIGINT                          | BIGINT          |      |
+| REAL                            | FLOAT           |      |
+| DOUBLE                          | DOUBLE          |      |
+| NUMERIC(p, s)<br>DECIMAL(p, s)  | DECIMAL(p, s)   |      |
+| DATE                            | DATE            |      |
+| TIME                            | TIME            |      |
+| TIMESTAMP [(p)]                 | TIMESTAMP [(p)] |      |
+| CHARACTER(n)                    | CHAR(n)         |      |
+| VARCHAR(n)                      | VARCHAR(n)      |      |
+| BINARY(n)                       | BINARY(n)       |      |
+| VARBINARY(N)                    | VARBINARY(N)    |      |
+| BLOB<br>CLOB<br>DBCLOB          | BYTES           |      |
+| VARGRAPHIC<br>XML               | STRING          |      |
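+
+As a quick illustration of the mapping above, a hypothetical Db2 table with NUMERIC, TIMESTAMP, XML and BLOB columns could be declared in Flink as follows. The table and column names are assumptions made up for this example; only the type pairs come from the table above.
+
+```sql
+-- Sketch: Flink-side declaration for an assumed Db2 table MYSCHEMA.ORDERS.
+CREATE TABLE orders_typed (
+    ID INT NOT NULL,          -- Db2 INTEGER
+    AMOUNT DECIMAL(10, 2),    -- Db2 NUMERIC(10, 2) or DECIMAL(10, 2)
+    CREATED TIMESTAMP(3),     -- Db2 TIMESTAMP(3)
+    PAYLOAD STRING,           -- Db2 XML or VARGRAPHIC
+    ATTACHMENT BYTES          -- Db2 BLOB / CLOB / DBCLOB
+) WITH (
+    'connector' = 'db2-cdc',
+    'hostname' = 'localhost',
+    'port' = '50000',
+    'username' = 'root',
+    'password' = '123456',
+    'database-name' = 'mydb',
+    'schema-name' = 'MYSCHEMA',
+    'table-name' = 'ORDERS'
+);
+```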
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/mongodb-cdc.md b/docs/content/docs/connectors/cdc-connectors/mongodb-cdc.md new file mode 100644 index 0000000000..3c560f16e2 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/mongodb-cdc.md @@ -0,0 +1,693 @@ +--- +title: "MongoDB CDC Connector" +weight: 2 +type: docs +aliases: +- /connectors/cdc-connectors/mongodb-cdc.html +--- + + +# MongoDB CDC Connector + +The MongoDB CDC connector allows for reading snapshot data and incremental data from MongoDB. This document describes how to setup the MongoDB CDC connector to run SQL queries against MongoDB. + +Dependencies +------------ + +In order to setup the MongoDB CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-mongodb-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. + +**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse. + +Setup MongoDB +---------------- + +### Availability +- MongoDB version + + MongoDB version >= 3.6
+We use the [change streams](https://docs.mongodb.com/manual/changeStreams/) feature (new in version 3.6) to capture change data.
+
+- Cluster Deployment
+
+  [Replica sets](https://docs.mongodb.com/manual/replication/) or [sharded clusters](https://docs.mongodb.com/manual/sharding/) are required.
+
+- Storage Engine
+
+  The [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) storage engine is required.
+
+- [Replica set protocol version](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion)
+
+  Replica set protocol version 1 [(pv1)](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) is required.
+  Starting in version 4.0, MongoDB only supports pv1. pv1 is the default for all new replica sets created with MongoDB 3.2 or later.
+
+- Privileges
+
+  `changeStream` and `read` privileges are required by the MongoDB Kafka Connector.
+
+  You can use the following example for simple authorization.
    + For more detailed authorization, please refer to [MongoDB Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles). + + ```javascript + use admin; + db.createRole( + { + role: "flinkrole", + privileges: [{ + // Grant privileges on all non-system collections in all databases + resource: { db: "", collection: "" }, + actions: [ + "splitVector", + "listDatabases", + "listCollections", + "collStats", + "find", + "changeStream" ] + }], + roles: [ + // Read config.collections and config.chunks + // for sharded cluster snapshot splitting. + { role: 'read', db: 'config' } + ] + } + ); + + db.createUser( + { + user: 'flinkuser', + pwd: 'flinkpw', + roles: [ + { role: 'flinkrole', db: 'admin' } + ] + } + ); + ``` + + +How to create a MongoDB CDC table +---------------- + +The MongoDB CDC table can be defined as following: + +```sql +-- register a MongoDB table 'products' in Flink SQL +CREATE TABLE products ( + _id STRING, // must be declared + name STRING, + weight DECIMAL(10,3), + tags ARRAY, -- array + price ROW, -- embedded document + suppliers ARRAY>, -- embedded documents + PRIMARY KEY(_id) NOT ENFORCED +) WITH ( + 'connector' = 'mongodb-cdc', + 'hosts' = 'localhost:27017,localhost:27018,localhost:27019', + 'username' = 'flinkuser', + 'password' = 'flinkpw', + 'database' = 'inventory', + 'collection' = 'products' +); + +-- read snapshot and change events from products collection +SELECT * FROM products; +``` + +**Note that** + +MongoDB's change event record doesn't have updated before message. So, we can only convert it to Flink's UPSERT changelog stream. +An upsert stream requires a unique key, so we must declare `_id` as primary key. +We can't declare other column as primary key, because delete operation does not contain the key and value besides `_id` and `sharding key`. + +Connector Options +---------------- + +
+| Option | Required | Default | Type | Description |
+|--------|----------|---------|------|-------------|
+| connector | required | (none) | String | Specify what connector to use, here should be mongodb-cdc. |
+| scheme | optional | mongodb | String | The protocol used to connect to MongoDB, e.g. mongodb or mongodb+srv. |
+| hosts | required | (none) | String | The comma-separated list of hostname and port pairs of the MongoDB servers, e.g. localhost:27017,localhost:27018 |
+| username | optional | (none) | String | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. |
+| password | optional | (none) | String | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. |
+| database | optional | (none) | String | Name of the database to watch for changes. If not set, all databases will be captured. The database also supports regular expressions to monitor multiple databases matching the regular expression. |
+| collection | optional | (none) | String | Name of the collection in the database to watch for changes. If not set, all collections will be captured. The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers. |
+| connection.options | optional | (none) | String | The ampersand-separated connection options of MongoDB, e.g. replicaSet=test&connectTimeoutMS=300000 |
+| scan.startup.mode | optional | initial | String | Optional startup mode for the MongoDB CDC consumer, valid enumerations are "initial", "latest-offset" and "timestamp". Please see the Startup Reading Position section for more detailed information. |
+| scan.startup.timestamp-millis | optional | (none) | Long | Timestamp in millis of the start point, only used for the 'timestamp' startup mode. |
+| copy.existing.queue.size | optional | 10240 | Integer | The max size of the queue to use when copying data. |
+| batch.size | optional | 1024 | Integer | The cursor batch size. |
+| poll.max.batch.size | optional | 1024 | Integer | Maximum number of change stream documents to include in a single batch when polling for new data. |
+| poll.await.time.ms | optional | 1000 | Integer | The amount of time to wait before checking for new results on the change stream. |
+| heartbeat.interval.ms | optional | 0 | Integer | The length of time in milliseconds between sending heartbeat messages. Use 0 to disable. |
+| scan.full-changelog | optional | false | Boolean | Whether to try to generate a full-mode changelog based on pre- and post-images in MongoDB. Refer to Full Changelog for more details. Supported for MongoDB 6.0 and above only. |
+| scan.incremental.snapshot.enabled | optional | false | Boolean | Whether to enable incremental snapshot. The incremental snapshot feature is only supported for MongoDB 4.0 and above. |
+| scan.incremental.snapshot.chunk.size.mb | optional | 64 | Integer | The chunk size (in MB) of the incremental snapshot. |
+| scan.incremental.snapshot.chunk.samples | optional | 20 | Integer | The samples count per chunk when using the sample partition strategy during incremental snapshot. |
+| scan.incremental.close-idle-reader.enabled | optional | false | Boolean | Whether to close idle readers at the end of the snapshot phase. The Flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true. If the Flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true, so it does not need to be explicitly configured. |
+| scan.cursor.no-timeout | optional | true | Boolean | MongoDB server normally times out idle cursors after an inactivity period (10 minutes) to prevent excess memory use. Set this option to true to prevent that. Only available when parallelism snapshot is enabled. |
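+As a quick, hedged illustration of the connection-related options above, the following sketch combines `scheme`, `connection.options` and the authentication options in one table definition. The host name, credentials and the `inventory.products` collection are placeholders rather than values from this document, and the connection option value is taken from the example in the table; adapt them to your own deployment.
+
+```sql
+-- Minimal sketch: connect through a DNS seed list (mongodb+srv) with an extra connection option.
+CREATE TABLE products_srv (
+  _id STRING,  -- must be declared as the primary key for the upsert changelog
+  name STRING,
+  PRIMARY KEY (_id) NOT ENFORCED
+) WITH (
+  'connector' = 'mongodb-cdc',
+  'scheme' = 'mongodb+srv',                  -- only if your deployment exposes an SRV record
+  'hosts' = 'cluster0.example.mongodb.net',  -- placeholder host
+  'connection.options' = 'connectTimeoutMS=300000',
+  'username' = 'flinkuser',
+  'password' = 'flinkpw',
+  'database' = 'inventory',
+  'collection' = 'products'
+);
+```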
+
+Note: it is highly recommended to set `heartbeat.interval.ms` to a proper value larger than 0 **if the collection changes slowly**.
+The heartbeat event can push the `resumeToken` forward, which prevents the `resumeToken` from expiring when we recover the Flink job from a checkpoint or savepoint.
+
+Available Metadata
+----------------
+
+The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
+
+| Key | DataType | Description |
+|-----|----------|-------------|
+| database_name | STRING NOT NULL | Name of the database that contains the row. |
+| collection_name | STRING NOT NULL | Name of the collection that contains the row. |
+| op_ts | TIMESTAMP_LTZ(3) NOT NULL | It indicates the time that the change was made in the database. If the record is read from the snapshot of the table instead of the change stream, the value is always 0. |
    + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: +```sql +CREATE TABLE products ( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + collection_name STRING METADATA FROM 'collection_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + _id STRING, // must be declared + name STRING, + weight DECIMAL(10,3), + tags ARRAY, -- array + price ROW, -- embedded document + suppliers ARRAY>, -- embedded documents + PRIMARY KEY(_id) NOT ENFORCED +) WITH ( + 'connector' = 'mongodb-cdc', + 'hosts' = 'localhost:27017,localhost:27018,localhost:27019', + 'username' = 'flinkuser', + 'password' = 'flinkpw', + 'database' = 'inventory', + 'collection' = 'products' +); +``` + +Features +-------- + +### Exactly-Once Processing + +The MongoDB CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change stream events with **exactly-once processing** even failures happen. + +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for MongoDB CDC consumer. The valid enumerations are: + +- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest oplog. +- `latest-offset`: Never to perform snapshot on the monitored database tables upon first startup, just read from + the end of the oplog which means only have the changes since the connector was started. +- `timestamp`: Skip snapshot phase and start reading oplog events from a specific timestamp. + +For example in DataStream API: +```java +MongoDBSource.builder() + .startupOptions(StartupOptions.latest()) // Start from latest offset + .startupOptions(StartupOptions.timestamp(1667232000000L) // Start from timestamp + .build() +``` + +and with SQL: + +```SQL +CREATE TABLE mongodb_source (...) WITH ( + 'connector' = 'mongodb-cdc', + 'scan.startup.mode' = 'latest-offset', -- Start from latest offset + ... + 'scan.startup.mode' = 'timestamp', -- Start from timestamp + 'scan.startup.timestamp-millis' = '1667232000000' -- Timestamp under timestamp startup mode + ... +) +``` + +### Change Streams + +We integrate the [MongoDB's official Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) to read snapshot or change events from MongoDB and drive it by Debezium's `EmbeddedEngine`. + +Debezium's `EmbeddedEngine` provides a mechanism for running a single Kafka Connect `SourceConnector` within an application's process, and it can drive any standard Kafka Connect `SourceConnector` properly even which is not provided by Debezium. + +We choose **MongoDB's official Kafka Connector** instead of the **Debezium's MongoDB Connector** because they use a different change data capture mechanism. + +- For Debezium's MongoDB Connector, it reads the `oplog.rs` collection of each replica-set's master node. +- For MongoDB's Kafka Connector, it subscribes `Change Stream` of MongoDB. + +MongoDB's `oplog.rs` collection doesn't keep the changed record's update before state, so it's hard to extract the full document state by a single `oplog.rs` record and convert it to change log stream accepted by Flink (Insert Only, Upsert, All). +Additionally, MongoDB 5 (released in July 2021) has changed the oplog format, so the current Debezium connector cannot be used with it. 
+ +**Change Stream** is a new feature provided by MongoDB 3.6 for replica sets and sharded clusters that allows applications to access real-time data changes without the complexity and risk of tailing the oplog.
    +Applications can use change streams to subscribe to all data changes on a single collection, a database, or an entire deployment, and immediately react to them. + +**Lookup Full Document for Update Operations** is a feature provided by **Change Stream** which can configure the change stream to return the most current majority-committed version of the updated document. Because of this feature, we can easily collect the latest full document and convert the change log to Flink's **Upsert Changelog Stream**. + +By the way, Debezium's MongoDB change streams exploration mentioned by [DBZ-435](https://issues.redhat.com/browse/DBZ-435) is on roadmap.
    +If it's done, we can consider integrating two kinds of source connector for users to choose. + +### DataStream Source + +The MongoDB CDC connector can also be a DataStream source. You can create a SourceFunction as the following shows: + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.mongodb.MongoDBSource; + +public class MongoDBSourceExample { + public static void main(String[] args) throws Exception { + SourceFunction sourceFunction = MongoDBSource.builder() + .hosts("localhost:27017") + .username("flink") + .password("flinkpw") + .databaseList("inventory") // set captured database, support regex + .collectionList("inventory.products", "inventory.orders") //set captured collections, support regex + .deserializer(new JsonDebeziumDeserializationSchema()) + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env.addSource(sourceFunction) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute(); + } +} +``` + +The MongoDB CDC incremental connector (after 2.3.0) can be used as the following shows: +```java +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.cdc.connectors.mongodb.source.MongoDBSource; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; + +public class MongoDBIncrementalSourceExample { + public static void main(String[] args) throws Exception { + MongoDBSource mongoSource = + MongoDBSource.builder() + .hosts("localhost:27017") + .databaseList("inventory") // set captured database, support regex + .collectionList("inventory.products", "inventory.orders") //set captured collections, support regex + .username("flink") + .password("flinkpw") + .deserializer(new JsonDebeziumDeserializationSchema()) + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(3000); + // set the source parallelism to 2 + env.fromSource(mongoSource, WatermarkStrategy.noWatermarks(), "MongoDBIncrementalSource") + .setParallelism(2) + .print() + .setParallelism(1); + + env.execute("Print MongoDB Snapshot + Change Stream"); + } +} +``` + +**Note:** +- If database regex is used, `readAnyDatabase` role is required. +- The incremental snapshot feature only supports after MongoDB 4.0. + +### Full Changelog + +MongoDB 6.0 and above supports emitting change stream events containing document before and after the change was made (aka. pre- and post-images). + +- The pre-image is the document before it was replaced, updated, or deleted. There is no pre-image for an inserted document. + +- The post-image is the document after it was inserted, replaced, or updated. There is no post-image for a deleted document. + +MongoDB CDC could make uses of pre-image and post-images to generate full-mode changelog stream including Insert, Update Before, Update After, and Delete data rows, thereby avoiding additional `ChangelogNormalize` downstream node. 
+ +To enable this feature, here's some prerequisites: + +- MongoDB version must be 6.0 or above; +- Enable `preAndPostImages` feature at the database level: +```javascript +db.runCommand({ + setClusterParameter: { + changeStreamOptions: { + preAndPostImages: { + expireAfterSeconds: 'off' // replace with custom image expiration time + } + } + } +}) +``` +- Enable `changeStreamPreAndPostImages` feature for collections to be monitored: +```javascript +db.runCommand({ + collMod: "<< collection name >>", + changeStreamPreAndPostImages: { + enabled: true + } +}) +``` +- Enable MongoDB CDC's `scan.full-changelog` feature: + +```java +MongoDBSource.builder() + .scanFullChangelog(true) + ... + .build() +``` + +or with Flink SQL: + +```SQL +CREATE TABLE mongodb_source (...) WITH ( + 'connector' = 'mongodb-cdc', + 'scan.full-changelog' = 'true', + ... +) +``` + +Data Type Mapping +---------------- +[BSON](https://docs.mongodb.com/manual/reference/bson-types/) short for **Binary JSON** is a binary-encoded serialization of JSON-like format used to store documents and make remote procedure calls in MongoDB. + +[Flink SQL Data Type](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/) is similar to the SQL standard’s data type terminology which describes the logical type of a value in the table ecosystem. It can be used to declare input and/or output types of operations. + +In order to enable Flink SQL to process data from heterogeneous data sources, the data types of heterogeneous data sources need to be uniformly converted to Flink SQL data types. + +The following is the mapping of BSON type and Flink SQL type. + + +
+| BSON type | Flink SQL type |
+|-----------|----------------|
+| | TINYINT |
+| | SMALLINT |
+| Int | INT |
+| Long | BIGINT |
+| | FLOAT |
+| Double | DOUBLE |
+| Decimal128 | DECIMAL(p, s) |
+| Boolean | BOOLEAN |
+| Date<br>Timestamp | DATE |
+| Date<br>Timestamp | TIME |
+| Date | TIMESTAMP(3)<br>TIMESTAMP_LTZ(3) |
+| Timestamp | TIMESTAMP(0)<br>TIMESTAMP_LTZ(0) |
+| String<br>ObjectId<br>UUID<br>Symbol<br>MD5<br>JavaScript<br>Regex | STRING |
+| BinData | BYTES |
+| Object | ROW |
+| Array | ARRAY |
+| DBPointer | ROW&lt;$ref STRING, $id STRING&gt; |
+| GeoJSON | Point : ROW&lt;type STRING, coordinates ARRAY&lt;DOUBLE&gt;&gt;<br>Line : ROW&lt;type STRING, coordinates ARRAY&lt;ARRAY&lt;DOUBLE&gt;&gt;&gt;<br>... |
    + + +Reference +-------- + +- [MongoDB Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) +- [Change Streams](https://docs.mongodb.com/manual/changeStreams/) +- [Replication](https://docs.mongodb.com/manual/replication/) +- [Sharding](https://docs.mongodb.com/manual/sharding/) +- [Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles) +- [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) +- [Replica set protocol](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) +- [Connection String Options](https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options) +- [Document Pre- and Post-Images](https://www.mongodb.com/docs/v6.0/changeStreams/#change-streams-with-document-pre--and-post-images) +- [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/) +- [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/) + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/mysql-cdc.md b/docs/content/docs/connectors/cdc-connectors/mysql-cdc.md new file mode 100644 index 0000000000..caaae51d36 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/mysql-cdc.md @@ -0,0 +1,1108 @@ +--- +title: "MySQL CDC Connector" +weight: 7 +type: docs +aliases: +- /connectors/cdc-connectors/mysql-cdc.html +--- + + +# MySQL CDC Connector + +The MySQL CDC connector allows for reading snapshot data and incremental data from MySQL database. This document describes how to setup the MySQL CDC connector to run SQL queries against MySQL databases. + + +## Supported Databases + +| Connector | Database | Driver | +|-----------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| [mysql-cdc](mysql-cdc.md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.27 | + +Dependencies +------------ + +In order to setup the MySQL CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-mysql-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar and put it under `/lib/`. + +**Note:** flink-sql-connector-mysql-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mysql-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc), the released version will be available in the Maven central warehouse. + +Setup MySQL server +---------------- + +You have to define a MySQL user with appropriate permissions on all databases that the Debezium MySQL connector monitors. + +1. Create the MySQL user: + +```sql +mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password'; +``` + +2. Grant the required permissions to the user: + +```sql +mysql> GRANT SELECT, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password'; +``` +**Note:** The RELOAD permissions is not required any more when `scan.incremental.snapshot.enabled` is enabled (enabled by default). + +3. Finalize the user’s permissions: + +```sql +mysql> FLUSH PRIVILEGES; +``` + +See more about the [permission explanation](https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-creating-user). + + +Notes +---------------- + +### Set a different SERVER ID for each reader + +Every MySQL database client for reading binlog should have a unique id, called server id. MySQL server will use this id to maintain network connection and the binlog position. Therefore, if different jobs share a same server id, it may result to read from wrong binlog position. +Thus, it is recommended to set different server id for each reader via the [SQL Hints](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/sql/hints.html), +e.g. assuming the source parallelism is 4, then we can use `SELECT * FROM source_table /*+ OPTIONS('server-id'='5401-5404') */ ;` to assign unique server id for each of the 4 source readers. + + +### Setting up MySQL session timeouts + +When an initial consistent snapshot is made for large databases, your established connection could timeout while the tables are being read. You can prevent this behavior by configuring interactive_timeout and wait_timeout in your MySQL configuration file. +- `interactive_timeout`: The number of seconds the server waits for activity on an interactive connection before closing it. See [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout). +- `wait_timeout`: The number of seconds the server waits for activity on a noninteractive connection before closing it. See [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout). 
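+As a concrete, hedged example of the two settings above, the statements below raise both timeouts to 8 hours (28800 seconds) on the running server; the value is an arbitrary example and changing global variables requires sufficient privileges. To make the change permanent, set the same values in the MySQL configuration file instead.
+
+```sql
+-- Example only: raise the session timeouts at runtime so long snapshot reads are not cut off.
+SET GLOBAL interactive_timeout = 28800;
+SET GLOBAL wait_timeout = 28800;
+
+-- Verify the effective values.
+SHOW VARIABLES LIKE '%timeout%';
+```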
+ + +How to create a MySQL CDC table +---------------- + +The MySQL CDC table can be defined as following: + +```sql +-- checkpoint every 3000 milliseconds +Flink SQL> SET 'execution.checkpointing.interval' = '3s'; + +-- register a MySQL table 'orders' in Flink SQL +Flink SQL> CREATE TABLE orders ( + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY(order_id) NOT ENFORCED + ) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'mydb', + 'table-name' = 'orders'); + +-- read snapshot and binlogs from orders table +Flink SQL> SELECT * FROM orders; +``` + +Connector Options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'mysql-cdc'.
    hostnamerequired(none)StringIP address or hostname of the MySQL database server.
    usernamerequired(none)StringName of the MySQL database to use when connecting to the MySQL database server.
    passwordrequired(none)StringPassword to use when connecting to the MySQL database server.
    database-namerequired(none)StringDatabase name of the MySQL server to monitor. The database-name also supports regular expressions to monitor multiple tables matches the regular expression.
    table-namerequired(none)String + Table name of the MySQL database to monitor. The table-name also supports regular expressions to monitor multiple tables that satisfy the regular expressions. Note: When the MySQL CDC connector regularly matches the table name, it will concat the database-name and table-name filled in by the user through the string `\\.` to form a full-path regular expression, and then use the regular expression to match the fully qualified name of the table in the MySQL database. +
    portoptional3306IntegerInteger port number of the MySQL database server.
    server-idoptional(none)StringA numeric ID or a numeric ID range of this database client, The numeric ID syntax is like '5400', + the numeric ID range syntax is like '5400-5408', The numeric ID range syntax is recommended when 'scan.incremental.snapshot.enabled' enabled. + Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the MySQL cluster + as another server (with this unique ID) so it can read the binlog. By default, a random number is generated between 5400 and 6400, + though we recommend setting an explicit value. +
    scan.incremental.snapshot.enabledoptionaltrueBooleanIncremental snapshot is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism, + the incremental snapshot has many advantages, including: + (1) source can be parallel during snapshot reading, + (2) source can perform checkpoints in the chunk granularity during snapshot reading, + (3) source doesn't need to acquire global read lock (FLUSH TABLES WITH READ LOCK) before snapshot reading. + If you would like the source run in parallel, each parallel reader should have an unique server id, so + the 'server-id' must be a range like '5400-6400', and the range must be larger than the parallelism. + Please see Incremental Snapshot Readingsection for more detailed information. +
    scan.incremental.snapshot.chunk.sizeoptional8096IntegerThe chunk size (number of rows) of table snapshot, captured tables are split into multiple chunks when read the snapshot of table.
    scan.snapshot.fetch.sizeoptional1024IntegerThe maximum fetch size for per poll when read table snapshot.
    scan.startup.modeoptionalinitialStringOptional startup mode for MySQL CDC consumer, valid enumerations are "initial", "earliest-offset", "latest-offset", "specific-offset" and "timestamp". + Please see Startup Reading Position section for more detailed information.
    scan.startup.specific-offset.fileoptional(none)StringOptional binlog file name used in case of "specific-offset" startup mode
    scan.startup.specific-offset.posoptional(none)LongOptional binlog file position used in case of "specific-offset" startup mode
    scan.startup.specific-offset.gtid-setoptional(none)StringOptional GTID set used in case of "specific-offset" startup mode
    scan.startup.specific-offset.skip-eventsoptional(none)LongOptional number of events to skip after the specific starting offset
    scan.startup.specific-offset.skip-rowsoptional(none)LongOptional number of rows to skip after the specific starting offset
    server-time-zoneoptional(none)StringThe session time zone in database server, e.g. "Asia/Shanghai". + It controls how the TIMESTAMP type in MYSQL converted to STRING. + See more here. + If not set, then ZoneId.systemDefault() is used to determine the server time zone. +
    debezium.min.row. + count.to.stream.resultoptional1000Integer +During a snapshot operation, the connector will query each included table to produce a read event for all rows in that table. This parameter determines whether the MySQL connection will pull all results for a table into memory (which is fast but requires large amounts of memory), or whether the results will instead be streamed (can be slower, but will work for very large tables). The value specifies the minimum number of rows a table must contain before the connector will stream results, and defaults to 1,000. Set this parameter to '0' to skip all table size checks and always stream all results during a snapshot.
    connect.timeoutoptional30sDurationThe maximum time that the connector should wait after trying to connect to the MySQL database server before timing out.
    connect.max-retriesoptional3IntegerThe max retry times that the connector should retry to build MySQL database server connection.
    connection.pool.sizeoptional20IntegerThe connection pool size.
    jdbc.properties.*optional20StringOption to pass custom JDBC URL properties. User can pass custom properties like 'jdbc.properties.useSSL' = 'false'.
    heartbeat.intervaloptional30sDurationThe interval of sending heartbeat event for tracing the latest available binlog offsets.
    debezium.*optional(none)StringPass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from MySQL server. + For example: 'debezium.snapshot.mode' = 'never'. + See more about the Debezium's MySQL Connector properties
    scan.incremental.close-idle-reader.enabledoptionalfalseBooleanWhether to close idle readers at the end of the snapshot phase.
    + The flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.
    + If the flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true, + so it does not need to be explicitly configured 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true' +
    debezium.binary.handling.modeoptional(none)Stringdebezium.binary.handling.mode can be set to one of the following values: + none: No processing is performed, and the binary data type is transmitted as a byte array (byte array). + base64: The binary data type is converted to a Base64-encoded string and transmitted. + hex: The binary data type is converted to a hexadecimal string and transmitted. + The default value is none. Depending on your requirements and data types, you can choose the appropriate processing mode. If your database contains a large number of binary data types, it is recommended to use base64 or hex mode to make it easier to handle during transmission.
    +
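+To make the pass-through options above concrete, the sketch below forwards one JDBC URL property via `jdbc.properties.*` and one Debezium property via `debezium.*`. Both values are taken verbatim from the examples in the table, and the host, credentials and table names are placeholders, so treat this as an illustration rather than a recommended configuration.
+
+```sql
+CREATE TABLE orders_passthrough (
+  order_id INT,
+  order_status BOOLEAN,
+  PRIMARY KEY (order_id) NOT ENFORCED
+) WITH (
+  'connector' = 'mysql-cdc',
+  'hostname' = 'localhost',
+  'port' = '3306',
+  'username' = 'root',
+  'password' = '123456',
+  'database-name' = 'mydb',
+  'table-name' = 'orders',
+  'jdbc.properties.useSSL' = 'false',   -- appended to the JDBC URL
+  'debezium.snapshot.mode' = 'never'    -- passed through to the embedded Debezium engine
+);
+```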
+
+Available Metadata
+----------------
+
+The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
+
+| Key | DataType | Description |
+|-----|----------|-------------|
+| table_name | STRING NOT NULL | Name of the table that contains the row. |
+| database_name | STRING NOT NULL | Name of the database that contains the row. |
+| op_ts | TIMESTAMP_LTZ(3) NOT NULL | It indicates the time that the change was made in the database. If the record is read from the snapshot of the table instead of the binlog, the value is always 0. |
+| row_kind | STRING NOT NULL | It indicates the row kind of the changelog: '+I' means INSERT message, '-D' means DELETE message, '-U' means UPDATE_BEFORE message and '+U' means UPDATE_AFTER message. Note: the downstream SQL operator may fail to compare records correctly because of this newly added column when processing row retractions, if the source operator chooses to output the 'row_kind' column for each record. It is recommended to use this metadata column only in simple synchronization jobs. |
    + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: + +```sql +CREATE TABLE products +( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + operation STRING METADATA FROM 'row_kind' VIRTUAL, + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY (order_id) NOT ENFORCED +) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'mydb', + 'table-name' = 'orders' + ); +``` + +The extended CREATE TABLE example demonstrates the usage of regex to match multi-tables: + +```sql +CREATE TABLE products +( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + operation STRING METADATA FROM 'row_kind' VIRTUAL, + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY (order_id) NOT ENFORCED +) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = '(^(test).*|^(tpc).*|txc|.*[p$]|t{2})', + 'table-name' = '(t[5-8]|tt)' + ); +``` + + + + + + + + + + + + + + + + + + + + + + + + + +
+| example | expression | description |
+|---------|------------|-------------|
+| prefix match | `^(test).*` | This matches database or table names starting with the prefix test, e.g. test1, test2. |
+| suffix match | `.*[p$]` | This matches database or table names ending with the suffix p, e.g. cdcp, edcp. |
+| specific match | `txc` | This matches database or table names by an exact name, e.g. txc. |
    + +It will use `database-name\\.table-name` as a pattern to match tables, as above examples using pattern `(^(test).*|^(tpc).*|txc|.*[p$]|t{2})\\.(t[5-8]|tt)` matches txc.tt、test2.test5. + + +Features +-------- + +### Incremental Snapshot Reading + +Incremental snapshot reading is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism, the incremental snapshot has many advantages, including: +* (1) MySQL CDC Source can be parallel during snapshot reading +* (2) MySQL CDC Source can perform checkpoints in the chunk granularity during snapshot reading +* (3) MySQL CDC Source doesn't need to acquire global read lock (FLUSH TABLES WITH READ LOCK) before snapshot reading + +If you would like the source run in parallel, each parallel reader should have an unique server id, so the 'server-id' must be a range like '5400-6400', +and the range must be larger than the parallelism. + +During the incremental snapshot reading, the MySQL CDC Source firstly splits snapshot chunks (splits) by primary key of table, +and then MySQL CDC Source assigns the chunks to multiple readers to read the data of snapshot chunk. + +#### Controlling Parallelism + +Incremental snapshot reading provides the ability to read snapshot data parallelly. +You can control the source parallelism by setting the job parallelism `parallelism.default`. For example, in SQL CLI: + +```sql +Flink SQL> SET 'parallelism.default' = 8; +``` + +#### Checkpoint + +Incremental snapshot reading provides the ability to perform checkpoint in chunk level. It resolves the checkpoint timeout problem in previous version with old snapshot reading mechanism. + +#### Lock-free + +The MySQL CDC source use **incremental snapshot algorithm**, which avoid acquiring global read lock (FLUSH TABLES WITH READ LOCK) and thus doesn't need `RELOAD` permission. + +#### MySQL High Availability Support + +The ```mysql-cdc``` connector offers high availability of MySQL high available cluster by using the [GTID](https://dev.mysql.com/doc/refman/5.7/en/replication-gtids-concepts.html) information. To obtain the high availability, the MySQL cluster need enable the GTID mode, the GTID mode in your mysql config file should contain following settings: + +```yaml +gtid_mode = on +enforce_gtid_consistency = on +``` + +If the monitored MySQL server address contains slave instance, you need set following settings to the MySQL conf file. The setting ```log-slave-updates = 1``` enables the slave instance to also write the data that synchronized from master to its binlog, this makes sure that the ```mysql-cdc``` connector can consume entire data from the slave instance. + +```yaml +gtid_mode = on +enforce_gtid_consistency = on +log-slave-updates = 1 +``` + +After the server you monitored fails in MySQL cluster, you only need to change the monitored server address to other available server and then restart the job from the latest checkpoint/savepoint, the job will restore from the checkpoint/savepoint and won't miss any records. + +It's recommended to configure a DNS(Domain Name Service) or VIP(Virtual IP Address) for your MySQL cluster, using the DNS or VIP address for ```mysql-cdc``` connector, the DNS or VIP would automatically route the network request to the active MySQL server. In this way, you don't need to modify the address and restart your pipeline anymore. + +#### MySQL Heartbeat Event Support + +If the table updates infrequently, the binlog file or GTID set may have been cleaned in its last committed binlog position. 
+The CDC job may restart fails in this case. So the heartbeat event will help update binlog position. By default heartbeat event is enabled in MySQL CDC source and the interval is set to 30 seconds. You can specify the interval by using table option ```heartbeat.interval```, or set the option to `0s` to disable heartbeat events. + +#### How Incremental Snapshot Reading works + +When the MySQL CDC source is started, it reads snapshot of table parallelly and then reads binlog of table with single parallelism. + +In snapshot phase, the snapshot is cut into multiple snapshot chunks according to primary key of table and the size of table rows. +Snapshot chunks is assigned to multiple snapshot readers. Each snapshot reader reads its received chunks with [chunk reading algorithm](#snapshot-chunk-reading) and send the read data to downstream. +The source manages the process status (finished or not) of chunks, thus the source of snapshot phase can support checkpoint in chunk level. +If a failure happens, the source can be restored and continue to read chunks from last finished chunks. + +After all snapshot chunks finished, the source will continue to read binlog in a single task. +In order to guarantee the global data order of snapshot records and binlog records, binlog reader will start to read data +until there is a complete checkpoint after snapshot chunks finished to make sure all snapshot data has been consumed by downstream. +The binlog reader tracks the consumed binlog position in state, thus source of binlog phase can support checkpoint in row level. + +Flink performs checkpoints for the source periodically, in case of failover, the job will restart and restore from the last successful checkpoint state and guarantees the exactly once semantic. + +##### Snapshot Chunk Splitting + +When performing incremental snapshot reading, MySQL CDC source need a criterion which used to split the table. +MySQL CDC Source use a splitting column to split the table to multiple splits (chunks). By default, MySQL CDC source will identify the primary key column of the table and use the first column in primary key as the splitting column. +If there is no primary key in the table, incremental snapshot reading will fail and you can disable `scan.incremental.snapshot.enabled` to fallback to old snapshot reading mechanism. + +For numeric and auto incremental splitting column, MySQL CDC Source efficiently splits chunks by fixed step length. +For example, if you had a table with a primary key column of `id` which is auto-incremental BIGINT type, the minimum value was `0` and maximum value was `100`, +and the table option `scan.incremental.snapshot.chunk.size` value is `25`, the table would be split into following chunks: + +``` + (-∞, 25), + [25, 50), + [50, 75), + [75, 100), + [100, +∞) +``` + +For other primary key column type, MySQL CDC Source executes the statement in the form of `SELECT MAX(STR_ID) AS chunk_high FROM (SELECT * FROM TestTable WHERE STR_ID > 'uuid-001' limit 25)` to get the low and high value for each chunk, +the splitting chunks set would be like: + + ``` + (-∞, 'uuid-001'), + ['uuid-001', 'uuid-009'), + ['uuid-009', 'uuid-abc'), + ['uuid-abc', 'uuid-def'), + [uuid-def, +∞). +``` + +##### Chunk Reading Algorithm + +For above example `MyTable`, if the MySQL CDC Source parallelism was set to 4, MySQL CDC Source would run 4 readers which each executes **Offset Signal Algorithm** to +get a final consistent output of the snapshot chunk. 
The **Offset Signal Algorithm** simply describes as following: + + * (1) Record current binlog position as `LOW` offset + * (2) Read and buffer the snapshot chunk records by executing statement `SELECT * FROM MyTable WHERE id > chunk_low AND id <= chunk_high` + * (3) Record current binlog position as `HIGH` offset + * (4) Read the binlog records that belong to the snapshot chunk from `LOW` offset to `HIGH` offset + * (5) Upsert the read binlog records into the buffered chunk records, and emit all records in the buffer as final output (all as INSERT records) of the snapshot chunk + * (6) Continue to read and emit binlog records belong to the chunk after the `HIGH` offset in *single binlog reader*. + +The algorithm is inspired by [DBLog Paper](https://arxiv.org/pdf/2010.12597v1.pdf), please refer it for more detail. + +**Note:** If the actual values for the primary key are not uniformly distributed across its range, this may lead to unbalanced tasks when incremental snapshot read. + +### Exactly-Once Processing + +The MySQL CDC connector is a Flink Source connector which will read table snapshot chunks first and then continues to read binlog, +both snapshot phase and binlog phase, MySQL CDC connector read with **exactly-once processing** even failures happen. + +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for MySQL CDC consumer. The valid enumerations are: + +- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest binlog. +- `earliest-offset`: Skip snapshot phase and start reading binlog events from the earliest accessible binlog offset. +- `latest-offset`: Never to perform snapshot on the monitored database tables upon first startup, just read from +the end of the binlog which means only have the changes since the connector was started. +- `specific-offset`: Skip snapshot phase and start reading binlog events from a specific offset. The offset could be +specified with binlog filename and position, or a GTID set if GTID is enabled on server. +- `timestamp`: Skip snapshot phase and start reading binlog events from a specific timestamp. + +For example in DataStream API: +```java +MySQLSource.builder() + .startupOptions(StartupOptions.earliest()) // Start from earliest offset + .startupOptions(StartupOptions.latest()) // Start from latest offset + .startupOptions(StartupOptions.specificOffset("mysql-bin.000003", 4L) // Start from binlog file and offset + .startupOptions(StartupOptions.specificOffset("24DA167-0C0C-11E8-8442-00059A3C7B00:1-19")) // Start from GTID set + .startupOptions(StartupOptions.timestamp(1667232000000L) // Start from timestamp + ... + .build() +``` + +and with SQL: + +```SQL +CREATE TABLE mysql_source (...) 
WITH ( + 'connector' = 'mysql-cdc', + 'scan.startup.mode' = 'earliest-offset', -- Start from earliest offset + 'scan.startup.mode' = 'latest-offset', -- Start from latest offset + 'scan.startup.mode' = 'specific-offset', -- Start from specific offset + 'scan.startup.mode' = 'timestamp', -- Start from timestamp + 'scan.startup.specific-offset.file' = 'mysql-bin.000003', -- Binlog filename under specific offset startup mode + 'scan.startup.specific-offset.pos' = '4', -- Binlog position under specific offset mode + 'scan.startup.specific-offset.gtid-set' = '24DA167-0C0C-11E8-8442-00059A3C7B00:1-19', -- GTID set under specific offset startup mode + 'scan.startup.timestamp-millis' = '1667232000000' -- Timestamp under timestamp startup mode + ... +) +``` + +**Notes:** +1. MySQL source will print the current binlog position into logs with INFO level on checkpoint, with the prefix +"Binlog offset on checkpoint {checkpoint-id}". It could be useful if you want to restart the job from a specific checkpointed position. +2. If schema of capturing tables was changed previously, starting with earliest offset, specific offset or timestamp +could fail as the Debezium reader keeps the current latest table schema internally and earlier records with unmatched schema cannot be correctly parsed. + +### DataStream Source + +```java +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.mysql.source.MySqlSource; + +public class MySqlSourceExample { + public static void main(String[] args) throws Exception { + MySqlSource mySqlSource = MySqlSource.builder() + .hostname("yourHostname") + .port(yourPort) + .databaseList("yourDatabaseName") // set captured database, If you need to synchronize the whole database, Please set tableList to ".*". + .tableList("yourDatabaseName.yourTableName") // set captured table + .username("yourUsername") + .password("yourPassword") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + // enable checkpoint + env.enableCheckpointing(3000); + + env + .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source") + // set 4 parallel source tasks + .setParallelism(4) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute("Print MySQL Snapshot + Binlog"); + } +} +``` + +### Scan Newly Added Tables + +Scan Newly Added Tables feature enables you add new tables to monitor for existing running pipeline, the newly added tables will read theirs snapshot data firstly and then read their changelog automatically. + +Imaging this scenario: At the beginning, a Flink job monitor tables `[product, user, address]`, but after some days we would like the job can also monitor tables `[order, custom]` which contains history data, and we need the job can still reuse existing state of the job, this feature can resolve this case gracefully. + +The following operations show how to enable this feature to resolve above scenario. 
An existing Flink job which uses CDC Source like: + +```java + MySqlSource mySqlSource = MySqlSource.builder() + .hostname("yourHostname") + .port(yourPort) + .scanNewlyAddedTableEnabled(true) // enable scan the newly added tables feature + .databaseList("db") // set captured database + .tableList("db.product, db.user, db.address") // set captured tables [product, user, address] + .username("yourUsername") + .password("yourPassword") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + // your business code +``` + +If we would like to add new tables `[order, custom]` to an existing Flink job,just need to update the `tableList()` value of the job to include `[order, custom]` and restore the job from previous savepoint. + +_Step 1_: Stop the existing Flink job with savepoint. +```shell +$ ./bin/flink stop $Existing_Flink_JOB_ID +``` +```shell +Suspending job "cca7bc1061d61cf15238e92312c2fc20" with a savepoint. +Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab +``` +_Step 2_: Update the table list option for the existing Flink job . +1. update `tableList()` value. +2. build the jar of updated job. +```java + MySqlSource mySqlSource = MySqlSource.builder() + .hostname("yourHostname") + .port(yourPort) + .scanNewlyAddedTableEnabled(true) + .databaseList("db") + .tableList("db.product, db.user, db.address, db.order, db.custom") // set captured tables [product, user, address ,order, custom] + .username("yourUsername") + .password("yourPassword") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + // your business code +``` +_Step 3_: Restore the updated Flink job from savepoint. +```shell +$ ./bin/flink run \ + --detached \ + --fromSavepoint /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \ + ./FlinkCDCExample.jar +``` +**Note:** Please refer the doc [Restore the job from previous savepoint](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/deployment/cli/#command-line-interface) for more details. + +### Tables Without primary keys + +Starting from version 2.4.0, MySQL CDC support tables that do not have a primary key. To use a table without primary keys, you must configure the `scan.incremental.snapshot.chunk.key-column` option and specify one non-null field. + +There are two places that need to be taken care of. + +1. If there is an index in the table, try to use a column which is contained in the index in `scan.incremental.snapshot.chunk.key-column`. This will increase the speed of select statement. +2. The processing semantics of a MySQL CDC table without primary keys is determined based on the behavior of the column that are specified by the `scan.incremental.snapshot.chunk.key-column`. + * If no update operation is performed on the specified column, the exactly-once semantics is ensured. + * If the update operation is performed on the specified column, only the at-least-once semantics is ensured. However, you can specify primary keys at downstream and perform the idempotence operation to ensure data correctness. 
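+As a hedged illustration of the `scan.incremental.snapshot.chunk.key-column` option described above, the sketch below declares a table without a primary key and selects a non-null column as the chunk key. The column name `order_uid` and the connection values are hypothetical placeholders, not values from this document.
+
+```sql
+CREATE TABLE orders_without_pk (
+  order_uid BIGINT NOT NULL,   -- hypothetical non-null (ideally indexed) column used for chunk splitting
+  customer_name STRING,
+  price DECIMAL(10, 5)
+) WITH (
+  'connector' = 'mysql-cdc',
+  'hostname' = 'localhost',
+  'port' = '3306',
+  'username' = 'root',
+  'password' = '123456',
+  'database-name' = 'mydb',
+  'table-name' = 'orders_without_pk',
+  'scan.incremental.snapshot.chunk.key-column' = 'order_uid'
+);
+```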
+ +### About converting binary type data to base64 encoded data + +```sql +CREATE TABLE products ( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + binary_data STRING, + PRIMARY KEY(order_id) NOT ENFORCED +) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'test_db', + 'table-name' = 'test_tb', + 'debezium.binary.handling.mode' = 'base64' +); +``` + +`binary_data` field in the database is of type VARBINARY(N). In some scenarios, we need to convert binary data to base64 encoded string data. This feature can be enabled by adding the parameter 'debezium.binary.handling.mode'='base64', +By adding this parameter, we can map the binary field type to 'STRING' in Flink SQL, thereby obtaining base64 encoded string data. + +Data Type Mapping +---------------- + +
+| MySQL type | Flink SQL type | NOTE |
+|------------|----------------|------|
+| TINYINT | TINYINT | |
+| SMALLINT<br>TINYINT UNSIGNED<br>TINYINT UNSIGNED ZEROFILL | SMALLINT | |
+| INT<br>MEDIUMINT<br>SMALLINT UNSIGNED<br>SMALLINT UNSIGNED ZEROFILL | INT | |
+| BIGINT<br>INT UNSIGNED<br>INT UNSIGNED ZEROFILL<br>MEDIUMINT UNSIGNED<br>MEDIUMINT UNSIGNED ZEROFILL | BIGINT | |
+| BIGINT UNSIGNED<br>BIGINT UNSIGNED ZEROFILL<br>SERIAL | DECIMAL(20, 0) | |
+| FLOAT<br>FLOAT UNSIGNED<br>FLOAT UNSIGNED ZEROFILL | FLOAT | |
+| REAL<br>REAL UNSIGNED<br>REAL UNSIGNED ZEROFILL<br>DOUBLE<br>DOUBLE UNSIGNED<br>DOUBLE UNSIGNED ZEROFILL<br>DOUBLE PRECISION<br>DOUBLE PRECISION UNSIGNED<br>DOUBLE PRECISION UNSIGNED ZEROFILL | DOUBLE | |
+| NUMERIC(p, s)<br>NUMERIC(p, s) UNSIGNED<br>NUMERIC(p, s) UNSIGNED ZEROFILL<br>DECIMAL(p, s)<br>DECIMAL(p, s) UNSIGNED<br>DECIMAL(p, s) UNSIGNED ZEROFILL<br>FIXED(p, s)<br>FIXED(p, s) UNSIGNED<br>FIXED(p, s) UNSIGNED ZEROFILL<br>where p <= 38 | DECIMAL(p, s) | |
+| NUMERIC(p, s)<br>NUMERIC(p, s) UNSIGNED<br>NUMERIC(p, s) UNSIGNED ZEROFILL<br>DECIMAL(p, s)<br>DECIMAL(p, s) UNSIGNED<br>DECIMAL(p, s) UNSIGNED ZEROFILL<br>FIXED(p, s)<br>FIXED(p, s) UNSIGNED<br>FIXED(p, s) UNSIGNED ZEROFILL<br>where 38 < p <= 65 | STRING | The precision for the DECIMAL data type is up to 65 in MySQL, but the precision for DECIMAL is limited to 38 in Flink. So if you define a decimal column whose precision is greater than 38, you should map it to STRING to avoid precision loss. |
+| BOOLEAN<br>TINYINT(1)<br>BIT(1) | BOOLEAN | |
+| DATE | DATE | |
+| TIME [(p)] | TIME [(p)] | |
+| TIMESTAMP [(p)]<br>DATETIME [(p)] | TIMESTAMP [(p)] | |
+| CHAR(n) | CHAR(n) | |
+| VARCHAR(n) | VARCHAR(n) | |
+| BIT(n) | BINARY(⌈(n + 7) / 8⌉) | |
+| BINARY(n) | BINARY(n) | |
+| VARBINARY(N) | VARBINARY(N) | |
+| TINYTEXT<br>TEXT<br>MEDIUMTEXT<br>LONGTEXT | STRING | |
+| TINYBLOB<br>BLOB<br>MEDIUMBLOB<br>LONGBLOB | BYTES | Currently, for the BLOB data type in MySQL, only blobs whose length isn't greater than 2,147,483,647 (2 ** 31 - 1) are supported. |
+| YEAR | INT | |
+| ENUM | STRING | |
+| JSON | STRING | The JSON data type will be converted into STRING with JSON format in Flink. |
+| SET | ARRAY&lt;STRING&gt; | As the SET data type in MySQL is a string object that can have zero or more values, it should always be mapped to an array of string. |
+| GEOMETRY<br>POINT<br>LINESTRING<br>POLYGON<br>MULTIPOINT<br>MULTILINESTRING<br>MULTIPOLYGON<br>GEOMETRYCOLLECTION | STRING | The spatial data types in MySQL will be converted into STRING with a fixed Json format. Please see the MySQL Spatial Data Types Mapping section for more detailed information. |
    + +### MySQL Spatial Data Types Mapping +The spatial data types except for `GEOMETRYCOLLECTION` in MySQL will be converted into Json String with a fixed format like:
+```json
+{"srid": 0 , "type": "xxx", "coordinates": [0, 0]}
+```
+The field `srid` identifies the SRS in which the geometry is defined. SRID 0 is the default for new geometry values if no SRID is specified.
+As only MySQL 8+ supports specifying an SRID when defining a spatial data type, the field `srid` will always be 0 for lower MySQL versions.
+
+The field `type` identifies the spatial data type, such as `POINT`/`LINESTRING`/`POLYGON`.
+
+The field `coordinates` represents the `coordinates` of the spatial data.
+
+For `GEOMETRYCOLLECTION`, it will be converted into a Json String with a fixed format like:
+```json
+{"srid": 0 , "type": "GeometryCollection", "geometries": [{"type":"Point","coordinates":[10,10]}]}
+```
+
+The field `geometries` is an array that contains all spatial data.
+
+The example mapping for different spatial data types is as follows:
+| Spatial data in MySQL | Json String converted in Flink |
+|-----------------------|--------------------------------|
+| POINT(1 1) | {"coordinates":[1,1],"type":"Point","srid":0} |
+| LINESTRING(3 0, 3 3, 3 5) | {"coordinates":[[3,0],[3,3],[3,5]],"type":"LineString","srid":0} |
+| POLYGON((1 1, 2 1, 2 2, 1 2, 1 1)) | {"coordinates":[[[1,1],[2,1],[2,2],[1,2],[1,1]]],"type":"Polygon","srid":0} |
+| MULTIPOINT((1 1),(2 2)) | {"coordinates":[[1,1],[2,2]],"type":"MultiPoint","srid":0} |
+| MultiLineString((1 1,2 2,3 3),(4 4,5 5)) | {"coordinates":[[[1,1],[2,2],[3,3]],[[4,4],[5,5]]],"type":"MultiLineString","srid":0} |
+| MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((5 5, 7 5, 7 7, 5 7, 5 5))) | {"coordinates":[[[[0,0],[10,0],[10,10],[0,10],[0,0]]],[[[5,5],[7,5],[7,7],[5,7],[5,5]]]],"type":"MultiPolygon","srid":0} |
+| GEOMETRYCOLLECTION(POINT(10 10), POINT(30 30), LINESTRING(15 15, 20 20)) | {"geometries":[{"type":"Point","coordinates":[10,10]},{"type":"Point","coordinates":[30,30]},{"type":"LineString","coordinates":[[15,15],[20,20]]}],"type":"GeometryCollection","srid":0} |
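+Because spatial columns arrive in Flink as STRING values containing the JSON shown above, they can be unpacked with Flink's SQL JSON functions. The following is a minimal sketch, assuming a hypothetical `location` column of MySQL type POINT mapped to STRING, and a Flink version that provides `JSON_VALUE` (1.15 or later):
+
+```sql
+-- Extract the geometry type and the SRID from the converted JSON string.
+SELECT
+  JSON_VALUE(location, '$.type') AS geometry_type,
+  CAST(JSON_VALUE(location, '$.srid') AS INT) AS srid
+FROM mysql_source_table;  -- hypothetical table defined with the mysql-cdc connector
+```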
    + +{{< top >}} diff --git a/docs/content/connectors/oceanbase-cdc(ZH).md b/docs/content/docs/connectors/cdc-connectors/oceanbase-cdc.md similarity index 55% rename from docs/content/connectors/oceanbase-cdc(ZH).md rename to docs/content/docs/connectors/cdc-connectors/oceanbase-cdc.md index c8269b485e..1e7887e8a0 100644 --- a/docs/content/connectors/oceanbase-cdc(ZH).md +++ b/docs/content/docs/connectors/cdc-connectors/oceanbase-cdc.md @@ -1,3 +1,10 @@ +--- +title: "OceanBase CDC Connector" +weight: 4 +type: docs +aliases: +- /connectors/cdc-connectors/oceanbase-cdc.html +--- -# OceanBase CDC 连接器 +# OceanBase CDC Connector -OceanBase CDC 连接器允许从 OceanBase 读取快照数据和增量数据。本文介绍了如何设置 OceanBase CDC 连接器以对 OceanBase 进行 SQL 查询。 +The OceanBase CDC connector allows for reading snapshot data and incremental data from OceanBase. This document describes how to set up the OceanBase CDC connector to run SQL queries against OceanBase. -## 依赖 +Dependencies +------------ -为了使用 OceanBase CDC 连接器,您必须提供相关的依赖信息。以下依赖信息适用于使用自动构建工具(如 Maven 或 SBT)构建的项目和带有 SQL JAR 包的 SQL 客户端。 +In order to set up the OceanBase CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. -```xml - - com.ververica - flink-connector-oceanbase-cdc - - 3.0-SNAPSHOT - -``` +{{< artifact flink-connector-oceanbase-cdc >}} -如果您是要连接企业版的 OceanBase,您可能需要使用 OceanBase 官方的 JDBC 驱动,这时需要引入如下依赖。 +If you want to use OceanBase JDBC driver to connect to the enterprise edition database, you should also include the following dependency in your class path. ```xml @@ -44,51 +45,57 @@ OceanBase CDC 连接器允许从 OceanBase 读取快照数据和增量数据。 ``` -## 下载 SQL 客户端 JAR 包 +### SQL Client JAR -```下载链接仅在已发布版本可用,请在文档网站左下角选择浏览已发布的版本。``` +```Download link is available only for stable releases.``` -下载[flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/2.5-SNAPSHOT/flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar) 到 `/lib/` 目录下。 +Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. -**注意:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT 版本是开发分支`release-XXX`对应的快照版本,快照版本用户需要下载源代码并编译相应的 jar。用户应使用已经发布的版本,例如 [flink-sql-connector-oceanbase-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oceanbase-cdc) 当前已发布的所有版本都可以在 Maven 中央仓库获取。 +**Note:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oceanbase-cdc), the released version will be available in the Maven central warehouse. -对于 JDBC 驱动,上述的 cdc jar 文件中已经包含了我们推荐的 MySQL 驱动版本 5.1.47。由于开源许可证的原因,我们不能在上述 cdc jar 文件中包含 OceanBase 的官方 JDBC 驱动,如果您需要使用它,可以从[这里](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar)下载,然后放到 `/lib/` 目录下,同时需要将配置项 `jdbc.driver` 设为 `com.oceanbase.jdbc.Driver`。 +For JDBC driver, the cdc jar above already contains MySQL JDBC driver 5.1.47, which is our recommended version. Due to the license issue, we can not include the OceanBase JDBC driver in the cdc jar. 
If you need to use it, you can download it from [here](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar) and put it under `/lib/`, you also need to set the start option `jdbc.driver` to `com.oceanbase.jdbc.Driver`. -### 配置 OceanBase 数据库和 oblogproxy 服务 +Setup OceanBase and LogProxy Server +---------------------- -1. 按照 [文档](https://github.com/oceanbase/oceanbase#quick-start) 配置 OceanBase 集群。 -2. 在 sys 租户中,为 oblogproxy 创建一个带密码的用户。 +1. Set up the OceanBase cluster following the [doc](https://github.com/oceanbase/oceanbase#quick-start). - ```bash - mysql -h${host} -P${port} -uroot - mysql> SHOW TENANT; - mysql> CREATE USER ${sys_username} IDENTIFIED BY '${sys_password}'; - mysql> GRANT ALL PRIVILEGES ON *.* TO ${sys_username} WITH GRANT OPTION; - ``` +2. Create a user with password in `sys` tenant, this user is used in OceanBase LogProxy. + + ```shell + mysql -h${host} -P${port} -uroot + + mysql> SHOW TENANT; + mysql> CREATE USER ${sys_username} IDENTIFIED BY '${sys_password}'; + mysql> GRANT ALL PRIVILEGES ON *.* TO ${sys_username} WITH GRANT OPTION; + ``` -3. 为你想要监控的租户创建一个用户,这个用户用来读取快照数据和变化事件数据。 -4. OceanBase 社区版用户需要获取`rootserver-list`,可以使用以下命令获取: +3. Create a user in the tenant you want to monitor, this is used to read data for snapshot and change event. - ```bash - mysql> SHOW PARAMETERS LIKE 'rootservice_list'; +4. For users of OceanBase Community Edition, you need to get the `rootserver-list`. You can use the following command to get the value: + + ```shell + mysql> show parameters like 'rootservice_list'; ``` - OceanBase 企业版用户需要获取 `config-url`,可以使用以下命令获取: + + For users of OceanBase Enterprise Edition, you need to get the `config-url`. You can use the following command to get the value: ```shell mysql> show parameters like 'obconfig_url'; ``` -5. 按照 [文档](https://github.com/oceanbase/oblogproxy#getting-started) 配置 oblogproxy。 +5. Setup OceanBase LogProxy. For users of OceanBase Community Edition, you can follow the [quick start](https://github.com/oceanbase/oblogproxy#getting-started). 
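For step 3 above, a minimal sketch of creating such a user in a MySQL-mode tenant is shown below; the user name, password and database are placeholders, and the exact grants should follow your own security policy:

```sql
-- connect to the business tenant (not the sys tenant) and create the user
-- that the connector will use to read snapshot data and change events
CREATE USER 'flinkuser' IDENTIFIED BY 'flinkpw';
GRANT SELECT ON test_db.* TO 'flinkuser';
```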
-## 创建 OceanBase CDC 表 +How to create a OceanBase CDC table +---------------- -使用以下命令,创建 OceanBase CDC 表: +The OceanBase CDC table can be defined as following: ```sql --- 每 3 秒做一次 checkpoint,用于测试,生产配置建议 5 到 10 分钟 +-- checkpoint every 3000 milliseconds Flink SQL> SET 'execution.checkpointing.interval' = '3s'; --- 在 Flink SQL 中创建 OceanBase 表 `orders` +-- register a OceanBase table 'orders' in Flink SQL Flink SQL> CREATE TABLE orders ( order_id INT, order_date TIMESTAMP(0), @@ -113,11 +120,11 @@ Flink SQL> CREATE TABLE orders ( 'working-mode' = 'memory' ); --- 从表 orders 中读取快照数据和 binlog 数据 +-- read snapshot and binlogs from orders table Flink SQL> SELECT * FROM orders; ``` -如果您使用的是企业版的 OceanBase Oracle 模式,您需要先添加 OceanBase 的官方 JDBC 驱动 jar 包到 Flink 环境,并且部署企业版的 oblogproxy 服务,然后通过以下命令创建 OceanBase CDC 表: +If you want to use OceanBase Oracle mode, you need to add the OceanBase jdbc jar file to Flink and set up the enterprise edition of oblogproxy, then you can create a table in Flink as following: ```sql Flink SQL> CREATE TABLE orders ( @@ -147,237 +154,235 @@ Flink SQL> CREATE TABLE orders ( ); ``` -您也可以访问 Flink CDC 官网文档,快速体验将数据从 OceanBase 导入到 Elasticsearch。更多信息,参考 [Flink CDC 官网文档](https://ververica.github.io/flink-cdc-connectors/release-2.2/content/%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B/oceanbase-tutorial-zh.html)。 +You can also try the quickstart tutorial that sync data from OceanBase to Elasticsearch, please refer [Flink CDC Tutorial](https://ververica.github.io/flink-cdc-connectors/release-2.3//content/quickstart/oceanbase-tutorial.html) for more information. -## OceanBase CDC 连接器选项 +Connector Options +---------------- -OceanBase CDC 连接器包括用于 SQL 和 DataStream API 的选项,如下表所示。 +The OceanBase CDC Connector contains some options for both sql and stream api as the following sheet. -*注意*:连接器支持两种方式来指定需要监听的表,两种方式同时使用时会监听两种方式匹配的所有表。 -1. 使用 `database-name` 和 `table-name` 匹配正则表达式中的数据库和表名。 由于`obcdc`(以前的`liboblog`)现在只支持`fnmatch`匹配,我们不能直接使用正则过滤 changelog 事件,所以通过两个选项去匹配去指定监听表只能在`initial`启动模式下使用。 -2. 使用 `table-list` 去匹配数据库名和表名的准确列表。 +*Note*: The connector supports two ways to specify the table list to listen to, and will get the union of the results when both way are used at the same time. +1. Use `database-name` and `table-name` to match database and table names in regex. As the `obcdc` (former `liboblog`) only supports `fnmatch` now, we can't use regex directly to filter change events, so these two options can only be used in `initial` startup mode. +2. Use `table-list` to match the exact value of database and table names.
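A sketch of the `table-list` style could look like the following, assuming two sharded tables `test_db.orders_1` and `test_db.orders_2` that share the same schema; the column list and all connection values are placeholders:

```sql
Flink SQL> CREATE TABLE orders_all (
    order_id INT,
    order_date TIMESTAMP(0),
    customer_name STRING,
    price DECIMAL(10, 5),
    PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
    'connector' = 'oceanbase-cdc',
    'scan.startup.mode' = 'initial',
    'username' = 'user@test_tenant',
    'password' = 'pswd',
    'tenant-name' = 'test_tenant',
    -- exact, comma-separated list of fully qualified table names (no regex)
    'table-list' = 'test_db.orders_1, test_db.orders_2',
    'hostname' = '127.0.0.1',
    'port' = '2881',
    'rootserver-list' = '127.0.0.1:2882:2881',
    'logproxy.host' = '127.0.0.1',
    'logproxy.port' = '2983',
    'working-mode' = 'memory'
);
```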
    - - - - - + + + + + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - + - + - + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - + - + - + - + - + - + - + - - + + - + - - + + - +
    配置项是否必选默认值类型描述OptionRequiredDefaultTypeDescription
    connectorrequired(none) String指定要使用的连接器,此处为 'oceanbase-cdc'Specify what connector to use, here should be 'oceanbase-cdc'.
    scan.startup.moderequired(none) String指定 OceanBase CDC 消费者的启动模式。可取值为'initial','latest-offset' or - 'timestamp'Specify the startup mode for OceanBase CDC consumer, valid enumerations are + 'initial','latest-offset' or 'timestamp'. +
    scan.startup.timestampoptional(none) Long起始点的时间戳,单位为秒。仅在启动模式为 'timestamp' 时可用。Timestamp in seconds of the start point, only used for 'timestamp' startup mode.
    usernamerequired(none) String连接 OceanBase 数据库的用户的名称。Username to be used when connecting to OceanBase.
    passwordrequired(none) String连接 OceanBase 数据库时使用的密码。Password to be used when connecting to OceanBase.
    tenant-namerequired(none) String待监控 OceanBase 数据库的租户名,应该填入精确值。Tenant name of OceanBase to monitor, should be exact value.
    database-nameoptional(none) String待监控 OceanBase 数据库的数据库名,应该是正则表达式,该选项只支持和 'initial' 模式一起使用。Database name of OceanBase to monitor, should be regular expression. Only can be used with 'initial' mode.
    table-nameoptional(none) String待监控 OceanBase 数据库的表名,应该是正则表达式,该选项只支持和 'initial' 模式一起使用。Table name of OceanBase to monitor, should be regular expression. Only can be used with 'initial' mode.
    table-listoptional(none) String待监控 OceanBase 数据库的全路径的表名列表,逗号分隔,如:"db1.table1, db2.table2"。List of full names of tables, separated by commas, e.g. "db1.table1, db2.table2".
    hostnameoptional(none) StringOceanBase 数据库或 OceanBbase 代理 ODP 的 IP 地址或主机名。IP address or hostname of the OceanBase database server or OceanBase Proxy server.
    portoptional(none) Integer - OceanBase 数据库服务器的整数端口号。可以是 OceanBase 服务器的 SQL 端口号(默认值为 2881)
    - 或 OceanBase代理服务的端口号(默认值为 2883)
    Integer port number to connect to OceanBase. It can be the SQL port of OceanBase server, which is 2881 by default, or the port of OceanBase proxy service, which is 2883 by default.
    connect.timeoutoptional 30s Duration连接器在尝试连接到 OceanBase 数据库服务器超时前的最长时间。The maximum time that the connector should wait after trying to connect to the OceanBase database server before timing out.
    server-time-zoneoptional +00:00 String - 数据库服务器中的会话时区,用户控制 OceanBase 的时间类型如何转换为 STRING。
    - 合法的值可以是格式为"±hh:mm"的 UTC 时区偏移量,
    - 如果 mysql 数据库中的时区信息表已创建,合法的值则可以是创建的时区。 -
    The session timezone which controls how temporal types are converted to STRING in OceanBase. Can be UTC offset in format "±hh:mm", or named time zones if the time zone information tables in the mysql database have been created and populated.
    logproxy.hostrequired(none) StringOceanBase 日志代理服务 的 IP 地址或主机名。Hostname or IP address of OceanBase log proxy service.
    logproxy.portrequired(none) IntegerOceanBase 日志代理服务 的端口号。Port number of OceanBase log proxy service.
    logproxy.client.id规则生成optionalBy rule. StringOceanBase日志代理服务的客户端连接 ID,默认值的生成规则是 {flink_ip}_{process_id}_{timestamp}_{thread_id}_{tenant}。Id of a log proxy client connection, will be in format {flink_ip}_{process_id}_{timestamp}_{thread_id}_{tenant} by default.
    rootserver-listoptional(none) StringOceanBase root 服务器列表,服务器格式为 `ip:rpc_port:sql_port`,
    多个服务器地址使用英文分号 `;` 隔开,OceanBase 社区版本必填。
    The semicolon-separated list of OceanBase root servers in format `ip:rpc_port:sql_port`, required for OceanBase CE.
    config-urloptional(none) String从配置服务器获取服务器信息的 url, OceanBase 企业版本必填。The url to get the server info from the config server, required for OceanBase EE.
    working-modeoptional storage String日志代理中 `libobcdc` 的工作模式 , 可以是 `storage` 或 `memory`。Working mode of `obcdc` in LogProxy, can be `storage` or `memory`.
    compatible-modeoptional mysql StringOceanBase 的兼容模式,可以是 `mysql` 或 `oracle`。Compatible mode of OceanBase, can be `mysql` or `oracle`.
    jdbc.driveroptional com.mysql.jdbc.Driver String全量读取时使用的 jdbc 驱动类名。JDBC driver class for snapshot reading.
    jdbc.properties.*optional(none) String传递自定义 JDBC URL 属性的选项。用户可以传递自定义属性,如 'jdbc.properties.useSSL' = 'false'。Option to pass custom JDBC URL properties. User can pass custom properties like 'jdbc.properties.useSSL' = 'false'.
    obcdc.properties.*optional(none) String传递参数到libobcdc,如 'obcdc.properties.sort_trans_participants' = '1'。更多参数信息见 obcdc 配置项说明Option to pass custom configurations to the libobcdc, eg: 'obcdc.properties.sort_trans_participants' = '1'. Please refer to obcdc parameters for more details.
    -## 支持的元数据 +Available Metadata +---------------- -在创建表时,您可以使用以下格式的元数据作为只读列(VIRTUAL)。 +The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition. - - - + + + - + - + - + - +
    列名数据类型描述KeyDataTypeDescription
    tenant_name STRING NOT NULL当前记录所属的租户名称。Name of the tenant that contains the row.
    database_name STRING NOT NULL当前记录所属的库名。Name of the database that contains the row.
    table_name STRING NOT NULL当前记录所属的表名称。Name of the table that contains the row.
    op_ts TIMESTAMP_LTZ(3) NOT NULL该值表示此修改在数据库中发生的时间。如果这条记录是该表在快照阶段读取的记录,则该值返回 0。It indicates the time that the change was made in the database.
    + If the record is read from snapshot of the table instead of the change stream, the value is always 0.
    -如下 SQL 展示了如何在表中使用这些元数据列: +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: ```sql CREATE TABLE products ( @@ -404,32 +409,35 @@ CREATE TABLE products ( 'port' = '2881', 'rootserver-list' = '127.0.0.1:2882:2881', 'logproxy.host' = '127.0.0.1', - 'logproxy.port' = '2983'); + 'logproxy.port' = '2983', + 'working-mode' = 'memory' +); ``` -## 特性 +Features +-------- -### At-Least-Once 处理 +### At-Least-Once Processing -OceanBase CDC 连接器是一个 Flink Source 连接器。它将首先读取数据库快照,然后再读取变化事件,并进行 **At-Least-Once 处理**。 +The OceanBase CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change events with **at-least-once processing**. -OceanBase 数据库是一个分布式数据库,它的日志也分散在不同的服务器上。由于没有类似 MySQL binlog 偏移量的位置信息,OceanBase 数据库用时间戳作为位置标记。为确保读取完整的数据,liboblog(读取 OceanBase 日志记录的 C++ 库)可能会在给定的时间戳之前读取一些日志数据。因此,OceanBase 数据库可能会读到起始点附近时间戳的重复数据,可保证 **At-Least-Once 处理**。 +OceanBase is a kind of distributed database whose log files are distributed on different servers. As there is no position information like MySQL binlog offset, we can only use timestamp as the position mark. In order to ensure the completeness of reading data, `liboblog` (a C++ library to read OceanBase log record) might read some log data before the given timestamp. So in this way we may read duplicate data whose timestamp is around the start point, and only 'at-least-once' can be guaranteed. -### 启动模式 +### Startup Reading Position -配置选项 `scan.startup.mode` 指定 OceanBase CDC 连接器的启动模式。可用取值包括: +The config option `scan.startup.mode` specifies the startup mode for OceanBase CDC consumer. The valid enumerations are: -- `initial`(默认):在首次启动时对受监视的数据库表执行初始快照,并继续读取最新的提交日志。 -- `latest-offset`:首次启动时,不对受监视的数据库表执行快照,仅从连接器启动时读取提交日志。 -- `timestamp`:在首次启动时不对受监视的数据库表执行初始快照,仅从指定的 `scan.startup.timestamp` 读取最新的提交日志。 +- `initial`: Performs an initial snapshot on the monitored table upon first startup, and continue to read the latest commit log. +- `latest-offset`: Never to perform snapshot on the monitored table upon first startup and just read the latest commit log since the connector is started. +- `timestamp`: Never to perform snapshot on the monitored table upon first startup and just read the commit log from the given `scan.startup.timestamp`. -### 消费提交日志 +### Consume Commit Log -OceanBase CDC 连接器使用 [oblogclient](https://github.com/oceanbase/oblogclient) 消费 OceanBase日志代理服务 中的事务日志。 +The OceanBase CDC Connector using [oblogclient](https://github.com/oceanbase/oblogclient) to consume commit log from OceanBase LogProxy. ### DataStream Source -OceanBase CDC 连接器也可以作为 DataStream Source 使用。您可以按照如下创建一个 SourceFunction: +The OceanBase CDC connector can also be a DataStream source. You can create a SourceFunction as the following shows: ```java import org.apache.flink.api.common.typeinfo.TypeInformation; @@ -503,27 +511,25 @@ public class OceanBaseSourceExample { } } ``` +Data Type Mapping +---------------- -## 数据类型映射 - -### Mysql 模式 +### Mysql Mode
    - - - + + + - + BIT(1) @@ -535,8 +541,7 @@ public class OceanBaseSourceExample { + TINYINT UNSIGNED @@ -544,16 +549,14 @@ public class OceanBaseSourceExample { + SMALLINT UNSIGNED + INT UNSIGNED @@ -565,7 +568,7 @@ public class OceanBaseSourceExample { @@ -581,7 +584,8 @@ public class OceanBaseSourceExample { + where p <= 38
    + @@ -589,13 +593,13 @@ public class OceanBaseSourceExample { - - + + @@ -629,7 +633,7 @@ public class OceanBaseSourceExample { - + @@ -647,7 +651,7 @@ public class OceanBaseSourceExample { TINYTEXT
    TEXT
    MEDIUMTEXT
    - LONGTEXT + LONGTEXT
    @@ -657,7 +661,7 @@ public class OceanBaseSourceExample { TINYBLOB
    BLOB
    MEDIUMBLOB
    - LONGBLOB + LONGBLOB
    @@ -675,21 +679,18 @@ public class OceanBaseSourceExample { - + - +
    OceanBase 数据类型Flink SQL 类型描述OceanBase typeFlink SQL typeNOTE
    - BOOLEAN
    +
    BOOLEAN
    TINYINT(1)
    - BIT(1) -
    BOOLEAN
    SMALLINT
    - TINYINT UNSIGNED -
    SMALLINT
    INT
    MEDIUMINT
    - SMALLINT UNSIGNED -
    INT
    BIGINT
    - INT UNSIGNED -
    BIGINT
    REAL
    - FLOAT + FLOAT
    FLOAT NUMERIC(p, s)
    DECIMAL(p, s)
    - where p <= 38
    DECIMAL(p, s)
    NUMERIC(p, s)
    DECIMAL(p, s)
    - where 38 < p <=65
    STRING - DECIMAL 等同于 NUMERIC。在 OceanBase 数据库中,DECIMAL 数据类型的精度最高为 65。
    - 但在 Flink 中,DECIMAL 的最高精度为 38。因此,
    - 如果你定义了一个精度大于 38 的 DECIMAL 列,你应当将其映射为 STRING,以避免精度损失。 + where 38 < p <=65
    STRINGDECIMAL is equivalent to NUMERIC. The precision for DECIMAL data type is up to 65 in OceanBase, but + the precision for DECIMAL is limited to 38 in Flink. + So if you define a decimal column whose precision is greater than 38, you should map it to STRING to + avoid precision loss.
    DATE
    BIT(n)BINARY(⌈n/8⌉)BINARY(⌈(n + 7) / 8⌉)
    STRING BYTES
    SET ARRAY<STRING> - 因为 OceanBase 的 SET 类型是用包含一个或多个值的字符串对象表示,
    - 所以映射到 Flink 时是一个字符串数组 -
    As the SET data type in OceanBase is a string object that can have zero or more values, it should always be mapped to an array of strings.
    JSON STRINGJSON 类型的数据在 Flink 中会转化为 JSON 格式的字符串The JSON data type will be converted into STRING with JSON format in Flink.
    -### Oracle 模式 +### Oracle Mode
    @@ -785,3 +786,5 @@ public class OceanBaseSourceExample {
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/oracle-cdc.md b/docs/content/docs/connectors/cdc-connectors/oracle-cdc.md new file mode 100644 index 0000000000..76c24f1b59 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/oracle-cdc.md @@ -0,0 +1,701 @@ +--- +title: "Oracle CDC Connector" +weight: 5 +type: docs +aliases: +- /connectors/cdc-connectors/oracle-cdc.html +--- + + +# Oracle CDC Connector + +The Oracle CDC connector allows for reading snapshot data and incremental data from Oracle database. This document describes how to setup the Oracle CDC connector to run SQL queries against Oracle databases. + +Dependencies +------------ + +In order to setup the Oracle CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-oracle-cdc >}} + +### SQL Client JAR + +**Download link is available only for stable releases.** + +Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. + +**Note:** flink-sql-connector-oracle-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oracle-cdc), the released version will be available in the Maven central warehouse. + +Setup Oracle +---------------- +You have to enable log archiving for Oracle database and define an Oracle user with appropriate permissions on all databases that the Debezium Oracle connector monitors. + +### For Non-CDB database + +1. Enable log archiving + + (1.1). Connect to the database as DBA + ```sql + ORACLE_SID=SID + export ORACLE_SID + sqlplus /nolog + CONNECT sys/password AS SYSDBA + ``` + + (1.2). Enable log archiving + ```sql + alter system set db_recovery_file_dest_size = 10G; + alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; + shutdown immediate; + startup mount; + alter database archivelog; + alter database open; + ``` + **Note:** + + - Enable log archiving requires database restart, pay attention when try to do it + - The archived logs will occupy a large amount of disk space, so consider clean the expired logs the periodically + + (1.3). Check whether log archiving is enabled + ```sql + -- Should now "Database log mode: Archive Mode" + archive log list; + ``` + **Note:** + + Supplemental logging must be enabled for captured tables or the database in order for data changes to capture the before state of changed database rows. + The following illustrates how to configure this on the table/database level. + ```sql + -- Enable supplemental logging for a specific table: + ALTER TABLE inventory.customers ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; + ``` + ```sql + -- Enable supplemental logging for database + ALTER DATABASE ADD SUPPLEMENTAL LOG DATA; + ``` + +2. Create an Oracle user with permissions + + (2.1). Create Tablespace + ```sql + sqlplus sys/password@host:port/SID AS SYSDBA; + CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/SID/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; + exit; + ``` + + (2.2). 
Create a user and grant permissions + ```sql + sqlplus sys/password@host:port/SID AS SYSDBA; + CREATE USER flinkuser IDENTIFIED BY flinkpw DEFAULT TABLESPACE LOGMINER_TBS QUOTA UNLIMITED ON LOGMINER_TBS; + GRANT CREATE SESSION TO flinkuser; + GRANT SET CONTAINER TO flinkuser; + GRANT SELECT ON V_$DATABASE to flinkuser; + GRANT FLASHBACK ANY TABLE TO flinkuser; + GRANT SELECT ANY TABLE TO flinkuser; + GRANT SELECT_CATALOG_ROLE TO flinkuser; + GRANT EXECUTE_CATALOG_ROLE TO flinkuser; + GRANT SELECT ANY TRANSACTION TO flinkuser; + GRANT LOGMINING TO flinkuser; + GRANT ANALYZE ANY TO flinkuser; + + GRANT CREATE TABLE TO flinkuser; + -- need not to execute if set scan.incremental.snapshot.enabled=true(default) + GRANT LOCK ANY TABLE TO flinkuser; + GRANT ALTER ANY TABLE TO flinkuser; + GRANT CREATE SEQUENCE TO flinkuser; + + GRANT EXECUTE ON DBMS_LOGMNR TO flinkuser; + GRANT EXECUTE ON DBMS_LOGMNR_D TO flinkuser; + + GRANT SELECT ON V_$LOG TO flinkuser; + GRANT SELECT ON V_$LOG_HISTORY TO flinkuser; + GRANT SELECT ON V_$LOGMNR_LOGS TO flinkuser; + GRANT SELECT ON V_$LOGMNR_CONTENTS TO flinkuser; + GRANT SELECT ON V_$LOGMNR_PARAMETERS TO flinkuser; + GRANT SELECT ON V_$LOGFILE TO flinkuser; + GRANT SELECT ON V_$ARCHIVED_LOG TO flinkuser; + GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO flinkuser; + exit; + ``` + +### For CDB database + +Overall, the steps for configuring CDB database is quite similar to non-CDB database, but the commands may be different. +1. Enable log archiving + ```sql + ORACLE_SID=ORCLCDB + export ORACLE_SID + sqlplus /nolog + CONNECT sys/password AS SYSDBA + alter system set db_recovery_file_dest_size = 10G; + -- should exist + alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; + shutdown immediate + startup mount + alter database archivelog; + alter database open; + -- Should show "Database log mode: Archive Mode" + archive log list + exit; + ``` + **Note:** + You can also use the following commands to enable supplemental logging: + ```sql + -- Enable supplemental logging for a specific table: + ALTER TABLE inventory.customers ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; + -- Enable supplemental logging for database + ALTER DATABASE ADD SUPPLEMENTAL LOG DATA; + ``` + +2. 
Create an Oracle user with permissions + ```sql + sqlplus sys/password@//localhost:1521/ORCLCDB as sysdba + CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; + exit + ``` + ```sql + sqlplus sys/password@//localhost:1521/ORCLPDB1 as sysdba + CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/ORCLPDB1/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; + exit + ``` + ```sql + sqlplus sys/password@//localhost:1521/ORCLCDB as sysdba + CREATE USER flinkuser IDENTIFIED BY flinkpw DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs CONTAINER=ALL; + GRANT CREATE SESSION TO flinkuser CONTAINER=ALL; + GRANT SET CONTAINER TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$DATABASE to flinkuser CONTAINER=ALL; + GRANT FLASHBACK ANY TABLE TO flinkuser CONTAINER=ALL; + GRANT SELECT ANY TABLE TO flinkuser CONTAINER=ALL; + GRANT SELECT_CATALOG_ROLE TO flinkuser CONTAINER=ALL; + GRANT EXECUTE_CATALOG_ROLE TO flinkuser CONTAINER=ALL; + GRANT SELECT ANY TRANSACTION TO flinkuser CONTAINER=ALL; + GRANT LOGMINING TO flinkuser CONTAINER=ALL; + GRANT CREATE TABLE TO flinkuser CONTAINER=ALL; + -- need not to execute if set scan.incremental.snapshot.enabled=true(default) + GRANT LOCK ANY TABLE TO flinkuser CONTAINER=ALL; + GRANT CREATE SEQUENCE TO flinkuser CONTAINER=ALL; + + GRANT EXECUTE ON DBMS_LOGMNR TO flinkuser CONTAINER=ALL; + GRANT EXECUTE ON DBMS_LOGMNR_D TO flinkuser CONTAINER=ALL; + + GRANT SELECT ON V_$LOG TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$LOG_HISTORY TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$LOGMNR_LOGS TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$LOGMNR_CONTENTS TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$LOGMNR_PARAMETERS TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$LOGFILE TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$ARCHIVED_LOG TO flinkuser CONTAINER=ALL; + GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO flinkuser CONTAINER=ALL; + exit + ``` + +See more about the [Setting up Oracle](https://debezium.io/documentation/reference/1.9/connectors/oracle.html#setting-up-oracle) + +How to create an Oracle CDC table +---------------- + +The Oracle CDC table can be defined as following: + +```sql +-- register an Oracle table 'products' in Flink SQL +Flink SQL> CREATE TABLE products ( + ID INT NOT NULL, + NAME STRING, + DESCRIPTION STRING, + WEIGHT DECIMAL(10, 3), + PRIMARY KEY(id) NOT ENFORCED + ) WITH ( + 'connector' = 'oracle-cdc', + 'hostname' = 'localhost', + 'port' = '1521', + 'username' = 'flinkuser', + 'password' = 'flinkpw', + 'database-name' = 'ORCLCDB', + 'schema-name' = 'inventory', + 'table-name' = 'products'); + +-- read snapshot and redo logs from products table +Flink SQL> SELECT * FROM products; +``` +**Note:** +When working with the CDB + PDB model, you are expected to add an extra option `'debezium.database.pdb.name' = 'xxx'` in Flink DDL to specific the name of the PDB to connect to. + +**Note:** +While the connector might work with a variety of Oracle versions and editions, only Oracle 9i, 10g, 11g and 12c have been tested. + +Connector Options +---------------- +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'oracle-cdc'.
    hostnameoptional(none)StringIP address or hostname of the Oracle database server. If the url is not empty, hostname may not be configured, otherwise hostname can not be empty
    usernamerequired(none)StringName of the Oracle database to use when connecting to the Oracle database server.
    passwordrequired(none)StringPassword to use when connecting to the Oracle database server.
    database-namerequired(none)StringDatabase name of the Oracle server to monitor.
    schema-namerequired(none)StringSchema name of the Oracle database to monitor.
    table-namerequired(none)StringTable name of the Oracle database to monitor.
    portoptional1521IntegerInteger port number of the Oracle database server.
    urloptionaljdbc:oracle:thin:@{hostname}:{port}:{database-name}StringJdbcUrl of the oracle database server . If the hostname and port parameter is configured, the URL is concatenated by hostname port database-name in SID format by default. Otherwise, you need to configure the URL parameter
    scan.startup.modeoptionalinitialStringOptional startup mode for Oracle CDC consumer, valid enumerations are "initial" + and "latest-offset". + Please see Startup Reading Position section for more detailed information.
    scan.incremental.snapshot.enabledoptionaltrueBooleanIncremental snapshot is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism, + the incremental snapshot has many advantages, including: + (1) source can be parallel during snapshot reading, + (2) source can perform checkpoints in the chunk granularity during snapshot reading, + (3) source doesn't need to acquire ROW SHARE MODE lock before snapshot reading. +
    scan.incremental.snapshot.chunk.sizeoptional8096IntegerThe chunk size (number of rows) of table snapshot, captured tables are split into multiple chunks when read the snapshot of table.
    scan.snapshot.fetch.sizeoptional1024IntegerThe maximum fetch size for per poll when read table snapshot.
    connect.max-retriesoptional3IntegerThe max retry times that the connector should retry to build Oracle database server connection.
    connection.pool.sizeoptional20IntegerThe connection pool size.
    debezium.*optional(none)StringPass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from Oracle server. + For example: 'debezium.snapshot.mode' = 'never'. + See more about the Debezium's Oracle Connector properties
    scan.incremental.close-idle-reader.enabledoptionalfalseBooleanWhether to close idle readers at the end of the snapshot phase.
    + The flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.
    + If the flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true, + so 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true' does not need to be configured explicitly. +
    scan.incremental.snapshot.chunk.key-columnoptional(none)StringThe chunk key of table snapshot, captured tables are split into multiple chunks by a chunk key when read the snapshot of table. + By default, the chunk key is 'ROWID'. This column must be a column of the primary key.
    +
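To illustrate the `url` and `debezium.*` options above, the sketch below connects through an explicit JDBC URL (in SID format) instead of `hostname`/`port` and passes one property through to the embedded Debezium engine; credentials and object names are placeholders:

```sql
Flink SQL> CREATE TABLE products (
    ID INT NOT NULL,
    NAME STRING,
    DESCRIPTION STRING,
    PRIMARY KEY (ID) NOT ENFORCED
) WITH (
    'connector' = 'oracle-cdc',
    -- when 'url' is set explicitly, 'hostname' and 'port' may be omitted
    'url' = 'jdbc:oracle:thin:@localhost:1521:ORCLCDB',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'ORCLCDB',
    'schema-name' = 'inventory',
    'table-name' = 'products',
    -- pass-through Debezium property (see the 'debezium.*' row above)
    'debezium.log.mining.strategy' = 'online_catalog'
);
```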
    + +Limitation +-------- + +### Can't perform checkpoint during scanning snapshot of tables +During scanning snapshot of database tables, since there is no recoverable position, we can't perform checkpoints. In order to not perform checkpoints, Oracle CDC source will keep the checkpoint waiting to timeout. The timeout checkpoint will be recognized as failed checkpoint, by default, this will trigger a failover for the Flink job. So if the database table is large, it is recommended to add following Flink configurations to avoid failover because of the timeout checkpoints: + +``` +execution.checkpointing.interval: 10min +execution.checkpointing.tolerable-failed-checkpoints: 100 +restart-strategy: fixed-delay +restart-strategy.fixed-delay.attempts: 2147483647 +``` + +Available Metadata +---------------- + +The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDataTypeDescription
    table_nameSTRING NOT NULLName of the table that contain the row.
    schema_nameSTRING NOT NULLName of the schema that contain the row.
    database_nameSTRING NOT NULLName of the database that contain the row.
    op_tsTIMESTAMP_LTZ(3) NOT NULLIt indicates the time that the change was made in the database.
    If the record is read from snapshot of the table instead of the change stream, the value is always 0.
    + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: +```sql +CREATE TABLE products ( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + schema_name STRING METADATA FROM 'schema_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + ID INT NOT NULL, + NAME STRING, + DESCRIPTION STRING, + WEIGHT DECIMAL(10, 3), + PRIMARY KEY(id) NOT ENFORCED +) WITH ( + 'connector' = 'oracle-cdc', + 'hostname' = 'localhost', + 'port' = '1521', + 'username' = 'flinkuser', + 'password' = 'flinkpw', + 'database-name' = 'ORCLCDB', + 'schema-name' = 'inventory', + 'table-name' = 'products', + 'debezium.log.mining.strategy' = 'online_catalog', + 'debezium.log.mining.continuous.mine' = 'true' +); +``` + +**Note** : The Oracle dialect is case-sensitive, it converts field name to uppercase if the field name is not quoted, Flink SQL doesn't convert the field name. Thus for physical columns from oracle database, we should use its converted field name in Oracle when define an `oracle-cdc` table in Flink SQL. + +Features +-------- + +### Exactly-Once Processing + +The Oracle CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change events with **exactly-once processing** even failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/oracle.html#how-the-oracle-connector-works). + +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for Oracle CDC consumer. The valid enumerations are: + +- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest redo log. +- `latest-offset`: Never to perform a snapshot on the monitored database tables upon first startup, just read from + the change since the connector was started. + +_Note: the mechanism of `scan.startup.mode` option relying on Debezium's `snapshot.mode` configuration. So please do not use them together. If you specific both `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, it may make `scan.startup.mode` doesn't work._ + +### Single Thread Reading + +The Oracle CDC source can't work in parallel reading, because there is only one task can receive change events. + +### DataStream Source + +The Oracle CDC connector can also be a DataStream source. 
There are two modes for the DataStream source: + +- incremental snapshot based, which allows parallel reading +- SourceFunction based, which only supports single thread reading + +#### Incremental Snapshot based DataStream (Experimental) + +```java +import org.apache.flink.cdc.connectors.base.options.StartupOptions; +import org.apache.flink.cdc.connectors.base.source.jdbc.JdbcIncrementalSource; +import org.apache.flink.cdc.connectors.oracle.source.OracleSourceBuilder; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; + +import java.util.Properties; + +public class OracleParallelSourceExample { + + public static void main(String[] args) throws Exception { + Properties debeziumProperties = new Properties(); + debeziumProperties.setProperty("log.mining.strategy", "online_catalog"); + + JdbcIncrementalSource oracleChangeEventSource = + new OracleSourceBuilder() + .hostname("host") + .port(1521) + .databaseList("ORCLCDB") + .schemaList("DEBEZIUM") + .tableList("DEBEZIUM.PRODUCTS") + .username("username") + .password("password") + .deserializer(new JsonDebeziumDeserializationSchema()) + .includeSchemaChanges(true) // output the schema changes as well + .startupOptions(StartupOptions.initial()) + .debeziumProperties(debeziumProperties) + .splitSize(2) + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(3000L); + // set the source parallelism to 4 + env.fromSource( + oracleChangeEventSource, + WatermarkStrategy.noWatermarks(), + "OracleParallelSource") + .setParallelism(4) + .print() + .setParallelism(1); + env.execute("Print Oracle Snapshot + RedoLog"); + } +} +``` + +#### SourceFunction-based DataStream + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.oracle.OracleSource; + +public class OracleSourceExample { + public static void main(String[] args) throws Exception { + SourceFunction sourceFunction = OracleSource.builder() + .url("jdbc:oracle:thin:@{hostname}:{port}:{database}") + .port(1521) + .database("ORCLCDB") // monitor XE database + .schemaList("inventory") // monitor inventory schema + .tableList("inventory.products") // monitor products table + .username("flinkuser") + .password("flinkpw") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env + .addSource(sourceFunction) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute(); + } +} +``` + +Data Type Mapping +---------------- +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Oracle typeFlink SQL type
    NUMBER(p, s <= 0), p - s < 3 + TINYINT
    NUMBER(p, s <= 0), p - s < 5 + SMALLINT
    NUMBER(p, s <= 0), p - s < 10 + INT
    NUMBER(p, s <= 0), p - s < 19 + BIGINT
    NUMBER(p, s <= 0), 19 <= p - s <= 38
    +
    DECIMAL(p - s, 0)
    NUMBER(p, s > 0) + DECIMAL(p, s)
    NUMBER(p, s <= 0), p - s > 38 + STRING
    + FLOAT
    + BINARY_FLOAT +
    FLOAT
    + DOUBLE PRECISION
    + BINARY_DOUBLE +
    DOUBLE
    NUMBER(1)BOOLEAN
    + DATE
    + TIMESTAMP [(p)] +
    TIMESTAMP [(p)] [WITHOUT TIMEZONE]
    TIMESTAMP [(p)] WITH TIME ZONETIMESTAMP [(p)] WITH TIME ZONE
    TIMESTAMP [(p)] WITH LOCAL TIME ZONETIMESTAMP_LTZ [(p)]
    + CHAR(n)
    + NCHAR(n)
    + NVARCHAR2(n)
    + VARCHAR(n)
    + VARCHAR2(n)
    + CLOB
    + NCLOB
    + XMLType
    + SYS.XMLTYPE +
    STRING
    BLOB
    + ROWID +
    BYTES
    + INTERVAL DAY TO SECOND
    + INTERVAL YEAR TO MONTH +
    BIGINT
    +
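As a worked example of the NUMBER rules above (the decisive quantity is p - s, i.e. the number of integer digits), a hypothetical Oracle table and a matching Flink declaration might look like this:

```sql
-- Hypothetical Oracle columns and the types suggested by the mapping table:
--   ID    NUMBER(9, 0)    p - s = 9,  9 < 10          -> INT
--   CNT   NUMBER(20, 0)   p - s = 20, 19 <= 20 <= 38  -> DECIMAL(20, 0)
--   PRICE NUMBER(10, 2)   s > 0                       -> DECIMAL(10, 2)
--   HUGE  NUMBER(40, 0)   p - s = 40, 40 > 38         -> STRING
Flink SQL> CREATE TABLE items (
    ID INT NOT NULL,
    CNT DECIMAL(20, 0),
    PRICE DECIMAL(10, 2),
    HUGE STRING,
    PRIMARY KEY (ID) NOT ENFORCED
) WITH (
    'connector' = 'oracle-cdc',
    'hostname' = 'localhost',
    'port' = '1521',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'ORCLCDB',
    'schema-name' = 'inventory',
    'table-name' = 'items'
);
```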
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/overview.md b/docs/content/docs/connectors/cdc-connectors/overview.md new file mode 100644 index 0000000000..56ff59a615 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/overview.md @@ -0,0 +1,307 @@ +--- +title: "Overview" +weight: 1 +type: docs +aliases: +- /connectors/cdc-connectors/ +--- + + +# CDC Connectors for Apache Flink + +CDC Connectors for Apache Flink® is a set of source connectors for Apache Flink®, ingesting changes from different databases using change data capture (CDC). +The CDC Connectors for Apache Flink® integrate Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium). + +{{< img src="/fig/cdc-flow.png" width="600px" alt="Flink CDC" >}} + +## Supported Connectors + +| Connector | Database | Driver | +|-----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| +| [mongodb-cdc](mongodb-cdc.md) |
  • [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0 | MongoDB Driver: 4.3.4 | +| [mysql-cdc](mysql-cdc.md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 | +| [oceanbase-cdc](oceanbase-cdc.md) |
  • [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x
  • [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x | +| [oracle-cdc](oracle-cdc.md) |
  • [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 | +| [postgres-cdc](postgres-cdc.md) |
  • [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 | +| [sqlserver-cdc](sqlserver-cdc.md) |
  • [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 | +| [tidb-cdc](tidb-cdc.md) |
  • [TiDB](https://www.pingcap.com/): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 | +| [db2-cdc](db2-cdc.md) |
  • [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 | +| [vitess-cdc](vitess-cdc.md) |
  • [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 | + +## Supported Flink Versions +The following table shows the version mapping between Flink® CDC Connectors and Flink®: + +| Flink® CDC Version | Flink® Version | +|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| 1.0.0 | 1.11.* | +| 1.1.0 | 1.11.* | +| 1.2.0 | 1.12.* | +| 1.3.0 | 1.12.* | +| 1.4.0 | 1.13.* | +| 2.0.* | 1.13.* | +| 2.1.* | 1.13.* | +| 2.2.* | 1.13.\*, 1.14.\* | +| 2.3.* | 1.13.\*, 1.14.\*, 1.15.\*, 1.16.\* | +| 2.4.* | 1.13.\*, 1.14.\*, 1.15.\*, 1.16.\*, 1.17.\* | +| 3.0.* | 1.14.\*, 1.15.\*, 1.16.\*, 1.17.\*, 1.18.\* | + +## Features + +1. Supports reading database snapshot and continues to read binlogs with **exactly-once processing** even failures happen. +2. CDC connectors for DataStream API, users can consume changes on multiple databases and tables in a single job without Debezium and Kafka deployed. +3. CDC connectors for Table/SQL API, users can use SQL DDL to create a CDC source to monitor changes on a single table. + +The following table shows the current features of the connector: + +| Connector | No-lock Read | Parallel Read | Exactly-once Read | Incremental Snapshot Read | +|-----------------------------------|--------------|---------------|-------------------|---------------------------| +| [mongodb-cdc](mongodb-cdc.md) | ✅ | ✅ | ✅ | ✅ | +| [mysql-cdc](mysql-cdc.md) | ✅ | ✅ | ✅ | ✅ | +| [oracle-cdc](oracle-cdc.md) | ✅ | ✅ | ✅ | ✅ | +| [postgres-cdc](postgres-cdc.md) | ✅ | ✅ | ✅ | ✅ | +| [sqlserver-cdc](sqlserver-cdc.md) | ✅ | ✅ | ✅ | ✅ | +| [oceanbase-cdc](oceanbase-cdc.md) | ❌ | ❌ | ❌ | ❌ | +| [tidb-cdc](tidb-cdc.md) | ✅ | ❌ | ✅ | ❌ | +| [db2-cdc](db2-cdc.md) | ❌ | ❌ | ✅ | ❌ | +| [vitess-cdc](vitess-cdc.md) | ✅ | ❌ | ✅ | ❌ | + +## Usage for Table/SQL API + +We need several steps to setup a Flink cluster with the provided connector. + +1. Setup a Flink cluster with version 1.12+ and Java 8+ installed. +2. Download the connector SQL jars from the [Downloads](../downloads.md) page (or [build yourself](#building-from-source)). +3. Put the downloaded jars under `FLINK_HOME/lib/`. +4. Restart the Flink cluster. + +The example shows how to create a MySQL CDC source in [Flink SQL Client](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/table/sqlclient/) and execute queries on it. 
+ +```sql +-- creates a mysql cdc table source +CREATE TABLE mysql_binlog ( + id INT NOT NULL, + name STRING, + description STRING, + weight DECIMAL(10,3), + PRIMARY KEY(id) NOT ENFORCED +) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'flinkuser', + 'password' = 'flinkpw', + 'database-name' = 'inventory', + 'table-name' = 'products' +); + +-- read snapshot and binlog data from mysql, and do some transformation, and show on the client +SELECT id, UPPER(name), description, weight FROM mysql_binlog; +``` + +## Usage for DataStream API + +Include following Maven dependency (available through Maven Central): + +``` + + org.apache.flink + + flink-connector-mysql-cdc + + 3.0-SNAPSHOT + +``` + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.mysql.source.MySqlSource; + +public class MySqlBinlogSourceExample { + public static void main(String[] args) throws Exception { + MySqlSource mySqlSource = MySqlSource.builder() + .hostname("yourHostname") + .port(yourPort) + .databaseList("yourDatabaseName") // set captured database + .tableList("yourDatabaseName.yourTableName") // set captured table + .username("yourUsername") + .password("yourPassword") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + // enable checkpoint + env.enableCheckpointing(3000); + + env + .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source") + // set 4 parallel source tasks + .setParallelism(4) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute("Print MySQL Snapshot + Binlog"); + } +} +``` +### Deserialization +The following JSON data show the change event in JSON format. + +```json +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "source": {...}, + "op": "u", // the operation type, "u" means this this is an update event + "ts_ms": 1589362330904, // the time at which the connector processed the event + "transaction": null +} +``` +**Note:** Please refer [Debezium documentation](https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-events +) to know the meaning of each field. + +In some cases, users can use the `JsonDebeziumDeserializationSchema(true)` Constructor to enabled include schema in the message. 
Then the Debezium JSON message may look like this: +```json +{ + "schema": { + "type": "struct", + "fields": [ + { + "type": "struct", + "fields": [ + { + "type": "int32", + "optional": false, + "field": "id" + }, + { + "type": "string", + "optional": false, + "default": "flink", + "field": "name" + }, + { + "type": "string", + "optional": true, + "field": "description" + }, + { + "type": "double", + "optional": true, + "field": "weight" + } + ], + "optional": true, + "name": "mysql_binlog_source.inventory_1pzxhca.products.Value", + "field": "before" + }, + { + "type": "struct", + "fields": [ + { + "type": "int32", + "optional": false, + "field": "id" + }, + { + "type": "string", + "optional": false, + "default": "flink", + "field": "name" + }, + { + "type": "string", + "optional": true, + "field": "description" + }, + { + "type": "double", + "optional": true, + "field": "weight" + } + ], + "optional": true, + "name": "mysql_binlog_source.inventory_1pzxhca.products.Value", + "field": "after" + }, + { + "type": "struct", + "fields": {...}, + "optional": false, + "name": "io.debezium.connector.mysql.Source", + "field": "source" + }, + { + "type": "string", + "optional": false, + "field": "op" + }, + { + "type": "int64", + "optional": true, + "field": "ts_ms" + } + ], + "optional": false, + "name": "mysql_binlog_source.inventory_1pzxhca.products.Envelope" + }, + "payload": { + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "source": {...}, + "op": "u", // the operation type, "u" means this this is an update event + "ts_ms": 1589362330904, // the time at which the connector processed the event + "transaction": null + } +} +``` +Usually, it is recommended to exclude schema because schema fields makes the messages very verbose which reduces parsing performance. + +The `JsonDebeziumDeserializationSchema` can also accept custom configuration of `JsonConverter`, for example if you want to obtain numeric output for decimal data, +you can construct `JsonDebeziumDeserializationSchema` as following: + +```java + Map customConverterConfigs = new HashMap<>(); + customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric"); + JsonDebeziumDeserializationSchema schema = + new JsonDebeziumDeserializationSchema(true, customConverterConfigs); +``` + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/postgres-cdc.md b/docs/content/docs/connectors/cdc-connectors/postgres-cdc.md new file mode 100644 index 0000000000..d1504b6aae --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/postgres-cdc.md @@ -0,0 +1,620 @@ +--- +title: "Postgres CDC Connector" +weight: 6 +type: docs +aliases: +- /connectors/cdc-connectors/postgres-cdc.html +--- + + +# Postgres CDC Connector + +The Postgres CDC connector allows for reading snapshot data and incremental data from PostgreSQL database. This document describes how to setup the Postgres CDC connector to run SQL queries against PostgreSQL databases. + +Dependencies +------------ + +In order to setup the Postgres CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. 
+ +### Maven dependency + +{{< artifact flink-connector-postgres-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar and put it under `/lib/`. + +**Note:** flink-sql-connector-postgres-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-postgres-cdc), the released version will be available in the Maven central warehouse. + +How to create a Postgres CDC table +---------------- + +The Postgres CDC table can be defined as following: + +```sql +-- register a PostgreSQL table 'shipments' in Flink SQL +CREATE TABLE shipments ( + shipment_id INT, + order_id INT, + origin STRING, + destination STRING, + is_arrived BOOLEAN +) WITH ( + 'connector' = 'postgres-cdc', + 'hostname' = 'localhost', + 'port' = '5432', + 'username' = 'postgres', + 'password' = 'postgres', + 'database-name' = 'postgres', + 'schema-name' = 'public', + 'table-name' = 'shipments', + 'slot.name' = 'flink', + -- experimental feature: incremental snapshot (default off) + 'scan.incremental.snapshot.enabled' = 'true' +); + +-- read snapshot and binlogs from shipments table +SELECT * FROM shipments; +``` + +Connector Options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'postgres-cdc'.
    hostnamerequired(none)StringIP address or hostname of the PostgreSQL database server.
    usernamerequired(none)StringName of the PostgreSQL database to use when connecting to the PostgreSQL database server.
    passwordrequired(none)StringPassword to use when connecting to the PostgreSQL database server.
    database-namerequired(none)StringDatabase name of the PostgreSQL server to monitor.
    schema-namerequired(none)StringSchema name of the PostgreSQL database to monitor.
    table-namerequired(none)StringTable name of the PostgreSQL database to monitor.
    portoptional5432IntegerInteger port number of the PostgreSQL database server.
    slot.namerequired(none)StringThe name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in + for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. +
    Slot names must conform to PostgreSQL replication slot naming rules, which state: "Each replication slot has a name, which can contain lower-case letters, numbers, and the underscore character."
    decoding.plugin.nameoptionaldecoderbufsStringThe name of the Postgres logical decoding plug-in installed on the server. + Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming, wal2json_rds_streaming and pgoutput.
    changelog-modeoptionalallStringThe changelog mode used for encoding streaming changes. Supported values are all (which encodes changes as retract stream using all RowKinds) and upsert (which encodes changes as upsert stream that describes idempotent updates on a key). +
    upsert mode can be used for tables with primary keys when replica identity FULL is not an option. Primary keys must be set to use upsert mode.
    heartbeat.interval.msoptional30sDurationThe interval of sending heartbeat event for tracing the latest available replication slot offsets
    debezium.*optional(none)StringPass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from Postgres server. + For example: 'debezium.snapshot.mode' = 'never'. + See more about the Debezium's Postgres Connector properties
    debezium.snapshot.select.statement.overridesoptional(none)StringIf you encounter a situation where there is a large amount of data in the table and you don't need all the historical data. You can try to specify the underlying configuration in debezium to select the data range you want to snapshot. This parameter only affects snapshots and does not affect subsequent data reading consumption. +
    Note: PostgreSQL must use schema name and table name. +
    For example: 'debezium.snapshot.select.statement.overrides' = 'schema.table'. +
    After specifying the above attributes, you must also add the following attributes: + debezium.snapshot.select.statement.overrides.[schema].[table] +
    debezium.snapshot.select.statement.overrides.[schema].[table]optional(none)StringYou can specify SQL statements to limit the data range of snapshot. +
    Note1: Schema and table need to be specified in the SQL statement, and the SQL should conform to the syntax of the data source.Currently. +
    For example: 'debezium.snapshot.select.statement.overrides.schema.table' = 'select * from schema.table where 1 != 1'. +
    Note2: The Flink SQL client submission task does not support functions with single quotation marks in the content. +
    For example: 'debezium.snapshot.select.statement.overrides.schema.table' = 'select * from schema.table where to_char(rq, 'yyyy-MM-dd')'. +
    scan.incremental.snapshot.enabledoptionalfalseBooleanIncremental snapshot is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism, + the incremental snapshot has many advantages, including: + (1) source can be parallel during snapshot reading, + (2) source can perform checkpoints in the chunk granularity during snapshot reading, + (3) source doesn't need to acquire global read lock (FLUSH TABLES WITH READ LOCK) before snapshot reading. + Please see Incremental Snapshot Readingsection for more detailed information. +
    scan.incremental.close-idle-reader.enabledoptionalfalseBooleanWhether to close idle readers at the end of the snapshot phase.
    + The flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.
    + If the flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true, + so 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true' does not need to be configured explicitly. +
    +
    +
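To illustrate the `decoding.plugin.name` and `changelog-mode` options above, the sketch below uses the `pgoutput` plug-in and `upsert` mode for a table with a primary key; all connection values are placeholders:

```sql
CREATE TABLE shipments_upsert (
    shipment_id INT,
    origin STRING,
    destination STRING,
    PRIMARY KEY (shipment_id) NOT ENFORCED
) WITH (
    'connector' = 'postgres-cdc',
    'hostname' = 'localhost',
    'port' = '5432',
    'username' = 'postgres',
    'password' = 'postgres',
    'database-name' = 'postgres',
    'schema-name' = 'public',
    'table-name' = 'shipments',
    -- use a dedicated slot name per table (see the note below)
    'slot.name' = 'flink_shipments',
    -- logical decoding plug-in installed on the server; pgoutput ships with PostgreSQL 10+
    'decoding.plugin.name' = 'pgoutput',
    -- encode changes as an upsert stream keyed on the primary key
    'changelog-mode' = 'upsert'
);
```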
    + +Note: it is recommended to set a different `slot.name` for each table to avoid the potential `PSQLException: ERROR: replication slot "flink" is active for PID 974` error. See more [here](https://debezium.io/documentation/reference/1.9/connectors/postgresql.html#postgresql-property-slot-name). + +### Incremental Snapshot Options + +The following options are available only when `scan.incremental.snapshot.enabled=true`: + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    scan.incremental.snapshot.chunk.sizeoptional8096IntegerThe chunk size (number of rows) of table snapshot, captured tables are split into multiple chunks when read the snapshot of table.
    scan.startup.modeoptionalinitialStringOptional startup mode for Postgres CDC consumer, valid enumerations are "initial" + and "latest-offset". + Please see Startup Reading Position section for more detailed information.
    chunk-meta.group.sizeoptional1000IntegerThe group size of chunk meta, if the meta size exceeds the group size, the meta will be divided into multiple groups.
    connect.timeoutoptional30sDurationThe maximum time that the connector should wait after trying to connect to the PostgreSQL database server before timing out.
    connect.pool.sizeoptional30IntegerThe connection pool size.
    connect.max-retriesoptional3IntegerThe max retry times that the connector should retry to build database server connection.
    scan.snapshot.fetch.sizeoptional1024IntegerThe maximum fetch size for per poll when read table snapshot.
    scan.incremental.snapshot.chunk.key-columnoptional(none)StringThe chunk key of table snapshot, captured tables are split into multiple chunks by a chunk key when read the snapshot of table. + By default, the chunk key is the first column of the primary key. This column must be a column of the primary key.
    chunk-key.even-distribution.factor.lower-boundoptional0.05dDoubleThe lower bound of chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distribution or not. + The table chunks would use evenly calculation optimization when the data distribution is even, and the query for splitting would happen when it is uneven. + The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.
    chunk-key.even-distribution.factor.upper-boundoptional1000.0dDoubleThe upper bound of chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distribution or not. + The table chunks would use evenly calculation optimization when the data distribution is even, and the query for splitting would happen when it is uneven. + The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.
    +
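+For illustration only, the sketch below combines several of these incremental snapshot options in one table declaration. It assumes the `public.shipments` table used elsewhere on this page and placeholder connection values; the chosen option values are examples, not recommendations.
+
+```sql
+CREATE TABLE shipments_incremental (
+    shipment_id INT,
+    order_id INT,
+    origin STRING,
+    destination STRING,
+    is_arrived BOOLEAN,
+    PRIMARY KEY (shipment_id) NOT ENFORCED
+) WITH (
+    'connector' = 'postgres-cdc',
+    'hostname' = 'localhost',
+    'port' = '5432',
+    'username' = 'postgres',
+    'password' = 'postgres',
+    'database-name' = 'postgres',
+    'schema-name' = 'public',
+    'table-name' = 'shipments',
+    -- a dedicated replication slot per table, as recommended in the note above
+    'slot.name' = 'flink_shipments',
+    'scan.incremental.snapshot.enabled' = 'true',
+    'scan.startup.mode' = 'initial',
+    'scan.incremental.snapshot.chunk.size' = '8096',
+    'scan.incremental.snapshot.chunk.key-column' = 'shipment_id'
+);
+```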
    + +Available Metadata +---------------- + +The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDataTypeDescription
    table_nameSTRING NOT NULLName of the table that contains the row.
    schema_nameSTRING NOT NULLName of the schema that contains the row.
    database_nameSTRING NOT NULLName of the database that contains the row.
    op_tsTIMESTAMP_LTZ(3) NOT NULLIt indicates the time that the change was made in the database.
    If the record is read from snapshot of the table instead of the change stream, the value is always 0.
    + +Limitation +-------- + +### Can't perform checkpoint during scanning snapshot of tables when incremental snapshot is disabled + +When `scan.incremental.snapshot.enabled=false`, we have the following limitation. + +During scanning snapshot of database tables, since there is no recoverable position, we can't perform checkpoints. In order to not perform checkpoints, Postgres CDC source will keep the checkpoint waiting to timeout. The timeout checkpoint will be recognized as failed checkpoint, by default, this will trigger a failover for the Flink job. So if the database table is large, it is recommended to add following Flink configurations to avoid failover because of the timeout checkpoints: + +``` +execution.checkpointing.interval: 10min +execution.checkpointing.tolerable-failed-checkpoints: 100 +restart-strategy: fixed-delay +restart-strategy.fixed-delay.attempts: 2147483647 +``` + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: +```sql +CREATE TABLE products ( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + shipment_id INT, + order_id INT, + origin STRING, + destination STRING, + is_arrived BOOLEAN +) WITH ( + 'connector' = 'postgres-cdc', + 'hostname' = 'localhost', + 'port' = '5432', + 'username' = 'postgres', + 'password' = 'postgres', + 'database-name' = 'postgres', + 'schema-name' = 'public', + 'table-name' = 'shipments', + 'slot.name' = 'flink' +); +``` + +Features +-------- + +### Incremental Snapshot Reading (Experimental) + +Incremental snapshot reading is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism, the incremental snapshot has many advantages, including: +* (1) PostgreSQL CDC Source can be parallel during snapshot reading +* (2) PostgreSQL CDC Source can perform checkpoints in the chunk granularity during snapshot reading +* (3) PostgreSQL CDC Source doesn't need to acquire global read lock before snapshot reading + +During the incremental snapshot reading, the PostgreSQL CDC Source firstly splits snapshot chunks (splits) by primary key of table, +and then PostgreSQL CDC Source assigns the chunks to multiple readers to read the data of snapshot chunk. + +### Exactly-Once Processing + +The Postgres CDC connector is a Flink Source connector which will read database snapshot first and then continues to read binlogs with **exactly-once processing** even failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/postgresql.html#how-the-postgresql-connector-works). + +### DataStream Source + +The Postgres CDC connector can also be a DataStream source. 
There are two modes for the DataStream source: + +- incremental snapshot based, which allows parallel reading +- SourceFunction based, which only supports single thread reading + +#### Incremental Snapshot based DataStream (Experimental) + +```java +import org.apache.flink.cdc.connectors.base.source.jdbc.JdbcIncrementalSource; +import org.apache.flink.cdc.connectors.postgres.source.PostgresSourceBuilder; +import org.apache.flink.cdc.debezium.DebeziumDeserializationSchema; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; + +public class PostgresParallelSourceExample { + + public static void main(String[] args) throws Exception { + + DebeziumDeserializationSchema deserializer = + new JsonDebeziumDeserializationSchema(); + + JdbcIncrementalSource postgresIncrementalSource = + PostgresSourceBuilder.PostgresIncrementalSource.builder() + .hostname("localhost") + .port(5432) + .database("postgres") + .schemaList("inventory") + .tableList("inventory.products") + .username("postgres") + .password("postgres") + .slotName("flink") + .decodingPluginName("decoderbufs") // use pgoutput for PostgreSQL 10+ + .deserializer(deserializer) + .includeSchemaChanges(true) // output the schema changes as well + .splitSize(2) // the split size of each snapshot split + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env.enableCheckpointing(3000); + + env.fromSource( + postgresIncrementalSource, + WatermarkStrategy.noWatermarks(), + "PostgresParallelSource") + .setParallelism(2) + .print(); + + env.execute("Output Postgres Snapshot"); + } +} +``` + +#### SourceFunction-based DataStream + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.postgres.PostgreSQLSource; + +public class PostgreSQLSourceExample { + public static void main(String[] args) throws Exception { + SourceFunction sourceFunction = PostgreSQLSource.builder() + .hostname("localhost") + .port(5432) + .database("postgres") // monitor postgres database + .schemaList("inventory") // monitor inventory schema + .tableList("inventory.products") // monitor products table + .username("flinkuser") + .password("flinkpw") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env + .addSource(sourceFunction) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute(); + } +} +``` + +Data Type Mapping +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    PostgreSQL typeFlink SQL type
    TINYINT
    + SMALLINT
    + INT2
    + SMALLSERIAL
    + SERIAL2
    SMALLINT
    + INTEGER
    + SERIAL
    INT
    + BIGINT
    + BIGSERIAL
    BIGINT
    DECIMAL(20, 0)
    BIGINTBIGINT
    + REAL
    + FLOAT4
    FLOAT
    + FLOAT8
    + DOUBLE PRECISION
    DOUBLE
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    DECIMAL(p, s)
    BOOLEANBOOLEAN
    DATEDATE
    TIME [(p)] [WITHOUT TIMEZONE]TIME [(p)] [WITHOUT TIMEZONE]
    TIMESTAMP [(p)] [WITHOUT TIMEZONE]TIMESTAMP [(p)] [WITHOUT TIMEZONE]
    + CHAR(n)
    + CHARACTER(n)
    + VARCHAR(n)
    + CHARACTER VARYING(n)
    + TEXT
    STRING
    BYTEABYTES
    +
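+As a sketch of how the mapping above is applied in practice, consider a hypothetical PostgreSQL table and its corresponding Flink declaration (names and connection values are placeholders):
+
+```sql
+-- Hypothetical PostgreSQL source table:
+--   CREATE TABLE public.orders_typed (
+--       id      SERIAL PRIMARY KEY,
+--       price   NUMERIC(10, 5),
+--       weight  REAL,
+--       note    TEXT,
+--       created TIMESTAMP(3)
+--   );
+
+-- Flink SQL declaration following the mapping table above:
+CREATE TABLE orders_typed (
+    id      INT,            -- SERIAL         -> INT
+    price   DECIMAL(10, 5), -- NUMERIC(10, 5) -> DECIMAL(10, 5)
+    weight  FLOAT,          -- REAL           -> FLOAT
+    note    STRING,         -- TEXT           -> STRING
+    created TIMESTAMP(3),   -- TIMESTAMP(3)   -> TIMESTAMP(3)
+    PRIMARY KEY (id) NOT ENFORCED
+) WITH (
+    'connector' = 'postgres-cdc',
+    'hostname' = 'localhost',
+    'port' = '5432',
+    'username' = 'postgres',
+    'password' = 'postgres',
+    'database-name' = 'postgres',
+    'schema-name' = 'public',
+    'table-name' = 'orders_typed',
+    'slot.name' = 'flink'
+);
+```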
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/sqlserver-cdc.md b/docs/content/docs/connectors/cdc-connectors/sqlserver-cdc.md new file mode 100644 index 0000000000..68553196ca --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/sqlserver-cdc.md @@ -0,0 +1,507 @@ +--- +title: "SQLServer CDC Connector" +weight: 7 +type: docs +aliases: +- /connectors/cdc-connectors/sqlserver-cdc.html +--- + + +# SQLServer CDC Connector + +The SQLServer CDC connector allows for reading snapshot data and incremental data from SQLServer database. This document describes how to setup the SQLServer CDC connector to run SQL queries against SQLServer databases. + +Dependencies +------------ + +In order to setup the SQLServer CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-sqlserver-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. + +**Note:** flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-sqlserver-cdc), the released version will be available in the Maven central warehouse. + +Setup SQLServer Database +---------------- +A SQL Server administrator must enable change data capture on the source tables that you want to capture. The database must already be enabled for CDC. To enable CDC on a table, a SQL Server administrator runs the stored procedure ```sys.sp_cdc_enable_table``` for the table. + +**Prerequisites:** +* CDC is enabled on the SQL Server database. +* The SQL Server Agent is running. +* You are a member of the db_owner fixed database role for the database. + +**Procedure:** +* Connect to the SQL Server database by database management studio. +* Run the following SQL statement to enable CDC on the table. +```sql +USE MyDB +GO + +EXEC sys.sp_cdc_enable_table +@source_schema = N'dbo', -- Specifies the schema of the source table. +@source_name = N'MyTable', -- Specifies the name of the table that you want to capture. +@role_name = N'MyRole', -- Specifies a role MyRole to which you can add users to whom you want to grant SELECT permission on the captured columns of the source table. Users in the sysadmin or db_owner role also have access to the specified change tables. Set the value of @role_name to NULL, to allow only members in the sysadmin or db_owner to have full access to captured information. +@filegroup_name = N'MyDB_CT',-- Specifies the filegroup where SQL Server places the change table for the captured table. The named filegroup must already exist. It is best not to locate change tables in the same filegroup that you use for source tables. 
+@supports_net_changes = 0 +GO +``` +* Verifying that the user has access to the CDC table +```sql +--The following example runs the stored procedure sys.sp_cdc_help_change_data_capture on the database MyDB: +USE MyDB; +GO +EXEC sys.sp_cdc_help_change_data_capture +GO +``` +The query returns configuration information for each table in the database that is enabled for CDC and that contains change data that the caller is authorized to access. If the result is empty, verify that the user has privileges to access both the capture instance and the CDC tables. + +How to create a SQLServer CDC table +---------------- + +The SQLServer CDC table can be defined as follows: + +```sql +-- register a SQLServer table 'orders' in Flink SQL +CREATE TABLE orders ( + id INT, + order_date DATE, + purchaser INT, + quantity INT, + product_id INT, + PRIMARY KEY (id) NOT ENFORCED +) WITH ( + 'connector' = 'sqlserver-cdc', + 'hostname' = 'localhost', + 'port' = '1433', + 'username' = 'sa', + 'password' = 'Password!', + 'database-name' = 'inventory', + 'table-name' = 'dbo.orders' +); + +-- read snapshot and change events from orders table +SELECT * FROM orders; +``` + +Connector Options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'sqlserver-cdc'.
    hostnamerequired(none)StringIP address or hostname of the SQLServer database.
    usernamerequired(none)StringUsername to use when connecting to the SQLServer database.
    passwordrequired(none)StringPassword to use when connecting to the SQLServer database.
    database-namerequired(none)StringDatabase name of the SQLServer database to monitor.
    table-namerequired(none)StringTable name of the SQLServer database to monitor, e.g.: "db1.table1"
    portoptional1433IntegerInteger port number of the SQLServer database.
    server-time-zoneoptionalUTCStringThe session time zone in database server, e.g. "Asia/Shanghai".
    scan.incremental.snapshot.enabledoptionaltrueBooleanWhether to enable parallel snapshot reading.
    chunk-meta.group.sizeoptional1000IntegerThe group size of chunk meta, if the meta size exceeds the group size, the meta will be divided into multiple groups.
    chunk-key.even-distribution.factor.lower-boundoptional0.05dDoubleThe lower bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not. + Table chunks use the even-distribution calculation optimization when the data distribution is even, and a splitting query is issued when it is uneven. + The distribution factor can be calculated by (MAX(id) - MIN(id) + 1) / rowCount.
    chunk-key.even-distribution.factor.upper-boundoptional1000.0dDoubleThe upper bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not. + Table chunks use the even-distribution calculation optimization when the data distribution is even, and a splitting query is issued when it is uneven. + The distribution factor can be calculated by (MAX(id) - MIN(id) + 1) / rowCount.
    debezium.*optional(none)StringPass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from SQLServer. + For example: 'debezium.snapshot.mode' = 'initial_only'. + See more about the Debezium's SQLServer Connector properties
    scan.incremental.close-idle-reader.enabledoptionalfalseBooleanWhether to close idle readers at the end of the snapshot phase.
    + The Flink version must be 1.14 or newer when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.
    + If the Flink version is 1.15 or newer, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is already true, + so there is no need to explicitly configure 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true'. +
    scan.incremental.snapshot.chunk.key-columnoptional(none)StringThe chunk key of the table snapshot; captured tables are split into multiple chunks by the chunk key when reading the snapshot of a table. + By default, the chunk key is the first column of the primary key. This column must be a column of the primary key.
    +
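+For illustration only, the following sketch reuses the `dbo.orders` table from the earlier example and shows how a few of the options above can be combined; the option values are placeholders, not recommendations.
+
+```sql
+CREATE TABLE orders_with_options (
+    id INT,
+    order_date DATE,
+    purchaser INT,
+    quantity INT,
+    product_id INT,
+    PRIMARY KEY (id) NOT ENFORCED
+) WITH (
+    'connector' = 'sqlserver-cdc',
+    'hostname' = 'localhost',
+    'port' = '1433',
+    'username' = 'sa',
+    'password' = 'Password!',
+    'database-name' = 'inventory',
+    'table-name' = 'dbo.orders',
+    -- session time zone of the database server
+    'server-time-zone' = 'UTC',
+    'scan.incremental.snapshot.enabled' = 'true',
+    'scan.incremental.snapshot.chunk.key-column' = 'id'
+);
+```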
    + +Available Metadata +---------------- + +The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDataTypeDescription
    table_nameSTRING NOT NULLName of the table that contains the row.
    schema_nameSTRING NOT NULLName of the schema that contains the row.
    database_nameSTRING NOT NULLName of the database that contains the row.
    op_tsTIMESTAMP_LTZ(3) NOT NULLIt indicates the time that the change was made in the database.
    If the record is read from snapshot of the table instead of the change stream, the value is always 0.
    + +Limitation +-------- + +### Can't perform checkpoint during scanning snapshot of tables +During scanning snapshot of database tables, since there is no recoverable position, we can't perform checkpoints. In order to not perform checkpoints, SqlServer CDC source will keep the checkpoint waiting to timeout. The timeout checkpoint will be recognized as failed checkpoint, by default, this will trigger a failover for the Flink job. So if the database table is large, it is recommended to add following Flink configurations to avoid failover because of the timeout checkpoints: + +``` +execution.checkpointing.interval: 10min +execution.checkpointing.tolerable-failed-checkpoints: 100 +restart-strategy: fixed-delay +restart-strategy.fixed-delay.attempts: 2147483647 +``` + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: +```sql +CREATE TABLE products ( + table_name STRING METADATA FROM 'table_name' VIRTUAL, + schema_name STRING METADATA FROM 'schema_name' VIRTUAL, + db_name STRING METADATA FROM 'database_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + id INT NOT NULL, + name STRING, + description STRING, + weight DECIMAL(10,3) +) WITH ( + 'connector' = 'sqlserver-cdc', + 'hostname' = 'localhost', + 'port' = '1433', + 'username' = 'sa', + 'password' = 'Password!', + 'database-name' = 'inventory', + 'table-name' = 'dbo.products' +); +``` + +Features +-------- + +### Exactly-Once Processing + +The SQLServer CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change events with **exactly-once processing** even failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/sqlserver.html#how-the-sqlserver-connector-works). + +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for SQLServer CDC consumer. The valid enumerations are: + +- `initial` (default): Takes a snapshot of structure and data of captured tables; useful if topics should be populated with a complete representation of the data from the captured tables. +- `initial-only`: Takes a snapshot of structure and data like initial but instead does not transition into streaming changes once the snapshot has completed. +- `latest-offset`: Takes a snapshot of the structure of captured tables only; useful if only changes happening from now onwards should be propagated to topics. + +_Note: the mechanism of `scan.startup.mode` option relying on Debezium's `snapshot.mode` configuration. So please do not use them together. If you specific both `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, it may make `scan.startup.mode` doesn't work._ + +### Single Thread Reading + +The SQLServer CDC source can't work in parallel reading, because there is only one task can receive change events. + +### DataStream Source + +The SQLServer CDC connector can also be a DataStream source. 
You can create a SourceFunction as the following shows: + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.sqlserver.SqlServerSource; + +public class SqlServerSourceExample { + public static void main(String[] args) throws Exception { + SourceFunction sourceFunction = SqlServerSource.builder() + .hostname("localhost") + .port(1433) + .database("sqlserver") // monitor sqlserver database + .tableList("dbo.products") // monitor products table + .username("sa") + .password("Password!") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env + .addSource(sourceFunction) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute(); + } +} +``` + +The SQLServer CDC incremental connector (after 2.4.0) can be used as the following shows: +```java +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; + +import org.apache.flink.cdc.connectors.base.options.StartupOptions; +import org.apache.flink.cdc.connectors.sqlserver.source.SqlServerSourceBuilder; +import org.apache.flink.cdc.connectors.sqlserver.source.SqlServerSourceBuilder.SqlServerIncrementalSource; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; + +public class SqlServerIncrementalSourceExample { + public static void main(String[] args) throws Exception { + SqlServerIncrementalSource sqlServerSource = + new SqlServerSourceBuilder() + .hostname("localhost") + .port(1433) + .databaseList("inventory") + .tableList("dbo.products") + .username("sa") + .password("Password!") + .deserializer(new JsonDebeziumDeserializationSchema()) + .startupOptions(StartupOptions.initial()) + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(3000); + // set the source parallelism to 2 + env.fromSource( + sqlServerSource, + WatermarkStrategy.noWatermarks(), + "SqlServerIncrementalSource") + .setParallelism(2) + .print() + .setParallelism(1); + + env.execute("Print SqlServer Snapshot + Change Stream"); + } +} +``` + +Data Type Mapping +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    SQLServer typeFlink SQL type
    char(n)CHAR(n)
    + varchar(n)
    + nvarchar(n)
    + nchar(n) +
    VARCHAR(n)
    + text
    + ntext
    + xml +
    STRING
    + decimal(p, s)
    + money
    + smallmoney +
    DECIMAL(p, s)
    numericNUMERIC
    + float
    + real +
    DOUBLE
    bitBOOLEAN
    intINT
    tinyintSMALLINT
    smallintSMALLINT
    bigintBIGINT
    dateDATE
    time(n)TIME(n)
    + datetime2
    + datetime
    + smalldatetime +
    TIMESTAMP(n)
    datetimeoffsetTIMESTAMP_LTZ(3)
    +
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/tidb-cdc.md b/docs/content/docs/connectors/cdc-connectors/tidb-cdc.md new file mode 100644 index 0000000000..46662858c7 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/tidb-cdc.md @@ -0,0 +1,496 @@ +--- +title: "TiDB CDC Connector" +weight: 8 +type: docs +aliases: +- /connectors/cdc-connectors/tidb-cdc.html +--- + + +# TiDB CDC Connector + +The TiDB CDC connector allows for reading snapshot data and incremental data from TiDB database. This document describes how to setup the TiDB CDC connector to run SQL queries against TiDB databases. + +Dependencies +------------ + +In order to setup the TiDB CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-tidb-cdc >}} + +### SQL Client JAR + +```Download link is available only for stable releases.``` + +Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. + +**Note:** flink-sql-connector-tidb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-tidb-cdc), the released version will be available in the Maven central warehouse. + +How to create a TiDB CDC table +---------------- + +The TiDB CDC table can be defined as following: + +```sql +-- checkpoint every 3000 milliseconds +Flink SQL> SET 'execution.checkpointing.interval' = '3s'; + +-- register a TiDB table 'orders' in Flink SQL +Flink SQL> CREATE TABLE orders ( + order_id INT, + order_date TIMESTAMP(3), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY(order_id) NOT ENFORCED + ) WITH ( + 'connector' = 'tidb-cdc', + 'tikv.grpc.timeout_in_ms' = '20000', + 'pd-addresses' = 'localhost:2379', + 'database-name' = 'mydb', + 'table-name' = 'orders' +); + +-- read snapshot and binlogs from orders table +Flink SQL> SELECT * FROM orders; +``` + +Connector Options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'tidb-cdc'.
    database-namerequired(none)StringDatabase name of the TiDB server to monitor.
    table-namerequired(none)StringTable name of the TiDB database to monitor.
    scan.startup.modeoptionalinitialStringOptional startup mode for TiDB CDC consumer, valid enumerations are "initial" and "latest-offset".
    pd-addressesrequired(none)StringTiKV cluster's PD address.
    tikv.grpc.timeout_in_msoptional(none)LongTiKV GRPC timeout in ms.
    tikv.grpc.scan_timeout_in_msoptional(none)LongTiKV GRPC scan timeout in ms.
    tikv.batch_get_concurrencyoptional20IntegerTiKV GRPC batch get concurrency.
    tikv.*optional(none)StringPass-through TiDB client's properties.
    +
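+As an illustration, the sketch below sets a few of these options on the `orders` table from the earlier example; the values shown are placeholders rather than recommendations.
+
+```sql
+CREATE TABLE orders_with_options (
+    order_id INT,
+    order_date TIMESTAMP(3),
+    customer_name STRING,
+    price DECIMAL(10, 5),
+    product_id INT,
+    order_status BOOLEAN,
+    PRIMARY KEY (order_id) NOT ENFORCED
+) WITH (
+    'connector' = 'tidb-cdc',
+    'pd-addresses' = 'localhost:2379',
+    'database-name' = 'mydb',
+    'table-name' = 'orders',
+    -- start from the current position instead of taking a snapshot first
+    'scan.startup.mode' = 'latest-offset',
+    'tikv.grpc.timeout_in_ms' = '20000',
+    'tikv.grpc.scan_timeout_in_ms' = '20000',
+    'tikv.batch_get_concurrency' = '20'
+);
+```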
    + +Available Metadata +---------------- + +The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition. + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDataTypeDescription
    table_nameSTRING NOT NULLName of the table that contains the row.
    database_nameSTRING NOT NULLName of the database that contains the row.
    op_tsTIMESTAMP_LTZ(3) NOT NULLIt indicates the time that the change was made in the database.
    If the record is read from snapshot of the table instead of the binlog, the value is always 0.
    + +The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields: +```sql +CREATE TABLE products ( + db_name STRING METADATA FROM 'database_name' VIRTUAL, + table_name STRING METADATA FROM 'table_name' VIRTUAL, + operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL, + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY(order_id) NOT ENFORCED +) WITH ( + 'connector' = 'tidb-cdc', + 'tikv.grpc.timeout_in_ms' = '20000', + 'pd-addresses' = 'localhost:2379', + 'database-name' = 'mydb', + 'table-name' = 'orders' +); +``` + +Features +-------- +### Exactly-Once Processing + +The TiDB CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change events with **exactly-once processing** even failures happen. + +### Startup Reading Position + +The config option `scan.startup.mode` specifies the startup mode for TiDB CDC consumer. The valid enumerations are: + +- `initial` (default): Takes a snapshot of structure and data of captured tables; useful if you want fetch a complete representation of the data from the captured tables. +- `latest-offset`: Takes a snapshot of the structure of captured tables only; useful if only changes happening from now onwards should be fetched. + +### Multi Thread Reading + +The TiDB CDC source can work in parallel reading, because there is multiple tasks can receive change events. + +### DataStream Source + +The TiDB CDC connector can also be a DataStream source. You can create a SourceFunction as the following shows: + +### DataStream Source + +```java +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.util.Collector; + +import org.apache.flink.cdc.connectors.tidb.TDBSourceOptions; +import org.apache.flink.cdc.connectors.tidb.TiDBSource; +import org.apache.flink.cdc.connectors.tidb.TiKVChangeEventDeserializationSchema; +import org.apache.flink.cdc.connectors.tidb.TiKVSnapshotEventDeserializationSchema; +import org.tikv.kvproto.Cdcpb; +import org.tikv.kvproto.Kvrpcpb; + +import java.util.HashMap; + +public class TiDBSourceExample { + + public static void main(String[] args) throws Exception { + + SourceFunction tidbSource = + TiDBSource.builder() + .database("mydb") // set captured database + .tableName("products") // set captured table + .tiConf( + TDBSourceOptions.getTiConfiguration( + "localhost:2399", new HashMap<>())) + .snapshotEventDeserializer( + new TiKVSnapshotEventDeserializationSchema() { + @Override + public void deserialize( + Kvrpcpb.KvPair record, Collector out) + throws Exception { + out.collect(record.toString()); + } + + @Override + public TypeInformation getProducedType() { + return BasicTypeInfo.STRING_TYPE_INFO; + } + }) + .changeEventDeserializer( + new TiKVChangeEventDeserializationSchema() { + @Override + public void deserialize( + Cdcpb.Event.Row record, Collector out) + throws Exception { + out.collect(record.toString()); + } + + @Override + public TypeInformation getProducedType() { + return BasicTypeInfo.STRING_TYPE_INFO; + } + }) + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + // enable checkpoint + env.enableCheckpointing(3000); + 
env.addSource(tidbSource).print().setParallelism(1); + + env.execute("Print TiDB Snapshot + Binlog"); + } +} +``` + +Data Type Mapping +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    TiDB typeFlink SQL typeNOTE
    TINYINTTINYINT
    + SMALLINT
    + TINYINT UNSIGNED
    SMALLINT
    + INT
    + MEDIUMINT
    + SMALLINT UNSIGNED
    INT
    + BIGINT
    + INT UNSIGNED
    BIGINT
    BIGINT UNSIGNEDDECIMAL(20, 0)
    + FLOAT
    +
    FLOAT
    + REAL
    + DOUBLE +
    DOUBLE
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    + where p <= 38
    +
    DECIMAL(p, s)
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    + where 38 < p <= 65
    +
    STRINGThe precision for DECIMAL data type is up to 65 in TiDB, but the precision for DECIMAL is limited to 38 in Flink. + So if you define a decimal column whose precision is greater than 38, you should map it to STRING to avoid precision loss.
    + BOOLEAN
    + TINYINT(1)
    + BIT(1) +
    BOOLEAN
    DATEDATE
    TIME [(p)]TIME [(p)]
    TIMESTAMP [(p)]TIMESTAMP_LTZ [(p)]
    DATETIME [(p)]TIMESTAMP [(p)] +
    + CHAR(n) + CHAR(n)
    + VARCHAR(n) + VARCHAR(n)
    + BIT(n) + BINARY(⌈n/8⌉)
    + BINARY(n) + BINARY(n)
    + TINYTEXT
    + TEXT
    + MEDIUMTEXT
    + LONGTEXT
    +
    STRING
    + TINYBLOB
    + BLOB
    + MEDIUMBLOB
    + LONGBLOB
    +
    BYTESCurrently, for BLOB data type in TiDB, only the blob whose length isn't greater than 2,147,483,647(2 ** 31 - 1) is supported.
    + YEAR + INT
    + ENUM + STRING
    + JSON + STRINGThe JSON data type will be converted into STRING with JSON format in Flink.
    + SET + ARRAY<STRING>As the SET data type in TiDB is a string object that can have zero or more values, + it should always be mapped to an array of string +
    +
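+To illustrate the DECIMAL note above, a hypothetical TiDB column with precision greater than 38 would be declared as STRING on the Flink side (table and column names are placeholders):
+
+```sql
+-- Hypothetical TiDB source table:
+--   CREATE TABLE mydb.big_amounts (
+--       id     INT PRIMARY KEY,
+--       amount DECIMAL(50, 10)  -- precision 50 exceeds Flink's DECIMAL limit of 38
+--   );
+
+CREATE TABLE big_amounts (
+    id INT,
+    amount STRING,  -- mapped to STRING to avoid precision loss
+    PRIMARY KEY (id) NOT ENFORCED
+) WITH (
+    'connector' = 'tidb-cdc',
+    'pd-addresses' = 'localhost:2379',
+    'database-name' = 'mydb',
+    'table-name' = 'big_amounts'
+);
+```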
    + +{{< top >}} diff --git a/docs/content/docs/connectors/cdc-connectors/vitess-cdc.md b/docs/content/docs/connectors/cdc-connectors/vitess-cdc.md new file mode 100644 index 0000000000..c722adc975 --- /dev/null +++ b/docs/content/docs/connectors/cdc-connectors/vitess-cdc.md @@ -0,0 +1,329 @@ +--- +title: "Vitess CDC Connector" +weight: 10 +type: docs +aliases: +- /connectors/cdc-connectors/vitess-cdc.html +--- + + +# Vitess CDC Connector + +The Vitess CDC connector allows for reading of incremental data from Vitess cluster. The connector does not support snapshot feature at the moment. This document describes how to setup the Vitess CDC connector to run SQL queries against Vitess databases. +[Vitess debezium documentation](https://debezium.io/documentation/reference/connectors/vitess.html) + +Dependencies +------------ + +In order to setup the Vitess CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles. + +### Maven dependency + +{{< artifact flink-connector-vitess-cdc >}} + +### SQL Client JAR + +Download [flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-vitess-cdc/3.0-SNAPSHOT/flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar) and put it under `/lib/`. + +Setup Vitess server +---------------- + +You can follow the Local Install via [Docker guide](https://vitess.io/docs/get-started/local-docker/), or the Vitess Operator for [Kubernetes guide](https://vitess.io/docs/get-started/operator/) to install Vitess. No special setup is needed to support Vitess connector. + +### Checklist +* Make sure that the VTGate host and its gRPC port (default is 15991) is accessible from the machine where the Vitess connector is installed + +### gRPC authentication +Because Vitess connector reads change events from the VTGate VStream gRPC server, it does not need to connect directly to MySQL instances. +Therefore, no special database user and permissions are needed. At the moment, Vitess connector only supports unauthenticated access to the VTGate gRPC server. + +How to create a Vitess CDC table +---------------- + +The Vitess CDC table can be defined as following: + +```sql +-- checkpoint every 3000 milliseconds +Flink SQL> SET 'execution.checkpointing.interval' = '3s'; + +-- register a Vitess table 'orders' in Flink SQL +Flink SQL> CREATE TABLE orders ( + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY(order_id) NOT ENFORCED + ) WITH ( + 'connector' = 'vitess-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'keyspace' = 'mydb', + 'table-name' = 'orders'); + +-- read snapshot and binlogs from orders table +Flink SQL> SELECT * FROM orders; +``` + +Connector Options +---------------- + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    connectorrequired(none)StringSpecify what connector to use, here should be 'vitess-cdc'.
    hostnamerequired(none)StringIP address or hostname of the Vitess database server (VTGate).
    keyspacerequired(none)StringThe name of the keyspace from which to stream the changes.
    usernameoptional(none)StringAn optional username of the Vitess database server (VTGate). If not configured, unauthenticated VTGate gRPC is used.
    passwordoptional(none)StringAn optional password of the Vitess database server (VTGate). If not configured, unauthenticated VTGate gRPC is used.
    shardoptional(none)StringAn optional name of the shard from which to stream the changes. If not configured, in case of unsharded keyspace, the connector streams changes from the only shard, in case of sharded keyspace, the connector streams changes from all shards in the keyspace.
    gtidoptionalcurrentStringAn optional GTID position for a shard to stream from.
    stopOnReshardoptionalfalseBooleanControls Vitess flag stop_on_reshard.
    tombstonesOnDeleteoptionaltrueBooleanControls whether a delete event is followed by a tombstone event.
    schemaNameAdjustmentModeoptionalavroStringSpecifies how schema names should be adjusted for compatibility with the message converter used by the connector.
    table-namerequired(none)StringTable name of the MySQL database to monitor.
    tablet.typeoptionalRDONLYStringThe type of Tablet (hence MySQL instance) from which to stream the changes: MASTER streams from the master MySQL instance, REPLICA streams from a replica MySQL instance, and RDONLY streams from a read-only replica MySQL instance.
    +
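+As an illustration, the sketch below extends the earlier `orders` example with shard- and tablet-related options; the shard name and other values are placeholders.
+
+```sql
+CREATE TABLE orders_sharded (
+    order_id INT,
+    order_date TIMESTAMP(0),
+    customer_name STRING,
+    price DECIMAL(10, 5),
+    product_id INT,
+    order_status BOOLEAN,
+    PRIMARY KEY (order_id) NOT ENFORCED
+) WITH (
+    'connector' = 'vitess-cdc',
+    'hostname' = 'localhost',
+    -- VTGate gRPC port (default 15991), see the checklist above
+    'port' = '15991',
+    'keyspace' = 'mydb',
+    'table-name' = 'orders',
+    -- stream only a specific shard; omit to stream all shards of the keyspace
+    'shard' = '-80',
+    'gtid' = 'current',
+    'tablet.type' = 'MASTER'
+);
+```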
    + +Features +-------- + +### Incremental Reading + +The Vitess connector spends all its time streaming changes from the VTGate’s VStream gRPC service to which it is subscribed. The client receives changes from VStream as they are committed in the underlying MySQL server’s binlog at certain positions, which are referred to as VGTID. + +The VGTID in Vitess is the equivalent of GTID in MySQL, it describes the position in the VStream in which a change event happens. Typically, A VGTID has multiple shard GTIDs, each shard GTID is a tuple of (Keyspace, Shard, GTID), which describes the GTID position of a given shard. + +When subscribing to a VStream service, the connector needs to provide a VGTID and a Tablet Type (e.g. MASTER, REPLICA). The VGTID describes the position from which VStream should starts sending change events; the Tablet type describes which underlying MySQL instance (master or replica) in each shard do we read change events from. + +The first time the connector connects to a Vitess cluster, it gets and provides the current VGTID to VStream. + +The Debezium Vitess connector acts as a gRPC client of VStream. When the connector receives changes it transforms the events into Debezium create, update, or delete events that include the VGTID of the event. The Vitess connector forwards these change events in records to the Kafka Connect framework, which is running in the same process. The Kafka Connect process asynchronously writes the change event records in the same order in which they were generated to the appropriate Kafka topic. + +#### Checkpoint + +Incremental snapshot reading provides the ability to perform checkpoint in chunk level. It resolves the checkpoint timeout problem in previous version with old snapshot reading mechanism. + +### Exactly-Once Processing + +The Vitess CDC connector is a Flink Source connector which will read table snapshot chunks first and then continues to read binlog, +both snapshot phase and binlog phase, Vitess CDC connector read with **exactly-once processing** even failures happen. + +### DataStream Source + +The Incremental Reading feature of Vitess CDC Source only exposes in SQL currently, if you're using DataStream, please use Vitess Source: + +```java +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema; +import org.apache.flink.cdc.connectors.vitess.VitessSource; + +public class VitessSourceExample { + public static void main(String[] args) throws Exception { + SourceFunction sourceFunction = VitessSource.builder() + .hostname("localhost") + .port(15991) + .keyspace("inventory") + .username("flinkuser") + .password("flinkpw") + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String + .build(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env + .addSource(sourceFunction) + .print().setParallelism(1); // use parallelism 1 for sink to keep message ordering + + env.execute(); + } +} +``` + +Data Type Mapping +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    MySQL typeFlink SQL type
    TINYINTTINYINT
    + SMALLINT
    + TINYINT UNSIGNED
    SMALLINT
    + INT
    + MEDIUMINT
    + SMALLINT UNSIGNED
    INT
    + BIGINT
    + INT UNSIGNED
    BIGINT
    BIGINT UNSIGNEDDECIMAL(20, 0)
    BIGINTBIGINT
    FLOATFLOAT
    + DOUBLE
    + DOUBLE PRECISION
    DOUBLE
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    DECIMAL(p, s)
    + BOOLEAN
    + TINYINT(1)
    BOOLEAN
    + CHAR(n)
    + VARCHAR(n)
    + TEXT
    STRING
    +
    + +{{< top >}} diff --git a/docs/content/docs/connectors/pipeline-connectors/_index.md b/docs/content/docs/connectors/pipeline-connectors/_index.md new file mode 100644 index 0000000000..3e611a21bf --- /dev/null +++ b/docs/content/docs/connectors/pipeline-connectors/_index.md @@ -0,0 +1,23 @@ +--- +title: Pipeline Connectors +bookCollapseSection: true +weight: 1 +--- + diff --git a/docs/content/docs/connectors/pipeline-connectors/doris-pipeline.md b/docs/content/docs/connectors/pipeline-connectors/doris-pipeline.md new file mode 100644 index 0000000000..94b788636f --- /dev/null +++ b/docs/content/docs/connectors/pipeline-connectors/doris-pipeline.md @@ -0,0 +1,287 @@ +--- +title: "Doris Pipeline Connector" +weight: 2 +type: docs +aliases: +- /pipelines/doris-pipeline.html +--- + + +# Doris Pipeline Connector + +This article introduces of Doris Pipeline Connector + + +## Example +---------------- + +```yaml +source: + type: values + name: ValuesSource + +sink: + type: doris + name: Doris Sink + fenodes: 127.0.0.1:8030 + username: root + password: "" + table.create.properties.replication_num: 1 + +pipeline: + parallelism: 1 + +``` + +## Pipeline options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    typerequired(none)StringSpecify the Sink to use, here is 'doris'.
    nameoptional(none)String The name of the pipeline.
    fenodesrequired(none)StringHttp address of Doris cluster FE, such as 127.0.0.1:8030
    benodesoptional(none)StringHttp address of Doris cluster BE, such as 127.0.0.1:8040
    jdbc-urloptional(none)StringJDBC address of Doris cluster, for example: jdbc:mysql://127.0.0.1:9030/db
    usernamerequired(none)StringUsername of Doris cluster
    passwordoptional(none)StringPassword for Doris cluster
    auto-redirectoptionalfalseString Whether to write through FE redirection and directly connect to BE to write
    sink.enable.batch-modeoptionaltrueBoolean Whether to use the batch method to write to Doris
    sink.flush.queue-sizeoptional2Integer Queue size for batch writing +
    sink.buffer-flush.max-rowsoptional50000IntegerMaximum number of Flush records in a single batch
    sink.buffer-flush.max-bytesoptional10485760(10MB)IntegerMaximum number of bytes flushed in a single batch
    sink.buffer-flush.intervaloptional10sStringFlush interval duration. If this time is exceeded, the data will be flushed asynchronously
    sink.properties.optional(none)String Parameters of StreamLoad. + For example: sink.properties.strict_mode: true. + See more about StreamLoad Properties properties
    table.create.properties.*optional(none)StringCreate the Properties configuration of the table. + For example: table.create.properties.replication_num: 1. + See more about Doris Table Properties properties
    +
    +## Data Type Mapping + +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CDC typeDoris typeNOTE
    TINYINTTINYINT
    SMALLINTSMALLINT
    INTINT
    BIGINTBIGINT
    DECIMALDECIMAL
    FLOATFLOAT
    DOUBLEDOUBLE
    BOOLEANBOOLEAN
    DATEDATE
    TIMESTAMP [(p)]DATETIME [(p)]
    TIMESTAMP_LTZ [(p)] + DATETIME [(p)] +
    CHAR(n)CHAR(n*3)In Doris, strings are stored in UTF-8 encoding, so English characters occupy 1 byte and Chinese characters occupy 3 bytes. The length here is multiplied by 3. The maximum length of CHAR is 255. Once exceeded, it will automatically be converted to VARCHAR type.
    VARCHAR(n)VARCHAR(n*3)Same as above. The length here is multiplied by 3. The maximum length of VARCHAR is 65533. Once exceeded, it will automatically be converted to STRING type.
    + BINARY(n) + STRING
    + VARBINARY(N) + STRING
    STRINGSTRING
    +
    + +{{< top >}} diff --git a/docs/content/pipelines/mysql-pipeline(ZH).md b/docs/content/docs/connectors/pipeline-connectors/mysql-pipeline.md similarity index 58% rename from docs/content/pipelines/mysql-pipeline(ZH).md rename to docs/content/docs/connectors/pipeline-connectors/mysql-pipeline.md index 0b244ff85d..4da4a24892 100644 --- a/docs/content/pipelines/mysql-pipeline(ZH).md +++ b/docs/content/docs/connectors/pipeline-connectors/mysql-pipeline.md @@ -1,3 +1,10 @@ +--- +title: "MySQL Pipeline Connector" +weight: 3 +type: docs +aliases: +- /pipelines/mysql-pipeline.html +--- -# MySQL CDC Pipeline 连接器 +# MySQL CDC Pipeline Connector -MySQL CDC Pipeline 连接器允许从 MySQL 数据库读取快照数据和增量数据,并提供端到端的整库数据同步能力。 -本文描述了如何设置 MySQL CDC Pipeline 连接器。 +The MySQL CDC Pipeline Connector allows for reading snapshot data and incremental data from MySQL database and provides end-to-end full-database data synchronization capabilities. +This document describes how to setup the MySQL CDC Pipeline connector. -如何创建 Pipeline +How to create Pipeline ---------------- -从 MySQL 读取数据同步到 Doris 的 Pipeline 可以定义如下: +The pipeline for reading data from MySQL and sink to Doris can be defined as follows: ```yaml source: @@ -51,7 +58,7 @@ pipeline: parallelism: 4 ``` -Pipeline 连接器选项 +Pipeline Connector Options ----------------
    @@ -71,181 +78,188 @@ Pipeline 连接器选项 required (none) String - MySQL 数据库服务器的 IP 地址或主机名。 + IP address or hostname of the MySQL database server. port optional 3306 Integer - MySQL 数据库服务器的整数端口号。 + Integer port number of the MySQL database server. username required (none) String - 连接到 MySQL 数据库服务器时要使用的 MySQL 用户的名称。 + Name of the MySQL database to use when connecting to the MySQL database server. password required (none) String - 连接 MySQL 数据库服务器时使用的密码。 + Password to use when connecting to the MySQL database server. tables required (none) String - 需要监视的 MySQL 数据库的表名。表名支持正则表达式,以监视满足正则表达式的多个表。
    - 需要注意的是,点号(.)被视为数据库和表名的分隔符。 如果需要在正则表达式中使用点(.)来匹配任何字符,必须使用反斜杠对点进行转义。
    - 例如,db0.\.*, db1.user_table_[0-9]+, db[1-2].[app|web]order_\.* + Table name of the MySQL database to monitor. The table-name also supports regular expressions to monitor multiple tables that satisfy the regular expressions.
    + It is important to note that the dot (.) is treated as a delimiter for database and table names. + If there is a need to use a dot (.) in a regular expression to match any character, it is necessary to escape the dot with a backslash.
    + eg. db0.\.*, db1.user_table_[0-9]+, db[1-2].[app|web]order_\.* schema-change.enabled optional true Boolean - 是否发送模式更改事件,下游 sink 可以响应模式变更事件实现表结构同步,默认为true。 + Whether to send schema change events, so that downstream sinks can respond to schema changes and achieve table structure synchronization. server-id optional (none) String - 读取数据使用的 server id,server id 可以是个整数或者一个整数范围,比如 '5400' 或 '5400-5408', - 建议在 'scan.incremental.snapshot.enabled' 参数为启用时,配置成整数范围。因为在当前 MySQL 集群中运行的所有 slave 节点,标记每个 salve 节点的 id 都必须是唯一的。 所以当连接器加入 MySQL 集群作为另一个 slave 节点(并且具有唯一 id 的情况下),它就可以读取 binlog。 默认情况下,连接器会在 5400 和 6400 之间生成一个随机数,但是我们建议用户明确指定 Server id。 - + A numeric ID or a numeric ID range of this database client, The numeric ID syntax is like '5400', + the numeric ID range syntax is like '5400-5408', The numeric ID range syntax is recommended when 'scan.incremental.snapshot.enabled' enabled. + Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the MySQL cluster + as another server (with this unique ID) so it can read the binlog. By default, a random number is generated between 5400 and 6400, + though we recommend setting an explicit value. scan.incremental.snapshot.chunk.size optional 8096 Integer - 表快照的块大小(行数),读取表的快照时,捕获的表被拆分为多个块。 + The chunk size (number of rows) of table snapshot, captured tables are split into multiple chunks when read the snapshot of table. scan.snapshot.fetch.size optional 1024 Integer - 读取表快照时每次读取数据的最大条数。 + The maximum fetch size for per poll when read table snapshot. scan.startup.mode optional initial String - MySQL CDC 消费者可选的启动模式, - 合法的模式为 "initial","earliest-offset","latest-offset","specific-offset" 和 "timestamp"。 - 请查阅 启动模式 章节了解更多详细信息。 + Optional startup mode for MySQL CDC consumer, valid enumerations are "initial", "earliest-offset", "latest-offset", "specific-offset" and "timestamp". + Please see Startup Reading Position section for more detailed information. scan.startup.specific-offset.file optional (none) String - 在 "specific-offset" 启动模式下,启动位点的 binlog 文件名。 + Optional binlog file name used in case of "specific-offset" startup mode scan.startup.specific-offset.pos optional (none) Long - 在 "specific-offset" 启动模式下,启动位点的 binlog 文件位置。 + Optional binlog file position used in case of "specific-offset" startup mode scan.startup.specific-offset.gtid-set optional (none) String - 在 "specific-offset" 启动模式下,启动位点的 GTID 集合。 + Optional GTID set used in case of "specific-offset" startup mode scan.startup.specific-offset.skip-events optional (none) Long - 在指定的启动位点后需要跳过的事件数量。 + Optional number of events to skip after the specific starting offset scan.startup.specific-offset.skip-rows optional (none) Long - 在指定的启动位点后需要跳过的数据行数量。 + Optional number of rows to skip after the specific starting offset - connect.timeout - optional - 30s - Duration - 连接器在尝试连接到 MySQL 数据库服务器后超时前应等待的最长时间。 + connect.timeout + optional + 30s + Duration + The maximum time that the connector should wait after trying to connect to the MySQL database server before timing out. - connect.max-retries - optional - 3 - Integer - 连接器应重试以建立 MySQL 数据库服务器连接的最大重试次数。 + connect.max-retries + optional + 3 + Integer + The max retry times that the connector should retry to build MySQL database server connection. - connection.pool.size - optional - 20 - Integer - 连接池大小。 + connection.pool.size + optional + 20 + Integer + The connection pool size. - jdbc.properties.* - optional - 20 - String - 传递自定义 JDBC URL 属性的选项。用户可以传递自定义属性,如 'jdbc.properties.useSSL' = 'false'. 
+ jdbc.properties.* + optional + 20 + String + Option to pass custom JDBC URL properties. User can pass custom properties like 'jdbc.properties.useSSL' = 'false'. - heartbeat.interval - optional - 30s - Duration - 用于跟踪最新可用 binlog 偏移的发送心跳事件的间隔。 + heartbeat.interval + optional + 30s + Duration + The interval of sending heartbeat event for tracing the latest available binlog offsets. debezium.* optional (none) String - 将 Debezium 的属性传递给 Debezium 嵌入式引擎,该引擎用于从 MySQL 服务器捕获数据更改。 - 例如: 'debezium.snapshot.mode' = 'never'. - 查看更多关于 Debezium 的 MySQL 连接器属性 + Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from MySQL server. + For example: 'debezium.snapshot.mode' = 'never'. + See more about the Debezium's MySQL Connector properties scan.incremental.close-idle-reader.enabled optional false Boolean - 是否在快照结束后关闭空闲的 Reader。 此特性需要 flink 版本大于等于 1.14 并且 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 需要设置为 true。
    - 若 flink 版本大于等于 1.15,'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 默认值变更为 true,可以不用显式配置 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = true。 + Whether to close idle readers at the end of the snapshot phase.
    + The Flink version must be 1.14 or newer when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.
    + If the Flink version is 1.15 or newer, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is already true, + so there is no need to explicitly configure 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true'. +
    - -启动模式 +Startup Reading Position -------- -配置选项```scan.startup.mode```指定 MySQL CDC 使用者的启动模式。有效枚举包括: +The config option `scan.startup.mode` specifies the startup mode for MySQL CDC consumer. The valid enumerations are: -- `initial` (默认):在第一次启动时对受监视的数据库表执行初始快照,并继续读取最新的 binlog。 -- `earliest-offset`:跳过快照阶段,从可读取的最早 binlog 位点开始读取 -- `latest-offset`:首次启动时,从不对受监视的数据库表执行快照, 连接器仅从 binlog 的结尾处开始读取,这意味着连接器只能读取在连接器启动之后的数据更改。 -- `specific-offset`:跳过快照阶段,从指定的 binlog 位点开始读取。位点可通过 binlog 文件名和位置指定,或者在 GTID 在集群上启用时通过 GTID 集合指定。 -- `timestamp`:跳过快照阶段,从指定的时间戳开始读取 binlog 事件。 +- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest binlog. +- `earliest-offset`: Skip snapshot phase and start reading binlog events from the earliest accessible binlog offset. +- `latest-offset`: Never to perform snapshot on the monitored database tables upon first startup, just read from + the end of the binlog which means only have the changes since the connector was started. +- `specific-offset`: Skip snapshot phase and start reading binlog events from a specific offset. The offset could be + specified with binlog filename and position, or a GTID set if GTID is enabled on server. +- `timestamp`: Skip snapshot phase and start reading binlog events from a specific timestamp. -数据类型映射 + +Data Type Mapping ----------------
    @@ -357,7 +371,8 @@ Pipeline 连接器选项 where 38 < p <= 65
    STRING - 在 MySQL 中,十进制数据类型的精度高达 65,但在 Flink 中,十进制数据类型的精度仅限于 38。所以,如果定义精度大于 38 的十进制列,则应将其映射到字符串以避免精度损失。 + The precision for DECIMAL data type is up to 65 in MySQL, but the precision for DECIMAL is limited to 38 in Flink. + So if you define a decimal column whose precision is greater than 38, you should map it to STRING to avoid precision loss. @@ -445,7 +460,7 @@ Pipeline 连接器选项 LONGBLOB
    BYTES - 目前,对于 MySQL 中的 BLOB 数据类型,仅支持长度不大于 2147483647(2**31-1)的 blob。 + Currently, for BLOB data type in MySQL, only the blob whose length isn't greater than 2,147,483,647(2 ** 31 - 1) is supported. @@ -459,14 +474,14 @@ Pipeline 连接器选项 JSON STRING - JSON 数据类型将在 Flink 中转换为 JSON 格式的字符串。 + The JSON data type will be converted into STRING with JSON format in Flink. SET - - 暂不支持 + Not supported yet. @@ -483,35 +498,34 @@ Pipeline 连接器选项 STRING - MySQL 中的空间数据类型将转换为具有固定 Json 格式的字符串。 - 请参考 MySQL 空间数据类型映射 章节了解更多详细信息。 + The spatial data types in MySQL will be converted into STRING with a fixed Json format. + Please see MySQL Spatial Data Types Mapping section for more detailed information.
    -### 空间数据类型映射 - -MySQL中除`GEOMETRYCOLLECTION`之外的空间数据类型都会转换为 Json 字符串,格式固定,如:
    +### MySQL Spatial Data Types Mapping +The spatial data types except for `GEOMETRYCOLLECTION` in MySQL will be converted into Json String with a fixed format like:
    ```json {"srid": 0 , "type": "xxx", "coordinates": [0, 0]} ``` -字段`srid`标识定义几何体的 SRS,如果未指定 SRID,则 SRID 0 是新几何体值的默认值。 -由于 MySQL 8+ 在定义空间数据类型时只支持特定的 SRID,因此在版本较低的MySQL中,字段`srid`将始终为 0。 +The field `srid` identifies the SRS in which the geometry is defined, SRID 0 is the default for new geometry values if no SRID is specified. +As only MySQL 8+ support to specific SRID when define spatial data type, the field `srid` will always be 0 in MySQL with a lower version. -字段`type`标识空间数据类型,例如`POINT`/`LINESTRING`/`POLYGON`。 +The field `type` identifies the spatial data type, such as `POINT`/`LINESTRING`/`POLYGON`. -字段`coordinates`表示空间数据的`坐标`。 +The field `coordinates` represents the `coordinates` of the spatial data. -对于`GEOMETRYCOLLECTION`,它将转换为 Json 字符串,格式固定,如:
    +For `GEOMETRYCOLLECTION`, it will be converted into Json String with a fixed format like:
    ```json {"srid": 0 , "type": "GeometryCollection", "geometries": [{"type":"Point","coordinates":[10,10]}]} ``` -`Geometrics`字段是一个包含所有空间数据的数组。 +The field `geometries` is an array contains all spatial data. -不同空间数据类型映射的示例如下: +The example for different spatial data types mapping is as follows:
    @@ -553,7 +567,4 @@ MySQL中除`GEOMETRYCOLLECTION`之外的空间数据类型都会转换为 Json
    -常见问题 --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) +{{< top >}} diff --git a/docs/content/docs/connectors/pipeline-connectors/overview.md b/docs/content/docs/connectors/pipeline-connectors/overview.md new file mode 100644 index 0000000000..877501b19a --- /dev/null +++ b/docs/content/docs/connectors/pipeline-connectors/overview.md @@ -0,0 +1,44 @@ +--- +title: "Overview" +weight: 1 +type: docs +aliases: + - /connectors/pipeline-connectors/ +--- + + +# Pipeline Connectors Of CDC Streaming ELT Framework + +## Supported Connectors + +| Connector | Database | +|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [doris-pipeline](doris-pipeline.md) |
  • [Doris](https://doris.apache.org/): 1.2.x, 2.x.x | +| [mysql-pipeline](mysql-pipeline.md) |
  • [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x
  • [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x
  • [MariaDB](https://mariadb.org): 10.x
  • [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | +| [starrocks-pipeline](starrocks-pipeline.md) |
  • [StarRocks](https://www.starrocks.io/): 2.x, 3.x | + +## Supported Flink Versions +The following table shows the version mapping between Flink® CDC Pipeline and Flink®: + +| Flink® CDC Version | Flink® Version | +|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| 3.0.* | 1.14.\*, 1.15.\*, 1.16.\*, 1.17.\*, 1.18.\* | + +{{< top >}} diff --git a/docs/content/pipelines/starrocks-pipeline(ZH).md b/docs/content/docs/connectors/pipeline-connectors/starrocks-pipeline.md similarity index 50% rename from docs/content/pipelines/starrocks-pipeline(ZH).md rename to docs/content/docs/connectors/pipeline-connectors/starrocks-pipeline.md index a42d168367..87eb72aef0 100644 --- a/docs/content/pipelines/starrocks-pipeline(ZH).md +++ b/docs/content/docs/connectors/pipeline-connectors/starrocks-pipeline.md @@ -1,3 +1,10 @@ +--- +title: "StarRocks Pipeline Connector" +weight: 4 +type: docs +aliases: +- /pipelines/starrocks-pipeline.html +--- -# StarRocks Pipeline 连接器 +# StarRocks Pipeline Connector -StarRocks Pipeline 连接器可以用作 Pipeline 的 *Data Sink*,将数据写入[StarRocks](https://github.com/StarRocks/starrocks)。 本文档介绍如何设置 StarRocks Pipeline 连接器。 +The StarRocks Pipeline connector can be used as the *Data Sink* of the pipeline, and write data to [StarRocks](https://github.com/StarRocks/starrocks). This document describes how to set up the StarRocks Pipeline connector. -## 连接器的功能 -* 自动建表 -* 表结构变更同步 -* 数据实时同步 +## What can the connector do? +* Create table automatically if not exist +* Schema change synchronization +* Data synchronization -如何创建 Pipeline +How to create Pipeline ---------------- -从 MySQL 读取数据同步到 StarRocks 的 Pipeline 可以定义如下: +The pipeline for reading data from MySQL and sink to StarRocks can be defined as follows: ```yaml source: @@ -55,7 +62,7 @@ pipeline: parallelism: 2 ``` -Pipeline 连接器配置项 +Pipeline Connector Options ----------------
    @@ -74,162 +81,170 @@ Pipeline 连接器配置项 - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
    required (none) String指定要使用的连接器, 这里需要设置成 'starrocks'.Specify what connector to use, here should be 'starrocks'.
    name optional (none) StringSink 的名称.The name of the sink.
    jdbc-url required (none) String用于访问 FE 节点上的 MySQL 服务器。多个地址用英文逗号(,)分隔。格式:`jdbc:mysql://fe_host1:fe_query_port1,fe_host2:fe_query_port2`。The address that is used to connect to the MySQL server of the FE. You can specify multiple addresses, which must be separated by a comma (,). Format: `jdbc:mysql://fe_host1:fe_query_port1,fe_host2:fe_query_port2,fe_host3:fe_query_port3`.
    load-url required (none) String用于访问 FE 节点上的 HTTP 服务器。多个地址用英文分号(;)分隔。格式:`fe_host1:fe_http_port1;fe_host2:fe_http_port2`。The address that is used to connect to the HTTP server of the FE. You can specify multiple addresses, which must be separated by a semicolon (;). Format: `fe_host1:fe_http_port1;fe_host2:fe_http_port2`.
    username required (none) StringStarRocks 集群的用户名。User name to use when connecting to the StarRocks database.
    password required (none) StringStarRocks 集群的用户密码。Password to use when connecting to the StarRocks database.
    sink.label-prefix optional (none) String指定 Stream Load 使用的 label 前缀。The label prefix used by Stream Load.
    sink.connect.timeout-ms optional 30000 String与 FE 建立 HTTP 连接的超时时间。取值范围:[100, 60000]。The timeout for establishing HTTP connection. Valid values: 100 to 60000.
    sink.wait-for-continue.timeout-ms optional 30000 String等待 FE HTTP 100-continue 应答的超时时间。取值范围:[3000, 60000]。Timeout in millisecond to wait for 100-continue response from FE http server. + Valid values: 3000 to 600000.
    sink.buffer-flush.max-bytes optional 157286400 Long内存中缓冲的数据量大小,缓冲区由所有导入的表共享,达到阈值后将选择一个或多个表的数据写入到StarRocks。 - 达到阈值后取值范围:[64MB, 10GB]。The maximum size of data that can be accumulated in memory before being sent to StarRocks at a time. + The value ranges from 64 MB to 10 GB. This buffer is shared by all tables in the sink. If the buffer + is full, the connector will choose one or more tables to flush.
    sink.buffer-flush.interval-ms optional 300000 Long每个表缓冲数据发送的间隔,用于控制数据写入 StarRocks 的延迟。单位是毫秒,取值范围:[1000, 3600000]。The interval at which data is flushed for each table. The unit is in millisecond.
    sink.scan-frequency.ms optional 50 Long连接器会定期检查每个表是否到达发送间隔,该配置控制检查频率,单位为毫秒。Scan frequency in milliseconds to check whether the buffered data for a table should be flushed + because of reaching the flush interval.
    sink.io.thread-count optional 2 Integer用来执行 Stream Load 的线程数,不同表之间的导入可以并发执行。Number of threads used for concurrent stream loads among different tables.
    sink.at-least-once.use-transaction-stream-load optional true Booleanat-least-once 下是否使用 transaction stream load。Whether to use transaction stream load for at-least-once when it's available.
    sink.properties.* optional (none) StringStream Load 的参数,控制 Stream Load 导入行为。例如 参数 `sink.properties.timeout` 用来控制导入的超时时间。 - 全部参数和解释请参考 - STREAM LOADThe parameters that control Stream Load behavior. For example, the parameter `sink.properties.timeout` + specifies the timeout of Stream Load. For a list of supported parameters and their descriptions, + see + STREAM LOAD.
    table.create.num-buckets optional (none) Integer自动创建 StarRocks 表时使用的桶数。对于 StarRocks 2.5 及之后的版本可以不设置,StarRocks 将会 - - 自动设置分桶数量;对于 StarRocks 2.5 之前的版本必须设置。Number of buckets when creating a StarRocks table automatically. For StarRocks 2.5 or later, it's not required + to set the option because StarRocks can + + determine the number of buckets automatically. For StarRocks prior to 2.5, you must set this option.
    table.create.properties.* optional (none) String自动创建 StarRocks 表时使用的属性。比如: 如果使用 StarRocks 3.2 及之后的版本,'table.create.properties.fast_schema_evolution' = 'true' - 将会打开 fast schema evolution 功能。 更多信息请参考 - 主键模型Properties used for creating a StarRocks table. For example: 'table.create.properties.fast_schema_evolution' = 'true' + will enable fast schema evolution if you are using StarRocks 3.2 or later. For more information, + see how to create a primary key table.
    table.schema-change.timeout optional 30min DurationStarRocks 侧执行 schema change 的超时时间,必须是秒的整数倍。超时后 StarRocks 将会取消 schema change,从而导致作业失败。Timeout for a schema change on StarRocks side, and must be an integral multiple of + seconds. StarRocks will cancel the schema change after timeout which will + cause the sink failure.
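To make the options above concrete, here is a minimal sketch of a StarRocks `sink` block in a pipeline definition; the host names, ports, and credentials are illustrative placeholders, and only a few of the optional settings listed above are shown:

```yaml
sink:
  type: starrocks
  name: StarRocks Sink
  jdbc-url: jdbc:mysql://127.0.0.1:9030
  load-url: 127.0.0.1:8030
  username: root
  password: pass
  # flush buffered data for each table every 5 seconds
  sink.buffer-flush.interval-ms: 5000
  # required for StarRocks versions earlier than 2.5 when tables are created automatically
  table.create.num-buckets: 8
```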
    -使用说明 +Usage Notes -------- -* 只支持主键表,因此源表必须有主键 +* Only support StarRocks primary key table, so the source table must have primary keys. -* 暂不支持 exactly-once,连接器 通过 at-least-once 和主键表实现幂等写 +* Not support exactly-once. The connector uses at-least-once + primary key table for idempotent writing. -* 对于自动建表 - * 分桶键和主键相同 - * 没有分区键 - * 分桶数由 `table.create.num-buckets` 控制。如果使用的 StarRocks 2.5 及之后的版本可以不设置,StarRocks 能够 - - 自动设置分桶数量。对于 StarRocks 2.5 之前的版本必须设置,否则无法自动创建表。 +* For creating table automatically + * the distribution keys are the same as the primary keys + * there is no partition key + * the number of buckets is controlled by `table.create.num-buckets`. If you are using StarRocks 2.5 or later, + it's not required to set the option because StarRocks can [determine the number of buckets automatically](https://docs.starrocks.io/docs/table_design/Data_distribution/#determine-the-number-of-buckets), + otherwise you must set the option. -* 对于表结构变更同步 - * 只支持增删列 - * 新增列只能添加到最后一列 - * 如果使用 StarRocks 3.2 及之后版本,并且通过连接器来自动建表, 可以通过配置 `table.create.properties.fast_schema_evolution` 为 `true` - 来加速 StarRocks 执行变更。 +* For schema change synchronization + * only supports add/drop columns + * the new column will always be added to the last position + * if your StarRocks version is 3.2 or later, and using the connector to create table automatically, + you can set `table.create.properties.fast_schema_evolution` to `true` to speed up the schema change. -* 对于数据同步,pipeline 连接器使用 [StarRocks Sink 连接器](https://github.com/StarRocks/starrocks-connector-for-apache-flink) - 将数据写入 StarRocks,具体可以参考 [Sink 文档](https://github.com/StarRocks/starrocks-connector-for-apache-flink/blob/main/docs/content/connector-sink.md)。 +* For data synchronization, the pipeline connector uses [StarRocks Sink Connector](https://github.com/StarRocks/starrocks-connector-for-apache-flink) + to write data to StarRocks. You can see [sink documentation](https://github.com/StarRocks/starrocks-connector-for-apache-flink/blob/main/docs/content/connector-sink.md) + for how it works. -数据类型映射 +Data Type Mapping ----------------
    @@ -299,26 +314,25 @@ Pipeline 连接器配置项 - + - + - +
    CHAR(n) where n <= 85 CHAR(n * 3)CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks - 中为 n * 3。由于 StarRocks CHAR 类型的最大长度为255,所以只有当 CDC 中长度不超过85时,才将 CDC CHAR 映射到 StarRocks CHAR。CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese + character is equal to three bytes, so the length for StarRocks is n * 3. Because the max length of StarRocks + CHAR is 255, map CDC CHAR to StarRocks CHAR only when the CDC length is no larger than 85.
    CHAR(n) where n > 85 VARCHAR(n * 3)CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks - 中为 n * 3。由于 StarRocks CHAR 类型的最大长度为255,所以当 CDC 中长度超过85时,才将 CDC CHAR 映射到 StarRocks VARCHAR。CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese + character is equal to three bytes, so the length for StarRocks is n * 3. Because the max length of StarRocks + CHAR is 255, map CDC CHAR to StarRocks VARCHAR if the CDC length is larger than 85.
    VARCHAR(n) VARCHAR(n * 3)CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks - 中为 n * 3。CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese + character is equal to three bytes, so the length for StarRocks is n * 3.
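As an illustrative sketch (the column definitions below are hypothetical, not taken from this table), the length tripling works as follows:

```sql
-- MySQL source column: length declared as 100 characters
name VARCHAR(100)
-- Corresponding StarRocks column created by the connector: 100 characters * 3 bytes = 300 bytes
name VARCHAR(300)
```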
    -FAQ --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) \ No newline at end of file +{{< top >}} diff --git a/docs/content/docs/development/_index.md b/docs/content/docs/development/_index.md new file mode 100644 index 0000000000..3229296226 --- /dev/null +++ b/docs/content/docs/development/_index.md @@ -0,0 +1,26 @@ +--- +title: Development +icon: +bold: true +sectionBreak: true +bookCollapseSection: true +weight: 2 +--- + diff --git a/docs/content/docs/development/concept-pipeline.md b/docs/content/docs/development/concept-pipeline.md new file mode 100644 index 0000000000..90682f9aa3 --- /dev/null +++ b/docs/content/docs/development/concept-pipeline.md @@ -0,0 +1,130 @@ +--- +title: "CDC Streaming ELT Framework Concepts" +weight: 1 +type: docs +aliases: + - /development/concept-pipeline.html +--- + + + +# CDC Streaming ELT Framework + +## What is CDC Streaming ELT Framework +CDC Streaming ELT Framework is a stream data integration framework that aims to provide users with a more robust API. It allows users to configure their data synchronization logic through customized Flink operators and job submission tools. The framework prioritizes optimizing the task submission process and offers enhanced functionalities such as whole database synchronization, sharding, and schema change synchronization. + +## What can CDC Streaming ELT Framework do? +{{< img src="/fig/architecture.png" alt="CDC Architecture" >}} +* ✅ End-to-end data integration framework +* ✅ API for data integration users to build jobs easily +* ✅ Multi-table support in Source / Sink +* ✅ Synchronization of entire databases +* ✅ Schema evolution capability + +## Core Concepts +{{< img src="/fig/design.png" alt="CDC Design" >}} + +The data types flowing in the Flink CDC 3.0 framework are referred to as **Event**, which represent the change events generated by external systems. +Each event is marked with a **Table ID** for which the change occurred. Events are categorized into `SchemaChangeEvent` and `DataChangeEvent`, representing changes in table structure and data respectively. + +**Data Source** Connector captures the changes in external systems and converts them into events as the output of the synchronization task. It also provides a `MetadataAccessor` for the framework to read the metadata of the external systems. + +**Data Sink** connector receives the change events from **Data Source** and applies them to the external systems. Additionally, `MetadataApplier` is used to apply metadata changes from the source system to the target system. + +Since events flow from the upstream to the downstream in a pipeline manner, the data synchronization task is referred as a **Data Pipeline**. A **Data Pipeline** consists of a **Data Source**, **Route**, **Transform** and **Data Sink**. The transform can add extra content to events, and the router can remap the `Table ID`s corresponding to events. + +Now let's introduce more details about the concepts you need to know when using the CDC Streaming ELT Framework. + +### Table ID +When connecting to external systems, it is necessary to establish a mapping relationship with the storage objects of the external system. This is what `Table Id` refers to. + +To be compatible with most external systems, the `Table ID` is represented by a 3-tuple : (namespace, schemaName, table). Connectors need to establish the mapping between Table ID and storage objects in external systems. 
+For instance, a table in MySQL/Doris is mapped to (null, database, table) and a topic in a message queue system such as Kafka is mapped to (null, null, topic). + +### Data Source +Data Source is used to access metadata and read the changed data from external systems. +A Data Source can read data from multiple tables simultaneously. + +To describe a data source, the follows are required: +* Type: The type of the source, such as MySQL, Postgres. +* Name: The name of the source, which is user-defined (optional, with a default value provided). +* Other custom configurations for the source. + +For example, we could use `yaml` files to define a mysql source +```yaml +source: + type: mysql + name: mysql-source #optional,description information + host: localhost + port: 3306 + username: admin + password: pass + tables: adb.*, bdb.user_table_[0-9]+, [app|web]_order_\.* +``` + +### Data Sink +The Data Sink is used to apply schema changes and write change data to external systems. A Data Sink can write to multiple tables simultaneously. + +To describe a data sink, the follows are required: +* Type: The type of the sink, such as MySQL or PostgreSQL. +* Name: The name of the sink, which is user-defined (optional, with a default value provided). +* Other custom configurations for the sink. + +For example, we can use this `yaml` file to define a kafka sink: +```yaml +sink: + type: kafka + name: mysink-queue # Optional parameter for description purpose + bootstrap-servers: localhost:9092 + auto-create-table: true # Optional parameter for advanced functionalities +``` + +### Route +Route specifies the target table ID of each event. +The most typical scenario is the merge of sub-databases and sub-tables, routing multiple upstream source tables to the same sink table + +To describe a route, the follows are required: +* source-table: Source table id, supports regular expressions +* sink-table: Sink table id, supports regular expressions +* description: Routing rule description(optional, default value provided) + +For example, if synchronize the table 'web_order' in the database 'mydb' to a Kafka topic 'ods_web_order', we can use this yaml file to define this route: +```yaml +route: + source-table: mydb.default.web_order + sink-table: ods_web_order + description: sync table to one destination table with given prefix ods_ +``` + +### Data Pipeline +Since events flow from the upstream to the downstream in a pipeline manner, the data synchronization task is also referred as a Data Pipeline. + +To describe a Data Pipeline, the follows are required: +* Name: The name of the pipeline, which will be submitted to the Flink cluster as the job name. +* Other advanced capabilities such as automatic table creation, schema evolution, etc., will be implemented. 
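Putting the pieces together, the following is a rough sketch (reusing the illustrative values from the snippets above, not a verbatim example from the framework) of how the source, route, sink, and pipeline blocks could appear in one definition file:

```yaml
source:
  type: mysql
  name: mysql-source
  host: localhost
  port: 3306
  username: admin
  password: pass
  tables: mydb.web_order_[0-9]+

route:
  source-table: mydb.web_order_[0-9]+
  sink-table: ods_web_order
  description: route sharded tables to one destination table

sink:
  type: kafka
  name: mysink-queue
  bootstrap-servers: localhost:9092

pipeline:
  name: mysql-to-kafka-pipeline
  parallelism: 1
```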
+ +For example, we can use this yaml file to define a pipeline: +```yaml +pipeline: + name: mysql-to-kafka-pipeline + parallelism: 1 +``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/datastream-api-package-guidance-zh.md" b/docs/content/docs/development/datastream-api-package-guidance.md similarity index 92% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/datastream-api-package-guidance-zh.md" rename to docs/content/docs/development/datastream-api-package-guidance.md index 400474b3dc..ea2bfe130e 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/datastream-api-package-guidance-zh.md" +++ b/docs/content/docs/development/datastream-api-package-guidance.md @@ -1,3 +1,11 @@ +--- +title: "DataStream Api Package Guidance" +weight: 999 +type: docs +aliases: +- /development/datastream-api-package-guidance.html +--- + -# DataStream api 打包指南 +# DataStream Api Package Guidance -本指南提供了 mysql cdc DataStream api 的简单 pom 示例 +This guide provides a simple pom example of mysql cdc DataStream api -## 框架版本 +## frame version flink 1.17.2 flink mysql cdc 2.4.2 -## pom 示例 +## pom example ```xml @@ -34,7 +42,7 @@ flink 1.17.2 flink mysql cdc 2.4.2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - com.ververica + org.apache.flink FlinkCDCTest 1.0-SNAPSHOT @@ -113,7 +121,7 @@ flink 1.17.2 flink mysql cdc 2.4.2 30.1.1-jre-16.1 - com.ververica + org.apache.flink flink-connector-mysql-cdc 2.4.2 @@ -164,8 +172,8 @@ flink 1.17.2 flink mysql cdc 2.4.2 io.debezium:debezium-core io.debezium:debezium-ddl-parser io.debezium:debezium-connector-mysql - com.ververica:flink-connector-debezium - com.ververica:flink-connector-mysql-cdc + org.apache.flink:flink-connector-debezium + org.apache.flink:flink-connector-mysql-cdc org.antlr:antlr4-runtime org.apache.kafka:* mysql:mysql-connector-java @@ -225,10 +233,10 @@ flink 1.17.2 flink mysql cdc 2.4.2 ``` -## 代码示例 +## code example ```java -package com.ververica.flink.cdc; +package org.apache.flink.flink.cdc; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -241,26 +249,28 @@ public class CdcTest { MySqlSource mySqlSource = MySqlSource.builder() .hostname("yourHostname") .port(yourPort) - .databaseList("yourDatabaseName") // 设置捕获的数据库, 如果需要同步整个数据库,请将 tableList 设置为 ".*". - .tableList("yourDatabaseName.yourTableName") // 设置捕获的表 + .databaseList("yourDatabaseName") // set captured database, If you need to synchronize the whole database, Please set tableList to ".*". 
+ .tableList("yourDatabaseName.yourTableName") // set captured table .username("yourUsername") .password("yourPassword") - .deserializer(new JsonDebeziumDeserializationSchema()) // 将 SourceRecord 转换为 JSON 字符串 + .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String .build(); final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // 设置 3s 的 checkpoint 间隔 + // enable checkpoint env.enableCheckpointing(3000); env .fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source") - // 设置 source 节点的并行度为 1 + // set 1 parallel source tasks .setParallelism(1) - .print().setParallelism(1); // 设置 sink 节点并行度为 1 + .print().setParallelism(1); // use parallelism 1 for sink env.execute("Print MySQL Snapshot + Binlog"); } } -``` \ No newline at end of file +``` + +{{< top >}} diff --git a/docs/content/docs/faq/_index.md b/docs/content/docs/faq/_index.md new file mode 100644 index 0000000000..1a6cd62e5d --- /dev/null +++ b/docs/content/docs/faq/_index.md @@ -0,0 +1,25 @@ +--- +title: "FAQ" +icon: +bold: true +bookCollapseSection: true +weight: 4 +--- + diff --git a/docs/content/docs/faq/faq.md b/docs/content/docs/faq/faq.md new file mode 100644 index 0000000000..3077694015 --- /dev/null +++ b/docs/content/docs/faq/faq.md @@ -0,0 +1,330 @@ +--- +title: "FAQ" +weight: 1 +type: docs +aliases: +- /faq/faq.html +--- + +## General FAQ + +### Q1: Why can't I download Flink-sql-connector-mysql-cdc-2.2-snapshot jar, why doesn't Maven warehouse rely on XXX snapshot? + +Like the mainstream Maven project version management, XXX snapshot version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as flink-sql-connector-mysql-cdc-2.1 0.jar, the released version will be available in the Maven central warehouse. + +### Q2: When should I use Flink SQL connector XXX Jar? When should I Flink connector XXX jar? What's the difference between the two? + +The dependency management of each connector in Flink CDC project is consistent with that in Flink project. Flink SQL connector XX is a fat jar. In addition to the code of connector, it also enters all the third-party packages that connector depends on into the shade and provides them to SQL jobs. Users only need to add the fat jar in the flink/lib directory. The Flink connector XX has only the code of the connector and does not contain the required dependencies. It is used by DataStream jobs. Users need to manage the required three-party package dependencies. Conflicting dependencies need to be excluded and shaded by themselves. + +### Q3: Why change the package name from com.alibaba.ververica changed to org.apache.flink? Why can't the 2. X version be found in Maven warehouse? + +Flink CDC project changes the group ID from com.alibaba.ververica changed to org.apache.flink since 2.0.0 version, this is to make the project more community neutral and more convenient for developers of various companies to build. So look for 2.x in Maven warehouse package, the path is /org/apache/flink. + +## MySQL CDC FAQ + +### Q1: I use CDC 2.x version , only full data can be read, but binlog data cannot be read. What's the matter? + +CDC 2.0 supports lock free algorithm and concurrent reading. In order to ensure the order of full data + incremental data, it relies on Flink's checkpoint mechanism, so the job needs to be configured with checkpoint. 
+ +Configuration method in SQL job: + +```sql +Flink SQL> SET 'execution.checkpointing.interval' = '3s'; +``` + +DataStream job configuration mode: + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.enableCheckpointing(3000); +``` + +### Q2: Using MySQL CDC DataStream API, the timestamp field read in the incremental phase has a time zone difference of 8 hours. What's the matter? + +When parsing the timestamp field in binlog data, CDC will use the server time zone information configured in the job, that is, the time zone of the MySQL server. If this time zone is not consistent with the time zone of your MySQL server, this problem will occur. + +In addition, if the serializer is customized in the DataStream job. + +such as MyDeserializer implements DebeziumDeserializationSchema, when the customized serializer parses the timestamp type data, it needs to refer to the analysis of the timestamp type in RowDataDebeziumDeserializeSchema and use the given time zone information. + +``` +private TimestampData convertToTimestamp(Object dbzObj, Schema schema) { + if (dbzObj instanceof Long) { + switch (schema.name()) { + case Timestamp.SCHEMA_NAME: + return TimestampData.fromEpochMillis((Long) dbzObj); + case MicroTimestamp.SCHEMA_NAME: + long micro = (long) dbzObj; + return TimestampData.fromEpochMillis(micro / 1000, (int) (micro % 1000 * 1000)); + case NanoTimestamp.SCHEMA_NAME: + long nano = (long) dbzObj; + return TimestampData.fromEpochMillis(nano / 1000_000, (int) (nano % 1000_000)); + } + } + LocalDateTime localDateTime = TemporalConversions.toLocalDateTime(dbzObj, serverTimeZone); + return TimestampData.fromLocalDateTime(localDateTime); + } +``` + +### Q3: Does MySQL CDC support listening to slave database? How to configure slave database? + +Yes, the slave database needs to be configured with log slave updates = 1, so that the slave instance can also write the data synchronized from the master instance to the binlog file of the slave database. If the master database has enabled gtid mode, the slave database also needs to be enabled. + +``` +log-slave-updates = 1 +gtid_mode = on +enforce_gtid_consistency = on +``` + +### Q4: I want to synchronize sub databases and sub tables. How should I configure them? + +In the with parameter of MySQL CDC table, both table name and database name support regular configuration, such as 'table name ' = 'user_ '.' Can match table name 'user_ 1, user_ 2,user_ A ' table. + +Note that any regular matching character is'. ' Instead of '*', where the dot represents any character, the asterisk represents 0 or more, and so does database name, that the shared table should be in the same schema. + +### Q5: I want to skip the stock reading phase and only read binlog data. How to configure it? + +In the with parameter of MySQL CDC table + +``` +'scan.startup.mode' = 'latest-offset'. +``` + +### Q6: I want to get DDL events in the database. What should I do? Is there a demo? + +Flink CDC provides DataStream API `MysqlSource` since version 2.1. Users can configure includeschemachanges to indicate whether DDL events are required. After obtaining DDL events, they can write code for next processing. 
+ +```java + public void consumingAllEvents() throws Exception { + inventoryDatabase.createAndInitialize(); + MySqlSource mySqlSource = + MySqlSource.builder() + .hostname(MYSQL_CONTAINER.getHost()) + .port(MYSQL_CONTAINER.getDatabasePort()) + .databaseList(inventoryDatabase.getDatabaseName()) + .tableList(inventoryDatabase.getDatabaseName() + ".products") + .username(inventoryDatabase.getUsername()) + .password(inventoryDatabase.getPassword()) + .serverId("5401-5404") + .deserializer(new JsonDebeziumDeserializationSchema()) + .includeSchemaChanges(true) // Configure here and output DDL events + .build(); + ... // Other processing logic + } +``` + +### Q7: How to synchronize the whole MySQL database? Does Flink CDC support it? + +The DataStream API provided in Q6 has enabled users to obtain DDL change events and data change events. On this basis, users need to develop DataStream jobs according to their own business logic and downstream storage. + +### Q8: In the same MySQL instance, the table of one database cannot synchronize incremental data, but other databases works fine. Why? + +Users can check Binlog_Ignore_DB and Binlog_Do_DB through the `show master status` command + +```mysql +mysql> show master status; ++------------------+----------+--------------+------------------+----------------------+ +| File | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set | ++------------------+----------+--------------+------------------+----------------------+ +| mysql-bin.000006 | 4594 | | | xxx:1-15 | ++------------------+----------+--------------+------------------+----------------------+ +``` + +### Q9: The job reports an error the connector is trying to read binlog starting at GTIDs xxx and binlog file 'binlog.000064', pos=89887992, skipping 4 events plus 1 rows, but this is no longer available on the server. Reconfigure the connector to use a snapshot when needed, What should I do? + +This error occurs because the binlog file being read by the job has been cleaned up on the MySQL server. Generally, the expiration time of the binlog file retained on the MySQL server is too short. You can set this value higher, such as 7 days. + +```mysql +mysql> show variables like 'expire_logs_days'; +mysql> set global expire_logs_days=7; +``` + +In another case, the binlog consumption of the Flink CDC job is too slow. Generally, sufficient resources can be allocated. + +### Q10: The job reports an error ConnectException: A slave with the same server_uuid/server_id as this slave has connected to the master,What should I do? + +This error occurs because the server ID used in the job conflicts with the server ID used by other jobs or other synchronization tools. The server ID needs to be globally unique. The server ID is an int type integer. In CDC 2.x In version, each concurrency of the source requires a server ID. it is recommended to reasonably plan the server ID. for example, if the source of the job is set to four concurrency, you can configure 'server ID' = '5001-5004', so that each source task will not conflict. + +### Q11: The job reports an error ConnectException: Received DML ‘…’ for processing, binlog probably contains events generated with statement or mixed based replication format,What should I do? + +This error occurs because the MySQL server is not configured correctly. You need to check the binlog is format row? 
You can view it through the following command + +```mysql +mysql> show variables like '%binlog_format%'; +``` + +### Q12: The job reports an error Mysql8.0 Public Key Retrieval is not allowed,What should I do? + +This is because the MySQL user configured by the user uses sha256 password authentication and requires TLS and other protocols to transmit passwords. A simple method is to allow MySQL users to support original password access. + +```mysql +mysql> ALTER USER 'username'@'localhost' IDENTIFIED WITH mysql_native_password BY 'password'; +mysql> FLUSH PRIVILEGES; +``` + +### Q13: The job reports an error EventDataDeserializationException: Failed to deserialize data of EventHeaderV4 .... Caused by: java.net.SocketException: Connection reset,What should I do? + +This problem is generally caused by the network. First, check the network between the Flink cluster and the database, and then increase the network parameters of the MySQL server. + +```mysql +mysql> set global slave_net_timeout = 120; +mysql> set global thread_pool_idle_timeout = 120; +``` + +Or try to use the flink configuration as follows. + +``` +execution.checkpointing.interval=10min +execution.checkpointing.tolerable-failed-checkpoints=100 +restart-strategy=fixed-delay +restart-strategy.fixed-delay.attempts=2147483647 +restart-strategy.fixed-delay.delay= 30s +``` + +If there is bad back pressure in the job, this problem may happen too. Then you need to handle the back pressure in the job first. + +### Q14: The job reports an error The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires,What should I do? + +The reason for this problem is that the reading of the full volume phase of the job is too slow. After reading the full volume phase, the previously recorded gtid site at the beginning of the full volume phase has been cleared by mysql. This can increase the save time of binlog files on the MySQL server, or increase the concurrency of source to make the full volume phase read faster. + +### Q15: How to config `tableList` option when build MySQL CDC source in DataStream API? + +The `tableList` option requires table name with database name rather than table name in DataStream API. For MySQL CDC source, the `tableList` option value should like ‘my_db.my_table’. + +## Postgres CDC FAQ + +### Q1: It is found that the disk utilization rate of PG server is high. What is the reason why wal is not released? + +Flink Postgres CDC will only update the LSN in the Postgres slot when the checkpoint is completed. Therefore, if you find that the disk utilization is high, please first confirm whether the checkpoint is turned on. + +### Q2: Flink Postgres CDC returns null for decimal types exceeding the maximum precision (38, 18) in synchronous Postgres + +In Flink, if the precision of the received data is greater than the precision of the type declared in Flink, the data will be processed as null. You can configure the corresponding 'debezium decimal. handling. Mode '='string' process the read data with string type + +### Q3: Flink Postgres CDC prompts that toast data is not transmitted. What is the reason? + +Please ensure that the replica identity is full first. The toast data is relatively large. In order to save the size of wal, if the toast data is not changed, the wal2json plugin will not bring toast data to the updated data. To avoid this problem, you can use 'debezium schema. refresh. 
mode'='columns_ diff_ exclude_ unchanged_ Toast 'to solve. + +### Q4: The job reports an error replication slot "XXXX" is active. What should I do? + +Currently, Flink Postgres CDC does not release the slot manually after the job exits. There are two ways to solve this problem + +- Go to Postgres and manually execute the following command + +``` +select pg_drop_replication_slot('rep_slot'); + ERROR: replication slot "rep_slot" is active for PID 162564 +select pg_terminate_backend(162564); select pg_drop_replication_slot('rep_slot'); +``` + +- Add 'debezium.slot.drop.on.stop'='true' to PG source with parameter to automatically clean up the slot after the job stops + +### Q5: Jobs have dirty data, such as illegal dates. Are there parameters that can be configured and filtered? + +Yes, you can add configure. In the with parameter of the Flink CDC table 'debezium.event.deserialization.failure.handling.mode'='warn' parameter, skip dirty data and print dirty data to warn log. You can also configure 'debezium.event.deserialization.failure.handling.mode'='ignore', skip dirty data directly and do not print dirty data to the log. + +### Q6: How to config `tableList` option when build Postgres CDC source in DataStream API? + +The `tableList` option requires table name with schema name rather than table name in DataStream API. For Postgres CDC source, the `tableList` option value should like ‘my_schema.my_table’. + +## MongoDB CDC FAQ + +### Q1: Does mongodb CDC support full + incremental read and read-only incremental? + +Yes, the default is full + incremental reading; Use copy The existing = false parameter is set to read-only increment. + +### Q2: Does mongodb CDC support recovery from checkpoint? What is the principle? + +Yes, the checkpoint will record the resumetoken of the changestream. During recovery, the changestream can be restored through the resumetoken. Where resumetoken corresponds to oplog RS (mongodb change log collection), oplog RS is a fixed capacity collection. When the corresponding record of resumetoken is in oplog When RS does not exist, an exception of invalid resumetoken may occur. In this case, you can set the appropriate oplog Set size of RS to avoid oplog RS retention time is too short, you can refer to https://docs.mongodb.com/manual/tutorial/change-oplog-size/ In addition, the resumetoken can be refreshed through the newly arrived change record and heartbeat record. + +### Q3: Does mongodb CDC support outputting - U (update_before) messages? + +Mongodb original oplog RS has only insert, update, replace and delete operation types. It does not retain the information before update. It cannot output - U messages. It can only realize the update semantics in Flink. When using mongodbtablesource, Flink planner will automatically perform changelognormalize optimization, fill in the missing - U messages, and output complete + I, - u, + U, and - D messages. The cost of changelognormalize optimization is that the node will save the status of all previous keys. Therefore, if the DataStream job directly uses mongodbsource, without the optimization of Flink planner, changelognormalize will not be performed automatically, so - U messages cannot be obtained directly. To obtain the pre update image value, you need to manage the status yourself. If you don't want to manage the status yourself, you can convert mongodbtablesource to changelogstream or retractstream and supplement the pre update image value with the optimization ability of Flink planner. 
An example is as follows: + +``` + tEnv.executeSql("CREATE TABLE orders ( ... ) WITH ( 'connector'='mongodb-cdc',... )"); + + Table table = tEnv.from("orders") + .select($("*")); + + tEnv.toChangelogStream(table) + .print() + .setParallelism(1); + + env.execute(); +``` + + + +### Q4: Does mongodb CDC support subscribing to multiple collections? + +Only the collection of the whole database can be subscribed, but some collection filtering functions are not supported. For example, if the database is configured as' mgdb 'and the collection is an empty string, all collections under the' mgdb 'database will be subscribed. + +### Q5: Does mongodb CDC support setting multiple concurrent reads? + +Not yet supported. + +### Q6: What versions of mongodb are supported by mongodb CDC? + +Mongodb CDC is implemented based on the changestream feature, which is a new feature launched by mongodb 3.6. Mongodb CDC theoretically supports versions above 3.6. It is recommended to run version > = 4.0. When executing versions lower than 3.6, an error will occur: unrecognized pipeline stage name: '$changestream'. + +### Q7: What is the operation mode of mongodb supported by mongodb CDC? + +Changestream requires mongodb to run in replica set or fragment mode. Local tests can use stand-alone replica set rs.initiate(). + +Errors occur in standalone mode : The $changestage is only supported on replica sets. + +### Q8: Mongodb CDC reports an error. The user name and password are incorrect, but other components can connect normally with this user name and password. What is the reason? + +If the user is creating a DB that needs to be connected, add 'connection' to the with parameter Options' ='authsource = DB where the user is located '. + +### Q9: Does mongodb CDC support debezium related parameters? + +The mongodb CDC connector is not supported because it is independently developed in the Flink CDC project and does not rely on the debezium project. + +### Q10: In the mongodb CDC full reading phase, can I continue reading from the checkpoint after the job fails? + +In the full reading phase, mongodb CDC does not do checkpoint until the full reading phase is completed. If it fails in the full reading phase, mongodb CDC will read the stock data again. + +## Oracle CDC FAQ + +### Q1: Oracle CDC's archive logs grow rapidly and read logs slowly? + +The online mining mode can be used without writing the data dictionary to the redo log, but it cannot process DDL statements. The default policy of the production environment reads the log slowly, and the default policy will write the data dictionary information to the redo log, resulting in a large increase in the log volume. You can add the following debezium configuration items. " log. mining. strategy' = 'online_ catalog','log. mining. continuous. mine' = 'true'。 If you use SQL, you need to prefix the configuration item with 'debezium.', Namely: + +``` +'debezium.log.mining.strategy' = 'online_catalog', +'debezium.log.mining.continuous.mine' = 'true' +``` + + +### Q2: Operation error caused by: io debezium. DebeziumException: Supplemental logging not configured for table xxx. Use command: alter table XXX add supplementary log data (all) columns? + +For Oracle version 11, debezium will set tableidcasesensitive to true by default, resulting in the table name being updated to lowercase. Therefore, the table completion log setting cannot be queried in Oracle, resulting in the false alarm of "supplementary logging not configured for table error". 
+ +If it is the DataStream API, add the configuration item of debezium 'database.tablename.case.insensitive' = 'false'. If the SQL API is used, add the configuration item 'debezium.database.tablename.case.insensitive' = 'false' in the option of the table + +### Q3: How does Oracle CDC switch to XStream? + +Add configuration item 'database.connection.adpter' = 'xstream', please use the configuration item 'debezium.database.connection.adpter' = 'xstream' if you're using SQL API. + +### Q4: What are the database name and schema name of Oracle CDC + +Database name is the name of the database example, that is, the SID of Oracle. Schema name is the schema corresponding to the table. Generally speaking, a user corresponds to a schema. The schema name of the user is equal to the user name and is used as the default schema of the user. Therefore, schema name is generally the user name for creating the table, but if a schema is specified when creating the table, the specified schema is schema name. For example, use create table AAAA If TestTable (XXXX) is successfully created, AAAA is schema name. diff --git a/docs/content/docs/try-flink-cdc/_index.md b/docs/content/docs/try-flink-cdc/_index.md new file mode 100644 index 0000000000..b752c6f1b8 --- /dev/null +++ b/docs/content/docs/try-flink-cdc/_index.md @@ -0,0 +1,25 @@ +--- +title: "Try Flink CDC" +icon: +bold: true +bookCollapseSection: true +weight: 1 +--- + diff --git a/docs/content/docs/try-flink-cdc/cdc-connectors/_index.md b/docs/content/docs/try-flink-cdc/cdc-connectors/_index.md new file mode 100644 index 0000000000..0c566a8a8a --- /dev/null +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: CDC Connectors +bookCollapseSection: true +weight: 2 +aliases: + - /try-flink-cdc/cdc-connectors/ +--- + diff --git a/docs/content/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md b/docs/content/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md new file mode 100644 index 0000000000..f468f9e6df --- /dev/null +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/build-real-time-data-lake-tutorial.md @@ -0,0 +1,331 @@ +--- +title: "Building a Real-time Data Lake with Flink CDC" +weight: 999 +type: docs +aliases: +- /development/build-real-time-data-lake-tutorial.html + +--- + + + +# Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake + +For OLTP databases, to deal with a huge number of data in a single table, we usually do database and table sharding to get better throughput. +But sometimes, for convenient analysis, we need to merge them into one table when loading them to data warehouse or data lake. + +This tutorial will show how to use Flink CDC to build a real-time data lake for such a scenario. +You can walk through the tutorial easily in the docker environment. The entire process uses standard SQL syntax without a single line of Java/Scala code or IDE installation. + +The following sections will take the pipeline from MySQL to [Iceberg](https://iceberg.apache.org/) as an example. The overview of the architecture is as follows: + +{{< img src="/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png" alt="Real-time data lake with Flink CDC" >}} + +You can also use other data sources like Oracle/Postgres and sinks like Hudi to build your own pipeline. + +## Preparation +Prepare a Linux or MacOS computer with Docker installed. 
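Before moving on, it can be worth a quick sanity check that both tools are available on the machine (this step is an addition for convenience, not part of the original tutorial):

```shell
docker --version
docker-compose --version
```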
+ +## Preparing JAR package required +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.** +- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar +- [flink-shaded-hadoop-2-uber-2.7.5-10.0.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.7.5-10.0/flink-shaded-hadoop-2-uber-2.7.5-10.0.jar) +- [iceberg-flink-runtime-1.16-1.3.1.jar](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/1.3.1/iceberg-flink-runtime-1.16-1.3.1.jar) + + + +### Starting components required +The components required in this tutorial are all managed in containers, so we will use `docker-compose` to start them. + +1. Create `Dockerfile` file using following contents: + ```dockerfile + FROM flink:1.16.0-scala_2.12 + # Place the downloaded jar packages in the lib directory at the same level. + COPY ./lib /opt/flink/lib + RUN apt-get update && apt-get install tree + ``` + +2. Create `docker-compose.yml` file using following contents: + ```yml + version: '2.1' + services: + sql-client: + user: flink:flink + build: . + command: bin/sql-client.sh + depends_on: + - jobmanager + - mysql + environment: + - MYSQL_HOST=mysql + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + rest.address: jobmanager + volumes: + - shared-tmpfs:/tmp/iceberg + jobmanager: + user: flink:flink + build: . + ports: + - "8081:8081" + command: jobmanager + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + volumes: + - shared-tmpfs:/tmp/iceberg + taskmanager: + user: flink:flink + build: . + depends_on: + - jobmanager + command: taskmanager + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + taskmanager.numberOfTaskSlots: 2 + volumes: + - shared-tmpfs:/tmp/iceberg + mysql: + image: debezium/example-mysql:1.1 + ports: + - "3306:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_USER=mysqluser + - MYSQL_PASSWORD=mysqlpw + + volumes: + shared-tmpfs: + driver: local + driver_opts: + type: "tmpfs" + device: "tmpfs" + ``` + + The Docker Compose environment consists of the following containers: + - SQL-Client: Flink SQL Client, used to submit queries and visualize their results. + - Flink Cluster: a Flink JobManager and a Flink TaskManager container to execute queries. + - MySQL: mainly used as a data source to store the sharding table. + +3. To start all containers, run the following command in the directory that contains the `docker-compose.yml` file: + ```shell + docker-compose up -d + ``` + This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run `docker ps` to check whether these containers are running properly. + We can also visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally. + + +***Note:*** +* If you want to run with your own Flink environment, remember to download the jar packages and then put them to `FLINK_HOME/lib/`. +* All the following commands involving `docker-compose` should be executed in the directory of the `docker-compose.yml` file. + +{{< img src="/fig/real-time-data-lake-tutorial/flink-ui.png" alt="Flink UI" >}} + +### Preparing data in databases +1. Enter mysql's container: + ```shell + docker-compose exec mysql mysql -uroot -p123456 + ``` +2. Create databases/tables and populate data: + + Create a logical sharding table `user` sharded in different databases and tables physically. 
+ ```sql + CREATE DATABASE db_1; + USE db_1; + CREATE TABLE user_1 ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(255) NOT NULL DEFAULT 'flink', + address VARCHAR(1024), + phone_number VARCHAR(512), + email VARCHAR(255) + ); + INSERT INTO user_1 VALUES (110,"user_110","Shanghai","123567891234","user_110@foo.com"); + + CREATE TABLE user_2 ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(255) NOT NULL DEFAULT 'flink', + address VARCHAR(1024), + phone_number VARCHAR(512), + email VARCHAR(255) + ); + INSERT INTO user_2 VALUES (120,"user_120","Shanghai","123567891234","user_120@foo.com"); + ``` + ```sql + CREATE DATABASE db_2; + USE db_2; + CREATE TABLE user_1 ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(255) NOT NULL DEFAULT 'flink', + address VARCHAR(1024), + phone_number VARCHAR(512), + email VARCHAR(255) + ); + INSERT INTO user_1 VALUES (110,"user_110","Shanghai","123567891234", NULL); + + CREATE TABLE user_2 ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(255) NOT NULL DEFAULT 'flink', + address VARCHAR(1024), + phone_number VARCHAR(512), + email VARCHAR(255) + ); + INSERT INTO user_2 VALUES (220,"user_220","Shanghai","123567891234","user_220@foo.com"); + ``` + +## Creating tables using Flink DDL in Flink SQL CLI +First, use the following command to enter the Flink SQL CLI Container: +```shell +docker-compose run sql-client +``` + +We should see the welcome screen of the CLI client: + +{{< img src="/fig/real-time-data-lake-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}} + +Then do the following steps in Flink SQL CLI: + +1. Enable checkpoints every 3 seconds + + Checkpoint is disabled by default, we need to enable it to commit Iceberg transactions. + Besides, the beginning of mysql-cdc binlog phase also requires waiting a complete checkpoint to avoid disorder of binlog records. + ```sql + -- Flink SQL + Flink SQL> SET execution.checkpointing.interval = 3s; + ``` +2. Create MySQL sharding source table + + Create a source table that captures the data from the logical sharding table `user`. Here, we use regex to match all the physical tables. + Besides, the table defines metadata column to identify which database/table the record comes from. + ```sql + -- Flink SQL + Flink SQL> CREATE TABLE user_source ( + database_name STRING METADATA VIRTUAL, + table_name STRING METADATA VIRTUAL, + `id` DECIMAL(20, 0) NOT NULL, + name STRING, + address STRING, + phone_number STRING, + email STRING, + PRIMARY KEY (`id`) NOT ENFORCED + ) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'mysql', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'db_[0-9]+', + 'table-name' = 'user_[0-9]+' + ); + ``` +3. Create Iceberg sink table + + Create a sink table `all_users_sink` used to load data to Iceberg. + We define `database_name`, `table_name` and `id` as a combined primary key, because `id` maybe not unique across different databases and tables. + ```sql + -- Flink SQL + Flink SQL> CREATE TABLE all_users_sink ( + database_name STRING, + table_name STRING, + `id` DECIMAL(20, 0) NOT NULL, + name STRING, + address STRING, + phone_number STRING, + email STRING, + PRIMARY KEY (database_name, table_name, `id`) NOT ENFORCED + ) WITH ( + 'connector'='iceberg', + 'catalog-name'='iceberg_catalog', + 'catalog-type'='hadoop', + 'warehouse'='file:///tmp/iceberg/warehouse', + 'format-version'='2' + ); + ``` + +## Streaming to Iceberg +1. 
Streaming write data from MySQL to Iceberg using the following Flink SQL: + ```sql + -- Flink SQL + Flink SQL> INSERT INTO all_users_sink select * from user_source; + ``` + It will start a streaming job which will synchronize historical and incremental data from MySQL to Iceberg continuously. + The running job can be found in [Flink UI](http://localhost:8081/#/job/running), and it looks like: + + + {{< img src="/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png" alt="CDC to Iceberg Running Job" >}} + + Then, we can use the following command to see the files written to Iceberg: + ```shell + docker-compose exec sql-client tree /tmp/iceberg/warehouse/default_database/ + ``` + It should look like: + + {{< img src="/fig/real-time-data-lake-tutorial/files-in-iceberg.png" alt="Files in Iceberg" >}} + + The actual files may differ in your environment, but the structure of the directory should be similar. + +2. Use the following Flink SQL to query the data written to `all_users_sink`: + ```sql + -- Flink SQL + Flink SQL> SELECT * FROM all_users_sink; + ``` + We can see the data queried in the Flink SQL CLI: + + {{< img src="/fig/real-time-data-lake-tutorial/data_in_iceberg.png" alt="Data in Iceberg" >}} + +3. Make some changes in the MySQL databases, and then the data in Iceberg table `all_users_sink` will also change in real time. + + (3.1) Insert a new user in table `db_1.user_1` + ```sql + --- db_1 + INSERT INTO db_1.user_1 VALUES (111,"user_111","Shanghai","123567891234","user_111@foo.com"); + ``` + + (3.2) Update a user in table `db_1.user_2` + ```sql + --- db_1 + UPDATE db_1.user_2 SET address='Beijing' WHERE id=120; + ``` + + (3.3) Delete a user in table `db_2.user_2` + ```sql + --- db_2 + DELETE FROM db_2.user_2 WHERE id=220; + ``` + + After executing each step, we can query the table `all_users_sink` using `SELECT * FROM all_users_sink` in Flink SQL CLI to see the changes. + + The final query result is as follows: + + {{< img src="/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png" alt="Final Data in Iceberg" >}} + + From the latest result in Iceberg, we can see that there is a new record of `(db_1, user_1, 111)`, and the address of `(db_1, user_2, 120)` has been updated to `Beijing`. + Besides, the record of `(db_2, user_2, 220)` has been deleted. The result is exactly the same with the changes we did in MySQL. + +## Clean up +After finishing the tutorial, run the following command in the directory of `docker-compose.yml` to stop all containers: +```shell +docker-compose down +``` + +{{< top >}} diff --git a/docs/content/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md b/docs/content/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md new file mode 100644 index 0000000000..31150fdab0 --- /dev/null +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/db2-tutorial.md @@ -0,0 +1,163 @@ +--- +title: "Db2 Tutorial" +weight: 8 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/db2-tutorial.html +--- + + +# Demo: Db2 CDC to Elasticsearch + +**1. 
Create `docker-compose.yml` file using following contents:** + +``` +version: '2.1' +services: + db2: + image: ruanhang/db2-cdc-demo:v1 + privileged: true + ports: + - 50000:50000 + environment: + - LICENSE=accept + - DB2INSTANCE=db2inst1 + - DB2INST1_PASSWORD=admin + - DBNAME=testdb + - ARCHIVE_LOGS=true + elasticsearch: + image: elastic/elasticsearch:7.6.0 + environment: + - cluster.name=docker-cluster + - bootstrap.memory_lock=true + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - discovery.type=single-node + ports: + - "9200:9200" + - "9300:9300" + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + kibana: + image: elastic/kibana:7.6.0 + ports: + - "5601:5601" + volumes: + - /var/run/docker.sock:/var/run/docker.sock +``` +The Docker Compose environment consists of the following containers: +- Db2: db2 server and a pre-populated `products` table in the database `testdb`. +- Elasticsearch: store the result of the `products` table. +- Kibana: mainly used to visualize the data in Elasticsearch + +To start all containers, run the following command in the directory that contains the docker-compose.yml file. +```shell +docker-compose up -d +``` +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. +Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally. + +Don’t forget to run the following command to stop all containers after you finished the tutorial: +```shell +docker-compose down +``` + +**2. Download following JAR package to `/lib`** + +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.** + +- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) +- flink-sql-connector-db2-cdc-3.0-SNAPSHOT.jar + +**3. Launch a Flink cluster and start a Flink SQL CLI** + +Execute following SQL statements in the Flink SQL CLI: + +```sql +-- Flink SQL +-- checkpoint every 3000 milliseconds +Flink SQL> SET execution.checkpointing.interval = 3s; + +Flink SQL> CREATE TABLE products ( + ID INT NOT NULL, + NAME STRING, + DESCRIPTION STRING, + WEIGHT DECIMAL(10,3), + PRIMARY KEY (ID) NOT ENFORCED + ) WITH ( + 'connector' = 'db2-cdc', + 'hostname' = 'localhost', + 'port' = '50000', + 'username' = 'db2inst1', + 'password' = 'admin', + 'database-name' = 'testdb', + 'schema-name' = 'DB2INST1', + 'table-name' = 'PRODUCTS' + ); + +Flink SQL> CREATE TABLE es_products ( + ID INT NOT NULL, + NAME STRING, + DESCRIPTION STRING, + WEIGHT DECIMAL(10,3), + PRIMARY KEY (ID) NOT ENFORCED + ) WITH ( + 'connector' = 'elasticsearch-7', + 'hosts' = 'http://localhost:9200', + 'index' = 'enriched_products_1' + ); + +Flink SQL> INSERT INTO es_products SELECT * FROM products; +``` + +**4. Check result in Elasticsearch** + +Check the data has been written to Elasticsearch successfully, you can visit [Kibana](http://localhost:5601/) to see the data. + +**5. 
Make changes in Db2 and watch result in Elasticsearch** + +Enter Db2's container to make some changes in Db2, then you can see the result in Elasticsearch will change after +executing every SQL statement: +```shell +docker exec -it ${containerId} /bin/bash + +su db2inst1 + +db2 connect to testdb + +# enter db2 and execute sqls +db2 +``` + +```sql +UPDATE DB2INST1.PRODUCTS SET DESCRIPTION='18oz carpenter hammer' WHERE ID=106; + +INSERT INTO DB2INST1.PRODUCTS VALUES (default,'jacket','water resistent white wind breaker',0.2); + +INSERT INTO DB2INST1.PRODUCTS VALUES (default,'scooter','Big 2-wheel scooter ',5.18); + +DELETE FROM DB2INST1.PRODUCTS WHERE ID=111; +``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mongodb-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/mongodb-tutorial.md similarity index 79% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mongodb-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/mongodb-tutorial.md index f2b7c1936f..1e58df8e2f 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mongodb-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/mongodb-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "MongoDB Tutorial" +weight: 1 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html +--- -# 演示: MongoDB CDC 导入 Elasticsearch +# Demo: MongoDB CDC to Elasticsearch -1. 下载 `docker-compose.yml` +1. Create `docker-compose.yml` file using following contents: ``` version: '2.1' @@ -55,20 +62,20 @@ services: - "5601:5601" ``` -2. 进入 MongoDB 容器,初始化副本集和数据: +2. Enter Mongodb's container and initialize replica set and data: ``` docker-compose exec mongo /usr/bin/mongo -u mongouser -p mongopw ``` ```javascript -// 1. 初始化副本集 +// 1. initialize replica set rs.initiate(); rs.status(); -// 2. 切换数据库 +// 2. switch database use mgdb; -// 3. 初始化数据 +// 3. initialize data db.orders.insertMany([ { order_id: 101, @@ -124,21 +131,21 @@ db.customers.insertMany([ ]); ``` -3. 下载以下 jar 包到 `/lib/`: +3. Download following JAR package to `/lib/`: -```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译``` +```Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. ``` - - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) - - [flink-sql-connector-mongodb-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/2.5-SNAPSHOT/flink-sql-connector-mongodb-cdc-2.5-SNAPSHOT.jar) +- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) +- [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) -4. 然后启动 Flink 集群,再启动 SQL CLI. +4. 
Launch a Flink cluster, then start a Flink SQL CLI and execute following SQL statements inside: ```sql -- Flink SQL --- 设置间隔时间为3秒 +-- checkpoint every 3000 milliseconds Flink SQL> SET execution.checkpointing.interval = 3s; --- 设置本地时区为 Asia/Shanghai +-- set local time zone as Asia/Shanghai Flink SQL> SET table.local-time-zone = Asia/Shanghai; Flink SQL> CREATE TABLE orders ( @@ -203,7 +210,7 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN customers AS c ON o.customer_id = c.customer_id; ``` -5. 修改 MongoDB 里面的数据,观察 elasticsearch 里的结果。 +5. Make some changes in MongoDB, then check the result in Elasticsearch: ```javascript db.orders.insert({ @@ -233,3 +240,5 @@ db.orders.deleteOne( { order_id : 104 } ); ``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-postgres-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.md similarity index 56% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-postgres-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.md index 3914f043db..d735825fe7 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-postgres-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "Mysql & Postgres Tutorial" +weight: 2 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.html +--- -# 基于 Flink CDC 构建 MySQL 和 Postgres 的 Streaming ETL +# Streaming ETL for MySQL and Postgres with Flink CDC -这篇教程将展示如何基于 Flink CDC 快速构建 MySQL 和 Postgres 的流式 ETL。本教程的演示都将在 Flink SQL CLI 中进行,只涉及 SQL,无需一行 Java/Scala 代码,也无需安装 IDE。 +This tutorial is to show how to quickly build streaming ETL for MySQL and Postgres with Flink CDC. -假设我们正在经营电子商务业务,商品和订单的数据存储在 MySQL 中,订单对应的物流信息存储在 Postgres 中。 -对于订单表,为了方便进行分析,我们希望让它关联上其对应的商品和物流信息,构成一张宽表,并且实时把它写到 ElasticSearch 中。 +Assuming we are running an e-commerce business. The product and order data stored in MySQL, the shipment data related to the order is stored in Postgres. +We want to enrich the orders using the product and shipment table, and then load the enriched orders to ElasticSearch in real time. -接下来的内容将介绍如何使用 Flink Mysql/Postgres CDC 来实现这个需求,系统的整体架构如下图所示: -![Flink CDC Streaming ETL](/_static/fig/mysql-postgress-tutorial/flink-cdc-streaming-etl.png "Flink CDC Streaming ETL") +In the following sections, we will describe how to use Flink Mysql/Postgres CDC to implement it. +All exercises in this tutorial are performed in the Flink SQL CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation. -## 准备阶段 -准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。 +The overview of the architecture is as follows: +{{< img src="/fig/mysql-postgres-tutorial/flink-cdc-streaming-etl.png" width="700px" alt="Flink CDC Streaming ETL" >}} -### 准备教程所需要的组件 -接下来的教程将以 `docker-compose` 的方式准备所需要的组件。 +## Preparation +Prepare a Linux or MacOS computer with Docker installed. -使用下面的内容创建一个 `docker-compose.yml` 文件: +### Starting components required +The components required in this demo are all managed in containers, so we will use `docker-compose` to start them. 
+ +Create `docker-compose.yml` file using following contents: ``` version: '2.1' services: @@ -75,34 +85,35 @@ services: ports: - "5601:5601" ``` -该 Docker Compose 中包含的容器有: -- MySQL: 商品表 `products` 和 订单表 `orders` 将存储在该数据库中, 这两张表将和 Postgres 数据库中的物流表 `shipments`进行关联,得到一张包含更多信息的订单表 `enriched_orders` -- Postgres: 物流表 `shipments` 将存储在该数据库中 -- Elasticsearch: 最终的订单表 `enriched_orders` 将写到 Elasticsearch -- Kibana: 用来可视化 ElasticSearch 的数据 +The Docker Compose environment consists of the following containers: +- MySQL: the `products`,`orders` tables will be store in the database. They will be joined with data in Postgres to enrich the orders. +- Postgres: the `shipments` table will be store in the database. +- Elasticsearch: mainly used as a data sink to store enriched orders. +- Kibana: used to visualize the data in Elasticsearch. -在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: +To start all containers, run the following command in the directory that contains the `docker-compose.yml` file. ```shell docker-compose up -d ``` -该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 docker ps 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:5601/](http://localhost:5601/) 来查看 Kibana 是否运行正常。 +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly. +We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kibana is running normally. -### 下载 Flink 和所需要的依赖包 -1. 下载 [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) 并将其解压至目录 `flink-1.18.0` -2. 下载下面列出的依赖包,并将它们放到目录 `flink-1.18.0/lib/` 下: +### Preparing Flink and JAR package required +1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0` +2. Download following JAR package required and put them under `flink-1.18.0/lib/`: - **下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译** + **Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.** - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) - - flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar - - flink-sql-connector-postgres-cdc-2.5-SNAPSHOT.jar + - flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar + - flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar -### 准备数据 -#### 在 MySQL 数据库中准备数据 -1. 进入 MySQL 容器 +### Preparing data in databases +#### Preparing data in MySQL +1. Enter mysql's container: ```shell docker-compose exec mysql mysql -uroot -p123456 ``` -2. 创建数据库和表 `products`,`orders`,并插入数据 +2. Create tables and populate data: ```sql -- MySQL CREATE DATABASE mydb; @@ -139,12 +150,12 @@ docker-compose up -d (default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false), (default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false); ``` -#### 在 Postgres 数据库中准备数据 -1. 进入 Postgres 容器 +#### Preparing data in Postgres +1. Enter postgres's container: ```shell docker-compose exec postgres psql -h localhost -U postgres ``` -2. 创建表 `shipments`,并插入数据 +2. Create tables and populate data ```sql -- PG CREATE TABLE shipments ( @@ -162,37 +173,37 @@ docker-compose up -d (default,10003,'Shanghai','Hangzhou',false); ``` -## 启动 Flink 集群和 Flink SQL CLI +## Starting Flink cluster and Flink SQL CLI -1. 
使用下面的命令跳转至 Flink 目录下 +1. Use the following command to change to the Flink directory: ``` cd flink-1.18.0 ``` -2. 使用下面的命令启动 Flink 集群 +2. Use the following command to start a Flink cluster: ```shell ./bin/start-cluster.sh ``` - 启动成功的话,可以在 [http://localhost:8081/](http://localhost:8081/) 访问到 Flink Web UI,如下所示: + Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like: - ![Flink UI](/_static/fig/mysql-postgress-tutorial/flink-ui.png "Flink UI") + {{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" width="700px" alt="Flink UI" >}} -3. 使用下面的命令启动 Flink SQL CLI +3. Use the following command to start a Flink SQL CLI: ```shell ./bin/sql-client.sh ``` - 启动成功后,可以看到如下的页面: + We should see the welcome screen of the CLI client. - ![Flink SQL_Client](/_static/fig/mysql-postgress-tutorial/flink-sql-client.png "Flink SQL Client") + {{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}} -## 在 Flink SQL CLI 中使用 Flink DDL 创建表 -首先,开启 checkpoint,每隔3秒做一次 checkpoint +## Creating tables using Flink DDL in Flink SQL CLI +First, enable checkpoints every 3 seconds ```sql -- Flink SQL Flink SQL> SET execution.checkpointing.interval = 3s; ``` -然后, 对于数据库中的表 `products`, `orders`, `shipments`, 使用 Flink SQL CLI 创建对应的表,用于同步这些底层数据库表的数据 +Then, create tables that capture the change data from the corresponding database tables. ```sql -- Flink SQL Flink SQL> CREATE TABLE products ( @@ -248,7 +259,7 @@ Flink SQL> CREATE TABLE shipments ( ); ``` -最后,创建 `enriched_orders` 表, 用来将关联后的订单数据写入 Elasticsearch 中 +Finally, create `enriched_orders` table that is used to load data to the Elasticsearch. ```sql -- Flink SQL Flink SQL> CREATE TABLE enriched_orders ( @@ -272,8 +283,8 @@ Flink SQL> CREATE TABLE enriched_orders ( ); ``` -## 关联订单数据并且将其写入 Elasticsearch 中 -使用 Flink SQL 将订单表 `order` 与 商品表 `products`,物流信息表 `shipments` 关联,并将关联后的订单信息写入 Elasticsearch 中 +## Enriching orders and load to ElasticSearch +Use Flink SQL to join the `order` table with the `products` and `shipments` table to enrich orders and write to the Elasticsearch. ```sql -- Flink SQL Flink SQL> INSERT INTO enriched_orders @@ -282,55 +293,54 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN products AS p ON o.product_id = p.id LEFT JOIN shipments AS s ON o.order_id = s.order_id; ``` -现在,就可以在 Kibana 中看到包含商品和物流信息的订单数据。 - -首先访问 [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) 创建 index pattern `enriched_orders`. +Now, the enriched orders should be shown in Kibana. +Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`. -![Create Index Pattern](/_static/fig/mysql-postgress-tutorial/kibana-create-index-pattern.png "Create Index Pattern") +{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" width="700px" alt="Create Index Pattern" >}} -然后就可以在 [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) 看到写入的数据了. +Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders. 
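If you prefer to sanity-check the pipeline from the SQL CLI before switching to Kibana, you can run the same join as a regular streaming query; the result-mode setting below is optional and only keeps the output in the terminal, and the column names follow the table definitions created above:

```sql
-- Flink SQL: optional sanity check of the streaming join from the CLI
Flink SQL> SET sql-client.execution.result-mode = tableau;

Flink SQL> SELECT o.order_id, o.customer_name, p.name AS product_name, s.origin, s.destination, s.is_arrived
           FROM orders AS o
           LEFT JOIN products AS p ON o.product_id = p.id
           LEFT JOIN shipments AS s ON o.order_id = s.order_id;
```

Every change made in MySQL or Postgres in the following steps should then show up in this query's output as well as in Kibana.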
-![Find enriched Orders](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders.png "Find enriched Orders") +{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" width="700px" alt="Find enriched Orders" >}} -接下来,修改 MySQL 和 Postgres 数据库中表的数据,Kibana中显示的订单数据也将实时更新: -1. 在 MySQL 的 `orders` 表中插入一条数据 +Next, do some change in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time. +1. Insert a new order in MySQL ```sql --MySQL INSERT INTO orders VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false); ``` -2. 在 Postgres 的 `shipment` 表中插入一条数据 +2. Insert a shipment in Postgres ```sql --PG INSERT INTO shipments VALUES (default,10004,'Shanghai','Beijing',false); ``` -3. 在 MySQL 的 `orders` 表中更新订单的状态 +3. Update the order status in MySQL ```sql --MySQL UPDATE orders SET order_status = true WHERE order_id = 10004; ``` -4. 在 Postgres 的 `shipment` 表中更新物流的状态 +4. Update the shipment status in Postgres ```sql --PG UPDATE shipments SET is_arrived = true WHERE shipment_id = 1004; ``` -5. 在 MYSQL 的 `orders` 表中删除一条数据 +5. Delete the order in MySQL ```sql --MySQL DELETE FROM orders WHERE order_id = 10004; ``` - 每执行一步就刷新一次 Kibana,可以看到 Kibana 中显示的订单数据将实时更新,如下所示: - ![Enriched Orders Changes](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders-changes.gif "Enriched Orders Changes") - -## 环境清理 -本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: + The changes of enriched orders in Kibana are as follows: + {{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" width="700px" alt="Enriched Orders Changes" >}} + +## Clean up +After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`: ```shell docker-compose down ``` -在 Flink 所在目录 `flink-1.18.0` 下执行如下命令停止 Flink 集群: +Run the following command to stop the Flink cluster in the directory of Flink `flink-1.18.0`: ```shell ./bin/stop-cluster.sh ``` - +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oceanbase-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/oceanbase-tutorial.md similarity index 73% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oceanbase-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/oceanbase-tutorial.md index 89264aa7de..6bb93d67bc 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oceanbase-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/oceanbase-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "OceanBase Tutorial" +weight: 3 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/oceanbase-tutorial.html +--- -# 演示: OceanBase CDC 导入 Elasticsearch +# Demo: OceanBase CDC to ElasticSearch -## 视频教程 +## Video tutorial - [YouTube](https://www.youtube.com/watch?v=ODGE-73Dntg&t=2s) - [Bilibili](https://www.bilibili.com/video/BV1Zg411a7ZB/?spm_id_from=333.999.0.0) +### Preparation -### 准备教程所需要的组件 +#### Configure and start the components -#### 配置并启动容器 +Create `docker-compose.yml`. -配置 `docker-compose.yml`。 - -*注意*:本示例需要使用`host`网络,所以只能在 Linux 系统上运行,更多信息见 [network-tutorial-host](https://docs.docker.com/network/network-tutorial-host/)。 +*Note*: `host` network mode is required in this demo, so it can only work on Linux, see [network-tutorial-host](https://docs.docker.com/network/network-tutorial-host/). 
```yaml version: '2.1' @@ -77,36 +83,40 @@ services: - '/var/run/docker.sock:/var/run/docker.sock' ``` -在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: +Execute the following command in the directory where `docker-compose.yml` is located. ```shell docker-compose up -d ``` -### 设置密码 +### Set password + +From OceanBase 4.0.0.0 CE, we can only fetch the commit log of non-sys tenant. -OceanBase 从社区版 4.0.0.0 开始只支持对非 sys 租户的增量数据拉取,这里我们使用 test 租户的 root 用户作为示例。 +Here we use the 'test' tenant for example. -登陆 test 租户的 root 用户: +Login with 'root' user of 'test' tenant: ```shell docker-compose exec observer obclient -h127.0.0.1 -P2881 -uroot@test ``` -设置密码: +Set a password: ```mysql ALTER USER root IDENTIFIED BY 'test'; ``` -### 准备数据 +### Create data for reading snapshot -使用 'root@test' 用户登陆。 +Login 'root' user of 'test' tenant. ```shell docker-compose exec observer obclient -h127.0.0.1 -P2881 -uroot@test -ptest ``` +Insert data: + ```sql CREATE DATABASE ob; USE ob; @@ -144,23 +154,23 @@ VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false), (default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false); ``` -### 下载所需要的依赖包 +### Download the libraries required -```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译``` +```Download links are only available for stable releases.``` - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) -- [flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/2.5-SNAPSHOT/flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar) +- [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) -### 在 Flink SQL CLI 中使用 Flink DDL 创建表 +### Use Flink DDL to create dynamic table in Flink SQL CLI ```sql --- 设置间隔时间为3秒 +-- checkpoint every 3000 milliseconds Flink SQL> SET execution.checkpointing.interval = 3s; --- 设置本地时区为 Asia/Shanghai +-- set local time zone as Asia/Shanghai Flink SQL> SET table.local-time-zone = Asia/Shanghai; --- 创建订单表 +-- create orders table Flink SQL> CREATE TABLE orders ( order_id INT, order_date TIMESTAMP(0), @@ -185,7 +195,7 @@ Flink SQL> CREATE TABLE orders ( 'working-mode' = 'memory' ); --- 创建商品表 +-- create products table Flink SQL> CREATE TABLE products ( id INT, name STRING, @@ -205,9 +215,9 @@ Flink SQL> CREATE TABLE products ( 'logproxy.host' = 'localhost', 'logproxy.port' = '2983', 'working-mode' = 'memory' - ); + ); --- 创建关联后的订单数据表 +-- create flat table enriched_orders Flink SQL> CREATE TABLE enriched_orders ( order_id INT, order_date TIMESTAMP(0), @@ -223,7 +233,7 @@ Flink SQL> CREATE TABLE enriched_orders ( 'hosts' = 'http://localhost:9200', 'index' = 'enriched_orders'); --- 执行读取和写入 +-- Start the reading and writing job Flink SQL> INSERT INTO enriched_orders SELECT o.order_id, o.order_date, @@ -237,13 +247,13 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN products AS p ON o.product_id = p.id; ``` -### 在 Kibana 中查看数据 +### Check data on Kibana -访问 [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) 创建 index pattern `enriched_orders`,之后可以在 [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) 看到写入的数据了。 +Open 
[http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) and create index pattern `enriched_orders`, then go to [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover), and you will see the data of `enriched_orders`. -### 修改监听表数据,查看增量数据变动 +### Check data changes -在OceanBase中依次执行如下修改操作,每执行一步就刷新一次 Kibana,可以看到 Kibana 中显示的订单数据将实时更新。 +Execute the following sql in OceanBase under `ob` database, you will find records in Kibana be updated after each step in real time. ```sql INSERT INTO orders VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false); @@ -251,16 +261,18 @@ UPDATE orders SET order_status = true WHERE order_id = 10004; DELETE FROM orders WHERE order_id = 10004; ``` -### 环境清理 +### Clean up -在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: +Execute the following command to stop all containers in the directory where `docker-compose.yml` is located. ```shell docker-compose down ``` -进入Flink的部署目录,停止 Flink 集群: +Stop the flink cluster by following command. ```shell ./bin/stop-cluster.sh ``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oracle-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/oracle-tutorial.md similarity index 75% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oracle-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/oracle-tutorial.md index 0be4b2407c..1707e36a56 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/oracle-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/oracle-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "Oracle Tutorial" +weight: 4 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/oracle-tutorial.html +--- -# 演示: Oracle CDC 导入 Elasticsearch +# Demo: Oracle CDC to Elasticsearch -**创建`docker-compose.yml`文件,内容如下所示:** +**Create `docker-compose.yml` file using following contents:** ``` version: '2.1' @@ -51,36 +58,35 @@ services: - "5601:5601" volumes: - /var/run/docker.sock:/var/run/docker.sock -``` -该 Docker Compose 中包含的容器有: -- Oracle: Oracle 19c 数据库 -- Elasticsearch: `orders` 表将和 `products` 表进行join,join的结果写入Elasticsearch中 -- Kibana: 可视化 Elasticsearch 中的数据 +``` +The Docker Compose environment consists of the following containers: +- Oracle: Oracle 19c database. +- Elasticsearch: store the join result of the `orders` and `products` table. +- Kibana: mainly used to visualize the data in Elasticsearch -在 docker-compose.yml 所在目录下运行如下命令以启动所有容器: +To start all containers, run the following command in the directory that contains the docker-compose.yml file. ```shell docker-compose up -d ``` -该命令会以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。 -你可以通过 docker ps 来观察上述的容器是否正常启动了。 也可以访问 http://localhost:5601/ 来查看 Kibana 是否运行正常。 -另外可以通过如下命令停止所有的容器: +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. +Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally. 
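Note that the Oracle 19c container can take a few minutes to finish initializing. Since the Oracle CDC connector reads changes through LogMiner, the database also has to run in ARCHIVELOG mode; the demo image used here is expected to ship with that already enabled, but it is cheap to verify once you can open the sqlplus session shown in the data-preparation step below (use a privileged account if the demo user cannot read the V$ views):

```sql
-- Optional checks in sqlplus before preparing data
SELECT status FROM v$instance;     -- should report OPEN once initialization has finished
SELECT log_mode FROM v$database;   -- LogMiner-based CDC requires ARCHIVELOG
```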
+Don’t forget to run the following command to stop all containers after you finished the tutorial: ```shell docker-compose down -```` +``` -**下载以下 jar 包到 `/lib/`:** +**Download following JAR package to `/lib`** -*下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译* +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.** - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) -- [flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oracle-cdc/2.5-SNAPSHOT/flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar) - +- [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) -**在 Oracle 数据库中准备数据** -创建数据库和表 `products`,`orders`,并插入数据: +**Preparing data in Oracle database** +Introduce the tables in Oracle: ```shell docker-compose exec oracle sqlplus debezium/dbz@localhost:1521/ORCLCDB ``` @@ -141,7 +147,9 @@ INSERT INTO DEBEZIUM.ORDERS VALUES (1003, TO_TIMESTAMP('2020-07-30 12:00:30.0010 INSERT INTO DEBEZIUM.ORDERS VALUES (1004, TO_TIMESTAMP('2020-07-30 15:22:00.001000', 'YYYY-MM-DD HH24:MI:SS.FF'), 'Jark', 1, 104); ``` -**然后启动 Flink 集群,再启动 SQL CLI:** +**Launch a Flink cluster and start a Flink SQL CLI** + +Execute following SQL statements in the Flink SQL CLI: ```sql -- Flink SQL @@ -202,13 +210,13 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN products AS p ON o.PRODUCT_ID = p.ID; ``` -**检查 ElasticSearch 中的结果** +**Check result in Elasticsearch** -检查最终的结果是否写入ElasticSearch中, 可以在[Kibana](http://localhost:5601/)看到ElasticSearch中的数据 +Check the data has been written to Elasticsearch successfully, you can visit [Kibana](http://localhost:5601/) to see the data. -**在 Oracle 制造一些变更,观察 ElasticSearch 中的结果** +**Make changes in Oracle and watch result in Elasticsearch** -进入Oracle容器中并通过如下的SQL语句对Oracle数据库进行一些修改, 然后就可以看到每执行一条SQL语句,Elasticsearch中的数据都会实时更新。 +Enter Oracle's container to make some changes in Oracle, then you can see the result in Elasticsearch will change after executing every SQL statement: ```shell docker-compose exec oracle sqlplus debezium/dbz@localhost:1521/ORCLCDB @@ -220,4 +228,6 @@ INSERT INTO DEBEZIUM.ORDERS VALUES (1005, TO_TIMESTAMP('2020-07-30 15:22:00.0010 UPDATE DEBEZIUM.ORDERS SET QUANTITY = 10 WHERE ID = 1002; DELETE FROM DEBEZIUM.ORDERS WHERE ID = 1004; -``` \ No newline at end of file +``` + +{{< top >}} diff --git a/docs/content/docs/try-flink-cdc/cdc-connectors/polardbx-tutorial.md b/docs/content/docs/try-flink-cdc/cdc-connectors/polardbx-tutorial.md new file mode 100644 index 0000000000..37e3ed7da7 --- /dev/null +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/polardbx-tutorial.md @@ -0,0 +1,289 @@ +--- +title: "PolarDB-X Tutorial" +weight: 5 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html +--- + + +# Demo: PolarDB-X CDC to Elasticsearch + +This tutorial is to show how to quickly build streaming ETL for PolarDB-X with Flink CDC. + +Assuming we are running an e-commerce business. The product and order data stored in PolarDB-X. +We want to enrich the orders using the product table, and then load the enriched orders to ElasticSearch in real time. 
+
+In the following sections, we will describe how to use Flink PolarDB-X CDC to implement it.
+All exercises in this tutorial are performed in the Flink SQL CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation.
+
+## Preparation
+Prepare a Linux or MacOS computer with Docker installed.
+
+### Starting components required
+The components required in this demo are all managed in containers, so we will use `docker-compose` to start them.
+
+Create `docker-compose.yml` file using following contents:
+```
+version: '2.1'
+services:
+  polardbx:
+    image: polardbx/polardb-x:2.0.1
+    container_name: polardbx
+    ports:
+      - "8527:8527"
+  elasticsearch:
+    image: 'elastic/elasticsearch:7.6.0'
+    container_name: elasticsearch
+    environment:
+      - cluster.name=docker-cluster
+      - bootstrap.memory_lock=true
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - discovery.type=single-node
+    ports:
+      - '9200:9200'
+      - '9300:9300'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+  kibana:
+    image: 'elastic/kibana:7.6.0'
+    container_name: kibana
+    ports:
+      - '5601:5601'
+    volumes:
+      - '/var/run/docker.sock:/var/run/docker.sock'
+```
+The Docker Compose environment consists of the following containers:
+- PolarDB-X: the `products`,`orders` tables will be stored in the database. They will be joined to enrich the orders.
+- Elasticsearch: mainly used as a data sink to store enriched orders.
+- Kibana: used to visualize the data in Elasticsearch.
+
+To start all containers, run the following command in the directory that contains the `docker-compose.yml` file.
+```shell
+docker-compose up -d
+```
+This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly.
+We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kibana is running normally.
+
+### Preparing Flink and JAR package required
+1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0`
+2. Download following JAR package required and put them under `flink-1.18.0/lib/`:
+
+   **Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
+   - flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
+   - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
+
+### Preparing data in databases
+#### Preparing data in PolarDB-X
+1. Enter PolarDB-X Database:
+   ```shell
+   mysql -h127.0.0.1 -P8527 -upolardbx_root -p"123456"
+   ```
+2. 

Create tables and populate data: + ```sql + -- PolarDB-X + CREATE TABLE products ( + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(255) NOT NULL, + description VARCHAR(512) + ) AUTO_INCREMENT = 101; + + INSERT INTO products + VALUES (default,"scooter","Small 2-wheel scooter"), + (default,"car battery","12V car battery"), + (default,"12-pack drill bits","12-pack of drill bits with sizes ranging from #40 to #3"), + (default,"hammer","12oz carpenter's hammer"), + (default,"hammer","14oz carpenter's hammer"), + (default,"hammer","16oz carpenter's hammer"), + (default,"rocks","box of assorted rocks"), + (default,"jacket","water resistent black wind breaker"), + (default,"spare tire","24 inch spare tire"); + + + CREATE TABLE orders ( + order_id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + order_date DATETIME NOT NULL, + customer_name VARCHAR(255) NOT NULL, + price DECIMAL(10, 5) NOT NULL, + product_id INTEGER NOT NULL, + order_status BOOLEAN NOT NULL -- Whether order has been placed + ) AUTO_INCREMENT = 10001; + + INSERT INTO orders + VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false), + (default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false), + (default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false); + ``` + +## Starting Flink cluster and Flink SQL CLI + +1. Use the following command to change to the Flink directory: + ``` + cd flink-1.18.0 + ``` + +2. Use the following command to start a Flink cluster: + ```shell + ./bin/start-cluster.sh + ``` + + Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like: + + {{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" alt="Flink UI" >}} + +3. Use the following command to start a Flink SQL CLI: + ```shell + ./bin/sql-client.sh + ``` + We should see the welcome screen of the CLI client. + + {{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}} + +## Creating tables using Flink DDL in Flink SQL CLI +First, enable checkpoints every 3 seconds +```sql +-- Flink SQL +Flink SQL> SET execution.checkpointing.interval = 3s; +``` + +Then, create tables that capture the change data from the corresponding database tables. +```sql +-- Flink SQL +Flink SQL> SET execution.checkpointing.interval = 3s; + +-- create source table2 - orders +Flink SQL> CREATE TABLE orders ( + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + PRIMARY KEY (order_id) NOT ENFORCED + ) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = '127.0.0.1', + 'port' = '8527', + 'username' = 'polardbx_root', + 'password' = '123456', + 'database-name' = 'mydb', + 'table-name' = 'orders' + ); + +-- create source table2 - products +CREATE TABLE products ( + id INT, + name STRING, + description STRING, + PRIMARY KEY (id) NOT ENFORCED + ) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = '127.0.0.1', + 'port' = '8527', + 'username' = 'polardbx_root', + 'password' = '123456', + 'database-name' = 'mydb', + 'table-name' = 'products' +); +``` + +Finally, create `enriched_orders` table that is used to load data to the Elasticsearch. 
+```sql +-- Flink SQL +-- create sink table - enrich_orders +Flink SQL> CREATE TABLE enriched_orders ( + order_id INT, + order_date TIMESTAMP(0), + customer_name STRING, + price DECIMAL(10, 5), + product_id INT, + order_status BOOLEAN, + product_name STRING, + product_description STRING, + PRIMARY KEY (order_id) NOT ENFORCED + ) WITH ( + 'connector' = 'elasticsearch-7', + 'hosts' = 'http://localhost:9200', + 'index' = 'enriched_orders' + ); +``` + +## Enriching orders and load to ElasticSearch +Use Flink SQL to join the `order` table with the `products` table to enrich orders and write to the Elasticsearch. +```sql +-- Flink SQL +Flink SQL> INSERT INTO enriched_orders + SELECT o.order_id, + o.order_date, + o.customer_name, + o.price, + o.product_id, + o.order_status, + p.name, + p.description + FROM orders AS o + LEFT JOIN products AS p ON o.product_id = p.id; +``` +Now, the enriched orders should be shown in Kibana. +Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`. + +{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" alt="Create Index Pattern" >}} + +Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders. + +{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" alt="Find enriched Orders" >}} + +Next, do some change in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time. +1. Insert a new order in PolarDB-X + ```sql + --PolarDB-X + INSERT INTO orders + VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false); + ``` +2. Update the order status in PolarDB-X + ```sql + --PolarDB-X + UPDATE orders SET order_status = true WHERE order_id = 10004; + ``` +3. 
Delete the order in PolarDB-X + ```sql + --PolarDB-X + DELETE FROM orders WHERE order_id = 10004; + ``` + The changes of enriched orders in Kibana are as follows: + {{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" alt="Enriched Orders Changes" >}} + +## Clean up +After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`: +```shell +docker-compose down +``` +Run the following command to stop the Flink cluster in the directory of Flink `flink-1.18.0`: +```shell +./bin/stop-cluster.sh +``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/sqlserver-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/sqlserver-tutorial.md similarity index 75% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/sqlserver-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/sqlserver-tutorial.md index aa46331a82..3929caca50 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/sqlserver-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/sqlserver-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "SqlServer Tutorial" +weight: 6 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/sqlserver-tutorial.html +--- -# 演示: SqlServer CDC 导入 Elasticsearch +# Demo: SqlServer CDC to Elasticsearch -**创建 `docker-compose.yml` 文件,内容如下所示:** +**Create `docker-compose.yml` file using following contents:** ``` version: '2.1' @@ -60,35 +67,35 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock ``` -该 Docker Compose 中包含的容器有: -- SqlServer:SqlServer 数据库。 -- Elasticsearch:`orders` 表将和 `products` 表进行 join,join 的结果写入 Elasticsearch 中。 -- Kibana:可视化 Elasticsearch 中的数据。 +The Docker Compose environment consists of the following containers: +- SqlServer: SqlServer database. +- Elasticsearch: store the join result of the `orders` and `products` table. +- Kibana: mainly used to visualize the data in Elasticsearch. -在 docker-compose.yml 所在目录下运行如下命令以启动所有容器: +To start all containers, run the following command in the directory that contains the docker-compose.yml file: ```shell docker-compose up -d ``` -该命令会以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。 -你可以通过 docker ps 来观察上述的容器是否正常启动了。 也可以访问 http://localhost:5601/ 来查看 Kibana 是否运行正常。 +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. +Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally. 
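One more point worth checking for this connector: SqlServer CDC relies on the SQL Server Agent, which ships changes into the capture tables, so the agent must be running inside the container. A minimal check from any SQL session against the instance, using a standard SQL Server DMV rather than anything specific to this tutorial:

```sql
-- The SQLSERVERAGENT row should show status_desc = 'Running'
SELECT servicename, status_desc FROM sys.dm_server_services;
```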
-另外可以通过如下命令停止并删除所有的容器: +Don’t forget to run the following command to stop and remove all containers after you finished the tutorial: ```shell docker-compose down ```` -**下载以下 jar 包到 `/lib/`:** +**Download following JAR package to `/lib`:** -```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译``` +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.** - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) -- [flink-sql-connector-sqlserver-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-sqlserver-cdc/2.5-SNAPSHOT/flink-sql-connector-sqlserver-cdc-2.5-SNAPSHOT.jar) +- [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) -**在 SqlServer 数据库中准备数据** +**Preparing data in SqlServer database** -创建数据库和表 `products`,`orders`,并插入数据: +Create databases/tables and populate data ```sql -- Sqlserver @@ -143,7 +150,7 @@ docker-compose down EXEC sys.sp_cdc_enable_table @source_schema = 'dbo', @source_name = 'orders', @role_name = NULL, @supports_net_changes = 0; GO ``` -**然后启动 Flink 集群,再启动 SQL CLI:** +**Launch a Flink cluster and start a Flink SQL CLI:** ```sql -- Flink SQL @@ -202,13 +209,13 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN products AS p ON o.product_id = p.id; ``` -**检查 ElasticSearch 中的结果** +**Check result in Elasticsearch** -检查最终的结果是否写入 ElasticSearch 中,可以在 [Kibana](http://localhost:5601/) 看到 ElasticSearch 中的数据。 +Check the data has been written to Elasticsearch successfully, you can visit [Kibana](http://localhost:5601/) to see the data. -**在 SqlServer 制造一些变更,观察 ElasticSearch 中的结果** +**Make changes in SqlServer and watch result in Elasticsearch** -通过如下的 SQL 语句对 SqlServer 数据库进行一些修改,然后就可以看到每执行一条 SQL 语句,Elasticsearch 中的数据都会实时更新。 +Do some changes in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time. 
```sql INSERT INTO orders(order_date,purchaser,quantity,product_id) VALUES ('22-FEB-2016', 1006, 22, 107); @@ -220,3 +227,5 @@ GO DELETE FROM orders WHERE id = 10004; GO ``` + +{{< top >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/tidb-tutorial-zh.md" b/docs/content/docs/try-flink-cdc/cdc-connectors/tidb-tutorial.md similarity index 75% rename from "docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/tidb-tutorial-zh.md" rename to docs/content/docs/try-flink-cdc/cdc-connectors/tidb-tutorial.md index fde346c546..e8a0a0f198 100644 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/tidb-tutorial-zh.md" +++ b/docs/content/docs/try-flink-cdc/cdc-connectors/tidb-tutorial.md @@ -1,3 +1,10 @@ +--- +title: "TiDB Tutorial" +weight: 7 +type: docs +aliases: +- /try-flink-cdc/cdc-connectors/tidb-tutorial.html +--- -# 演示: TiDB CDC 导入 Elasticsearch +# Demo: TiDB CDC to Elasticsearch -**首先我们得通过 docker 来启动 TiDB 集群。** +**First,we will start TiDB cluster with docker.** ```shell $ git clone https://github.com/pingcap/tidb-docker-compose.git ``` -**其次替换目录 `tidb-docker-compose` 里面的 `docker-compose.yml` 文件,内容如下所示:** +**Next,replace `docker-compose.yml` file using following contents in directory `tidb-docker-compose`:** ``` version: "2.1" @@ -111,37 +118,37 @@ services: - /var/run/docker.sock:/var/run/docker.sock ``` -该 Docker Compose 中包含的容器有: -- TiDB 集群: tikv、pd、tidb。 -- Elasticsearch:`orders` 表将和 `products` 表进行 join,join 的结果写入 Elasticsearch 中。 -- Kibana:可视化 Elasticsearch 中的数据。 +The Docker Compose environment consists of the following containers: +- TiDB cluster: tikv、pd、tidb. +- Elasticsearch: store the join result of the `orders` and `products` table. +- Kibana: mainly used to visualize the data in Elasticsearch. -本机添加 host 映射 `pd` 和 `tikv` 映射 `127.0.0.1`。 -在 docker-compose.yml 所在目录下运行如下命令以启动所有容器: +Add `pd` and `tikv` mapping to `127.0.0.1` in `host` file. +To start all containers, run the following command in the directory that contains the docker-compose.yml file: ```shell docker-compose up -d mysql -h 127.0.0.1 -P 4000 -u root # Just test tidb cluster is ready,if you have install mysql local. ``` -该命令会以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。 -你可以通过 docker ps 来观察上述的容器是否正常启动了。 也可以访问 http://localhost:5601/ 来查看 Kibana 是否运行正常。 +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. +Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally. 
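Before moving on, it can be worth a quick smoke test that the TiDB SQL layer is actually serving queries through the `mysql` client command mentioned above; the statements below are plain TiDB SQL and only confirm connectivity:

```sql
-- Run in the mysql client connected to 127.0.0.1:4000
SELECT tidb_version();
SHOW DATABASES;
```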
-另外可以通过如下命令停止并删除所有的容器: +Don’t forget to run the following command to stop and remove all containers after you finished the tutorial: ```shell docker-compose down ```` -**下载以下 jar 包到 `/lib/`:** +**Download following JAR package to `/lib`:** -```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译``` +**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.** - [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) -- [flink-sql-connector-tidb-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-tidb-cdc/2.5-SNAPSHOT/flink-sql-connector-tidb-cdc-2.5-SNAPSHOT.jar) +- [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) -**在 TiDB 数据库中准备数据** +**Preparing data in TiDB database** -创建数据库和表 `products`,`orders`,并插入数据: +Create databases/tables and populate data ```sql -- TiDB @@ -178,7 +185,7 @@ VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false), (default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false), (default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false); ``` -**然后启动 Flink 集群,再启动 SQL CLI:** +**Launch a Flink cluster and start a Flink SQL CLI:** ```sql -- Flink SQL @@ -234,13 +241,13 @@ Flink SQL> INSERT INTO enriched_orders LEFT JOIN products AS p ON o.product_id = p.id; ``` -**检查 ElasticSearch 中的结果** +**Check result in Elasticsearch** -检查最终的结果是否写入 ElasticSearch 中,可以在 [Kibana](http://localhost:5601/) 看到 ElasticSearch 中的数据。 +Check the data has been written to Elasticsearch successfully, you can visit [Kibana](http://localhost:5601/) to see the data. -**在 TiDB 制造一些变更,观察 ElasticSearch 中的结果** +**Make changes in TiDB and watch result in Elasticsearch** -通过如下的 SQL 语句对 TiDB 数据库进行一些修改,然后就可以看到每执行一条 SQL 语句,Elasticsearch 中的数据都会实时更新。 +Do some changes in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time. 
```sql INSERT INTO orders @@ -251,3 +258,4 @@ UPDATE orders SET order_status = true WHERE order_id = 10004; DELETE FROM orders WHERE order_id = 10004; ``` +{{< top >}} diff --git a/docs/content/docs/try-flink-cdc/pipeline-connectors/_index.md b/docs/content/docs/try-flink-cdc/pipeline-connectors/_index.md new file mode 100644 index 0000000000..7297646b3e --- /dev/null +++ b/docs/content/docs/try-flink-cdc/pipeline-connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: Pipeline Connectors +bookCollapseSection: true +weight: 1 +aliases: + - /try-flink-cdc/pipeline-connectors/ +--- + diff --git a/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md b/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md new file mode 100644 index 0000000000..a9913adc04 --- /dev/null +++ b/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.md @@ -0,0 +1,346 @@ +--- +title: "MySQL to Doris" +weight: 1 +type: docs +aliases: +- /try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.html +--- + + +# Streaming ELT from MySQL to Doris using Flink CDC 3.0 + +This tutorial is to show how to quickly build a Streaming ELT job from MySQL to Doris using Flink CDC 3.0,including the feature of sync all table of one database, schema change evolution and sync sharding tables into one table. +All exercises in this tutorial are performed in the Flink CDC CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation. + +## Preparation +Prepare a Linux or MacOS computer with Docker installed. + +### Prepare Flink Standalone cluster +1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) ,unzip and get flink-1.18.0 directory. + Use the following command to navigate to the Flink directory and set FLINK_HOME to the directory where flink-1.18.0 is located. + + ```shell + cd flink-1.18.0 + ``` + +2. Enable checkpointing by appending the following parameters to the conf/flink-conf.yaml configuration file to perform a checkpoint every 3 seconds. + + ```yaml + execution.checkpointing.interval: 3000 + ``` + +3. Start the Flink cluster using the following command. + + ```shell + ./bin/start-cluster.sh + ``` + +If successfully started, you can access the Flink Web UI at [http://localhost:8081/](http://localhost:8081/), as shown below. + +{{< img src="/fig/mysql-doris-tutorial/flink-ui.png" alt="Flink UI" >}} + +Executing `start-cluster.sh` multiple times can start multiple `TaskManager`s. + +### Prepare docker compose +The following tutorial will prepare the required components using `docker-compose`. + +1. Host Machine Configuration +Since `Doris` requires memory mapping support for operation, execute the following command on the host machine: + + ```shell + sysctl -w vm.max_map_count=2000000 + ``` +Due to the different ways of implementing containers internally on MacOS, it may not be possible to directly modify the value of max_map_count on the host during deployment. You need to create the following containers first: + + ```shell + docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh + ``` + +The container was created successfully executing the following command: + ```shell + sysctl -w vm.max_map_count=2000000 + ``` + +Then `exit` exits and creates the Doris Docker cluster. + +2. 
Start docker compose + Create a `docker-compose.yml` file using the content provided below: + + ```yaml + version: '2.1' + services: + doris: + image: yagagagaga/doris-standalone + ports: + - "8030:8030" + - "8040:8040" + - "9030:9030" + mysql: + image: debezium/example-mysql:1.1 + ports: + - "3306:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_USER=mysqluser + - MYSQL_PASSWORD=mysqlpw + ``` + +The Docker Compose should include the following services (containers): +- MySQL: include a database named `app_db` +- Doris: to store tables from MySQL + +To start all containers, run the following command in the directory that contains the `docker-compose.yml` file. + + ```shell + docker-compose up -d + ``` + +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly. You can also visit [http://localhost:8030/](http://localhost:8030/) to check whether Doris is running. +#### Prepare records for MySQL +1. Enter MySQL container + + ```shell + docker-compose exec mysql mysql -uroot -p123456 + ``` + +2. create `app_db` database and `orders`,`products`,`shipments` tables, then insert records + + ```sql + -- create database + CREATE DATABASE app_db; + + USE app_db; + + -- create orders table + CREATE TABLE `orders` ( + `id` INT NOT NULL, + `price` DECIMAL(10,2) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `orders` (`id`, `price`) VALUES (1, 4.00); + INSERT INTO `orders` (`id`, `price`) VALUES (2, 100.00); + + -- create shipments table + CREATE TABLE `shipments` ( + `id` INT NOT NULL, + `city` VARCHAR(255) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `shipments` (`id`, `city`) VALUES (1, 'beijing'); + INSERT INTO `shipments` (`id`, `city`) VALUES (2, 'xian'); + + -- create products table + CREATE TABLE `products` ( + `id` INT NOT NULL, + `product` VARCHAR(255) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `products` (`id`, `product`) VALUES (1, 'Beer'); + INSERT INTO `products` (`id`, `product`) VALUES (2, 'Cap'); + INSERT INTO `products` (`id`, `product`) VALUES (3, 'Peanut'); + ``` + +#### Create database in Doris +`Doris` connector currently does not support automatic database creation and needs to first create a database corresponding to the write table. +1. Enter Doris Web UI。 + [http://localhost:8030/](http://localhost:8030/) + The default username is `root`, and the default password is empty. + + {{< img src="/fig/mysql-doris-tutorial/doris-ui.png" alt="Doris UI" >}} + +2. Create `app_db` database through Web UI. + + ```sql + create database app_db; + ``` + + {{< img src="/fig/mysql-doris-tutorial/doris-create-table.png" alt="Doris create table" >}} + +## Submit job using FlinkCDC cli +1. Download the binary compressed packages listed below and extract them to the directory ` flink cdc-3.0.0 '`: + [flink-cdc-3.0.0-bin.tar.gz](https://github.org/apache/flink/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz) + flink-cdc-3.0.0 directory will contain four directory `bin`,`lib`,`log`,`conf`. + +2. 
Download the connector package listed below and move it to the `lib` directory + **Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.** + - [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar) + - [Apache Doris pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-doris/3.0.0/flink-cdc-pipeline-connector-doris-3.0.0.jar) + +3. Write task configuration yaml file + Here is an example file for synchronizing the entire database `mysql-to-doris.yaml`: + + ```yaml + ################################################################################ + # Description: Sync MySQL all tables to Doris + ################################################################################ + source: + type: mysql + hostname: localhost + port: 3306 + username: root + password: 123456 + tables: app_db.\.* + server-id: 5400-5404 + server-time-zone: UTC + + sink: + type: doris + fenodes: 127.0.0.1:8030 + username: root + password: "" + table.create.properties.light_schema_change: true + table.create.properties.replication_num: 1 + + pipeline: + name: Sync MySQL Database to Doris + parallelism: 2 + + ``` + +Notice that: +`tables: app_db.\.*` in source synchronize all tables in `app_db` through Regular Matching. +`table.create.properties.replication_num` in sink is because there is only one Doris BE node in the Docker image. + +4. Finally, submit job to Flink Standalone cluster using Cli. + ```shell + bash bin/flink-cdc.sh mysql-to-doris.yaml + ``` +After successful submission, the return information is as follows: + ```shell + Pipeline has been submitted to cluster. + Job ID: ae30f4580f1918bebf16752d4963dc54 + Job Description: Sync MySQL Database to Doris + ``` + We can find a job named `Sync MySQL Database to Doris` is running through Flink Web UI. + +{{< img src="/fig/mysql-doris-tutorial/mysql-to-doris.png" alt="MySQL-to-Doris" >}} + +We can find that tables are created and inserted through Doris Web UI. + +{{< img src="/fig/mysql-doris-tutorial/doris-display-data.png" alt="Doris display data" >}} + +### Synchronize Schema and Data changes +Enter MySQL container + + ```shell + docker-compose exec mysql mysql -uroot -p123456 + ``` + +Then, modify schema and record in MySQL, and the tables of Doris will change the same in real time: +1. insert one record in `orders` from MySQL: + + ```sql + INSERT INTO app_db.orders (id, price) VALUES (3, 100.00); + ``` + +2. add one column in `orders` from MySQL: + + ```sql + ALTER TABLE app_db.orders ADD amount varchar(100) NULL; + ``` + +3. update one record in `orders` from MySQL: + + ```sql + UPDATE app_db.orders SET price=100.00, amount=100.00 WHERE id=1; + ``` +4. delete one record in `orders` from MySQL: + + ```sql + DELETE FROM app_db.orders WHERE id=2; + ``` + +Refresh the Doris Web UI every time you execute a step, and you can see that the `orders` table displayed in Doris will be updated in real-time, like the following: + +{{< img src="/fig/mysql-doris-tutorial/doris-display-result.png" alt="Doris display result" >}} + +Similarly, by modifying the 'shipments' and' products' tables, you can also see the results of synchronized changes in real-time in Doris. + +### Route the changes +Flink CDC provides the configuration to route the table structure/data of the source table to other table names. 
+With this ability, we can achieve functions such as table name, database name replacement, and whole database synchronization. +Here is an example file for using `route` feature: + ```yaml + ################################################################################ + # Description: Sync MySQL all tables to Doris + ################################################################################ + source: + type: mysql + hostname: localhost + port: 3306 + username: root + password: 123456 + tables: app_db.\.* + server-id: 5400-5404 + server-time-zone: UTC + + sink: + type: doris + fenodes: 127.0.0.1:8030 + benodes: 127.0.0.1:8040 + username: root + password: "" + table.create.properties.light_schema_change: true + table.create.properties.replication_num: 1 + + route: + - source-table: app_db.orders + sink-table: ods_db.ods_orders + - source-table: app_db.shipments + sink-table: ods_db.ods_shipments + - source-table: app_db.products + sink-table: ods_db.ods_products + + pipeline: + name: Sync MySQL Database to Doris + parallelism: 2 + ``` + +Using the upper `route` configuration, we can synchronize the table schema and data of `app_db.orders` to `ods_db.ods_orders`, thus achieving the function of database migration. +Specifically, `source-table` support regular expression matching with multiple tables to synchronize sharding databases and tables. like the following: + + ```yaml + route: + - source-table: app_db.order\.* + sink-table: ods_db.ods_orders + ``` + +In this way, we can synchronize sharding tables like `app_db.order01`、`app_db.order02`、`app_db.order03` into one ods_db.ods_orders tables. +Warning that there is currently no support for scenarios where the same primary key data exists in multiple tables, which will be supported in future versions. + +## Clean up +After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`: + + ```shell + docker-compose down + ``` +Run the following command to stop the Flink cluster in the directory of Flink `flink-1.18.0`: + + ```shell + ./bin/stop-cluster.sh + ``` + +{{< top >}} diff --git a/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-starrocks-pipeline-tutorial.md b/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-starrocks-pipeline-tutorial.md new file mode 100644 index 0000000000..54febb8260 --- /dev/null +++ b/docs/content/docs/try-flink-cdc/pipeline-connectors/mysql-starrocks-pipeline-tutorial.md @@ -0,0 +1,314 @@ +--- +title: "MySQL to StarRocks" +weight: 2 +type: docs +aliases: +- /try-flink-cdc/pipeline-connectors/mysql-starrocks-pipeline-tutorial.html +--- + + +# Streaming ELT from MySQL to StarRocks using Flink CDC 3.0 + +This tutorial is to show how to quickly build a Streaming ELT job from MySQL to StarRocks using Flink CDC 3.0,including the feature of sync all table of one database, schema change evolution and sync sharding tables into one table. +All exercises in this tutorial are performed in the Flink CDC CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation. + +## Preparation +Prepare a Linux or MacOS computer with Docker installed. + +### Prepare Flink Standalone cluster +1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) ,unzip and get flink-1.18.0 directory. + Use the following command to navigate to the Flink directory and set FLINK_HOME to the directory where flink-1.18.0 is located. 
+ + ```shell + cd flink-1.18.0 + ``` + +2. Enable checkpointing by appending the following parameters to the conf/flink-conf.yaml configuration file to perform a checkpoint every 3 seconds. + + ```yaml + execution.checkpointing.interval: 3000 + ``` + +3. Start the Flink cluster using the following command. + + ```shell + ./bin/start-cluster.sh + ``` + +If successfully started, you can access the Flink Web UI at [http://localhost:8081/](http://localhost:8081/), as shown below. + +{{< img src="/fig/mysql-starrocks-tutorial/flink-ui.png" alt="Flink UI" >}} + +Executing `start-cluster.sh` multiple times can start multiple `TaskManager`s. + +### Prepare docker compose +The following tutorial will prepare the required components using `docker-compose`. +Create a `docker-compose.yml` file using the content provided below: + + ```yaml + version: '2.1' + services: + StarRocks: + image: registry.starrocks.io/starrocks/allin1-ubuntu + ports: + - "8030:8030" + - "8040:8040" + - "9030:9030" + MySQL: + image: debezium/example-mysql:1.1 + ports: + - "3306:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_USER=mysqluser + - MYSQL_PASSWORD=mysqlpw + ``` + +The Docker Compose should include the following services (containers): +- MySQL: include a database named `app_db` +- StarRocks: to store tables from MySQL + +To start all containers, run the following command in the directory that contains the `docker-compose.yml` file. + + ```shell + docker-compose up -d + ``` + +This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly. You can also visit [http://localhost:8030/](http://localhost:8030/) to check whether StarRocks is running. +#### Prepare records for MySQL +1. Enter MySQL container + + ```shell + docker-compose exec mysql mysql -uroot -p123456 + ``` + +2. create `app_db` database and `orders`,`products`,`shipments` tables, then insert records + + ```sql + -- create database + CREATE DATABASE app_db; + + USE app_db; + + -- create orders table + CREATE TABLE `orders` ( + `id` INT NOT NULL, + `price` DECIMAL(10,2) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `orders` (`id`, `price`) VALUES (1, 4.00); + INSERT INTO `orders` (`id`, `price`) VALUES (2, 100.00); + + -- create shipments table + CREATE TABLE `shipments` ( + `id` INT NOT NULL, + `city` VARCHAR(255) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `shipments` (`id`, `city`) VALUES (1, 'beijing'); + INSERT INTO `shipments` (`id`, `city`) VALUES (2, 'xian'); + + -- create products table + CREATE TABLE `products` ( + `id` INT NOT NULL, + `product` VARCHAR(255) NOT NULL, + PRIMARY KEY (`id`) + ); + + -- insert records + INSERT INTO `products` (`id`, `product`) VALUES (1, 'Beer'); + INSERT INTO `products` (`id`, `product`) VALUES (2, 'Cap'); + INSERT INTO `products` (`id`, `product`) VALUES (3, 'Peanut'); + ``` + +## Submit job using FlinkCDC cli +1. Download the binary compressed packages listed below and extract them to the directory ` flink cdc-3.0.0 '`: + [flink-cdc-3.0.0-bin.tar.gz](https://github.org/apache/flink/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz) + flink-cdc-3.0.0 directory will contain four directory `bin`,`lib`,`log`,`conf`. + +2. 
+
+## Submit a job using the Flink CDC CLI
+1. Download the binary compressed package listed below and extract it to the directory `flink-cdc-3.0.0`:
+   [flink-cdc-3.0.0-bin.tar.gz](https://github.com/ververica/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz)
+   The extracted `flink-cdc-3.0.0` directory contains four sub-directories: `bin`, `lib`, `log`, and `conf`.
+
+2. Download the connector packages listed below and move them to the `lib` directory.
+   **Download links are only available for stable releases; SNAPSHOT dependencies need to be built from the master or release branches yourself.**
+   - [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar)
+   - [StarRocks pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-starrocks/3.0.0/flink-cdc-pipeline-connector-starrocks-3.0.0.jar)
+
+3. Write a task configuration YAML file.
+   Here is an example file, `mysql-to-starrocks.yaml`, for synchronizing the entire database:
+
+   ```yaml
+   ################################################################################
+   # Description: Sync MySQL all tables to StarRocks
+   ################################################################################
+   source:
+     type: mysql
+     hostname: localhost
+     port: 3306
+     username: root
+     password: 123456
+     tables: app_db.\.*
+     server-id: 5400-5404
+     server-time-zone: UTC
+
+   sink:
+     type: starrocks
+     name: StarRocks Sink
+     jdbc-url: jdbc:mysql://127.0.0.1:9030
+     load-url: 127.0.0.1:8030
+     username: root
+     password: ""
+     table.create.properties.replication_num: 1
+
+   pipeline:
+     name: Sync MySQL Database to StarRocks
+     parallelism: 2
+   ```
+
+Notice that:
+* `tables: app_db.\.*` in the source synchronizes all tables in `app_db` through regular expression matching.
+* `table.create.properties.replication_num` is set in the sink because there is only one StarRocks BE node in the Docker image.
+
+4. Finally, submit the job to the Flink Standalone cluster using the CLI.
+
+   ```shell
+   bash bin/flink-cdc.sh mysql-to-starrocks.yaml
+   ```
+
+After a successful submission, the returned information is as follows:
+
+   ```shell
+   Pipeline has been submitted to cluster.
+   Job ID: 02a31c92f0e7bc9a1f4c0051980088a0
+   Job Description: Sync MySQL Database to StarRocks
+   ```
+
+In the Flink Web UI, you can see a job named `Sync MySQL Database to StarRocks` running.
+
+{{< img src="/fig/mysql-starrocks-tutorial/mysql-to-starrocks.png" alt="MySQL-to-StarRocks" >}}
+
+Connect to `jdbc:mysql://127.0.0.1:9030` with a database connection tool such as DBeaver. You can then view the data written to the three tables in StarRocks.
+
+{{< img src="/fig/mysql-starrocks-tutorial/starrocks-display-data.png" alt="StarRocks-display-data" >}}
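+
+If you prefer the command line over DBeaver, note that StarRocks speaks the MySQL protocol on port 9030 (the same endpoint as the `jdbc-url` above), so any MySQL-compatible client can inspect the result. The snippet below is a hedged example that assumes a `mysql` client is installed on your host; the empty root password matches the sink configuration.
+
+   ```shell
+   # list the synchronized databases and read back the orders table from StarRocks
+   mysql -h 127.0.0.1 -P 9030 -uroot -e "SHOW DATABASES; SELECT * FROM app_db.orders;"
+   ```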
+
+### Synchronize Schema and Data changes
+Enter the MySQL container
+
+   ```shell
+   docker-compose exec mysql mysql -uroot -p123456
+   ```
+
+Then, modify the schema and records in MySQL, and the corresponding tables in StarRocks will change in real time:
+1. Insert one record into `orders` in MySQL:
+
+   ```sql
+   INSERT INTO app_db.orders (id, price) VALUES (3, 100.00);
+   ```
+
+2. Add one column to `orders` in MySQL:
+
+   ```sql
+   ALTER TABLE app_db.orders ADD amount varchar(100) NULL;
+   ```
+
+3. Update one record in `orders` in MySQL:
+
+   ```sql
+   UPDATE app_db.orders SET price=100.00, amount=100.00 WHERE id=1;
+   ```
+
+4. Delete one record from `orders` in MySQL:
+
+   ```sql
+   DELETE FROM app_db.orders WHERE id=2;
+   ```
+
+Refresh DBeaver after each step, and you will see that the `orders` table displayed in StarRocks is updated in real time, as follows:
+
+{{< img src="/fig/mysql-starrocks-tutorial/starrocks-display-result.png" alt="StarRocks-display-result" >}}
+
+Similarly, by modifying the `shipments` and `products` tables, you can see the results of the synchronized changes in StarRocks in real time.
+
+### Route the changes
+Flink CDC provides a configuration to route the table structure/data of a source table to other table names.
+With this ability, we can achieve functions such as table name or database name replacement and whole database synchronization.
+Here is an example configuration file that uses the `route` feature:
+   ```yaml
+   ################################################################################
+   # Description: Sync MySQL all tables to StarRocks
+   ################################################################################
+   source:
+     type: mysql
+     hostname: localhost
+     port: 3306
+     username: root
+     password: 123456
+     tables: app_db.\.*
+     server-id: 5400-5404
+     server-time-zone: UTC
+
+   sink:
+     type: starrocks
+     jdbc-url: jdbc:mysql://127.0.0.1:9030
+     load-url: 127.0.0.1:8030
+     username: root
+     password: ""
+     table.create.properties.replication_num: 1
+
+   route:
+     - source-table: app_db.orders
+       sink-table: ods_db.ods_orders
+     - source-table: app_db.shipments
+       sink-table: ods_db.ods_shipments
+     - source-table: app_db.products
+       sink-table: ods_db.ods_products
+
+   pipeline:
+     name: Sync MySQL Database to StarRocks
+     parallelism: 2
+   ```
+
+With the `route` configuration above, the table schema and data of `app_db.orders` are synchronized to `ods_db.ods_orders`, which makes database migration possible.
+In particular, `source-table` supports regular expressions that match multiple tables, so sharded databases and tables can be synchronized as well, like the following:
+
+   ```yaml
+   route:
+     - source-table: app_db.order\.*
+       sink-table: ods_db.ods_orders
+   ```
+
+In this way, sharded tables such as `app_db.order01`, `app_db.order02` and `app_db.order03` can be merged into a single `ods_db.ods_orders` table.
+Note that scenarios where the same primary key exists in multiple source tables are not supported yet; this will be added in a future version.
+
+## Clean up
+After finishing the tutorial, run the following command in the directory of `docker-compose.yml` to stop all containers:
+
+   ```shell
+   docker-compose down
+   ```
+
+Run the following command in the Flink directory `flink-1.18.0` to stop the Flink cluster:
+
+   ```shell
+   ./bin/stop-cluster.sh
+   ```
+
+{{< top >}}
diff --git a/docs/content/formats/changelog-json.md b/docs/content/formats/changelog-json.md
deleted file mode 100644
index 5c6ab4bc3b..0000000000
--- a/docs/content/formats/changelog-json.md
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-# Changelog JSON Format
-
-**WARNING:** The CDC format `changelog-json` is deprecated since Flink CDC version 2.2.
-The CDC format `changelog-json` was introduced at the point that Flink didn't offer any CDC format. Currently, Flink offers several well-maintained CDC formats i.e.[Debezium CDC, MAXWELL CDC, CANAL CDC](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/connectors/table/formats/overview/), we recommend user to use above CDC formats.
- -### Compatibility Note - -User can still obtain and use the deprecated `changelog-json` format from older Flink CDC version e.g. [flink-format-changelog-json-2.1.1.jar](https://repo1.maven.org/maven2/com/ververica/flink-format-changelog-json/2.1.1/flink-format-changelog-json-2.1.1-SNAPSHOT.jar). diff --git a/docs/content/pipelines/doris-pipeline(ZH).md b/docs/content/pipelines/doris-pipeline(ZH).md deleted file mode 100644 index 81ffd40bec..0000000000 --- a/docs/content/pipelines/doris-pipeline(ZH).md +++ /dev/null @@ -1,286 +0,0 @@ - - -# Doris Pipeline 连接器 - -本文介绍了Pipeline Doris Connector的使用 - - -## 示例 ----------------- - -```yaml -source: - type: values - name: Values Source - -sink: - type: doris - name: Doris Sink - fenodes: 127.0.0.1:8030 - username: root - password: "" - table.create.properties.replication_num: 1 - -pipeline: - parallelism: 1 - -``` - - -## Pipeline选项 ----------------- - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    OptionRequiredDefaultTypeDescription
    typerequired(none)String指定要使用的Sink, 这里是 'doris'.
    nameoptional(none)String PipeLine的名称
    fenodesrequired(none)StringDoris集群FE的Http地址, 比如 127.0.0.1:8030
    benodesoptional(none)StringDoris集群BE的Http地址, 比如 127.0.0.1:8040
    jdbc-urloptional(none)StringDoris集群的JDBC地址,比如:jdbc:mysql://127.0.0.1:9030/db
    usernamerequired(none)StringDoris集群的用户名
    passwordoptional(none)StringDoris集群的密码
    auto-redirectoptionalfalseString 是否通过FE重定向写入,直连BE写入
    sink.enable.batch-modeoptionaltrueBoolean 是否使用攒批方式写入Doris
    sink.flush.queue-sizeoptional2Integer 攒批写入的队列大小 -
    sink.buffer-flush.max-rowsoptional50000Integer单个批次最大Flush的记录数
    sink.buffer-flush.max-bytesoptional10485760(10MB)Integer单个批次最大Flush的字节数
    sink.buffer-flush.intervaloptional10sStringFlush的间隔时长,超过这个时间,将异步Flush数据
    sink.properties.optional(none)StringStreamLoad的参数。 - For example: sink.properties.strict_mode: true. - 查看更多关于 StreamLoad的Properties 属性
    table.create.properties.*optional(none)String创建表的Properties配置。 - For example: table.create.properties.replication_num: 1. - 查看更多关于 Doris Table 的 Properties 属性
    -
    -## 数据类型映射 - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    CDC typeDoris typeNOTE
    TINYINTTINYINT
    SMALLINTSMALLINT
    INTINT
    BIGINTBIGINT
    DECIMALDECIMAL
    FLOATFLOAT
    DOUBLEDOUBLE
    BOOLEANBOOLEAN
    DATEDATE
    TIMESTAMP [(p)]DATETIME [(p)]
    TIMESTAMP_LTZ [(p)] - DATETIME [(p)] -
    CHAR(n)CHAR(n*3)在Doris中,字符串是以UTF-8编码存储的,所以英文字符占1个字节,中文字符占3个字节。这里的长度统一乘3,CHAR最大的长度是255,超过后会自动转为VARCHAR类型
    VARCHAR(n)VARCHAR(n*3)同上,这里的长度统一乘3,VARCHAR最大的长度是65533,超过后会自动转为STRING类型
    - BINARY(n) - STRING
    - VARBINARY(N) - STRING
    STRINGSTRING
    -
    - - - - - -## 常见问题 --------- -* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ) -* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH)) diff --git a/docs/content/versions.md b/docs/content/versions.md new file mode 100644 index 0000000000..034262ed4b --- /dev/null +++ b/docs/content/versions.md @@ -0,0 +1,29 @@ +--- +title: Versions +type: docs +bookToc: false +--- + + +# Versions + +An appendix of hosted documentation for all versions of Apache Flink CDC. + +{{< all_versions >}} diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/index.md" "b/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/index.md" deleted file mode 100644 index 680d4f125c..0000000000 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/index.md" +++ /dev/null @@ -1,36 +0,0 @@ - - -# 快速上手 - -```{toctree} -:maxdepth: 2 - -mysql-doris-pipeline-tutorial-zh -mysql-starrocks-pipeline-tutorial-zh -mysql-postgres-tutorial-zh -mongodb-tutorial-zh -oceanbase-tutorial-zh -oracle-tutorial-zh -polardbx-tutorial-zh -sqlserver-tutorial-zh -tidb-tutorial-zh -build-real-time-data-lake-tutorial-zh -datastream-api-package-guidance-zh -`` \ No newline at end of file diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-doris-pipeline-tutorial-zh.md" "b/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-doris-pipeline-tutorial-zh.md" deleted file mode 100644 index c7e62cb451..0000000000 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-doris-pipeline-tutorial-zh.md" +++ /dev/null @@ -1,341 +0,0 @@ - - -# 基于 Flink CDC 3.0 构建 MySQL 到 Doris 的 Streaming ELT - -这篇教程将展示如何基于 Flink CDC 快速构建 MySQL 到 Doris 的 Streaming ELT 作业,包含整库同步、表结构变更同步和分库分表同步的功能。 -本教程的演示都将在 Flink CDC CLI 中进行,无需一行 Java/Scala 代码,也无需安装 IDE。 - -## 准备阶段 -准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。 - -### 准备 Flink Standalone 集群 -1. 下载 [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) ,解压后得到 flink-1.18.0 目录。 -使用下面的命令跳转至 Flink 目录下,并且设置 FLINK_HOME 为 flink-1.18.0 所在目录。 - - ```shell - cd flink-1.18.0 - ``` - -2. 通过在 conf/flink-conf.yaml 配置文件追加下列参数开启 checkpoint,每隔 3 秒做一次 checkpoint。 - - ```yaml - execution.checkpointing.interval: 3000 - ``` - -3. 使用下面的命令启动 Flink 集群。 - - ```shell - ./bin/start-cluster.sh - ``` - -启动成功的话,可以在 [http://localhost:8081/](http://localhost:8081/) 访问到 Flink Web UI,如下所示: - -![Flink UI](/_static/fig/mysql-doris-tutorial/flink-ui.png "Flink UI") - -多次执行 start-cluster.sh 可以拉起多个 TaskManager。 - -### 准备 Docker 环境 -接下来的教程将以 `docker-compose` 的方式准备所需要的组件。 - -1. 宿主机配置 -由于 Doris 的运行需要内存映射支持,需在宿主机执行如下命令 - - ```shell - sysctl -w vm.max_map_count=2000000 - ``` - -MacOS 由于内部实现容器的方式不同,在部署时宿主机直接修改`max_map_count`值可能无法成功,需要先创建以下容器: - - ```shell - docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh - ``` - -容器创建成功执行以下命令: - - ```shell - sysctl -w vm.max_map_count=2000000 - ``` - -然后 `exit` 退出,创建 Doris Docker 集群。 - -2. 
docker 镜像启动 -使用下面的内容创建一个 `docker-compose.yml` 文件: - - ```yaml - version: '2.1' - services: - doris: - image: yagagagaga/doris-standalone - ports: - - "8030:8030" - - "8040:8040" - - "9030:9030" - mysql: - image: debezium/example-mysql:1.1 - ports: - - "3306:3306" - environment: - - MYSQL_ROOT_PASSWORD=123456 - - MYSQL_USER=mysqluser - - MYSQL_PASSWORD=mysqlpw - ``` - -该 Docker Compose 中包含的容器有: -- MySQL: 包含商品信息的数据库 `app_db` 和 用户信息的数据库 `my_db` -- Doris: 存储从 MySQL 中根据规则映射过来的结果表 - -在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: - - ```shell - docker-compose up -d - ``` - -该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 docker ps 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:8030/](http://localhost:8030/) 来查看 Doris 是否运行正常。 -#### 在 MySQL 数据库中准备数据 -1. 进入 MySQL 容器 - - ```shell - docker-compose exec mysql mysql -uroot -p123456 - ``` - -2. 创建数据库 `app_db` 和表 `orders`,`products`,`shipments`,并插入数据 - - ```sql - -- 创建数据库 - CREATE DATABASE app_db; - - USE app_db; - - -- 创建 orders 表 - CREATE TABLE `orders` ( - `id` INT NOT NULL, - `price` DECIMAL(10,2) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `orders` (`id`, `price`) VALUES (1, 4.00); - INSERT INTO `orders` (`id`, `price`) VALUES (2, 100.00); - - -- 创建 shipments 表 - CREATE TABLE `shipments` ( - `id` INT NOT NULL, - `city` VARCHAR(255) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `shipments` (`id`, `city`) VALUES (1, 'beijing'); - INSERT INTO `shipments` (`id`, `city`) VALUES (2, 'xian'); - - -- 创建 products 表 - CREATE TABLE `products` ( - `id` INT NOT NULL, - `product` VARCHAR(255) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `products` (`id`, `product`) VALUES (1, 'Beer'); - INSERT INTO `products` (`id`, `product`) VALUES (2, 'Cap'); - INSERT INTO `products` (`id`, `product`) VALUES (3, 'Peanut'); - ``` - -#### 在 Doris 数据库中创建数据库 -Doris 暂时不支持自动创建数据库,需要先创建写入表对应的数据库。 -1. 进入 Doris Web UI。 -[http://localhost:8030/](http://localhost:8030/) -默认的用户名为 root,默认密码为空。 - - ![Doris UI](/_static/fig/mysql-doris-tutorial/doris-ui.png "Doris UI") - -2. 通过 Web UI 创建 `app_db` 数据库 - - ```sql - create database app_db; - ``` - - ![Doris create_table](/_static/fig/mysql-doris-tutorial/doris-create-table.png "Doris create table") - -## 通过 FlinkCDC cli 提交任务 -1. 下载下面列出的二进制压缩包,并解压得到目录 `flink-cdc-3.0.0`: - [flink-cdc-3.0.0-bin.tar.gz](https://github.com/ververica/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz) -flink-cdc-3.0.0 下会包含 bin、lib、log、conf 四个目录。 - -2. 下载下面列出的 connector 包,并且移动到 lib 目录下 - **下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译** - - [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar) - - [Apache Doris pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-doris/3.0.0/flink-cdc-pipeline-connector-doris-3.0.0.jar) - -3. 
编写任务配置 yaml 文件 -下面给出了一个整库同步的示例文件 mysql-to-doris.yaml: - - ```yaml - ################################################################################ - # Description: Sync MySQL all tables to Doris - ################################################################################ - source: - type: mysql - hostname: localhost - port: 3306 - username: root - password: 123456 - tables: app_db.\.* - server-id: 5400-5404 - server-time-zone: UTC - - sink: - type: doris - fenodes: 127.0.0.1:8030 - username: root - password: "" - table.create.properties.light_schema_change: true - table.create.properties.replication_num: 1 - - pipeline: - name: Sync MySQL Database to Doris - parallelism: 2 - - ``` - -其中: -source 中的 `tables: app_db.\.*` 通过正则匹配同步 `app_db` 下的所有表。 -sink 添加 `table.create.properties.replication_num` 参数是由于 Docker 镜像中只有一个 Doris BE 节点。 - -4. 最后,通过命令行提交任务到 Flink Standalone cluster - ```shell - bash bin/flink-cdc.sh mysql-to-doris.yaml - ``` -提交成功后,返回信息如: - ```shell - Pipeline has been submitted to cluster. - Job ID: ae30f4580f1918bebf16752d4963dc54 - Job Description: Sync MySQL Database to Doris - ``` -在 Flink Web UI,可以看到一个名为 `Sync MySQL Database to Doris` 的任务正在运行。 - -![MySQL-to-Doris](/_static/fig/mysql-doris-tutorial/mysql-to-doris.png "MySQL-to-Doris") - -打开 Doris 的 Web UI,可以看到数据表已经被创建出来,数据能成功写入。 - -![Doris_display_data](/_static/fig/mysql-doris-tutorial/doris_display_data.png "Doris_display_data") - - -### 同步变更 -进入 MySQL 容器 - - ```shell - docker-compose exec mysql mysql -uroot -p123456 - ``` - -接下来,修改 MySQL 数据库中表的数据,Doris 中显示的订单数据也将实时更新: -1. 在 MySQL 的 `orders` 表中插入一条数据 - - ```sql - INSERT INTO app_db.orders (id, price) VALUES (3, 100.00); - ``` - -2. 在 MySQL 的 `orders` 表中增加一个字段 - - ```sql - ALTER TABLE app_db.orders ADD amount varchar(100) NULL; - ``` - -3. 在 MySQL 的 `orders` 表中更新一条数据 - - ```sql - UPDATE app_db.orders SET price=100.00, amount=100.00 WHERE id=1; - ``` -4. 
在 MySQL 的 `orders` 表中删除一条数据 - - ```sql - DELETE FROM app_db.orders WHERE id=2; - ``` - -每执行一步就刷新一次 Doris Web UI,可以看到 Doris 中显示的 orders 数据将实时更新,如下所示: - -![Doris_display_result](/_static/fig/mysql-doris-tutorial/doris_display_result.png "Doris_display_result") - -同样的,去修改 `shipments`, `products` 表,也能在 Doris 中实时看到同步变更的结果。 - -### 路由变更 -Flink CDC 提供了将源表的表结构/数据路由到其他表名的配置,借助这种能力,我们能够实现表名库名替换,整库同步等功能。 -下面提供一个配置文件说明: - ```yaml - ################################################################################ - # Description: Sync MySQL all tables to Doris - ################################################################################ - source: - type: mysql - hostname: localhost - port: 3306 - username: root - password: 123456 - tables: app_db.\.* - server-id: 5400-5404 - server-time-zone: UTC - - sink: - type: doris - fenodes: 127.0.0.1:8030 - benodes: 127.0.0.1:8040 - username: root - password: "" - table.create.properties.light_schema_change: true - table.create.properties.replication_num: 1 - - route: - - source-table: app_db.orders - sink-table: ods_db.ods_orders - - source-table: app_db.shipments - sink-table: ods_db.ods_shipments - - source-table: app_db.products - sink-table: ods_db.ods_products - - pipeline: - name: Sync MySQL Database to Doris - parallelism: 2 - ``` - -通过上面的 `route` 配置,会将 `app_db.orders` 表的结构和数据同步到 `ods_db.ods_orders`中。从而实现数据库迁移的功能。 -特别地,source-table 支持正则表达式匹配多表,从而实现分库分表同步的功能,例如下面的配置: - - ```yaml - route: - - source-table: app_db.order\.* - sink-table: ods_db.ods_orders - ``` - -这样,就可以将诸如 `app_db.order01`、`app_db.order02`、`app_db.order03` 的表汇总到 ods_db.ods_orders 中。注意,目前还不支持多表中存在相同主键数据的场景,将在后续版本支持。 - -## 环境清理 -本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: - - ```shell - docker-compose down - ``` - -在 Flink 所在目录 `flink-1.18.0` 下执行如下命令停止 Flink 集群: - - ```shell - ./bin/stop-cluster.sh - ``` - - diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-starrocks-pipeline-tutorial-zh.md" "b/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-starrocks-pipeline-tutorial-zh.md" deleted file mode 100644 index 43a94b3fd0..0000000000 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/mysql-starrocks-pipeline-tutorial-zh.md" +++ /dev/null @@ -1,306 +0,0 @@ - - -# 基于 Flink CDC 3.0 构建 MySQL 到 StarRocks 的 Streaming ELT - -这篇教程将展示如何基于 Flink CDC 快速构建 MySQL 到 StarRocks 的 Streaming ELT 作业,包含整库同步、表结构变更同步和分库分表同步的功能。 -本教程的演示都将在 Flink CDC CLI 中进行,无需一行 Java/Scala 代码,也无需安装 IDE。 - -## 准备阶段 -准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。 - -### 准备 Flink Standalone 集群 -1. 下载 [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) ,解压后得到 flink-1.18.0 目录。 - 使用下面的命令跳转至 Flink 目录下,并且设置 FLINK_HOME 为 flink-1.18.0 所在目录。 - - ```shell - cd flink-1.18.0 - ``` - -2. 通过在 conf/flink-conf.yaml 配置文件追加下列参数开启 checkpoint,每隔 3 秒做一次 checkpoint。 - - ```yaml - execution.checkpointing.interval: 3000 - ``` - -3. 
使用下面的命令启动 Flink 集群。 - - ```shell - ./bin/start-cluster.sh - ``` - -启动成功的话,可以在 [http://localhost:8081/](http://localhost:8081/) 访问到 Flink Web UI,如下所示: - -![Flink UI](/_static/fig/mysql-starrocks-tutorial/flink-ui.png "Flink UI") - -多次执行 start-cluster.sh 可以拉起多个 TaskManager。 - -### 准备 Docker 环境 -使用下面的内容创建一个 `docker-compose.yml` 文件: - - ```yaml - version: '2.1' - services: - StarRocks: - image: registry.starrocks.io/starrocks/allin1-ubuntu - ports: - - "8030:8030" - - "8040:8040" - - "9030:9030" - MySQL: - image: debezium/example-mysql:1.1 - ports: - - "3306:3306" - environment: - - MYSQL_ROOT_PASSWORD=123456 - - MYSQL_USER=mysqluser - - MYSQL_PASSWORD=mysqlpw - ``` - -该 Docker Compose 中包含的容器有: -- MySQL: 包含商品信息的数据库 `app_db` -- StarRocks: 存储从 MySQL 中根据规则映射过来的结果表 - -在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: - - ```shell - docker-compose up -d - ``` - -该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 docker ps 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:8030/](http://localhost:8030/) 来查看 StarRocks 是否运行正常。 -#### 在 MySQL 数据库中准备数据 -1. 进入 MySQL 容器 - - ```shell - docker-compose exec mysql mysql -uroot -p123456 - ``` - -2. 创建数据库 `app_db` 和表 `orders`,`products`,`shipments`,并插入数据 - - ```sql - -- 创建数据库 - CREATE DATABASE app_db; - - USE app_db; - - -- 创建 orders 表 - CREATE TABLE `orders` ( - `id` INT NOT NULL, - `price` DECIMAL(10,2) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `orders` (`id`, `price`) VALUES (1, 4.00); - INSERT INTO `orders` (`id`, `price`) VALUES (2, 100.00); - - -- 创建 shipments 表 - CREATE TABLE `shipments` ( - `id` INT NOT NULL, - `city` VARCHAR(255) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `shipments` (`id`, `city`) VALUES (1, 'beijing'); - INSERT INTO `shipments` (`id`, `city`) VALUES (2, 'xian'); - - -- 创建 products 表 - CREATE TABLE `products` ( - `id` INT NOT NULL, - `product` VARCHAR(255) NOT NULL, - PRIMARY KEY (`id`) - ); - - -- 插入数据 - INSERT INTO `products` (`id`, `product`) VALUES (1, 'Beer'); - INSERT INTO `products` (`id`, `product`) VALUES (2, 'Cap'); - INSERT INTO `products` (`id`, `product`) VALUES (3, 'Peanut'); - ``` - -## 通过 FlinkCDC cli 提交任务 -1. 下载下面列出的二进制压缩包,并解压得到目录 `flink-cdc-3.0.0`: - [flink-cdc-3.0.0-bin.tar.gz](https://github.com/ververica/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz) - flink-cdc-3.0.0 下会包含 bin、lib、log、conf 四个目录。 - -2. 下载下面列出的 connector 包,并且移动到 lib 目录下 - **下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译** - - [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar) - - [StarRocks pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-starrocks/3.0.0/flink-cdc-pipeline-connector-starrocks-3.0.0.jar) - -3. 
编写任务配置 yaml 文件 - 下面给出了一个整库同步的示例文件 mysql-to-starrocks.yaml: - - ```yaml - ################################################################################ - # Description: Sync MySQL all tables to StarRocks - ################################################################################ - source: - type: mysql - hostname: localhost - port: 3306 - username: root - password: 123456 - tables: app_db.\.* - server-id: 5400-5404 - server-time-zone: UTC - - sink: - type: starrocks - name: StarRocks Sink - jdbc-url: jdbc:mysql://127.0.0.1:9030 - load-url: 127.0.0.1:8030 - username: root - password: "" - table.create.properties.replication_num: 1 - - pipeline: - name: Sync MySQL Database to StarRocks - parallelism: 2 - - ``` - -其中: -* source 中的 `tables: app_db.\.*` 通过正则匹配同步 `app_db` 下的所有表。 -* sink 添加 `table.create.properties.replication_num` 参数是由于 Docker 镜像中只有一个 StarRocks BE 节点。 - -4. 最后,通过命令行提交任务到 Flink Standalone cluster - - ```shell - bash bin/flink-cdc.sh mysql-to-starrocks.yaml - ``` - -提交成功后,返回信息如: - - ```shell - Pipeline has been submitted to cluster. - Job ID: 02a31c92f0e7bc9a1f4c0051980088a0 - Job Description: Sync MySQL Database to StarRocks - ``` - -在 Flink Web UI,可以看到一个名为 `Sync MySQL Database to StarRocks` 的任务正在运行。 - -![MySQL-to-StarRocks](/_static/fig/mysql-starrocks-tutorial/mysql-to-starrocks.png "MySQL-to-StarRocks") - -通过数据库连接工具例如 Dbeaver 等连接到 jdbc:mysql://127.0.0.1:9030, 可以查看 StarRocks 中写入了三张表的数据。 - -![StarRocks-dispaly-data](/_static/fig/mysql-starrocks-tutorial/starrocks-display-data.png "StarRocks-dispaly-data") - -### 同步变更 -进入 MySQL 容器 - - ```shell - docker-compose exec mysql mysql -uroot -p123456 - ``` - -接下来,修改 MySQL 数据库中表的数据,StarRocks 中显示的订单数据也将实时更新: -1. 在 MySQL 的 `orders` 表中插入一条数据 - - ```sql - INSERT INTO app_db.orders (id, price) VALUES (3, 100.00); - ``` - -2. 在 MySQL 的 `orders` 表中增加一个字段 - - ```sql - ALTER TABLE app_db.orders ADD amount varchar(100) NULL; - ``` - -3. 在 MySQL 的 `orders` 表中更新一条数据 - - ```sql - UPDATE app_db.orders SET price=100.00, amount=100.00 WHERE id=1; - ``` - -4. 
在 MySQL 的 `orders` 表中删除一条数据 - - ```sql - DELETE FROM app_db.orders WHERE id=2; - ``` - -通过连接工具,我们可以看到 StarRocks 上也在实时发生着这些变更: - -![StarRocks-display-result](/_static/fig/mysql-starrocks-tutorial/starrocks-display-result.png "StarRocks-display-result") - -同样的,去修改 `shipments`, `products` 表,也能在 StarRocks 中实时看到同步变更的结果。 - -### 路由变更 -Flink CDC 提供了将源表的表结构/数据路由到其他表名的配置,借助这种能力,我们能够实现表名库名替换,整库同步等功能。 -下面提供一个配置文件说明: - ```yaml - ################################################################################ - # Description: Sync MySQL all tables to StarRocks - ################################################################################ - source: - type: mysql - hostname: localhost - port: 3306 - username: root - password: 123456 - tables: app_db.\.* - server-id: 5400-5404 - server-time-zone: UTC - - sink: - type: starrocks - name: StarRocks Sink - jdbc-url: jdbc:mysql://127.0.0.1:9030 - load-url: 127.0.0.1:8030 - username: root - password: "" - table.create.properties.replication_num: 1 - - route: - - source-table: app_db.orders - sink-table: ods_db.ods_orders - - source-table: app_db.shipments - sink-table: ods_db.ods_shipments - - source-table: app_db.products - sink-table: ods_db.ods_products - - pipeline: - name: Sync MySQL Database to StarRocks - parallelism: 2 - ``` - -通过上面的 `route` 配置,会将 `app_db.orders` 表的结构和数据同步到 `ods_db.ods_orders` 中。从而实现数据库迁移的功能。 -特别地,source-table 支持正则表达式匹配多表,从而实现分库分表同步的功能,例如下面的配置: - - ```yaml - route: - - source-table: app_db.order\.* - sink-table: ods_db.ods_orders - ``` - -这样,就可以将诸如 `app_db.order01`、`app_db.order02`、`app_db.order03` 的表汇总到 ods_db.ods_orders 中。注意,目前还不支持多表中存在相同主键数据的场景,将在后续版本支持。 - -## 环境清理 -本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: - - ```shell - docker-compose down - ``` - -在 Flink 所在目录 `flink-1.18.0` 下执行如下命令停止 Flink 集群: - - ```shell - ./bin/stop-cluster.sh - ``` - - diff --git "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/polardbx-tutorial-zh.md" "b/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/polardbx-tutorial-zh.md" deleted file mode 100644 index 92082c01d7..0000000000 --- "a/docs/content/\345\277\253\351\200\237\344\270\212\346\211\213/polardbx-tutorial-zh.md" +++ /dev/null @@ -1,251 +0,0 @@ - - -# 演示: PolarDB-X CDC 导入 Elasticsearch - -本示例我们通过演示 PolarDB-X 借助 Flink-CDC 将数据导入至 Elasticsearch 来介绍 PolarDB-X 的增量订阅能力,你可以前往:[PolarDB-X](https://github.com/ApsaraDB/galaxysql) 了解更多细节。 - -### 准备教程所需要的组件 -我们假设你运行在一台 MacOS 或者 Linux 机器上,并且已经安装 docker. 
- -#### 配置并启动容器 - -配置 `docker-compose.yml`。 - -```yaml -version: '2.1' -services: - polardbx: - polardbx: - image: polardbx/polardb-x:2.0.1 - container_name: polardbx - ports: - - "8527:8527" - elasticsearch: - image: 'elastic/elasticsearch:7.6.0' - container_name: elasticsearch - environment: - - cluster.name=docker-cluster - - bootstrap.memory_lock=true - - ES_JAVA_OPTS=-Xms512m -Xmx512m - - discovery.type=single-node - ports: - - '9200:9200' - - '9300:9300' - ulimits: - memlock: - soft: -1 - hard: -1 - nofile: - soft: 65536 - hard: 65536 - kibana: - image: 'elastic/kibana:7.6.0' - container_name: kibana - ports: - - '5601:5601' - volumes: - - '/var/run/docker.sock:/var/run/docker.sock' -``` -该 Docker Compose 中包含的容器有: -- PolarDB-X: 商品表 `products` 和 订单表 `orders` 将存储在该数据库中, 这两张表将进行关联,得到一张包含更多信息的订单表 `enriched_orders` -- Elasticsearch: 最终的订单表 `enriched_orders` 将写到 Elasticsearch -- Kibana: 用来可视化 ElasticSearch 的数据 - -在 `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件: - -```shell -docker-compose up -d -``` -该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 docker ps 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:5601/](http://localhost:5601/) 来查看 Kibana 是否运行正常 - - -### 准备数据: - -使用已创建的用户名和密码进行登陆PolarDB-X。 - -```shell -mysql -h127.0.0.1 -P8527 -upolardbx_root -p"123456" -``` - -```sql -CREATE DATABASE mydb; -USE mydb; - --- 创建一张产品表,并写入一些数据 -CREATE TABLE products ( - id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(255) NOT NULL, - description VARCHAR(512) -) AUTO_INCREMENT = 101; - -INSERT INTO products -VALUES (default,"scooter","Small 2-wheel scooter"), - (default,"car battery","12V car battery"), - (default,"12-pack drill bits","12-pack of drill bits with sizes ranging from #40 to #3"), - (default,"hammer","12oz carpenter's hammer"), - (default,"hammer","14oz carpenter's hammer"), - (default,"hammer","16oz carpenter's hammer"), - (default,"rocks","box of assorted rocks"), - (default,"jacket","water resistent black wind breaker"), - (default,"spare tire","24 inch spare tire"); - - --- 创建一张订单表,并写入一些数据 -CREATE TABLE orders ( - order_id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, - order_date DATETIME NOT NULL, - customer_name VARCHAR(255) NOT NULL, - price DECIMAL(10, 5) NOT NULL, - product_id INTEGER NOT NULL, - order_status BOOLEAN NOT NULL -- Whether order has been placed -) AUTO_INCREMENT = 10001; - -INSERT INTO orders -VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false), - (default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false), - (default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false); -``` - -### 下载 Flink 和所需要的依赖包 -1. 下载 [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) 并将其解压至目录 `flink-1.18.0` -2. 下载下面列出的依赖包,并将它们放到目录 `flink-1.18.0/lib/` 下 - -```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译``` -- 用于订阅PolarDB-X Binlog: flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar -- 用于写入Elasticsearch: [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar) -3. 启动flink服务: -```shell -./bin/start-cluster.sh -``` - -我们可以访问 [http://localhost:8081/](http://localhost:8081/) 看到Flink正常运行: - -![Flink UI](/_static/fig/mysql-postgress-tutorial/flink-ui.png "Flink UI") - - -4. 
启动Flink SQL CLI: -```shell -./bin/sql-client.sh -``` - -### 在 Flink SQL CLI 中使用 Flink DDL 创建表 - - -```sql --- 设置间隔时间为3秒 -Flink SQL> SET execution.checkpointing.interval = 3s; - --- 创建source1 -订单表 -Flink SQL> CREATE TABLE orders ( - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - PRIMARY KEY (order_id) NOT ENFORCED - ) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = '127.0.0.1', - 'port' = '8527', - 'username' = 'polardbx_root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'orders' - ); - --- 创建source2 -产品表 -CREATE TABLE products ( - id INT, - name STRING, - description STRING, - PRIMARY KEY (id) NOT ENFORCED - ) WITH ( - 'connector' = 'mysql-cdc', - 'hostname' = '127.0.0.1', - 'port' = '8527', - 'username' = 'polardbx_root', - 'password' = '123456', - 'database-name' = 'mydb', - 'table-name' = 'products' -); - --- 创建sink - 关联后的结果表 -Flink SQL> CREATE TABLE enriched_orders ( - order_id INT, - order_date TIMESTAMP(0), - customer_name STRING, - price DECIMAL(10, 5), - product_id INT, - order_status BOOLEAN, - product_name STRING, - product_description STRING, - PRIMARY KEY (order_id) NOT ENFORCED - ) WITH ( - 'connector' = 'elasticsearch-7', - 'hosts' = 'http://localhost:9200', - 'index' = 'enriched_orders' - ); - --- 执行读取和写入 -Flink SQL> INSERT INTO enriched_orders - SELECT o.order_id, - o.order_date, - o.customer_name, - o.price, - o.product_id, - o.order_status, - p.name, - p.description - FROM orders AS o - LEFT JOIN products AS p ON o.product_id = p.id; -``` - -### 在 Kibana 中查看数据 - -访问 [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) - -创建 index pattern `enriched_orders`,之后可以在 [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) 看到写入的数据了。 - -### 修改监听表数据,查看增量数据变动 - -在PolarDB-X中依次执行如下修改操作,每执行一步就刷新一次 Kibana,可以看到 Kibana 中显示的订单数据将实时更新。 - -```sql -INSERT INTO orders VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false); - -UPDATE orders SET order_status = true WHERE order_id = 10004; - -DELETE FROM orders WHERE order_id = 10004; -``` - -### 环境清理 - -在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器: - -```shell -docker-compose down -``` - -进入Flink的部署目录,停止 Flink 集群: - -```shell -./bin/stop-cluster.sh -``` diff --git a/docs/docs_site.sh b/docs/docs_site.sh deleted file mode 100755 index 457658c3dc..0000000000 --- a/docs/docs_site.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -set -e - -docs_container_name="flink-cdc-docs_container" -docs_image_name="flink-cdc-docs" -port=8001 -host=localhost - -function start_docs_server() { - project_dir="$(dirname "$(pwd)")" - echo "starting docs server....." - docker build -t ${docs_image_name} -f ${project_dir}/docs/Dockerfile . - docker run -d -it -p ${port}:${port} --rm -v "${project_dir}":/home/flink-cdc --name ${docs_container_name} ${docs_image_name} - echo "docs server is running on http://${host}:${port}" -} - -function stop_docs_server() { - project_dir="$(dirname "$(pwd)")" - echo "stopping docs server....." - docker stop ${docs_container_name} - rm -rf ${project_dir}/docs/_build - echo "stop docs server successfully." -} - -if ! command -v docker &> /dev/null -then - echo "Docker must be installed to run the docs locally" - echo "Please see docs/README.md for more details" - exit 1 -fi - -if [[ $1 = "start" ]]; then - start_docs_server -elif [[ $1 = "stop" ]]; then - stop_docs_server -else - echo "Usage:" - echo "$0 start" - echo "$0 stop" -fi \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index fc1ca34ab6..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,19 +0,0 @@ -# CDC Connectors for Apache Flink® - -```{toctree} -:maxdepth: 2 -:caption: Contents -content/overview/index -content/quickstart/index -content/快速上手/index -content/connectors/index -content/pipelines/index -content/formats/index -content/downloads -content/githublink -``` - -# Indices and Search - -* {ref}`genindex` -* {ref}`search` diff --git a/docs/layouts/_default/baseof.html b/docs/layouts/_default/baseof.html new file mode 100644 index 0000000000..733bf1b6d2 --- /dev/null +++ b/docs/layouts/_default/baseof.html @@ -0,0 +1,125 @@ + + + + + + {{ hugo.Generator }} + {{ partial "docs/html-head" . }} + {{ partial "docs/inject/head" . }} + + + + + + + + +
    + + +
    +
    + {{ template "header" . }} +
    + + {{ partial "docs/inject/content-before" . }} + {{ template "main" . }} + {{ partial "docs/inject/content-after" . }} + +
    + {{ template "footer" . }} + {{ partial "docs/inject/footer" . }} +
    + + {{ template "comments" . }} + + +
    + + {{ if default true (default .Site.Params.BookToC .Params.BookToC) }} + + + {{ end }} +
    + + {{ partial "docs/inject/body" . }} + + + + +{{ define "menu" }} + {{ partial "docs/menu" . }} +{{ end }} + +{{ define "header" }} + {{ partial "docs/header" . }} + + {{ if default true (default .Site.Params.BookToC .Params.BookToC) }} + + {{ end }} +{{ end }} + +{{ define "footer" }} + {{ partial "docs/footer" . }} +{{ end }} + +{{ define "comments" }} + {{ if and .Content (default true (default .Site.Params.BookComments .Params.BookComments)) }} +
    + {{- partial "docs/comments" . -}} +
    + {{ end }} +{{ end }} + +{{ define "main" }} +
    + {{- .Content -}} +
    +{{ end }} + +{{ define "toc" }} + {{ partial "docs/toc" . }} +{{ end }} diff --git a/docs/layouts/partials/docs/footer.html b/docs/layouts/partials/docs/footer.html new file mode 100644 index 0000000000..45aa36fd52 --- /dev/null +++ b/docs/layouts/partials/docs/footer.html @@ -0,0 +1,28 @@ + + +{{ if .IsPage }} +{{ $folder := "content" }} +{{ if eq "/zh" .Site.LanguagePrefix }} + {{ $folder = "content.zh" }} +{{ end }} +Edit This Page +{{ end }} diff --git a/docs/layouts/partials/docs/inject/content-before.html b/docs/layouts/partials/docs/inject/content-before.html new file mode 100644 index 0000000000..5d063bfa8a --- /dev/null +++ b/docs/layouts/partials/docs/inject/content-before.html @@ -0,0 +1,37 @@ + + + +{{ if $.Site.Params.ShowOutDatedWarning }} +
    +
    + {{ markdownify "This documentation is for an out-of-date version of Apache Flink CDC. We recommend you use the latest [stable version](https://ci.apache.org/projects/flink/flink-cdc-docs-stable/)."}} +
    +
    +{{ end }} +{{ if (not $.Site.Params.IsStable) }} +
    +
    + {{ markdownify "This documentation is for an unreleased version of Apache Flink CDC. We recommend you use the latest [stable version](https://ci.apache.org/projects/flink/flink-cdc-docs-stable/)."}} +
    +
    +{{ end }} + diff --git a/docs/layouts/partials/docs/inject/head.html b/docs/layouts/partials/docs/inject/head.html new file mode 100644 index 0000000000..f456c36bdc --- /dev/null +++ b/docs/layouts/partials/docs/inject/head.html @@ -0,0 +1,25 @@ + + + + + + diff --git a/docs/layouts/partials/docs/inject/menu-after.html b/docs/layouts/partials/docs/inject/menu-after.html new file mode 100644 index 0000000000..df19fb428e --- /dev/null +++ b/docs/layouts/partials/docs/inject/menu-after.html @@ -0,0 +1,76 @@ + + +
    + +{{ range $links := .Site.Params.MenuLinks }} + {{ index $links 0 }} +
    +{{ end }} + +
  • +
    + + + +
  • + + +{{ $translations := dict }} +{{ range .Site.Home.AllTranslations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} +{{ range .Translations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} + + +{{ range .Site.Languages }}{{ with index $translations .Lang }} +{{ if (ne $.Site.Language .Language) }} + +    + {{ .Language.LanguageName }} + +{{ end }}{{ end }}{{ end }} diff --git a/docs/layouts/partials/docs/inject/menu-before.html b/docs/layouts/partials/docs/inject/menu-before.html new file mode 100644 index 0000000000..95e0f8f885 --- /dev/null +++ b/docs/layouts/partials/docs/inject/menu-before.html @@ -0,0 +1,25 @@ + + + +

    v{{ $.Site.Params.Version }}

    \ No newline at end of file diff --git a/docs/content/connectors/index.md b/docs/layouts/partials/docs/interpolate.html similarity index 77% rename from docs/content/connectors/index.md rename to docs/layouts/partials/docs/interpolate.html index a9e59a6476..6b9702aaa4 100644 --- a/docs/content/connectors/index.md +++ b/docs/layouts/partials/docs/interpolate.html @@ -16,22 +16,9 @@ specific language governing permissions and limitations under the License. --> - -# Connectors - -```{toctree} -:maxdepth: 2 - -mongodb-cdc -mongodb-cdc(ZH) -mysql-cdc -mysql-cdc(ZH) -oceanbase-cdc -oceanbase-cdc(ZH) -oracle-cdc -postgres-cdc -sqlserver-cdc -tidb-cdc -db2-cdc -vitess-cdc -``` + +{{ $str := replace . "$scala_version" site.Params.ScalaVersion }} +{{ $str = replace $str "$version" site.Params.Version }} +{{ return $str }} \ No newline at end of file diff --git a/docs/layouts/partials/docs/menu-filetree.html b/docs/layouts/partials/docs/menu-filetree.html new file mode 100644 index 0000000000..b583347dd0 --- /dev/null +++ b/docs/layouts/partials/docs/menu-filetree.html @@ -0,0 +1,68 @@ + + +{{ $bookSection := default "docs" .Site.Params.BookSection }} +{{ if eq $bookSection "*" }} + {{ $bookSection = "/" }}{{/* Backward compatibility */}} +{{ end }} + +{{ with .Site.GetPage $bookSection }} + {{ template "book-section-children" (dict "Section" . "CurrentPage" $) }} +{{ end }} + +{{ define "book-section-children" }}{{/* (dict "Section" .Section "CurrentPage" .CurrentPage) */}} + +{{ end }} + +{{ define "book-page-link" }}{{/* (dict "Page" .Page "CurrentPage" .CurrentPage) */}} + {{ $current := eq .CurrentPage .Page }} + {{ $ancestor := .Page.IsAncestor .CurrentPage }} + + {{ if .Page.Params.sectionBreak }} +
    + {{ end }} + {{ if .Page.Params.bookCollapseSection }} + + + {{ else if .Page.Content }} + + {{- partial "docs/title" .Page -}} + + {{ else }} + {{- partial "docs/title" .Page -}} + {{ end }} +{{ end }} diff --git a/docs/layouts/partials/docs/menu.html b/docs/layouts/partials/docs/menu.html new file mode 100644 index 0000000000..77d0301599 --- /dev/null +++ b/docs/layouts/partials/docs/menu.html @@ -0,0 +1,42 @@ + + + + + +{{ $script := resources.Get "menu-reset.js" | resources.Minify }} +{{ with $script.Content }} + +{{ end }} diff --git a/docs/layouts/partials/docs/simple-title.html b/docs/layouts/partials/docs/simple-title.html new file mode 100644 index 0000000000..b324d48184 --- /dev/null +++ b/docs/layouts/partials/docs/simple-title.html @@ -0,0 +1,33 @@ + + +{{ $title := "" }} + +{{ if .Title }} + {{ $title = .Title }} +{{ else if and .IsSection .File }} + {{ $title = path.Base .File.Dir | humanize | title }} +{{ else if and .IsPage .File }} + {{ $title = .File.BaseFileName | humanize | title }} +{{ end }} + +{{ return $title }} \ No newline at end of file diff --git a/docs/content/quickstart/index.md b/docs/layouts/partials/docs/title.html similarity index 56% rename from docs/content/quickstart/index.md rename to docs/layouts/partials/docs/title.html index adf106419e..f9c96daf70 100644 --- a/docs/content/quickstart/index.md +++ b/docs/layouts/partials/docs/title.html @@ -16,22 +16,27 @@ specific language governing permissions and limitations under the License. --> + +{{ $title := "" }} + +{{ if .Title }} + {{ $title = .Title }} +{{ else if and .IsSection .File }} + {{ $title = path.Base .File.Dir | humanize | title }} +{{ else if and .IsPage .File }} + {{ $title = .File.BaseFileName | humanize | title }} +{{ end }} + +{{ if .Params.icon }} + {{ $title = printf "%s  %s" .Params.icon $title }} +{{ end }} -# Getting Started +{{ if .Params.bold }} + {{ $title = printf `
    %s
    ` $title }} +{{ end }} -```{toctree} -:maxdepth: 2 +{{ return ($title | safeHTML) }} -mysql-doris-pipeline-tutorial -mysql-starrocks-pipeline-tutorial -mysql-postgres-tutorial -mongodb-tutorial -oceanbase-tutorial -oracle-tutorial -polardbx-tutorial -sqlserver-tutorial -tidb-tutorial -db2-tutorial -build-real-time-data-lake-tutorial -datastream-api-package-guidance -``` \ No newline at end of file diff --git a/docs/layouts/partials/docs/toc.html b/docs/layouts/partials/docs/toc.html new file mode 100644 index 0000000000..863eec29bc --- /dev/null +++ b/docs/layouts/partials/docs/toc.html @@ -0,0 +1,23 @@ + +{{/* + Generates the pages table of contents. Unfortunately, hugo does not give us a lot of flexibility + around how the TOC is generated so we have to fall back to a regex to add the header. +*/}} +{{ .TableOfContents | replaceRE "