[FLINK-34180] Migrate doc website from ververica to flink (#3028)

gongzhongqiang 11 months ago committed by GitHub
parent 86272bf102
commit 1dc201f9b3

@ -13,33 +13,73 @@
# See the License for the specific language governing permissions and
# limitations under the License.
name: build_docs
name: "Build documentation"
# execute this docs build workflow automatically when new push happens in any branch
on:
push:
paths:
- 'docs/**'
branches:
- master
- release-*
schedule:
- cron: '0 0 * * *' # Deploy every day
jobs:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
build_docs_job:
jobs:
build-documentation:
if: github.repository == 'apache/flink-cdc'
runs-on: ubuntu-latest
container: debian:buster-slim
strategy:
max-parallel: 1
matrix:
branch:
- master
- release-3.0
steps:
- uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Set branch environment variable
run: |
currentBranch=$(git branch --show-current)
- name: Prereqs
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
echo "flink_branch=${currentBranch}" >> ${GITHUB_ENV}
if [ "${currentBranch}" = "master" ]; then
echo "flink_alias=release-3.1" >> ${GITHUB_ENV}
elif [ "${currentBranch}" = "release-3.0" ]; then
echo "flink_alias=stable" >> ${GITHUB_ENV}
fi
- name: Build documentation
run: |
apt-get update
apt-get install -y git
git clone "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" .
shell: bash
- name: Execute script to build our documentation and update pages
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: "docs/build_docs.sh"
shell: bash
docker run --rm --volume "$PWD:/root/flink-cdc" chesnay/flink-ci:java_8_11_17_21_maven_386 bash -c "cd /root/flink-cdc && chmod +x ./.github/workflows/docs.sh && ./.github/workflows/docs.sh"
- name: Upload documentation
uses: burnett01/rsync-deployments@5.2
with:
switches: --archive --compress
path: docs/target/
remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/flink/flink-cdc-docs-${{ env.flink_branch }}/
remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }}
remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }}
remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }}
remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }}
- name: Upload documentation alias
if: env.flink_alias != ''
uses: burnett01/rsync-deployments@5.2
with:
switches: --archive --compress
path: docs/target/
remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/flink/flink-cdc-docs-${{ env.flink_alias }}/
remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }}
remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }}
remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }}
remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }}

@ -0,0 +1,59 @@
#!/usr/bin/env bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
set -e
mvn --version
java -version
javadoc -J-version
# workaround for a git security patch
git config --global --add safe.directory /root/flink-cdc
git submodule update --init --recursive
HUGO_REPO=https://github.com/gohugoio/hugo/releases/download/v0.80.0/hugo_extended_0.80.0_Linux-64bit.tar.gz
HUGO_ARTIFACT=hugo_extended_0.80.0_Linux-64bit.tar.gz
if ! curl --fail -OL $HUGO_REPO ; then
echo "Failed to download Hugo binary"
exit 1
fi
tar -zxvf $HUGO_ARTIFACT
# Build the docs
hugo --source docs
# generate docs into docs/target
hugo -v --source docs --destination target
if [ $? -ne 0 ]; then
echo "Error building the docs"
exit 1
fi
# build Flink; required for Javadoc step
mvn clean install -B -DskipTests -Dfast
# build java/scala docs
mkdir -p docs/target/api
mvn javadoc:aggregate -B \
-DadditionalJOption="-Xdoclint:none --allow-script-in-comments" \
-Dmaven.javadoc.failOnError=false \
-Dcheckstyle.skip=true \
-Dspotless.check.skip=true \
-Denforcer.skip=true \
-Dheader="<a href=\"http://flink.apache.org/\" target=\"_top\"><h1>Back to Flink Website</h1></a> <script>var _paq=window._paq=window._paq||[];_paq.push([\"disableCookies\"]),_paq.push([\"setDomains\",[\"*.flink.apache.org\",\"*.nightlies.apache.org/flink\"]]),_paq.push([\"trackPageView\"]),_paq.push([\"enableLinkTracking\"]),function(){var u=\"//matomo.privacy.apache.org/\";_paq.push([\"setTrackerUrl\",u+\"matomo.php\"]),_paq.push([\"setSiteId\",\"1\"]);var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s)}();</script>"
mv target/site/apidocs docs/target/api/java
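For reference, the workflow above runs this script inside the `chesnay/flink-ci` container (see its `Build documentation` step). Assuming Docker is available locally, the same build can be reproduced with an invocation along these lines:
```sh
# Mirror of the CI step: mount the checkout and run the docs build inside the flink-ci image
docker run --rm --volume "$PWD:/root/flink-cdc" chesnay/flink-ci:java_8_11_17_21_maven_386 \
  bash -c "cd /root/flink-cdc && chmod +x ./.github/workflows/docs.sh && ./.github/workflows/docs.sh"
```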

@ -19,10 +19,16 @@ on:
branches:
- master
- release-*
paths-ignore:
- 'docs/**'
- 'README.md'
pull_request:
branches:
- master
- release-*
paths-ignore:
- 'docs/**'
- 'README.md'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}

.gitmodules

@ -0,0 +1,3 @@
[submodule "docs/themes/book"]
path = docs/themes/book
url = https://github.com/alex-shpak/hugo-book
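Since the Hugo theme is referenced as a Git submodule, it has to be fetched after cloning before the docs can be built, using the same command that appears in `docs.sh` and the Docker-based build instructions:
```sh
# Fetch the hugo-book theme into docs/themes/book
git submodule update --init --recursive
```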

@ -20,5 +20,6 @@
</component>
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/docs/themes/book" vcs="Git" />
</component>
</project>

@ -8,16 +8,16 @@ This README is meant as a brief walkthrough on the core features of CDC Connecto
## Supported (Tested) Databases
| Connector | Database | Driver |
|------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
| [mongodb-cdc](docs/content/connectors/mongodb-cdc.md) | <li> [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0, 6.0 | MongoDB Driver: 4.3.4 |
| [mysql-cdc](docs/content/connectors/mysql-cdc.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 |
| [oceanbase-cdc](/docs/content/connectors/oceanbase-cdc.md) | <li> [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x <li> [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x |
| [oracle-cdc](docs/content/connectors/oracle-cdc.md) | <li> [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 |
| [postgres-cdc](docs/content/connectors/postgres-cdc.md) | <li> [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 |
| [sqlserver-cdc](docs/content/connectors/sqlserver-cdc.md) | <li> [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 |
| [tidb-cdc](docs/content/connectors/tidb-cdc.md) | <li> [TiDB](https://www.pingcap.com): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 |
| [Db2-cdc](docs/content/connectors/db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
| [Vitess-cdc](docs/content/connectors/vitess-cdc.md) | <li> [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 |
|-------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
| [mongodb-cdc](docs/content/docs/connectors/cdc-connectors/mongodb-cdc.md) | <li> [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0, 6.0 | MongoDB Driver: 4.3.4 |
| [mysql-cdc](docs/content/docs/connectors/cdc-connectors/mysql-cdc.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 |
| [oceanbase-cdc](docs/content/docs/connectors/cdc-connectors/oceanbase-cdc.md) | <li> [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x <li> [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x |
| [oracle-cdc](docs/content/docs/connectors/cdc-connectors/oracle-cdc.md) | <li> [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 |
| [postgres-cdc](docs/content/docs/connectors/cdc-connectors/postgres-cdc.md) | <li> [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 |
| [sqlserver-cdc](docs/content/docs/connectors/cdc-connectors/sqlserver-cdc.md) | <li> [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 |
| [tidb-cdc](docs/content/docs/connectors/cdc-connectors/tidb-cdc.md) | <li> [TiDB](https://www.pingcap.com): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 |
| [Db2-cdc](docs/content/docs/connectors/cdc-connectors/db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
| [Vitess-cdc](docs/content/docs/connectors/cdc-connectors/vitess-cdc.md) | <li> [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 |
## Features
@ -106,10 +106,10 @@ Include following Maven dependency (available through Maven Central):
```
<dependency>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<!-- add the dependency matching your database -->
<artifactId>flink-connector-mysql-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. -->
<version>2.5-SNAPSHOT</version>
</dependency>
```

docs/.gitignore

@ -0,0 +1,10 @@
.bundle/
.jekyll-metadata
.jekyll-cache/
.rubydeps/
ruby2/.bundle/
ruby2/.rubydeps/
public/
resources/
.hugo_build.lock
.DS_Store

@ -1,6 +0,0 @@
FROM python:3.7-slim
RUN apt-get update
RUN apt-get -y install git
RUN pip3 install -U sphinx==4.1.1 myst-parser==0.15.2 pygments==2.10.0 sphinx-rtd-theme==0.5.2 sphinx-autobuild==2021.3.14 gitpython==3.1.18 pyyaml==6.0
EXPOSE 8001
CMD ["sphinx-autobuild", "--host", "0.0.0.0", "--port", "8001", "/home/flink-cdc/docs", "/home/flink-cdc/docs/_build/html"]

@ -1,19 +0,0 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

@ -1,36 +1,269 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

This README gives an overview of how to build and contribute to the documentation of Apache Flink.

The documentation is included with the source of Apache Flink in order to ensure that you always
have docs corresponding to your checked out version. The online documentation at
https://flink.apache.org/ is also generated from the files found here.

# Requirements

This README gives an overview of how to build the documentation of Flink CDC.
### Build the documentation and serve it locally

Make sure you have installed [Docker](https://docs.docker.com/engine/install/) and started it on your local environment.

From the directory of this module (`docs`), use the following command to start the site.

```sh
./docs_site.sh start
```

Then the site will run and can be viewed at http://localhost:8001; any update to `docs` will be reflected on the site without restarting.

Of course, you can use the following command to stop the site.

```sh
./docs_site.sh stop
```

### Build the site locally

The Flink documentation uses [Hugo](https://gohugo.io/getting-started/installing/) to generate HTML files. More specifically, it uses the *extended version* of Hugo with Sass/SCSS support.

To build the documentation, you can install Hugo locally or use a Docker image.

Both methods require you to execute commands in the directory of this module (`docs/`). The built site is served at http://localhost:1313/.

#### Using Hugo Docker image:

```sh
$ git submodule update --init --recursive
$ docker pull jakejarvis/hugo-extended:latest
$ docker run -v $(pwd):/src -p 1313:1313 jakejarvis/hugo-extended:latest server --buildDrafts --buildFuture --bind 0.0.0.0
```

## Include externally hosted documentation
With the ongoing efforts to move Flink's connectors from this repository to individual, dedicated
repositories, this also requires the documentation to be hosted outside this repo. However,
we still want to serve all documentation as a whole on the Flink documentation website.
Adding new externally hosted documentation requires the following steps to be taken:
1. (If necessary) Move the existing documentation to the new repository
2. In the Flink repository, edit the `docs/setup_docs.sh` file and add a reference to your now
externally hosted documentation. The reference will look like `integrate_connector_docs <connector_name> <branch_or_tag>`.
Replace `<connector_name>` with the name of your connector, e.g., `elasticsearch` for `flink-connector-elasticsearch`, as sketched below.
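For illustration only, the added line in `docs/setup_docs.sh` might look like the following sketch; the connector name and tag here are hypothetical placeholders:
```sh
# Hypothetical reference to externally hosted connector docs
integrate_connector_docs elasticsearch v3.0.0
```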
## Generate configuration tables
Configuration descriptions are auto-generated from code. To trigger the generation, run the following in the project root:
```
mvn -Pgenerate-config-docs install -Dfast -DskipTests
```
The resulting HTML files will be written to `layouts/shortcodes/generated`. Tables are regenerated each time the command is invoked.
These tables can be directly included into the documentation:
```
{{< generated/file_name >}}
```
# Contribute
## Markdown
The documentation pages are written in [Markdown](http://daringfireball.net/projects/markdown/syntax). It is possible to use [GitHub flavored syntax](http://github.github.com/github-flavored-markdown) and intermix plain HTML.
## Front matter
In addition to Markdown, every page contains a Jekyll front matter, which specifies the title of the page and the layout to use. The title is used as the top-level heading for the page. The default layout is `plain` (found in `_layouts`).
---
title: "Title of the Page"
---
---
title: "Title of the Page" <-- Title rendered in the side nave
weight: 1 <-- Weight controls the ordering of pages in the side nav.
type: docs <-- required
aliases: <-- Alias to setup redirect from removed page to this one
- /alias/to/removed/page.html
---
## Structure
### Page
#### Headings
All documents are structured with headings. From these headings, you can automatically generate a page table of contents (see below).
```
# Level-1 Heading <- Used for the title of the page
## Level-2 Heading <- Start with this one for content
### Level-3 heading
#### Level-4 heading
##### Level-5 heading
```
Please stick to the "logical order" when using the headlines, e.g. start with level-2 headings and use level-3 headings for subsections, etc. Don't use a different ordering just because you prefer how a particular headline looks.
#### Table of Contents
A table of contents is added automatically to every page, based on heading levels 2-4.
The ToC can be omitted by adding the following to the front matter of the page:
---
bookToc: false
---
### ShortCodes
Flink uses [shortcodes](https://gohugo.io/content-management/shortcodes/) to add custom functionality
to its documentation markdown. The following are available for use:
#### Flink Artifact
{{< artifact flink-streaming-scala withScalaVersion >}}
This will be replaced by the Maven artifact for flink-streaming-scala that users should copy into their pom.xml file. It will render out to:
```xml
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.12</artifactId>
<version><!-- current flink version --></version>
</dependency>
```
It includes a number of optional flags:
* withScalaVersion: Appends the Scala version suffix to the artifact id
* withTestScope: Adds `<scope>test</scope>` to the dependency. Useful for marking test dependencies.
* withTestClassifier: Adds `<classifier>tests</classifier>`. Useful when users should be pulling in Flink test dependencies. This is mostly for the test harnesses and probably not what you want.
You can also use the following shortcodes (with the same flags) instead:
* `artifact_gradle` to show the Gradle syntax
* `artifact_tabs` to create a tabbed view, showing both Maven and Gradle syntax
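These variants are presumably invoked the same way as the `artifact` shortcode; a usage sketch (flag support assumed to mirror the Maven variant):
{{< artifact_gradle flink-streaming-scala withScalaVersion >}}
{{< artifact_tabs flink-streaming-scala withScalaVersion >}}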
#### Flink Connector Artifact
{{< connector_artifact flink-connector-elasticsearch 3.0.0 >}}
This will be replaced by the Maven artifact for flink-connector-elasticsearch that users should copy into their pom.xml file. It will render out to:
```xml
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch</artifactId>
<version>3.0.0</version>
</dependency>
```
#### Back to Top
{{< top >}}
This will be replaced by a back to top link. It is recommended to use these links at least at the end of each level-2 section.
#### Info Hints
{{< hint info >}}
Some interesting information
{{< /hint >}}
The hint will be rendered in a blue box. This hint is useful when providing
additional information for the user that does not fit into the flow of the documentation.
#### Info Warning
{{< hint warning >}}
Something to watch out for.
{{< /hint >}}
The hint will be rendered in a yellow box. This hint is useful when highlighting
information users should watch out for to prevent errors.
#### Info Danger
{{< hint danger >}}
Something to avoid
{{< /hint >}}
The hint will be rendered in a red box. This hint is useful when highlighting
information users need to know to avoid data loss or to point out broken
functionality.
#### Label
{{< label "My Label" >}}
The label will be rendered in an inlined blue box. This is useful for labeling functionality
such as whether a SQL feature works for only batch or streaming execution.
#### Flink version
{{< version >}}
Interpolates the current Flink version
#### Scala Version
{{< scala_version >}}
Interpolates the default scala version
#### Stable
{{< stable >}}
Some content
{{< /stable >}}
This shortcode will only render its content if the site is marked as stable.
#### Unstable
{{< unstable >}}
Some content
{{< /unstable >}}
This shortcode will only render its content if the site is marked as unstable.
#### Query State Warning
{{< query_state_warning >}}
Will render a warning that the current SQL feature may have unbounded state requirements.
#### tab
{{< tabs "sometab" >}}
{{< tab "Java" >}}
```java
System.out.println("Hello World!");
```
{{< /tab >}}
{{< tab "Scala" >}}
```scala
println("Hello World!");
```
{{< /tab >}}
{{< /tabs >}}
Prints the content in tabs. IMPORTANT: The label in the outermost "tabs" shortcode must
be unique for the page.
#### Github Repo
{{< github_repo >}}
Renders a link to the Apache Flink repo.
#### Github Link
{{< gh_link file="/some/file.java" name="Some file" >}}
Renders a link to a file in the Apache Flink repo with a given name.
#### JavaDocs Link
{{< javadoc file="some/file" name="Some file" >}}
Renders a link to a file in the Apache Flink Java Documentation.
#### PythonDocs Link
{{< pythondoc file="some/file" name="Some file" >}}
Renders a link to a file in the Apache Flink Python Documentation.
#### FlinkDownloads Link
```
{{< downloads >}}
```
Renders a link to the Apache Flink download page.

File diff suppressed because one or more lines are too long

Binary file not shown.


Binary file not shown.


Binary file not shown.


@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* override table width restrictions */
.wy-table-responsive table td, .wy-table-responsive table th {
white-space: normal;
}
.wy-table-responsive {
margin-bottom: 24px;
max-width: 100%;
overflow: visible;
}
/* override style of li under ul */
.wy-nav-content ul li {
list-style: disc;
margin-left: 36px;
}
.wy-nav-content ul li p {
margin: 0 0 8px;
}
/* override max-width of content */
.wy-nav-content {
max-width: 80%;
}

@ -1,51 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Extend the RTD template to support user defined "Edit on Github" URL -->
{%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
{% if page_source_suffix %}
{% set suffix = page_source_suffix %}
{% else %}
{% set suffix = source_suffix %}
{% endif %}
{% if meta is defined and meta is not none %}
{% set check_meta = True %}
{% else %}
{% set check_meta = False %}
{% endif %}
{% if check_meta and 'github_url' in meta %}
{% set display_github = True %}
{% endif %}
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
{% block breadcrumbs_aside %}
<li class="wy-breadcrumbs-aside">
{% if pagename != "search" %}
{% if display_github %}
<a href="http://{{ github_host|default("github.com") }}/{{ github_user }}/{{ github_repo }}/blob/{{ github_version }}/docs/{{ pagename }}{{ suffix }}" class="fa fa-github"> {{ _('Edit on GitHub') }}</a>
{% endif %}
{% endif %}
</li>
{% endblock %}
</ul>
<hr/>
</div>

@ -1,59 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
{% if READTHEDOCS or display_lower_left %}
{# Add rst-badge after rst-versions for small badge style. #}
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
<span class="rst-current-version" data-toggle="rst-current-version">
<span class="fa fa-book"> Read the Docs</span>
version: {{ current_version }}
<span class="fa fa-caret-down"></span>
</span>
<div class="rst-other-versions">
{% if versions %}
<dl>
<dt>{{ _('Versions') }}</dt>
{% for slug, url in versions %}
{% if slug == current_version %} <strong> {% endif %}
<dd><a href="{{ url }}">{{ slug }}</a></dd>
{% if slug == current_version %} </strong> {% endif %}
{% endfor %}
</dl>
{% endif %}
{% if READTHEDOCS %}
<dl>
<dt>{{ _('On Read the Docs') }}</dt>
<dd>
<a href="//{{ PRODUCTION_DOMAIN }}/projects/{{ slug }}/?fromdocs={{ slug }}">{{
_('Project Home') }}</a>
</dd>
<dd>
<a href="//{{ PRODUCTION_DOMAIN }}/builds/{{ slug }}/?fromdocs={{ slug }}">{{
_('Builds') }}</a>
</dd>
</dl>
{% endif %}
<hr/>
{% trans %}Free document hosting provided by <a href="http://www.readthedocs.org">Read the
Docs</a>.{% endtrans %}
</div>
</div>
{% endif %}
<!-- Place this tag in your head or just before your close body tag. -->
<script async defer
src="https://ververica.github.io/{{ github_repo }}/{{ current_version }}/_static/button.js"></script>

@ -0,0 +1,239 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
@import "github";
.link {
padding-bottom: 5px;
}
.appetizer {
color: #FBB142;
}
.maindish {
color: #7E4F89;
}
.dessert {
color: #E6526F;
}
.book-menu nav {
background: #f8f8f8;
}
.book-page {
padding: 2rem 2rem;
}
.book-search input {
background: white;
}
.markdown a {
text-decoration: none;
color: #05b;
}
.markdown a:visited {
text-decoration: none;
color: #05b;
}
.markdown {
line-height: 1.43;
h1,
h2,
h3,
h4,
h5,
h6 {
font-weight: 500;
padding-top: 0;
margin-top: 1em;
}
}
body {
letter-spacing: normal;
-webkit-font-smoothing: auto;
}
aside nav ul {
li {
margin: 0.5em 0;
}
}
.book-search {
border: 2px solid #ebebeb;
}
@media screen and (max-width: 768px) {
.toc {
display: none;
}
}
aside.book-menu nav {
a:hover {
font-weight: bold;
opacity: 1.0;
}
a.active {
font-weight: bold;
color: var(--body-font-color);
}
}
aside.book-menu > li {
padding: 10px 5px 5px 5px;
}
aside.book-toc {
h3 {
margin-top: 0;
padding-top: 0;
font-size: 1.2em;
}
}
html {
line-height: 1.43;
}
h1, h2, h3, h4, h5, h6 {
line-height: 1.1;
}
h1, h2, h3 {
margin-top: 20px;
margin-bottom: 10px;
}
h2, h3, h4 {
padding-top: 1em;
}
h1 {
font-size: 36px;
}
h2 {
font-size: 30px;
border-bottom: 1px solid #e5e5e5;
}
h3 {
font-size: 24px;
}
h4 {
font-size: 18px;
}
.markdown code {
background: white;
padding: 0;
border-radius: 0;
}
pre.chroma code {
line-height: 1.43;
}
.book-languages {
border: 2px solid black;
}
.menu-break {
opacity: 0.1;
}
#book-search-results {
padding: 2px;
background-color: white;
}
.label {
display: inline;
padding: .2em .6em .3em;
font-size: 75%;
font-weight: 700;
line-height: 1;
color: #fff;
text-align: center;
white-space: nowrap;
vertical-align: baseline;
border-radius: .25em;
background-color: #337ab7;
}
.expand-toc {
position: fixed;
top: 2em;
right: 5em;
display: none;
}
.container {
max-width: 90rem;
}
#book-search-input:focus {
outline: none;
}
.rest-api h5 {
margin-top: .5em;
margin-bottom: .5em;
font-size: 1em;
}
.rest-api tbody {
display: table;
width: 100%;
background: white;
}
.rest-api td {
background: white;
}
.rest-api .book-expand label {
padding: 0rem 0rem;
background: white;
}
.rest-api .book-expand {
background: white;
}
.rest-api .book-expand .book-expand-head {
background: white;
}
.configuration td {
background: white;
}
.markdown table tr:nth-child(2n) {
background: white;
}

@ -0,0 +1,25 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
body {
font-family: "Helvetica Neue",Helvetica,Arial,sans-serif;
font-size: 14px;
}
code {
font-family: "Menlo", "Lucida Console", monospace;
}

@ -0,0 +1,87 @@
/**
* Syntax highlighting generated via
* hugo gen chromastyles --style=github > chroma.css
*/
/* Background */ .chroma { background-color: #ffffff }
/* Other */ .chroma .x { }
/* Error */ .chroma .err { color: #a61717; background-color: #e3d2d2 }
/* LineTableTD */ .chroma .lntd { vertical-align: top; padding: 0; margin: 0; border: 0; }
/* LineTable */ .chroma .lntable { border-spacing: 0; padding: 0; margin: 0; border: 0; width: auto; overflow: auto; display: block; }
/* LineHighlight */ .chroma .hl { display: block; width: 100%;background-color: #ffffcc }
/* LineNumbersTable */ .chroma .lnt { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f }
/* LineNumbers */ .chroma .ln { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f }
/* Keyword */ .chroma .k { color: #000000; font-weight: bold }
/* KeywordConstant */ .chroma .kc { color: #000000; font-weight: bold }
/* KeywordDeclaration */ .chroma .kd { color: #000000; font-weight: bold }
/* KeywordNamespace */ .chroma .kn { color: #000000; font-weight: bold }
/* KeywordPseudo */ .chroma .kp { color: #000000; font-weight: bold }
/* KeywordReserved */ .chroma .kr { color: #000000; font-weight: bold }
/* KeywordType */ .chroma .kt { color: #445588; font-weight: bold }
/* Name */ .chroma .n { }
/* NameAttribute */ .chroma .na { color: #008080 }
/* NameBuiltin */ .chroma .nb { color: #0086b3 }
/* NameBuiltinPseudo */ .chroma .bp { color: #999999 }
/* NameClass */ .chroma .nc { color: #445588; font-weight: bold }
/* NameConstant */ .chroma .no { color: #008080 }
/* NameDecorator */ .chroma .nd { color: #3c5d5d; font-weight: bold }
/* NameEntity */ .chroma .ni { color: #800080 }
/* NameException */ .chroma .ne { color: #990000; font-weight: bold }
/* NameFunction */ .chroma .nf { color: #990000; font-weight: bold }
/* NameFunctionMagic */ .chroma .fm { }
/* NameLabel */ .chroma .nl { color: #990000; font-weight: bold }
/* NameNamespace */ .chroma .nn { color: #555555 }
/* NameOther */ .chroma .nx { }
/* NameProperty */ .chroma .py { }
/* NameTag */ .chroma .nt { color: #000080 }
/* NameVariable */ .chroma .nv { color: #008080 }
/* NameVariableClass */ .chroma .vc { color: #008080 }
/* NameVariableGlobal */ .chroma .vg { color: #008080 }
/* NameVariableInstance */ .chroma .vi { color: #008080 }
/* NameVariableMagic */ .chroma .vm { }
/* Literal */ .chroma .l { }
/* LiteralDate */ .chroma .ld { }
/* LiteralString */ .chroma .s { color: #dd1144 }
/* LiteralStringAffix */ .chroma .sa { color: #dd1144 }
/* LiteralStringBacktick */ .chroma .sb { color: #dd1144 }
/* LiteralStringChar */ .chroma .sc { color: #dd1144 }
/* LiteralStringDelimiter */ .chroma .dl { color: #dd1144 }
/* LiteralStringDoc */ .chroma .sd { color: #dd1144 }
/* LiteralStringDouble */ .chroma .s2 { color: #dd1144 }
/* LiteralStringEscape */ .chroma .se { color: #dd1144 }
/* LiteralStringHeredoc */ .chroma .sh { color: #dd1144 }
/* LiteralStringInterpol */ .chroma .si { color: #dd1144 }
/* LiteralStringOther */ .chroma .sx { color: #dd1144 }
/* LiteralStringRegex */ .chroma .sr { color: #009926 }
/* LiteralStringSingle */ .chroma .s1 { color: #dd1144 }
/* LiteralStringSymbol */ .chroma .ss { color: #990073 }
/* LiteralNumber */ .chroma .m { color: #009999 }
/* LiteralNumberBin */ .chroma .mb { color: #009999 }
/* LiteralNumberFloat */ .chroma .mf { color: #009999 }
/* LiteralNumberHex */ .chroma .mh { color: #009999 }
/* LiteralNumberInteger */ .chroma .mi { color: #009999 }
/* LiteralNumberIntegerLong */ .chroma .il { color: #009999 }
/* LiteralNumberOct */ .chroma .mo { color: #009999 }
/* Operator */ .chroma .o { color: #000000; font-weight: bold }
/* OperatorWord */ .chroma .ow { color: #000000; font-weight: bold }
/* Punctuation */ .chroma .p { }
/* Comment */ .chroma .c { color: #999988; font-style: italic }
/* CommentHashbang */ .chroma .ch { color: #999988; font-style: italic }
/* CommentMultiline */ .chroma .cm { color: #999988; font-style: italic }
/* CommentSingle */ .chroma .c1 { color: #999988; font-style: italic }
/* CommentSpecial */ .chroma .cs { color: #999999; font-weight: bold; font-style: italic }
/* CommentPreproc */ .chroma .cp { color: #999999; font-weight: bold; font-style: italic }
/* CommentPreprocFile */ .chroma .cpf { color: #999999; font-weight: bold; font-style: italic }
/* Generic */ .chroma .g { }
/* GenericDeleted */ .chroma .gd { color: #000000; background-color: #ffdddd }
/* GenericEmph */ .chroma .ge { color: #000000; font-style: italic }
/* GenericError */ .chroma .gr { color: #aa0000 }
/* GenericHeading */ .chroma .gh { color: #999999 }
/* GenericInserted */ .chroma .gi { color: #000000; background-color: #ddffdd }
/* GenericOutput */ .chroma .go { color: #888888 }
/* GenericPrompt */ .chroma .gp { color: #555555 }
/* GenericStrong */ .chroma .gs { font-weight: bold }
/* GenericSubheading */ .chroma .gu { color: #aaaaaa }
/* GenericTraceback */ .chroma .gt { color: #aa0000 }
/* GenericUnderline */ .chroma .gl { text-decoration: underline }
/* TextWhitespace */ .chroma .w { color: #bbbbbb }

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'use strict';
(function () {
const indexCfg = {{ with i18n "bookSearchConfig" }}
{{ . }};
{{ else }}
{};
{{ end }}
indexCfg.doc = {
id: 'id',
field: ['title', 'content'],
store: ['title', 'href', 'section'],
};
const index = FlexSearch.create('balance', indexCfg);
window.bookSearchIndex = index;
{{- $pages := where .Site.Pages "Kind" "in" (slice "page" "section") -}}
{{- $pages = where $pages "Params.booksearchexclude" "!=" true -}}
{{- $pages = where $pages "Content" "not in" (slice nil "") -}}
{{ range $index, $page := $pages }}
index.add({
'id': {{ $index }},
'href': '{{ $page.RelPermalink }}',
'title': {{ (partial "docs/simple-title" $page) | jsonify }},
'section': {{ (partial "docs/simple-title" $page.Parent) | jsonify }},
'content': {{ $page.Plain | jsonify }}
});
{{- end -}}
})();

@ -1,76 +0,0 @@
#!/bin/bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
set -x
# step-1: install dependencies
apt-get update
apt-get -y install git rsync python3-pip python3-git python3-stemmer python3-virtualenv python3-setuptools
python3 -m pip install -U sphinx==4.1.1 myst-parser==0.15.2 pygments==2.10.0 sphinx-rtd-theme==0.5.2 pyyaml==6.0
export REPO_NAME="${GITHUB_REPOSITORY##*/}"
git config --global --add safe.directory /__w/${REPO_NAME}/${REPO_NAME}
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
temp_docs_root=`mktemp -d`
ls
# step-1.5: copy main site content to temp dir
# this must be done before `make -C docs clean` otherwise the contents will be removed
rsync -avz "docs/site/" "${temp_docs_root}/"
# step-2: build sites for all branches(for multiple versioned docs), excludes 'HEAD' and 'gh-pages'
make -C docs clean
branches="`git for-each-ref '--format=%(refname:lstrip=-1)' refs/remotes/origin/ | grep -viE '^(HEAD|gh-pages|release-1.0|release-1.1|release-1.2|release-1.3)$'| grep -iE '^(release-|master)'`"
for current_branch in ${branches}; do
export current_version=${current_branch}
git checkout ${current_branch}
# skip the branch that has no docs
if [ ! -e 'docs/conf.py' ]; then
echo -e "\tINFO: Couldn't find 'docs/conf.py' for branch: ${current_branch}, just skip this branch"
continue
fi
echo "INFO: Building sites for branch: ${current_branch}"
sphinx-build -b html docs/ docs/_build/html/${current_branch}
# copy the build content to temp dir
rsync -av "docs/_build/html/" "${temp_docs_root}/"
done
git checkout master
git config --global user.name "${GITHUB_ACTOR}"
git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com"
# step-3: push build sites to gh-pages branch
pushd "${temp_docs_root}"
git init
git remote add deploy "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
git checkout -b gh-pages
touch .nojekyll
git add .
git commit -m "Generated docs from commit ${GITHUB_SHA}"
git push deploy gh-pages --force
# pop back and exit
popd
exit 0

@ -1,135 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Maven
# Build your Java project and run tests with Apache Maven.
# Add steps that analyze code, save build artifacts, deploy, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/java
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'CDC Connectors for Apache Flink®'
copyright = '2022, Ververica GmbH; Apache Flink, Flink®, Apache®, the squirrel logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation'
author = 'ververica'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx_rtd_theme',
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
'myst_parser',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
html_favicon = '_static/fig/favicon.png'
import myst_parser
source_parsers = {
'.md': myst_parser
}
source_suffix = ['.md']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_context = {
'css_files': [
'_static/theme_overrides.css', # overrides for wide tables in RTD theme
],
}
try:
html_context
except NameError:
html_context = dict()
html_context['display_lower_left'] = True
if 'REPO_NAME' in os.environ:
REPO_NAME = os.environ['REPO_NAME']
else:
REPO_NAME = ''
from git import Repo
repo = Repo( search_parent_directories=True )
remote_refs = repo.remote().refs
if 'current_version' in os.environ:
current_version = os.environ['current_version']
else:
current_version = repo.active_branch.name
html_context['current_version'] = current_version
html_context['version'] = current_version
html_context['github_version'] = current_version
html_context['versions'] = list()
branches = [branch.name for branch in remote_refs]
for branch in branches:
if 'origin/' in branch and ('master' in branch or 'release-' in branch)\
and 'HEAD' not in branch and 'gh-pages' not in branch \
and 'release-1.0' not in branch and 'release-1.1' not in branch\
and 'release-1.2' not in branch and 'release-1.3' not in branch:
version = branch[7:]
html_context['versions'].append( (version, '/' +REPO_NAME+ '/' +version+ '/') )
html_context['display_github'] = True
html_context['github_user'] = 'ververica'
html_context['github_repo'] = 'flink-cdc-connectors'

@ -0,0 +1,93 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
baseURL = '//nightlies.apache.org/flink/flink-cdc-docs-master'
languageCode = 'en-us'
title = 'Apache Flink CDC'
enableGitInfo = false
theme = "book"
pygmentsUseClasses = true
[params]
# Flag whether this is a stable version or not.
# Used for the quickstart page.
IsStable = false
# Flag to indicate whether an outdated warning should be shown.
ShowOutDatedWarning = false
# This is the version referenced in the docs. Please only use these variables
# to reference a specific Flink version, because this is the only place where
# we change the version for the complete docs when forking of a release branch
# etc.
# The full version string as referenced in Maven (e.g. 1.2.1)
Version = "3.1-SNAPSHOT"
# For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot
# release this should be the same as the regular version
VersionTitle = "3.1-SNAPSHOT"
# The branch for this version of Apache Flink CDC
Branch = "master"
# The GitHub repository for Apache Flink CDC
Repo = "//github.com/apache/flink-cdc"
GithubRepo = "https://github.com/apache/flink-cdc.git"
ProjectHomepage = "//flink.apache.org"
# External links at the bottom
# of the menu
MenuLinks = [
["Project Homepage", "//flink.apache.org"],
["JavaDocs", "//nightlies.apache.org/flink/flink-cdc-docs-master/api/java/"],
]
PreviousDocs = [
["3.0", "https://nightlies.apache.org/flink-cdc/flink-cdc-docs-release-3.0"],
]
[markup]
[markup.goldmark.renderer]
unsafe = true
[languages]
[languages.en]
languageName = 'English'
contentDir = 'content'
weight = 1
[languages.zh]
languageName = '中文版'
contentDir = 'content.zh'
weight = 2
[module]
[[module.imports.mounts]]
source = 'content'
target = 'content'
lang = 'en'
[[module.imports.mounts]]
source = 'content.zh'
target = 'content'
lang = 'zh'
[[module.imports.mounts]]
source = 'layouts'
target = 'layouts'
[[module.imports.mounts]]
source = 'data'
target = 'data'

@ -0,0 +1,58 @@
---
title: Apache Flink CDC
type: docs
bookToc: false
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
####
<div style="text-align: center">
<h1>
Flink CDC: Change Data Capture Solution of Apache Flink
</h1>
<h4 style="color: #696969">Set of source connectors for Apache Flink® directly ingesting changes coming from different databases using Change Data Capture (CDC).</h4>
</div>
Flink CDC integrates Debezium as the engine to capture data changes, so it can fully leverage Debezium's capabilities. See more about what [Debezium](https://github.com/debezium/debezium) is.
{{< img src="/fig/cdc-flow.png" alt="Stateful Functions" width="50%" >}}
Flink CDC supports ingesting snapshot data and real-time changes from databases into Flink® and then transforming the data and sinking it to various downstream systems.
{{< columns >}}
## Try Flink CDC
If you're interested in playing around with Flink CDC, check out our [quick
start]({{< ref "docs/try-flink-cdc" >}}). It provides multiple examples of submitting and executing a Flink CDC job on a Flink cluster.
<--->
## Get Help with Flink CDC
If you get stuck, check out our [community support
resources](https://flink.apache.org/community.html). In particular, Apache
Flink's user mailing list is consistently ranked as one of the most active of
any Apache project, and is a great way to get help quickly.
{{< /columns >}}
Flink CDC is developed under the umbrella of [Apache
Flink](https://flink.apache.org/).

@ -0,0 +1,25 @@
---
title: Connectors
icon: <i class="fa fa-random title maindish" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 3
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -1,3 +1,8 @@
---
title: CDC Connectors
bookCollapseSection: true
weight: 2
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -16,11 +21,3 @@ KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Formats
```{toctree}
:maxdepth: 2
changelog-json
```

@ -1,3 +1,10 @@
---
title: "Db2 CDC Connector"
weight: 9
type: docs
aliases:
- /connectors/cdc-connectors/db2-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -26,7 +33,7 @@ describes how to setup the db2 CDC connector to run SQL queries against Db2 data
## Supported Databases
| Connector | Database | Driver |
|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------|
|-----------------------|----------------------------------------------------|----------------------|
| [Db2-cdc](db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
Dependencies
@ -37,14 +44,7 @@ using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-db2-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-db2-cdc >}}
### SQL Client JAR
@ -55,7 +55,7 @@ put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-db2-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users
need to download the source code and compile the corresponding jar. Users should use the released version, such as
[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-connector-db2-cdc),
[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-db2-cdc),
the released version will be available in the Maven central warehouse.
Setup Db2 server
@ -256,8 +256,6 @@ public class Db2SourceExample {
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
Data Type Mapping
----------------
@ -380,7 +378,4 @@ Data Type Mapping
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MongoDB CDC Connector"
weight: 2
type: docs
aliases:
- /connectors/cdc-connectors/mongodb-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -27,22 +34,16 @@ Dependencies
In order to setup the MongoDB CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-mongodb-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-mongodb-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse.
**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse.
Setup MongoDB
----------------
@ -689,7 +690,4 @@ Reference
- [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/)
- [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/)
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MySQL CDC Connector"
weight: 7
type: docs
aliases:
- /connectors/cdc-connectors/mysql-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -35,14 +42,7 @@ In order to setup the MySQL CDC connector, the following table provides dependen
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-mysql-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-mysql-cdc >}}
### SQL Client JAR
@ -50,7 +50,7 @@ In order to setup the MySQL CDC connector, the following table provides dependen
Download flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-mysql-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mysql-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-connector-mysql-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-mysql-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-mysql-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-mysql-cdc), which is available in the Maven Central repository.
Setup MySQL server
----------------
@ -697,8 +697,6 @@ public class MySqlSourceExample {
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
### Scan Newly Added Tables
The Scan Newly Added Tables feature lets you add new tables to monitor to an existing, running pipeline; the newly added tables first read their snapshot data and then automatically continue with their changelog. A DataStream sketch of enabling the feature follows below.
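The following is a minimal DataStream sketch, not taken from this connector's reference, assuming the `com.ververica.cdc.*` package layout of the 2.x/3.0 releases (adjust the imports to your Flink CDC version); host, credentials and table names are placeholders:
```java
import com.ververica.cdc.connectors.mysql.source.MySqlSource;
import com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema;

public class NewlyAddedTablesExample {
    public static void main(String[] args) {
        MySqlSource<String> mySqlSource =
                MySqlSource.<String>builder()
                        .hostname("yourHostname")              // placeholder connection settings
                        .port(3306)
                        .databaseList("yourDatabaseName")
                        .tableList("yourDatabaseName.tbl_.*")  // regex, so later-created matching tables can be picked up
                        .username("yourUsername")
                        .password("yourPassword")
                        .scanNewlyAddedTableEnabled(true)      // enable reading newly added tables
                        .deserializer(new JsonDebeziumDeserializationSchema())
                        .build();
        // After extending the table list, restart the job from a savepoint so the new
        // tables are snapshotted first and then read from the binlog.
    }
}
```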
@ -1107,7 +1105,4 @@ The example for different spatial data types mapping is as follows:
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "OceanBase CDC Connector"
weight: 4
type: docs
aliases:
- /connectors/cdc-connectors/oceanbase-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -26,14 +33,7 @@ Dependencies
In order to set up the OceanBase CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
```xml
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-oceanbase-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-oceanbase-cdc >}}
If you want to use OceanBase JDBC driver to connect to the enterprise edition database, you should also include the following dependency in your class path.
@ -49,9 +49,9 @@ If you want to use OceanBase JDBC driver to connect to the enterprise edition da
```Download link is available only for stable releases.```
Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oceanbase-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oceanbase-cdc), which is available in the Maven Central repository.
For JDBC driver, the cdc jar above already contains MySQL JDBC driver 5.1.47, which is our recommended version. Due to the license issue, we can not include the OceanBase JDBC driver in the cdc jar. If you need to use it, you can download it from [here](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar) and put it under `<FLINK_HOME>/lib/`, you also need to set the start option `jdbc.driver` to `com.oceanbase.jdbc.Driver`.
@ -786,3 +786,5 @@ Data Type Mapping
</tbody>
</table>
</div>
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Oracle CDC Connector"
weight: 5
type: docs
aliases:
- /connectors/cdc-connectors/oracle-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,22 +35,15 @@ In order to setup the Oracle CDC connector, the following table provides depende
### Maven dependency
```xml
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-oracle-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-oracle-cdc >}}
### SQL Client JAR
**Download link is available only for stable releases.**
Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-oracle-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oracle-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-oracle-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oracle-cdc), which is available in the Maven Central repository.
Setup Oracle
----------------
@ -588,8 +588,6 @@ public class OracleSourceExample {
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
Data Type Mapping
----------------
<div class="wy-table-responsive">
@ -700,7 +698,4 @@ Data Type Mapping
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Overview"
weight: 1
type: docs
aliases:
- /connectors/cdc-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -22,27 +29,27 @@ under the License.
CDC Connectors for Apache Flink<sup>®</sup> is a set of source connectors for <a href="https://flink.apache.org/">Apache Flink<sup>®</sup></a>, ingesting changes from different databases using change data capture (CDC).
The CDC Connectors for Apache Flink<sup>®</sup> integrate Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium).
![Flink_CDC](/_static/fig/flinkcdc.png "Flink CDC")
{{< img src="/fig/cdc-flow.png" width="600px" alt="Flink CDC" >}}
## Supported Connectors
| Connector | Database | Driver |
|----------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
| [mongodb-cdc](../connectors/mongodb-cdc.md) | <li> [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0 | MongoDB Driver: 4.3.4 |
| [mysql-cdc](../connectors/mysql-cdc.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 |
| [oceanbase-cdc](../connectors/oceanbase-cdc.md) | <li> [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x <li> [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x |
| [oracle-cdc](../connectors/oracle-cdc.md) | <li> [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 |
| [postgres-cdc](../connectors/postgres-cdc.md) | <li> [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 |
| [sqlserver-cdc](../connectors/sqlserver-cdc.md) | <li> [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 |
| [tidb-cdc](../connectors/tidb-cdc.md) | <li> [TiDB](https://www.pingcap.com/): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 |
| [db2-cdc](../connectors/db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
| [vitess-cdc](../connectors/vitess-cdc.md) | <li> [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 |
|-----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
| [mongodb-cdc](mongodb-cdc.md) | <li> [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0 | MongoDB Driver: 4.3.4 |
| [mysql-cdc](mysql-cdc.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 |
| [oceanbase-cdc](oceanbase-cdc.md) | <li> [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x <li> [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x |
| [oracle-cdc](oracle-cdc.md) | <li> [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 |
| [postgres-cdc](postgres-cdc.md) | <li> [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 |
| [sqlserver-cdc](sqlserver-cdc.md) | <li> [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 |
| [tidb-cdc](tidb-cdc.md) | <li> [TiDB](https://www.pingcap.com/): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 |
| [db2-cdc](db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
| [vitess-cdc](vitess-cdc.md) | <li> [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 |
## Supported Flink Versions
The following table shows the version mapping between Flink<sup>®</sup> CDC Connectors and Flink<sup>®</sup>:
| Flink<sup>®</sup> CDC Version | Flink<sup>®</sup> Version |
|:-----------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| <font color="DarkCyan">1.0.0</font> | <font color="MediumVioletRed">1.11.*</font> |
| <font color="DarkCyan">1.1.0</font> | <font color="MediumVioletRed">1.11.*</font> |
| <font color="DarkCyan">1.2.0</font> | <font color="MediumVioletRed">1.12.*</font> |
@ -64,16 +71,16 @@ The following table shows the version mapping between Flink<sup>®</sup> CDC Con
The following table shows the current features of the connector:
| Connector | No-lock Read | Parallel Read | Exactly-once Read | Incremental Snapshot Read |
|----------------------------------------------|--------------|---------------|-------------------|---------------------------|
| [mongodb-cdc](../connectors/mongodb-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [mysql-cdc](../connectors/mysql-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oracle-cdc](../connectors/oracle-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [postgres-cdc](../connectors/postgres-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [sqlserver-cdc](../connectors/sqlserver-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oceanbase-cdc](../connectors/oceanbase-cdc.md) | ❌ | ❌ | ❌ | ❌ |
| [tidb-cdc](../connectors/tidb-cdc.md) | ✅ | ❌ | ✅ | ❌ |
| [db2-cdc](../connectors/db2-cdc.md) | ❌ | ❌ | ✅ | ❌ |
| [vitess-cdc](../connectors/vitess-cdc.md) | ✅ | ❌ | ✅ | ❌ |
|-----------------------------------|--------------|---------------|-------------------|---------------------------|
| [mongodb-cdc](mongodb-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [mysql-cdc](mysql-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oracle-cdc](oracle-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [postgres-cdc](postgres-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [sqlserver-cdc](sqlserver-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oceanbase-cdc](oceanbase-cdc.md) | ❌ | ❌ | ❌ | ❌ |
| [tidb-cdc](tidb-cdc.md) | ✅ | ❌ | ✅ | ❌ |
| [db2-cdc](db2-cdc.md) | ❌ | ❌ | ✅ | ❌ |
| [vitess-cdc](vitess-cdc.md) | ✅ | ❌ | ✅ | ❌ |
## Usage for Table/SQL API
@ -114,10 +121,10 @@ Include following Maven dependency (available through Maven Central):
```
<dependency>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<!-- add the dependency matching your database -->
<artifactId>flink-connector-mysql-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
@ -297,97 +304,4 @@ you can construct `JsonDebeziumDeserializationSchema` as following:
new JsonDebeziumDeserializationSchema(true, customConverterConfigs);
```
## Building from source
Prerequisites:
- git
- Maven
- At least Java 8
```
git clone https://github.com/ververica/flink-cdc-connectors.git
cd flink-cdc-connectors
mvn clean install -DskipTests
```
The dependencies are now available in your local `.m2` repository.
### Code Contribute
1. Leave a comment under the issue that you want to take
2. Fork the Flink CDC project to your own GitHub repository
![fork](/_static/fig/contribute_guidance/fork.png "fork")
3. Clone and compile your Flink CDC project
```bash
git clone https://github.com/your_name/flink-cdc-connectors.git
cd flink-cdc-connectors
mvn clean install -DskipTests
```
4. Check out a new branch and start your work
```bash
git checkout -b my_feature
-- develop and commit
```
![check_branch](/_static/fig/contribute_guidance/check_branch.png "check_branch")
5. Push your branch to your GitHub repository
```bash
git push origin my_feature
```
6. Open a PR to https://github.com/ververica/flink-cdc-connectors
![open_pr](/_static/fig/contribute_guidance/open_pr.png "open_pr")
### Code Style
#### Code Formatting
You need to install the google-java-format plugin. Spotless together with google-java-format is used to format the code.
It is recommended to automatically format your code by applying the following settings:
1. Go to "Settings" → "Other Settings" → "google-java-format Settings".
2. Tick the checkbox to enable the plugin.
3. Change the code style to "Android Open Source Project (AOSP) style".
4. Go to "Settings" → "Tools" → "Actions on Save".
5. Under "Formatting Actions", select "Optimize imports" and "Reformat file".
6. From the "All file types list" next to "Reformat code", select "Java".
For earlier IntelliJ IDEA versions, steps 4 to 7 change as follows:
- 4. Go to "Settings" → "Other Settings" → "Save Actions".
- 5. Under "General", enable your preferred settings for when to format the code, e.g. "Activate save actions on save".
- 6. Under "Formatting Actions", select "Optimize imports" and "Reformat file".
- 7. Under "File Path Inclusions", add an entry for `.*\.java` to avoid formatting other file types.
The whole project can then be formatted with the command `mvn spotless:apply`.
#### Checkstyle
Checkstyle is used to enforce static coding guidelines.
1. Go to "Settings" → "Tools" → "Checkstyle".
2. Set "Scan Scope" to "Only Java sources (including tests)".
3. For "Checkstyle Version" select "8.14".
4. Under "Configuration File" click the "+" icon to add a new configuration.
5. Set "Description" to "Flink cdc".
6. Select "Use a local Checkstyle file" and link it to the file `tools/maven/checkstyle.xml` which is located within your cloned repository.
7. Select "Store relative to project location" and click "Next".
8. Configure the property `checkstyle.suppressions.file` with the value `suppressions.xml` and click "Next".
9. Click "Finish".
10. Select "Flink cdc" as the only active configuration file and click "Apply".
You can now import the Checkstyle configuration for the Java code formatter.
1. Go to "Settings" → "Editor" → "Code Style" → "Java".
2. Click the gear icon next to "Scheme" and select "Import Scheme" → "Checkstyle Configuration".
3. Navigate to and select `tools/maven/checkstyle.xml` located within your cloned repository.
Then you could click "View" → "Tool Windows" → "Checkstyle" and find the "Check Module" button in the opened tool window to validate checkstyle. Or you can use the command `mvn clean compile checkstyle:checkstyle` to validate.
### Documentation Contribute
The Flink CDC documentation is located at `docs/content`.
The contribution steps are the same as for code contributions. We use Markdown as the source format of the documentation.
## License
The code in this repository is licensed under the [Apache Software License 2](https://github.com/ververica/flink-cdc-connectors/blob/master/LICENSE).
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Postgres CDC Connector"
weight: 6
type: docs
aliases:
- /connectors/cdc-connectors/postgres-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,14 +35,7 @@ In order to setup the Postgres CDC connector, the following table provides depen
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-postgres-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-postgres-cdc >}}
### SQL Client JAR
@ -43,7 +43,7 @@ In order to setup the Postgres CDC connector, the following table provides depen
Download flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-postgres-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-postgres-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-postgres-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-postgres-cdc), which is available in the Maven Central repository.
How to create a Postgres CDC table
----------------
@ -521,7 +521,6 @@ public class PostgreSQLSourceExample {
}
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
Data Type Mapping
----------------
@ -618,7 +617,4 @@ Data Type Mapping
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "SQLServer CDC Connector"
weight: 7
type: docs
aliases:
- /connectors/cdc-connectors/sqlserver-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,22 +35,15 @@ In order to setup the SQLServer CDC connector, the following table provides depe
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-sqlserver-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-sqlserver-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-sqlserver-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-sqlserver-cdc), which is available in the Maven Central repository.
Setup SQLServer Database
----------------
@ -408,7 +408,6 @@ public class SqlServerIncrementalSourceExample {
}
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
Data Type Mapping
----------------
@ -504,3 +503,5 @@ Data Type Mapping
</tbody>
</table>
</div>
{{< top >}}

@ -1,3 +1,10 @@
---
title: "TiDB CDC Connector"
weight: 8
type: docs
aliases:
- /connectors/cdc-connectors/tidb-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,22 +35,15 @@ In order to setup the TiDB CDC connector, the following table provides dependenc
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-tidb-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-tidb-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-tidb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-tidb-cdc), the released version will be available in the Maven central warehouse.
**Note:** The flink-sql-connector-tidb-cdc-XXX-SNAPSHOT version corresponds to the development branch, so users need to download the source code and build the jar themselves. Users should use a released version, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-tidb-cdc), which is available in the Maven Central repository.
How to create a TiDB CDC table
----------------
@ -492,3 +492,5 @@ Data Type Mapping
</tbody>
</table>
</div>
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Vitess CDC Connector"
weight: 10
type: docs
aliases:
- /connectors/cdc-connectors/vitess-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -29,17 +36,11 @@ In order to setup the Vitess CDC connector, the following table provides depende
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-vitess-cdc</artifactId>
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-vitess-cdc >}}
### SQL Client JAR
Download [flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-vitess-cdc/3.0-SNAPSHOT/flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Download [flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-vitess-cdc/3.0-SNAPSHOT/flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Setup Vitess server
----------------
@ -249,8 +250,6 @@ public class VitessSourceExample {
}
```
**Note:** Please refer [Deserialization](../about.html#deserialization) for more details about the JSON deserialization.
Data Type Mapping
----------------
@ -326,3 +325,5 @@ Data Type Mapping
</tbody>
</table>
</div>
{{< top >}}

@ -1,3 +1,8 @@
---
title: Pipeline Connectors
bookCollapseSection: true
weight: 1
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -16,12 +21,3 @@ KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Overview
```{toctree}
:maxdepth: 2
:caption: Contents
cdc-connectors
cdc-pipeline
```

@ -1,3 +1,10 @@
---
title: "Doris Pipeline Connector"
weight: 2
type: docs
aliases:
- /pipelines/doris-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,7 +35,7 @@ This article introduces of Doris Pipeline Connector
```yaml
source:
type: values
name:ValuesSource
name: ValuesSource
sink:
type: doris
@ -277,12 +284,4 @@ pipeline:
</table>
</div>
## FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(Chinese)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MySQL Pipeline Connector"
weight: 3
type: docs
aliases:
- /pipelines/mysql-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -560,7 +567,4 @@ The example for different spatial data types mapping is as follows:
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -0,0 +1,44 @@
---
title: "Overview"
weight: 1
type: docs
aliases:
- /connectors/pipeline-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Pipeline Connectors Of CDC Streaming ELT Framework
## Supported Connectors
| Connector | Database |
|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [doris-pipeline](doris-pipeline.md) | <li> [Doris](https://doris.apache.org/): 1.2.x, 2.x.x |
| [mysql-pipeline](mysql-pipeline.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 |
| [starrocks-pipeline](starrocks-pipeline.md) | <li> [StarRocks](https://www.starrocks.io/): 2.x, 3.x |
## Supported Flink Versions
The following table shows the version mapping between Flink<sup>®</sup> CDC Pipeline and Flink<sup>®</sup>:
| Flink<sup>®</sup> CDC Version | Flink<sup>®</sup> Version |
|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| <font color="DarkCyan">3.0.*</font> | <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font>, <font color="MediumVioletRed">1.17.\*</font>, <font color="MediumVioletRed">1.18.\*</font> |
{{< top >}}

@ -1,3 +1,10 @@
---
title: "StarRocks Pipeline Connector"
weight: 4
type: docs
aliases:
- /pipelines/starrocks-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -328,7 +335,4 @@ Data Type Mapping
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -0,0 +1,26 @@
---
title: Development
icon: <i class="fa fa-code title maindish" aria-hidden="true"></i>
bold: true
sectionBreak: true
bookCollapseSection: true
weight: 2
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -1,3 +1,12 @@
---
title: "Building a Real-time Data Lake with Flink CDC"
weight: 999
type: docs
aliases:
- /development/build-real-time-data-lake-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -27,7 +36,7 @@ You can walk through the tutorial easily in the docker environment. The entire p
The following sections will take the pipeline from MySQL to [Iceberg](https://iceberg.apache.org/) as an example. The overview of the architecture is as follows:
![Real-time data lake with Flink CDC](/_static/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png "architecture of real-time data lake")
{{< img src="/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png" alt="Real-time data lake with Flink CDC" >}}
You can also use other data sources like Oracle/Postgres and sinks like Hudi to build your own pipeline.
@ -35,7 +44,7 @@ You can also use other data sources like Oracle/Postgres and sinks like Hudi to
Prepare a Linux or MacOS computer with Docker installed.
## Preparing JAR package required
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.**
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- [flink-shaded-hadoop-2-uber-2.7.5-10.0.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.7.5-10.0/flink-shaded-hadoop-2-uber-2.7.5-10.0.jar)
- [iceberg-flink-runtime-1.16-1.3.1.jar](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/1.3.1/iceberg-flink-runtime-1.16-1.3.1.jar)
@ -131,7 +140,7 @@ The components required in this tutorial are all managed in containers, so we wi
* If you want to run with your own Flink environment, remember to download the jar packages and then put them to `FLINK_HOME/lib/`.
* All the following commands involving `docker-compose` should be executed in the directory of the `docker-compose.yml` file.
![Flink UI](/_static/fig/real-time-data-lake-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/real-time-data-lake-tutorial/flink-ui.png" alt="Flink UI" >}}
### Preparing data in databases
1. Enter mysql's container:
@ -192,7 +201,7 @@ docker-compose run sql-client
We should see the welcome screen of the CLI client:
![Flink SQL Client](/_static/fig/real-time-data-lake-tutorial/flink-sql-client.png "Flink SQL Client" )
{{< img src="/fig/real-time-data-lake-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
Then do the following steps in Flink SQL CLI:
@ -263,7 +272,7 @@ Then do the following steps in Flink SQL CLI:
The running job can be found in [Flink UI](http://localhost:8081/#/job/running), and it looks like:
![CDC to Iceberg Running Job](/_static/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png "CDC to Iceberg Running Job")
{{< img src="/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png" alt="CDC to Iceberg Running Job" >}}
Then, we can use the following command to see the files written to Iceberg:
```shell
@ -271,7 +280,7 @@ Then do the following steps in Flink SQL CLI:
```
It should look like:
![Files in Iceberg](/_static/fig/real-time-data-lake-tutorial/files-in-iceberg.png "Files in Iceberg")
{{< img src="/fig/real-time-data-lake-tutorial/files-in-iceberg.png" alt="Files in Iceberg" >}}
The actual files may differ in your environment, but the structure of the directory should be similar.
@ -282,7 +291,7 @@ Then do the following steps in Flink SQL CLI:
```
We can see the data queried in the Flink SQL CLI:
![Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/data_in_iceberg.png "Data in Iceberg")
{{< img src="/fig/real-time-data-lake-tutorial/data_in_iceberg.png" alt="Data in Iceberg" >}}
3. Make some changes in the MySQL databases, and then the data in Iceberg table `all_users_sink` will also change in real time.
@ -308,7 +317,7 @@ Then do the following steps in Flink SQL CLI:
The final query result is as follows:
![Final Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png "Final Data in Iceberg")
{{< img src="/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png" alt="Final Data in Iceberg" >}}
From the latest result in Iceberg, we can see that there is a new record of `(db_1, user_1, 111)`, and the address of `(db_1, user_2, 120)` has been updated to `Beijing`.
Besides, the record of `(db_2, user_2, 220)` has been deleted. The result is exactly the same with the changes we did in MySQL.
@ -319,4 +328,4 @@ After finishing the tutorial, run the following command in the directory of `doc
docker-compose down
```
{{< top >}}

@ -1,3 +1,11 @@
---
title: "CDC Streaming ELT Framework Concepts"
weight: 1
type: docs
aliases:
- /development/concept-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -23,31 +31,15 @@ under the License.
CDC Streaming ELT Framework is a stream data integration framework that aims to provide users with a more robust API. It allows users to configure their data synchronization logic through customized Flink operators and job submission tools. The framework prioritizes optimizing the task submission process and offers enhanced functionalities such as whole database synchronization, sharding, and schema change synchronization.
## What can CDC Streaming ELT Framework do?
![CDC Architecture](/_static/fig/architecture.png "CDC Architecture")
{{< img src="/fig/architecture.png" alt="CDC Architecture" >}}
* ✅ End-to-end data integration framework
* ✅ API for data integration users to build jobs easily
* ✅ Multi-table support in Source / Sink
* ✅ Synchronization of entire databases
* ✅ Schema evolution capability
## Supported Connectors
| Connector | Database |
|----------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [doris-pipeline](../pipelines/doris-pipeline.md) | <li> [Doris](https://doris.apache.org/): 1.2.x, 2.x.x |
| [mysql-pipeline](../pipelines/mysql-pipeline.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 |
| [starrocks-pipeline](../pipelines/starrocks-pipeline.md) | <li> [StarRocks](https://www.starrocks.io/): 2.x, 3.x |
## Supported Flink Versions
The following table shows the version mapping between Flink<sup>®</sup> CDC Pipeline and Flink<sup>®</sup>:
| Flink<sup>®</sup> CDC Version | Flink<sup>®</sup> Version |
|:-----------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| <font color="DarkCyan">3.0.*</font> | <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font>, <font color="MediumVioletRed">1.17.\*</font>, <font color="MediumVioletRed">1.18.\*</font> |
## Core Concepts
![CDC Design](/_static/fig/design.png "CDC Design")
{{< img src="/fig/design.png" alt="CDC Design" >}}
The data types flowing in the Flink CDC 3.0 framework are referred to as **Event**, which represent the change events generated by external systems.
Each event is marked with a **Table ID** for which the change occurred. Events are categorized into `SchemaChangeEvent` and `DataChangeEvent`, representing changes in table structure and data respectively.
@ -134,3 +126,5 @@ pipeline:
name: mysql-to-kafka-pipeline
parallelism: 1
```
{{< top >}}

@ -1,3 +1,11 @@
---
title: "DataStream Api Package Guidance"
weight: 999
type: docs
aliases:
- /development/datastream-api-package-guidance.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,7 +25,7 @@ specific language governing permissions and limitations
under the License.
-->
# DataStream api package guidance
# DataStream Api Package Guidance
This guide provides a simple POM example for the MySQL CDC DataStream API.
@ -34,7 +42,7 @@ flink 1.17.2 flink mysql cdc 2.4.2
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<artifactId>FlinkCDCTest</artifactId>
<version>1.0-SNAPSHOT</version>
@ -113,7 +121,7 @@ flink 1.17.2 flink mysql cdc 2.4.2
<version>30.1.1-jre-16.1</version>
</dependency>
<dependency>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-mysql-cdc</artifactId>
<version>2.4.2</version>
</dependency>
@ -164,8 +172,8 @@ flink 1.17.2 flink mysql cdc 2.4.2
<include>io.debezium:debezium-core</include>
<include>io.debezium:debezium-ddl-parser</include>
<include>io.debezium:debezium-connector-mysql</include>
<include>com.ververica:flink-connector-debezium</include>
<include>com.ververica:flink-connector-mysql-cdc</include>
<include>org.apache.flink:flink-connector-debezium</include>
<include>org.apache.flink:flink-connector-mysql-cdc</include>
<include>org.antlr:antlr4-runtime</include>
<include>org.apache.kafka:*</include>
<include>mysql:mysql-connector-java</include>
@ -228,7 +236,7 @@ flink 1.17.2 flink mysql cdc 2.4.2
## code example
```java
package com.ververica.flink.cdc;
package org.apache.flink.flink.cdc;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
@ -265,3 +273,4 @@ public class CdcTest {
```
{{< top >}}

@ -0,0 +1,25 @@
---
title: "FAQ"
icon: <i class="fa fa-question title appetizer" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 4
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,330 @@
---
title: "FAQ"
weight: 1
type: docs
aliases:
- /faq/faq.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## General FAQ
### Q1: Why can't I download the flink-sql-connector-mysql-cdc-2.2-SNAPSHOT jar? Why doesn't the Maven repository host xxx-SNAPSHOT versions?
Like mainstream Maven version management, xxx-SNAPSHOT versions correspond to the code of the development branch, so users need to download the source code and build the jar themselves. Users should use released versions, such as flink-sql-connector-mysql-cdc-2.1.0.jar, which are available in the Maven Central repository.
### Q2: When should I use flink-sql-connector-xxx.jar, and when flink-connector-xxx.jar? What is the difference between the two?
The dependency management of each connector in the Flink CDC project is consistent with that of the Flink project. flink-sql-connector-xxx is a fat jar: besides the connector code, it shades all of the connector's third-party dependencies and is intended for SQL jobs, so users only need to put the fat jar into the flink/lib directory. flink-connector-xxx contains only the connector code without its dependencies and is intended for DataStream jobs, so users have to manage the required third-party dependencies themselves and exclude or shade conflicting dependencies on their own.
### Q3: Why was the package name changed from com.alibaba.ververica to org.apache.flink? Why can't the 2.x versions be found in the Maven repository?
The Flink CDC project changed the group ID from com.alibaba.ververica to org.apache.flink starting with version 2.0.0, to make the project more community-neutral and easier for developers from different companies to build on. So when looking for 2.x packages in the Maven repository, use the path /org/apache/flink.
## MySQL CDC FAQ
### Q1: I use a CDC 2.x version and can only read the full snapshot data, not the binlog data. What is wrong?
CDC 2.0 supports a lock-free algorithm and concurrent reading. To guarantee the order of full + incremental data, it relies on Flink's checkpoint mechanism, so checkpointing must be enabled for the job.
Configuration in a SQL job:
```sql
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
```
Configuration in a DataStream job:
```java
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(3000);
```
### Q2: When using the MySQL CDC DataStream API, the timestamp field read in the incremental phase is off by 8 hours. What is wrong?
When parsing the timestamp field in binlog data, CDC uses the server time zone configured in the job, which should be the time zone of the MySQL server. If the configured time zone does not match the actual time zone of your MySQL server, this problem occurs.
In addition, if a custom deserializer is used in the DataStream job (for example, MyDeserializer implements DebeziumDeserializationSchema), the custom deserializer needs to parse timestamp-type data the way RowDataDebeziumDeserializeSchema does, using the given server time zone:
```java
private TimestampData convertToTimestamp(Object dbzObj, Schema schema) {
if (dbzObj instanceof Long) {
switch (schema.name()) {
case Timestamp.SCHEMA_NAME:
return TimestampData.fromEpochMillis((Long) dbzObj);
case MicroTimestamp.SCHEMA_NAME:
long micro = (long) dbzObj;
return TimestampData.fromEpochMillis(micro / 1000, (int) (micro % 1000 * 1000));
case NanoTimestamp.SCHEMA_NAME:
long nano = (long) dbzObj;
return TimestampData.fromEpochMillis(nano / 1000_000, (int) (nano % 1000_000));
}
}
LocalDateTime localDateTime = TemporalConversions.toLocalDateTime(dbzObj, serverTimeZone);
return TimestampData.fromLocalDateTime(localDateTime);
}
```
### Q3: Does MySQL CDC support reading from a replica (slave) database? How should the replica be configured?
Yes. The replica needs to be configured with `log-slave-updates = 1` so that it also writes the data synchronized from the primary instance into its own binlog. If the primary database has GTID mode enabled, the replica needs it enabled as well:
```
log-slave-updates = 1
gtid_mode = on
enforce_gtid_consistency = on
```
### Q4: I want to synchronize sharded databases and tables. How should I configure this?
In the WITH options of the MySQL CDC table, both `table-name` and `database-name` support regular expressions, such as 'table-name' = 'user_.*', which matches tables like user_1, user_2 and user_a.
Note that the pattern for matching anything is the regular expression '.*' rather than the glob '*': the dot matches any single character and the asterisk means zero or more repetitions. The same applies to `database-name`; the tables to be merged should share the same schema.
### Q5: I want to skip the snapshot (full) reading phase and only read binlog data. How do I configure this?
Set the following in the WITH options of the MySQL CDC table (a DataStream sketch follows after the snippet):
```
'scan.startup.mode' = 'latest-offset'.
```
### Q6: I want to get DDL events in the database. What should I do? Is there a demo?
Flink CDC has provided the DataStream API `MySqlSource` since version 2.1. Users can configure `includeSchemaChanges` to indicate whether DDL events are required, and then implement their own handling of the obtained DDL events.
```java
public void consumingAllEvents() throws Exception {
inventoryDatabase.createAndInitialize();
MySqlSource<String> mySqlSource =
MySqlSource.<String>builder()
.hostname(MYSQL_CONTAINER.getHost())
.port(MYSQL_CONTAINER.getDatabasePort())
.databaseList(inventoryDatabase.getDatabaseName())
.tableList(inventoryDatabase.getDatabaseName() + ".products")
.username(inventoryDatabase.getUsername())
.password(inventoryDatabase.getPassword())
.serverId("5401-5404")
.deserializer(new JsonDebeziumDeserializationSchema())
.includeSchemaChanges(true) // Configure here and output DDL events
.build();
... // Other processing logic
}
```
### Q7: How to synchronize the whole MySQL database? Does Flink CDC support it?
The DataStream API provided in Q6 has enabled users to obtain DDL change events and data change events. On this basis, users need to develop DataStream jobs according to their own business logic and downstream storage.
### Q8: In the same MySQL instance, a table in one database cannot synchronize incremental data, while other databases work fine. Why?
Check Binlog_Ignore_DB and Binlog_Do_DB via the `show master status` command:
```mysql
mysql> show master status;
+------------------+----------+--------------+------------------+----------------------+
| File | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set |
+------------------+----------+--------------+------------------+----------------------+
| mysql-bin.000006 | 4594 | | | xxx:1-15 |
+------------------+----------+--------------+------------------+----------------------+
```
### Q9: The job reports an error the connector is trying to read binlog starting at GTIDs xxx and binlog file 'binlog.000064', pos=89887992, skipping 4 events plus 1 rows, but this is no longer available on the server. Reconfigure the connector to use a snapshot when needed, What should I do?
This error occurs because the binlog file being read by the job has been cleaned up on the MySQL server. Generally, the expiration time of the binlog file retained on the MySQL server is too short. You can set this value higher, such as 7 days.
```mysql
mysql> show variables like 'expire_logs_days';
mysql> set global expire_logs_days=7;
```
Another cause is that the Flink CDC job consumes the binlog too slowly; in that case, allocating more resources to the job usually helps.
### Q10: The job reports an error ConnectException: A slave with the same server_uuid/server_id as this slave has connected to the master. What should I do?
This error occurs because the server ID used in the job conflicts with the server ID used by another job or another synchronization tool. Server IDs must be globally unique (a server ID is an int-type integer). In CDC 2.x, each parallel instance of the source requires a server ID, so it is recommended to plan them carefully; for example, if the job's source parallelism is four, you can configure 'server-id' = '5001-5004' so that each source task uses a different ID and no conflict occurs.
### Q11: The job reports an error ConnectException: Received DML ‘…’ for processing, binlog probably contains events generated with statement or mixed based replication format. What should I do?
This error occurs because the MySQL server is not configured correctly. You need to check whether the binlog format is ROW, which you can do with the following command:
```mysql
mysql> show variables like '%binlog_format%';
```
### Q12: The job reports an error MySQL 8.0 Public Key Retrieval is not allowed. What should I do?
This happens because the configured MySQL user uses SHA-256 password authentication, which requires TLS or a similar protocol to transmit the password. A simple workaround is to switch the MySQL user to native password authentication:
```mysql
mysql> ALTER USER 'username'@'localhost' IDENTIFIED WITH mysql_native_password BY 'password';
mysql> FLUSH PRIVILEGES;
```
### Q13: The job reports an error EventDataDeserializationException: Failed to deserialize data of EventHeaderV4 .... Caused by: java.net.SocketException: Connection reset. What should I do?
This problem is usually caused by the network. First check the network between the Flink cluster and the database, then increase the network-related timeouts on the MySQL server:
```mysql
mysql> set global slave_net_timeout = 120;
mysql> set global thread_pool_idle_timeout = 120;
```
Alternatively, try the following Flink configuration:
```
execution.checkpointing.interval=10min
execution.checkpointing.tolerable-failed-checkpoints=100
restart-strategy=fixed-delay
restart-strategy.fixed-delay.attempts=2147483647
restart-strategy.fixed-delay.delay= 30s
```
Severe backpressure in the job can also cause this problem; in that case, resolve the backpressure in the job first.
### Q14: The job reports an error The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires. What should I do?
This happens because the snapshot (full) phase of the job takes too long: by the time it finishes, the GTID position recorded at the beginning of the snapshot phase has already been purged by MySQL. To fix this, either increase the retention time of binlog files on the MySQL server or increase the source parallelism so that the snapshot phase finishes faster.
### Q15: How to configure the `tableList` option when building a MySQL CDC source in the DataStream API?
In the DataStream API, the `tableList` option expects fully qualified table names (database name plus table name) rather than bare table names. For the MySQL CDC source, the `tableList` option value should look like `my_db.my_table`.
## Postgres CDC FAQ
### Q1: The disk utilization of the Postgres server is high. Why is the WAL not released?
Flink Postgres CDC only advances the LSN in the Postgres replication slot when a checkpoint completes. Therefore, if you see high disk utilization, first confirm that checkpointing is enabled.
### Q2: Flink Postgres CDC returns null for decimal values exceeding the maximum precision (38, 18) when synchronizing Postgres. Why?
In Flink, if the precision of the received data is greater than the precision of the type declared in Flink, the value is treated as null. You can configure 'debezium.decimal.handling.mode' = 'string' to read such values as strings.
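As a minimal sketch (connection settings and columns are placeholders), the option is passed through in the WITH clause of the Postgres CDC table; the Debezium options mentioned in Q3 and Q5 below are set the same way:
```sql
-- Hypothetical Flink SQL example: read oversized decimals as strings via a
-- 'debezium.*' pass-through option on the Postgres CDC table.
CREATE TABLE pg_orders (
  id INT,
  amount STRING,  -- declared as STRING because the source precision exceeds DECIMAL(38, 18)
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'orders',
  'slot.name' = 'flink_slot',
  'debezium.decimal.handling.mode' = 'string'
);
```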
### Q3: Flink Postgres CDC reports that TOAST data is not transmitted. What is the reason?
First, make sure the replica identity of the table is FULL. TOAST values are relatively large, so to keep the WAL small, the wal2json plugin does not include TOAST data in an update record when the TOAST columns are unchanged. To avoid this problem, you can set 'debezium.schema.refresh.mode' = 'columns_diff_exclude_unchanged_toast'.
### Q4: The job reports an error replication slot "XXXX" is active. What should I do?
Currently, Flink Postgres CDC does not release the replication slot automatically when the job exits. There are two ways to solve this problem:
- Connect to Postgres and manually execute the following commands:
```
select pg_drop_replication_slot('rep_slot');
ERROR: replication slot "rep_slot" is active for PID 162564
select pg_terminate_backend(162564); select pg_drop_replication_slot('rep_slot');
```
- Add 'debezium.slot.drop.on.stop' = 'true' to the WITH options of the Postgres source so that the slot is dropped automatically after the job stops, as sketched below.
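A minimal sketch of the second approach (connection settings are placeholders):
```sql
-- Hypothetical Flink SQL example: drop the replication slot automatically when the job stops.
CREATE TABLE pg_source (
  id INT,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'orders',
  'slot.name' = 'rep_slot',
  'debezium.slot.drop.on.stop' = 'true'
);
```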
### Q5: The job encounters dirty data, such as illegal dates. Is there an option to filter it out?
Yes. You can add 'debezium.event.deserialization.failure.handling.mode' = 'warn' to the WITH options of the Flink CDC table to skip dirty data and print it to the WARN log. You can also set it to 'ignore' to skip dirty data silently without logging it.
### Q6: How to configure the `tableList` option when building a Postgres CDC source in the DataStream API?
In the DataStream API, the `tableList` option expects fully qualified table names (schema name plus table name) rather than bare table names. For the Postgres CDC source, the `tableList` option value should look like `my_schema.my_table`.
## MongoDB CDC FAQ
### Q1: Does MongoDB CDC support full + incremental reading and incremental-only reading?
Yes. The default is full + incremental reading; setting the 'copy.existing' = 'false' parameter switches to incremental-only reading.
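A minimal sketch using the parameter named above (connection settings and columns are placeholders):
```sql
-- Hypothetical Flink SQL example: incremental-only reading, skipping the initial snapshot.
CREATE TABLE mongodb_orders (
  _id STRING,
  order_status STRING,
  PRIMARY KEY (_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database' = 'inventory',
  'collection' = 'orders',
  'copy.existing' = 'false'
);
```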
### Q2: Does MongoDB CDC support recovery from a checkpoint? How does it work?
Yes. The checkpoint records the resume token of the change stream, and during recovery the change stream can be restored from that resume token. The resume token corresponds to a position in `oplog.rs` (the MongoDB change log collection), which is a capped, fixed-size collection. If the record corresponding to the resume token no longer exists in `oplog.rs`, an invalid resume token exception may occur. In this case, you can set an appropriate size for `oplog.rs` so that its retention time is not too short; see https://docs.mongodb.com/manual/tutorial/change-oplog-size/. In addition, the resume token is refreshed by newly arriving change records and heartbeat records.
### Q3: Does MongoDB CDC support outputting -U (update_before) messages?
MongoDB's original `oplog.rs` only contains insert, update, replace and delete operation types and does not retain the pre-update image, so it cannot output -U messages directly and can only provide UPSERT semantics in Flink. When MongoDBTableSource is used, the Flink planner automatically applies the ChangelogNormalize optimization, which fills in the missing -U messages and produces a complete changelog of +I, -U, +U and -D messages. The cost of ChangelogNormalize is that the operator keeps the state of all previous keys. If a DataStream job uses MongoDBSource directly, without the planner optimization, ChangelogNormalize is not applied automatically, so -U messages cannot be obtained directly. To get the pre-update image you need to manage the state yourself; if you do not want to do that, you can convert MongoDBTableSource into a changelog stream or retract stream and let the Flink planner supplement the pre-update image. An example is as follows:
```java
tEnv.executeSql("CREATE TABLE orders ( ... ) WITH ( 'connector'='mongodb-cdc',... )");
Table table = tEnv.from("orders")
.select($("*"));
tEnv.toChangelogStream(table)
.print()
.setParallelism(1);
env.execute();
```
### Q4: Does MongoDB CDC support subscribing to multiple collections?
You can subscribe to all collections of a whole database, but filtering to a subset of collections is not yet supported. For example, if the database is configured as 'mgdb' and the collection is an empty string, all collections under the 'mgdb' database will be subscribed.
### Q5: Does MongoDB CDC support reading with multiple parallel tasks?
Not yet supported.
### Q6: What versions of MongoDB are supported by MongoDB CDC?
MongoDB CDC is implemented on top of the change stream feature, which was introduced in MongoDB 3.6, so it theoretically supports MongoDB 3.6 and above; version >= 4.0 is recommended. Running against versions lower than 3.6 fails with the error: "Unrecognized pipeline stage name: '$changeStream'".
### Q7: Which MongoDB deployment modes are supported by MongoDB CDC?
Change streams require MongoDB to run as a replica set or a sharded cluster. For local tests, you can use a single-node replica set created with `rs.initiate()`.
In standalone mode, the error "The $changeStream stage is only supported on replica sets" occurs.
### Q8: MongoDB CDC reports that the username or password is incorrect, but other components can connect normally with the same credentials. What is the reason?
This happens when the user was created under the database being connected to: add 'connection.options' = 'authSource=<the database where the user was created>' to the WITH options, as sketched below.
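A minimal sketch, assuming the user was created under the `inventory` database (all names are placeholders):
```sql
-- Hypothetical Flink SQL example: authenticate against the database where the user was created.
CREATE TABLE mongodb_products (
  _id STRING,
  name STRING,
  PRIMARY KEY (_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database' = 'inventory',
  'collection' = 'products',
  'connection.options' = 'authSource=inventory'
);
```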
### Q9: Does MongoDB CDC support Debezium-related parameters?
No. The MongoDB CDC connector is developed independently within the Flink CDC project and does not rely on the Debezium project, so Debezium-related parameters are not supported.
### Q10: In the MongoDB CDC full reading phase, can the job resume from a checkpoint after a failure?
In the full reading phase, MongoDB CDC does not perform checkpoints until the phase is completed. If the job fails during the full reading phase, MongoDB CDC will read the snapshot data again.
## Oracle CDC FAQ
### Q1: Oracle CDC archive logs grow rapidly and reading the logs is slow. What should I do?
You can use the online catalog mining mode, which does not write the data dictionary to the redo log but cannot process DDL statements. The default strategy reads the log slowly and writes the data dictionary information to the redo log, which greatly increases the log volume. You can add the following Debezium configuration items: 'log.mining.strategy' = 'online_catalog', 'log.mining.continuous.mine' = 'true'. If you use the SQL API, prefix each configuration item with 'debezium.', namely:
```
'debezium.log.mining.strategy' = 'online_catalog',
'debezium.log.mining.continuous.mine' = 'true'
```
### Q2: The job reports the error "Caused by: io.debezium.DebeziumException: Supplemental logging not configured for table xxx. Use command: ALTER TABLE xxx ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS". What should I do?
For Oracle 11, Debezium sets `tableIdCaseInsensitive` to true by default, which causes the table name to be converted to lowercase. As a result, the supplemental logging configuration of the table cannot be found in Oracle, producing the false alarm "Supplemental logging not configured for table".
If you use the DataStream API, add the Debezium configuration item 'database.tablename.case.insensitive' = 'false'; if you use the SQL API, add 'debezium.database.tablename.case.insensitive' = 'false' to the table options, as sketched below.
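A minimal sketch for the SQL API (connection settings and columns are placeholders):
```sql
-- Hypothetical Flink SQL example: disable the lowercase table-name conversion on Oracle 11.
CREATE TABLE oracle_orders (
  ID INT,
  ORDER_STATUS STRING,
  PRIMARY KEY (ID) NOT ENFORCED
) WITH (
  'connector' = 'oracle-cdc',
  'hostname' = 'localhost',
  'port' = '1521',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'ORCL',
  'schema-name' = 'DEBEZIUM',
  'table-name' = 'ORDERS',
  'debezium.database.tablename.case.insensitive' = 'false'
);
```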
### Q3: How does Oracle CDC switch to XStream?
Add the configuration item 'database.connection.adapter' = 'xstream'; if you use the SQL API, use 'debezium.database.connection.adapter' = 'xstream' instead.
### Q4: What are the database name and schema name for Oracle CDC?
The database name is the name of the database instance, that is, the Oracle SID. The schema name is the schema that the table belongs to. Generally speaking, each user corresponds to a schema: the user's schema name equals the user name and serves as the user's default schema. Therefore, the schema name is usually the name of the user who created the table, but if a schema is specified when creating the table, that schema is the schema name. For example, if `CREATE TABLE aaaa.testtable(xxxx)` succeeds, then `aaaa` is the schema name.

@ -0,0 +1,25 @@
---
title: "Try Flink CDC"
icon: <i class="fa fa-rocket title appetizer" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 1
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,25 @@
---
title: CDC Connectors
bookCollapseSection: true
weight: 2
aliases:
- /try-flink-cdc/cdc-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -1,3 +1,12 @@
---
title: "Building a Real-time Data Lake with Flink CDC"
weight: 999
type: docs
aliases:
- /development/build-real-time-data-lake-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,33 +26,35 @@ specific language governing permissions and limitations
under the License.
-->
# 基于 Flink CDC 同步 MySQL 分库分表构建实时数据湖
# Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake
在 OLTP 系统中,为了解决单表数据量大的问题,通常采用分库分表的方式将单个大表进行拆分以提高系统的吞吐量。
但是为了方便数据分析,通常需要将分库分表拆分出的表在同步到数据仓库、数据湖时,再合并成一个大表。
For OLTP databases, to deal with a huge number of data in a single table, we usually do database and table sharding to get better throughput.
But sometimes, for convenient analysis, we need to merge them into one table when loading them to data warehouse or data lake.
这篇教程将展示如何使用 Flink CDC 构建实时数据湖来应对这种场景,本教程的演示基于 Docker只涉及 SQL无需一行 Java/Scala 代码,也无需安装 IDE你可以很方便地在自己的电脑上完成本教程的全部内容。
This tutorial will show how to use Flink CDC to build a real-time data lake for such a scenario.
You can walk through the tutorial easily in the docker environment. The entire process uses standard SQL syntax without a single line of Java/Scala code or IDE installation.
接下来将以数据从 MySQL 同步到 [Iceberg](https://iceberg.apache.org/) 为例展示整个流程,架构图如下所示:
The following sections will take the pipeline from MySQL to [Iceberg](https://iceberg.apache.org/) as an example. The overview of the architecture is as follows:
![Architecture of Real-Time Data Lake](/_static/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png "architecture of real-time data lake")
{{< img src="/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png" alt="Real-time data lake with Flink CDC" >}}
你也可以使用不同的 source 比如 Oracle/Postgres 和 sink 比如 Hudi 来构建自己的 ETL 流程。
You can also use other data sources like Oracle/Postgres and sinks like Hudi to build your own pipeline.
## 准备阶段
准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。
## Preparation
Prepare a Linux or MacOS computer with Docker installed.
### 下载所需要的依赖包
**下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译**
- flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar
## Preparing JAR package required
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.**
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- [flink-shaded-hadoop-2-uber-2.7.5-10.0.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.7.5-10.0/flink-shaded-hadoop-2-uber-2.7.5-10.0.jar)
- [iceberg-flink-runtime-1.16-1.3.1.jar](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/1.3.1/iceberg-flink-runtime-1.16-1.3.1.jar)
### 准备教程所需要的组件
接下来的教程将以 `docker-compose` 的方式准备所需要的组件。
1. 使用下面的内容创建一个 `Dockerfile` 文件:
### Starting components required
The components required in this tutorial are all managed in containers, so we will use `docker-compose` to start them.
1. Create `Dockerfile` file using following contents:
```dockerfile
FROM flink:1.16.0-scala_2.12
# Place the downloaded jar packages in the lib directory at the same level.
@ -51,8 +62,7 @@ under the License.
RUN apt-get update && apt-get install tree
```
2. 使用下面的内容创建一个`docker-compose.yml` 文件:
2. Create `docker-compose.yml` file using following contents:
```yml
version: '2.1'
services:
@ -113,33 +123,33 @@ under the License.
device: "tmpfs"
```
该 Docker Compose 中包含的容器有:
- SQL-Client: Flink SQL Client, 用来提交 SQL 查询和查看 SQL 的执行结果
- Flink Cluster:包含 Flink JobManager 和 Flink TaskManager用来执行 Flink SQL
- MySQL:作为分库分表的数据源,存储本教程的 `user`
The Docker Compose environment consists of the following containers:
- SQL-Client: Flink SQL Client, used to submit queries and visualize their results.
- Flink Cluster: a Flink JobManager and a Flink TaskManager container to execute queries.
- MySQL: mainly used as a data source to store the sharding table.
3. `docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件:
3. To start all containers, run the following command in the directory that contains the `docker-compose.yml` file:
```shell
docker-compose up -d
```
该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 `docker ps` 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:8081/](http://localhost:8081//) 来查看 Flink 是否运行正常。
***注意:***
* 本教程接下来用到的容器相关的命令都需要在 `docker-compose.yml` 所在目录下执行
* 如果你想要在自己的 Flink 环境运行本教程,需要下载下面列出的包并且把它们放在 Flink 所在目录的 lib 目录下,即 FLINK_HOME/lib/
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run `docker ps` to check whether these containers are running properly.
We can also visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally.
***Note:***
* If you want to run with your own Flink environment, remember to download the jar packages and then put them to `FLINK_HOME/lib/`.
* All the following commands involving `docker-compose` should be executed in the directory of the `docker-compose.yml` file.
![Flink UI](/_static/fig/real-time-data-lake-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/real-time-data-lake-tutorial/flink-ui.png" alt="Flink UI" >}}
### 准备数据
1. 进入 MySQL 容器中
### Preparing data in databases
1. Enter mysql's container:
```shell
docker-compose exec mysql mysql -uroot -p123456
```
2. 创建数据和表,并填充数据
2. Create databases/tables and populate data:
创建两个不同的数据库,并在每个数据库中创建两个表,作为 `user` 表分库分表下拆分出的表。
Create a logical sharding table `user` sharded in different databases and tables physically.
```sql
CREATE DATABASE db_1;
USE db_1;
@ -183,28 +193,30 @@ under the License.
INSERT INTO user_2 VALUES (220,"user_220","Shanghai","123567891234","user_220@foo.com");
```
## 在 Flink SQL CLI 中使用 Flink DDL 创建表
首先,使用如下的命令进入 Flink SQL CLI 容器中:
## Creating tables using Flink DDL in Flink SQL CLI
First, use the following command to enter the Flink SQL CLI Container:
```shell
docker-compose run sql-client
```
我们可以看到如下界面:
![Flink SQL Client](/_static/fig/real-time-data-lake-tutorial/flink-sql-client.png "Flink SQL Client" )
We should see the welcome screen of the CLI client:
{{< img src="/fig/real-time-data-lake-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
然后,进行如下步骤:
1. 开启 checkpoint每隔3秒做一次 checkpoint
Then do the following steps in Flink SQL CLI:
Checkpoint 默认是不开启的,我们需要开启 Checkpoint 来让 Iceberg 可以提交事务。
并且mysql-cdc 在 binlog 读取阶段开始前,需要等待一个完整的 checkpoint 来避免 binlog 记录乱序的情况。
1. Enable checkpoints every 3 seconds
Checkpoint is disabled by default, we need to enable it to commit Iceberg transactions.
Besides, the beginning of mysql-cdc binlog phase also requires waiting a complete checkpoint to avoid disorder of binlog records.
```sql
-- Flink SQL
Flink SQL> SET execution.checkpointing.interval = 3s;
```
2. 创建 MySQL 分库分表 source 表
2. Create MySQL sharding source table
创建 source 表 `user_source` 来捕获MySQL中所有 `user` 表的数据,在表的配置项 `database-name` , `table-name` 使用正则表达式来匹配这些表。
并且,`user_source` 表也定义了 metadata 列来区分数据是来自哪个数据库和表。
Create a source table that captures the data from the logical sharding table `user`. Here, we use regex to match all the physical tables.
Besides, the table defines metadata column to identify which database/table the record comes from.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE user_source (
@ -226,10 +238,10 @@ docker-compose run sql-client
'table-name' = 'user_[0-9]+'
);
```
3. 创建 Iceberg sink 表
3. Create Iceberg sink table
创建 sink 表 `all_users_sink`,用来将数据加载至 Iceberg 中。
在这个 sink 表,考虑到不同的 MySQL 数据库表的 `id` 字段的值可能相同,我们定义了复合主键 (`database_name`, `table_name`, `id`)。
Create a sink table `all_users_sink` used to load data to Iceberg.
We define `database_name`, `table_name` and `id` as a combined primary key, because `id` maybe not unique across different databases and tables.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE all_users_sink (
@ -250,66 +262,70 @@ docker-compose run sql-client
);
```
## 流式写入 Iceberg
1. 使用下面的 Flink SQL 语句将数据从 MySQL 写入 Iceberg 中
## Streaming to Iceberg
1. Streaming write data from MySQL to Iceberg using the following Flink SQL:
```sql
-- Flink SQL
Flink SQL> INSERT INTO all_users_sink select * from user_source;
```
上述命令将会启动一个流式作业,源源不断将 MySQL 数据库中的全量和增量数据同步到 Iceberg 中。
在 [Flink UI](http://localhost:8081/#/job/running) 上可以看到这个运行的作业:
It will start a streaming job which will synchronize historical and incremental data from MySQL to Iceberg continuously.
The running job can be found in [Flink UI](http://localhost:8081/#/job/running), and it looks like:
![CDC to Iceberg Running Job](/_static/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png "CDC to Iceberg Running Job")
{{< img src="/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png" alt="CDC to Iceberg Running Job" >}}
然后我们就可以使用如下的命令看到 Iceberg 中的写入的文件:
Then, we can use the following command to see the files written to Iceberg:
```shell
docker-compose exec sql-client tree /tmp/iceberg/warehouse/default_database/
```
如下所示:
It should look like:
![Files in Iceberg](/_static/fig/real-time-data-lake-tutorial/files-in-iceberg.png "Files in Iceberg")
{{< img src="/fig/real-time-data-lake-tutorial/files-in-iceberg.png" alt="Files in Iceberg" >}}
在你的运行环境中,实际的文件可能与上面的截图不相同,但是整体的目录结构应该相似。
The actual files may differ in your environment, but the structure of the directory should be similar.
2. 使用下面的 Flink SQL 语句查询表 `all_users_sink` 中的数据
2. Use the following Flink SQL to query the data written to `all_users_sink`:
```sql
-- Flink SQL
Flink SQL> SELECT * FROM all_users_sink;
```
在 Flink SQL CLI 中我们可以看到如下查询结果:
We can see the data queried in the Flink SQL CLI:
![Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/data_in_iceberg.png "Data in Iceberg")
{{< img src="/fig/real-time-data-lake-tutorial/data_in_iceberg.png" alt="Data in Iceberg" >}}
3. 修改 MySQL 中表的数据Iceberg 中的表 `all_users_sink` 中的数据也将实时更新:
3. Make some changes in the MySQL databases, and then the data in Iceberg table `all_users_sink` will also change in real time.
(3.1) `db_1.user_1` 表中插入新的一行
(3.1) Insert a new user in table `db_1.user_1`
```sql
--- db_1
INSERT INTO db_1.user_1 VALUES (111,"user_111","Shanghai","123567891234","user_111@foo.com");
```
(3.2) 更新 `db_1.user_2` 表的数据
(3.2) Update a user in table `db_1.user_2`
```sql
--- db_1
UPDATE db_1.user_2 SET address='Beijing' WHERE id=120;
```
(3.3) `db_2.user_2` 表中删除一行
(3.3) Delete a user in table `db_2.user_2`
```sql
--- db_2
DELETE FROM db_2.user_2 WHERE id=220;
```
每执行一步,我们就可以在 Flink Client CLI 中使用 `SELECT * FROM all_users_sink` 查询表 `all_users_sink` 来看到数据的变化。
最后的查询结果如下所示:
After executing each step, we can query the table `all_users_sink` using `SELECT * FROM all_users_sink` in Flink SQL CLI to see the changes.
![Final Data in Iceberg](/_static/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png "Final Data in Iceberg")
The final query result is as follows:
从 Iceberg 的最新结果中可以看到新增了`(db_1, user_1, 111)`的记录,`(db_1, user_2, 120)`的地址更新成了 `Beijing`,且`(db_2, user_2, 220)`的记录被删除了,与我们在 MySQL 做的数据更新完全一致。
{{< img src="/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png" alt="Final Data in Iceberg" >}}
## 环境清理
本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器:
From the latest result in Iceberg, we can see that there is a new record of `(db_1, user_1, 111)`, and the address of `(db_1, user_2, 120)` has been updated to `Beijing`.
Besides, the record of `(db_2, user_2, 220)` has been deleted. The result is exactly the same with the changes we did in MySQL.
## Clean up
After finishing the tutorial, run the following command in the directory of `docker-compose.yml` to stop all containers:
```shell
docker-compose down
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Db2 Tutorial"
weight: 8
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/db2-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -78,7 +85,7 @@ docker-compose down
**2. Download following JAR package to `<FLINK_HOME>/lib`**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- flink-sql-connector-db2-cdc-3.0-SNAPSHOT.jar
@ -152,3 +159,5 @@ INSERT INTO DB2INST1.PRODUCTS VALUES (default,'scooter','Big 2-wheel scooter ',5
DELETE FROM DB2INST1.PRODUCTS WHERE ID=111;
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MongoDB Tutorial"
weight: 1
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -126,10 +133,10 @@ db.customers.insertMany([
3. Download following JAR package to `<FLINK_HOME>/lib/`:
```Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. ```
```Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. ```
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar)
- [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar)
4. Launch a Flink cluster, then start a Flink SQL CLI and execute following SQL statements inside:
@ -233,3 +240,5 @@ db.orders.deleteOne(
{ order_id : 104 }
);
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Mysql & Postgres Tutorial"
weight: 2
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -28,7 +35,7 @@ In the following sections, we will describe how to use Flink Mysql/Postgres CDC
All exercises in this tutorial are performed in the Flink SQL CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation.
The overview of the architecture is as follows:
![Flink CDC Streaming ETL](/_static/fig/mysql-postgress-tutorial/flink-cdc-streaming-etl.png "Flink CDC Streaming ETL")
{{< img src="/fig/mysql-postgres-tutorial/flink-cdc-streaming-etl.png" width="700px" alt="Flink CDC Streaming ETL" >}}
## Preparation
Prepare a Linux or MacOS computer with Docker installed.
@ -95,10 +102,10 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0`
2. Download following JAR package required and put them under `flink-1.18.0/lib/`:
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar
- flink-sql-connector-postgres-cdc-2.5-SNAPSHOT.jar
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar
### Preparing data in databases
#### Preparing data in MySQL
@ -179,7 +186,7 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
```
Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like:
![Flink UI](/_static/fig/mysql-postgress-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" width="700px" alt="Flink UI" >}}
3. Use the following command to start a Flink SQL CLI:
```shell
@ -187,7 +194,7 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
```
We should see the welcome screen of the CLI client.
![Flink SQL Client](/_static/fig/mysql-postgress-tutorial/flink-sql-client.png "Flink SQL Client")
{{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
## Creating tables using Flink DDL in Flink SQL CLI
First, enable checkpoints every 3 seconds
@ -289,11 +296,11 @@ Flink SQL> INSERT INTO enriched_orders
Now, the enriched orders should be shown in Kibana.
Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`.
![Create Index Pattern](/_static/fig/mysql-postgress-tutorial/kibana-create-index-pattern.png "Create Index Pattern")
{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" width="700px" alt="Create Index Pattern" >}}
Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders.
![Find enriched Orders](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders.png "Find enriched Orders")
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" width="700px" alt="Find enriched Orders" >}}
Next, do some change in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time.
1. Insert a new order in MySQL
@ -324,7 +331,7 @@ Next, do some change in the databases, and then the enriched orders shown in Kib
DELETE FROM orders WHERE order_id = 10004;
```
The changes of enriched orders in Kibana are as follows:
![Enriched Orders Changes](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders-changes.gif "Enriched Orders Changes")
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" width="700px" alt="Enriched Orders Changes" >}}
## Clean up
After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`:
@ -335,3 +342,5 @@ Run the following command to stop the Flink cluster in the directory of Flink `f
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "OceanBase Tutorial"
weight: 3
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/oceanbase-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -152,7 +159,7 @@ VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false),
```Download links are only available for stable releases.```
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/2.5-SNAPSHOT/flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar)
### Use Flink DDL to create dynamic table in Flink SQL CLI
@ -267,3 +274,5 @@ Stop the flink cluster by following command.
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Oracle Tutorial"
weight: 4
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/oracle-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -74,7 +81,7 @@ docker-compose down
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oracle-cdc/2.5-SNAPSHOT/flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar)
**Preparing data in Oracle database**
@ -222,3 +229,5 @@ UPDATE DEBEZIUM.ORDERS SET QUANTITY = 10 WHERE ID = 1002;
DELETE FROM DEBEZIUM.ORDERS WHERE ID = 1004;
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "PolarDB-X Tutorial"
weight: 5
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -85,8 +92,8 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0`
2. Download following JAR package required and put them under `flink-1.18.0/lib/`:
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
- flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
### Preparing data in databases
@ -145,7 +152,7 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like:
![Flink UI](/_static/fig/mysql-postgress-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" alt="Flink UI" >}}
3. Use the following command to start a Flink SQL CLI:
```shell
@ -153,7 +160,7 @@ We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kib
```
We should see the welcome screen of the CLI client.
![Flink SQL Client](/_static/fig/mysql-postgress-tutorial/flink-sql-client.png "Flink SQL Client")
{{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
## Creating tables using Flink DDL in Flink SQL CLI
First, enable checkpoints every 3 seconds
@ -243,11 +250,11 @@ Flink SQL> INSERT INTO enriched_orders
Now, the enriched orders should be shown in Kibana.
Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`.
![Create Index Pattern](/_static/fig/mysql-postgress-tutorial/kibana-create-index-pattern.png "Create Index Pattern")
{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" alt="Create Index Pattern" >}}
Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders.
![Find enriched Orders](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders.png "Find enriched Orders")
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" alt="Find enriched Orders" >}}
Next, do some change in the databases, and then the enriched orders shown in Kibana will be updated after each step in real time.
1. Insert a new order in PolarDB-X
@ -267,7 +274,7 @@ Next, do some change in the databases, and then the enriched orders shown in Kib
DELETE FROM orders WHERE order_id = 10004;
```
The changes of enriched orders in Kibana are as follows:
![Enriched Orders Changes](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders-changes.gif "Enriched Orders Changes")
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" alt="Enriched Orders Changes" >}}
## Clean up
After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`:
@ -278,3 +285,5 @@ Run the following command to stop the Flink cluster in the directory of Flink `f
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "SqlServer Tutorial"
weight: 6
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/sqlserver-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -80,10 +87,10 @@ docker-compose down
**Download following JAR package to `<FLINK_HOME>/lib`:**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-sqlserver-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-sqlserver-cdc/2.5-SNAPSHOT/flink-sql-connector-sqlserver-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar)
**Preparing data in SqlServer database**
@ -220,3 +227,5 @@ GO
DELETE FROM orders WHERE id = 10004;
GO
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "TiDB Tutorial"
weight: 7
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/tidb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -133,10 +140,10 @@ docker-compose down
**Download following JAR package to `<FLINK_HOME>/lib`:**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-tidb-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-tidb-cdc/2.5-SNAPSHOT/flink-sql-connector-tidb-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar)
**Preparing data in TiDB database**
@ -250,3 +257,5 @@ UPDATE orders SET order_status = true WHERE order_id = 10004;
DELETE FROM orders WHERE order_id = 10004;
```
{{< top >}}

@ -0,0 +1,25 @@
---
title: Pipeline Connectors
bookCollapseSection: true
weight: 1
aliases:
- /try-flink-cdc/pipeline-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -1,3 +1,10 @@
---
title: "MySQL to Doris"
weight: 1
type: docs
aliases:
- /try-flink-cdc/pipeline-connectors/mysql-doris-pipeline-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -47,7 +54,7 @@ Prepare a Linux or MacOS computer with Docker installed.
If successfully started, you can access the Flink Web UI at [http://localhost:8081/](http://localhost:8081/), as shown below.
![Flink UI](/_static/fig/mysql-doris-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/mysql-doris-tutorial/flink-ui.png" alt="Flink UI" >}}
Executing `start-cluster.sh` multiple times can start multiple `TaskManager`s.
@ -162,7 +169,7 @@ This command automatically starts all the containers defined in the Docker Compo
[http://localhost:8030/](http://localhost:8030/)
The default username is `root`, and the default password is empty.
![Doris UI](/_static/fig/mysql-doris-tutorial/doris-ui.png "Doris UI")
{{< img src="/fig/mysql-doris-tutorial/doris-ui.png" alt="Doris UI" >}}
2. Create `app_db` database through Web UI.
@ -170,17 +177,17 @@ This command automatically starts all the containers defined in the Docker Compo
create database app_db;
```
![Doris create_table](/_static/fig/mysql-doris-tutorial/doris-create-table.png "Doris create table")
{{< img src="/fig/mysql-doris-tutorial/doris-create-table.png" alt="Doris create table" >}}
## Submit job using FlinkCDC cli
1. Download the binary compressed packages listed below and extract them to the directory ` flink cdc-3.0.0 '`
[flink-cdc-3.0.0-bin.tar.gz](https://github.com/ververica/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz)
[flink-cdc-3.0.0-bin.tar.gz](https://github.org/apache/flink/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz)
flink-cdc-3.0.0 directory will contain four directory `bin`,`lib`,`log`,`conf`.
2. Download the connector package listed below and move it to the `lib` directory
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
- [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar)
- [Apache Doris pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-doris/3.0.0/flink-cdc-pipeline-connector-doris-3.0.0.jar)
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar)
- [Apache Doris pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-doris/3.0.0/flink-cdc-pipeline-connector-doris-3.0.0.jar)
3. Write task configuration yaml file
Here is an example file for synchronizing the entire database `mysql-to-doris.yaml`
@ -229,11 +236,11 @@ After successful submission, the return information is as follows
```
We can find a job named `Sync MySQL Database to Doris` is running through Flink Web UI.
![MySQL-to-Doris](/_static/fig/mysql-doris-tutorial/mysql-to-doris.png "MySQL-to-Doris")
{{< img src="/fig/mysql-doris-tutorial/mysql-to-doris.png" alt="MySQL-to-Doris" >}}
We can find that tables are created and inserted through Doris Web UI.
![Doris_display_data](/_static/fig/mysql-doris-tutorial/doris_display_data.png "Doris_display_data")
{{< img src="/fig/mysql-doris-tutorial/doris-display-data.png" alt="Doris display data" >}}
### Synchronize Schema and Data changes
Enter MySQL container
@ -268,7 +275,7 @@ Then, modify schema and record in MySQL, and the tables of Doris will change the
Refresh the Doris Web UI every time you execute a step, and you can see that the `orders` table displayed in Doris will be updated in real-time, like the following
![Doris_display_result](/_static/fig/mysql-doris-tutorial/doris_display_result.png "Doris_display_result")
{{< img src="/fig/mysql-doris-tutorial/doris-display-result.png" alt="Doris display result" >}}
Similarly, by modifying the 'shipments' and' products' tables, you can also see the results of synchronized changes in real-time in Doris.
@ -336,3 +343,4 @@ Run the following command to stop the Flink cluster in the directory of Flink `f
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MySQL to StarRocks"
weight: 2
type: docs
aliases:
- /try-flink-cdc/pipeline-connectors/mysql-starrocks-pipeline-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -47,7 +54,7 @@ Prepare a Linux or MacOS computer with Docker installed.
If successfully started, you can access the Flink Web UI at [http://localhost:8081/](http://localhost:8081/), as shown below.
![Flink UI](/_static/fig/mysql-starrocks-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/mysql-starrocks-tutorial/flink-ui.png" alt="Flink UI" >}}
Executing `start-cluster.sh` multiple times can start multiple `TaskManager`s.
@ -137,13 +144,13 @@ This command automatically starts all the containers defined in the Docker Compo
## Submit job using FlinkCDC cli
1. Download the binary compressed packages listed below and extract them to the directory ` flink cdc-3.0.0 '`
[flink-cdc-3.0.0-bin.tar.gz](https://github.com/ververica/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz)
[flink-cdc-3.0.0-bin.tar.gz](https://github.org/apache/flink/flink-cdc-connectors/releases/download/release-3.0.0/flink-cdc-3.0.0-bin.tar.gz)
flink-cdc-3.0.0 directory will contain four directory `bin`,`lib`,`log`,`conf`.
2. Download the connector package listed below and move it to the `lib` directory
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself.**
- [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar)
- [StarRocks pipeline connector 3.0.0](https://repo1.maven.org/maven2/com/ververica/flink-cdc-pipeline-connector-starrocks/3.0.0/flink-cdc-pipeline-connector-starrocks-3.0.0.jar)
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [MySQL pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-mysql/3.0.0/flink-cdc-pipeline-connector-mysql-3.0.0.jar)
- [StarRocks pipeline connector 3.0.0](https://repo1.maven.org/maven2/org/apache/flink/flink-cdc-pipeline-connector-starrocks/3.0.0/flink-cdc-pipeline-connector-starrocks-3.0.0.jar)
3. Write task configuration yaml file.
Here is an example file for synchronizing the entire database `mysql-to-starrocks.yaml`
@ -197,11 +204,11 @@ After successful submission, the return information is as follows
We can find a job named `Sync MySQL Database to StarRocks` is running through Flink Web UI.
![MySQL-to-StarRocks](/_static/fig/mysql-starrocks-tutorial/mysql-to-starrocks.png "MySQL-to-StarRocks")
{{< img src="/fig/mysql-starrocks-tutorial/mysql-to-starrocks.png" alt="MySQL-to-StarRocks" >}}
Connect to jdbc through database connection tools such as Dbeaver using `mysql://127.0.0.1:9030`. You can view the data written to three tables in StarRocks.
![StarRocks-display-data](/_static/fig/mysql-starrocks-tutorial/starrocks-display-data.png "StarRocks-display-data")
{{< img src="/fig/mysql-starrocks-tutorial/starrocks-display-data.png" alt="StarRocks-display-data" >}}
### Synchronize Schema and Data changes
Enter MySQL container
@ -236,7 +243,7 @@ Then, modify schema and record in MySQL, and the tables of StarRocks will change
Refresh the Dbeaver every time you execute a step, and you can see that the `orders` table displayed in StarRocks will be updated in real-time, like the following
![StarRockss-display-result](/_static/fig/mysql-starrocks-tutorial/starrocks-display-result.png "StarRocks-display-result")
{{< img src="/fig/mysql-starrocks-tutorial/starrocks-display-result.png" alt="StarRocks-display-result" >}}
Similarly, by modifying the `shipments` and `products` tables, you can also see the results of synchronized changes in real-time in StarRocks.
@ -304,3 +311,4 @@ Run the following command to stop the Flink cluster in the directory of Flink `f
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,8 @@
---
title: Versions
type: docs
bookToc: false
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,15 +22,8 @@ specific language governing permissions and limitations
under the License.
-->
# Pipeline Connectors
# Versions
```{toctree}
:maxdepth: 2
An appendix of hosted documentation for all versions of Apache Flink CDC.
mysql-pipeline
mysql-pipeline(ZH)
doris-pipeline
doris-pipeline(ZH)
starrocks-pipeline
starrocks-pipeline(ZH)
```
{{< all_versions >}}

@ -0,0 +1,58 @@
---
title: Apache Flink CDC
type: docs
bookToc: false
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
####
<div style="text-align: center">
<h1>
Flink CDC: Change Data Capture Solution Of Apache Flink
</h1>
<h4 style="color: #696969">Set of source connectors for Apache Flink® directly ingesting changes coming from different databases using Change Data Capture(CDC).</h4>
</div>
Flink CDC integrates Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium).
{{< img src="/fig/cdc-flow.png" alt="Stateful Functions" width="50%" >}}
Flink CDC supports ingesting snapshot data and real time changes from databases to Flink® and then transform and sink to various downstream systems.
{{< columns >}}
## Try Flink CDC
If youre interested in playing around with Flink CDC, check out our [quick
start]({{< ref "docs/try-flink-cdc" >}}). It provides multiple examples to submit and execute a Flink CDC job on a Flink cluster.
<--->
## Get Help with Flink CDC
If you get stuck, check out our [community support
resources](https://flink.apache.org/community.html). In particular, Apache
Flinks user mailing list is consistently ranked as one of the most active of
any Apache project, and is a great way to get help quickly.
{{< /columns >}}
Flink CDC is developed under the umbrella of [Apache
Flink](https://flink.apache.org/).

@ -1,685 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# MongoDB CDC 连接器
MongoDB CDC 连接器允许从 MongoDB 读取快照数据和增量数据。 本文档描述了如何设置 MongoDB CDC 连接器以针对 MongoDB 运行 SQL 查询。
依赖
------------
为了设置 MongoDB CDC 连接器, 下表提供了使用构建自动化工具(如 Maven 或 SBT )和带有 SQLJar 捆绑包的 SQLClient 的两个项目的依赖关系信息。
### Maven dependency
```
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-mongodb-cdc</artifactId>
<!-- 依赖项仅适用于稳定版本SNAPSHOT依赖项需要自己构建。 -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
### SQL Client JAR
```下载链接仅适用于稳定版本。```
下载 [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) 把它放在 `<FLINK_HOME>/lib/`.
**注意:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT 版本是与开发分支相对应的代码。 用户需要下载源代码并编译相应的jar。 用户应使用已发布的版本,例如 [flink-sql-connector-mongodb-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-mongodb-cdc), 发布的版本将在 Maven 中央仓库中提供。
设置 MongoDB
----------------
### 可用性
- MongoDB 版本
MongoDB 版本 >= 3.6 <br>
我们使用 [更改流](https://docs.mongodb.com/manual/changeStreams/) 功能3.6 版中新增),以捕获更改数据。
- 集群部署
[副本集](https://docs.mongodb.com/manual/replication/) 或者 [分片集群](https://docs.mongodb.com/manual/sharding/) 是必需的。
- 存储引擎
[WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) 存储引擎是必需的。
- [副本集协议版本](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion)
副本集协议版本 1 [(pv1)](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) 是必需的。 <br>
从 4.0 版本开始MongoDB 只支持pv1。 pv1 是使用 MongoDB 3.2 或更高版本创建的所有新副本集的默认值。
- 权限
`changeStream` and `read` 是 MongoDB Kafka Connector 必需权限。
你可以使用以下示例进行简单的授权。<br>
有关更详细的授权, 请参照 [MongoDB 数据库用户角色](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles).
```javascript
use admin;
db.createRole(
{
role: "flinkrole",
privileges: [{
// 所有数据库中所有非系统集合的 grant 权限
resource: { db: "", collection: "" },
actions: [
"splitVector",
"listDatabases",
"listCollections",
"collStats",
"find",
"changeStream" ]
}],
roles: [
// 阅读 config.collections 和 config.chunks
// 用于分片集群快照拆分。
{ role: 'read', db: 'config' }
]
}
);
db.createUser(
{
user: 'flinkuser',
pwd: 'flinkpw',
roles: [
{ role: 'flinkrole', db: 'admin' }
]
}
);
```
如何创建 MongoDB CDC 表
----------------
MongoDB CDC 表可以定义如下:
```sql
-- 在 Flink SQL 中注册 MongoDB 表 `products`
CREATE TABLE products (
_id STRING, // 必须声明
name STRING,
weight DECIMAL(10,3),
tags ARRAY<STRING>, -- array
price ROW<amount DECIMAL(10,2), currency STRING>, -- 嵌入式文档
suppliers ARRAY<ROW<name STRING, address STRING>>, -- 嵌入式文档
PRIMARY KEY(_id) NOT ENFORCED
) WITH (
'connector' = 'mongodb-cdc',
'hosts' = 'localhost:27017,localhost:27018,localhost:27019',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database' = 'inventory',
'collection' = 'products'
);
-- 从 `products` 集合中读取快照和更改事件
SELECT * FROM products;
```
**请注意**
MongoDB 的更改事件记录在消息之前没有更新。因此,我们只能将其转换为 Flink 的 UPSERT 更改日志流。
upstart 流需要一个唯一的密钥,所以我们必须声明 `_id` 作为主键。
我们不能将其他列声明为主键, 因为删除操作不包含除 `_id``sharding key` 之外的键和值。
连接器选项
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>指定要使用的连接器,此处应为 <code>mongodb-cdc</code>.</td>
</tr>
<tr>
<td>scheme</td>
<td>optional</td>
<td style="word-wrap: break-word;">mongodb</td>
<td>String</td>
<td>指定 MongoDB 连接协议。 eg. <code>mongodb or mongodb+srv.</code></td>
</tr>
<tr>
<td>hosts</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>MongoDB 服务器的主机名和端口对的逗号分隔列表。<br>
eg. <code>localhost:27017,localhost:27018</code>
</td>
</tr>
<tr>
<td>username</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>连接到 MongoDB 时要使用的数据库用户的名称。<br>
只有当 MongoDB 配置为使用身份验证时,才需要这样做。
</td>
</tr>
<tr>
<td>password</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>连接到 MongoDB 时要使用的密码。<br>
只有当 MongoDB 配置为使用身份验证时,才需要这样做。
</td>
</tr>
<tr>
<td>database</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>要监视更改的数据库的名称。 如果未设置,则将捕获所有数据库。 <br>
该数据库还支持正则表达式来监视与正则表达式匹配的多个数据库。</td>
</tr>
<tr>
<td>collection</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>数据库中要监视更改的集合的名称。 如果未设置,则将捕获所有集合。<br>
该集合还支持正则表达式来监视与完全限定的集合标识符匹配的多个集合。</td>
</tr>
<tr>
<td>connection.options</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td><a href="https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options">MongoDB连接选项</a>。 例如: <br>
<code>replicaSet=test&connectTimeoutMS=300000</code>
</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td> MongoDB CDC 消费者可选的启动模式,
合法的模式为 "initial""latest-offset" 和 "timestamp"。
请查阅 <a href="#a-name-id-002-a">启动模式</a> 章节了解更多详细信息。</td>
</tr>
<tr>
<td>scan.startup.timestamp-millis</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>起始毫秒数, 仅适用于 <code>'timestamp'</code> 启动模式.</td>
</tr>
<tr>
<td>batch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>Cursor 批次大小。</td>
</tr>
<tr>
<td>poll.max.batch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>轮询新数据时,单个批处理中要包含的更改流文档的最大数量。</td>
</tr>
<tr>
<td>poll.await.time.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000</td>
<td>Integer</td>
<td>在更改流上检查新结果之前等待的时间。</td>
</tr>
<tr>
<td>heartbeat.interval.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">0</td>
<td>Integer</td>
<td>心跳间隔(毫秒)。使用 0 禁用。</td>
</tr>
<tr>
<td>scan.full-changelog</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>是否尝试使用 MongoDB 前像/后像产生完整事件流。请查阅 <a href="#a-name-id-003-a">完整事件流</a> 章节了解更多详细信息。该功能仅支持 MongoDB 6.0 之后的版本。</td>
</tr>
<tr>
<td>scan.incremental.snapshot.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>是否启用增量快照。增量快照功能仅支持 MongoDB 4.0 之后的版本。</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.size.mb</td>
<td>optional</td>
<td style="word-wrap: break-word;">64</td>
<td>Integer</td>
      <td>增量快照的区块大小(单位:MB)。</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.samples</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>Integer</td>
      <td>使用采样分区策略时,每个 chunk 采样的数据条数。</td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>是否在快照结束后关闭空闲的 Reader。 此特性需要 flink 版本大于等于 1.14 并且 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 需要设置为 true。<br>
若 flink 版本大于等于 1.15'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 默认值变更为 true可以不用显式配置 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = true。</td>
</tr>
<tr>
<td>scan.cursor.no-timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>MongoDB 服务端通常会将空闲时间超过 10 分钟的 cursor 关闭,来节省内存开销。将这个参数设置为 true 可以防止 cursor 因为读取时间过长或者背压导致的空闲而关闭。仅在增量快照模式下生效。</td>
</tr>
</tbody>
</table>
</div>
注意:**如果集合更改缓慢**,强烈建议将 `heartbeat.interval.ms` 设置为一个大于 0 的适当值。
当我们从检查点或保存点恢复 Flink 作业时,心跳事件可以向前推送 `resumeToken`,以避免 `resumeToken` 过期。
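例如,下面是一个在建表语句中设置 30 秒心跳间隔的最小示例(仅作示意,其中的连接信息均为假设值,选项名称来自上文的连接器选项表):
```sql
CREATE TABLE products_with_heartbeat (
  _id STRING,
  name STRING,
  PRIMARY KEY(_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'database' = 'inventory',
  'collection' = 'products',
  -- 每 30 秒发送一次心跳事件,推动 resumeToken 前进,避免其过期
  'heartbeat.interval.ms' = '30000'
);
```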
可用元数据
----------------
以下格式元数据可以在表定义中公开为只读VIRTUAL列。
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>包含该行的数据库的名称。</td>
</tr>
<tr>
<td>collection_name</td>
<td>STRING NOT NULL</td>
<td>包含该行的集合的名称。</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>它指示在数据库中进行更改的时间。 <br>如果记录是从表的快照而不是改变流中读取的该值将始终为0。</td>
</tr>
</tbody>
</table>
扩展的 CREATE TABLE 示例演示了用于公开这些元数据字段的语法:
```sql
CREATE TABLE products (
db_name STRING METADATA FROM 'database_name' VIRTUAL,
collection_name STRING METADATA FROM 'collection_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
  _id STRING, -- 必须声明
name STRING,
weight DECIMAL(10,3),
tags ARRAY<STRING>, -- array
price ROW<amount DECIMAL(10,2), currency STRING>, -- 嵌入式文档
suppliers ARRAY<ROW<name STRING, address STRING>>, -- 嵌入式文档
PRIMARY KEY(_id) NOT ENFORCED
) WITH (
'connector' = 'mongodb-cdc',
'hosts' = 'localhost:27017,localhost:27018,localhost:27019',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database' = 'inventory',
'collection' = 'products'
);
```
特性
--------
### 精确一次处理
MongoDB CDC 连接器是一个 Flink Source 连接器,它会先读取数据库快照,然后继续读取更改流事件,即使发生故障,也能保证**精确一次处理**。
### 启动模式<a name="启动模式" id="002" ></a>
配置选项 `scan.startup.mode` 指定 MongoDB CDC 消费者的启动模式。有效枚举包括:
- `initial` (默认):在第一次启动时对受监视的数据库表执行初始快照,并继续读取最新的 oplog。
- `latest-offset`:首次启动时,从不对受监视的数据库表执行快照, 连接器仅从 oplog 的结尾处开始读取,这意味着连接器只能读取在连接器启动之后的数据更改。
- `timestamp`:跳过快照阶段,从指定的时间戳开始读取 oplog 事件。
例如使用 DataStream API:
```java
MongoDBSource.builder()
.startupOptions(StartupOptions.latest()) // Start from latest offset
    .startupOptions(StartupOptions.timestamp(1667232000000L)) // Start from timestamp
.build()
```
或者使用 SQL:
```SQL
CREATE TABLE mongodb_source (...) WITH (
'connector' = 'mongodb-cdc',
'scan.startup.mode' = 'latest-offset', -- 从最晚位点启动
...
'scan.startup.mode' = 'timestamp', -- 指定时间戳启动模式
'scan.startup.timestamp-millis' = '1667232000000' -- 启动毫秒时间
...
)
```
### 更改流
我们集成了 [MongoDB 官方的 Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) 来从 MongoDB 中读取快照或更改事件,并通过 Debezium 的 `EmbeddedEngine` 进行驱动。
Debezium 的 `EmbeddedEngine` 提供了一种在应用程序进程中运行单个 Kafka Connect `SourceConnector` 的机制,并且它可以正确地驱动任何标准的 Kafka Connect `SourceConnector`,即使它不是由 Debezium 提供的。
我们选择 **MongoDB 的官方 Kafka连接器**,而不是 **Debezium 的MongoDB 连接器**,因为它们使用了不同的更改数据捕获机制。
- 对于 Debezium 的 MongoDB 连接器,它读取每个复制集主节点的 `oplog.rs` 集合。
- 对于 MongoDB 的 Kafka 连接器,它订阅了 MongoDB 的 `更改流`
MongoDB 的 `oplog.rs` 集合不会保留更改记录变更前的状态,因此,很难通过单条 `oplog.rs` 记录提取完整的文档状态,并将其转换为 Flink 接受的更改日志流(Insert Only,Upsert,All)。
此外,MongoDB 5(2021 年 7 月发布)改变了 oplog 格式,因此当前的 Debezium 连接器不能与其一起使用。
**Change Stream**是 MongoDB 3.6 为副本集和分片集群提供的一项新功能,它允许应用程序访问实时数据更改,而不会带来跟踪操作日志的复杂性和风险。<br>
应用程序可以使用更改流来订阅单个集合、某个数据库或整个部署上的所有数据更改,并立即对其做出反应。
**查找更新操作的完整文档**是**变更流**提供的一项功能,它可以配置变更流以返回更新文档的最新多数提交版本。由于该功能,我们可以轻松收集最新的完整文档,并将更改日志转换为 Flink 的**Upsert Changelog Stream**。
顺便提一下,[DBZ-435](https://issues.redhat.com/browse/DBZ-435) 中提到的 Debezium 对 MongoDB 变更流的探索已在其路线图中。<br>
如果完成了,我们可以考虑集成两种源连接器供用户选择。
### DataStream Source
MongoDB CDC 连接器也可以是一个数据流源。 你可以创建 SourceFunction如下所示
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.mongodb.MongoDBSource;
public class MongoDBSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = MongoDBSource.<String>builder()
.hosts("localhost:27017")
.username("flink")
.password("flinkpw")
.databaseList("inventory") // 设置捕获的数据库,支持正则表达式
.collectionList("inventory.products", "inventory.orders") //设置捕获的集合,支持正则表达式
.deserializer(new JsonDebeziumDeserializationSchema())
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.addSource(sourceFunction)
.print().setParallelism(1); // 对 sink 使用并行度 1 以保持消息顺序
env.execute();
}
}
```
MongoDB CDC 增量连接器2.3.0 之后)可以使用,如下所示:
```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.cdc.connectors.mongodb.source.MongoDBSource;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
public class MongoDBIncrementalSourceExample {
public static void main(String[] args) throws Exception {
MongoDBSource<String> mongoSource =
MongoDBSource.<String>builder()
.hosts("localhost:27017")
.databaseList("inventory") // 设置捕获的数据库,支持正则表达式
.collectionList("inventory.products", "inventory.orders") //设置捕获的集合,支持正则表达式
.username("flink")
.password("flinkpw")
.deserializer(new JsonDebeziumDeserializationSchema())
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 启用检查点
env.enableCheckpointing(3000);
// 将 source 并行度设置为 2
env.fromSource(mongoSource, WatermarkStrategy.noWatermarks(), "MongoDBIncrementalSource")
.setParallelism(2)
.print()
.setParallelism(1);
env.execute("Print MongoDB Snapshot + Change Stream");
}
}
```
**注意:**
- 如果使用数据库正则表达式,则需要 `readAnyDatabase` 角色。
- 增量快照功能仅支持 MongoDB 4.0 之后的版本(SQL 作业中的启用方式见下面的示例)。
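下面是一个在 Flink SQL 中启用增量快照的最小示例(仅作示意,连接信息均为假设值,选项名称来自上文的连接器选项表):
```sql
CREATE TABLE products_incremental (
  _id STRING,
  name STRING,
  PRIMARY KEY(_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database' = 'inventory',
  'collection' = 'products',
  -- 启用增量快照,要求 MongoDB 4.0 及以上版本
  'scan.incremental.snapshot.enabled' = 'true',
  -- 可选:调整增量快照的分块大小(MB)
  'scan.incremental.snapshot.chunk.size.mb' = '64'
);
```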
### 完整事件流<a name="完整事件流" id="003" ></a>
MongoDB 6.0 及以上版本支持在输出的更改流事件中携带对应更改前及更改后的文档版本(分别称为前像和后像)。
- 前像Pre-image是被该变更替换、更新或删除的文档。插入事件不存在对应的前像。
- 后像Post-image是该变更插入、替换或更新的文档。删除事件不存在对应的后像。
MongoDB CDC 能够借助上述前像和后像信息,产生完整的、包含 Insert、Update Before、Update After、Delete 数据行的事件流,从而避免下游 Flink 增加额外的 `ChangelogNormalize` 节点。
为了启用这一功能,您需要确保:
- MongoDB 数据库版本不低于 6.0
- 在数据库层面启用前像/后像记录功能:
```javascript
db.runCommand({
setClusterParameter: {
changeStreamOptions: {
preAndPostImages: {
expireAfterSeconds: 'off' // 自定义前像后像的过期时间
}
}
}
})
```
- 为需要监控的集合开启前像/后像记录功能:
```javascript
db.runCommand({
collMod: "<< 集合名称 >>",
changeStreamPreAndPostImages: {
enabled: true
}
})
```
- 打开 MongoDB CDC 的 `scan.full-changelog` 开关:
```java
MongoDBSource.builder()
.scanFullChangelog(true)
...
.build()
```
或者使用 Flink SQL
```SQL
CREATE TABLE mongodb_source (...) WITH (
'connector' = 'mongodb-cdc',
'scan.full-changelog' = 'true',
...
)
```
数据类型映射
----------------
[BSON](https://docs.mongodb.com/manual/reference/bson-types/) **二进制 JSON**的缩写是一种类似 JSON 格式的二进制编码序列,用于在 MongoDB 中存储文档和进行远程过程调用。
[Flink SQL Data Type](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/types/) 类似于 SQL 标准的数据类型术语,该术语描述了表生态系统中值的逻辑类型。它可以用于声明操作的输入和/或输出类型。
为了使 Flink SQL 能够处理来自异构数据源的数据,异构数据源的数据类型需要统一转换为 Flink SQL 数据类型。
以下是 BSON 类型和 Flink SQL 类型的映射。
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">BSON type<a href="https://docs.mongodb.com/manual/reference/bson-types/"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td></td>
<td>TINYINT</td>
</tr>
<tr>
<td></td>
<td>SMALLINT</td>
</tr>
<tr>
      <td>Int</td>
      <td>INT</td>
</tr>
<tr>
<td>Long</td>
<td>BIGINT</td>
</tr>
<tr>
<td></td>
<td>FLOAT</td>
</tr>
<tr>
<td>Double</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>Decimal128</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>Boolean</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>Date</br>Timestamp</td>
<td>DATE</td>
</tr>
<tr>
<td>Date</br>Timestamp</td>
<td>TIME</td>
</tr>
<tr>
<td>Date</td>
<td>TIMESTAMP(3)</br>TIMESTAMP_LTZ(3)</td>
</tr>
<tr>
<td>Timestamp</td>
<td>TIMESTAMP(0)</br>TIMESTAMP_LTZ(0)
</td>
</tr>
<tr>
<td>
String<br>
ObjectId<br>
UUID<br>
Symbol<br>
MD5<br>
JavaScript</br>
Regex</td>
<td>STRING</td>
</tr>
<tr>
<td>BinData</td>
<td>BYTES</td>
</tr>
<tr>
<td>Object</td>
<td>ROW</td>
</tr>
<tr>
<td>Array</td>
<td>ARRAY</td>
</tr>
<tr>
<td>DBPointer</td>
<td>ROW&lt;$ref STRING, $id STRING&gt;</td>
</tr>
<tr>
<td>
<a href="https://docs.mongodb.com/manual/reference/geojson/">GeoJSON</a>
</td>
<td>
Point : ROW&lt;type STRING, coordinates ARRAY&lt;DOUBLE&gt;&gt;</br>
Line : ROW&lt;type STRING, coordinates ARRAY&lt;ARRAY&lt; DOUBLE&gt;&gt;&gt;</br>
...
</td>
</tr>
</tbody>
</table>
</div>
参考
--------
- [MongoDB Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/)
- [Change Streams](https://docs.mongodb.com/manual/changeStreams/)
- [Replication](https://docs.mongodb.com/manual/replication/)
- [Sharding](https://docs.mongodb.com/manual/sharding/)
- [Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles)
- [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger)
- [Replica set protocol](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion)
- [Connection String Options](https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options)
- [Document Pre- and Post-Images](https://www.mongodb.com/docs/v6.0/changeStreams/#change-streams-with-document-pre--and-post-images)
- [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/)
- [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/types/)
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
File diff suppressed because it is too large
@ -0,0 +1,25 @@
---
title: Connectors
icon: <i class="fa fa-random title maindish" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 3
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
@ -1,3 +1,8 @@
---
title: CDC Connectors
bookCollapseSection: true
weight: 2
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -16,7 +21,3 @@ KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Downloads
Please see [Releases History](https://github.com/ververica/flink-cdc-connectors/releases)
@ -0,0 +1,381 @@
---
title: "Db2 CDC Connector"
weight: 9
type: docs
aliases:
- /connectors/cdc-connectors/db2-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Db2 CDC Connector
The Db2 CDC connector allows for reading snapshot data and incremental data from Db2 databases. This document
describes how to set up the Db2 CDC connector to run SQL queries against Db2 databases.
## Supported Databases
| Connector | Database | Driver |
|-----------------------|----------------------------------------------------|----------------------|
| [Db2-cdc](db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
Dependencies
------------
In order to set up the Db2 CDC connector, the following table provides dependency information for both projects
using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-db2-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download flink-sql-connector-db2-cdc-3.0-SNAPSHOT.jar and
put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-db2-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users
need to download the source code and compile the corresponding jar. Users should use the released version, such as
[flink-sql-connector-db2-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-connector-db2-cdc),
the released version will be available in the Maven central warehouse.
Setup Db2 server
----------------
Follow the steps in the [Debezium Db2 Connector](https://debezium.io/documentation/reference/1.9/connectors/db2.html#setting-up-db2).
Notes
----------------
### BOOLEAN type is not supported in SQL Replication on Db2
Only snapshots can be taken from tables with BOOLEAN type columns. Currently, SQL Replication on Db2 does not support the BOOLEAN type, so Debezium cannot perform CDC on those tables.
Consider using another type to replace the BOOLEAN type.
How to create a Db2 CDC table
----------------
The Db2 CDC table can be defined as following:
```sql
-- checkpoint every 3 seconds
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
-- register a Db2 table 'products' in Flink SQL
Flink SQL> CREATE TABLE products (
ID INT NOT NULL,
NAME STRING,
DESCRIPTION STRING,
WEIGHT DECIMAL(10,3)
) WITH (
'connector' = 'db2-cdc',
'hostname' = 'localhost',
'port' = '50000',
'username' = 'root',
'password' = '123456',
'database-name' = 'mydb',
'schema-name' = 'myschema',
'table-name' = 'products');
-- read snapshot and binlogs from products table
Flink SQL> SELECT * FROM products;
```
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 10%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 65%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'db2-cdc'</code>.</td>
</tr>
<tr>
<td>hostname</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the Db2 database server.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
      <td>Name of the Db2 database user to use when connecting to the Db2 database server.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the Db2 database server.</td>
</tr>
<tr>
<td>database-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the Db2 server to monitor.</td>
</tr>
<tr>
<td>schema-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Schema name of the Db2 database to monitor.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the Db2 database to monitor.</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">50000</td>
<td>Integer</td>
<td>Integer port number of the Db2 database server.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for Db2 CDC consumer, valid enumerations are "initial"
and "latest-offset". Please see <a href="#startup-reading-position">Startup Reading Position</a> section
for more detailed information.</td>
</tr>
<tr>
<td>server-time-zone</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The session time zone in database server, e.g. "Asia/Shanghai".
      It controls how the TIMESTAMP type in Db2 is converted to STRING.
See more <a href="https://debezium.io/documentation/reference/1.9/connectors/db2.html#db2-temporal-types">here</a>.
If not set, then ZoneId.systemDefault() is used to determine the server time zone.
</td>
</tr>
<tr>
<td>debezium.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from
Db2 server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more about the <a href="https://debezium.io/documentation/reference/1.9/connectors/db2.html#db2-connector-properties">Debezium's Db2 Connector properties</a></td>
</tr>
</tbody>
</table>
</div>
Features
--------
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for DB2 CDC consumer. The valid enumerations are:
- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continues to read the latest changes.
- `latest-offset`: Never performs a snapshot on the monitored database tables upon first startup; the connector only reads the changes made since it was started.
_Note: the mechanism of the `scan.startup.mode` option relies on Debezium's `snapshot.mode` configuration, so please do not use them together. If you specify both the `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, `scan.startup.mode` may not work._
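As a minimal sketch (hostname, credentials and table names below are placeholder values), the startup mode can be set directly in the table DDL:
```sql
CREATE TABLE products_from_latest (
    ID INT NOT NULL,
    NAME STRING
) WITH (
    'connector' = 'db2-cdc',
    'hostname' = 'localhost',
    'port' = '50000',
    'username' = 'root',
    'password' = '123456',
    'database-name' = 'mydb',
    'schema-name' = 'myschema',
    'table-name' = 'products',
    -- skip the initial snapshot and only read changes made after the job starts
    'scan.startup.mode' = 'latest-offset'
);
```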
### DataStream Source
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.connectors.db2.Db2Source;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
public class Db2SourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> db2Source =
Db2Source.<String>builder()
.hostname("yourHostname")
.port(50000)
.database("yourDatabaseName") // set captured database
.tableList("yourSchemaName.yourTableName") // set captured table
.username("yourUsername")
.password("yourPassword")
.deserializer(
new JsonDebeziumDeserializationSchema()) // converts SourceRecord to
// JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000);
env.addSource(db2Source)
.print()
.setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute("Print Db2 Snapshot + Change Stream");
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width:30%;"><a href="https://www.ibm.com/docs/en/db2/11.5?topic=elements-data-types">Db2 type</a></th>
<th class="text-left" style="width:10%;">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
<th class="text-left" style="width:60%;">NOTE</th>
</tr>
</thead>
<tbody>
<tr>
<td>
SMALLINT<br>
</td>
<td>SMALLINT</td>
<td></td>
</tr>
<tr>
<td>
INTEGER
</td>
<td>INT</td>
<td></td>
</tr>
<tr>
<td>
BIGINT
</td>
<td>BIGINT</td>
<td></td>
</tr>
<tr>
<td>
REAL
</td>
<td>FLOAT</td>
<td></td>
</tr>
<tr>
<td>
DOUBLE
</td>
<td>DOUBLE</td>
<td></td>
</tr>
<tr>
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)
</td>
<td>DECIMAL(p, s)</td>
<td></td>
</tr>
<tr>
<td>DATE</td>
<td>DATE</td>
<td></td>
</tr>
<tr>
<td>TIME</td>
<td>TIME</td>
<td></td>
</tr>
<tr>
<td>TIMESTAMP [(p)]
</td>
<td>TIMESTAMP [(p)]
</td>
<td></td>
</tr>
<tr>
<td>
CHARACTER(n)
</td>
<td>CHAR(n)</td>
<td></td>
</tr>
<tr>
<td>
VARCHAR(n)
</td>
<td>VARCHAR(n)</td>
<td></td>
</tr>
<tr>
<td>
BINARY(n)
</td>
<td>BINARY(n)</td>
<td></td>
</tr>
<tr>
<td>
VARBINARY(N)
</td>
<td>VARBINARY(N)</td>
<td></td>
</tr>
<tr>
<td>
BLOB<br>
CLOB<br>
DBCLOB<br>
</td>
<td>BYTES</td>
<td></td>
</tr>
<tr>
<td>
VARGRAPHIC<br>
XML
</td>
<td>STRING</td>
<td></td>
</tr>
</tbody>
</table>
</div>
{{< top >}}
@ -0,0 +1,693 @@
---
title: "MongoDB CDC Connector"
weight: 2
type: docs
aliases:
- /connectors/cdc-connectors/mongodb-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# MongoDB CDC Connector
The MongoDB CDC connector allows for reading snapshot data and incremental data from MongoDB. This document describes how to set up the MongoDB CDC connector to run SQL queries against MongoDB.
Dependencies
------------
In order to set up the MongoDB CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-mongodb-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-mongodb-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-mongodb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-mongodb-cdc), the released version will be available in the Maven central warehouse.
Setup MongoDB
----------------
### Availability
- MongoDB version
MongoDB version >= 3.6 <br>
We use [change streams](https://docs.mongodb.com/manual/changeStreams/) feature (new in version 3.6) to capture change data.
- Cluster Deployment
[replica sets](https://docs.mongodb.com/manual/replication/) or [sharded clusters](https://docs.mongodb.com/manual/sharding/) is required.
- Storage Engine
[WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger) storage engine is required.
- [Replica set protocol version](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion)
Replica set protocol version 1 [(pv1)](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion) is required. <br>
Starting in version 4.0, MongoDB only supports pv1. pv1 is the default for all new replica sets created with MongoDB 3.2 or later.
- Privileges
`changeStream` and `read` privileges are required by MongoDB Kafka Connector.
You can use the following example for simple authorization.<br>
For more detailed authorization, please refer to [MongoDB Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles).
```javascript
use admin;
db.createRole(
{
role: "flinkrole",
privileges: [{
// Grant privileges on all non-system collections in all databases
resource: { db: "", collection: "" },
actions: [
"splitVector",
"listDatabases",
"listCollections",
"collStats",
"find",
"changeStream" ]
}],
roles: [
// Read config.collections and config.chunks
// for sharded cluster snapshot splitting.
{ role: 'read', db: 'config' }
]
}
);
db.createUser(
{
user: 'flinkuser',
pwd: 'flinkpw',
roles: [
{ role: 'flinkrole', db: 'admin' }
]
}
);
```
How to create a MongoDB CDC table
----------------
The MongoDB CDC table can be defined as following:
```sql
-- register a MongoDB table 'products' in Flink SQL
CREATE TABLE products (
  _id STRING, -- must be declared
name STRING,
weight DECIMAL(10,3),
tags ARRAY<STRING>, -- array
price ROW<amount DECIMAL(10,2), currency STRING>, -- embedded document
suppliers ARRAY<ROW<name STRING, address STRING>>, -- embedded documents
PRIMARY KEY(_id) NOT ENFORCED
) WITH (
'connector' = 'mongodb-cdc',
'hosts' = 'localhost:27017,localhost:27018,localhost:27019',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database' = 'inventory',
'collection' = 'products'
);
-- read snapshot and change events from products collection
SELECT * FROM products;
```
**Note that**
MongoDB's change event record doesn't contain the update-before state of the document. So, we can only convert it to Flink's UPSERT changelog stream.
An upsert stream requires a unique key, so we must declare `_id` as primary key.
We can't declare any other column as primary key, because the delete operation does not contain the key and value besides `_id` and `sharding key`.
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>mongodb-cdc</code>.</td>
</tr>
<tr>
<td>scheme</td>
<td>optional</td>
<td style="word-wrap: break-word;">mongodb</td>
<td>String</td>
      <td>The connection protocol of MongoDB, e.g. <code>mongodb</code> or <code>mongodb+srv</code>.</td>
</tr>
<tr>
<td>hosts</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The comma-separated list of hostname and port pairs of the MongoDB servers.<br>
eg. <code>localhost:27017,localhost:27018</code>
</td>
</tr>
<tr>
<td>username</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the database user to be used when connecting to MongoDB.<br>
This is required only when MongoDB is configured to use authentication.
</td>
</tr>
<tr>
<td>password</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to be used when connecting to MongoDB.<br>
This is required only when MongoDB is configured to use authentication.
</td>
</tr>
<tr>
<td>database</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the database to watch for changes. If not set then all databases will be captured. <br>
The database also supports regular expressions to monitor multiple databases matching the regular expression.</td>
</tr>
<tr>
<td>collection</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the collection in the database to watch for changes. If not set then all collections will be captured.<br>
The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers.</td>
</tr>
<tr>
<td>connection.options</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The ampersand-separated <a href="https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options">connection options</a> of MongoDB. eg. <br>
<code>replicaSet=test&connectTimeoutMS=300000</code>
</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for MongoDB CDC consumer, valid enumerations are "initial", "latest-offset" and "timestamp".
Please see <a href="#startup-reading-position">Startup Reading Position</a> section for more detailed information.</td>
</tr>
<tr>
<td>scan.startup.timestamp-millis</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>Timestamp in millis of the start point, only used for <code>'timestamp'</code> startup mode.</td>
</tr>
<tr>
<td>copy.existing.queue.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">10240</td>
<td>Integer</td>
<td>The max size of the queue to use when copying data.</td>
</tr>
<tr>
<td>batch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>The cursor batch size.</td>
</tr>
<tr>
<td>poll.max.batch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>Maximum number of change stream documents to include in a single batch when polling for new data.</td>
</tr>
<tr>
<td>poll.await.time.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000</td>
<td>Integer</td>
<td>The amount of time to wait before checking for new results on the change stream.</td>
</tr>
<tr>
<td>heartbeat.interval.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">0</td>
<td>Integer</td>
<td>The length of time in milliseconds between sending heartbeat messages. Use 0 to disable.</td>
</tr>
<tr>
<td>scan.full-changelog</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether try to generate full-mode changelog based on pre- and post-images in MongoDB. Refer to <a href="#a-name-id-003-a">Full Changelog</a> for more details. Supports MongoDB 6.0 and above only.</td>
</tr>
<tr>
<td>scan.incremental.snapshot.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether enable incremental snapshot. The incremental snapshot feature only supports after MongoDB 4.0.</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.size.mb</td>
<td>optional</td>
<td style="word-wrap: break-word;">64</td>
<td>Integer</td>
<td>The chunk size mb of incremental snapshot.</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.samples</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>Integer</td>
<td>The samples count per chunk when using sample partition strategy during incremental snapshot.</td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to close idle readers at the end of the snapshot phase. <br>
The flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.<br>
If the flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true,
so it does not need to be explicitly configured 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true'
</td>
</tr>
<tr>
<td>scan.cursor.no-timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>MongoDB server normally times out idle cursors after an inactivity period (10 minutes) to prevent excess memory use. Set this option to true to prevent that. Only available when parallelism snapshot is enabled.</td>
</tr>
</tbody>
</table>
</div>
Note: it is highly recommended to set `heartbeat.interval.ms` to a proper value larger than 0 **if the collection changes slowly**.
The heartbeat event can push the `resumeToken` forward to avoid `resumeToken` being expired when we recover the Flink job from a checkpoint or savepoint.
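For example, a minimal sketch (the connection values below are placeholders) that sets a 30-second heartbeat in the table DDL:
```sql
CREATE TABLE products_with_heartbeat (
  _id STRING,
  name STRING,
  PRIMARY KEY(_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'database' = 'inventory',
  'collection' = 'products',
  -- emit a heartbeat every 30 seconds to keep the resumeToken moving forward
  'heartbeat.interval.ms' = '30000'
);
```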
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
      <td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>collection_name</td>
<td>STRING NOT NULL</td>
      <td>Name of the collection that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>If the record is read from snapshot of the table instead of the change stream, the value is always 0.</td>
</tr>
</tbody>
</table>
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
db_name STRING METADATA FROM 'database_name' VIRTUAL,
collection_name STRING METADATA FROM 'collection_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
  _id STRING, -- must be declared
name STRING,
weight DECIMAL(10,3),
tags ARRAY<STRING>, -- array
price ROW<amount DECIMAL(10,2), currency STRING>, -- embedded document
suppliers ARRAY<ROW<name STRING, address STRING>>, -- embedded documents
PRIMARY KEY(_id) NOT ENFORCED
) WITH (
'connector' = 'mongodb-cdc',
'hosts' = 'localhost:27017,localhost:27018,localhost:27019',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database' = 'inventory',
'collection' = 'products'
);
```
Features
--------
### Exactly-Once Processing
The MongoDB CDC connector is a Flink Source connector which will read database snapshot first and then continues to read change stream events with **exactly-once processing** even failures happen.
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for MongoDB CDC consumer. The valid enumerations are:
- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest oplog.
- `latest-offset`: Never to perform snapshot on the monitored database tables upon first startup, just read from
the end of the oplog which means only have the changes since the connector was started.
- `timestamp`: Skip snapshot phase and start reading oplog events from a specific timestamp.
For example in DataStream API:
```java
MongoDBSource.builder()
.startupOptions(StartupOptions.latest()) // Start from latest offset
    .startupOptions(StartupOptions.timestamp(1667232000000L)) // Start from timestamp
.build()
```
and with SQL:
```SQL
CREATE TABLE mongodb_source (...) WITH (
'connector' = 'mongodb-cdc',
'scan.startup.mode' = 'latest-offset', -- Start from latest offset
...
'scan.startup.mode' = 'timestamp', -- Start from timestamp
'scan.startup.timestamp-millis' = '1667232000000' -- Timestamp under timestamp startup mode
...
)
```
### Change Streams
We integrate the [MongoDB's official Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/) to read snapshot or change events from MongoDB and drive it by Debezium's `EmbeddedEngine`.
Debezium's `EmbeddedEngine` provides a mechanism for running a single Kafka Connect `SourceConnector` within an application's process, and it can drive any standard Kafka Connect `SourceConnector` properly even which is not provided by Debezium.
We choose **MongoDB's official Kafka Connector** instead of the **Debezium's MongoDB Connector** because they use a different change data capture mechanism.
- For Debezium's MongoDB Connector, it reads the `oplog.rs` collection of each replica-set's master node.
- For MongoDB's Kafka Connector, it subscribes to MongoDB's `Change Stream`.
MongoDB's `oplog.rs` collection doesn't keep the update-before state of the changed record, so it's hard to extract the full document state from a single `oplog.rs` record and convert it to the changelog stream accepted by Flink (Insert Only, Upsert, All).
Additionally, MongoDB 5 (released in July 2021) has changed the oplog format, so the current Debezium connector cannot be used with it.
**Change Stream** is a new feature provided by MongoDB 3.6 for replica sets and sharded clusters that allows applications to access real-time data changes without the complexity and risk of tailing the oplog.<br>
Applications can use change streams to subscribe to all data changes on a single collection, a database, or an entire deployment, and immediately react to them.
**Lookup Full Document for Update Operations** is a feature provided by **Change Stream** which can configure the change stream to return the most current majority-committed version of the updated document. Because of this feature, we can easily collect the latest full document and convert the change log to Flink's **Upsert Changelog Stream**.
By the way, Debezium's MongoDB change streams exploration mentioned in [DBZ-435](https://issues.redhat.com/browse/DBZ-435) is on the roadmap.<br>
If it's done, we can consider integrating two kinds of source connector for users to choose.
### DataStream Source
The MongoDB CDC connector can also be a DataStream source. You can create a SourceFunction as the following shows:
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.mongodb.MongoDBSource;
public class MongoDBSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = MongoDBSource.<String>builder()
.hosts("localhost:27017")
.username("flink")
.password("flinkpw")
.databaseList("inventory") // set captured database, support regex
.collectionList("inventory.products", "inventory.orders") //set captured collections, support regex
.deserializer(new JsonDebeziumDeserializationSchema())
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.addSource(sourceFunction)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute();
}
}
```
The MongoDB CDC incremental connector (after 2.3.0) can be used as the following shows:
```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.cdc.connectors.mongodb.source.MongoDBSource;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
public class MongoDBIncrementalSourceExample {
public static void main(String[] args) throws Exception {
MongoDBSource<String> mongoSource =
MongoDBSource.<String>builder()
.hosts("localhost:27017")
.databaseList("inventory") // set captured database, support regex
.collectionList("inventory.products", "inventory.orders") //set captured collections, support regex
.username("flink")
.password("flinkpw")
.deserializer(new JsonDebeziumDeserializationSchema())
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000);
// set the source parallelism to 2
env.fromSource(mongoSource, WatermarkStrategy.noWatermarks(), "MongoDBIncrementalSource")
.setParallelism(2)
.print()
.setParallelism(1);
env.execute("Print MongoDB Snapshot + Change Stream");
}
}
```
**Note:**
- If database regex is used, `readAnyDatabase` role is required.
- The incremental snapshot feature is only supported for MongoDB 4.0 and above (see the SQL sketch below).
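A minimal sketch of enabling the incremental snapshot in Flink SQL (connection values are placeholders; the option names come from the connector options table above):
```sql
CREATE TABLE products_incremental (
  _id STRING,
  name STRING,
  PRIMARY KEY(_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'localhost:27017',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database' = 'inventory',
  'collection' = 'products',
  -- enable incremental snapshot, requires MongoDB 4.0 or above
  'scan.incremental.snapshot.enabled' = 'true',
  -- optional: tune the chunk size (in MB) used for snapshot splitting
  'scan.incremental.snapshot.chunk.size.mb' = '64'
);
```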
### Full Changelog<a name="Full Changelog" id="003" ></a>
MongoDB 6.0 and above supports emitting change stream events containing document before and after the change was made (aka. pre- and post-images).
- The pre-image is the document before it was replaced, updated, or deleted. There is no pre-image for an inserted document.
- The post-image is the document after it was inserted, replaced, or updated. There is no post-image for a deleted document.
MongoDB CDC can make use of pre- and post-images to generate a full-mode changelog stream including Insert, Update Before, Update After, and Delete data rows, thereby avoiding an additional `ChangelogNormalize` node downstream.
To enable this feature, there are some prerequisites:
- MongoDB version must be 6.0 or above;
- Enable `preAndPostImages` feature at the database level:
```javascript
db.runCommand({
setClusterParameter: {
changeStreamOptions: {
preAndPostImages: {
expireAfterSeconds: 'off' // replace with custom image expiration time
}
}
}
})
```
- Enable `changeStreamPreAndPostImages` feature for collections to be monitored:
```javascript
db.runCommand({
collMod: "<< collection name >>",
changeStreamPreAndPostImages: {
enabled: true
}
})
```
- Enable MongoDB CDC's `scan.full-changelog` feature:
```java
MongoDBSource.builder()
.scanFullChangelog(true)
...
.build()
```
or with Flink SQL:
```SQL
CREATE TABLE mongodb_source (...) WITH (
'connector' = 'mongodb-cdc',
'scan.full-changelog' = 'true',
...
)
```
Data Type Mapping
----------------
[BSON](https://docs.mongodb.com/manual/reference/bson-types/) short for **Binary JSON** is a binary-encoded serialization of JSON-like format used to store documents and make remote procedure calls in MongoDB.
[Flink SQL Data Type](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/) is similar to the SQL standards data type terminology which describes the logical type of a value in the table ecosystem. It can be used to declare input and/or output types of operations.
In order to enable Flink SQL to process data from heterogeneous data sources, the data types of heterogeneous data sources need to be uniformly converted to Flink SQL data types.
The following is the mapping of BSON type and Flink SQL type.
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">BSON type<a href="https://docs.mongodb.com/manual/reference/bson-types/"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td></td>
<td>TINYINT</td>
</tr>
<tr>
<td></td>
<td>SMALLINT</td>
</tr>
<tr>
      <td>Int</td>
      <td>INT</td>
</tr>
<tr>
<td>Long</td>
<td>BIGINT</td>
</tr>
<tr>
<td></td>
<td>FLOAT</td>
</tr>
<tr>
<td>Double</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>Decimal128</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>Boolean</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>Date</br>Timestamp</td>
<td>DATE</td>
</tr>
<tr>
<td>Date</br>Timestamp</td>
<td>TIME</td>
</tr>
<tr>
<td>Date</td>
<td>TIMESTAMP(3)</br>TIMESTAMP_LTZ(3)</td>
</tr>
<tr>
<td>Timestamp</td>
<td>TIMESTAMP(0)</br>TIMESTAMP_LTZ(0)
</td>
</tr>
<tr>
<td>
String<br>
ObjectId<br>
UUID<br>
Symbol<br>
MD5<br>
JavaScript</br>
Regex</td>
<td>STRING</td>
</tr>
<tr>
<td>BinData</td>
<td>BYTES</td>
</tr>
<tr>
<td>Object</td>
<td>ROW</td>
</tr>
<tr>
<td>Array</td>
<td>ARRAY</td>
</tr>
<tr>
<td>DBPointer</td>
<td>ROW&lt;$ref STRING, $id STRING&gt;</td>
</tr>
<tr>
<td>
<a href="https://docs.mongodb.com/manual/reference/geojson/">GeoJSON</a>
</td>
<td>
Point : ROW&lt;type STRING, coordinates ARRAY&lt;DOUBLE&gt;&gt;</br>
Line : ROW&lt;type STRING, coordinates ARRAY&lt;ARRAY&lt; DOUBLE&gt;&gt;&gt;</br>
...
</td>
</tr>
</tbody>
</table>
</div>
Reference
--------
- [MongoDB Kafka Connector](https://docs.mongodb.com/kafka-connector/current/kafka-source/)
- [Change Streams](https://docs.mongodb.com/manual/changeStreams/)
- [Replication](https://docs.mongodb.com/manual/replication/)
- [Sharding](https://docs.mongodb.com/manual/sharding/)
- [Database User Roles](https://docs.mongodb.com/manual/reference/built-in-roles/#database-user-roles)
- [WiredTiger](https://docs.mongodb.com/manual/core/wiredtiger/#std-label-storage-wiredtiger)
- [Replica set protocol](https://docs.mongodb.com/manual/reference/replica-configuration/#mongodb-rsconf-rsconf.protocolVersion)
- [Connection String Options](https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options)
- [Document Pre- and Post-Images](https://www.mongodb.com/docs/v6.0/changeStreams/#change-streams-with-document-pre--and-post-images)
- [BSON Types](https://docs.mongodb.com/manual/reference/bson-types/)
- [Flink DataTypes](https://nightlies.apache.org/flink/flink-docs-release-1.17/docs/dev/table/types/)
{{< top >}}
File diff suppressed because it is too large
@ -1,3 +1,10 @@
---
title: "OceanBase CDC Connector"
weight: 4
type: docs
aliases:
- /connectors/cdc-connectors/oceanbase-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,24 +24,18 @@ specific language governing permissions and limitations
under the License.
-->
# OceanBase CDC 连接器
# OceanBase CDC Connector
OceanBase CDC 连接器允许从 OceanBase 读取快照数据和增量数据。本文介绍了如何设置 OceanBase CDC 连接器以对 OceanBase 进行 SQL 查询。
The OceanBase CDC connector allows for reading snapshot data and incremental data from OceanBase. This document describes how to set up the OceanBase CDC connector to run SQL queries against OceanBase.
## 依赖
Dependencies
------------
为了使用 OceanBase CDC 连接器,您必须提供相关的依赖信息。以下依赖信息适用于使用自动构建工具(如 Maven 或 SBT构建的项目和带有 SQL JAR 包的 SQL 客户端。
In order to set up the OceanBase CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
```xml
<dependency>
<groupId>com.ververica</groupId>
<artifactId>flink-connector-oceanbase-cdc</artifactId>
<!-- 请使用已发布的版本依赖snapshot 版本的依赖需要本地自行编译。 -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
{{< artifact flink-connector-oceanbase-cdc >}}
如果您是要连接企业版的 OceanBase您可能需要使用 OceanBase 官方的 JDBC 驱动,这时需要引入如下依赖。
If you want to use OceanBase JDBC driver to connect to the enterprise edition database, you should also include the following dependency in your class path.
```xml
<dependency>
@ -44,51 +45,57 @@ OceanBase CDC 连接器允许从 OceanBase 读取快照数据和增量数据。
</dependency>
```
## 下载 SQL 客户端 JAR 包
### SQL Client JAR
```Download link is available only for stable releases.```
```下载链接仅在已发布版本可用,请在文档网站左下角选择浏览已发布的版本。```
Download [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
下载[flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/2.5-SNAPSHOT/flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar) 到 `<FLINK_HOME>/lib/` 目录下。
**Note:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT version is the code corresponding to the development branch. Users need to download the source code and compile the corresponding jar. Users should use the released version, such as [flink-sql-connector-oceanbase-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oceanbase-cdc), the released version will be available in the Maven central warehouse.
**注意:** flink-sql-connector-oceanbase-cdc-XXX-SNAPSHOT 版本是开发分支`release-XXX`对应的快照版本,快照版本用户需要下载源代码并编译相应的 jar。用户应使用已经发布的版本例如 [flink-sql-connector-oceanbase-cdc-2.3.0.jar](https://mvnrepository.com/artifact/com.ververica/flink-sql-connector-oceanbase-cdc) 当前已发布的所有版本都可以在 Maven 中央仓库获取。
For JDBC driver, the cdc jar above already contains MySQL JDBC driver 5.1.47, which is our recommended version. Due to the license issue, we can not include the OceanBase JDBC driver in the cdc jar. If you need to use it, you can download it from [here](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar) and put it under `<FLINK_HOME>/lib/`, you also need to set the start option `jdbc.driver` to `com.oceanbase.jdbc.Driver`.
对于 JDBC 驱动,上述的 cdc jar 文件中已经包含了我们推荐的 MySQL 驱动版本 5.1.47。由于开源许可证的原因,我们不能在上述 cdc jar 文件中包含 OceanBase 的官方 JDBC 驱动,如果您需要使用它,可以从[这里](https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.2/oceanbase-client-2.4.2.jar)下载,然后放到 `<FLINK_HOME>/lib/` 目录下,同时需要将配置项 `jdbc.driver` 设为 `com.oceanbase.jdbc.Driver`
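A hedged sketch of passing that option in the table DDL (all connection values are placeholders; other required options such as the startup mode and the LogProxy address are elided with `...`):
```sql
CREATE TABLE orders (...) WITH (
    'connector' = 'oceanbase-cdc',
    'username' = 'user@test_tenant',
    'password' = 'pswd',
    'tenant-name' = 'test_tenant',
    'table-list' = 'test_db.orders',
    -- use the OceanBase official JDBC driver placed under <FLINK_HOME>/lib/
    'jdbc.driver' = 'com.oceanbase.jdbc.Driver',
    ...
);
```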
Setup OceanBase and LogProxy Server
----------------------
### 配置 OceanBase 数据库和 oblogproxy 服务
1. Set up the OceanBase cluster following the [doc](https://github.com/oceanbase/oceanbase#quick-start).
1. 按照 [文档](https://github.com/oceanbase/oceanbase#quick-start) 配置 OceanBase 集群。
2. 在 sys 租户中,为 oblogproxy 创建一个带密码的用户。
2. Create a user with password in `sys` tenant, this user is used in OceanBase LogProxy.
```bash
```shell
mysql -h${host} -P${port} -uroot
mysql> SHOW TENANT;
mysql> CREATE USER ${sys_username} IDENTIFIED BY '${sys_password}';
mysql> GRANT ALL PRIVILEGES ON *.* TO ${sys_username} WITH GRANT OPTION;
```
3. 为你想要监控的租户创建一个用户,这个用户用来读取快照数据和变化事件数据。
4. OceanBase 社区版用户需要获取`rootserver-list`,可以使用以下命令获取:
3. Create a user in the tenant you want to monitor, this is used to read data for snapshot and change event.
```bash
mysql> SHOW PARAMETERS LIKE 'rootservice_list';
4. For users of OceanBase Community Edition, you need to get the `rootserver-list`. You can use the following command to get the value:
```shell
mysql> show parameters like 'rootservice_list';
```
OceanBase 企业版用户需要获取 `config-url`,可以使用以下命令获取:
For users of OceanBase Enterprise Edition, you need to get the `config-url`. You can use the following command to get the value:
```shell
mysql> show parameters like 'obconfig_url';
```
5. 按照 [文档](https://github.com/oceanbase/oblogproxy#getting-started) 配置 oblogproxy。
5. Setup OceanBase LogProxy. For users of OceanBase Community Edition, you can follow the [quick start](https://github.com/oceanbase/oblogproxy#getting-started).
## 创建 OceanBase CDC 表
How to create an OceanBase CDC table
----------------
使用以下命令,创建 OceanBase CDC 表:
The OceanBase CDC table can be defined as following:
```sql
-- 每 3 秒做一次 checkpoint用于测试生产配置建议 5 到 10 分钟
-- checkpoint every 3000 milliseconds
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
-- 在 Flink SQL 中创建 OceanBase 表 `orders`
-- register a OceanBase table 'orders' in Flink SQL
Flink SQL> CREATE TABLE orders (
order_id INT,
order_date TIMESTAMP(0),
@ -113,11 +120,11 @@ Flink SQL> CREATE TABLE orders (
'working-mode' = 'memory'
);
-- 从表 orders 中读取快照数据和 binlog 数据
-- read snapshot and binlogs from orders table
Flink SQL> SELECT * FROM orders;
```
如果您使用的是企业版的 OceanBase Oracle 模式,您需要先添加 OceanBase 的官方 JDBC 驱动 jar 包到 Flink 环境,并且部署企业版的 oblogproxy 服务,然后通过以下命令创建 OceanBase CDC 表:
If you want to use OceanBase Oracle mode, you need to add the OceanBase jdbc jar file to Flink and set up the enterprise edition of oblogproxy, then you can create a table in Flink as following:
```sql
Flink SQL> CREATE TABLE orders (
@ -147,237 +154,235 @@ Flink SQL> CREATE TABLE orders (
);
```
您也可以访问 Flink CDC 官网文档,快速体验将数据从 OceanBase 导入到 Elasticsearch。更多信息参考 [Flink CDC 官网文档](https://ververica.github.io/flink-cdc-connectors/release-2.2/content/%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B/oceanbase-tutorial-zh.html)。
You can also try the quickstart tutorial that sync data from OceanBase to Elasticsearch, please refer [Flink CDC Tutorial](https://ververica.github.io/flink-cdc-connectors/release-2.3//content/quickstart/oceanbase-tutorial.html) for more information.
## OceanBase CDC 连接器选项
Connector Options
----------------
OceanBase CDC 连接器包括用于 SQL 和 DataStream API 的选项,如下表所示。
The OceanBase CDC Connector contains some options for both sql and stream api as the following sheet.
*注意*:连接器支持两种方式来指定需要监听的表,两种方式同时使用时会监听两种方式匹配的所有表。
1. 使用 `database-name``table-name` 匹配正则表达式中的数据库和表名。 由于`obcdc`(以前的`liboblog`)现在只支持`fnmatch`匹配,我们不能直接使用正则过滤 changelog 事件,所以通过两个选项去匹配去指定监听表只能在`initial`启动模式下使用。
2. 使用 `table-list` 去匹配数据库名和表名的准确列表。
*Note*: The connector supports two ways to specify the table list to listen to, and will get the union of the results when both ways are used at the same time.
1. Use `database-name` and `table-name` to match database and table names in regex. As the `obcdc` (former `liboblog`) only supports `fnmatch` now, we can't use regex directly to filter change events, so these two options can only be used in `initial` startup mode.
2. Use `table-list` to match the exact value of database and table names.
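A hedged sketch of the two ways (the database and table names are placeholders; other required options are elided with `...`):
```sql
-- 1. match databases and tables by regex, only usable in 'initial' startup mode
CREATE TABLE orders_by_regex (...) WITH (
    'connector' = 'oceanbase-cdc',
    'database-name' = '^test_db$',
    'table-name' = '^orders$',
    ...
);

-- 2. match an exact list of fully qualified table names
CREATE TABLE orders_by_list (...) WITH (
    'connector' = 'oceanbase-cdc',
    'table-list' = 'test_db.orders, test_db.order_items',
    ...
);
```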
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 10%">配置项</th>
<th class="text-left" style="width: 8%">是否必选</th>
<th class="text-left" style="width: 7%">默认值</th>
<th class="text-left" style="width: 10%">类型</th>
<th class="text-left" style="width: 65%">描述</th>
<th class="text-left" style="width: 10%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 65%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td></td>
<td style="word-wrap: break-word;"></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>指定要使用的连接器,此处为 <code>'oceanbase-cdc'</code></td>
<td>Specify what connector to use, here should be <code>'oceanbase-cdc'</code>.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify the startup mode for the OceanBase CDC consumer; valid enumerations are
<code>'initial'</code>, <code>'latest-offset'</code> or <code>'timestamp'</code>.
</td>
</tr>
<tr>
<td>scan.startup.timestamp</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>Timestamp in seconds of the start point, only used for <code>'timestamp'</code> startup mode.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Username to be used when connecting to OceanBase.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to be used when connecting to OceanBase.</td>
</tr>
<tr>
<td>tenant-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Tenant name of OceanBase to monitor, should be an exact value.</td>
</tr>
<tr>
<td>database-name</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of OceanBase to monitor, should be a regular expression. Can only be used with 'initial' mode.</td>
</tr>
<tr>
<td>table-name</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of OceanBase to monitor, should be a regular expression. Can only be used with 'initial' mode.</td>
</tr>
<tr>
<td>table-list</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>List of full names of tables, separated by commas, e.g. "db1.table1, db2.table2".</td>
</tr>
<tr>
<td>hostname</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the OceanBase database server or OceanBase Proxy server.</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>Integer port number to connect to OceanBase. It can be the SQL port of OceanBase server, which is 2881 by default, or the port of OceanBase proxy service, which is 2883 by default.</td>
</tr>
<tr>
<td>connect.timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">30s</td>
<td>Duration</td>
<td>The maximum time that the connector should wait after trying to connect to the OceanBase database server before timing out.</td>
</tr>
<tr>
<td>server-time-zone</td>
<td>optional</td>
<td style="word-wrap: break-word;">+00:00</td>
<td>String</td>
<td>The session timezone which controls how temporal types are converted to STRING in OceanBase. Can be UTC offset in format "±hh:mm", or named time zones if the time zone information tables in the mysql database have been created and populated.</td>
</tr>
<tr>
<td>logproxy.host</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Hostname or IP address of OceanBase log proxy service.</td>
</tr>
<tr>
<td>logproxy.port</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>Port number of OceanBase log proxy service.</td>
</tr>
<tr>
<td>logproxy.client.id</td>
<td>optional</td>
<td style="word-wrap: break-word;">By rule.</td>
<td>String</td>
<td>Id of a log proxy client connection, will be in format {flink_ip}_{process_id}_{timestamp}_{thread_id}_{tenant} by default.</td>
</tr>
<tr>
<td>rootserver-list</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The semicolon-separated list of OceanBase root servers in format `ip:rpc_port:sql_port`, required for OceanBase CE.</td>
</tr>
<tr>
<td>config-url</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The URL to get the server info from the config server, required for OceanBase EE.</td>
</tr>
<tr>
<td>working-mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">storage</td>
<td>String</td>
<td>Working mode of `obcdc` in LogProxy, can be `storage` or `memory`.</td>
</tr>
<tr>
<td>compatible-mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">mysql</td>
<td>String</td>
<td>Compatible mode of OceanBase, can be `mysql` or `oracle`.</td>
</tr>
<tr>
<td>jdbc.driver</td>
<td>optional</td>
<td style="word-wrap: break-word;">com.mysql.jdbc.Driver</td>
<td>String</td>
<td>JDBC driver class for snapshot reading.</td>
</tr>
<tr>
<td>jdbc.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Option to pass custom JDBC URL properties. User can pass custom properties like 'jdbc.properties.useSSL' = 'false'.</td>
</tr>
<tr>
<td>obcdc.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Option to pass custom configurations to the <code>libobcdc</code>, eg: 'obcdc.properties.sort_trans_participants' = '1'. Please refer to <a href="https://en.oceanbase.com/docs/common-oceanbase-database-10000000000872541">obcdc parameters</a> for more details.</td>
</tr>
</tbody>
</table>
</div>
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">列名</th>
<th class="text-left" style="width: 30%">数据类型</th>
<th class="text-left" style="width: 55%">描述</th>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>tenant_name</td>
<td>STRING NOT NULL</td>
<td>Name of the tenant that contains the row.</td>
</tr>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>table_name</td>
<td>STRING NOT NULL</td>
<td>Name of the table that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>
If the record is read from the snapshot of the table instead of the change stream, the value is always 0.</td>
</tr>
</tbody>
</table>
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
@ -404,32 +409,35 @@ CREATE TABLE products (
'port' = '2881',
'rootserver-list' = '127.0.0.1:2882:2881',
'logproxy.host' = '127.0.0.1',
'logproxy.port' = '2983',
'working-mode' = 'memory'
);
```
Features
--------
### At-Least-Once Processing
The OceanBase CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events with **at-least-once processing**.
OceanBase is a distributed database whose log files are spread over different servers. As there is no position information like the MySQL binlog offset, we can only use a timestamp as the position mark. In order to ensure the completeness of reading data, `liboblog` (a C++ library that reads OceanBase log records) might read some log data before the given timestamp. So we may read duplicate data whose timestamp is around the start point, and only 'at-least-once' processing can be guaranteed.
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for the OceanBase CDC consumer. The valid enumerations are:
- `initial` (default): Performs an initial snapshot on the monitored table upon first startup, and continues to read the latest commit log.
- `latest-offset`: Never performs a snapshot on the monitored table upon first startup, and just reads the latest commit log since the connector is started.
- `timestamp`: Never performs a snapshot on the monitored table upon first startup, and just reads the commit log from the given `scan.startup.timestamp`, as the sketch after this list shows.
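For example, a minimal sketch of starting from a given timestamp; the connection values and the table `test_db.orders` below are placeholders, not a verified setup:
```sql
Flink SQL> CREATE TABLE orders_from_ts (
    order_id INT,
    order_date TIMESTAMP(0),
    PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
    'connector' = 'oceanbase-cdc',
    -- skip the snapshot phase and read commit logs from the given epoch seconds
    'scan.startup.mode' = 'timestamp',
    'scan.startup.timestamp' = '1672531200',
    -- 'database-name'/'table-name' only work in 'initial' mode, so use 'table-list' here
    'table-list' = 'test_db.orders',
    'username' = 'user@test_tenant',
    'password' = 'pswd',
    'tenant-name' = 'test_tenant',
    'hostname' = '127.0.0.1',
    'port' = '2881',
    'rootserver-list' = '127.0.0.1:2882:2881',
    'logproxy.host' = '127.0.0.1',
    'logproxy.port' = '2983',
    'working-mode' = 'memory'
);
```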
### Consume Commit Log
The OceanBase CDC connector uses [oblogclient](https://github.com/oceanbase/oblogclient) to consume commit logs from OceanBase LogProxy.
### DataStream Source
The OceanBase CDC connector can also be used as a DataStream source. You can create a SourceFunction as shown below:
```java
import org.apache.flink.api.common.typeinfo.TypeInformation;
@ -503,27 +511,25 @@ public class OceanBaseSourceExample {
}
}
```
Data Type Mapping
----------------
### MySQL Mode
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">OceanBase 数据类型</th>
<th class="text-left">Flink SQL 类型</th>
<th class="text-left">描述</th>
<th class="text-left">OceanBase type</th>
<th class="text-left">Flink SQL type</th>
<th class="text-left">NOTE</th>
</tr>
</thead>
<tbody>
<tr>
<td>BOOLEAN<br>
TINYINT(1)<br>
BIT(1)</td>
<td>BOOLEAN</td>
<td></td>
</tr>
@ -535,8 +541,7 @@ public class OceanBaseSourceExample {
<tr>
<td>
SMALLINT<br>
TINYINT UNSIGNED</td>
<td>SMALLINT</td>
<td></td>
</tr>
@ -544,16 +549,14 @@ public class OceanBaseSourceExample {
<td>
INT<br>
MEDIUMINT<br>
SMALLINT UNSIGNED</td>
<td>INT</td>
<td></td>
</tr>
<tr>
<td>
BIGINT<br>
INT UNSIGNED</td>
<td>BIGINT</td>
<td></td>
</tr>
@ -565,7 +568,7 @@ public class OceanBaseSourceExample {
<tr>
<td>
REAL<br>
FLOAT<br>
</td>
<td>FLOAT</td>
<td></td>
@ -581,7 +584,8 @@ public class OceanBaseSourceExample {
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)<br>
where p <= 38<br>
</td>
<td>DECIMAL(p, s)</td>
<td></td>
</tr>
@ -589,13 +593,13 @@ public class OceanBaseSourceExample {
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)<br>
where 38 < p <=65<br>
</td>
<td>STRING</td>
<td>DECIMAL is equivalent to NUMERIC. The precision for DECIMAL data type is up to 65 in OceanBase, but
the precision for DECIMAL is limited to 38 in Flink.
So if you define a decimal column whose precision is greater than 38, you should map it to STRING to
avoid precision loss.</td>
</tr>
<tr>
<td>DATE</td>
@ -629,7 +633,7 @@ public class OceanBaseSourceExample {
</tr>
<tr>
<td>BIT(n)</td>
<td>BINARY(⌈(n + 7) / 8⌉)</td>
<td></td>
</tr>
<tr>
@ -647,7 +651,7 @@ public class OceanBaseSourceExample {
TINYTEXT<br>
TEXT<br>
MEDIUMTEXT<br>
LONGTEXT<br>
</td>
<td>STRING</td>
<td></td>
@ -657,7 +661,7 @@ public class OceanBaseSourceExample {
TINYBLOB<br>
BLOB<br>
MEDIUMBLOB<br>
LONGBLOB<br>
</td>
<td>BYTES</td>
<td></td>
@ -675,21 +679,18 @@ public class OceanBaseSourceExample {
<tr>
<td>SET</td>
<td>ARRAY&lt;STRING&gt;</td>
<td>As the SET data type in OceanBase is a string object that can have zero or more values, it should always be mapped to an array of strings.</td>
</tr>
<tr>
<td>JSON</td>
<td>STRING</td>
<td>The JSON data type will be converted into STRING with JSON format in Flink.</td>
</tr>
</tbody>
</table>
</div>
### Oracle Mode
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
@ -785,3 +786,5 @@ public class OceanBaseSourceExample {
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,701 @@
---
title: "Oracle CDC Connector"
weight: 5
type: docs
aliases:
- /connectors/cdc-connectors/oracle-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Oracle CDC Connector
The Oracle CDC connector allows for reading snapshot data and incremental data from Oracle databases. This document describes how to set up the Oracle CDC connector to run SQL queries against Oracle databases.
Dependencies
------------
In order to set up the Oracle CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-oracle-cdc >}}
### SQL Client JAR
**Download link is available only for stable releases.**
Download [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-oracle-cdc-XXX-SNAPSHOT versions correspond to the code of the development branch, and users need to download the source code and compile the corresponding jar themselves. Users are recommended to use released versions instead, such as [flink-sql-connector-oracle-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-oracle-cdc), which are available in the Maven central repository.
Setup Oracle
----------------
You have to enable log archiving for the Oracle database and define an Oracle user with appropriate permissions on all databases that the Debezium Oracle connector monitors.
### For Non-CDB database
1. Enable log archiving
(1.1). Connect to the database as DBA
```sql
ORACLE_SID=SID
export ORACLE_SID
sqlplus /nolog
CONNECT sys/password AS SYSDBA
```
(1.2). Enable log archiving
```sql
alter system set db_recovery_file_dest_size = 10G;
alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile;
shutdown immediate;
startup mount;
alter database archivelog;
alter database open;
```
**Note:**
- Enabling log archiving requires a database restart, so pay attention when doing it
- The archived logs will occupy a large amount of disk space, so consider cleaning up expired logs periodically
(1.3). Check whether log archiving is enabled
```sql
-- Should now "Database log mode: Archive Mode"
archive log list;
```
**Note:**
Supplemental logging must be enabled for captured tables or the database in order for data changes to capture the <em>before</em> state of changed database rows.
The following illustrates how to configure this on the table/database level.
```sql
-- Enable supplemental logging for a specific table:
ALTER TABLE inventory.customers ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
```
```sql
-- Enable supplemental logging for database
ALTER DATABASE ADD SUPPLEMENTAL LOG DATA;
```
2. Create an Oracle user with permissions
(2.1). Create Tablespace
```sql
sqlplus sys/password@host:port/SID AS SYSDBA;
CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/SID/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED;
exit;
```
(2.2). Create a user and grant permissions
```sql
sqlplus sys/password@host:port/SID AS SYSDBA;
CREATE USER flinkuser IDENTIFIED BY flinkpw DEFAULT TABLESPACE LOGMINER_TBS QUOTA UNLIMITED ON LOGMINER_TBS;
GRANT CREATE SESSION TO flinkuser;
GRANT SET CONTAINER TO flinkuser;
GRANT SELECT ON V_$DATABASE to flinkuser;
GRANT FLASHBACK ANY TABLE TO flinkuser;
GRANT SELECT ANY TABLE TO flinkuser;
GRANT SELECT_CATALOG_ROLE TO flinkuser;
GRANT EXECUTE_CATALOG_ROLE TO flinkuser;
GRANT SELECT ANY TRANSACTION TO flinkuser;
GRANT LOGMINING TO flinkuser;
GRANT ANALYZE ANY TO flinkuser;
GRANT CREATE TABLE TO flinkuser;
-- no need to execute if scan.incremental.snapshot.enabled=true (default)
GRANT LOCK ANY TABLE TO flinkuser;
GRANT ALTER ANY TABLE TO flinkuser;
GRANT CREATE SEQUENCE TO flinkuser;
GRANT EXECUTE ON DBMS_LOGMNR TO flinkuser;
GRANT EXECUTE ON DBMS_LOGMNR_D TO flinkuser;
GRANT SELECT ON V_$LOG TO flinkuser;
GRANT SELECT ON V_$LOG_HISTORY TO flinkuser;
GRANT SELECT ON V_$LOGMNR_LOGS TO flinkuser;
GRANT SELECT ON V_$LOGMNR_CONTENTS TO flinkuser;
GRANT SELECT ON V_$LOGMNR_PARAMETERS TO flinkuser;
GRANT SELECT ON V_$LOGFILE TO flinkuser;
GRANT SELECT ON V_$ARCHIVED_LOG TO flinkuser;
GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO flinkuser;
exit;
```
### For CDB database
Overall, the steps for configuring a CDB database are quite similar to those for a non-CDB database, but the commands may be different.
1. Enable log archiving
```sql
ORACLE_SID=ORCLCDB
export ORACLE_SID
sqlplus /nolog
CONNECT sys/password AS SYSDBA
alter system set db_recovery_file_dest_size = 10G;
-- should exist
alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile;
shutdown immediate
startup mount
alter database archivelog;
alter database open;
-- Should show "Database log mode: Archive Mode"
archive log list
exit;
```
**Note:**
You can also use the following commands to enable supplemental logging:
```sql
-- Enable supplemental logging for a specific table:
ALTER TABLE inventory.customers ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
-- Enable supplemental logging for database
ALTER DATABASE ADD SUPPLEMENTAL LOG DATA;
```
2. Create an Oracle user with permissions
```sql
sqlplus sys/password@//localhost:1521/ORCLCDB as sysdba
CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED;
exit
```
```sql
sqlplus sys/password@//localhost:1521/ORCLPDB1 as sysdba
CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/ORCLPDB1/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED;
exit
```
```sql
sqlplus sys/password@//localhost:1521/ORCLCDB as sysdba
CREATE USER flinkuser IDENTIFIED BY flinkpw DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs CONTAINER=ALL;
GRANT CREATE SESSION TO flinkuser CONTAINER=ALL;
GRANT SET CONTAINER TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$DATABASE to flinkuser CONTAINER=ALL;
GRANT FLASHBACK ANY TABLE TO flinkuser CONTAINER=ALL;
GRANT SELECT ANY TABLE TO flinkuser CONTAINER=ALL;
GRANT SELECT_CATALOG_ROLE TO flinkuser CONTAINER=ALL;
GRANT EXECUTE_CATALOG_ROLE TO flinkuser CONTAINER=ALL;
GRANT SELECT ANY TRANSACTION TO flinkuser CONTAINER=ALL;
GRANT LOGMINING TO flinkuser CONTAINER=ALL;
GRANT CREATE TABLE TO flinkuser CONTAINER=ALL;
-- no need to execute if scan.incremental.snapshot.enabled=true (default)
GRANT LOCK ANY TABLE TO flinkuser CONTAINER=ALL;
GRANT CREATE SEQUENCE TO flinkuser CONTAINER=ALL;
GRANT EXECUTE ON DBMS_LOGMNR TO flinkuser CONTAINER=ALL;
GRANT EXECUTE ON DBMS_LOGMNR_D TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOG TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOG_HISTORY TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOGMNR_LOGS TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOGMNR_CONTENTS TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOGMNR_PARAMETERS TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$LOGFILE TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$ARCHIVED_LOG TO flinkuser CONTAINER=ALL;
GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO flinkuser CONTAINER=ALL;
exit
```
See more about [Setting up Oracle](https://debezium.io/documentation/reference/1.9/connectors/oracle.html#setting-up-oracle).
How to create an Oracle CDC table
----------------
The Oracle CDC table can be defined as follows:
```sql
-- register an Oracle table 'products' in Flink SQL
Flink SQL> CREATE TABLE products (
ID INT NOT NULL,
NAME STRING,
DESCRIPTION STRING,
WEIGHT DECIMAL(10, 3),
PRIMARY KEY(ID) NOT ENFORCED
) WITH (
'connector' = 'oracle-cdc',
'hostname' = 'localhost',
'port' = '1521',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database-name' = 'ORCLCDB',
'schema-name' = 'inventory',
'table-name' = 'products');
-- read snapshot and redo logs from products table
Flink SQL> SELECT * FROM products;
```
**Note:**
When working with the CDB + PDB model, you are expected to add an extra option `'debezium.database.pdb.name' = 'xxx'` in the Flink DDL to specify the name of the PDB to connect to.
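A minimal sketch of such a DDL follows; the PDB name `ORCLPDB1` and the connection values are placeholders for illustration only:
```sql
Flink SQL> CREATE TABLE products_in_pdb (
    ID INT NOT NULL,
    NAME STRING,
    PRIMARY KEY (ID) NOT ENFORCED
) WITH (
    'connector' = 'oracle-cdc',
    'hostname' = 'localhost',
    'port' = '1521',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'ORCLCDB',
    'schema-name' = 'inventory',
    'table-name' = 'products',
    -- name of the PDB that actually hosts the captured table
    'debezium.database.pdb.name' = 'ORCLPDB1'
);
```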
**Note:**
While the connector might work with a variety of Oracle versions and editions, only Oracle 9i, 10g, 11g and 12c have been tested.
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'oracle-cdc'</code>.</td>
</tr>
<tr>
<td>hostname</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the Oracle database server. If the url option is not empty, hostname may be left unconfigured; otherwise hostname cannot be empty.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the Oracle database user to use when connecting to the Oracle database server.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the Oracle database server.</td>
</tr>
<tr>
<td>database-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the Oracle server to monitor.</td>
</tr>
<tr>
<td>schema-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Schema name of the Oracle database to monitor.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the Oracle database to monitor.</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">1521</td>
<td>Integer</td>
<td>Integer port number of the Oracle database server.</td>
</tr>
<tr>
<td>url</td>
<td>optional</td>
<td style="word-wrap: break-word;">jdbc:oracle:thin:@{hostname}:{port}:{database-name}</td>
<td>String</td>
<td>JDBC URL of the Oracle database server. If the hostname and port parameters are configured, the URL is concatenated from hostname, port and database-name in SID format by default; otherwise, you need to configure the url parameter.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for Oracle CDC consumer, valid enumerations are "initial"
and "latest-offset".
Please see <a href="#startup-reading-position">Startup Reading Position</a> section for more detailed information.</td>
</tr>
<tr>
<td>scan.incremental.snapshot.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>Incremental snapshot is a new mechanism to read snapshot of a table. Compared to the old snapshot mechanism,
the incremental snapshot has many advantages, including:
(1) source can be parallel during snapshot reading,
(2) source can perform checkpoints in the chunk granularity during snapshot reading,
(3) source doesn't need to acquire ROW SHARE MODE lock before snapshot reading.
</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">8096</td>
<td>Integer</td>
<td>The chunk size (number of rows) of the table snapshot; captured tables are split into multiple chunks when reading the snapshot of the table.</td>
</tr>
<tr>
<td>scan.snapshot.fetch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>The maximum fetch size per poll when reading the table snapshot.</td>
</tr>
<tr>
<td>connect.max-retries</td>
<td>optional</td>
<td style="word-wrap: break-word;">3</td>
<td>Integer</td>
<td>The maximum number of times that the connector should retry to build an Oracle database server connection.</td>
</tr>
<tr>
<td>connection.pool.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>Integer</td>
<td>The connection pool size.</td>
</tr>
<tr>
<td>debezium.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from Oracle server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more about the <a href="https://debezium.io/documentation/reference/1.9/connectors/oracle.html#oracle-connector-properties">Debezium's Oracle Connector properties</a></td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to close idle readers at the end of the snapshot phase. <br>
The Flink version is required to be greater than or equal to 1.14 when 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is set to true.<br>
If the Flink version is greater than or equal to 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true,
so it does not need to be explicitly configured as 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true'.
</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.key-column</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The chunk key of the table snapshot; captured tables are split into multiple chunks by the chunk key when reading the snapshot of the table.
By default, the chunk key is 'ROWID'. This column must be a column of the primary key.</td>
</tr>
</tbody>
</table>
</div>
Limitation
--------
### Can't perform checkpoint during scanning snapshot of tables
During the snapshot scan of database tables, since there is no recoverable position, we can't perform checkpoints. In order not to perform checkpoints, the Oracle CDC source will keep the checkpoint waiting until it times out. The timed-out checkpoint will be recognized as a failed checkpoint, which by default triggers a failover of the Flink job. So if the database table is large, it is recommended to add the following Flink configurations to avoid failover caused by timed-out checkpoints:
```
execution.checkpointing.interval: 10min
execution.checkpointing.tolerable-failed-checkpoints: 100
restart-strategy: fixed-delay
restart-strategy.fixed-delay.attempts: 2147483647
```
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>table_name</td>
<td>STRING NOT NULL</td>
<td>Name of the table that contains the row.</td>
</tr>
<tr>
<td>schema_name</td>
<td>STRING NOT NULL</td>
<td>Name of the schema that contains the row.</td>
</tr>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>If the record is read from snapshot of the table instead of the change stream, the value is always 0.</td>
</tr>
</tbody>
</table>
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
db_name STRING METADATA FROM 'database_name' VIRTUAL,
schema_name STRING METADATA FROM 'schema_name' VIRTUAL,
table_name STRING METADATA FROM 'table_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
ID INT NOT NULL,
NAME STRING,
DESCRIPTION STRING,
WEIGHT DECIMAL(10, 3),
PRIMARY KEY(ID) NOT ENFORCED
) WITH (
'connector' = 'oracle-cdc',
'hostname' = 'localhost',
'port' = '1521',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database-name' = 'ORCLCDB',
'schema-name' = 'inventory',
'table-name' = 'products',
'debezium.log.mining.strategy' = 'online_catalog',
'debezium.log.mining.continuous.mine' = 'true'
);
```
**Note**: The Oracle dialect is case-sensitive. Oracle converts a field name to uppercase if the field name is not quoted, while Flink SQL doesn't convert field names. Thus, for physical columns from the Oracle database, we should use the converted field name in Oracle when defining an `oracle-cdc` table in Flink SQL.
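For example, a minimal sketch under the assumption that the Oracle table was created with unquoted (and therefore uppercased) identifiers; the connection values are placeholders:
```sql
-- Created in Oracle as: CREATE TABLE inventory.products (id NUMBER(9) PRIMARY KEY, name VARCHAR2(255));
-- Unquoted identifiers are stored as ID and NAME, so the Flink DDL must use the uppercase names:
CREATE TABLE products_case_sensitive (
    ID INT NOT NULL,
    NAME STRING,
    PRIMARY KEY (ID) NOT ENFORCED
) WITH (
    'connector' = 'oracle-cdc',
    'hostname' = 'localhost',
    'port' = '1521',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'ORCLCDB',
    'schema-name' = 'inventory',
    'table-name' = 'products'
);
```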
Features
--------
### Exactly-Once Processing
The Oracle CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events with **exactly-once processing**, even if failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/oracle.html#how-the-oracle-connector-works).
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for the Oracle CDC consumer. The valid enumerations are:
- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continues to read the latest redo log.
- `latest-offset`: Never performs a snapshot on the monitored database tables upon first startup, and just reads the changes since the connector was started.
_Note: the mechanism of the `scan.startup.mode` option relies on Debezium's `snapshot.mode` configuration, so please do not use them together. If you specify both the `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, `scan.startup.mode` may not work._
### Single Thread Reading
The Oracle CDC source can't read in parallel, because only one task can receive change events.
### DataStream Source
The Oracle CDC connector can also be a DataStream source. There are two modes for the DataStream source:
- incremental snapshot based, which allows parallel reading
- SourceFunction based, which only supports single thread reading
#### Incremental Snapshot based DataStream (Experimental)
```java
import org.apache.flink.cdc.connectors.base.options.StartupOptions;
import org.apache.flink.cdc.connectors.base.source.jdbc.JdbcIncrementalSource;
import org.apache.flink.cdc.connectors.oracle.source.OracleSourceBuilder;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Properties;
public class OracleParallelSourceExample {
public static void main(String[] args) throws Exception {
Properties debeziumProperties = new Properties();
debeziumProperties.setProperty("log.mining.strategy", "online_catalog");
JdbcIncrementalSource<String> oracleChangeEventSource =
new OracleSourceBuilder()
.hostname("host")
.port(1521)
.databaseList("ORCLCDB")
.schemaList("DEBEZIUM")
.tableList("DEBEZIUM.PRODUCTS")
.username("username")
.password("password")
.deserializer(new JsonDebeziumDeserializationSchema())
.includeSchemaChanges(true) // output the schema changes as well
.startupOptions(StartupOptions.initial())
.debeziumProperties(debeziumProperties)
.splitSize(2)
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000L);
// set the source parallelism to 4
env.fromSource(
oracleChangeEventSource,
WatermarkStrategy.noWatermarks(),
"OracleParallelSource")
.setParallelism(4)
.print()
.setParallelism(1);
env.execute("Print Oracle Snapshot + RedoLog");
}
}
```
#### SourceFunction-based DataStream
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.oracle.OracleSource;
public class OracleSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = OracleSource.<String>builder()
.url("jdbc:oracle:thin:@{hostname}:{port}:{database}")
.port(1521)
.database("ORCLCDB") // monitor XE database
.schemaList("inventory") // monitor inventory schema
.tableList("inventory.products") // monitor products table
.username("flinkuser")
.password("flinkpw")
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env
.addSource(sourceFunction)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute();
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left"><a href="https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Data-Types.html">Oracle type</a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td>NUMBER(p, s <= 0), p - s < 3
</td>
<td>TINYINT</td>
</tr>
<tr>
<td>NUMBER(p, s <= 0), p - s < 5
</td>
<td>SMALLINT</td>
</tr>
<tr>
<td>NUMBER(p, s <= 0), p - s < 10
</td>
<td>INT</td>
</tr>
<tr>
<td>NUMBER(p, s <= 0), p - s < 19
</td>
<td>BIGINT</td>
</tr>
<tr>
<td>NUMBER(p, s <= 0), 19 <= p - s <= 38 <br>
</td>
<td>DECIMAL(p - s, 0)</td>
</tr>
<tr>
<td>NUMBER(p, s > 0)
</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>NUMBER(p, s <= 0), p - s > 38
</td>
<td>STRING</td>
</tr>
<tr>
<td>
FLOAT<br>
BINARY_FLOAT
</td>
<td>FLOAT</td>
</tr>
<tr>
<td>
DOUBLE PRECISION<br>
BINARY_DOUBLE
</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>NUMBER(1)</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>
DATE<br>
TIMESTAMP [(p)]
</td>
<td>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</td>
</tr>
<tr>
<td>TIMESTAMP [(p)] WITH TIME ZONE</td>
<td>TIMESTAMP [(p)] WITH TIME ZONE</td>
</tr>
<tr>
<td>TIMESTAMP [(p)] WITH LOCAL TIME ZONE</td>
<td>TIMESTAMP_LTZ [(p)]</td>
</tr>
<tr>
<td>
CHAR(n)<br>
NCHAR(n)<br>
NVARCHAR2(n)<br>
VARCHAR(n)<br>
VARCHAR2(n)<br>
CLOB<br>
NCLOB<br>
XMLType<br>
SYS.XMLTYPE
</td>
<td>STRING</td>
</tr>
<tr>
<td>BLOB<br>
ROWID
</td>
<td>BYTES</td>
</tr>
<tr>
<td>
INTERVAL DAY TO SECOND<br>
INTERVAL YEAR TO MONTH
</td>
<td>BIGINT</td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,307 @@
---
title: "Overview"
weight: 1
type: docs
aliases:
- /connectors/cdc-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# CDC Connectors for Apache Flink
CDC Connectors for Apache Flink<sup>®</sup> is a set of source connectors for <a href="https://flink.apache.org/">Apache Flink<sup>®</sup></a>, ingesting changes from different databases using change data capture (CDC).
The CDC Connectors for Apache Flink<sup>®</sup> integrate Debezium as the engine to capture data changes. So it can fully leverage the ability of Debezium. See more about what is [Debezium](https://github.com/debezium/debezium).
{{< img src="/fig/cdc-flow.png" width="600px" alt="Flink CDC" >}}
## Supported Connectors
| Connector | Database | Driver |
|-----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|
| [mongodb-cdc](mongodb-cdc.md) | <li> [MongoDB](https://www.mongodb.com): 3.6, 4.x, 5.0 | MongoDB Driver: 4.3.4 |
| [mysql-cdc](mysql-cdc.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 | JDBC Driver: 8.0.28 |
| [oceanbase-cdc](oceanbase-cdc.md) | <li> [OceanBase CE](https://open.oceanbase.com): 3.1.x, 4.x <li> [OceanBase EE](https://www.oceanbase.com/product/oceanbase): 2.x, 3.x, 4.x | OceanBase Driver: 2.4.x |
| [oracle-cdc](oracle-cdc.md) | <li> [Oracle](https://www.oracle.com/index.html): 11, 12, 19, 21 | Oracle Driver: 19.3.0.0 |
| [postgres-cdc](postgres-cdc.md) | <li> [PostgreSQL](https://www.postgresql.org): 9.6, 10, 11, 12, 13, 14 | JDBC Driver: 42.5.1 |
| [sqlserver-cdc](sqlserver-cdc.md) | <li> [Sqlserver](https://www.microsoft.com/sql-server): 2012, 2014, 2016, 2017, 2019 | JDBC Driver: 9.4.1.jre8 |
| [tidb-cdc](tidb-cdc.md) | <li> [TiDB](https://www.pingcap.com/): 5.1.x, 5.2.x, 5.3.x, 5.4.x, 6.0.0 | JDBC Driver: 8.0.27 |
| [db2-cdc](db2-cdc.md) | <li> [Db2](https://www.ibm.com/products/db2): 11.5 | Db2 Driver: 11.5.0.0 |
| [vitess-cdc](vitess-cdc.md) | <li> [Vitess](https://vitess.io/): 8.0.x, 9.0.x | MySql JDBC Driver: 8.0.26 |
## Supported Flink Versions
The following table shows the version mapping between Flink<sup>®</sup> CDC Connectors and Flink<sup>®</sup>:
| Flink<sup>®</sup> CDC Version | Flink<sup>®</sup> Version |
|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| <font color="DarkCyan">1.0.0</font> | <font color="MediumVioletRed">1.11.*</font> |
| <font color="DarkCyan">1.1.0</font> | <font color="MediumVioletRed">1.11.*</font> |
| <font color="DarkCyan">1.2.0</font> | <font color="MediumVioletRed">1.12.*</font> |
| <font color="DarkCyan">1.3.0</font> | <font color="MediumVioletRed">1.12.*</font> |
| <font color="DarkCyan">1.4.0</font> | <font color="MediumVioletRed">1.13.*</font> |
| <font color="DarkCyan">2.0.*</font> | <font color="MediumVioletRed">1.13.*</font> |
| <font color="DarkCyan">2.1.*</font> | <font color="MediumVioletRed">1.13.*</font> |
| <font color="DarkCyan">2.2.*</font> | <font color="MediumVioletRed">1.13.\*</font>, <font color="MediumVioletRed">1.14.\*</font> |
| <font color="DarkCyan">2.3.*</font> | <font color="MediumVioletRed">1.13.\*</font>, <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font> |
| <font color="DarkCyan">2.4.*</font> | <font color="MediumVioletRed">1.13.\*</font>, <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font>, <font color="MediumVioletRed">1.17.\*</font> |
| <font color="DarkCyan">3.0.*</font> | <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font>, <font color="MediumVioletRed">1.17.\*</font>, <font color="MediumVioletRed">1.18.\*</font> |
## Features
1. Supports reading database snapshot and continues to read binlogs with **exactly-once processing** even failures happen.
2. CDC connectors for DataStream API, users can consume changes on multiple databases and tables in a single job without Debezium and Kafka deployed.
3. CDC connectors for Table/SQL API, users can use SQL DDL to create a CDC source to monitor changes on a single table.
The following table shows the current features of the connector:
| Connector | No-lock Read | Parallel Read | Exactly-once Read | Incremental Snapshot Read |
|-----------------------------------|--------------|---------------|-------------------|---------------------------|
| [mongodb-cdc](mongodb-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [mysql-cdc](mysql-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oracle-cdc](oracle-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [postgres-cdc](postgres-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [sqlserver-cdc](sqlserver-cdc.md) | ✅ | ✅ | ✅ | ✅ |
| [oceanbase-cdc](oceanbase-cdc.md) | ❌ | ❌ | ❌ | ❌ |
| [tidb-cdc](tidb-cdc.md) | ✅ | ❌ | ✅ | ❌ |
| [db2-cdc](db2-cdc.md) | ❌ | ❌ | ✅ | ❌ |
| [vitess-cdc](vitess-cdc.md) | ✅ | ❌ | ✅ | ❌ |
## Usage for Table/SQL API
We need several steps to set up a Flink cluster with the provided connector.
1. Setup a Flink cluster with version 1.12+ and Java 8+ installed.
2. Download the connector SQL jars from the [Downloads](../downloads.md) page (or [build yourself](#building-from-source)).
3. Put the downloaded jars under `FLINK_HOME/lib/`.
4. Restart the Flink cluster.
The example shows how to create a MySQL CDC source in [Flink SQL Client](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/table/sqlclient/) and execute queries on it.
```sql
-- creates a mysql cdc table source
CREATE TABLE mysql_binlog (
id INT NOT NULL,
name STRING,
description STRING,
weight DECIMAL(10,3),
PRIMARY KEY(id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = 'localhost',
'port' = '3306',
'username' = 'flinkuser',
'password' = 'flinkpw',
'database-name' = 'inventory',
'table-name' = 'products'
);
-- read snapshot and binlog data from mysql, and do some transformation, and show on the client
SELECT id, UPPER(name), description, weight FROM mysql_binlog;
```
## Usage for DataStream API
Include the following Maven dependency (available through Maven Central):
```
<dependency>
<groupId>org.apache.flink</groupId>
<!-- add the dependency matching your database -->
<artifactId>flink-connector-mysql-cdc</artifactId>
<!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. -->
<version>3.0-SNAPSHOT</version>
</dependency>
```
```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.mysql.source.MySqlSource;
public class MySqlBinlogSourceExample {
public static void main(String[] args) throws Exception {
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
.hostname("yourHostname")
.port(yourPort)
.databaseList("yourDatabaseName") // set captured database
.tableList("yourDatabaseName.yourTableName") // set captured table
.username("yourUsername")
.password("yourPassword")
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000);
env
.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
// set 4 parallel source tasks
.setParallelism(4)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute("Print MySQL Snapshot + Binlog");
}
}
```
### Deserialization
The following JSON data show the change event in JSON format.
```json
{
"before": {
"id": 111,
"name": "scooter",
"description": "Big 2-wheel scooter",
"weight": 5.18
},
"after": {
"id": 111,
"name": "scooter",
"description": "Big 2-wheel scooter",
"weight": 5.15
},
"source": {...},
"op": "u", // the operation type, "u" means this this is an update event
"ts_ms": 1589362330904, // the time at which the connector processed the event
"transaction": null
}
```
**Note:** Please refer to the [Debezium documentation](https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-events) for the meaning of each field.
In some cases, users can use the `JsonDebeziumDeserializationSchema(true)` constructor to enable including the schema in the message. Then the Debezium JSON message may look like this:
```json
{
"schema": {
"type": "struct",
"fields": [
{
"type": "struct",
"fields": [
{
"type": "int32",
"optional": false,
"field": "id"
},
{
"type": "string",
"optional": false,
"default": "flink",
"field": "name"
},
{
"type": "string",
"optional": true,
"field": "description"
},
{
"type": "double",
"optional": true,
"field": "weight"
}
],
"optional": true,
"name": "mysql_binlog_source.inventory_1pzxhca.products.Value",
"field": "before"
},
{
"type": "struct",
"fields": [
{
"type": "int32",
"optional": false,
"field": "id"
},
{
"type": "string",
"optional": false,
"default": "flink",
"field": "name"
},
{
"type": "string",
"optional": true,
"field": "description"
},
{
"type": "double",
"optional": true,
"field": "weight"
}
],
"optional": true,
"name": "mysql_binlog_source.inventory_1pzxhca.products.Value",
"field": "after"
},
{
"type": "struct",
"fields": {...},
"optional": false,
"name": "io.debezium.connector.mysql.Source",
"field": "source"
},
{
"type": "string",
"optional": false,
"field": "op"
},
{
"type": "int64",
"optional": true,
"field": "ts_ms"
}
],
"optional": false,
"name": "mysql_binlog_source.inventory_1pzxhca.products.Envelope"
},
"payload": {
"before": {
"id": 111,
"name": "scooter",
"description": "Big 2-wheel scooter",
"weight": 5.18
},
"after": {
"id": 111,
"name": "scooter",
"description": "Big 2-wheel scooter",
"weight": 5.15
},
"source": {...},
"op": "u", // the operation type, "u" means this this is an update event
"ts_ms": 1589362330904, // the time at which the connector processed the event
"transaction": null
}
}
```
Usually, it is recommended to exclude the schema because the schema fields make the messages very verbose, which reduces parsing performance.
The `JsonDebeziumDeserializationSchema` can also accept a custom configuration of the `JsonConverter`. For example, if you want to obtain numeric output for decimal data,
you can construct a `JsonDebeziumDeserializationSchema` as follows:
```java
Map<String, Object> customConverterConfigs = new HashMap<>();
customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric");
JsonDebeziumDeserializationSchema schema =
new JsonDebeziumDeserializationSchema(true, customConverterConfigs);
```
{{< top >}}

@ -0,0 +1,620 @@
---
title: "Postgres CDC Connector"
weight: 6
type: docs
aliases:
- /connectors/cdc-connectors/postgres-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Postgres CDC Connector
The Postgres CDC connector allows for reading snapshot data and incremental data from PostgreSQL databases. This document describes how to set up the Postgres CDC connector to run SQL queries against PostgreSQL databases.
Dependencies
------------
In order to set up the Postgres CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-postgres-cdc >}}
### SQL Client JAR
**Download link is available only for stable releases.**
Download flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-postgres-cdc-XXX-SNAPSHOT versions correspond to the code of the development branch, and users need to download the source code and compile the corresponding jar themselves. Users are recommended to use released versions instead, such as [flink-sql-connector-postgres-cdc-2.3.0.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-postgres-cdc), which are available in the Maven central repository.
How to create a Postgres CDC table
----------------
The Postgres CDC table can be defined as follows:
```sql
-- register a PostgreSQL table 'shipments' in Flink SQL
CREATE TABLE shipments (
shipment_id INT,
order_id INT,
origin STRING,
destination STRING,
is_arrived BOOLEAN
) WITH (
'connector' = 'postgres-cdc',
'hostname' = 'localhost',
'port' = '5432',
'username' = 'postgres',
'password' = 'postgres',
'database-name' = 'postgres',
'schema-name' = 'public',
'table-name' = 'shipments',
'slot.name' = 'flink',
-- experimental feature: incremental snapshot (default off)
'scan.incremental.snapshot.enabled' = 'true'
);
-- read snapshot and binlogs from shipments table
SELECT * FROM shipments;
```
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'postgres-cdc'</code>.</td>
</tr>
<tr>
<td>hostname</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the PostgreSQL database server.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the PostgreSQL user to use when connecting to the PostgreSQL database server.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the PostgreSQL database server.</td>
</tr>
<tr>
<td>database-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the PostgreSQL server to monitor.</td>
</tr>
<tr>
<td>schema-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Schema name of the PostgreSQL database to monitor.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the PostgreSQL database to monitor.</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">5432</td>
<td>Integer</td>
<td>Integer port number of the PostgreSQL database server.</td>
</tr>
<tr>
<td>slot.name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in
for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring.
<br/>Slot names must conform to <a href="https://www.postgresql.org/docs/current/static/warm-standby.html#STREAMING-REPLICATION-SLOTS-MANIPULATION">PostgreSQL replication slot naming rules</a>, which state: "Each replication slot has a name, which can contain lower-case letters, numbers, and the underscore character."</td>
</tr>
<tr>
<td>decoding.plugin.name</td>
<td>optional</td>
<td style="word-wrap: break-word;">decoderbufs</td>
<td>String</td>
<td>The name of the Postgres logical decoding plug-in installed on the server.
Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming, wal2json_rds_streaming and pgoutput.</td>
</tr>
<tr>
<td>changelog-mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">all</td>
<td>String</td>
<td>The changelog mode used for encoding streaming changes. Supported values are <code>all</code> (which encodes changes as retract stream using all RowKinds) and <code>upsert</code> (which encodes changes as upsert stream that describes idempotent updates on a key).
<br/> <code>upsert</code> mode can be used for tables with primary keys when replica identity <code>FULL</code> is not an option. Primary keys must be set to use <code>upsert</code> mode.</td>
</tr>
<tr>
<td>heartbeat.interval.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">30s</td>
<td>Duration</td>
<td>The interval of sending heartbeat events for tracking the latest available replication slot offsets.</td>
</tr>
<tr>
<td>debezium.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from Postgres server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more about the <a href="https://debezium.io/documentation/reference/1.9/connectors/postgresql.html#postgresql-connector-properties">Debezium's Postgres Connector properties</a></td>
</tr>
<tr>
<td>debezium.snapshot.select.statement.overrides</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>If there is a large amount of data in the table and you don't need all of the historical data, you can specify this underlying Debezium configuration to select the data range you want to snapshot. This parameter only affects snapshots and does not affect subsequent reading of changes.
<br/> Note: PostgreSQL must use schema name and table name.
<br/> For example: <code>'debezium.snapshot.select.statement.overrides' = 'schema.table'</code>.
<br/> After specifying the above property, you must also add the following property:
<code> debezium.snapshot.select.statement.overrides.[schema].[table] </code>
</td>
</tr>
<tr>
<td>debezium.snapshot.select.statement.overrides.[schema].[table]</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>You can specify SQL statements to limit the data range of the snapshot.
<br/> Note 1: The schema and table need to be specified in the SQL statement, and the SQL should conform to the syntax of the data source.
<br/> For example: <code>'debezium.snapshot.select.statement.overrides.schema.table' = 'select * from schema.table where 1 != 1'</code>.
<br/> Note 2: Tasks submitted through the Flink SQL client do not support SQL statements that contain single quotation marks.
<br/> For example: <code>'debezium.snapshot.select.statement.overrides.schema.table' = 'select * from schema.table where to_char(rq, 'yyyy-MM-dd')'</code>.
</td>
</tr>
<tr>
<td>scan.incremental.snapshot.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Incremental snapshot is a new mechanism to read the snapshot of a table. Compared to the old snapshot mechanism,
the incremental snapshot has many advantages, including:
(1) the source can be parallel during snapshot reading,
(2) the source can perform checkpoints in the chunk granularity during snapshot reading,
(3) the source doesn't need to acquire a global read lock (FLUSH TABLES WITH READ LOCK) before snapshot reading.
Please see the <a href="#incremental-snapshot-reading">Incremental Snapshot Reading</a> section for more detailed information.
</td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to close idle readers at the end of the snapshot phase. <br>
This feature requires Flink 1.14 or later, with 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' set to 'true'.<br>
Since Flink 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is already 'true',
so it does not need to be configured explicitly.
</td>
</tr>
</tbody>
</table>
</div>
Note: it is recommended to set a different `slot.name` for each table to avoid the potential `PSQLException: ERROR: replication slot "flink" is active for PID 974` error. See more [here](https://debezium.io/documentation/reference/1.9/connectors/postgresql.html#postgresql-property-slot-name).
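As an illustration of the note above and of the two-step `debezium.snapshot.select.statement.overrides` configuration described in the options table, here is a minimal, hedged DDL sketch; the table, column, and slot names are illustrative placeholders, not from this document:
```sql
-- Sketch only: the table gets its own replication slot, and the snapshot for
-- public.shipments is limited by an override SELECT statement (no single quotes
-- inside the statement, per Note 2 above).
CREATE TABLE shipments_snapshot_limited (
  shipment_id INT,
  order_id INT,
  origin STRING,
  destination STRING,
  is_arrived BOOLEAN
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'postgres',
  'password' = 'postgres',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'shipments',
  'slot.name' = 'flink_shipments',  -- a slot name unique to this table
  'debezium.snapshot.select.statement.overrides' = 'public.shipments',
  'debezium.snapshot.select.statement.overrides.public.shipments' = 'select * from public.shipments where is_arrived = false'
);
```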
### Incremental Snapshot Options
The following options are available only when `scan.incremental.snapshot.enabled=true`:
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>scan.incremental.snapshot.chunk.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">8096</td>
<td>Integer</td>
<td>The chunk size (number of rows) of the table snapshot; captured tables are split into multiple chunks when reading the snapshot of the table.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for Postgres CDC consumer, valid enumerations are "initial"
and "latest-offset".
Please see <a href="#startup-reading-position">Startup Reading Position</a> section for more detailed information.</td>
</tr>
<tr>
<td>chunk-meta.group.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000</td>
<td>Integer</td>
<td>The group size of chunk meta, if the meta size exceeds the group size, the meta will be divided into multiple groups.</td>
</tr>
<tr>
<td>connect.timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">30s</td>
<td>Duration</td>
<td>The maximum time that the connector should wait after trying to connect to the PostgreSQL database server before timing out.</td>
</tr>
<tr>
<td>connect.pool.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">30</td>
<td>Integer</td>
<td>The connection pool size.</td>
</tr>
<tr>
<td>connect.max-retries</td>
<td>optional</td>
<td style="word-wrap: break-word;">3</td>
<td>Integer</td>
<td>The maximum number of retries the connector should attempt when building a database server connection.</td>
</tr>
<tr>
<td>scan.snapshot.fetch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>The maximum fetch size per poll when reading the table snapshot.</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.key-column</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The chunk key column of the table snapshot; captured tables are split into multiple chunks by the chunk key when reading the snapshot of the table.
By default, the chunk key is the first column of the primary key. This column must be a column of the primary key.</td>
</tr>
<tr>
<td>chunk-key.even-distribution.factor.lower-bound</td>
<td>optional</td>
<td style="word-wrap: break-word;">0.05d</td>
<td>Double</td>
<td>The lower bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not.
The table chunks would use the even-distribution calculation optimization when the data distribution is even, and a splitting query would be used when it is uneven.
The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.</td>
</tr>
<tr>
<td>chunk-key.even-distribution.factor.upper-bound</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000.0d</td>
<td>Double</td>
<td>The upper bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not.
The table chunks would use the even-distribution calculation optimization when the data distribution is even, and a splitting query would be used when it is uneven.
The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.</td>
</tr>
</tbody>
</table>
</div>
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>table_name</td>
<td>STRING NOT NULL</td>
<td>Name of the table that contains the row.</td>
</tr>
<tr>
<td>schema_name</td>
<td>STRING NOT NULL</td>
<td>Name of the schema that contains the row.</td>
</tr>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>If the record is read from snapshot of the table instead of the change stream, the value is always 0.</td>
</tr>
</tbody>
</table>
Limitation
--------
### Can't perform checkpoint during scanning snapshot of tables when incremental snapshot is disabled
When `scan.incremental.snapshot.enabled=false`, we have the following limitation.
While scanning the snapshot of database tables, there is no recoverable position, so checkpoints cannot be performed. In order not to perform checkpoints, the Postgres CDC source keeps the checkpoint waiting until it times out. A timed-out checkpoint is recognized as a failed checkpoint, which by default triggers a failover for the Flink job. So if the database table is large, it is recommended to add the following Flink configurations to avoid failovers caused by timed-out checkpoints:
```
execution.checkpointing.interval: 10min
execution.checkpointing.tolerable-failed-checkpoints: 100
restart-strategy: fixed-delay
restart-strategy.fixed-delay.attempts: 2147483647
```
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
db_name STRING METADATA FROM 'database_name' VIRTUAL,
table_name STRING METADATA FROM 'table_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
shipment_id INT,
order_id INT,
origin STRING,
destination STRING,
is_arrived BOOLEAN
) WITH (
'connector' = 'postgres-cdc',
'hostname' = 'localhost',
'port' = '5432',
'username' = 'postgres',
'password' = 'postgres',
'database-name' = 'postgres',
'schema-name' = 'public',
'table-name' = 'shipments',
'slot.name' = 'flink'
);
```
Features
--------
### Incremental Snapshot Reading (Experimental)
Incremental snapshot reading is a new mechanism to read the snapshot of a table. Compared to the old snapshot mechanism, the incremental snapshot has many advantages, including:
* (1) PostgreSQL CDC Source can be parallel during snapshot reading
* (2) PostgreSQL CDC Source can perform checkpoints in the chunk granularity during snapshot reading
* (3) PostgreSQL CDC Source doesn't need to acquire global read lock before snapshot reading
During incremental snapshot reading, the PostgreSQL CDC Source first splits the snapshot into chunks (splits) by the primary key of the table,
and then assigns the chunks to multiple readers to read the data of each snapshot chunk.
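A minimal, hedged DDL sketch that turns incremental snapshot reading on and tunes the chunk size; the option names come from the tables above, while the connection values and table definition are illustrative placeholders:
```sql
-- Sketch only: enable parallel incremental snapshot reading for a Postgres table.
CREATE TABLE shipments_incremental (
  shipment_id INT,
  order_id INT,
  is_arrived BOOLEAN,
  PRIMARY KEY (shipment_id) NOT ENFORCED   -- chunks are split by the primary key
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'postgres',
  'password' = 'postgres',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'shipments',
  'slot.name' = 'flink_incremental',
  'scan.incremental.snapshot.enabled' = 'true',    -- switch to the incremental snapshot mechanism
  'scan.incremental.snapshot.chunk.size' = '8096', -- rows per snapshot chunk (default shown)
  'scan.startup.mode' = 'initial'                  -- snapshot first, then stream changes
);
```
With incremental snapshot enabled, the job's source parallelism can be raised, since snapshot chunks are distributed across readers.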
### Exactly-Once Processing
The Postgres CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events from the WAL with **exactly-once processing** even when failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/postgresql.html#how-the-postgresql-connector-works).
### DataStream Source
The Postgres CDC connector can also be a DataStream source. There are two modes for the DataStream source:
- incremental snapshot based, which allows parallel reading
- SourceFunction based, which only supports single thread reading
#### Incremental Snapshot based DataStream (Experimental)
```java
import org.apache.flink.cdc.connectors.base.source.jdbc.JdbcIncrementalSource;
import org.apache.flink.cdc.connectors.postgres.source.PostgresSourceBuilder;
import org.apache.flink.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class PostgresParallelSourceExample {
public static void main(String[] args) throws Exception {
DebeziumDeserializationSchema<String> deserializer =
new JsonDebeziumDeserializationSchema();
JdbcIncrementalSource<String> postgresIncrementalSource =
PostgresSourceBuilder.PostgresIncrementalSource.<String>builder()
.hostname("localhost")
.port(5432)
.database("postgres")
.schemaList("inventory")
.tableList("inventory.products")
.username("postgres")
.password("postgres")
.slotName("flink")
.decodingPluginName("decoderbufs") // use pgoutput for PostgreSQL 10+
.deserializer(deserializer)
.includeSchemaChanges(true) // output the schema changes as well
.splitSize(2) // the split size of each snapshot split
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(3000);
env.fromSource(
postgresIncrementalSource,
WatermarkStrategy.noWatermarks(),
"PostgresParallelSource")
.setParallelism(2)
.print();
env.execute("Output Postgres Snapshot");
}
}
```
#### SourceFunction-based DataStream
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.postgres.PostgreSQLSource;
public class PostgreSQLSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = PostgreSQLSource.<String>builder()
.hostname("localhost")
.port(5432)
.database("postgres") // monitor postgres database
.schemaList("inventory") // monitor inventory schema
.tableList("inventory.products") // monitor products table
.username("flinkuser")
.password("flinkpw")
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env
.addSource(sourceFunction)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute();
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">PostgreSQL type<a href="https://www.postgresql.org/docs/12/datatype.html"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td></td>
<td>TINYINT</td>
</tr>
<tr>
<td>
SMALLINT<br>
INT2<br>
SMALLSERIAL<br>
SERIAL2</td>
<td>SMALLINT</td>
</tr>
<tr>
<td>
INTEGER<br>
SERIAL</td>
<td>INT</td>
</tr>
<tr>
<td>
BIGINT<br>
BIGSERIAL</td>
<td>BIGINT</td>
</tr>
<tr>
<td></td>
<td>DECIMAL(20, 0)</td>
</tr>
<tr>
<td>BIGINT</td>
<td>BIGINT</td>
</tr>
<tr>
<td>
REAL<br>
FLOAT4</td>
<td>FLOAT</td>
</tr>
<tr>
<td>
FLOAT8<br>
DOUBLE PRECISION</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>BOOLEAN</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>DATE</td>
<td>DATE</td>
</tr>
<tr>
<td>TIME [(p)] [WITHOUT TIMEZONE]</td>
<td>TIME [(p)] [WITHOUT TIMEZONE]</td>
</tr>
<tr>
<td>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</td>
<td>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</td>
</tr>
<tr>
<td>
CHAR(n)<br>
CHARACTER(n)<br>
VARCHAR(n)<br>
CHARACTER VARYING(n)<br>
TEXT</td>
<td>STRING</td>
</tr>
<tr>
<td>BYTEA</td>
<td>BYTES</td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,507 @@
---
title: "SQLServer CDC Connector"
weight: 7
type: docs
aliases:
- /connectors/cdc-connectors/sqlserver-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# SQLServer CDC Connector
The SQLServer CDC connector allows for reading snapshot data and incremental data from the SQLServer database. This document describes how to set up the SQLServer CDC connector to run SQL queries against SQLServer databases.
Dependencies
------------
In order to set up the SQLServer CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-sqlserver-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-sqlserver-cdc-XXX-SNAPSHOT versions correspond to the code of the development branch, and users need to download the source code and compile the corresponding jar themselves. Users should instead use released versions, such as [flink-sql-connector-sqlserver-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-sqlserver-cdc), which are available in the Maven central repository.
Setup SQLServer Database
----------------
A SQL Server administrator must enable change data capture on the source tables that you want to capture. The database must already be enabled for CDC. To enable CDC on a table, a SQL Server administrator runs the stored procedure ```sys.sp_cdc_enable_table``` for the table.
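If the database has not been enabled for CDC yet, an administrator can enable it first before enabling individual tables; a minimal sketch, assuming the database is named `MyDB`:
```sql
-- Sketch only: enable CDC at the database level (requires sysadmin membership).
USE MyDB
GO
EXEC sys.sp_cdc_enable_db
GO
```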
**Prerequisites:**
* CDC is enabled on the SQL Server database.
* The SQL Server Agent is running.
* You are a member of the db_owner fixed database role for the database.
**Procedure:**
* Connect to the SQL Server database with a database management tool such as SQL Server Management Studio.
* Run the following SQL statement to enable CDC on the table.
```sql
USE MyDB
GO
EXEC sys.sp_cdc_enable_table
@source_schema = N'dbo', -- Specifies the schema of the source table.
@source_name = N'MyTable', -- Specifies the name of the table that you want to capture.
@role_name = N'MyRole', -- Specifies a role MyRole to which you can add users to whom you want to grant SELECT permission on the captured columns of the source table. Users in the sysadmin or db_owner role also have access to the specified change tables. Set the value of @role_name to NULL, to allow only members in the sysadmin or db_owner to have full access to captured information.
@filegroup_name = N'MyDB_CT',-- Specifies the filegroup where SQL Server places the change table for the captured table. The named filegroup must already exist. It is best not to locate change tables in the same filegroup that you use for source tables.
@supports_net_changes = 0
GO
```
* Verifying that the user has access to the CDC table
```sql
--The following example runs the stored procedure sys.sp_cdc_help_change_data_capture on the database MyDB:
USE MyDB;
GO
EXEC sys.sp_cdc_help_change_data_capture
GO
```
The query returns configuration information for each table in the database that is enabled for CDC and that contains change data that the caller is authorized to access. If the result is empty, verify that the user has privileges to access both the capture instance and the CDC tables.
How to create a SQLServer CDC table
----------------
The SQLServer CDC table can be defined as follows:
```sql
-- register a SqlServer table 'orders' in Flink SQL
CREATE TABLE orders (
id INT,
order_date DATE,
purchaser INT,
quantity INT,
product_id INT,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'sqlserver-cdc',
'hostname' = 'localhost',
'port' = '1433',
'username' = 'sa',
'password' = 'Password!',
'database-name' = 'inventory',
'table-name' = 'dbo.orders'
);
-- read snapshot and change events from the orders table
SELECT * FROM orders;
```
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'sqlserver-cdc'</code>.</td>
</tr>
<tr>
<td>hostname</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the SQLServer database.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Username to use when connecting to the SQLServer database.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the SQLServer database.</td>
</tr>
<tr>
<td>database-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the SQLServer database to monitor.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the SQLServer database to monitor, e.g.: "db1.table1"</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">1433</td>
<td>Integer</td>
<td>Integer port number of the SQLServer database.</td>
</tr>
<tr>
<td>server-time-zone</td>
<td>optional</td>
<td style="word-wrap: break-word;">UTC</td>
<td>String</td>
<td>The session time zone in database server, e.g. "Asia/Shanghai".</td>
</tr>
<tr>
<td>scan.incremental.snapshot.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>Whether to enable parallel snapshot reading.</td>
</tr>
<tr>
<td>chunk-meta.group.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000</td>
<td>Integer</td>
<td>The group size of chunk meta, if the meta size exceeds the group size, the meta will be divided into multiple groups.</td>
</tr>
<tr>
<td>chunk-key.even-distribution.factor.lower-bound</td>
<td>optional</td>
<td style="word-wrap: break-word;">0.05d</td>
<td>Double</td>
<td>The lower bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not.
The table chunks would use the even-distribution calculation optimization when the data distribution is even, and a splitting query would be used when it is uneven.
The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.</td>
</tr>
<tr>
<td>chunk-key.even-distribution.factor.upper-bound</td>
<td>optional</td>
<td style="word-wrap: break-word;">1000.0d</td>
<td>Double</td>
<td>The upper bound of the chunk key distribution factor. The distribution factor is used to determine whether the table is evenly distributed or not.
The table chunks would use the even-distribution calculation optimization when the data distribution is even, and a splitting query would be used when it is uneven.
The distribution factor could be calculated by (MAX(id) - MIN(id) + 1) / rowCount.</td>
</tr>
<tr>
<td>debezium.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass-through Debezium's properties to the Debezium Embedded Engine which is used to capture data changes from the SQLServer database.
For example: <code>'debezium.snapshot.mode' = 'initial_only'</code>.
See more about <a href="https://debezium.io/documentation/reference/1.9/connectors/sqlserver.html#sqlserver-required-connector-configuration-properties">Debezium's SQLServer Connector properties</a></td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to close idle readers at the end of the snapshot phase. <br>
This feature requires Flink 1.14 or later, with 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' set to 'true'.<br>
Since Flink 1.15, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' is already 'true',
so it does not need to be configured explicitly.
</td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.key-column</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>The chunk key column of the table snapshot; captured tables are split into multiple chunks by the chunk key when reading the snapshot of the table.
By default, the chunk key is the first column of the primary key. This column must be a column of the primary key.</td>
</tr>
</tbody>
</table>
</div>
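As a hedged illustration of the `scan.incremental.close-idle-reader.enabled` option above and the Flink configuration it depends on, here is a minimal sketch; the table definition and connection values are illustrative placeholders:
```sql
-- Sketch only: allow checkpoints after some tasks finish, then close idle readers
-- once the snapshot phase is done.
SET 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = 'true';

CREATE TABLE orders_idle_close (
  id INT,
  order_date DATE,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'sqlserver-cdc',
  'hostname' = 'localhost',
  'port' = '1433',
  'username' = 'sa',
  'password' = 'Password!',
  'database-name' = 'inventory',
  'table-name' = 'dbo.orders',
  'scan.incremental.snapshot.enabled' = 'true',
  'scan.incremental.close-idle-reader.enabled' = 'true'
);
```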
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>table_name</td>
<td>STRING NOT NULL</td>
<td>Name of the table that contains the row.</td>
</tr>
<tr>
<td>schema_name</td>
<td>STRING NOT NULL</td>
<td>Name of the schema that contains the row.</td>
</tr>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>If the record is read from snapshot of the table instead of the change stream, the value is always 0.</td>
</tr>
</tbody>
</table>
Limitation
--------
### Can't perform checkpoint during scanning snapshot of tables
While scanning the snapshot of database tables, there is no recoverable position, so checkpoints cannot be performed. In order not to perform checkpoints, the SqlServer CDC source keeps the checkpoint waiting until it times out. A timed-out checkpoint is recognized as a failed checkpoint, which by default triggers a failover for the Flink job. So if the database table is large, it is recommended to add the following Flink configurations to avoid failovers caused by timed-out checkpoints:
```
execution.checkpointing.interval: 10min
execution.checkpointing.tolerable-failed-checkpoints: 100
restart-strategy: fixed-delay
restart-strategy.fixed-delay.attempts: 2147483647
```
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
table_name STRING METADATA FROM 'table_name' VIRTUAL,
schema_name STRING METADATA FROM 'schema_name' VIRTUAL,
db_name STRING METADATA FROM 'database_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
id INT NOT NULL,
name STRING,
description STRING,
weight DECIMAL(10,3)
) WITH (
'connector' = 'sqlserver-cdc',
'hostname' = 'localhost',
'port' = '1433',
'username' = 'sa',
'password' = 'Password!',
'database-name' = 'inventory',
'table-name' = 'dbo.products'
);
```
Features
--------
### Exactly-Once Processing
The SQLServer CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events with **exactly-once processing** even when failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.9/connectors/sqlserver.html#how-the-sqlserver-connector-works).
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for SQLServer CDC consumer. The valid enumerations are:
- `initial` (default): Takes a snapshot of structure and data of captured tables; useful if topics should be populated with a complete representation of the data from the captured tables.
- `initial-only`: Takes a snapshot of structure and data like initial but instead does not transition into streaming changes once the snapshot has completed.
- `latest-offset`: Takes a snapshot of the structure of captured tables only; useful if only changes happening from now onwards should be propagated to topics.
_Note: the mechanism of the `scan.startup.mode` option relies on Debezium's `snapshot.mode` configuration, so please do not use them together. If you specify both the `scan.startup.mode` and `debezium.snapshot.mode` options in the table DDL, `scan.startup.mode` may not work._
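A hedged sketch of how the startup mode could be set in the table DDL, using one of the enumerations listed above; the table definition and connection values are illustrative placeholders:
```sql
-- Sketch only: stream changes happening from now on, skipping the data snapshot.
CREATE TABLE orders_latest_only (
  id INT,
  order_date DATE,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'sqlserver-cdc',
  'hostname' = 'localhost',
  'port' = '1433',
  'username' = 'sa',
  'password' = 'Password!',
  'database-name' = 'inventory',
  'table-name' = 'dbo.orders',
  'scan.startup.mode' = 'latest-offset'
);
```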
### Single Thread Reading
The SQLServer CDC source can't read in parallel, because only one task can receive change events.
### DataStream Source
The SQLServer CDC connector can also be a DataStream source. You can create a SourceFunction as follows:
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.sqlserver.SqlServerSource;
public class SqlServerSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = SqlServerSource.<String>builder()
.hostname("localhost")
.port(1433)
.database("sqlserver") // monitor sqlserver database
.tableList("dbo.products") // monitor products table
.username("sa")
.password("Password!")
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env
.addSource(sourceFunction)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute();
}
}
```
The SQLServer CDC incremental connector (available since 2.4.0) can be used as follows:
```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.cdc.connectors.base.options.StartupOptions;
import org.apache.flink.cdc.connectors.sqlserver.source.SqlServerSourceBuilder;
import org.apache.flink.cdc.connectors.sqlserver.source.SqlServerSourceBuilder.SqlServerIncrementalSource;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
public class SqlServerIncrementalSourceExample {
public static void main(String[] args) throws Exception {
SqlServerIncrementalSource<String> sqlServerSource =
new SqlServerSourceBuilder()
.hostname("localhost")
.port(1433)
.databaseList("inventory")
.tableList("dbo.products")
.username("sa")
.password("Password!")
.deserializer(new JsonDebeziumDeserializationSchema())
.startupOptions(StartupOptions.initial())
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000);
// set the source parallelism to 2
env.fromSource(
sqlServerSource,
WatermarkStrategy.noWatermarks(),
"SqlServerIncrementalSource")
.setParallelism(2)
.print()
.setParallelism(1);
env.execute("Print SqlServer Snapshot + Change Stream");
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">SQLServer type<a href="https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td>char(n)</td>
<td>CHAR(n)</td>
</tr>
<tr>
<td>
varchar(n)<br>
nvarchar(n)<br>
nchar(n)
</td>
<td>VARCHAR(n)</td>
</tr>
<tr>
<td>
text<br>
ntext<br>
xml
</td>
<td>STRING</td>
</tr>
<tr>
<td>
decimal(p, s)<br>
money<br>
smallmoney
</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>numeric</td>
<td>NUMERIC</td>
</tr>
<tr>
<td>
float<br>
real
</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>bit</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>int</td>
<td>INT</td>
</tr>
<tr>
<td>tinyint</td>
<td>SMALLINT</td>
</tr>
<tr>
<td>smallint</td>
<td>SMALLINT</td>
</tr>
<tr>
<td>bigint</td>
<td>BIGINT</td>
</tr>
<tr>
<td>date</td>
<td>DATE</td>
</tr>
<tr>
<td>time(n)</td>
<td>TIME(n)</td>
</tr>
<tr>
<td>
datetime2<br>
datetime<br>
smalldatetime
</td>
<td>TIMESTAMP(n)</td>
</tr>
<tr>
<td>datetimeoffset</td>
<td>TIMESTAMP_LTZ(3)</td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,496 @@
---
title: "TiDB CDC Connector"
weight: 8
type: docs
aliases:
- /connectors/cdc-connectors/tidb-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# TiDB CDC Connector
The TiDB CDC connector allows for reading snapshot data and incremental data from the TiDB database. This document describes how to set up the TiDB CDC connector to run SQL queries against TiDB databases.
Dependencies
------------
In order to set up the TiDB CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-tidb-cdc >}}
### SQL Client JAR
```Download link is available only for stable releases.```
Download [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
**Note:** flink-sql-connector-tidb-cdc-XXX-SNAPSHOT versions correspond to the code of the development branch, and users need to download the source code and compile the corresponding jar themselves. Users should instead use released versions, such as [flink-sql-connector-tidb-cdc-2.2.1.jar](https://mvnrepository.com/artifact/org.apache.flink/flink-sql-connector-tidb-cdc), which are available in the Maven central repository.
How to create a TiDB CDC table
----------------
The TiDB CDC table can be defined as follows:
```sql
-- checkpoint every 3000 milliseconds
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
-- register a TiDB table 'orders' in Flink SQL
Flink SQL> CREATE TABLE orders (
order_id INT,
order_date TIMESTAMP(3),
customer_name STRING,
price DECIMAL(10, 5),
product_id INT,
order_status BOOLEAN,
PRIMARY KEY(order_id) NOT ENFORCED
) WITH (
'connector' = 'tidb-cdc',
'tikv.grpc.timeout_in_ms' = '20000',
'pd-addresses' = 'localhost:2379',
'database-name' = 'mydb',
'table-name' = 'orders'
);
-- read snapshot and binlogs from orders table
Flink SQL> SELECT * FROM orders;
```
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 10%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 65%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'tidb-cdc'</code>.</td>
</tr>
<tr>
<td>database-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the TiDB server to monitor.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the TiDB database to monitor.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for TiDB CDC consumer, valid enumerations are "initial" and "latest-offset".</td>
</tr>
<tr>
<td>pd-addresses</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>TiKV cluster's PD address.</td>
</tr>
<tr>
<td>tikv.grpc.timeout_in_ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>TiKV GRPC timeout in ms.</td>
</tr>
<tr>
<td>tikv.grpc.scan_timeout_in_ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>TiKV GRPC scan timeout in ms.</td>
</tr>
<tr>
<td>tikv.batch_get_concurrency</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>Integer</td>
<td>TiKV GRPC batch get concurrency.</td>
</tr>
<tr>
<td>tikv.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass-through TiDB client's properties.</td>
</tr>
</tbody>
</table>
</div>
Available Metadata
----------------
The following format metadata can be exposed as read-only (VIRTUAL) columns in a table definition.
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 15%">Key</th>
<th class="text-left" style="width: 30%">DataType</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>table_name</td>
<td>STRING NOT NULL</td>
<td>Name of the table that contains the row.</td>
</tr>
<tr>
<td>database_name</td>
<td>STRING NOT NULL</td>
<td>Name of the database that contains the row.</td>
</tr>
<tr>
<td>op_ts</td>
<td>TIMESTAMP_LTZ(3) NOT NULL</td>
<td>It indicates the time that the change was made in the database. <br>If the record is read from snapshot of the table instead of the binlog, the value is always 0.</td>
</tr>
</tbody>
</table>
The extended CREATE TABLE example demonstrates the syntax for exposing these metadata fields:
```sql
CREATE TABLE products (
db_name STRING METADATA FROM 'database_name' VIRTUAL,
table_name STRING METADATA FROM 'table_name' VIRTUAL,
operation_ts TIMESTAMP_LTZ(3) METADATA FROM 'op_ts' VIRTUAL,
order_id INT,
order_date TIMESTAMP(0),
customer_name STRING,
price DECIMAL(10, 5),
product_id INT,
order_status BOOLEAN,
PRIMARY KEY(order_id) NOT ENFORCED
) WITH (
'connector' = 'tidb-cdc',
'tikv.grpc.timeout_in_ms' = '20000',
'pd-addresses' = 'localhost:2379',
'database-name' = 'mydb',
'table-name' = 'orders'
);
```
Features
--------
### Exactly-Once Processing
The TiDB CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events with **exactly-once processing** even when failures happen.
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for TiDB CDC consumer. The valid enumerations are:
- `initial` (default): Takes a snapshot of the structure and data of captured tables; useful if you want to fetch a complete representation of the data from the captured tables.
- `latest-offset`: Takes a snapshot of the structure of captured tables only; useful if only changes happening from now onwards should be fetched.
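A hedged DDL sketch for the `latest-offset` mode described above; the table definition and connection values are illustrative placeholders:
```sql
-- Sketch only: stream changes happening from now on, skipping the snapshot phase.
CREATE TABLE orders_latest_only (
  order_id INT,
  order_status BOOLEAN,
  PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
  'connector' = 'tidb-cdc',
  'pd-addresses' = 'localhost:2379',
  'database-name' = 'mydb',
  'table-name' = 'orders',
  'scan.startup.mode' = 'latest-offset'
);
```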
### Multi Thread Reading
The TiDB CDC source can read in parallel, because multiple tasks can receive change events.
### DataStream Source
The TiDB CDC connector can also be a DataStream source. You can create a SourceFunction as follows:
```java
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.cdc.connectors.tidb.TDBSourceOptions;
import org.apache.flink.cdc.connectors.tidb.TiDBSource;
import org.apache.flink.cdc.connectors.tidb.TiKVChangeEventDeserializationSchema;
import org.apache.flink.cdc.connectors.tidb.TiKVSnapshotEventDeserializationSchema;
import org.tikv.kvproto.Cdcpb;
import org.tikv.kvproto.Kvrpcpb;
import java.util.HashMap;
public class TiDBSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> tidbSource =
TiDBSource.<String>builder()
.database("mydb") // set captured database
.tableName("products") // set captured table
.tiConf(
TDBSourceOptions.getTiConfiguration(
"localhost:2399", new HashMap<>()))
.snapshotEventDeserializer(
new TiKVSnapshotEventDeserializationSchema<String>() {
@Override
public void deserialize(
Kvrpcpb.KvPair record, Collector<String> out)
throws Exception {
out.collect(record.toString());
}
@Override
public TypeInformation<String> getProducedType() {
return BasicTypeInfo.STRING_TYPE_INFO;
}
})
.changeEventDeserializer(
new TiKVChangeEventDeserializationSchema<String>() {
@Override
public void deserialize(
Cdcpb.Event.Row record, Collector<String> out)
throws Exception {
out.collect(record.toString());
}
@Override
public TypeInformation<String> getProducedType() {
return BasicTypeInfo.STRING_TYPE_INFO;
}
})
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpoint
env.enableCheckpointing(3000);
env.addSource(tidbSource).print().setParallelism(1);
env.execute("Print TiDB Snapshot + Binlog");
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">TiDB type<a href="https://dev.tidb.com/doc/man/8.0/en/data-types.html"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
<th class="text-left">NOTE</th>
</tr>
</thead>
<tbody>
<tr>
<td>TINYINT</td>
<td>TINYINT</td>
<td></td>
</tr>
<tr>
<td>
SMALLINT<br>
TINYINT UNSIGNED</td>
<td>SMALLINT</td>
<td></td>
</tr>
<tr>
<td>
INT<br>
MEDIUMINT<br>
SMALLINT UNSIGNED</td>
<td>INT</td>
<td></td>
</tr>
<tr>
<td>
BIGINT<br>
INT UNSIGNED</td>
<td>BIGINT</td>
<td></td>
</tr>
<tr>
<td>BIGINT UNSIGNED</td>
<td>DECIMAL(20, 0)</td>
<td></td>
</tr>
<tr>
<td>
FLOAT<br>
</td>
<td>FLOAT</td>
<td></td>
</tr>
<tr>
<td>
REAL<br>
DOUBLE
</td>
<td>DOUBLE</td>
<td></td>
</tr>
<tr>
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)<br>
where p <= 38<br>
</td>
<td>DECIMAL(p, s)</td>
<td></td>
</tr>
<tr>
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)<br>
where 38 < p <= 65<br>
</td>
<td>STRING</td>
<td>The precision for DECIMAL data type is up to 65 in TiDB, but the precision for DECIMAL is limited to 38 in Flink.
So if you define a decimal column whose precision is greater than 38, you should map it to STRING to avoid precision loss.</td>
</tr>
<tr>
<td>
BOOLEAN<br>
TINYINT(1)<br>
BIT(1)
</td>
<td>BOOLEAN</td>
<td></td>
</tr>
<tr>
<td>DATE</td>
<td>DATE</td>
<td></td>
</tr>
<tr>
<td>TIME [(p)]</td>
<td>TIME [(p)]</td>
<td></td>
</tr>
<tr>
<td>TIMESTAMP [(p)]</td>
<td>TIMESTAMP_LTZ [(p)]</td>
<td></td>
</tr>
<tr>
<td>DATETIME [(p)]</td>
<td>TIMESTAMP [(p)]
</td>
<td></td>
</tr>
<tr>
<td>
CHAR(n)
</td>
<td>CHAR(n)</td>
<td></td>
</tr>
<tr>
<td>
VARCHAR(n)
</td>
<td>VARCHAR(n)</td>
<td></td>
</tr>
<tr>
<td>
BIT(n)
</td>
<td>BINARY(⌈n/8⌉)</td>
<td></td>
</tr>
<tr>
<td>
BINARY(n)
</td>
<td>BINARY(n)</td>
<td></td>
</tr>
<tr>
<td>
TINYTEXT<br>
TEXT<br>
MEDIUMTEXT<br>
LONGTEXT<br>
</td>
<td>STRING</td>
<td></td>
</tr>
<tr>
<td>
TINYBLOB<br>
BLOB<br>
MEDIUMBLOB<br>
LONGBLOB<br>
</td>
<td>BYTES</td>
<td>Currently, for BLOB data type in TiDB, only the blob whose length isn't greater than 2,147,483,647(2 ** 31 - 1) is supported. </td>
</tr>
<tr>
<td>
YEAR
</td>
<td>INT</td>
<td></td>
</tr>
<tr>
<td>
ENUM
</td>
<td>STRING</td>
<td></td>
</tr>
<tr>
<td>
JSON
</td>
<td>STRING</td>
<td>The JSON data type will be converted into STRING with JSON format in Flink.</td>
</tr>
<tr>
<td>
SET
</td>
<td>ARRAY&lt;STRING&gt;</td>
<td>As the SET data type in TiDB is a string object that can have zero or more values,
it should always be mapped to an array of strings.
</td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,329 @@
---
title: "Vitess CDC Connector"
weight: 10
type: docs
aliases:
- /connectors/cdc-connectors/vitess-cdc.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Vitess CDC Connector
The Vitess CDC connector allows for reading incremental data from a Vitess cluster. The connector does not support the snapshot feature at the moment. This document describes how to set up the Vitess CDC connector to run SQL queries against Vitess databases.
[Vitess debezium documentation](https://debezium.io/documentation/reference/connectors/vitess.html)
Dependencies
------------
In order to set up the Vitess CDC connector, the following table provides dependency information for both projects using a build automation tool (such as Maven or SBT) and SQL Client with SQL JAR bundles.
### Maven dependency
{{< artifact flink-connector-vitess-cdc >}}
### SQL Client JAR
Download [flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-vitess-cdc/3.0-SNAPSHOT/flink-sql-connector-vitess-cdc-3.0-SNAPSHOT.jar) and put it under `<FLINK_HOME>/lib/`.
Setup Vitess server
----------------
You can follow the Local Install via [Docker guide](https://vitess.io/docs/get-started/local-docker/), or the Vitess Operator for [Kubernetes guide](https://vitess.io/docs/get-started/operator/) to install Vitess. No special setup is needed to support Vitess connector.
### Checklist
* Make sure that the VTGate host and its gRPC port (default is 15991) are accessible from the machine where the Vitess connector is installed
### gRPC authentication
Because the Vitess connector reads change events from the VTGate VStream gRPC server, it does not need to connect directly to MySQL instances.
Therefore, no special database user or permissions are needed. At the moment, the Vitess connector only supports unauthenticated access to the VTGate gRPC server.
How to create a Vitess CDC table
----------------
The Vitess CDC table can be defined as follows:
```sql
-- checkpoint every 3000 milliseconds
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
-- register a Vitess table 'orders' in Flink SQL
Flink SQL> CREATE TABLE orders (
order_id INT,
order_date TIMESTAMP(0),
customer_name STRING,
price DECIMAL(10, 5),
product_id INT,
order_status BOOLEAN,
PRIMARY KEY(order_id) NOT ENFORCED
) WITH (
'connector' = 'vitess-cdc',
'hostname' = 'localhost',
'port' = '3306',
'keyspace' = 'mydb',
'table-name' = 'orders');
-- read snapshot and binlogs from orders table
Flink SQL> SELECT * FROM orders;
```
Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto">
<thead>
<tr>
<th class="text-left">Option</th>
<th class="text-left">Required</th>
<th class="text-left">Default</th>
<th class="text-left">Type</th>
<th class="text-left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>connector</td>
<td>required</td>
<td>(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>&lsquo;vitess-cdc&rsquo;</code>.</td>
</tr>
<tr>
<td>hostname</td>
<td>required</td>
<td>(none)</td>
<td>String</td>
<td>IP address or hostname of the Vitess database server (VTGate).</td>
</tr>
<tr>
<td>keyspace</td>
<td>required</td>
<td>(none)</td>
<td>String</td>
<td>The name of the keyspace from which to stream the changes.</td>
</tr>
<tr>
<td>username</td>
<td>optional</td>
<td>(none)</td>
<td>String</td>
<td>An optional username of the Vitess database server (VTGate). If not configured, unauthenticated VTGate gRPC is used.</td>
</tr>
<tr>
<td>password</td>
<td>optional</td>
<td>(none)</td>
<td>String</td>
<td>An optional password of the Vitess database server (VTGate). If not configured, unauthenticated VTGate gRPC is used.</td>
</tr>
<tr>
<td>shard</td>
<td>optional</td>
<td>(none)</td>
<td>String</td>
<td>An optional name of the shard from which to stream the changes. If not configured, then for an unsharded keyspace the connector streams changes from the only shard, and for a sharded keyspace the connector streams changes from all shards in the keyspace.</td>
</tr>
<tr>
<td>gtid</td>
<td>optional</td>
<td>current</td>
<td>String</td>
<td>An optional GTID position for a shard to stream from.</td>
</tr>
<tr>
<td>stopOnReshard</td>
<td>optional</td>
<td>false</td>
<td>Boolean</td>
<td>Controls Vitess flag stop_on_reshard.</td>
</tr>
<tr>
<td>tombstonesOnDelete</td>
<td>optional</td>
<td>true</td>
<td>Boolean</td>
<td>Controls whether a delete event is followed by a tombstone event.</td>
</tr>
<tr>
<td>schemaNameAdjustmentMode</td>
<td>optional</td>
<td>avro</td>
<td>String</td>
<td>Specifies how schema names should be adjusted for compatibility with the message converter used by the connector.</td>
</tr>
<tr>
<td>table-name</td>
<td>required</td>
<td>(none)</td>
<td>String</td>
<td>Table name of the MySQL database to monitor.</td>
</tr>
<tr>
<td>tablet.type</td>
<td>optional</td>
<td>RDONLY</td>
<td>String</td>
<td>The type of Tablet (hence MySQL instance) from which to stream the changes: MASTER represents streaming from the master MySQL instance, REPLICA represents streaming from the replica (slave) MySQL instance, and RDONLY represents streaming from the read-only replica MySQL instance.</td>
</tr>
</tbody>
</table>
</div>
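A hedged DDL sketch combining some of the optional settings from the table above; the keyspace, shard, host, and table values are illustrative placeholders:
```sql
-- Sketch only: stream a single shard of a keyspace from a replica tablet via VTGate.
CREATE TABLE orders_from_replica (
  order_id INT,
  order_status BOOLEAN,
  PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
  'connector' = 'vitess-cdc',
  'hostname' = 'localhost',
  'port' = '15991',            -- VTGate gRPC port
  'keyspace' = 'mydb',
  'shard' = '0',               -- omit to stream all shards of the keyspace
  'tablet.type' = 'REPLICA',   -- stream from replica instead of the default RDONLY
  'table-name' = 'orders'
);
```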
Features
--------
### Incremental Reading
The Vitess connector spends all its time streaming changes from the VTGate's VStream gRPC service to which it is subscribed. The client receives changes from VStream as they are committed in the underlying MySQL server's binlog at certain positions, which are referred to as VGTIDs.
The VGTID in Vitess is the equivalent of GTID in MySQL; it describes the position in the VStream at which a change event happens. Typically, a VGTID has multiple shard GTIDs, where each shard GTID is a tuple of (Keyspace, Shard, GTID) that describes the GTID position of a given shard.
When subscribing to a VStream service, the connector needs to provide a VGTID and a Tablet Type (e.g. MASTER, REPLICA). The VGTID describes the position from which VStream should start sending change events; the Tablet Type describes from which underlying MySQL instance (master or replica) in each shard change events are read.
The first time the connector connects to a Vitess cluster, it gets and provides the current VGTID to VStream.
The Debezium Vitess connector acts as a gRPC client of VStream. When the connector receives changes it transforms the events into Debezium create, update, or delete events that include the VGTID of the event. The Vitess connector forwards these change events in records to the Kafka Connect framework, which is running in the same process. The Kafka Connect process asynchronously writes the change event records in the same order in which they were generated to the appropriate Kafka topic.
#### Checkpoint
Incremental snapshot reading provides the ability to perform checkpoints at the chunk level. It resolves the checkpoint timeout problem that existed in previous versions with the old snapshot reading mechanism.
### Exactly-Once Processing
The Vitess CDC connector is a Flink Source connector which reads table snapshot chunks first and then continues to read the binlog;
in both the snapshot phase and the binlog phase, the Vitess CDC connector reads with **exactly-once processing** even when failures happen.
### DataStream Source
The Incremental Reading feature of the Vitess CDC Source is currently only exposed in SQL; if you're using the DataStream API, please use the Vitess Source:
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.cdc.connectors.vitess.VitessSource;
public class VitessSourceExample {
public static void main(String[] args) throws Exception {
SourceFunction<String> sourceFunction = VitessSource.<String>builder()
.hostname("localhost")
.port(15991)
.keyspace("inventory")
.username("flinkuser")
.password("flinkpw")
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env
.addSource(sourceFunction)
.print().setParallelism(1); // use parallelism 1 for sink to keep message ordering
env.execute();
}
}
```
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left">MySQL type<a href="https://dev.mysql.com/doc/man/8.0/en/data-types.html"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td>TINYINT</td>
<td>TINYINT</td>
</tr>
<tr>
<td>
SMALLINT<br>
TINYINT UNSIGNED</td>
<td>SMALLINT</td>
</tr>
<tr>
<td>
INT<br>
MEDIUMINT<br>
SMALLINT UNSIGNED</td>
<td>INT</td>
</tr>
<tr>
<td>
BIGINT<br>
INT UNSIGNED</td>
<td>BIGINT</td>
</tr>
<tr>
<td>BIGINT UNSIGNED</td>
<td>DECIMAL(20, 0)</td>
</tr>
<tr>
<td>BIGINT</td>
<td>BIGINT</td>
</tr>
<tr>
<td>FLOAT</td>
<td>FLOAT</td>
</tr>
<tr>
<td>
DOUBLE<br>
DOUBLE PRECISION</td>
<td>DOUBLE</td>
</tr>
<tr>
<td>
NUMERIC(p, s)<br>
DECIMAL(p, s)</td>
<td>DECIMAL(p, s)</td>
</tr>
<tr>
<td>
BOOLEAN<br>
TINYINT(1)</td>
<td>BOOLEAN</td>
</tr>
<tr>
<td>
CHAR(n)<br>
VARCHAR(n)<br>
TEXT</td>
<td>STRING</td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -0,0 +1,23 @@
---
title: Pipeline Connectors
bookCollapseSection: true
weight: 1
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,287 @@
---
title: "Doris Pipeline Connector"
weight: 2
type: docs
aliases:
- /pipelines/doris-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Doris Pipeline Connector
This article introduces the Doris Pipeline Connector.
## Example
----------------
```yaml
source:
type: values
name: ValuesSource
sink:
type: doris
name: Doris Sink
fenodes: 127.0.0.1:8030
username: root
password: ""
table.create.properties.replication_num: 1
pipeline:
parallelism: 1
```
## Pipeline options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width: 10%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 65%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>type</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify the Sink to use, here is <code>'doris'</code>.</td>
</tr>
<tr>
<td>name</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the pipeline.</td>
</tr>
<tr>
<td>fenodes</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Http address of Doris cluster FE, such as 127.0.0.1:8030 </td>
</tr>
<tr>
<td>benodes</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Http address of Doris cluster BE, such as 127.0.0.1:8040 </td>
</tr>
<tr>
<td>jdbc-url</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>JDBC address of Doris cluster, for example: jdbc:mysql://127.0.0.1:9030/db</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Username of Doris cluster</td>
</tr>
<tr>
<td>password</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password for Doris cluster</td>
</tr>
<tr>
<td>auto-redirect</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>String</td>
      <td>Whether to write through FE redirection instead of connecting to BE directly.</td>
</tr>
<tr>
<td>sink.enable.batch-mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td> Whether to use the batch method to write to Doris </td>
</tr>
<tr>
<td>sink.flush.queue-size</td>
<td>optional</td>
<td style="word-wrap: break-word;">2</td>
<td>Integer</td>
<td> Queue size for batch writing
</td>
</tr>
<tr>
<td>sink.buffer-flush.max-rows</td>
<td>optional</td>
<td style="word-wrap: break-word;">50000</td>
<td>Integer</td>
<td>Maximum number of Flush records in a single batch</td>
</tr>
<tr>
<td>sink.buffer-flush.max-bytes</td>
<td>optional</td>
<td style="word-wrap: break-word;">10485760(10MB)</td>
<td>Integer</td>
<td>Maximum number of bytes flushed in a single batch</td>
</tr>
<tr>
<td>sink.buffer-flush.interval</td>
<td>optional</td>
<td style="word-wrap: break-word;">10s</td>
<td>String</td>
<td>Flush interval duration. If this time is exceeded, the data will be flushed asynchronously</td>
</tr>
<tr>
<td>sink.properties.</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
      <td>Parameters of Stream Load.
        For example: <code> sink.properties.strict_mode: true</code>.
        See more about <a href="https://doris.apache.org/zh-CN/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD/">Stream Load properties</a>.
        See the example after this table.</td>
</tr>
<tr>
<td>table.create.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
      <td>Properties used when creating the table.
        For example: <code> table.create.properties.replication_num: 1</code>.
        See more about <a href="https://doris.apache.org/zh-CN/docs/dev/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-TABLE/">Doris Table properties</a>.</td>
</tr>
</tbody>
</table>
</div>
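
To make the prefixed options above concrete, here is a minimal sketch of a Doris sink definition that passes one Stream Load parameter and one table-creation property; the FE address, credentials, and the specific values are illustrative placeholders, not recommendations:

```yaml
sink:
  type: doris
  name: Doris Sink                               # optional sink name
  fenodes: 127.0.0.1:8030                        # placeholder FE HTTP address
  username: root                                 # placeholder credentials
  password: ""
  sink.properties.strict_mode: true              # pass-through Stream Load parameter
  table.create.properties.replication_num: 1     # applied when tables are created automatically
```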
## Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
<tr>
<th class="text-left" style="width:10%;">CDC type</th>
      <th class="text-left" style="width:30%;">Doris type</th>
<th class="text-left" style="width:60%;">NOTE</th>
</tr>
</thead>
<tbody>
<tr>
<td>TINYINT</td>
<td>TINYINT</td>
<td></td>
</tr>
<tr>
<td>SMALLINT</td>
<td>SMALLINT</td>
<td></td>
</tr>
<tr>
<td>INT</td>
<td>INT</td>
<td></td>
</tr>
<tr>
<td>BIGINT</td>
<td>BIGINT</td>
<td></td>
</tr>
<tr>
<td>DECIMAL</td>
<td>DECIMAL</td>
<td></td>
</tr>
<tr>
<td>FLOAT</td>
<td>FLOAT</td>
<td></td>
</tr>
<tr>
<td>DOUBLE</td>
<td>DOUBLE</td>
<td></td>
</tr>
<tr>
<td>BOOLEAN</td>
<td>BOOLEAN</td>
<td></td>
</tr>
<tr>
<td>DATE</td>
<td>DATE</td>
<td></td>
</tr>
<tr>
<td>TIMESTAMP [(p)]</td>
<td>DATETIME [(p)]</td>
<td></td>
</tr>
<tr>
<td>TIMESTAMP_LTZ [(p)]
</td>
<td>DATETIME [(p)]
</td>
<td></td>
</tr>
<tr>
<td>CHAR(n)</td>
<td>CHAR(n*3)</td>
<td>In Doris, strings are stored in UTF-8 encoding, so English characters occupy 1 byte and Chinese characters occupy 3 bytes. The length here is multiplied by 3. The maximum length of CHAR is 255. Once exceeded, it will automatically be converted to VARCHAR type.</td>
</tr>
<tr>
<td>VARCHAR(n)</td>
<td>VARCHAR(n*3)</td>
<td>Same as above. The length here is multiplied by 3. The maximum length of VARCHAR is 65533. Once exceeded, it will automatically be converted to STRING type.</td>
</tr>
<tr>
<td>
BINARY(n)
</td>
<td>STRING</td>
<td></td>
</tr>
<tr>
<td>
VARBINARY(N)
</td>
<td>STRING</td>
<td></td>
</tr>
<tr>
<td>STRING</td>
<td>STRING</td>
<td></td>
</tr>
</tbody>
</table>
</div>
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MySQL Pipeline Connector"
weight: 3
type: docs
aliases:
- /pipelines/mysql-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,16 +24,16 @@ specific language governing permissions and limitations
under the License.
-->
# MySQL CDC Pipeline 连接器
# MySQL CDC Pipeline Connector
MySQL CDC Pipeline 连接器允许从 MySQL 数据库读取快照数据和增量数据,并提供端到端的整库数据同步能力。
本文描述了如何设置 MySQL CDC Pipeline 连接器。
The MySQL CDC Pipeline Connector allows for reading snapshot data and incremental data from MySQL database and provides end-to-end full-database data synchronization capabilities.
This document describes how to setup the MySQL CDC Pipeline connector.
如何创建 Pipeline
How to create Pipeline
----------------
从 MySQL 读取数据同步到 Doris 的 Pipeline 可以定义如下:
The pipeline for reading data from MySQL and sink to Doris can be defined as follows:
```yaml
source:
@ -51,7 +58,7 @@ pipeline:
parallelism: 4
```
Pipeline 连接器选项
Pipeline Connector Options
----------------
<div class="highlight">
@ -71,181 +78,188 @@ Pipeline 连接器选项
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td> MySQL 数据库服务器的 IP 地址或主机名。</td>
<td>IP address or hostname of the MySQL database server.</td>
</tr>
<tr>
<td>port</td>
<td>optional</td>
<td style="word-wrap: break-word;">3306</td>
<td>Integer</td>
<td>MySQL 数据库服务器的整数端口号。</td>
<td>Integer port number of the MySQL database server.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>连接到 MySQL 数据库服务器时要使用的 MySQL 用户的名称。</td>
          <td>Name of the MySQL user to use when connecting to the MySQL database server.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>连接 MySQL 数据库服务器时使用的密码。</td>
<td>Password to use when connecting to the MySQL database server.</td>
</tr>
<tr>
<td>tables</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>需要监视的 MySQL 数据库的表名。表名支持正则表达式,以监视满足正则表达式的多个表。<br>
需要注意的是,点号(.)被视为数据库和表名的分隔符。 如果需要在正则表达式中使用点(.)来匹配任何字符,必须使用反斜杠对点进行转义。<br>
例如db0.\.*, db1.user_table_[0-9]+, db[1-2].[app|web]order_\.*</td>
          <td>Table name of the MySQL database to monitor. The table name also supports regular expressions, so multiple tables matching the expressions can be monitored. <br>
              Note that the dot (.) is treated as the delimiter between database and table names.
              If you need a dot (.) in a regular expression to match any character, you must escape the dot with a backslash.<br>
              e.g. db0.\.*, db1.user_table_[0-9]+, db[1-2].[app|web]order_\.* (see the example after this table)</td>
</tr>
<tr>
<td>schema-change.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>是否发送模式更改事件,下游 sink 可以响应模式变更事件实现表结构同步默认为true。</td>
<td>Whether to send schema change events, so that downstream sinks can respond to schema changes and achieve table structure synchronization.</td>
</tr>
<tr>
<td>server-id</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>读取数据使用的 server idserver id 可以是个整数或者一个整数范围,比如 '5400' 或 '5400-5408',
建议在 'scan.incremental.snapshot.enabled' 参数为启用时,配置成整数范围。因为在当前 MySQL 集群中运行的所有 slave 节点,标记每个 salve 节点的 id 都必须是唯一的。 所以当连接器加入 MySQL 集群作为另一个 slave 节点(并且具有唯一 id 的情况下),它就可以读取 binlog。 默认情况下,连接器会在 5400 和 6400 之间生成一个随机数,但是我们建议用户明确指定 Server id。
</td>
<td>A numeric ID or a numeric ID range of this database client, The numeric ID syntax is like '5400',
the numeric ID range syntax is like '5400-5408', The numeric ID range syntax is recommended when 'scan.incremental.snapshot.enabled' enabled.
Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the MySQL cluster
as another server (with this unique ID) so it can read the binlog. By default, a random number is generated between 5400 and 6400,
though we recommend setting an explicit value. </td>
</tr>
<tr>
<td>scan.incremental.snapshot.chunk.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">8096</td>
<td>Integer</td>
<td>表快照的块大小(行数),读取表的快照时,捕获的表被拆分为多个块。</td>
<td>The chunk size (number of rows) of table snapshot, captured tables are split into multiple chunks when read the snapshot of table.</td>
</tr>
<tr>
<td>scan.snapshot.fetch.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">1024</td>
<td>Integer</td>
<td>读取表快照时每次读取数据的最大条数。</td>
          <td>The maximum fetch size per poll when reading the table snapshot.</td>
</tr>
<tr>
<td>scan.startup.mode</td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td> MySQL CDC 消费者可选的启动模式,
合法的模式为 "initial""earliest-offset""latest-offset""specific-offset" 和 "timestamp"。
请查阅 <a href="#a-name-id-002-a">启动模式</a> 章节了解更多详细信息。</td>
<td>Optional startup mode for MySQL CDC consumer, valid enumerations are "initial", "earliest-offset", "latest-offset", "specific-offset" and "timestamp".
Please see <a href="#startup-reading-position">Startup Reading Position</a> section for more detailed information.</td>
</tr>
<tr>
<td>scan.startup.specific-offset.file</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>在 "specific-offset" 启动模式下,启动位点的 binlog 文件名。</td>
<td>Optional binlog file name used in case of "specific-offset" startup mode</td>
</tr>
<tr>
<td>scan.startup.specific-offset.pos</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>在 "specific-offset" 启动模式下,启动位点的 binlog 文件位置。</td>
<td>Optional binlog file position used in case of "specific-offset" startup mode</td>
</tr>
<tr>
<td>scan.startup.specific-offset.gtid-set</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>在 "specific-offset" 启动模式下,启动位点的 GTID 集合。</td>
<td>Optional GTID set used in case of "specific-offset" startup mode</td>
</tr>
<tr>
<td>scan.startup.specific-offset.skip-events</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>在指定的启动位点后需要跳过的事件数量。</td>
<td>Optional number of events to skip after the specific starting offset</td>
</tr>
<tr>
<td>scan.startup.specific-offset.skip-rows</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Long</td>
<td>在指定的启动位点后需要跳过的数据行数量。</td>
<td>Optional number of rows to skip after the specific starting offset</td>
</tr>
<tr>
<td>connect.timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">30s</td>
<td>Duration</td>
<td>连接器在尝试连接到 MySQL 数据库服务器后超时前应等待的最长时间。</td>
<td>The maximum time that the connector should wait after trying to connect to the MySQL database server before timing out.</td>
</tr>
<tr>
<td>connect.max-retries</td>
<td>optional</td>
<td style="word-wrap: break-word;">3</td>
<td>Integer</td>
<td>连接器应重试以建立 MySQL 数据库服务器连接的最大重试次数。</td>
          <td>The maximum number of retries the connector should attempt when building a connection to the MySQL database server.</td>
</tr>
<tr>
<td>connection.pool.size</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>Integer</td>
<td>连接池大小。</td>
<td>The connection pool size.</td>
</tr>
<tr>
<td>jdbc.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">20</td>
<td>String</td>
<td>传递自定义 JDBC URL 属性的选项。用户可以传递自定义属性,如 'jdbc.properties.useSSL' = 'false'.</td>
<td>Option to pass custom JDBC URL properties. User can pass custom properties like 'jdbc.properties.useSSL' = 'false'.</td>
</tr>
<tr>
<td>heartbeat.interval</td>
<td>optional</td>
<td style="word-wrap: break-word;">30s</td>
<td>Duration</td>
<td>用于跟踪最新可用 binlog 偏移的发送心跳事件的间隔。</td>
<td>The interval of sending heartbeat event for tracing the latest available binlog offsets.</td>
</tr>
<tr>
<td>debezium.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>将 Debezium 的属性传递给 Debezium 嵌入式引擎,该引擎用于从 MySQL 服务器捕获数据更改。
例如: <code>'debezium.snapshot.mode' = 'never'</code>.
查看更多关于 <a href="https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-connector-properties"> Debezium 的 MySQL 连接器属性</a></td>
<td>Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from MySQL server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more about the <a href="https://debezium.io/documentation/reference/1.9/connectors/mysql.html#mysql-connector-properties">Debezium's MySQL Connector properties</a></td>
</tr>
<tr>
<td>scan.incremental.close-idle-reader.enabled</td>
<td>optional</td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>是否在快照结束后关闭空闲的 Reader。 此特性需要 flink 版本大于等于 1.14 并且 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 需要设置为 true。<br>
若 flink 版本大于等于 1.15'execution.checkpointing.checkpoints-after-tasks-finish.enabled' 默认值变更为 true可以不用显式配置 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' = true。</td>
      <td>Whether to close idle readers at the end of the snapshot phase. <br>
        This feature requires Flink 1.14 or later, and 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' must be set to true.<br>
        If the Flink version is 1.15 or later, the default value of 'execution.checkpointing.checkpoints-after-tasks-finish.enabled' has been changed to true,
        so it does not need to be configured explicitly.
      </td>
</tr>
</tbody>
</table>
</div>
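
As a concrete illustration of the `tables` and `server-id` options described above, a MySQL pipeline source might be sketched as follows; the table patterns and the ID range are placeholders chosen for illustration, and connection options are omitted:

```yaml
source:
  type: mysql
  # connection options (hostname, port, username, password) omitted for brevity
  # regular expressions: the dot (.) separates database and table names,
  # so a literal "match any character" dot must be escaped with a backslash
  tables: db0.\.*, db1.user_table_[0-9]+, db[1-2].[app|web]order_\.*
  # a range is recommended when incremental snapshot is enabled:
  # one unique ID per source parallelism (placeholder range)
  server-id: 5400-5404
```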
启动模式
Startup Reading Position
--------
配置选项```scan.startup.mode```指定 MySQL CDC 使用者的启动模式。有效枚举包括:
The config option `scan.startup.mode` specifies the startup mode for MySQL CDC consumer. The valid enumerations are:
- `initial` (默认):在第一次启动时对受监视的数据库表执行初始快照,并继续读取最新的 binlog。
- `earliest-offset`:跳过快照阶段,从可读取的最早 binlog 位点开始读取
- `latest-offset`:首次启动时,从不对受监视的数据库表执行快照, 连接器仅从 binlog 的结尾处开始读取,这意味着连接器只能读取在连接器启动之后的数据更改。
- `specific-offset`:跳过快照阶段,从指定的 binlog 位点开始读取。位点可通过 binlog 文件名和位置指定,或者在 GTID 在集群上启用时通过 GTID 集合指定。
- `timestamp`:跳过快照阶段,从指定的时间戳开始读取 binlog 事件。
- `initial` (default): Performs an initial snapshot on the monitored database tables upon first startup, and continue to read the latest binlog.
- `earliest-offset`: Skip snapshot phase and start reading binlog events from the earliest accessible binlog offset.
- `latest-offset`: Never perform a snapshot on the monitored database tables upon first startup, just read from
  the end of the binlog, which means it only captures the changes made after the connector was started.
- `specific-offset`: Skip snapshot phase and start reading binlog events from a specific offset. The offset could be
specified with binlog filename and position, or a GTID set if GTID is enabled on server.
- `timestamp`: Skip snapshot phase and start reading binlog events from a specific timestamp.
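
For example, a sketch of resuming from a recorded binlog position using the `specific-offset` mode together with the offset options listed in the table above; the binlog file name and position are placeholders:

```yaml
source:
  type: mysql
  # connection and table options omitted for brevity
  scan.startup.mode: specific-offset
  scan.startup.specific-offset.file: mysql-bin.000003   # placeholder binlog file name
  scan.startup.specific-offset.pos: 4                   # placeholder binlog position
```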
数据类型映射
Data Type Mapping
----------------
<div class="wy-table-responsive">
@ -357,7 +371,8 @@ Pipeline 连接器选项
where 38 < p <= 65<br>
</td>
<td>STRING</td>
<td>在 MySQL 中,十进制数据类型的精度高达 65但在 Flink 中,十进制数据类型的精度仅限于 38。所以如果定义精度大于 38 的十进制列,则应将其映射到字符串以避免精度损失。</td>
<td>The precision for DECIMAL data type is up to 65 in MySQL, but the precision for DECIMAL is limited to 38 in Flink.
So if you define a decimal column whose precision is greater than 38, you should map it to STRING to avoid precision loss.</td>
</tr>
<tr>
<td>
@ -445,7 +460,7 @@ Pipeline 连接器选项
LONGBLOB<br>
</td>
<td>BYTES</td>
<td>目前,对于 MySQL 中的 BLOB 数据类型,仅支持长度不大于 21474836472**31-1的 blob。 </td>
<td>Currently, for BLOB data type in MySQL, only the blob whose length isn't greater than 2,147,483,647(2 ** 31 - 1) is supported. </td>
</tr>
<tr>
<td>
@ -459,14 +474,14 @@ Pipeline 连接器选项
JSON
</td>
<td>STRING</td>
<td> JSON 数据类型将在 Flink 中转换为 JSON 格式的字符串。</td>
<td>The JSON data type will be converted into STRING with JSON format in Flink.</td>
</tr>
<tr>
<td>
SET
</td>
<td>-</td>
<td> 暂不支持 </td>
<td>Not supported yet.</td>
</tr>
<tr>
<td>
@ -483,35 +498,34 @@ Pipeline 连接器选项
STRING
</td>
<td>
MySQL 中的空间数据类型将转换为具有固定 Json 格式的字符串。
请参考 MySQL <a href="#a-name-id-003-a">空间数据类型映射</a> 章节了解更多详细信息。
The spatial data types in MySQL will be converted into STRING with a fixed Json format.
Please see <a href="#mysql-spatial-data-types-mapping ">MySQL Spatial Data Types Mapping</a> section for more detailed information.
</td>
</tr>
</tbody>
</table>
</div>
### 空间数据类型映射<a name="空间数据类型映射" id="003"></a>
MySQL中除`GEOMETRYCOLLECTION`之外的空间数据类型都会转换为 Json 字符串,格式固定,如:<br>
### MySQL Spatial Data Types Mapping
The spatial data types in MySQL, except for `GEOMETRYCOLLECTION`, will be converted into a JSON string with a fixed format like:<br>
```json
{"srid": 0 , "type": "xxx", "coordinates": [0, 0]}
```
字段`srid`标识定义几何体的 SRS如果未指定 SRID则 SRID 0 是新几何体值的默认值。
由于 MySQL 8+ 在定义空间数据类型时只支持特定的 SRID因此在版本较低的MySQL中字段`srid`将始终为 0。
The field `srid` identifies the SRS in which the geometry is defined; SRID 0 is the default for new geometry values if no SRID is specified.
Since only MySQL 8+ supports specifying an SRID when defining a spatial data type, the field `srid` will always be 0 in lower MySQL versions.
字段`type`标识空间数据类型,例如`POINT`/`LINESTRING`/`POLYGON`。
The field `type` identifies the spatial data type, such as `POINT`/`LINESTRING`/`POLYGON`.
字段`coordinates`表示空间数据的`坐标`。
The field `coordinates` represents the `coordinates` of the spatial data.
对于`GEOMETRYCOLLECTION`,它将转换为 Json 字符串,格式固定,如:<br>
For `GEOMETRYCOLLECTION`, it will be converted into a JSON string with a fixed format like:<br>
```json
{"srid": 0 , "type": "GeometryCollection", "geometries": [{"type":"Point","coordinates":[10,10]}]}
```
`Geometrics`字段是一个包含所有空间数据的数组。
The field `geometries` is an array containing all the spatial data.
不同空间数据类型映射的示例如下:
Examples of the mapping for different spatial data types are as follows:
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
<thead>
@ -553,7 +567,4 @@ MySQL中除`GEOMETRYCOLLECTION`之外的空间数据类型都会转换为 Json
</table>
</div>
常见问题
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -0,0 +1,44 @@
---
title: "Overview"
weight: 1
type: docs
aliases:
- /connectors/pipeline-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Pipeline Connectors Of CDC Streaming ELT Framework
## Supported Connectors
| Connector | Database |
|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [doris-pipeline](doris-pipeline.md) | <li> [Doris](https://doris.apache.org/): 1.2.x, 2.x.x |
| [mysql-pipeline](mysql-pipeline.md) | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x <li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x <li> [PolarDB MySQL](https://www.aliyun.com/product/polardb): 5.6, 5.7, 8.0.x <li> [Aurora MySQL](https://aws.amazon.com/cn/rds/aurora): 5.6, 5.7, 8.0.x <li> [MariaDB](https://mariadb.org): 10.x <li> [PolarDB X](https://github.com/ApsaraDB/galaxysql): 2.0.1 |
| [starrocks-pipeline](starrocks-pipeline.md) | <li> [StarRocks](https://www.starrocks.io/): 2.x, 3.x |
## Supported Flink Versions
The following table shows the version mapping between Flink<sup>®</sup> CDC Pipeline and Flink<sup>®</sup>:
| Flink<sup>®</sup> CDC Version | Flink<sup>®</sup> Version |
|:-----------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| <font color="DarkCyan">3.0.*</font> | <font color="MediumVioletRed">1.14.\*</font>, <font color="MediumVioletRed">1.15.\*</font>, <font color="MediumVioletRed">1.16.\*</font>, <font color="MediumVioletRed">1.17.\*</font>, <font color="MediumVioletRed">1.18.\*</font> |
{{< top >}}

@ -1,3 +1,10 @@
---
title: "StarRocks Pipeline Connector"
weight: 4
type: docs
aliases:
- /pipelines/starrocks-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,19 +24,19 @@ specific language governing permissions and limitations
under the License.
-->
# StarRocks Pipeline 连接器
# StarRocks Pipeline Connector
StarRocks Pipeline 连接器可以用作 Pipeline 的 *Data Sink*,将数据写入[StarRocks](https://github.com/StarRocks/starrocks)。 本文档介绍如何设置 StarRocks Pipeline 连接器。
The StarRocks Pipeline connector can be used as the *Data Sink* of the pipeline, and write data to [StarRocks](https://github.com/StarRocks/starrocks). This document describes how to set up the StarRocks Pipeline connector.
## 连接器的功能
* 自动建表
* 表结构变更同步
* 数据实时同步
## What can the connector do?
* Create tables automatically if they do not exist
* Schema change synchronization
* Data synchronization
如何创建 Pipeline
How to create Pipeline
----------------
从 MySQL 读取数据同步到 StarRocks 的 Pipeline 可以定义如下:
The pipeline for reading data from MySQL and sink to StarRocks can be defined as follows:
```yaml
source:
@ -55,7 +62,7 @@ pipeline:
parallelism: 2
```
Pipeline 连接器配置项
Pipeline Connector Options
----------------
<div class="highlight">
<table class="colwidths-auto docutils">
@ -74,162 +81,170 @@ Pipeline 连接器配置项
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>指定要使用的连接器, 这里需要设置成 <code>'starrocks'</code>.</td>
<td>Specify what connector to use, here should be <code>'starrocks'</code>.</td>
</tr>
<tr>
<td>name</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Sink 的名称.</td>
<td>The name of the sink.</td>
</tr>
<tr>
<td>jdbc-url</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>用于访问 FE 节点上的 MySQL 服务器。多个地址用英文逗号(,)分隔。格式:`jdbc:mysql://fe_host1:fe_query_port1,fe_host2:fe_query_port2`。</td>
<td>The address that is used to connect to the MySQL server of the FE. You can specify multiple addresses, which must be separated by a comma (,). Format: `jdbc:mysql://fe_host1:fe_query_port1,fe_host2:fe_query_port2,fe_host3:fe_query_port3`.</td>
</tr>
<tr>
<td>load-url</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>用于访问 FE 节点上的 HTTP 服务器。多个地址用英文分号(;)分隔。格式:`fe_host1:fe_http_port1;fe_host2:fe_http_port2`。</td>
<td>The address that is used to connect to the HTTP server of the FE. You can specify multiple addresses, which must be separated by a semicolon (;). Format: `fe_host1:fe_http_port1;fe_host2:fe_http_port2`.</td>
</tr>
<tr>
<td>username</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>StarRocks 集群的用户名。</td>
<td>User name to use when connecting to the StarRocks database.</td>
</tr>
<tr>
<td>password</td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>StarRocks 集群的用户密码。</td>
<td>Password to use when connecting to the StarRocks database.</td>
</tr>
<tr>
<td>sink.label-prefix</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>指定 Stream Load 使用的 label 前缀。</td>
<td>The label prefix used by Stream Load.</td>
</tr>
<tr>
<td>sink.connect.timeout-ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">30000</td>
<td>String</td>
<td>与 FE 建立 HTTP 连接的超时时间。取值范围:[100, 60000]。</td>
<td>The timeout for establishing HTTP connection. Valid values: 100 to 60000.</td>
</tr>
<tr>
<td>sink.wait-for-continue.timeout-ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">30000</td>
<td>String</td>
<td>等待 FE HTTP 100-continue 应答的超时时间。取值范围:[3000, 60000]。</td>
      <td>Timeout in milliseconds to wait for a 100-continue response from the FE HTTP server.
        Valid values: 3000 to 600000.</td>
</tr>
<tr>
<td>sink.buffer-flush.max-bytes</td>
<td>optional</td>
<td style="word-wrap: break-word;">157286400</td>
<td>Long</td>
<td>内存中缓冲的数据量大小缓冲区由所有导入的表共享达到阈值后将选择一个或多个表的数据写入到StarRocks。
达到阈值后取值范围:[64MB, 10GB]。</td>
<td>The maximum size of data that can be accumulated in memory before being sent to StarRocks at a time.
The value ranges from 64 MB to 10 GB. This buffer is shared by all tables in the sink. If the buffer
is full, the connector will choose one or more tables to flush.</td>
</tr>
<tr>
<td>sink.buffer-flush.interval-ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">300000</td>
<td>Long</td>
<td>每个表缓冲数据发送的间隔,用于控制数据写入 StarRocks 的延迟。单位是毫秒,取值范围:[1000, 3600000]。</td>
      <td>The interval at which data is flushed for each table, in milliseconds.</td>
</tr>
<tr>
<td>sink.scan-frequency.ms</td>
<td>optional</td>
<td style="word-wrap: break-word;">50</td>
<td>Long</td>
<td>连接器会定期检查每个表是否到达发送间隔,该配置控制检查频率,单位为毫秒。</td>
<td>Scan frequency in milliseconds to check whether the buffered data for a table should be flushed
because of reaching the flush interval.</td>
</tr>
<tr>
<td>sink.io.thread-count</td>
<td>optional</td>
<td style="word-wrap: break-word;">2</td>
<td>Integer</td>
<td>用来执行 Stream Load 的线程数,不同表之间的导入可以并发执行。</td>
<td>Number of threads used for concurrent stream loads among different tables.</td>
</tr>
<tr>
<td>sink.at-least-once.use-transaction-stream-load</td>
<td>optional</td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>at-least-once 下是否使用 transaction stream load。</td>
<td>Whether to use transaction stream load for at-least-once when it's available.</td>
</tr>
<tr>
<td>sink.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Stream Load 的参数,控制 Stream Load 导入行为。例如 参数 `sink.properties.timeout` 用来控制导入的超时时间。
全部参数和解释请参考 <a href="https://docs.starrocks.io/zh/docs/sql-reference/sql-statements/data-manipulation/STREAM_LOAD">
STREAM LOAD</a></td>
<td>The parameters that control Stream Load behavior. For example, the parameter `sink.properties.timeout`
specifies the timeout of Stream Load. For a list of supported parameters and their descriptions,
see <a href="https://docs.starrocks.io/docs/sql-reference/sql-statements/data-manipulation/STREAM_LOAD">
STREAM LOAD</a>.</td>
</tr>
<tr>
<td>table.create.num-buckets</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>自动创建 StarRocks 表时使用的桶数。对于 StarRocks 2.5 及之后的版本可以不设置StarRocks 将会
<a href="https://docs.starrocks.io/zh/docs/table_design/Data_distribution/#%E7%A1%AE%E5%AE%9A%E5%88%86%E6%A1%B6%E6%95%B0%E9%87%8F">
自动设置分桶数量</a>;对于 StarRocks 2.5 之前的版本必须设置。</td>
<td>Number of buckets when creating a StarRocks table automatically. For StarRocks 2.5 or later, it's not required
to set the option because StarRocks can
<a href="https://docs.starrocks.io/docs/table_design/Data_distribution/#determine-the-number-of-buckets">
determine the number of buckets automatically</a>. For StarRocks prior to 2.5, you must set this option. </td>
</tr>
<tr>
<td>table.create.properties.*</td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>自动创建 StarRocks 表时使用的属性。比如: 如果使用 StarRocks 3.2 及之后的版本,<code>'table.create.properties.fast_schema_evolution' = 'true'</code>
将会打开 fast schema evolution 功能。 更多信息请参考
<a href="https://docs.starrocks.io/zh/docs/table_design/table_types/primary_key_table/">主键模型</a></td>
<td>Properties used for creating a StarRocks table. For example: <code>'table.create.properties.fast_schema_evolution' = 'true'</code>
will enable fast schema evolution if you are using StarRocks 3.2 or later. For more information,
see <a href="https://docs.starrocks.io/docs/table_design/table_types/primary_key_table">how to create a primary key table</a>.</td>
</tr>
<tr>
<td>table.schema-change.timeout</td>
<td>optional</td>
<td style="word-wrap: break-word;">30min</td>
<td>Duration</td>
<td>StarRocks 侧执行 schema change 的超时时间,必须是秒的整数倍。超时后 StarRocks 将会取消 schema change从而导致作业失败。</td>
      <td>Timeout for a schema change on the StarRocks side; it must be an integral multiple of
        seconds. StarRocks will cancel the schema change after the timeout, which will
        cause the sink to fail.</td>
</tr>
</tbody>
</table>
</div>
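
As a concrete illustration of the options above, a StarRocks sink definition might be sketched as follows; addresses, credentials, and property values are placeholders, and the bucket and fast-schema-evolution settings are only needed in the situations described in the table:

```yaml
sink:
  type: starrocks
  name: StarRocks Sink
  jdbc-url: jdbc:mysql://127.0.0.1:9030                  # placeholder FE query-port address
  load-url: 127.0.0.1:8030                               # placeholder FE HTTP-port address
  username: root                                         # placeholder credentials
  password: ""
  sink.properties.timeout: 600                           # pass-through Stream Load parameter (illustrative value)
  table.create.num-buckets: 8                            # required only for StarRocks earlier than 2.5
  table.create.properties.fast_schema_evolution: true    # StarRocks 3.2 or later only
```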
使用说明
Usage Notes
--------
* 只支持主键表,因此源表必须有主键
* Only StarRocks primary key tables are supported, so the source table must have primary keys.
* 暂不支持 exactly-once连接器 通过 at-least-once 和主键表实现幂等写
* Exactly-once is not supported. The connector uses at-least-once together with primary key tables to achieve idempotent writes.
* 对于自动建表
* 分桶键和主键相同
* 没有分区键
* 分桶数由 `table.create.num-buckets` 控制。如果使用的 StarRocks 2.5 及之后的版本可以不设置StarRocks 能够
<a href="https://docs.starrocks.io/zh/docs/table_design/Data_distribution/#%E7%A1%AE%E5%AE%9A%E5%88%86%E6%A1%B6%E6%95%B0%E9%87%8F">
自动设置分桶数量</a>。对于 StarRocks 2.5 之前的版本必须设置,否则无法自动创建表。
* For creating table automatically
* the distribution keys are the same as the primary keys
* there is no partition key
* the number of buckets is controlled by `table.create.num-buckets`. If you are using StarRocks 2.5 or later,
it's not required to set the option because StarRocks can [determine the number of buckets automatically](https://docs.starrocks.io/docs/table_design/Data_distribution/#determine-the-number-of-buckets),
otherwise you must set the option.
* 对于表结构变更同步
* 只支持增删列
* 新增列只能添加到最后一列
* 如果使用 StarRocks 3.2 及之后版本,并且通过连接器来自动建表, 可以通过配置 `table.create.properties.fast_schema_evolution``true`
来加速 StarRocks 执行变更。
* For schema change synchronization
* only supports add/drop columns
* the new column will always be added to the last position
* if your StarRocks version is 3.2 or later, and using the connector to create table automatically,
you can set `table.create.properties.fast_schema_evolution` to `true` to speed up the schema change.
* 对于数据同步pipeline 连接器使用 [StarRocks Sink 连接器](https://github.com/StarRocks/starrocks-connector-for-apache-flink)
将数据写入 StarRocks具体可以参考 [Sink 文档](https://github.com/StarRocks/starrocks-connector-for-apache-flink/blob/main/docs/content/connector-sink.md)。
* For data synchronization, the pipeline connector uses [StarRocks Sink Connector](https://github.com/StarRocks/starrocks-connector-for-apache-flink)
to write data to StarRocks. You can see [sink documentation](https://github.com/StarRocks/starrocks-connector-for-apache-flink/blob/main/docs/content/connector-sink.md)
for how it works.
数据类型映射
Data Type Mapping
----------------
<div class="wy-table-responsive">
<table class="colwidths-auto docutils">
@ -299,26 +314,25 @@ Pipeline 连接器配置项
<tr>
<td>CHAR(n) where n <= 85</td>
<td>CHAR(n * 3)</td>
<td>CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks
中为 n * 3。由于 StarRocks CHAR 类型的最大长度为255所以只有当 CDC 中长度不超过85时才将 CDC CHAR 映射到 StarRocks CHAR。</td>
<td>CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese
character is equal to three bytes, so the length for StarRocks is n * 3. Because the max length of StarRocks
CHAR is 255, map CDC CHAR to StarRocks CHAR only when the CDC length is no larger than 85.</td>
</tr>
<tr>
<td>CHAR(n) where n > 85</td>
<td>VARCHAR(n * 3)</td>
<td>CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks
中为 n * 3。由于 StarRocks CHAR 类型的最大长度为255所以当 CDC 中长度超过85时才将 CDC CHAR 映射到 StarRocks VARCHAR。</td>
<td>CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese
character is equal to three bytes, so the length for StarRocks is n * 3. Because the max length of StarRocks
CHAR is 255, map CDC CHAR to StarRocks VARCHAR if the CDC length is larger than 85.</td>
</tr>
<tr>
<td>VARCHAR(n)</td>
<td>VARCHAR(n * 3)</td>
<td>CDC 中长度表示字符数,而 StarRocks 中长度表示字节数。根据 UTF-8 编码,一个中文字符占用三个字节,因此 CDC 中的长度对应到 StarRocks
中为 n * 3。</td>
<td>CDC defines the length by characters, and StarRocks defines it by bytes. According to UTF-8, one Chinese
character is equal to three bytes, so the length for StarRocks is n * 3.</td>
</tr>
</tbody>
</table>
</div>
FAQ
--------
* [FAQ(English)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ)
* [FAQ(中文)](https://github.com/ververica/flink-cdc-connectors/wiki/FAQ(ZH))
{{< top >}}

@ -0,0 +1,26 @@
---
title: Development
icon: <i class="fa fa-code title maindish" aria-hidden="true"></i>
bold: true
sectionBreak: true
bookCollapseSection: true
weight: 2
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,130 @@
---
title: "CDC Streaming ELT Framework Concepts"
weight: 1
type: docs
aliases:
- /development/concept-pipeline.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# CDC Streaming ELT Framework
## What is CDC Streaming ELT Framework
CDC Streaming ELT Framework is a stream data integration framework that aims to provide users with a more robust API. It allows users to configure their data synchronization logic through customized Flink operators and job submission tools. The framework prioritizes optimizing the task submission process and offers enhanced functionalities such as whole database synchronization, sharding, and schema change synchronization.
## What can CDC Streaming ELT Framework do?
{{< img src="/fig/architecture.png" alt="CDC Architecture" >}}
* ✅ End-to-end data integration framework
* ✅ API for data integration users to build jobs easily
* ✅ Multi-table support in Source / Sink
* ✅ Synchronization of entire databases
* ✅ Schema evolution capability
## Core Concepts
{{< img src="/fig/design.png" alt="CDC Design" >}}
The data types flowing in the Flink CDC 3.0 framework are referred to as **Event**, which represent the change events generated by external systems.
Each event is marked with a **Table ID** for which the change occurred. Events are categorized into `SchemaChangeEvent` and `DataChangeEvent`, representing changes in table structure and data respectively.
**Data Source** Connector captures the changes in external systems and converts them into events as the output of the synchronization task. It also provides a `MetadataAccessor` for the framework to read the metadata of the external systems.
**Data Sink** connector receives the change events from **Data Source** and applies them to the external systems. Additionally, `MetadataApplier` is used to apply metadata changes from the source system to the target system.
Since events flow from the upstream to the downstream in a pipeline manner, the data synchronization task is referred to as a **Data Pipeline**. A **Data Pipeline** consists of a **Data Source**, **Route**, **Transform** and **Data Sink**. The transform can add extra content to events, and the router can remap the `Table ID`s corresponding to events.
Now let's introduce more details about the concepts you need to know when using the CDC Streaming ELT Framework.
### Table ID
When connecting to external systems, it is necessary to establish a mapping relationship with the storage objects of the external system. This is what `Table Id` refers to.
To be compatible with most external systems, the `Table ID` is represented by a 3-tuple : (namespace, schemaName, table). Connectors need to establish the mapping between Table ID and storage objects in external systems.
For instance, a table in MySQL/Doris is mapped to (null, database, table) and a topic in a message queue system such as Kafka is mapped to (null, null, topic).
### Data Source
Data Source is used to access metadata and read the changed data from external systems.
A Data Source can read data from multiple tables simultaneously.
To describe a data source, the following are required:
* Type: The type of the source, such as MySQL, Postgres.
* Name: The name of the source, which is user-defined (optional, with a default value provided).
* Other custom configurations for the source.
For example, we could use `yaml` files to define a mysql source
```yaml
source:
type: mysql
  name: mysql-source  # optional, description information
host: localhost
port: 3306
username: admin
password: pass
tables: adb.*, bdb.user_table_[0-9]+, [app|web]_order_\.*
```
### Data Sink
The Data Sink is used to apply schema changes and write change data to external systems. A Data Sink can write to multiple tables simultaneously.
To describe a data sink, the following are required:
* Type: The type of the sink, such as MySQL or PostgreSQL.
* Name: The name of the sink, which is user-defined (optional, with a default value provided).
* Other custom configurations for the sink.
For example, we can use this `yaml` file to define a kafka sink:
```yaml
sink:
type: kafka
name: mysink-queue # Optional parameter for description purpose
bootstrap-servers: localhost:9092
auto-create-table: true # Optional parameter for advanced functionalities
```
### Route
Route specifies the target table ID of each event.
The most typical scenario is the merge of sub-databases and sub-tables, routing multiple upstream source tables to the same sink table
To describe a route, the following are required:
* source-table: Source table id, supports regular expressions
* sink-table: Sink table id, supports regular expressions
* description: Routing rule description(optional, default value provided)
For example, to synchronize the table 'web_order' in the database 'mydb' to a Kafka topic 'ods_web_order', we can use this yaml file to define this route:
```yaml
route:
source-table: mydb.default.web_order
sink-table: ods_web_order
description: sync table to one destination table with given prefix ods_
```
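
For the sharded-table merge scenario mentioned above, a route rule might use a regular expression to map several source tables to one sink table; the table names below are placeholders for illustration:

```yaml
route:
  source-table: mydb.user_table_[0-9]+   # regular expression matching the sharded source tables
  sink-table: mydb.user_table            # all matched tables are routed to one sink table
  description: merge sharded tables into one destination table
```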
### Data Pipeline
Since events flow from the upstream to the downstream in a pipeline manner, the data synchronization task is also referred to as a Data Pipeline.
To describe a Data Pipeline, the following are required:
* Name: The name of the pipeline, which will be submitted to the Flink cluster as the job name.
* Other advanced capabilities such as automatic table creation, schema evolution, etc., will be implemented.
For example, we can use this yaml file to define a pipeline:
```yaml
pipeline:
name: mysql-to-kafka-pipeline
parallelism: 1
```
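
Putting the concepts together, a complete pipeline definition combining a source, a route, a sink, and the pipeline block might be sketched as follows; all hosts, credentials, and table names are placeholders reused from the examples above:

```yaml
source:
  type: mysql
  name: mysql-source
  host: localhost          # placeholder connection information
  port: 3306
  username: admin
  password: pass
  tables: mydb.web_order   # placeholder table pattern

route:
  source-table: mydb.web_order
  sink-table: ods_web_order
  description: route the order table to the ods topic

sink:
  type: kafka
  name: mysink-queue
  bootstrap-servers: localhost:9092

pipeline:
  name: mysql-to-kafka-pipeline
  parallelism: 1
```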
{{< top >}}

@ -1,3 +1,11 @@
---
title: "DataStream Api Package Guidance"
weight: 999
type: docs
aliases:
- /development/datastream-api-package-guidance.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,15 +25,15 @@ specific language governing permissions and limitations
under the License.
-->
# DataStream api 打包指南
# DataStream Api Package Guidance
本指南提供了 mysql cdc DataStream api 的简单 pom 示例
This guide provides a simple POM example for the MySQL CDC DataStream API.
## 框架版本
## Framework version
Flink 1.17.2, Flink MySQL CDC 2.4.2
## pom 示例
## pom example
```xml
<?xml version="1.0" encoding="UTF-8"?>
@ -34,7 +42,7 @@ flink 1.17.2 flink mysql cdc 2.4.2
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<artifactId>FlinkCDCTest</artifactId>
<version>1.0-SNAPSHOT</version>
@ -113,7 +121,7 @@ flink 1.17.2 flink mysql cdc 2.4.2
<version>30.1.1-jre-16.1</version>
</dependency>
<dependency>
<groupId>com.ververica</groupId>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-mysql-cdc</artifactId>
<version>2.4.2</version>
</dependency>
@ -164,8 +172,8 @@ flink 1.17.2 flink mysql cdc 2.4.2
<include>io.debezium:debezium-core</include>
<include>io.debezium:debezium-ddl-parser</include>
<include>io.debezium:debezium-connector-mysql</include>
<include>com.ververica:flink-connector-debezium</include>
<include>com.ververica:flink-connector-mysql-cdc</include>
<include>org.apache.flink:flink-connector-debezium</include>
<include>org.apache.flink:flink-connector-mysql-cdc</include>
<include>org.antlr:antlr4-runtime</include>
<include>org.apache.kafka:*</include>
<include>mysql:mysql-connector-java</include>
@ -225,10 +233,10 @@ flink 1.17.2 flink mysql cdc 2.4.2
</project>
```
## 代码示例
## code example
```java
package com.ververica.flink.cdc;
package org.apache.flink.flink.cdc;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
@ -241,26 +249,28 @@ public class CdcTest {
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
.hostname("yourHostname")
.port(yourPort)
.databaseList("yourDatabaseName") // 设置捕获的数据库, 如果需要同步整个数据库,请将 tableList 设置为 ".*".
.tableList("yourDatabaseName.yourTableName") // 设置捕获的表
.databaseList("yourDatabaseName") // set captured database, If you need to synchronize the whole database, Please set tableList to ".*".
.tableList("yourDatabaseName.yourTableName") // set captured table
.username("yourUsername")
.password("yourPassword")
.deserializer(new JsonDebeziumDeserializationSchema()) // 将 SourceRecord 转换为 JSON 字符串
.deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
.build();
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 设置 3s 的 checkpoint 间隔
// enable checkpoint
env.enableCheckpointing(3000);
env
.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
// 设置 source 节点的并行度为 1
// set 1 parallel source tasks
.setParallelism(1)
.print().setParallelism(1); // 设置 sink 节点并行度为 1
.print().setParallelism(1); // use parallelism 1 for sink
env.execute("Print MySQL Snapshot + Binlog");
}
}
```
{{< top >}}

@ -0,0 +1,25 @@
---
title: "FAQ"
icon: <i class="fa fa-question title appetizer" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 4
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,330 @@
---
title: "FAQ"
weight: 1
type: docs
aliases:
- /faq/faq.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## General FAQ
### Q1: Why can't I download the flink-sql-connector-mysql-cdc-2.2-SNAPSHOT jar? Why doesn't the Maven repository provide xxx-SNAPSHOT versions?
Like mainstream Maven version management, xxx-SNAPSHOT versions correspond to the code of the development branch, and users need to download the source code and compile the corresponding jar themselves. Users should use released versions, such as flink-sql-connector-mysql-cdc-2.1.0.jar, which are available in the Maven central repository.
### Q2: When should I use the flink-sql-connector-xxx.jar, and when the flink-connector-xxx.jar? What's the difference between the two?
The dependency management of each connector in the Flink CDC project is consistent with that in the Flink project. flink-sql-connector-xx is a fat jar: in addition to the connector code, it also shades all the third-party dependencies the connector needs, and it is intended for SQL jobs; users only need to put the fat jar in the flink/lib directory. flink-connector-xx contains only the connector code without the required dependencies, and it is intended for DataStream jobs; users need to manage the required third-party dependencies themselves, and conflicting dependencies need to be excluded and shaded by themselves.
### Q3: Why was the package name changed from com.alibaba.ververica to org.apache.flink? Why can't the 2.x versions be found in the Maven repository?
The Flink CDC project changed its group ID from com.alibaba.ververica to org.apache.flink since version 2.0.0, to make the project more community-neutral and more convenient for developers from various companies to build on. So when looking for 2.x packages in the Maven repository, the path is /org/apache/flink.
## MySQL CDC FAQ
### Q1: I use CDC 2.x and can only read the full data, but not the binlog data. What's the matter?
CDC 2.0 supports a lock-free algorithm and concurrent reading. To guarantee the order of full data + incremental data, it relies on Flink's checkpoint mechanism, so checkpointing needs to be enabled for the job.
Configuration method in SQL job:
```sql
Flink SQL> SET 'execution.checkpointing.interval' = '3s';
```
DataStream job configuration mode:
```java
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(3000);
```
### Q2: Using MySQL CDC DataStream API, the timestamp field read in the incremental phase has a time zone difference of 8 hours. What's the matter?
When parsing the timestamp field in binlog data, CDC uses the server time zone configured in the job, which should be the time zone of the MySQL server. If this configured time zone is not consistent with the actual time zone of your MySQL server, this problem will occur.
In addition, if a custom deserializer is used in the DataStream job
(such as MyDeserializer implements DebeziumDeserializationSchema), then when the custom deserializer parses timestamp-type data, it needs to refer to how RowDataDebeziumDeserializeSchema parses the timestamp type and use the given time zone information.
```java
private TimestampData convertToTimestamp(Object dbzObj, Schema schema) {
if (dbzObj instanceof Long) {
switch (schema.name()) {
case Timestamp.SCHEMA_NAME:
return TimestampData.fromEpochMillis((Long) dbzObj);
case MicroTimestamp.SCHEMA_NAME:
long micro = (long) dbzObj;
return TimestampData.fromEpochMillis(micro / 1000, (int) (micro % 1000 * 1000));
case NanoTimestamp.SCHEMA_NAME:
long nano = (long) dbzObj;
return TimestampData.fromEpochMillis(nano / 1000_000, (int) (nano % 1000_000));
}
}
LocalDateTime localDateTime = TemporalConversions.toLocalDateTime(dbzObj, serverTimeZone);
return TimestampData.fromLocalDateTime(localDateTime);
}
```
### Q3: Does MySQL CDC support listening on a slave database? How should the slave database be configured?
Yes. The slave database needs to be configured with log-slave-updates = 1 so that the slave instance can also write the data synchronized from the master instance to its own binlog files. If the master database has GTID mode enabled, the slave database also needs to enable it.
```
log-slave-updates = 1
gtid_mode = on
enforce_gtid_consistency = on
```
### Q4: I want to synchronize sharded databases and tables. How should I configure them?
In the WITH options of the MySQL CDC table, both table-name and database-name support regular expressions, such as 'table-name' = 'user_.*' to match tables like user_1, user_2 and user_a.
Note that in these regular expressions, matching any single character is '.' rather than '*': the dot represents any single character and the asterisk represents zero or more occurrences. The same applies to database-name. Also note that the sharded tables should have the same schema.
### Q5: I want to skip the snapshot reading phase and only read binlog data. How do I configure this?
In the WITH options of the MySQL CDC table:
```
'scan.startup.mode' = 'latest-offset'.
```
### Q6: I want to get DDL events in the database. What should I do? Is there a demo?
Flink CDC provides the DataStream API `MySqlSource` since version 2.1. Users can configure `includeSchemaChanges` to indicate whether DDL events are required. After obtaining DDL events, they can write code for further processing.
```java
public void consumingAllEvents() throws Exception {
inventoryDatabase.createAndInitialize();
MySqlSource<String> mySqlSource =
MySqlSource.<String>builder()
.hostname(MYSQL_CONTAINER.getHost())
.port(MYSQL_CONTAINER.getDatabasePort())
.databaseList(inventoryDatabase.getDatabaseName())
.tableList(inventoryDatabase.getDatabaseName() + ".products")
.username(inventoryDatabase.getUsername())
.password(inventoryDatabase.getPassword())
.serverId("5401-5404")
.deserializer(new JsonDebeziumDeserializationSchema())
.includeSchemaChanges(true) // Configure here and output DDL events
.build();
... // Other processing logic
}
```
### Q7: How to synchronize the whole MySQL database? Does Flink CDC support it?
The DataStream API provided in Q6 has enabled users to obtain DDL change events and data change events. On this basis, users need to develop DataStream jobs according to their own business logic and downstream storage.
### Q8: In the same MySQL instance, the tables of one database cannot synchronize incremental data, but other databases work fine. Why?
Users can check Binlog_Ignore_DB and Binlog_Do_DB through the `show master status` command:
```mysql
mysql> show master status;
+------------------+----------+--------------+------------------+----------------------+
| File | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set |
+------------------+----------+--------------+------------------+----------------------+
| mysql-bin.000006 | 4594 | | | xxx:1-15 |
+------------------+----------+--------------+------------------+----------------------+
```
### Q9: The job reports an error the connector is trying to read binlog starting at GTIDs xxx and binlog file 'binlog.000064', pos=89887992, skipping 4 events plus 1 rows, but this is no longer available on the server. Reconfigure the connector to use a snapshot when needed, What should I do?
This error occurs because the binlog file being read by the job has been cleaned up on the MySQL server. Generally, the expiration time of the binlog file retained on the MySQL server is too short. You can set this value higher, such as 7 days.
```mysql
mysql> show variables like 'expire_logs_days';
mysql> set global expire_logs_days=7;
```
In another case, the Flink CDC job consumes the binlog too slowly; in that case, allocating more resources to the job generally helps.
### Q10: The job reports an error ConnectException: A slave with the same server_uuid/server_id as this slave has connected to the master,What should I do?
This error occurs because the server ID used in the job conflicts with the server ID used by other jobs or other synchronization tools. The server ID needs to be globally unique; it is an int-type integer. In CDC 2.x, each source subtask requires a server ID, so it is recommended to plan server IDs properly. For example, if the source parallelism of the job is 4, you can configure 'server-id' = '5001-5004', so that the server ID of each source task does not conflict.
### Q11: The job reports an error ConnectException: Received DML ‘…’ for processing, binlog probably contains events generated with statement or mixed based replication format,What should I do?
This error occurs because the MySQL server is not configured correctly. You need to check whether the binlog format is ROW. You can check it with the following command:
```mysql
mysql> show variables like '%binlog_format%';
```
### Q12: The job reports an error Mysql8.0 Public Key Retrieval is not allowed,What should I do?
This is because the configured MySQL user uses SHA-256 password authentication, which requires TLS or similar protocols to transmit the password. A simple workaround is to allow the MySQL user to use native password authentication:
```mysql
mysql> ALTER USER 'username'@'localhost' IDENTIFIED WITH mysql_native_password BY 'password';
mysql> FLUSH PRIVILEGES;
```
### Q13: The job reports an error EventDataDeserializationException: Failed to deserialize data of EventHeaderV4 .... Caused by: java.net.SocketException: Connection reset,What should I do?
This problem is generally caused by the network. First, check the network between the Flink cluster and the database, and then increase the network parameters of the MySQL server.
```mysql
mysql> set global slave_net_timeout = 120;
mysql> set global thread_pool_idle_timeout = 120;
```
Or try to use the flink configuration as follows.
```
execution.checkpointing.interval=10min
execution.checkpointing.tolerable-failed-checkpoints=100
restart-strategy=fixed-delay
restart-strategy.fixed-delay.attempts=2147483647
restart-strategy.fixed-delay.delay= 30s
```
If there is bad back pressure in the job, this problem may happen too. Then you need to handle the back pressure in the job first.
### Q14: The job reports an error The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires,What should I do?
The reason for this problem is that the snapshot (full) phase of the job reads too slowly: by the time the snapshot phase is finished, the GTID position recorded at its beginning has already been purged by MySQL. You can increase the retention time of binlog files on the MySQL server, or increase the source parallelism so that the snapshot phase finishes faster.
### Q15: How to configure the `tableList` option when building a MySQL CDC source in the DataStream API?
In the DataStream API, the `tableList` option requires table names qualified with the database name rather than the table name alone. For the MySQL CDC source, the `tableList` option value should look like `my_db.my_table`.
## Postgres CDC FAQ
### Q1: The disk utilization of the PG server is high. Why is the WAL not released?
Flink Postgres CDC updates the LSN in the Postgres slot only when a checkpoint is completed. Therefore, if you find that disk utilization is high, first confirm whether checkpointing is enabled.
### Q2: Flink Postgres CDC returns null for DECIMAL types exceeding the maximum precision (38, 18) when synchronizing Postgres
In Flink, if the precision of the received data is greater than the precision of the type declared in Flink, the data will be processed as null. You can configure 'debezium.decimal.handling.mode' = 'string' to process the read data as strings.
### Q3: Flink Postgres CDC reports that TOAST data is not transmitted. What is the reason?
First, make sure the replica identity is FULL. TOAST data is relatively large, so to keep the WAL small, the wal2json plugin does not include unchanged TOAST data in update records. To avoid this problem, you can set 'debezium.schema.refresh.mode' = 'columns_diff_exclude_unchanged_toast'.
### Q4: The job reports an error replication slot "XXXX" is active. What should I do?
Currently, Flink Postgres CDC does not release the replication slot automatically when the job exits. There are two ways to solve this problem:
- Go to Postgres and manually execute the following commands:
```
select pg_drop_replication_slot('rep_slot');
ERROR: replication slot "rep_slot" is active for PID 162564
select pg_terminate_backend(162564); select pg_drop_replication_slot('rep_slot');
```
- Add 'debezium.slot.drop.on.stop' = 'true' to the WITH options of the Postgres source to automatically clean up the slot when the job stops, as shown in the sketch below.
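A minimal sketch of the second approach, assuming a placeholder connection and the slot name `rep_slot` used above:
```sql
CREATE TABLE pg_source (
  id INT,
  name STRING,
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'postgres-host',   -- placeholder
  'port' = '5432',
  'username' = 'flinkuser',       -- placeholder
  'password' = 'flinkpw',         -- placeholder
  'database-name' = 'mydb',
  'schema-name' = 'public',
  'table-name' = 'orders',
  'slot.name' = 'rep_slot',
  -- drop the replication slot automatically when the job stops
  'debezium.slot.drop.on.stop' = 'true'
);
```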
### Q5: The job encounters dirty data, such as illegal dates. Are there parameters to configure for filtering it?
Yes. You can add the option 'debezium.event.deserialization.failure.handling.mode' = 'warn' in the WITH parameters of the Flink CDC table to skip the dirty data and print it to the WARN log. You can also configure 'debezium.event.deserialization.failure.handling.mode' = 'ignore' to skip the dirty data directly without logging it.
### Q6: How to configure the `tableList` option when building a Postgres CDC source in the DataStream API?
In the DataStream API, the `tableList` option requires fully qualified table names (schema name plus table name) rather than bare table names. For the Postgres CDC source, a `tableList` option value should look like `my_schema.my_table`.
## MongoDB CDC FAQ
### Q1: Does MongoDB CDC support full + incremental reading and incremental-only reading?
Yes. The default is full + incremental reading; set 'copy.existing' = 'false' to read only incremental changes, as in the sketch below.
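A minimal sketch of an incremental-only MongoDB CDC table; the hosts, credentials, database, and collection names are placeholders:
```sql
CREATE TABLE mongodb_source (
  _id STRING,
  name STRING,
  PRIMARY KEY (_id) NOT ENFORCED
) WITH (
  'connector' = 'mongodb-cdc',
  'hosts' = 'mongo-host:27017',   -- placeholder
  'username' = 'mongouser',       -- placeholder
  'password' = 'mongopw',         -- placeholder
  'database' = 'mgdb',
  'collection' = 'orders',
  -- skip the snapshot (copy existing) phase and read only incremental changes
  'copy.existing' = 'false'
);
```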
### Q2: Does MongoDB CDC support recovery from a checkpoint? What is the principle?
Yes. The checkpoint records the resume token of the change stream, and during recovery the change stream can be resumed from that resume token. The resume token corresponds to an entry in `oplog.rs` (the MongoDB change log collection), which is a capped collection of fixed size. If the record corresponding to the resume token no longer exists in `oplog.rs`, an invalid resume token exception may occur. In this case, you can set a proper size for `oplog.rs` so that its retention time is not too short; see https://docs.mongodb.com/manual/tutorial/change-oplog-size/. In addition, the resume token is refreshed by newly arriving change records and heartbeat records.
### Q3: Does MongoDB CDC support outputting -U (update_before) messages?
MongoDB's original `oplog.rs` only has insert, update, replace, and delete operation types and does not retain the pre-update image, so it cannot output -U messages directly; it can only achieve upsert semantics in Flink. When using MongoDBTableSource, the Flink planner automatically applies a ChangelogNormalize optimization that fills in the missing -U messages and outputs the complete +I, -U, +U, and -D messages. The cost of the ChangelogNormalize optimization is that the operator keeps the state of all previous keys. If a DataStream job uses MongoDBSource directly, without the Flink planner's optimization, ChangelogNormalize is not applied automatically, so -U messages cannot be obtained directly. To obtain the pre-update image you need to manage the state yourself; if you do not want to do that, you can convert MongoDBTableSource to a changelog stream or retract stream and let the Flink planner's optimization supply the pre-update image. An example is as follows:
```
tEnv.executeSql("CREATE TABLE orders ( ... ) WITH ( 'connector'='mongodb-cdc',... )");
Table table = tEnv.from("orders")
.select($("*"));
tEnv.toChangelogStream(table)
.print()
.setParallelism(1);
env.execute();
```
### Q4: Does MongoDB CDC support subscribing to multiple collections?
Only all collections of a whole database can be subscribed; filtering an arbitrary subset of collections is not supported. For example, if the database is configured as 'mgdb' and the collection is an empty string, all collections under the 'mgdb' database are subscribed.
### Q5: Does MongoDB CDC support setting multiple parallel reads?
Not yet supported.
### Q6: Which MongoDB versions are supported by MongoDB CDC?
MongoDB CDC is implemented on top of the change stream feature, which was introduced in MongoDB 3.6, so MongoDB CDC theoretically supports 3.6 and above; running version >= 4.0 is recommended. On versions lower than 3.6, the following error occurs: Unrecognized pipeline stage name: '$changeStream'.
### Q7: Which MongoDB deployment modes are supported by MongoDB CDC?
Change streams require MongoDB to run as a replica set or a sharded cluster. For local tests you can use a single-node replica set initialized with rs.initiate().
In standalone mode the following error occurs: The $changeStream stage is only supported on replica sets.
### Q8: MongoDB CDC reports that the user name and password are incorrect, but other components can connect normally with the same credentials. What is the reason?
This happens when the user was created in a database other than the one being connected to. Add 'connection.options' = 'authSource=<the database where the user was created>' to the WITH parameters.
### Q9: Does MongoDB CDC support Debezium-related parameters?
No. The MongoDB CDC connector is developed independently within the Flink CDC project and does not rely on the Debezium project, so Debezium-related parameters are not supported.
### Q10: In the MongoDB CDC full reading phase, can the job resume from a checkpoint after a failure?
No. In the full reading phase, MongoDB CDC does not complete a checkpoint until the full reading phase finishes; if the job fails during this phase, MongoDB CDC reads the snapshot data again.
## Oracle CDC FAQ
### Q1: Oracle CDC's archive logs grow rapidly and logs are read slowly. What should I do?
You can use the online catalog mining mode, which does not write the data dictionary to the redo logs but cannot process DDL statements. The default strategy of a production environment reads logs slowly because it writes the data dictionary information to the redo logs, which greatly increases the log volume. You can add the following Debezium configuration items: 'log.mining.strategy' = 'online_catalog', 'log.mining.continuous.mine' = 'true'. If you use SQL, prefix each configuration item with 'debezium.', namely:
```
'debezium.log.mining.strategy' = 'online_catalog',
'debezium.log.mining.continuous.mine' = 'true'
```
### Q2: The job reports an error Caused by: io.debezium.DebeziumException: Supplemental logging not configured for table xxx. Use command: ALTER TABLE xxx ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS. What should I do?
For Oracle 11, Debezium sets tableIdCaseInsensitive to true by default, which converts the table name to lowercase. As a result, the supplemental logging configuration of the table cannot be found in Oracle, which triggers the spurious "Supplemental logging not configured for table" error.
If you use the DataStream API, add the Debezium configuration item 'database.tablename.case.insensitive' = 'false'. If you use the SQL API, add the option 'debezium.database.tablename.case.insensitive' = 'false' to the table definition.
### Q3: How does Oracle CDC switch to XStream?
Add the configuration item 'database.connection.adapter' = 'xstream' in the DataStream API; if you're using the SQL API, use 'debezium.database.connection.adapter' = 'xstream'.
### Q4: What are the database name and schema name in Oracle CDC?
The database name is the name of the database instance, that is, the SID of Oracle. The schema name is the schema the table belongs to. Generally speaking, each user corresponds to a schema: the user's schema name equals the user name and serves as the user's default schema. Therefore, the schema name is usually the name of the user that created the table, but if a schema is specified explicitly when creating the table, that schema is the schema name. For example, if CREATE TABLE aaaa.testtable(xxxx) succeeds, then aaaa is the schema name.
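As an illustrative sketch only (the instance, schema, table, and credentials below are placeholders), the mapping to the Oracle CDC table options looks like this:
```sql
CREATE TABLE oracle_source (
  ID INT,
  NAME STRING,
  PRIMARY KEY (ID) NOT ENFORCED
) WITH (
  'connector' = 'oracle-cdc',
  'hostname' = 'oracle-host',     -- placeholder
  'port' = '1521',
  'username' = 'flinkuser',       -- placeholder
  'password' = 'flinkpw',         -- placeholder
  'database-name' = 'ORCLCDB',    -- the Oracle database instance name (SID)
  'schema-name' = 'AAAA',         -- the schema that owns the table
  'table-name' = 'TESTTABLE'
);
```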

@ -0,0 +1,25 @@
---
title: "Try Flink CDC"
icon: <i class="fa fa-rocket title appetizer" aria-hidden="true"></i>
bold: true
bookCollapseSection: true
weight: 1
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,25 @@
---
title: CDC Connectors
bookCollapseSection: true
weight: 2
aliases:
- /try-flink-cdc/cdc-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

@ -0,0 +1,331 @@
---
title: "Building a Real-time Data Lake with Flink CDC"
weight: 999
type: docs
aliases:
- /development/build-real-time-data-lake-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake
For OLTP databases, to deal with a huge amount of data in a single table, we usually shard it across databases and tables to get better throughput.
But sometimes, for convenient analysis, we need to merge them into one table when loading them to data warehouse or data lake.
This tutorial will show how to use Flink CDC to build a real-time data lake for such a scenario.
You can walk through the tutorial easily in the docker environment. The entire process uses standard SQL syntax without a single line of Java/Scala code or IDE installation.
The following sections will take the pipeline from MySQL to [Iceberg](https://iceberg.apache.org/) as an example. The overview of the architecture is as follows:
{{< img src="/fig/real-time-data-lake-tutorial/real-time-data-lake-tutorial.png" alt="Real-time data lake with Flink CDC" >}}
You can also use other data sources like Oracle/Postgres and sinks like Hudi to build your own pipeline.
## Preparation
Prepare a Linux or MacOS computer with Docker installed.
## Preparing JAR package required
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.**
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- [flink-shaded-hadoop-2-uber-2.7.5-10.0.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.7.5-10.0/flink-shaded-hadoop-2-uber-2.7.5-10.0.jar)
- [iceberg-flink-runtime-1.16-1.3.1.jar](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/1.3.1/iceberg-flink-runtime-1.16-1.3.1.jar)
### Starting components required
The components required in this tutorial are all managed in containers, so we will use `docker-compose` to start them.
1. Create `Dockerfile` file using following contents:
```dockerfile
FROM flink:1.16.0-scala_2.12
# Place the downloaded jar packages in the lib directory at the same level.
COPY ./lib /opt/flink/lib
RUN apt-get update && apt-get install -y tree
```
2. Create `docker-compose.yml` file using following contents:
```yml
version: '2.1'
services:
sql-client:
user: flink:flink
build: .
command: bin/sql-client.sh
depends_on:
- jobmanager
- mysql
environment:
- MYSQL_HOST=mysql
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: jobmanager
rest.address: jobmanager
volumes:
- shared-tmpfs:/tmp/iceberg
jobmanager:
user: flink:flink
build: .
ports:
- "8081:8081"
command: jobmanager
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: jobmanager
volumes:
- shared-tmpfs:/tmp/iceberg
taskmanager:
user: flink:flink
build: .
depends_on:
- jobmanager
command: taskmanager
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: jobmanager
taskmanager.numberOfTaskSlots: 2
volumes:
- shared-tmpfs:/tmp/iceberg
mysql:
image: debezium/example-mysql:1.1
ports:
- "3306:3306"
environment:
- MYSQL_ROOT_PASSWORD=123456
- MYSQL_USER=mysqluser
- MYSQL_PASSWORD=mysqlpw
volumes:
shared-tmpfs:
driver: local
driver_opts:
type: "tmpfs"
device: "tmpfs"
```
The Docker Compose environment consists of the following containers:
- SQL-Client: Flink SQL Client, used to submit queries and visualize their results.
- Flink Cluster: a Flink JobManager and a Flink TaskManager container to execute queries.
- MySQL: mainly used as a data source to store the sharding table.
3. To start all containers, run the following command in the directory that contains the `docker-compose.yml` file:
```shell
docker-compose up -d
```
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run `docker ps` to check whether these containers are running properly.
We can also visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally.
***Note:***
* If you want to run with your own Flink environment, remember to download the jar packages and then put them to `FLINK_HOME/lib/`.
* All the following commands involving `docker-compose` should be executed in the directory of the `docker-compose.yml` file.
{{< img src="/fig/real-time-data-lake-tutorial/flink-ui.png" alt="Flink UI" >}}
### Preparing data in databases
1. Enter mysql's container:
```shell
docker-compose exec mysql mysql -uroot -p123456
```
2. Create databases/tables and populate data:
Create a logical sharding table `user` that is physically sharded across different databases and tables.
```sql
CREATE DATABASE db_1;
USE db_1;
CREATE TABLE user_1 (
id INTEGER NOT NULL PRIMARY KEY,
name VARCHAR(255) NOT NULL DEFAULT 'flink',
address VARCHAR(1024),
phone_number VARCHAR(512),
email VARCHAR(255)
);
INSERT INTO user_1 VALUES (110,"user_110","Shanghai","123567891234","user_110@foo.com");
CREATE TABLE user_2 (
id INTEGER NOT NULL PRIMARY KEY,
name VARCHAR(255) NOT NULL DEFAULT 'flink',
address VARCHAR(1024),
phone_number VARCHAR(512),
email VARCHAR(255)
);
INSERT INTO user_2 VALUES (120,"user_120","Shanghai","123567891234","user_120@foo.com");
```
```sql
CREATE DATABASE db_2;
USE db_2;
CREATE TABLE user_1 (
id INTEGER NOT NULL PRIMARY KEY,
name VARCHAR(255) NOT NULL DEFAULT 'flink',
address VARCHAR(1024),
phone_number VARCHAR(512),
email VARCHAR(255)
);
INSERT INTO user_1 VALUES (110,"user_110","Shanghai","123567891234", NULL);
CREATE TABLE user_2 (
id INTEGER NOT NULL PRIMARY KEY,
name VARCHAR(255) NOT NULL DEFAULT 'flink',
address VARCHAR(1024),
phone_number VARCHAR(512),
email VARCHAR(255)
);
INSERT INTO user_2 VALUES (220,"user_220","Shanghai","123567891234","user_220@foo.com");
```
## Creating tables using Flink DDL in Flink SQL CLI
First, use the following command to enter the Flink SQL CLI Container:
```shell
docker-compose run sql-client
```
We should see the welcome screen of the CLI client:
{{< img src="/fig/real-time-data-lake-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
Then do the following steps in Flink SQL CLI:
1. Enable checkpoints every 3 seconds
Checkpoint is disabled by default, we need to enable it to commit Iceberg transactions.
Besides, the mysql-cdc source waits for a complete checkpoint before the binlog reading phase starts, to avoid out-of-order binlog records.
```sql
-- Flink SQL
Flink SQL> SET execution.checkpointing.interval = 3s;
```
2. Create MySQL sharding source table
Create a source table that captures the data from the logical sharding table `user`. Here, we use regex to match all the physical tables.
Besides, the table defines metadata columns to identify which database/table a record comes from.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE user_source (
database_name STRING METADATA VIRTUAL,
table_name STRING METADATA VIRTUAL,
`id` DECIMAL(20, 0) NOT NULL,
name STRING,
address STRING,
phone_number STRING,
email STRING,
PRIMARY KEY (`id`) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = 'mysql',
'port' = '3306',
'username' = 'root',
'password' = '123456',
'database-name' = 'db_[0-9]+',
'table-name' = 'user_[0-9]+'
);
```
3. Create Iceberg sink table
Create a sink table `all_users_sink` used to load data to Iceberg.
We define `database_name`, `table_name` and `id` as a combined primary key, because `id` may not be unique across different databases and tables.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE all_users_sink (
database_name STRING,
table_name STRING,
`id` DECIMAL(20, 0) NOT NULL,
name STRING,
address STRING,
phone_number STRING,
email STRING,
PRIMARY KEY (database_name, table_name, `id`) NOT ENFORCED
) WITH (
'connector'='iceberg',
'catalog-name'='iceberg_catalog',
'catalog-type'='hadoop',
'warehouse'='file:///tmp/iceberg/warehouse',
'format-version'='2'
);
```
## Streaming to Iceberg
1. Write data from MySQL to Iceberg in streaming mode using the following Flink SQL:
```sql
-- Flink SQL
Flink SQL> INSERT INTO all_users_sink select * from user_source;
```
It will start a streaming job which will synchronize historical and incremental data from MySQL to Iceberg continuously.
The running job can be found in [Flink UI](http://localhost:8081/#/job/running), and it looks like:
{{< img src="/fig/real-time-data-lake-tutorial/flink-cdc-iceberg-running-job.png" alt="CDC to Iceberg Running Job" >}}
Then, we can use the following command to see the files written to Iceberg:
```shell
docker-compose exec sql-client tree /tmp/iceberg/warehouse/default_database/
```
It should look like:
{{< img src="/fig/real-time-data-lake-tutorial/files-in-iceberg.png" alt="Files in Iceberg" >}}
The actual files may differ in your environment, but the structure of the directory should be similar.
2. Use the following Flink SQL to query the data written to `all_users_sink`:
```sql
-- Flink SQL
Flink SQL> SELECT * FROM all_users_sink;
```
We can see the data queried in the Flink SQL CLI:
{{< img src="/fig/real-time-data-lake-tutorial/data_in_iceberg.png" alt="Data in Iceberg" >}}
3. Make some changes in the MySQL databases, and then the data in Iceberg table `all_users_sink` will also change in real time.
(3.1) Insert a new user in table `db_1.user_1`
```sql
--- db_1
INSERT INTO db_1.user_1 VALUES (111,"user_111","Shanghai","123567891234","user_111@foo.com");
```
(3.2) Update a user in table `db_1.user_2`
```sql
--- db_1
UPDATE db_1.user_2 SET address='Beijing' WHERE id=120;
```
(3.3) Delete a user in table `db_2.user_2`
```sql
--- db_2
DELETE FROM db_2.user_2 WHERE id=220;
```
After executing each step, we can query the table `all_users_sink` using `SELECT * FROM all_users_sink` in Flink SQL CLI to see the changes.
The final query result is as follows:
{{< img src="/fig/real-time-data-lake-tutorial/final-data-in-iceberg.png" alt="Final Data in Iceberg" >}}
From the latest result in Iceberg, we can see that there is a new record of `(db_1, user_1, 111)`, and the address of `(db_1, user_2, 120)` has been updated to `Beijing`.
Besides, the record of `(db_2, user_2, 220)` has been deleted. The result exactly matches the changes we made in MySQL.
## Clean up
After finishing the tutorial, run the following command in the directory of `docker-compose.yml` to stop all containers:
```shell
docker-compose down
```
{{< top >}}

@ -0,0 +1,163 @@
---
title: "Db2 Tutorial"
weight: 8
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/db2-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Demo: Db2 CDC to Elasticsearch
**1. Create `docker-compose.yml` file using following contents:**
```
version: '2.1'
services:
db2:
image: ruanhang/db2-cdc-demo:v1
privileged: true
ports:
- 50000:50000
environment:
- LICENSE=accept
- DB2INSTANCE=db2inst1
- DB2INST1_PASSWORD=admin
- DBNAME=testdb
- ARCHIVE_LOGS=true
elasticsearch:
image: elastic/elasticsearch:7.6.0
environment:
- cluster.name=docker-cluster
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
- discovery.type=single-node
ports:
- "9200:9200"
- "9300:9300"
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
kibana:
image: elastic/kibana:7.6.0
ports:
- "5601:5601"
volumes:
- /var/run/docker.sock:/var/run/docker.sock
```
The Docker Compose environment consists of the following containers:
- Db2: db2 server and a pre-populated `products` table in the database `testdb`.
- Elasticsearch: store the result of the `products` table.
- Kibana: mainly used to visualize the data in Elasticsearch
To start all containers, run the following command in the directory that contains the docker-compose.yml file.
```shell
docker-compose up -d
```
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode.
Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally.
Don't forget to run the following command to stop all containers after you finish the tutorial:
```shell
docker-compose down
```
**2. Download following JAR package to `<FLINK_HOME>/lib`**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- flink-sql-connector-db2-cdc-3.0-SNAPSHOT.jar
**3. Launch a Flink cluster and start a Flink SQL CLI**
Execute following SQL statements in the Flink SQL CLI:
```sql
-- Flink SQL
-- checkpoint every 3000 milliseconds
Flink SQL> SET execution.checkpointing.interval = 3s;
Flink SQL> CREATE TABLE products (
ID INT NOT NULL,
NAME STRING,
DESCRIPTION STRING,
WEIGHT DECIMAL(10,3),
PRIMARY KEY (ID) NOT ENFORCED
) WITH (
'connector' = 'db2-cdc',
'hostname' = 'localhost',
'port' = '50000',
'username' = 'db2inst1',
'password' = 'admin',
'database-name' = 'testdb',
'schema-name' = 'DB2INST1',
'table-name' = 'PRODUCTS'
);
Flink SQL> CREATE TABLE es_products (
ID INT NOT NULL,
NAME STRING,
DESCRIPTION STRING,
WEIGHT DECIMAL(10,3),
PRIMARY KEY (ID) NOT ENFORCED
) WITH (
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'enriched_products_1'
);
Flink SQL> INSERT INTO es_products SELECT * FROM products;
```
**4. Check result in Elasticsearch**
Check that the data has been written to Elasticsearch successfully; you can visit [Kibana](http://localhost:5601/) to see the data.
**5. Make changes in Db2 and watch result in Elasticsearch**
Enter Db2's container to make some changes in Db2; the result in Elasticsearch will change after each SQL statement is executed:
```shell
docker exec -it ${containerId} /bin/bash
su db2inst1
db2 connect to testdb
# enter db2 and execute sqls
db2
```
```sql
UPDATE DB2INST1.PRODUCTS SET DESCRIPTION='18oz carpenter hammer' WHERE ID=106;
INSERT INTO DB2INST1.PRODUCTS VALUES (default,'jacket','water resistent white wind breaker',0.2);
INSERT INTO DB2INST1.PRODUCTS VALUES (default,'scooter','Big 2-wheel scooter ',5.18);
DELETE FROM DB2INST1.PRODUCTS WHERE ID=111;
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "MongoDB Tutorial"
weight: 1
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,9 +24,9 @@ specific language governing permissions and limitations
under the License.
-->
# 演示: MongoDB CDC 导入 Elasticsearch
# Demo: MongoDB CDC to Elasticsearch
1. 下载 `docker-compose.yml`
1. Create `docker-compose.yml` file using following contents:
```
version: '2.1'
@ -55,20 +62,20 @@ services:
- "5601:5601"
```
2. 进入 MongoDB 容器,初始化副本集和数据:
2. Enter Mongodb's container and initialize replica set and data:
```
docker-compose exec mongo /usr/bin/mongo -u mongouser -p mongopw
```
```javascript
// 1. 初始化副本集
// 1. initialize replica set
rs.initiate();
rs.status();
// 2. 切换数据库
// 2. switch database
use mgdb;
// 3. 初始化数据
// 3. initialize data
db.orders.insertMany([
{
order_id: 101,
@ -124,21 +131,21 @@ db.customers.insertMany([
]);
```
3. 下载以下 jar 包到 `<FLINK_HOME>/lib/`:
3. Download following JAR package to `<FLINK_HOME>/lib/`:
```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译```
```Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself. ```
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-mongodb-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mongodb-cdc/2.5-SNAPSHOT/flink-sql-connector-mongodb-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-mongodb-cdc/3.0-SNAPSHOT/flink-sql-connector-mongodb-cdc-3.0-SNAPSHOT.jar)
4. 然后启动 Flink 集群,再启动 SQL CLI.
4. Launch a Flink cluster, then start a Flink SQL CLI and execute following SQL statements inside:
```sql
-- Flink SQL
-- 设置间隔时间为3秒
-- checkpoint every 3000 milliseconds
Flink SQL> SET execution.checkpointing.interval = 3s;
-- 设置本地时区为 Asia/Shanghai
-- set local time zone as Asia/Shanghai
Flink SQL> SET table.local-time-zone = Asia/Shanghai;
Flink SQL> CREATE TABLE orders (
@ -203,7 +210,7 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN customers AS c ON o.customer_id = c.customer_id;
```
5. 修改 MongoDB 里面的数据,观察 elasticsearch 里的结果。
5. Make some changes in MongoDB, then check the result in Elasticsearch:
```javascript
db.orders.insert({
@ -233,3 +240,5 @@ db.orders.deleteOne(
{ order_id : 104 }
);
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Mysql & Postgres Tutorial"
weight: 2
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mysql-postgres-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,23 +24,26 @@ specific language governing permissions and limitations
under the License.
-->
# 基于 Flink CDC 构建 MySQL 和 Postgres 的 Streaming ETL
# Streaming ETL for MySQL and Postgres with Flink CDC
这篇教程将展示如何基于 Flink CDC 快速构建 MySQL 和 Postgres 的流式 ETL。本教程的演示都将在 Flink SQL CLI 中进行,只涉及 SQL无需一行 Java/Scala 代码,也无需安装 IDE。
This tutorial is to show how to quickly build streaming ETL for MySQL and Postgres with Flink CDC.
假设我们正在经营电子商务业务,商品和订单的数据存储在 MySQL 中,订单对应的物流信息存储在 Postgres 中。
对于订单表,为了方便进行分析,我们希望让它关联上其对应的商品和物流信息,构成一张宽表,并且实时把它写到 ElasticSearch 中。
Assume we are running an e-commerce business. The product and order data is stored in MySQL, and the shipment data related to the orders is stored in Postgres.
We want to enrich the orders using the product and shipment tables, and then load the enriched orders into Elasticsearch in real time.
接下来的内容将介绍如何使用 Flink Mysql/Postgres CDC 来实现这个需求,系统的整体架构如下图所示:
![Flink CDC Streaming ETL](/_static/fig/mysql-postgress-tutorial/flink-cdc-streaming-etl.png "Flink CDC Streaming ETL")
In the following sections, we will describe how to use Flink Mysql/Postgres CDC to implement it.
All exercises in this tutorial are performed in the Flink SQL CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation.
## 准备阶段
准备一台已经安装了 Docker 的 Linux 或者 MacOS 电脑。
The overview of the architecture is as follows:
{{< img src="/fig/mysql-postgres-tutorial/flink-cdc-streaming-etl.png" width="700px" alt="Flink CDC Streaming ETL" >}}
### 准备教程所需要的组件
接下来的教程将以 `docker-compose` 的方式准备所需要的组件。
## Preparation
Prepare a Linux or MacOS computer with Docker installed.
使用下面的内容创建一个 `docker-compose.yml` 文件:
### Starting components required
The components required in this demo are all managed in containers, so we will use `docker-compose` to start them.
Create `docker-compose.yml` file using following contents:
```
version: '2.1'
services:
@ -75,34 +85,35 @@ services:
ports:
- "5601:5601"
```
该 Docker Compose 中包含的容器有:
- MySQL: 商品表 `products` 和 订单表 `orders` 将存储在该数据库中, 这两张表将和 Postgres 数据库中的物流表 `shipments`进行关联,得到一张包含更多信息的订单表 `enriched_orders`
- Postgres: 物流表 `shipments` 将存储在该数据库中
- Elasticsearch: 最终的订单表 `enriched_orders` 将写到 Elasticsearch
- Kibana: 用来可视化 ElasticSearch 的数据
The Docker Compose environment consists of the following containers:
- MySQL: the `products` and `orders` tables are stored in this database. They will be joined with data in Postgres to enrich the orders.
- Postgres: the `shipments` table is stored in this database.
- Elasticsearch: mainly used as a data sink to store enriched orders.
- Kibana: used to visualize the data in Elasticsearch.
`docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件:
To start all containers, run the following command in the directory that contains the `docker-compose.yml` file.
```shell
docker-compose up -d
```
该命令将以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。你可以通过 docker ps 来观察上述的容器是否正常启动了,也可以通过访问 [http://localhost:5601/](http://localhost:5601/) 来查看 Kibana 是否运行正常。
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly.
We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kibana is running normally.
### 下载 Flink 和所需要的依赖包
1. 下载 [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) 并将其解压至目录 `flink-1.18.0`
2. 下载下面列出的依赖包,并将它们放到目录 `flink-1.18.0/lib/` 下:
### Preparing Flink and JAR package required
1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0`
2. Download following JAR package required and put them under `flink-1.18.0/lib/`:
**下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译**
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- flink-sql-connector-mysql-cdc-2.5-SNAPSHOT.jar
- flink-sql-connector-postgres-cdc-2.5-SNAPSHOT.jar
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- flink-sql-connector-postgres-cdc-3.0-SNAPSHOT.jar
### 准备数据
#### 在 MySQL 数据库中准备数据
1. 进入 MySQL 容器
### Preparing data in databases
#### Preparing data in MySQL
1. Enter mysql's container:
```shell
docker-compose exec mysql mysql -uroot -p123456
```
2. 创建数据库和表 `products``orders`,并插入数据
2. Create tables and populate data:
```sql
-- MySQL
CREATE DATABASE mydb;
@ -139,12 +150,12 @@ docker-compose up -d
(default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false),
(default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false);
```
#### 在 Postgres 数据库中准备数据
1. 进入 Postgres 容器
#### Preparing data in Postgres
1. Enter postgres's container:
```shell
docker-compose exec postgres psql -h localhost -U postgres
```
2. 创建表 `shipments`,并插入数据
2. Create tables and populate data
```sql
-- PG
CREATE TABLE shipments (
@ -162,37 +173,37 @@ docker-compose up -d
(default,10003,'Shanghai','Hangzhou',false);
```
## 启动 Flink 集群和 Flink SQL CLI
## Starting Flink cluster and Flink SQL CLI
1. 使用下面的命令跳转至 Flink 目录下
1. Use the following command to change to the Flink directory:
```
cd flink-1.18.0
```
2. 使用下面的命令启动 Flink 集群
2. Use the following command to start a Flink cluster:
```shell
./bin/start-cluster.sh
```
启动成功的话,可以在 [http://localhost:8081/](http://localhost:8081/) 访问到 Flink Web UI如下所示
Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like:
![Flink UI](/_static/fig/mysql-postgress-tutorial/flink-ui.png "Flink UI")
{{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" width="700px" alt="Flink UI" >}}
3. 使用下面的命令启动 Flink SQL CLI
3. Use the following command to start a Flink SQL CLI:
```shell
./bin/sql-client.sh
```
启动成功后,可以看到如下的页面:
We should see the welcome screen of the CLI client.
![Flink SQL_Client](/_static/fig/mysql-postgress-tutorial/flink-sql-client.png "Flink SQL Client")
{{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
## 在 Flink SQL CLI 中使用 Flink DDL 创建表
首先,开启 checkpoint每隔3秒做一次 checkpoint
## Creating tables using Flink DDL in Flink SQL CLI
First, enable checkpoints every 3 seconds
```sql
-- Flink SQL
Flink SQL> SET execution.checkpointing.interval = 3s;
```
然后, 对于数据库中的表 `products`, `orders`, `shipments` 使用 Flink SQL CLI 创建对应的表,用于同步这些底层数据库表的数据
Then, create tables that capture the change data from the corresponding database tables.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE products (
@ -248,7 +259,7 @@ Flink SQL> CREATE TABLE shipments (
);
```
最后,创建 `enriched_orders` 表, 用来将关联后的订单数据写入 Elasticsearch 中
Finally, create the `enriched_orders` table, which is used to load data to Elasticsearch.
```sql
-- Flink SQL
Flink SQL> CREATE TABLE enriched_orders (
@ -272,8 +283,8 @@ Flink SQL> CREATE TABLE enriched_orders (
);
```
## 关联订单数据并且将其写入 Elasticsearch 中
使用 Flink SQL 将订单表 `order` 与 商品表 `products`,物流信息表 `shipments` 关联,并将关联后的订单信息写入 Elasticsearch 中
## Enriching orders and loading to Elasticsearch
Use Flink SQL to join the `orders` table with the `products` and `shipments` tables to enrich the orders and write the result to Elasticsearch.
```sql
-- Flink SQL
Flink SQL> INSERT INTO enriched_orders
@ -282,55 +293,54 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN products AS p ON o.product_id = p.id
LEFT JOIN shipments AS s ON o.order_id = s.order_id;
```
现在,就可以在 Kibana 中看到包含商品和物流信息的订单数据。
首先访问 [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) 创建 index pattern `enriched_orders`.
Now, the enriched orders should be shown in Kibana.
Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`.
![Create Index Pattern](/_static/fig/mysql-postgress-tutorial/kibana-create-index-pattern.png "Create Index Pattern")
{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" width="700px" alt="Create Index Pattern" >}}
然后就可以在 [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) 看到写入的数据了.
Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders.
![Find enriched Orders](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders.png "Find enriched Orders")
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" width="700px" alt="Find enriched Orders" >}}
接下来,修改 MySQL 和 Postgres 数据库中表的数据Kibana中显示的订单数据也将实时更新
1. 在 MySQL 的 `orders` 表中插入一条数据
Next, make some changes in the databases; the enriched orders shown in Kibana will be updated after each step in real time.
1. Insert a new order in MySQL
```sql
--MySQL
INSERT INTO orders
VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false);
```
2. 在 Postgres 的 `shipment` 表中插入一条数据
2. Insert a shipment in Postgres
```sql
--PG
INSERT INTO shipments
VALUES (default,10004,'Shanghai','Beijing',false);
```
3. 在 MySQL 的 `orders` 表中更新订单的状态
3. Update the order status in MySQL
```sql
--MySQL
UPDATE orders SET order_status = true WHERE order_id = 10004;
```
4. 在 Postgres 的 `shipment` 表中更新物流的状态
4. Update the shipment status in Postgres
```sql
--PG
UPDATE shipments SET is_arrived = true WHERE shipment_id = 1004;
```
5. 在 MYSQL 的 `orders` 表中删除一条数据
5. Delete the order in MySQL
```sql
--MySQL
DELETE FROM orders WHERE order_id = 10004;
```
每执行一步就刷新一次 Kibana可以看到 Kibana 中显示的订单数据将实时更新,如下所示:
![Enriched Orders Changes](/_static/fig/mysql-postgress-tutorial/kibana-detailed-orders-changes.gif "Enriched Orders Changes")
The changes of enriched orders in Kibana are as follows:
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" width="700px" alt="Enriched Orders Changes" >}}
## 环境清理
本教程结束后,在 `docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器:
## Clean up
After finishing the tutorial, run the following command to stop all containers in the directory of `docker-compose.yml`:
```shell
docker-compose down
```
在 Flink 所在目录 `flink-1.18.0` 下执行如下命令停止 Flink 集群:
Run the following command to stop the Flink cluster in the directory of Flink `flink-1.18.0`:
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "OceanBase Tutorial"
weight: 3
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/oceanbase-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,21 +24,20 @@ specific language governing permissions and limitations
under the License.
-->
# 演示: OceanBase CDC 导入 Elasticsearch
# Demo: OceanBase CDC to ElasticSearch
## 视频教程
## Video tutorial
- [YouTube](https://www.youtube.com/watch?v=ODGE-73Dntg&t=2s)
- [Bilibili](https://www.bilibili.com/video/BV1Zg411a7ZB/?spm_id_from=333.999.0.0)
### Preparation
### 准备教程所需要的组件
#### Configure and start the components
#### 配置并启动容器
Create `docker-compose.yml`.
配置 `docker-compose.yml`
*注意*:本示例需要使用`host`网络,所以只能在 Linux 系统上运行,更多信息见 [network-tutorial-host](https://docs.docker.com/network/network-tutorial-host/)。
*Note*: `host` network mode is required in this demo, so it can only work on Linux, see [network-tutorial-host](https://docs.docker.com/network/network-tutorial-host/).
```yaml
version: '2.1'
@ -77,36 +83,40 @@ services:
- '/var/run/docker.sock:/var/run/docker.sock'
```
`docker-compose.yml` 所在目录下执行下面的命令来启动本教程需要的组件:
Execute the following command in the directory where `docker-compose.yml` is located.
```shell
docker-compose up -d
```
### 设置密码
### Set password
Since OceanBase CE 4.0.0.0, the commit log can only be fetched for non-sys tenants.
OceanBase 从社区版 4.0.0.0 开始只支持对非 sys 租户的增量数据拉取,这里我们使用 test 租户的 root 用户作为示例。
Here we use the root user of the 'test' tenant as an example.
登陆 test 租户的 root 用户:
Login with 'root' user of 'test' tenant:
```shell
docker-compose exec observer obclient -h127.0.0.1 -P2881 -uroot@test
```
设置密码:
Set a password:
```mysql
ALTER USER root IDENTIFIED BY 'test';
```
### 准备数据
### Create data for reading snapshot
使用 'root@test' 用户登陆。
Log in as the 'root' user of the 'test' tenant:
```shell
docker-compose exec observer obclient -h127.0.0.1 -P2881 -uroot@test -ptest
```
Insert data:
```sql
CREATE DATABASE ob;
USE ob;
@ -144,23 +154,23 @@ VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false),
(default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false);
```
### 下载所需要的依赖包
### Download the libraries required
```下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译```
```Download links are only available for stable releases.```
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oceanbase-cdc/2.5-SNAPSHOT/flink-sql-connector-oceanbase-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oceanbase-cdc/3.0-SNAPSHOT/flink-sql-connector-oceanbase-cdc-3.0-SNAPSHOT.jar)
### 在 Flink SQL CLI 中使用 Flink DDL 创建表
### Use Flink DDL to create dynamic table in Flink SQL CLI
```sql
-- 设置间隔时间为3秒
-- checkpoint every 3000 milliseconds
Flink SQL> SET execution.checkpointing.interval = 3s;
-- 设置本地时区为 Asia/Shanghai
-- set local time zone as Asia/Shanghai
Flink SQL> SET table.local-time-zone = Asia/Shanghai;
-- 创建订单表
-- create orders table
Flink SQL> CREATE TABLE orders (
order_id INT,
order_date TIMESTAMP(0),
@ -185,7 +195,7 @@ Flink SQL> CREATE TABLE orders (
'working-mode' = 'memory'
);
-- 创建商品表
-- create products table
Flink SQL> CREATE TABLE products (
id INT,
name STRING,
@ -207,7 +217,7 @@ Flink SQL> CREATE TABLE products (
'working-mode' = 'memory'
);
-- 创建关联后的订单数据表
-- create flat table enriched_orders
Flink SQL> CREATE TABLE enriched_orders (
order_id INT,
order_date TIMESTAMP(0),
@ -223,7 +233,7 @@ Flink SQL> CREATE TABLE enriched_orders (
'hosts' = 'http://localhost:9200',
'index' = 'enriched_orders');
-- 执行读取和写入
-- Start the reading and writing job
Flink SQL> INSERT INTO enriched_orders
SELECT o.order_id,
o.order_date,
@ -237,13 +247,13 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN products AS p ON o.product_id = p.id;
```
### 在 Kibana 中查看数据
### Check data on Kibana
访问 [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) 创建 index pattern `enriched_orders`,之后可以在 [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) 看到写入的数据了。
Open [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) and create index pattern `enriched_orders`, then go to [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover), and you will see the data of `enriched_orders`.
### 修改监听表数据,查看增量数据变动
### Check data changes
在OceanBase中依次执行如下修改操作每执行一步就刷新一次 Kibana可以看到 Kibana 中显示的订单数据将实时更新。
Execute the following SQL statements in OceanBase under the `ob` database; you will see the records in Kibana updated after each step in real time.
```sql
INSERT INTO orders VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false);
@ -251,16 +261,18 @@ UPDATE orders SET order_status = true WHERE order_id = 10004;
DELETE FROM orders WHERE order_id = 10004;
```
### 环境清理
### Clean up
`docker-compose.yml` 文件所在的目录下执行如下命令停止所有容器:
Execute the following command to stop all containers in the directory where `docker-compose.yml` is located.
```shell
docker-compose down
```
进入Flink的部署目录停止 Flink 集群:
Stop the Flink cluster with the following command:
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "Oracle Tutorial"
weight: 4
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/oracle-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,9 +24,9 @@ specific language governing permissions and limitations
under the License.
-->
# 演示: Oracle CDC 导入 Elasticsearch
# Demo: Oracle CDC to Elasticsearch
**创建`docker-compose.yml`文件,内容如下所示:**
**Create `docker-compose.yml` file using following contents:**
```
version: '2.1'
@ -52,35 +59,34 @@ services:
volumes:
- /var/run/docker.sock:/var/run/docker.sock
```
该 Docker Compose 中包含的容器有:
- Oracle: Oracle 19c 数据库
- Elasticsearch: `orders` 表将和 `products` 表进行joinjoin的结果写入Elasticsearch中
- Kibana: 可视化 Elasticsearch 中的数据
The Docker Compose environment consists of the following containers:
- Oracle: Oracle 19c database.
- Elasticsearch: store the join result of the `orders` and `products` table.
- Kibana: mainly used to visualize the data in Elasticsearch
在 docker-compose.yml 所在目录下运行如下命令以启动所有容器:
To start all containers, run the following command in the directory that contains the docker-compose.yml file.
```shell
docker-compose up -d
```
该命令会以 detached 模式自动启动 Docker Compose 配置中定义的所有容器。
你可以通过 docker ps 来观察上述的容器是否正常启动了。 也可以访问 http://localhost:5601/ 来查看 Kibana 是否运行正常。
另外可以通过如下命令停止所有的容器:
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode.
Run docker ps to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally.
Don't forget to run the following command to stop all containers after you finish the tutorial:
```shell
docker-compose down
````
```
**下载以下 jar 包到 `<FLINK_HOME>/lib/`:**
**Download following JAR package to `<FLINK_HOME>/lib`**
*下载链接只对已发布的版本有效, SNAPSHOT 版本需要本地基于 master 或 release- 分支编译*
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release-branches by yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar](https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-oracle-cdc/2.5-SNAPSHOT/flink-sql-connector-oracle-cdc-2.5-SNAPSHOT.jar)
- [flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-oracle-cdc/3.0-SNAPSHOT/flink-sql-connector-oracle-cdc-3.0-SNAPSHOT.jar)
**在 Oracle 数据库中准备数据**
创建数据库和表 `products``orders`,并插入数据:
**Preparing data in Oracle database**
Create the tables `products` and `orders` in the Oracle database and populate data:
```shell
docker-compose exec oracle sqlplus debezium/dbz@localhost:1521/ORCLCDB
```
@ -141,7 +147,9 @@ INSERT INTO DEBEZIUM.ORDERS VALUES (1003, TO_TIMESTAMP('2020-07-30 12:00:30.0010
INSERT INTO DEBEZIUM.ORDERS VALUES (1004, TO_TIMESTAMP('2020-07-30 15:22:00.001000', 'YYYY-MM-DD HH24:MI:SS.FF'), 'Jark', 1, 104);
```
**然后启动 Flink 集群,再启动 SQL CLI:**
**Launch a Flink cluster and start a Flink SQL CLI**
Execute following SQL statements in the Flink SQL CLI:
```sql
-- Flink SQL
@ -202,13 +210,13 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN products AS p ON o.PRODUCT_ID = p.ID;
```
**检查 ElasticSearch 中的结果**
**Check result in Elasticsearch**
检查最终的结果是否写入ElasticSearch中, 可以在[Kibana](http://localhost:5601/)看到ElasticSearch中的数据
Check that the data has been written to Elasticsearch successfully; you can visit [Kibana](http://localhost:5601/) to see the data.
**在 Oracle 制造一些变更,观察 ElasticSearch 中的结果**
**Make changes in Oracle and watch result in Elasticsearch**
进入Oracle容器中并通过如下的SQL语句对Oracle数据库进行一些修改, 然后就可以看到每执行一条SQL语句Elasticsearch中的数据都会实时更新。
Enter Oracle's container to make some changes in Oracle; the result in Elasticsearch will change after each SQL statement is executed:
```shell
docker-compose exec oracle sqlplus debezium/dbz@localhost:1521/ORCLCDB
@ -221,3 +229,5 @@ UPDATE DEBEZIUM.ORDERS SET QUANTITY = 10 WHERE ID = 1002;
DELETE FROM DEBEZIUM.ORDERS WHERE ID = 1004;
```
{{< top >}}

@ -0,0 +1,289 @@
---
title: "PolarDB-X Tutorial"
weight: 5
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/mongodb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Demo: PolarDB-X CDC to Elasticsearch
This tutorial is to show how to quickly build streaming ETL for PolarDB-X with Flink CDC.
Assuming we are running an e-commerce business. The product and order data stored in PolarDB-X.
We want to enrich the orders using the product table, and then load the enriched orders to ElasticSearch in real time.
In the following sections, we will describe how to use Flink PolarDB-X CDC to implement it.
All exercises in this tutorial are performed in the Flink SQL CLI, and the entire process uses standard SQL syntax, without a single line of Java/Scala code or IDE installation.
## Preparation
Prepare a Linux or MacOS computer with Docker installed.
### Starting components required
The components required in this demo are all managed in containers, so we will use `docker-compose` to start them.
Create `docker-compose.yml` file using following contents:
```
version: '2.1'
services:
polardbx:
image: polardbx/polardb-x:2.0.1
container_name: polardbx
ports:
- "8527:8527"
elasticsearch:
image: 'elastic/elasticsearch:7.6.0'
container_name: elasticsearch
environment:
- cluster.name=docker-cluster
- bootstrap.memory_lock=true
- ES_JAVA_OPTS=-Xms512m -Xmx512m
- discovery.type=single-node
ports:
- '9200:9200'
- '9300:9300'
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
kibana:
image: 'elastic/kibana:7.6.0'
container_name: kibana
ports:
- '5601:5601'
volumes:
- '/var/run/docker.sock:/var/run/docker.sock'
```
The Docker Compose environment consists of the following containers:
- PolarDB-X: the `products` and `orders` tables are stored in this database. They will be joined to enrich the orders.
- Elasticsearch: mainly used as a data sink to store enriched orders.
- Kibana: used to visualize the data in Elasticsearch.
To start all containers, run the following command in the directory that contains the `docker-compose.yml` file.
```shell
docker-compose up -d
```
This command automatically starts all the containers defined in the Docker Compose configuration in a detached mode. Run docker ps to check whether these containers are running properly.
We can also visit [http://localhost:5601/](http://localhost:5601/) to see if Kibana is running normally.
### Preparing Flink and JAR package required
1. Download [Flink 1.18.0](https://archive.apache.org/dist/flink/flink-1.18.0/flink-1.18.0-bin-scala_2.12.tgz) and unzip it to the directory `flink-1.18.0`
2. Download following JAR package required and put them under `flink-1.18.0/lib/`:
**Download links are available only for stable releases, SNAPSHOT dependencies need to be built based on master or release branches by yourself.**
- flink-sql-connector-mysql-cdc-3.0-SNAPSHOT.jar
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
### Preparing data in databases
#### Preparing data in PolarDB-X
1. Enter PolarDB-X Database:
```shell
mysql -h127.0.0.1 -P8527 -upolardbx_root -p"123456"
```
2. Create tables and populate data:
```sql
-- PolarDB-X
CREATE TABLE products (
id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(255) NOT NULL,
description VARCHAR(512)
) AUTO_INCREMENT = 101;
INSERT INTO products
VALUES (default,"scooter","Small 2-wheel scooter"),
(default,"car battery","12V car battery"),
(default,"12-pack drill bits","12-pack of drill bits with sizes ranging from #40 to #3"),
(default,"hammer","12oz carpenter's hammer"),
(default,"hammer","14oz carpenter's hammer"),
(default,"hammer","16oz carpenter's hammer"),
(default,"rocks","box of assorted rocks"),
(default,"jacket","water resistent black wind breaker"),
(default,"spare tire","24 inch spare tire");
CREATE TABLE orders (
order_id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
order_date DATETIME NOT NULL,
customer_name VARCHAR(255) NOT NULL,
price DECIMAL(10, 5) NOT NULL,
product_id INTEGER NOT NULL,
order_status BOOLEAN NOT NULL -- Whether order has been placed
) AUTO_INCREMENT = 10001;
INSERT INTO orders
VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false),
(default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false),
(default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false);
```
## Starting Flink cluster and Flink SQL CLI
1. Use the following command to change to the Flink directory:
```
cd flink-1.18.0
```
2. Use the following command to start a Flink cluster:
```shell
./bin/start-cluster.sh
```
Then we can visit [http://localhost:8081/](http://localhost:8081/) to see if Flink is running normally, and the web page looks like:
{{< img src="/fig/mysql-postgres-tutorial/flink-ui.png" alt="Flink UI" >}}
3. Use the following command to start a Flink SQL CLI:
```shell
./bin/sql-client.sh
```
We should see the welcome screen of the CLI client.
{{< img src="/fig/mysql-postgres-tutorial/flink-sql-client.png" alt="Flink SQL Client" >}}
## Creating tables using Flink DDL in Flink SQL CLI
First, enable checkpoints every 3 seconds
```sql
-- Flink SQL
Flink SQL> SET execution.checkpointing.interval = 3s;
```
Then, create tables that capture the change data from the corresponding database tables.
```sql
-- Flink SQL
Flink SQL> SET execution.checkpointing.interval = 3s;
-- create source table 1 - orders
Flink SQL> CREATE TABLE orders (
order_id INT,
order_date TIMESTAMP(0),
customer_name STRING,
price DECIMAL(10, 5),
product_id INT,
order_status BOOLEAN,
PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = '127.0.0.1',
'port' = '8527',
'username' = 'polardbx_root',
'password' = '123456',
'database-name' = 'mydb',
'table-name' = 'orders'
);
-- create source table 2 - products
CREATE TABLE products (
id INT,
name STRING,
description STRING,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = '127.0.0.1',
'port' = '8527',
'username' = 'polardbx_root',
'password' = '123456',
'database-name' = 'mydb',
'table-name' = 'products'
);
```
Finally, create `enriched_orders` table that is used to load data to the Elasticsearch.
```sql
-- Flink SQL
-- create sink table - enrich_orders
Flink SQL> CREATE TABLE enriched_orders (
order_id INT,
order_date TIMESTAMP(0),
customer_name STRING,
price DECIMAL(10, 5),
product_id INT,
order_status BOOLEAN,
product_name STRING,
product_description STRING,
PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
'connector' = 'elasticsearch-7',
'hosts' = 'http://localhost:9200',
'index' = 'enriched_orders'
);
```
## Enriching orders and loading to Elasticsearch
Use Flink SQL to join the `orders` table with the `products` table to enrich the orders and write the result to Elasticsearch.
```sql
-- Flink SQL
Flink SQL> INSERT INTO enriched_orders
SELECT o.order_id,
o.order_date,
o.customer_name,
o.price,
o.product_id,
o.order_status,
p.name,
p.description
FROM orders AS o
LEFT JOIN products AS p ON o.product_id = p.id;
```
Now, the enriched orders should be shown in Kibana.
Visit [http://localhost:5601/app/kibana#/management/kibana/index_pattern](http://localhost:5601/app/kibana#/management/kibana/index_pattern) to create an index pattern `enriched_orders`.
{{< img src="/fig/mysql-postgres-tutorial/kibana-create-index-pattern.png" alt="Create Index Pattern" >}}
Visit [http://localhost:5601/app/kibana#/discover](http://localhost:5601/app/kibana#/discover) to find the enriched orders.
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders.png" alt="Find enriched Orders" >}}
Next, make some changes in the database; the enriched orders shown in Kibana will be updated in real time after each step.
1. Insert a new order in PolarDB-X
```sql
--PolarDB-X
INSERT INTO orders
VALUES (default, '2020-07-30 15:22:00', 'Jark', 29.71, 104, false);
```
2. Update the order status in PolarDB-X
```sql
--PolarDB-X
UPDATE orders SET order_status = true WHERE order_id = 10004;
```
3. Delete the order in PolarDB-X
```sql
--PolarDB-X
DELETE FROM orders WHERE order_id = 10004;
```
The changes to the enriched orders in Kibana are as follows:
{{< img src="/fig/mysql-postgres-tutorial/kibana-detailed-orders-changes.gif" alt="Enriched Orders Changes" >}}
## Clean up
After finishing the tutorial, run the following command in the directory that contains `docker-compose.yml` to stop all containers:
```shell
docker-compose down
```
Run the following command in the Flink directory `flink-1.18.0` to stop the Flink cluster:
```shell
./bin/stop-cluster.sh
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "SqlServer Tutorial"
weight: 6
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/sqlserver-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,9 +24,9 @@ specific language governing permissions and limitations
under the License.
-->
# Demo: SqlServer CDC to Elasticsearch
**Create a `docker-compose.yml` file with the following contents:**
```
version: '2.1'
@ -60,35 +67,35 @@ services:
volumes:
- /var/run/docker.sock:/var/run/docker.sock
```
The Docker Compose environment consists of the following containers:
- SqlServer: SqlServer database.
- Elasticsearch: stores the join result of the `orders` and `products` tables.
- Kibana: mainly used to visualize the data in Elasticsearch.
To start all containers, run the following command in the directory that contains the docker-compose.yml file:
```shell
docker-compose up -d
```
This command automatically starts all the containers defined in the Docker Compose configuration in detached mode.
Run `docker ps` to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally.
Don't forget to run the following command to stop and remove all containers after you finish the tutorial:
```shell
docker-compose down
```
**Download the following JAR packages to `<FLINK_HOME>/lib/`:**
**Download links are only available for stable releases; SNAPSHOT dependencies need to be built from the master or release branches yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-sqlserver-cdc/3.0-SNAPSHOT/flink-sql-connector-sqlserver-cdc-3.0-SNAPSHOT.jar)
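One convenient way to put the released connector JAR into the Flink distribution is `wget`, as in the sketch below. It assumes `FLINK_HOME` points at your Flink installation and uses the release link listed above; the SNAPSHOT connector still has to be built and copied manually.
```shell
# Sketch: download the released Elasticsearch connector straight into Flink's
# lib directory. FLINK_HOME is assumed to point at your Flink installation.
wget -P "$FLINK_HOME/lib" \
  https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar
```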
**Preparing data in SqlServer database**
Create databases/tables and populate them with data:
```sql
-- Sqlserver
@ -143,7 +150,7 @@ docker-compose down
EXEC sys.sp_cdc_enable_table @source_schema = 'dbo', @source_name = 'orders', @role_name = NULL, @supports_net_changes = 0;
GO
```
**Launch a Flink cluster and start a Flink SQL CLI:**
```sql
-- Flink SQL
@ -202,13 +209,13 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN products AS p ON o.product_id = p.id;
```
**Check result in Elasticsearch**
To check that the data has been written to Elasticsearch successfully, visit [Kibana](http://localhost:5601/) to see the data.
**Make changes in SqlServer and watch result in Elasticsearch**
Make some changes in the database; the enriched orders shown in Kibana will be updated in real time after each step.
```sql
INSERT INTO orders(order_date,purchaser,quantity,product_id) VALUES ('22-FEB-2016', 1006, 22, 107);
@ -220,3 +227,5 @@ GO
DELETE FROM orders WHERE id = 10004;
GO
```
{{< top >}}

@ -1,3 +1,10 @@
---
title: "TiDB Tutorial"
weight: 7
type: docs
aliases:
- /try-flink-cdc/cdc-connectors/tidb-tutorial.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@ -17,14 +24,14 @@ specific language governing permissions and limitations
under the License.
-->
# Demo: TiDB CDC to Elasticsearch
**First, we will start a TiDB cluster with Docker.**
```shell
$ git clone https://github.com/pingcap/tidb-docker-compose.git
```
**Next, replace the `docker-compose.yml` file in the `tidb-docker-compose` directory with the following contents:**
```
version: "2.1"
@ -111,37 +118,37 @@ services:
- /var/run/docker.sock:/var/run/docker.sock
```
The Docker Compose environment consists of the following containers:
- TiDB cluster: tikv, pd, tidb.
- Elasticsearch: stores the join result of the `orders` and `products` tables.
- Kibana: mainly used to visualize the data in Elasticsearch.
Add host mappings for `pd` and `tikv` pointing to `127.0.0.1` in your local `hosts` file, as sketched below.
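One possible way to add these entries on Linux or macOS is shown below; this is only a sketch and assumes `/etc/hosts` and sudo access.
```shell
# Sketch: map the pd and tikv service names to localhost so the TiDB CDC
# connector can reach them from outside the Docker network.
echo "127.0.0.1 pd"   | sudo tee -a /etc/hosts
echo "127.0.0.1 tikv" | sudo tee -a /etc/hosts
```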
To start all containers, run the following command in the directory that contains the docker-compose.yml file:
```shell
docker-compose up -d
mysql -h 127.0.0.1 -P 4000 -u root # Optional: verify that the TiDB cluster is ready (requires a local MySQL client).
```
This command automatically starts all the containers defined in the Docker Compose configuration in detached mode.
Run `docker ps` to check whether these containers are running properly. You can also visit http://localhost:5601/ to see if Kibana is running normally.
Don't forget to run the following command to stop and remove all containers after you finish the tutorial:
```shell
docker-compose down
```
**Download the following JAR packages to `<FLINK_HOME>/lib/`:**
**Download links are only available for stable releases; SNAPSHOT dependencies need to be built from the master or release branches yourself.**
- [flink-sql-connector-elasticsearch7-3.0.1-1.17.jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7/3.0.1-1.17/flink-sql-connector-elasticsearch7-3.0.1-1.17.jar)
- [flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar](https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-tidb-cdc/3.0-SNAPSHOT/flink-sql-connector-tidb-cdc-3.0-SNAPSHOT.jar)
**Preparing data in TiDB database**
Create databases/tables and populate them with data:
```sql
-- TiDB
@ -178,7 +185,7 @@ VALUES (default, '2020-07-30 10:08:22', 'Jark', 50.50, 102, false),
(default, '2020-07-30 10:11:09', 'Sally', 15.00, 105, false),
(default, '2020-07-30 12:00:30', 'Edward', 25.25, 106, false);
```
**Launch a Flink cluster and start a Flink SQL CLI:**
```sql
-- Flink SQL
@ -234,13 +241,13 @@ Flink SQL> INSERT INTO enriched_orders
LEFT JOIN products AS p ON o.product_id = p.id;
```
**Check result in Elasticsearch**
To check that the data has been written to Elasticsearch successfully, visit [Kibana](http://localhost:5601/) to see the data.
**Make changes in TiDB and watch result in Elasticsearch**
Make some changes in the database; the enriched orders shown in Kibana will be updated in real time after each step.
```sql
INSERT INTO orders
@ -251,3 +258,4 @@ UPDATE orders SET order_status = true WHERE order_id = 10004;
DELETE FROM orders WHERE order_id = 10004;
```
{{< top >}}

@ -0,0 +1,25 @@
---
title: Pipeline Connectors
bookCollapseSection: true
weight: 1
aliases:
- /try-flink-cdc/pipeline-connectors/
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
