[doc] Use sphinx-rtd-theme to manage project documents

pull/294/head
Leonard Xu 4 years ago committed by Leonard Xu
parent d7fd46d0be
commit 7d7ca57e24

.gitignore

@@ -21,3 +21,4 @@ tmp
build-targetatlassian-ide-plugin.xml
*.ipr
*.iws
docs/_build

@@ -0,0 +1,19 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

@@ -0,0 +1,81 @@
#!/bin/bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
set -x
# step-1: install dependencies
apt-get update
apt-get -y install git rsync python3-pip python3-sphinx python3-git python3-sphinx-rtd-theme python3-stemmer python3-virtualenv python3-setuptools
python3 -m pip install myst-parser pygments
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
export REPO_NAME="${GITHUB_REPOSITORY##*/}"
temp_docs_root=`mktemp -d`
# step-2: build sites for all branches (for multi-version docs), excluding 'HEAD' and 'gh-pages'
make -C docs clean
branches="`git for-each-ref '--format=%(refname:lstrip=-1)' refs/remotes/origin/ | grep -viE '^(HEAD|gh-pages)$'`"
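# NOTE: '--format=%(refname:lstrip=-1)' keeps only the short branch name
# (e.g. refs/remotes/origin/master -> master); 'grep -viE' drops the
# symbolic HEAD ref and the generated gh-pages branch from the list.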
for current_branch in ${branches}; do
    export current_branch
    git checkout ${current_branch}

    # skip branches that have no docs
    if [ ! -e 'docs/conf.py' ]; then
        echo -e "\tINFO: Couldn't find 'docs/conf.py' for branch: ${current_branch}, skipping this branch"
        continue
    fi

    echo "INFO: Building sites for branch: ${current_branch}"
    sphinx-build -b html docs/ docs/_build/html/${current_branch}

    # copy the built site to the temp dir
    rsync -av "docs/_build/html/" "${temp_docs_root}/"
done
git checkout master
git config --global user.name "${GITHUB_ACTOR}"
git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com"
# step-3: push build sites to gh-pages branch
pushd "${temp_docs_root}"
git init
git remote add deploy "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
git checkout -b gh-pages
touch .nojekyll
cat > index.html <<EOF
<!DOCTYPE html>
<html>
  <head>
    <title>Flink CDC Connectors</title>
    <meta http-equiv="refresh" content="0; url='/${REPO_NAME}/master/'" />
  </head>
  <body>
    <p>Please wait while you're redirected to our <a href="/${REPO_NAME}/master/">documentation</a>.</p>
  </body>
</html>
EOF
git add .
git commit -m "Generated docs from commit ${GITHUB_SHA}"
git push deploy gh-pages --force
# pop back and exit
popd
exit 0

@@ -0,0 +1,123 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'Flink CDC Connectors'
copyright = '2021, Leonard Xu'
author = 'Leonard Xu'
# The full version, including alpha/beta/rc tags
release = '2.0.0'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx_rtd_theme',
    'sphinx.ext.autodoc',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages',
    'myst_parser'
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# The myst_parser extension registered above provides the Markdown parser
# itself; the deprecated source_parsers mapping (removed in Sphinx 3.0) is
# not needed, declaring the source suffix is sufficient.
source_suffix = ['.md']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# -- Multi-version support ---------------------------------------------------
import sys, os
sys.path.insert(0, os.path.abspath('.'))
try:
    html_context
except NameError:
    html_context = dict()
html_context['display_lower_left'] = True

if 'REPO_NAME' in os.environ:
    REPO_NAME = os.environ['REPO_NAME']
else:
    REPO_NAME = ''

from git import Repo
repo = Repo(search_parent_directories=True)

if 'current_version' in os.environ:
    current_version = os.environ['current_version']
else:
    current_version = repo.active_branch.name

html_context['current_version'] = current_version
html_context['version'] = current_version
html_context['versions'] = list()

versions = [branch.name for branch in repo.branches]
for version in versions:
    html_context['versions'].append((version, '/' + REPO_NAME + '/' + version + '/'))
html_context['display_github'] = True
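# NOTE: sphinx-rtd-theme does not render a version menu by itself; the
# (name, url) pairs collected in html_context['versions'] are assumed to be
# consumed by a template override (e.g. a _templates/versions.html) that
# draws the lower-left version selector.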

@@ -0,0 +1,111 @@
# Flink CDC Connectors
Flink CDC Connectors is a set of source connectors for Apache Flink that ingest changes from different databases using change data capture (CDC).
Flink CDC Connectors integrate Debezium as the engine to capture data changes, so they can fully leverage Debezium's abilities. See more about what [Debezium](https://github.com/debezium/debezium) is.
This README is meant as a brief walkthrough of the core features of Flink CDC Connectors. For fully detailed documentation, please see the [Documentation](https://github.com/ververica/flink-cdc-connectors/wiki).
## Supported (Tested) Connectors
| Database | Version |
| --- | --- |
| MySQL | Database: 5.7, 8.0.x <br/>JDBC Driver: 8.0.16 |
| PostgreSQL | Database: 9.6, 10, 11, 12 <br/>JDBC Driver: 42.2.12|
## Features
1. Supports reading database snapshots and continues to read binlogs with **exactly-once processing**, even when failures happen.
2. CDC connectors for the DataStream API: users can consume changes on multiple databases and tables in a single job without deploying Debezium and Kafka.
3. CDC connectors for the Table/SQL API: users can use SQL DDL to create a CDC source to monitor changes on a single table.
## Usage for Table/SQL API
Several steps are needed to set up a Flink cluster with the provided connectors.
1. Set up a Flink cluster with version 1.12+ and Java 8+ installed.
2. Download the connector SQL jars from the [Download](https://github.com/ververica/flink-cdc-connectors/wiki/Downloads) page (or [build them yourself](#building-from-source)).
3. Put the downloaded jars under `FLINK_HOME/lib/`.
4. Restart the Flink cluster.
The following example shows how to create a MySQL CDC source in the [Flink SQL Client](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/sqlClient.html) and execute queries on it.
```sql
-- create a MySQL CDC table source
CREATE TABLE mysql_binlog (
  id INT NOT NULL,
  name STRING,
  description STRING,
  weight DECIMAL(10,3)
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'inventory',
  'table-name' = 'products'
);

-- read snapshot and binlog data from MySQL, apply some transformations, and show the result in the client
SELECT id, UPPER(name), description, weight FROM mysql_binlog;
```
## Usage for DataStream API
Include the following Maven dependency (available through Maven Central):
```xml
<dependency>
  <groupId>com.alibaba.ververica</groupId>
  <!-- add the dependency matching your database -->
  <artifactId>flink-connector-mysql-cdc</artifactId>
  <version>1.2.0</version>
</dependency>
```
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import com.alibaba.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;

public class MySqlBinlogSourceExample {
  public static void main(String[] args) throws Exception {
    SourceFunction<String> sourceFunction = MySQLSource.<String>builder()
      .hostname("localhost")
      .port(3306)
      .databaseList("inventory") // monitor all tables under the inventory database
      .username("flinkuser")
      .password("flinkpw")
      .deserializer(new StringDebeziumDeserializationSchema()) // converts SourceRecord to String
      .build();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    env
      .addSource(sourceFunction)
      .print().setParallelism(1); // use parallelism 1 for the sink to keep message ordering

    env.execute();
  }
}
```
## Building from source
Prerequisites:
- git
- Maven
- At least Java 8
```bash
git clone https://github.com/ververica/flink-cdc-connectors.git
cd flink-cdc-connectors
mvn clean install -DskipTests
```
The Flink CDC Connectors are now available in your local `.m2` repository.
## License
The code in this repository is licensed under the [Apache Software License 2](https://github.com/ververica/flink-cdc-connectors/blob/master/LICENSE).

@@ -0,0 +1,420 @@
# MySQL CDC Connector
* [Dependencies](#dependencies)
* [Maven dependency](#maven-dependency)
* [SQL Client JAR](#sql-client-jar)
* [Setup MySQL server](#setup-mysql-server)
* [Notes](#notes)
* [How MySQL CDC source works](#how-mysql-cdc-source-works)
* [Grant RELOAD permission for MySQL user](#grant-reload-permission-for-mysql-user)
* [The global read lock](#the-global-read-lock-flush-tables-with-read-lock)
* [Set a different SERVER ID for each job](#set-a-different-server-id-for-each-job)
* [Can't perform checkpoint during scanning database tables](#cant-perform-checkpoint-during-scanning-database-tables)
* [Setting up MySQL session timeouts](#setting-up-mysql-session-timeouts)
* [How to create a MySQL CDC table](#how-to-create-a-mysql-cdc-table)
* [Connector Options](#connector-options)
* [Features](#features)
* [Exactly-Once Processing](#exactly-once-processing)
* [Startup Reading Position](#startup-reading-position)
* [Single Thread Reading](#single-thread-reading)
* [DataStream Source](#datastream-source)
* [Data Type Mapping](#data-type-mapping)
* [FAQ](#faq)
The MySQL CDC connector allows for reading snapshot data and incremental data from a MySQL database. This document describes how to set up the MySQL CDC connector to run SQL queries against MySQL databases.
Dependencies
------------
To set up the MySQL CDC connector, use the dependency information below, either for projects built with a build automation tool (such as Maven or SBT) or for the SQL Client with the bundled SQL JAR.
### Maven dependency
```xml
<dependency>
  <groupId>com.alibaba.ververica</groupId>
  <artifactId>flink-connector-mysql-cdc</artifactId>
  <version>1.1.0</version>
</dependency>
```
### SQL Client JAR
Download [flink-sql-connector-mysql-cdc-1.1.0.jar](https://repo1.maven.org/maven2/com/alibaba/ververica/flink-sql-connector-mysql-cdc/1.1.0/flink-sql-connector-mysql-cdc-1.1.0.jar) and put it under `<FLINK_HOME>/lib/`.
Setup MySQL server
----------------
You have to define a MySQL user with appropriate permissions on all databases that the Debezium MySQL connector monitors.
1. Create the MySQL user:
```sql
mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password';
```
2. Grant the required permissions to the user:
```sql
mysql> GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password';
```
3. Finalize the user's permissions:
```sql
mysql> FLUSH PRIVILEGES;
```
For more details, see the [permission explanation](https://debezium.io/documentation/reference/1.2/connectors/mysql.html#_permissions_explained).
Notes
----------------
### How MySQL CDC source works
When the MySQL CDC source starts, it grabs a global read lock (`FLUSH TABLES WITH READ LOCK`) that blocks writes by other clients. Then it reads the current binlog position and the schema of the monitored databases and tables, and releases the global read lock. After that, it scans the database tables and, once the scan finishes, reads the binlog from the previously recorded position. Flink periodically performs checkpoints to record the binlog position; in case of failover, the job restarts and restores from the checkpointed binlog position, which guarantees exactly-once semantics.
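The locked phase can be sketched as the following statement sequence (simplified and illustrative; the exact statements Debezium issues depend on its version and configuration):

```sql
-- simplified sketch of the snapshot's locked phase
FLUSH TABLES WITH READ LOCK;           -- block writes cluster-wide
SHOW MASTER STATUS;                    -- record the current binlog file and position
SHOW CREATE TABLE inventory.products;  -- read the schema of each monitored table
UNLOCK TABLES;                         -- release the global read lock
-- afterwards: scan the tables, then tail the binlog from the recorded position
SELECT * FROM inventory.products;
```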
### Grant RELOAD permission for MySQL user
If the MySQL user is not granted the RELOAD permission, the MySQL CDC source falls back to table-level locks and performs the snapshot with this method instead, which blocks writes for a longer time.
### The global read lock (FLUSH TABLES WITH READ LOCK)
The global read lock is held while reading the binlog position and schema. This may take seconds, depending on the number of tables. Since the global read lock blocks writes, it may still affect online business. If you want to skip the read lock and can tolerate at-least-once semantics, you can add the `'debezium.snapshot.locking.mode' = 'none'` option to skip the lock.
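For example, the option can be added to the table's `WITH` clause (a sketch; the schema and connection options are illustrative, and only the last option is the relevant addition):

```sql
CREATE TABLE products_no_lock (
  id INT NOT NULL,
  name STRING
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'inventory',
  'table-name' = 'products',
  -- skip the global read lock: snapshot consistency degrades to at-least-once
  'debezium.snapshot.locking.mode' = 'none'
);
```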
### Set a different SERVER ID for each job
Every MySQL database client that reads the binlog should have a unique ID, called a server ID. The MySQL server uses this ID to maintain the network connection and the binlog position. Therefore, if different jobs share the same server ID, reading may start from the wrong binlog position.
Thus, it is recommended to set a different server ID for each job via [SQL Hints](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/sql/hints.html), e.g. `SELECT * FROM source_table /*+ OPTIONS('server-id'='123456') */;`.
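For example, two jobs reading the same source table can be given distinct server IDs like this (the IDs are illustrative):

```sql
-- job 1
SELECT * FROM source_table /*+ OPTIONS('server-id'='5501') */;
-- job 2: same table, different server ID
SELECT * FROM source_table /*+ OPTIONS('server-id'='5502') */;
```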
Tip: By default the server ID is randomly generated when the TaskManager starts up. If a TaskManager fails, it may get a different server ID when it restarts. This shouldn't happen frequently (a job exception doesn't restart the TaskManager), and shouldn't affect the MySQL server too much.
### Can't perform checkpoint during scanning database tables
While the source is scanning tables, there is no recoverable position, so checkpoints cannot be performed. To avoid completing a checkpoint, the MySQL CDC source keeps any pending checkpoint waiting until it times out. A timed-out checkpoint is treated as a failed checkpoint, which by default triggers a failover of the Flink job. So if the database tables are large, it is recommended to add the following Flink configuration to avoid failover caused by timed-out checkpoints:
```yaml
execution.checkpointing.interval: 10min
execution.checkpointing.tolerable-failed-checkpoints: 100
restart-strategy: fixed-delay
restart-strategy.fixed-delay.attempts: 2147483647
```
### Setting up MySQL session timeouts
When an initial consistent snapshot is made for large databases, your established connection could time out while the tables are being read. You can prevent this behavior by configuring `interactive_timeout` and `wait_timeout` in your MySQL configuration file, as the sketch after this list shows:
- `interactive_timeout`: The number of seconds the server waits for activity on an interactive connection before closing it. See [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout).
- `wait_timeout`: The number of seconds the server waits for activity on a noninteractive connection before closing it. See [MySQL documentations](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout).
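A sketch of raising both timeouts from the MySQL client while a large snapshot runs (the values are illustrative; pick ones that exceed your expected snapshot duration):

```sql
-- apply to new connections server-wide (requires the SUPER or SYSTEM_VARIABLES_ADMIN privilege)
SET GLOBAL interactive_timeout = 86400;
SET GLOBAL wait_timeout = 86400;
-- verify the effective values
SHOW VARIABLES LIKE '%_timeout%';
```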
How to create a MySQL CDC table
----------------
A MySQL CDC table can be defined as follows:
```sql
-- register a MySQL table 'orders' in Flink SQL
CREATE TABLE orders (
  order_id INT,
  order_date TIMESTAMP(0),
  customer_name STRING,
  price DECIMAL(10, 5),
  product_id INT,
  order_status BOOLEAN
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'root',
  'password' = '123456',
  'database-name' = 'mydb',
  'table-name' = 'orders'
);

-- read snapshot and binlogs from the orders table
SELECT * FROM orders;
```
Connector Options
----------------
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><h5>connector</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'mysql-cdc'</code>.</td>
</tr>
<tr>
<td><h5>hostname</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the MySQL database server.</td>
</tr>
<tr>
<td><h5>username</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the MySQL user to use when connecting to the MySQL database server.</td>
</tr>
<tr>
<td><h5>password</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the MySQL database server.</td>
</tr>
<tr>
<td><h5>database-name</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the MySQL server to monitor. The database name also supports regular expressions to monitor multiple databases that match the regular expression.</td>
</tr>
<tr>
<td><h5>table-name</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the MySQL database to monitor. The table name also supports regular expressions to monitor multiple tables that match the regular expression.</td>
</tr>
<tr>
<td><h5>port</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">3306</td>
<td>Integer</td>
<td>Integer port number of the MySQL database server.</td>
</tr>
<tr>
<td><h5>server-id</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>A numeric ID of this database client, which must be unique across all currently-running database processes in the MySQL cluster.
This connector joins the MySQL database cluster as another server (with this unique ID) so it can read the binlog.
By default, a random number is generated between 5400 and 6400, though we recommend setting an explicit value.</td>
</tr>
<tr>
<td><h5>scan.startup.mode</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">initial</td>
<td>String</td>
<td>Optional startup mode for the MySQL CDC consumer; valid enumerations are "initial"
and "latest-offset".
Please see the <a href="#startup-reading-position">Startup Reading Position</a> section for more detailed information.</td>
</tr>
<tr>
<td><h5>server-time-zone</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">UTC</td>
<td>String</td>
<td>The session time zone of the database server, e.g. "Asia/Shanghai".
It controls how the TIMESTAMP type in MySQL is converted to STRING.
See more <a href="https://debezium.io/documentation/reference/1.2/connectors/mysql.html#_temporal_values">here</a>.</td>
</tr>
<tr>
<td><h5>debezium.min.row.count.to.stream.results</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">1000</td>
<td>Integer</td>
<td>
During a snapshot operation, the connector will query each included table to produce a read event for all rows in that table. This parameter determines whether the MySQL connection will pull all results for a table into memory (which is fast but requires large amounts of memory), or whether the results will instead be streamed (can be slower, but will work for very large tables). The value specifies the minimum number of rows a table must contain before the connector will stream results, and defaults to 1,000. Set this parameter to '0' to skip all table size checks and always stream all results during a snapshot.</td>
</tr>
<tr>
<td><h5>debezium.snapshot.fetch.size</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>Integer</td>
<td>Specifies the maximum number of rows that should be read in one go from each table while taking a snapshot. The connector will read the table contents in multiple batches of this size.</td>
</tr>
<tr>
<td><h5>debezium.*</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass Debezium's properties through to the Debezium Embedded Engine, which is used to capture data changes from the MySQL server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more in <a href="https://debezium.io/documentation/reference/1.2/connectors/mysql.html#mysql-connector-configuration-properties_debezium">Debezium's MySQL connector properties</a>.</td>
</tr>
</tbody>
</table>
Features
--------
### Exactly-Once Processing
The MySQL CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read binlogs with **exactly-once processing**, even when failures happen. Please read [How the connector performs database snapshots](https://debezium.io/documentation/reference/1.2/connectors/mysql.html#how-the-mysql-connector-performs-database-snapshots_debezium).
### Startup Reading Position
The config option `scan.startup.mode` specifies the startup mode for MySQL CDC consumer. The valid enumerations are:
- `initial` (default): Performs an initial snapshot of the monitored database tables upon first startup, and then continues to read the latest binlog.
- `latest-offset`: Never performs a snapshot of the monitored database tables upon first startup; reads only from the end of the binlog, i.e. only the changes made since the connector was started.

_Note: the `scan.startup.mode` option relies on Debezium's `snapshot.mode` configuration under the hood, so please do not use them together. If you specify both `scan.startup.mode` and `debezium.snapshot.mode` in the table DDL, `scan.startup.mode` may not take effect._
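For example, a source that skips the snapshot and only reads new changes (a sketch based on the `orders` table above):

```sql
CREATE TABLE orders_from_latest (
  order_id INT,
  order_date TIMESTAMP(0),
  customer_name STRING
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'root',
  'password' = '123456',
  'database-name' = 'mydb',
  'table-name' = 'orders',
  -- skip the initial snapshot; read only binlog entries written after startup
  'scan.startup.mode' = 'latest-offset'
);
```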
### Single Thread Reading
The MySQL CDC source cannot read in parallel, because only one task can receive binlog events.
### DataStream Source
The MySQL CDC connector can also be used as a DataStream source. You can create a SourceFunction as shown below:
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import com.alibaba.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;
public class MySqlBinlogSourceExample {
  public static void main(String[] args) throws Exception {
    SourceFunction<String> sourceFunction = MySQLSource.<String>builder()
      .hostname("localhost")
      .port(3306)
      .databaseList("inventory") // monitor all tables under the inventory database
      .username("flinkuser")
      .password("flinkpw")
      .deserializer(new StringDebeziumDeserializationSchema()) // converts SourceRecord to String
      .build();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    env
      .addSource(sourceFunction)
      .print().setParallelism(1); // use parallelism 1 for the sink to keep message ordering

    env.execute();
  }
}
```
Data Type Mapping
----------------
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left">MySQL type<a href="https://dev.mysql.com/doc/man/8.0/en/data-types.html"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td><code>TINYINT</code></td>
<td><code>TINYINT</code></td>
</tr>
<tr>
<td>
<code>SMALLINT</code><br>
<code>TINYINT UNSIGNED</code></td>
<td><code>SMALLINT</code></td>
</tr>
<tr>
<td>
<code>INT</code><br>
<code>MEDIUMINT</code><br>
<code>SMALLINT UNSIGNED</code></td>
<td><code>INT</code></td>
</tr>
<tr>
<td>
<code>BIGINT</code><br>
<code>INT UNSIGNED</code></td>
<td><code>BIGINT</code></td>
</tr>
<tr>
<td><code>BIGINT UNSIGNED</code></td>
<td><code>DECIMAL(20, 0)</code></td>
</tr>
<tr>
<td><code>FLOAT</code></td>
<td><code>FLOAT</code></td>
</tr>
<tr>
<td>
<code>DOUBLE</code><br>
<code>DOUBLE PRECISION</code></td>
<td><code>DOUBLE</code></td>
</tr>
<tr>
<td>
<code>NUMERIC(p, s)</code><br>
<code>DECIMAL(p, s)</code></td>
<td><code>DECIMAL(p, s)</code></td>
</tr>
<tr>
<td>
<code>BOOLEAN</code><br>
<code>TINYINT(1)</code></td>
<td><code>BOOLEAN</code></td>
</tr>
<tr>
<td><code>DATE</code></td>
<td><code>DATE</code></td>
</tr>
<tr>
<td><code>TIME [(p)]</code></td>
<td><code>TIME [(p)] [WITHOUT TIMEZONE]</code></td>
</tr>
<tr>
<td><code>DATETIME [(p)]</code></td>
<td><code>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</code></td>
</tr>
<tr>
<td><code>TIMESTAMP [(p)]</code></td>
<td><code>TIMESTAMP [(p)]</code><br>
<code>TIMESTAMP [(p)] WITH LOCAL TIME ZONE</code>
</td>
</tr>
<tr>
<td>
<code>CHAR(n)</code><br>
<code>VARCHAR(n)</code><br>
<code>TEXT</code></td>
<td><code>STRING</code></td>
</tr>
<tr>
<td>
<code>BINARY</code><br>
<code>VARBINARY</code><br>
<code>BLOB</code></td>
<td><code>BYTES</code></td>
</tr>
</tbody>
</table>
FAQ
--------
### How to skip the snapshot and read only from the binlog?
Please see the [Startup Reading Position](#startup-reading-position) section.
### How to read a shared database that contains multiple tables, e.g. user_00, user_01, ..., user_99?
The `table-name` option supports regular expressions to monitor multiple tables that match the regular expression, so you can set `table-name` to `user_.*` to monitor all tables with the `user_` prefix. The same applies to the `database-name` option. Note that the matched tables should share the same schema.
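For example, a sketch that captures all the sharded tables with one source (the schema and connection options are illustrative):

```sql
CREATE TABLE all_users (
  user_id INT,
  user_name STRING
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'localhost',
  'port' = '3306',
  'username' = 'flinkuser',
  'password' = 'flinkpw',
  'database-name' = 'mydb',
  -- a regular expression matching user_00 ... user_99
  'table-name' = 'user_.*'
);
```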
### ConnectException: Received DML '...' for processing, binlog probably contains events generated with statement or mixed based replication format
If the above exception occurs, please check that `binlog_format` is `ROW`; you can verify this by running `show variables like '%binlog_format%'` in the MySQL client. Please note that even if the `binlog_format` configuration of your database is `ROW`, it can be changed by other sessions, for example via `SET SESSION binlog_format='MIXED'; SET SESSION tx_isolation='REPEATABLE-READ'; COMMIT;`. Please also make sure no other sessions are changing this configuration.
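A quick check from the MySQL client (a sketch):

```sql
-- effective format for the current session
SHOW SESSION VARIABLES LIKE 'binlog_format';  -- expect: ROW
-- server-wide default that new sessions inherit
SHOW GLOBAL VARIABLES LIKE 'binlog_format';   -- expect: ROW
```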

@@ -0,0 +1,300 @@
# Postgres CDC Connector
* [Dependencies](#dependencies)
* [Maven dependency](#maven-dependency)
* [SQL Client JAR](#sql-client-jar)
* [How to create a Postgres CDC table](#how-to-create-a-postgres-cdc-table)
* [Connector Options](#connector-options)
* [Features](#features)
* [Exactly-Once Processing](#exactly-once-processing)
* [Single Thread Reading](#single-thread-reading)
* [DataStream Source](#datastream-source)
* [Data Type Mapping](#data-type-mapping)
The Postgres CDC connector allows for reading snapshot data and incremental data from a PostgreSQL database. This document describes how to set up the Postgres CDC connector to run SQL queries against PostgreSQL databases.
Dependencies
------------
To set up the Postgres CDC connector, use the dependency information below, either for projects built with a build automation tool (such as Maven or SBT) or for the SQL Client with the bundled SQL JAR.
### Maven dependency
```xml
<dependency>
  <groupId>com.alibaba.ververica</groupId>
  <artifactId>flink-connector-postgres-cdc</artifactId>
  <version>1.1.0</version>
</dependency>
```
### SQL Client JAR
Download [flink-sql-connector-postgres-cdc-1.1.0.jar](https://repo1.maven.org/maven2/com/alibaba/ververica/flink-sql-connector-postgres-cdc/1.1.0/flink-sql-connector-postgres-cdc-1.1.0.jar) and put it under `<FLINK_HOME>/lib/`.
How to create a Postgres CDC table
----------------
A Postgres CDC table can be defined as follows:
```sql
-- register a PostgreSQL table 'shipments' in Flink SQL
CREATE TABLE shipments (
  shipment_id INT,
  order_id INT,
  origin STRING,
  destination STRING,
  is_arrived BOOLEAN
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'postgres',
  'password' = 'postgres',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'shipments'
);

-- read snapshot and change events from the shipments table
SELECT * FROM shipments;
```
Connector Options
----------------
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 25%">Option</th>
<th class="text-left" style="width: 8%">Required</th>
<th class="text-left" style="width: 7%">Default</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 50%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><h5>connector</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Specify what connector to use, here should be <code>'postgres-cdc'</code>.</td>
</tr>
<tr>
<td><h5>hostname</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>IP address or hostname of the PostgreSQL database server.</td>
</tr>
<tr>
<td><h5>username</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Name of the PostgreSQL user to use when connecting to the PostgreSQL database server.</td>
</tr>
<tr>
<td><h5>password</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Password to use when connecting to the PostgreSQL database server.</td>
</tr>
<tr>
<td><h5>database-name</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Database name of the PostgreSQL server to monitor.</td>
</tr>
<tr>
<td><h5>schema-name</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Schema name of the PostgreSQL database to monitor.</td>
</tr>
<tr>
<td><h5>table-name</h5></td>
<td>required</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Table name of the PostgreSQL database to monitor.</td>
</tr>
<tr>
<td><h5>port</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">5432</td>
<td>Integer</td>
<td>Integer port number of the PostgreSQL database server.</td>
</tr>
<tr>
<td><h5>decoding.plugin.name</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">decoderbufs</td>
<td>String</td>
<td>The name of the Postgres logical decoding plug-in installed on the server.
Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming, wal2json_rds_streaming and pgoutput.</td>
</tr>
<tr>
<td><h5>slot.name</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">flink</td>
<td>String</td>
<td>The name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in
for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring.
<br/>Slot names must conform to <a href="https://www.postgresql.org/docs/current/static/warm-standby.html#STREAMING-REPLICATION-SLOTS-MANIPULATION">PostgreSQL replication slot naming rules</a>, which state: "Each replication slot has a name, which can contain lower-case letters, numbers, and the underscore character."</td>
</tr>
<tr>
<td><h5>debezium.*</h5></td>
<td>optional</td>
<td style="word-wrap: break-word;">(none)</td>
<td>String</td>
<td>Pass Debezium's properties through to the Debezium Embedded Engine, which is used to capture data changes from the Postgres server.
For example: <code>'debezium.snapshot.mode' = 'never'</code>.
See more in <a href="https://debezium.io/documentation/reference/1.2/connectors/postgresql.html#postgresql-connector-properties">Debezium's Postgres connector properties</a>.</td>
</tr>
</tbody>
</table>
Note: it is recommended to set a different `slot.name` for each table/job to avoid the potential `PSQLException: ERROR: replication slot "flink" is active for PID 974` error. See more [here](https://debezium.io/documentation/reference/1.2/connectors/postgresql.html#postgresql-property-slot-name).
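For example, a second job reading the same table can be given its own slot (a sketch based on the `shipments` table above; the slot name is illustrative):

```sql
CREATE TABLE shipments_job2 (
  shipment_id INT,
  order_id INT,
  is_arrived BOOLEAN
) WITH (
  'connector' = 'postgres-cdc',
  'hostname' = 'localhost',
  'port' = '5432',
  'username' = 'postgres',
  'password' = 'postgres',
  'database-name' = 'postgres',
  'schema-name' = 'public',
  'table-name' = 'shipments',
  -- a dedicated replication slot: lower-case letters, digits and underscores only
  'slot.name' = 'flink_shipments_job2'
);
```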
Features
--------
### Exactly-Once Processing
The Postgres CDC connector is a Flink Source connector which reads the database snapshot first and then continues to read change events with **exactly-once processing**, even when failures happen. Please read [How the connector works](https://debezium.io/documentation/reference/1.2/connectors/postgresql.html#how-the-postgresql-connector-works).
### Single Thread Reading
The Postgres CDC source cannot read in parallel, because only one task can receive change events.
### DataStream Source
The Postgres CDC connector can also be used as a DataStream source. You can create a SourceFunction as shown below:
```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import com.alibaba.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import com.alibaba.ververica.cdc.connectors.postgres.PostgreSQLSource;
public class PostgreSQLSourceExample {
  public static void main(String[] args) throws Exception {
    SourceFunction<String> sourceFunction = PostgreSQLSource.<String>builder()
      .hostname("localhost")
      .port(5432)
      .database("postgres")
      .schemaList("inventory") // monitor all tables under the inventory schema
      .username("flinkuser")
      .password("flinkpw")
      .deserializer(new StringDebeziumDeserializationSchema()) // converts SourceRecord to String
      .build();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    env
      .addSource(sourceFunction)
      .print().setParallelism(1); // use parallelism 1 for the sink to keep message ordering

    env.execute();
  }
}
```
Data Type Mapping
----------------
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left">PostgreSQL type<a href="https://www.postgresql.org/docs/12/datatype.html"></a></th>
<th class="text-left">Flink SQL type<a href="{% link dev/table/types.md %}"></a></th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>SMALLINT</code><br>
<code>INT2</code><br>
<code>SMALLSERIAL</code><br>
<code>SERIAL2</code></td>
<td><code>SMALLINT</code></td>
</tr>
<tr>
<td>
<code>INTEGER</code><br>
<code>SERIAL</code></td>
<td><code>INT</code></td>
</tr>
<tr>
<td>
<code>BIGINT</code><br>
<code>BIGSERIAL</code></td>
<td><code>BIGINT</code></td>
</tr>
<tr>
<td>
<code>REAL</code><br>
<code>FLOAT4</code></td>
<td><code>FLOAT</code></td>
</tr>
<tr>
<td>
<code>FLOAT8</code><br>
<code>DOUBLE PRECISION</code></td>
<td><code>DOUBLE</code></td>
</tr>
<tr>
<td>
<code>NUMERIC(p, s)</code><br>
<code>DECIMAL(p, s)</code></td>
<td><code>DECIMAL(p, s)</code></td>
</tr>
<tr>
<td><code>BOOLEAN</code></td>
<td><code>BOOLEAN</code></td>
</tr>
<tr>
<td><code>DATE</code></td>
<td><code>DATE</code></td>
</tr>
<tr>
<td><code>TIME [(p)] [WITHOUT TIMEZONE]</code></td>
<td><code>TIME [(p)] [WITHOUT TIMEZONE]</code></td>
</tr>
<tr>
<td><code>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</code></td>
<td><code>TIMESTAMP [(p)] [WITHOUT TIMEZONE]</code></td>
</tr>
<tr>
<td>
<code>CHAR(n)</code><br>
<code>CHARACTER(n)</code><br>
<code>VARCHAR(n)</code><br>
<code>CHARACTER VARYING(n)</code><br>
<code>TEXT</code></td>
<td><code>STRING</code></td>
</tr>
<tr>
<td><code>BYTEA</code></td>
<td><code>BYTES</code></td>
</tr>
</tbody>
</table>

@@ -0,0 +1,14 @@
# Welcome to Flink CDC Connectors' documentation!
```{toctree}
:maxdepth: 2
:caption: Contents
content/README
content/mysql-cdc
content/postgres-cdc
```
# Indices and tables
* {ref}`genindex`
* {ref}`search`

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd

@@ -180,6 +180,8 @@ under the License.
<exclude>**/*.iml</exclude>
<!-- Docs -->
<exclude>**/*.md</exclude>
<exclude>docs/Makefile</exclude>
<exclude>docs/make.bat</exclude>
<!-- Tests -->
<exclude>**/*.txt</exclude>
</excludes>
