You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
flink-cdc-connectors/tools/ci/watchdog.sh

128 lines
3.5 KiB
Bash

#!/usr/bin/env bash
################################################################################
# Copyright 2023 Ververica Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
#
# This file contains a watchdog tool to monitor a task and potentially kill it after
# not producing any output for $MAX_NO_OUTPUT seconds.
#
# Maximum number of seconds the watched process may stay silent before the watchdog
# steps in and kills it. Can be overridden through the environment.
: "${MAX_NO_OUTPUT:=900}"
# Polling interval: number of seconds the watchdog sleeps between two checks of the output.
# Can be overridden through the environment.
: "${SLEEP_TIME:=20}"
# Internal bookkeeping files: captured output, PID and exit code of the watched command
CMD_OUT=/tmp/watchdog.out
CMD_PID=/tmp/watchdog.pid
CMD_EXIT=/tmp/watchdog.exit
# =============================================
# Utility functions
# =============================================
# Prints the largest of the integers passed as arguments, followed by a newline.
# Arguments: $1.. - integers to compare
max_of() {
  local best="$1" candidate
  for candidate in "${@:2}"; do
    # Keep whichever value is numerically larger.
    (( candidate > best )) && best="$candidate"
  done
  printf '%d\n' "$best"
}
# Prints the most recent modification time (in seconds since the epoch) among $CMD_OUT
# (the command output file) and the files named by $WATCHDOG_ADDITIONAL_MONITORING_FILES.
#
# Globals:
#   CMD_OUT (read)                              - path of the command output file
#   WATCHDOG_ADDITIONAL_MONITORING_FILES (read) - optional whitespace-separated list of
#                                                 paths and/or glob patterns to monitor too
# Outputs:
#   the highest modification time on stdout
#
# A file that cannot be stat'ed contributes no timestamp.
# NOTE: `stat -c` is the GNU coreutils spelling; BSD/macOS stat uses different options.
mod_time() {
  local cmd_out_mod_time additional_files_mod_times

  cmd_out_mod_time=$(stat -c "%Y" "$CMD_OUT")

  # Deliberately unquoted: the variable holds a list of paths/globs that the shell has to
  # split and expand. Errors (variable unset, no matching file) are expected and silenced.
  # shellcheck disable=SC2086
  additional_files_mod_times=$(stat -c "%Y" $WATCHDOG_ADDITIONAL_MONITORING_FILES 2> /dev/null)

  # Deliberately unquoted: each timestamp must become its own argument and empty results
  # must vanish instead of being handed to max_of as an empty string.
  # shellcheck disable=SC2086
  max_of $cmd_out_mod_time $additional_files_mod_times
}
# Prints the current time as seconds since the epoch.
the_time() {
  printf '%s\n' "$(date +%s)"
}
# Watchdog loop, meant to be started in the background.
#
# Wakes up every $SLEEP_TIME seconds and compares the current time (the_time) with the
# newest modification time of the monitored files (mod_time). Once the gap reaches
# $MAX_NO_OUTPUT seconds it runs $CALLBACK_ON_TIMEOUT, kills the process whose PID is
# stored in the $CMD_PID file together with its child processes, and exits with status 1.
watchdog () {
  local separator="=============================================================================="

  # Make sure the output file exists before its modification time is queried.
  touch $CMD_OUT

  while :; do
    sleep $SLEEP_TIME

    # Seconds elapsed since one of the monitored files was last modified.
    time_diff=$(( $(the_time) - $(mod_time) ))
    # Still within the allowed silence period: go back to sleep.
    [ $time_diff -ge $MAX_NO_OUTPUT ] || continue

    echo "$separator"
    echo "Process produced no output for ${MAX_NO_OUTPUT} seconds."
    echo "$separator"

    # Run the timeout callback; left unquoted so that it is split into command and arguments.
    $CALLBACK_ON_TIMEOUT

    echo "Killing process with pid=$(<$CMD_PID) and all descendants"
    # First the children of the watched process ...
    pkill -P $(<$CMD_PID)
    # ... then the watched process itself.
    kill $(<$CMD_PID)
    exit 1
  done
}
# =============================================
# main function
# =============================================
# entrypoint
# Runs a command under the supervision of the watchdog and reports its exit code.
#
# Globals:
#   CMD_OUT, CMD_PID, CMD_EXIT (read) - files receiving the command's stdout, PID and exit code
#   WD_PID (written)                  - PID of the background watchdog
#   EXIT_CODE (written)               - exit code of the command
# Arguments:
#   $1 - command line to run; it is word-split by the shell before being executed
#   $2 - command line the watchdog executes when the timeout is hit (may be empty)
# Outputs:
#   progress messages and the command's stdout on stdout
# Returns:
#   the exit code of the command, or 1 if no exit code was recorded
run_with_watchdog() {
  local cmd="$1"
  # Not used directly in this function: the watchdog started below reads it.
  local CALLBACK_ON_TIMEOUT="$2"

  watchdog &
  WD_PID=$!
  echo "STARTED watchdog (${WD_PID})."

  echo "RUNNING '${cmd}'."

  # Run $cmd and pipe its stdout to $CMD_OUT for the watchdog. The PID is written to $CMD_PID
  # (fd 3) to allow the watchdog to kill $cmd if it is not producing any output anymore.
  # $CMD_EXIT (fd 4) receives the exit code, which is important for the CI build life-cycle
  # (success/failure) because the pipeline's own status is the one of tee.
  # $cmd is deliberately unquoted so that it is split into the command and its arguments.
  # shellcheck disable=SC2086
  ( $cmd & PID=$! ; echo $PID >&3 ; wait $PID ; echo $? >&4 ) 3>"$CMD_PID" 4>"$CMD_EXIT" | tee "$CMD_OUT"

  EXIT_CODE=$(<"$CMD_EXIT")
  if [[ -z "$EXIT_CODE" ]]; then
    # No exit code was recorded (e.g. the wrapping subshell died before writing it).
    # A bare `return` would hand back the status of the preceding command and thereby
    # report success, so treat this case as a failure explicitly.
    echo "Exit code of the process could not be determined; treating it as a failure."
    EXIT_CODE=1
  fi
  echo "Process exited with EXIT CODE: ${EXIT_CODE}."

  # Make sure to kill the watchdog in any case after $cmd has completed. The watchdog may
  # already be gone, hence the silenced output.
  echo "Trying to KILL watchdog (${WD_PID})."
  kill "$WD_PID" > /dev/null 2>&1

  rm -f -- "$CMD_PID" "$CMD_EXIT"

  return "$EXIT_CODE"
}