#!/usr/bin/env bash
################################################################################
# Copyright 2023 Ververica Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

#
# This file contains a watchdog tool to monitor a task and potentially kill it after
# not producing any output for $MAX_NO_OUTPUT seconds.
#
# The file only defines functions and settings; the entry point is
# run_with_watchdog.
#
# Environment variables read:
#   MAX_NO_OUTPUT                          seconds of silence before the task is killed
#   SLEEP_TIME                             seconds between two checks
#   WATCHDOG_ADDITIONAL_MONITORING_FILES   optional extra files (paths and/or glob
#                                          patterns) whose modification also counts
#                                          as "output"
#

# Number of seconds w/o output before printing a stack trace and killing the watched process
MAX_NO_OUTPUT=${MAX_NO_OUTPUT:-900}

# Number of seconds to sleep before checking the output again
SLEEP_TIME=${SLEEP_TIME:-20}

# Internal fields
CMD_OUT="/tmp/watchdog.out"
CMD_PID="/tmp/watchdog.pid"
CMD_EXIT="/tmp/watchdog.exit"

# =============================================
# Utility functions
# =============================================

# Prints the largest of the given integers, followed by a newline.
# Empty arguments are ignored; with no usable argument "0" is printed.
# Arguments: $@ - integers to compare
max_of() {
  local max number
  max="${1:-0}"
  for number in "${@:2}"; do
    # Skip empty values; they would be a syntax error inside (( )).
    [[ -n "$number" ]] || continue
    if ((number > max)); then
      max="$number"
    fi
  done
  printf '%d\n' "$max"
}

# Prints the highest modification time (seconds since the epoch) out of $CMD_OUT
# (which is the command output file) and any file(s) matched by
# $WATCHDOG_ADDITIONAL_MONITORING_FILES.
mod_time() {
  local cmd_out_mod_time additional_files_mod_times
  cmd_out_mod_time=$(stat -c "%Y" "$CMD_OUT")
  # Intentionally unquoted: the variable may hold several paths and/or glob
  # patterns. Errors (variable unset, no file matched) are deliberately ignored.
  # shellcheck disable=SC2086
  additional_files_mod_times=$(stat -c "%Y" ${WATCHDOG_ADDITIONAL_MONITORING_FILES:-} 2>/dev/null)
  # Intentionally unquoted: stat prints one timestamp per line, each of which
  # must become its own argument (and empty results must vanish).
  # shellcheck disable=SC2086
  max_of $cmd_out_mod_time $additional_files_mod_times
}

# Prints the current time in seconds since the epoch.
the_time() {
  date +%s
}

# Watchdog process. Intended to be run in the background (it calls `exit`).
# Every $SLEEP_TIME seconds it compares the current time with mod_time; once the
# difference reaches $MAX_NO_OUTPUT it runs $CALLBACK_ON_TIMEOUT, kills the
# process whose PID is stored in $CMD_PID together with its direct children,
# and exits with status 1.
watchdog() {
  local sleep_pid='' time_diff watched_pid

  # The sleep runs as a background job that we `wait` for, so that a TERM sent
  # to the watchdog is handled immediately and the sleep is stopped as well.
  # Otherwise the sleep would be orphaned and keep the inherited stdout/stderr
  # open for up to $SLEEP_TIME seconds after the watchdog is gone.
  trap '[[ -n "$sleep_pid" ]] && kill "$sleep_pid" 2>/dev/null; exit 0' TERM

  touch "$CMD_OUT"

  while true; do
    sleep "$SLEEP_TIME" &
    sleep_pid=$!
    wait "$sleep_pid"
    sleep_pid=''

    time_diff=$(($(the_time) - $(mod_time)))

    if ((time_diff >= MAX_NO_OUTPUT)); then
      echo "=============================================================================="
      echo "Process produced no output for ${MAX_NO_OUTPUT} seconds."
      echo "=============================================================================="

      # run timeout callback
      # Intentionally unquoted: an empty callback must expand to nothing.
      # shellcheck disable=SC2086
      $CALLBACK_ON_TIMEOUT

      watched_pid=$(<"$CMD_PID")
      echo "Killing process with pid=${watched_pid} and all descendants"
      if [[ -n "$watched_pid" ]]; then
        pkill -P "$watched_pid" # kill descendants
        kill "$watched_pid"     # kill process itself
      fi

      exit 1
    fi
  done
}

# =============================================
# main function
# =============================================

# entrypoint function
# Runs a command under supervision of the watchdog.
# Arguments:
#   $1 - command line to run (word-split by the shell, so no nested quoting)
#   $2 - optional command the watchdog runs before killing a silent process
# Globals:
#   WD_PID (written)    - PID of the background watchdog
#   EXIT_CODE (written) - exit code of the command
# Outputs: the command's stdout (also copied to $CMD_OUT) plus status messages
# Returns: the exit code of the command; 1 if it could not be determined
run_with_watchdog() {
  local cmd="$1"
  local CALLBACK_ON_TIMEOUT="${2:-}"

  watchdog &
  WD_PID=$!
  echo "STARTED watchdog (${WD_PID})."

  echo "RUNNING '${cmd}'."

  # Run $CMD and pipe output to $CMD_OUT for the watchdog. The PID is written to $CMD_PID to
  # allow the watchdog to kill $CMD if it is not producing any output anymore. $CMD_EXIT contains
  # the exit code. This is important for CI build life-cycle (success/failure).
  # shellcheck disable=SC2086
  (
    $cmd &
    PID=$!
    echo "$PID" >&3
    wait "$PID"
    echo $? >&4
  ) 3>"$CMD_PID" 4>"$CMD_EXIT" | tee "$CMD_OUT"

  EXIT_CODE=$(<"$CMD_EXIT")
  # A missing or garbled exit code must not be reported as success: with an
  # empty value, `return` would fall back to the status of the previous command.
  if [[ ! "$EXIT_CODE" =~ ^[0-9]+$ ]]; then
    echo "WARNING: could not determine the exit code; assuming failure." >&2
    EXIT_CODE=1
  fi

  echo "Process exited with EXIT CODE: ${EXIT_CODE}."

  # Make sure to kill the watchdog in any case after $CMD has completed
  echo "Trying to KILL watchdog (${WD_PID})."
  # Best effort: the watchdog has already exited if it hit the timeout.
  kill "$WD_PID" >/dev/null 2>&1 || true

  rm -f -- "$CMD_PID" "$CMD_EXIT"

  return "$EXIT_CODE"
}