Update monitor-all.sh

Sorry about my previous request.
I had completely overlooked the syntax errors in it.
I have now corrected the syntax and tested the script on a test instance. I hope I have not overlooked anything else.

I have extended the script and fixed some bugs. The most important changes:

    The detection of whether an instance is set to start on boot ("onboot") was defective.
    LXC containers and VMs are restarted a maximum of 3 times.
    The detection of the IP address had a bug: the variable was not reset, so if no address was found, the last one determined was reused.
    I have improved the log output.
    Additionally, I added an email notification in case of restarts. Email is disabled by default.
pull/2080/head
FrodoVDR 2 years ago committed by GitHub
parent 6c643e39d1
commit 0f2ff36d14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 125
      misc/monitor-all.sh

@ -26,67 +26,116 @@ while true; do
done
echo '#!/usr/bin/env bash
# Maximum number of restarts
maxrestartcount=3
# Emails for restarts
mailaddress='\''admin@domain.tld'\''
mail="false"
# Search string for IP address, here more precise selection, IP addresses can be excluded.
searchip='\''192\.168\.|10\.'\''
# Read excluded instances from command line arguments
excluded_instances=("$@")
echo "Excluded instances: ${excluded_instances[@]}"
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): Excluded instances: ${excluded_instances[@]}"
while true; do
for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do
# Skip excluded instances
if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then
echo "Skipping $instance because it is excluded"
continue
for vmid in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\'')
do
IP=
skip="false"
if [ -f /tmp/$vmid.count ] ; then
count=$(cat /tmp/$vmid.count)
else
count=0
fi
# Determine the type of the instance (container or virtual machine)
if pct status $instance >/dev/null 2>&1; then
if pct status $vmid >/dev/null 2>&1; then
# It is a container
config_cmd="pct config"
IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1)
test=$(pct status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(pct exec $vmid ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1)
fi
else
# It is a virtual machine
config_cmd="qm config"
IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1)
test=$(qm status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(qm guest cmd $vmid network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "$searchip" | head -n 1)
fi
fi
name=$($config_cmd $vmid | grep name: | awk '\''{ print $2 }'\'')
# Skip instances based on onboot and templates
onboot=$($config_cmd $instance | grep onboot | grep -q "onboot: 0" && echo "true" || echo "false")
template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false")
if [ "$onboot" == "true" ]; then
echo "Skipping $instance because it is set not to boot"
continue
elif [ "$template" == "true" ]; then
echo "Skipping $instance because it is a template"
continue
test=$($config_cmd $vmid | grep "onboot" | awk '\''{ print $2 }'\'')
if [ "$test" == "1" ] ; then
onboot="true"
else
onboot="false"
fi
test=$($config_cmd $vmid | grep "template:" | awk '\''{ print $2 }'\'')
if [ "$test" == "1" ] ; then
template="true"
else
template="false"
fi
# Ping the instance
if ! ping -c 1 $IP >/dev/null 2>&1; then
# If the instance can not be pinged, stop and start it
if pct status $instance >/dev/null 2>&1; then
# It is a container
echo "$(date): CT $instance is not responding, restarting..."
pct stop $instance >/dev/null 2>&1
sleep 5
pct start $instance >/dev/null 2>&1
else
# It is a virtual machine
if qm status $instance | grep -q "status: running"; then
echo "$(date): VM $instance is not responding, restarting..."
qm stop $instance >/dev/null 2>&1
sleep 5
if [ "$onboot" == "false" ]; then
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): Skipping $vmid $name because it is set not to boot"
skip="true"
fi
if [ "$template" == "true" ]; then
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): Skipping $vmid $name because it is a template"
skip="true"
fi
if [ "$skip" == "false" ] ; then
# Ping the instance
if ! ping -c 1 $IP >/dev/null 2>&1; then
if [ $count -le $maxrestartcount ] ; then
count=$((count + 1))
# If the instance can not be pinged, stop and start it
if pct status $vmid >/dev/null 2>&1; then
# It is a container
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): CT $vmid $name is not responding, restarting..."
if [ "$mail" == "true" ] ;then
echo "CT $vmid $name is not responding, restarting" | mail -s "$(date +'\''%Y-%m-%d %H:%M:%S'\''): $(hostname) - $name" $mailaddress
fi
pct stop $vmid >/dev/null 2>&1
sleep 5
pct start $vmid >/dev/null 2>&1
else
# It is a virtual machine
test=$(qm status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): VM $vmid $name is not responding, restarting..."
if [ "$mail" == "true" ] ;then
echo "VM $vmid $name is not responding, restarting" | mail -s "$(date +'\''%Y-%m-%d %H:%M:%S'\''): $(hostname) - $name" $mailaddress
fi
qm stop $vmid >/dev/null 2>&1
sleep 5
else
echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $name is not running, starting..."
fi
qm start $vmid >/dev/null 2>&1
echo "$count" > /tmp/$vmid.count
fi
else
echo "$(date): VM $instance is not running, starting..."
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): VM $vmid $name max restart count $count reached"
if [ "$mail" == "true" ] ;then
echo "VM $vmid $name max restart count $count reached" | mail -s "$(date +'\''%Y-%m-%d %H:%M:%S'\''): $(hostname) - $name" $mailaddress
fi
fi
qm start $instance >/dev/null 2>&1
else
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): CT/VM $vmid $name with ip $IP is pingable..."
echo "0" > /tmp/$vmid.count
fi
fi
done
# Wait for 5 minutes. (Edit to your needs)
echo "$(date): Pausing for 5 minutes..."
echo "$(date +'\''%Y-%m-%d %H:%M:%S'\''): Pausing for 5 minutes..."
sleep 300
done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh
touch /var/log/ping-instances.log

Loading…
Cancel
Save