From 4c93763fa7f2631298f22fb794abcb74c7f4578d Mon Sep 17 00:00:00 2001 From: FrodoVDR Date: Sun, 19 Nov 2023 15:21:11 +0100 Subject: [PATCH] Update monitor-all.sh Bugs have been fixed; for example, the IP address was carried over from the previous CT/VM when none was found for the current one. The "onboot" setting was not evaluated correctly. Furthermore, the IP search string has been moved into a variable. The log output now includes the name of the CT/VM. A maximum of 3 restarts is preset, and this limit can be adjusted using a variable; the original script always restarted. Emails can now also be sent on restarts. --- misc/monitor-all.sh | 125 ++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/misc/monitor-all.sh b/misc/monitor-all.sh index e4251757..a9a2e344 100644 --- a/misc/monitor-all.sh +++ b/misc/monitor-all.sh @@ -26,67 +26,116 @@ while true; do done echo '#!/usr/bin/env bash +# Maximum number of restarts +maxrestartcount=3 + +# Emails for restarts +mailaddress='admin@domain.tld' +mail="false" + +# Search string for IP address, here more precise selection, IP addresses can be excluded. +searchip='192\.168\.|10\.' 
+ # Read excluded instances from command line arguments excluded_instances=("$@") -echo "Excluded instances: ${excluded_instances[@]}" +echo "$(date +'%Y-%m-%d %H:%M:%S'): Excluded instances: ${excluded_instances[@]}" while true; do - for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do - # Skip excluded instances - if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then - echo "Skipping $instance because it is excluded" - continue + for vmid in $(pct list | awk '{if(NR>1) print $1}'; qm list | awk '{if(NR>1) print $1}') + do + IP= + skip="false" + if [ -f /tmp/$vmid.count ] ; then + count=$(cat /tmp/$vmid.count) + else + count=0 fi - # Determine the type of the instance (container or virtual machine) - if pct status $instance >/dev/null 2>&1; then + if pct status $vmid >/dev/null 2>&1; then # It is a container config_cmd="pct config" - IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1) + test=$(pct status $vmid | grep -q "status: running") + if [ $? -eq 0 ] ; then + IP=$(pct exec $vmid ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1) + fi else # It is a virtual machine config_cmd="qm config" - IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1) + test=$(qm status $vmid | grep -q "status: running") + if [ $? 
-eq 0 ] ; then + IP=$(qm guest cmd $vmid network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "$searchip" | head -n 1) + fi fi - + NAME=$($config_cmd $vmid | grep name: | awk '{ print $2 }') # Skip instances based on onboot and templates - onboot=$($config_cmd $instance | grep onboot | grep -q "onboot: 0" && echo "true" || echo "false") - template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false") - - if [ "$onboot" == "true" ]; then - echo "Skipping $instance because it is set not to boot" - continue - elif [ "$template" == "true" ]; then - echo "Skipping $instance because it is a template" - continue + test=$($config_cmd $vmid | grep "onboot" | awk '{ print $2 }') + if [ "$test" == "1" ] ; then + onboot="true" + else + onboot="false" + fi + test=$($config_cmd $vmid | grep "template:" | awk '{ print $2 }') + if [ "$test" == "1" ] ; then + template="true" + else + template="false" fi - # Ping the instance - if ! ping -c 1 $IP >/dev/null 2>&1; then - # If the instance can not be pinged, stop and start it - if pct status $instance >/dev/null 2>&1; then - # It is a container - echo "$(date): CT $instance is not responding, restarting..." - pct stop $instance >/dev/null 2>&1 - sleep 5 - pct start $instance >/dev/null 2>&1 - else - # It is a virtual machine - if qm status $instance | grep -q "status: running"; then - echo "$(date): VM $instance is not responding, restarting..." - qm stop $instance >/dev/null 2>&1 - sleep 5 + if [ "$onboot" == "false" ]; then + echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is set not to boot" + skip="true" + fi + if [ "$template" == "true" ]; then + echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is a template" + skip="true" + fi + if [ "$skip" == "false" ] ; then + # Ping the instance + if ! 
ping -c 1 $IP >/dev/null 2>&1; then + if [ $count -le $maxrestartcount ] ; then + count=$((count + 1)) + # If the instance can not be pinged, stop and start it + if pct status $vmid >/dev/null 2>&1; then + # It is a container + echo "$(date +'%Y-%m-%d %H:%M:%S'): CT $vmid $NAME is not responding, restarting..." + if [ "$mail" == "true" ] ;then + echo "CT $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress + fi + pct stop $vmid >/dev/null 2>&1 + sleep 5 + pct start $vmid >/dev/null 2>&1 + else + # It is a virtual machine + test=$(qm status $vmid | grep -q "status: running") + if [ $? -eq 0 ] ; then + echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not responding, restarting..." + if [ "$mail" == "true" ] ;then + echo "VM $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress + fi + qm stop $vmid >/dev/null 2>&1 + sleep 5 + else + echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not running, starting..." + fi + qm start $vmid >/dev/null 2>&1 + echo "$count" > /tmp/$vmid.count + fi else - echo "$(date): VM $instance is not running, starting..." + echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME max restart count $count reached" + if [ "$mail" == "true" ] ;then + echo "VM $vmid $NAME max restart count $count reached" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress + fi fi - qm start $instance >/dev/null 2>&1 + else + echo "$(date +'%Y-%m-%d %H:%M:%S'): CT/VM $vmid $NAME with ip $IP is pingable..." + echo "0" > /tmp/$vmid.count fi fi done # Wait for 5 minutes. (Edit to your needs) - echo "$(date): Pausing for 5 minutes..." + echo "$(date +'%Y-%m-%d %H:%M:%S'): Pausing for 5 minutes..." sleep 300 done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh touch /var/log/ping-instances.log