# misc/monitor-all.sh -- post-patch content of hunk "@@ -26,67 +26,116 @@",
# reformatted and corrected.
#
# NOTE(review): the hunk's leading `done` context line closes a loop that
# begins before the visible range; it is not reproduced here.
#
# Writes the watchdog script to /usr/local/bin/ping-instances.sh and makes sure
# its log file exists. A quoted here-document is used instead of one huge
# single-quoted `echo`, so the generated script is written verbatim and no
# '\'' escaping is needed (a missed escape previously corrupted a `date` call).
cat <<'PING_INSTANCES_EOF' >/usr/local/bin/ping-instances.sh
#!/usr/bin/env bash
#
# Pings every Proxmox container (pct) and virtual machine (qm) that is set to
# start on boot and (re)starts the ones that do not answer.
#
# Usage: ping-instances.sh [VMID ...]
#   Every VMID given on the command line is excluded from monitoring.

# Maximum number of restarts
maxrestartcount=3

# Emails for restarts
mailaddress='admin@domain.tld'
mail="false"

# Search string for IP address, here more precise selection, IP addresses can be excluded.
# Treated as an extended regex matched against the START of each address.
searchip='192\.168\.|10\.'

# Per-instance restart counters live here rather than under predictable names
# in world-writable /tmp.
statedir=/run/ping-instances
mkdir -p "$statedir"

# Read excluded instances from command line arguments
excluded_instances=("$@")

# Print the current time in the log timestamp format.
timestamp() {
  date +'%Y-%m-%d %H:%M:%S'
}

# Write one timestamped line to stdout (redirected to the log in the main loop).
log() {
  printf '%s: %s\n' "$(timestamp)" "$*"
}

# Mail message $1 about instance name $2, but only when mail="true".
send_mail() {
  [[ "$mail" == "true" ]] || return 0
  printf '%s\n' "$1" | mail -s "$(timestamp): $(hostname) - $2" "$mailaddress"
}

# Return 0 when VMID $1 was passed on the command line as excluded.
is_excluded() {
  local candidate
  for candidate in "${excluded_instances[@]}"; do
    [[ "$candidate" == "$1" ]] && return 0
  done
  return 1
}

log "Excluded instances: ${excluded_instances[*]}"

while true; do
  # Both listings start with a header row; keep only rows whose first field is
  # a numeric VMID.
  mapfile -t vmids < <({ pct list; qm list; } | awk '$1 ~ /^[0-9]+$/ {print $1}')

  for vmid in "${vmids[@]}"; do
    # Skip excluded instances
    if is_excluded "$vmid"; then
      log "Skipping $vmid because it is excluded"
      continue
    fi

    # Determine the type of the instance (container or virtual machine)
    if pct status "$vmid" >/dev/null 2>&1; then
      kind="CT"
      tool="pct"
    else
      kind="VM"
      tool="qm"
    fi

    # Read the configuration once and pick the needed keys out of it.
    # Containers call the name field "hostname:", virtual machines "name:".
    config=$("$tool" config "$vmid")
    name=$(awk '$1 == "name:" || $1 == "hostname:" {print $2; exit}' <<<"$config")
    onboot=$(awk '$1 == "onboot:" {print $2; exit}' <<<"$config")
    template=$(awk '$1 == "template:" {print $2; exit}' <<<"$config")

    # Skip instances based on onboot and templates
    if [[ "$onboot" != "1" ]]; then
      log "Skipping $vmid $name because it is set not to boot"
      continue
    fi
    if [[ "$template" == "1" ]]; then
      log "Skipping $vmid $name because it is a template"
      continue
    fi

    running="false"
    if "$tool" status "$vmid" | grep -q "status: running"; then
      running="true"
    fi

    # An address can only be looked up while the instance is running.
    IP=
    if [[ "$running" == "true" ]]; then
      if [[ "$kind" == "CT" ]]; then
        IP=$(pct exec "$vmid" ip a s dev eth0 \
          | awk '/inet / {print $2}' | cut -d/ -f1 | head -n 1)
      else
        IP=$(qm guest cmd "$vmid" network-get-interfaces \
          | grep -E -o "([0-9]{1,3}\.){3}[0-9]{1,3}" \
          | grep -E "^($searchip)" | head -n 1)
      fi
    fi

    # Load the restart counter; a missing or corrupt file counts as 0.
    countfile="$statedir/$vmid.count"
    count=0
    if [[ -f "$countfile" ]]; then
      count=$(<"$countfile")
      [[ "$count" =~ ^[0-9]+$ ]] || count=0
    fi

    # Ping the instance
    if [[ -n "$IP" ]] && ping -c 1 "$IP" >/dev/null 2>&1; then
      log "CT/VM $vmid $name with ip $IP is pingable..."
      echo "0" >"$countfile"
      continue
    fi

    # Give up once the instance has been restarted maxrestartcount times
    # without becoming pingable in between.
    if ((count >= maxrestartcount)); then
      log "$kind $vmid $name max restart count $count reached"
      send_mail "$kind $vmid $name max restart count $count reached" "$name"
      continue
    fi

    # Persist the counter for containers and virtual machines alike.
    count=$((count + 1))
    echo "$count" >"$countfile"

    # If the instance can not be pinged, stop and start it
    if [[ "$running" == "true" ]]; then
      log "$kind $vmid $name is not responding, restarting..."
      send_mail "$kind $vmid $name is not responding, restarting" "$name"
      "$tool" stop "$vmid" >/dev/null 2>&1
      sleep 5
    else
      log "$kind $vmid $name is not running, starting..."
    fi
    "$tool" start "$vmid" >/dev/null 2>&1
  done

  # Wait for 5 minutes. (Edit to your needs)
  log "Pausing for 5 minutes..."
  sleep 300
done >/var/log/ping-instances.log 2>&1
PING_INSTANCES_EOF
touch /var/log/ping-instances.log