Update monitor-all.sh

Bugs have been fixed, e.g. when no IP address was found for an instance, the IP address of the previously checked VM was reused.
The parameters for "onboot" were not recognized correctly.
Furthermore, the IP search string has been moved into a variable.
The output has been extended by the name of the PCT/VM.
A maximum of 3 restarts is preset, but this can be adjusted using a variable. The original script restarted instances without any limit.
Emails can now also be sent for restarts.
pull/2074/head
FrodoVDR 2 years ago committed by GitHub
parent 7f6521e3c7
commit 4c93763fa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 125
      misc/monitor-all.sh

@ -26,67 +26,116 @@ while true; do
done
# Install the watchdog script.
# A quoted here-document is used instead of echo '...': the script body itself
# contains single quotes (awk programs, date formats, regexes), which would
# terminate a single-quoted echo string and corrupt the generated file.
cat <<'EOF' >/usr/local/bin/ping-instances.sh
#!/usr/bin/env bash
#
# ping-instances.sh - ping every Proxmox container (pct) and virtual machine
# (qm) and restart those that do not answer.
#
# Usage: ping-instances.sh [VMID ...]
#   Every VMID given on the command line is excluded from monitoring.
#
# Instances that are templates or do not have "onboot: 1" are skipped.
# All output goes to /var/log/ping-instances.log.

# Maximum number of restarts
maxrestartcount=3
# Emails for restarts
mailaddress='admin@domain.tld'
mail="false"
# Search string for IP address, here more precise selection, IP addresses can be excluded.
searchip='192\.168\.|10\.'
# Directory holding one restart counter file per instance. Kept out of the
# world-writable /tmp because this script runs as root.
statedir=/run/ping-instances
# Read excluded instances from command line arguments
excluded_instances=("$@")

# Print the current time in the format used for every log line.
timestamp() {
  date +'%Y-%m-%d %H:%M:%S'
}

# Write a timestamped message to stdout (redirected to the log file below).
log() {
  printf '%s: %s\n' "$(timestamp)" "$*"
}

# Send $1 by email if mail notifications are enabled.
# Globals: mail, mailaddress, NAME (read)
notify() {
  if [ "$mail" == "true" ]; then
    printf '%s\n' "$1" | mail -s "$(timestamp): $(hostname) - $NAME" "$mailaddress"
  fi
}

# Return 0 if VMID $1 was passed on the command line as excluded.
is_excluded() {
  local id
  for id in "${excluded_instances[@]}"; do
    if [ "$id" == "$1" ]; then
      return 0
    fi
  done
  return 1
}

mkdir -p "$statedir"

while true; do
  log "Excluded instances: ${excluded_instances[*]}"
  for vmid in $(pct list | awk '{if(NR>1) print $1}'; qm list | awk '{if(NR>1) print $1}'); do
    # Skip excluded instances
    if is_excluded "$vmid"; then
      log "Skipping $vmid because it is excluded"
      continue
    fi

    # Reset per-instance state so nothing leaks over from the previous one.
    IP=
    skip="false"
    count=0
    count_file="$statedir/$vmid.count"
    if [ -f "$count_file" ]; then
      count=$(cat "$count_file")
    fi
    # A missing or corrupt counter file counts as "no restarts yet".
    if ! [[ "$count" =~ ^[0-9]+$ ]]; then
      count=0
    fi

    # Determine the type of the instance (container or virtual machine)
    if pct status "$vmid" >/dev/null 2>&1; then
      # It is a container
      kind="CT"
      config_cmd=(pct config)
      if pct status "$vmid" | grep -q "status: running"; then
        # head -n 1: eth0 may carry several addresses; ping needs exactly one.
        IP=$(pct exec "$vmid" ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1 | head -n 1)
      fi
    else
      # It is a virtual machine
      kind="VM"
      config_cmd=(qm config)
      if qm status "$vmid" | grep -q "status: running"; then
        IP=$(qm guest cmd "$vmid" network-get-interfaces | grep -E -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "$searchip" | head -n 1)
      fi
    fi

    # Fetch the configuration once and match whole keys, so that e.g. a
    # description mentioning "onboot" cannot be mistaken for the setting.
    config=$("${config_cmd[@]}" "$vmid")
    NAME=$(awk '$1 == "name:" || $1 == "hostname:" {print $2; exit}' <<<"$config")
    onboot=$(awk '$1 == "onboot:" {print $2; exit}' <<<"$config")
    template=$(awk '$1 == "template:" {print $2; exit}' <<<"$config")

    # Skip instances based on onboot and templates
    if [ "$onboot" != "1" ]; then
      log "Skipping $vmid $NAME because it is set not to boot"
      skip="true"
    fi
    if [ "$template" == "1" ]; then
      log "Skipping $vmid $NAME because it is a template"
      skip="true"
    fi
    if [ "$skip" == "true" ]; then
      continue
    fi

    # Ping the instance. An empty IP (instance stopped, or no address found)
    # is treated the same as a failed ping.
    if [ -n "$IP" ] && ping -c 1 "$IP" >/dev/null 2>&1; then
      log "CT/VM $vmid $NAME with ip $IP is pingable..."
      echo "0" >"$count_file"
      continue
    fi

    # -ge (not -le on the pre-increment value): exactly maxrestartcount
    # restarts are attempted before giving up.
    if [ "$count" -ge "$maxrestartcount" ]; then
      log "$kind $vmid $NAME max restart count $count reached"
      notify "$kind $vmid $NAME max restart count $count reached"
      continue
    fi

    # Persist the counter for containers and virtual machines alike, before
    # the restart, so the limit holds even if the restart itself hangs.
    count=$((count + 1))
    echo "$count" >"$count_file"

    # If the instance can not be pinged, stop and start it
    if [ "$kind" == "CT" ]; then
      log "CT $vmid $NAME is not responding, restarting..."
      notify "CT $vmid $NAME is not responding, restarting"
      pct stop "$vmid" >/dev/null 2>&1
      sleep 5
      pct start "$vmid" >/dev/null 2>&1
    else
      if qm status "$vmid" | grep -q "status: running"; then
        log "VM $vmid $NAME is not responding, restarting..."
        notify "VM $vmid $NAME is not responding, restarting"
        qm stop "$vmid" >/dev/null 2>&1
        sleep 5
      else
        log "VM $vmid $NAME is not running, starting..."
      fi
      qm start "$vmid" >/dev/null 2>&1
    fi
  done
  # Wait for 5 minutes. (Edit to your needs)
  log "Pausing for 5 minutes..."
  sleep 300
done >/var/log/ping-instances.log 2>&1
EOF
touch /var/log/ping-instances.log

Loading…
Cancel
Save