Update monitor-all.sh

Several bugs have been fixed; for example, when no IP address was found for an instance, the IP address of the previous VM was reused.
The value of the "onboot" parameter was not evaluated correctly.
Furthermore, the IP search string has been moved into a variable.
The output now also includes the name of the PCT/VM.
A maximum of 3 restarts is preset, but this can be adjusted using a variable. The original script always restarts, with no limit.
Emails can now also be sent for restarts.
pull/2074/head
FrodoVDR 2 years ago committed by GitHub
parent 7f6521e3c7
commit 4c93763fa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 125
      misc/monitor-all.sh

@ -26,67 +26,116 @@ while true; do
done done
echo '#!/usr/bin/env bash echo '#!/usr/bin/env bash
# Maximum number of restarts
maxrestartcount=3
# Emails for restarts
mailaddress='admin@domain.tld'
mail="false"
# Search string for IP address, here more precise selection, IP addresses can be excluded.
searchip='192\.168\.|10\.'
# Read excluded instances from command line arguments # Read excluded instances from command line arguments
excluded_instances=("$@") excluded_instances=("$@")
echo "Excluded instances: ${excluded_instances[@]}" echo "$(date +'%Y-%m-%d %H:%M:%S'): Excluded instances: ${excluded_instances[@]}"
while true; do while true; do
for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do for vmid in $(pct list | awk '{if(NR>1) print $1}'; qm list | awk '{if(NR>1) print $1}')
# Skip excluded instances do
if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then IP=
echo "Skipping $instance because it is excluded" skip="false"
continue if [ -f /tmp/$vmid.count ] ; then
count=$(cat /tmp/$vmid.count)
else
count=0
fi fi
# Determine the type of the instance (container or virtual machine) # Determine the type of the instance (container or virtual machine)
if pct status $instance >/dev/null 2>&1; then if pct status $vmid >/dev/null 2>&1; then
# It is a container # It is a container
config_cmd="pct config" config_cmd="pct config"
IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1) test=$(pct status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(pct exec $vmid ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1)
fi
else else
# It is a virtual machine # It is a virtual machine
config_cmd="qm config" config_cmd="qm config"
IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1) test=$(qm status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(qm guest cmd $vmid network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "$searchip" | head -n 1)
fi
fi fi
NAME=$($config_cmd $vmid | grep name: | awk '{ print $2 }')
# Skip instances based on onboot and templates # Skip instances based on onboot and templates
onboot=$($config_cmd $instance | grep onboot | grep -q "onboot: 0" && echo "true" || echo "false") test=$($config_cmd $vmid | grep "onboot" | awk '{ print $2 }')
template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false") if [ "$test" == "1" ] ; then
onboot="true"
if [ "$onboot" == "true" ]; then else
echo "Skipping $instance because it is set not to boot" onboot="false"
continue fi
elif [ "$template" == "true" ]; then test=$($config_cmd $vmid | grep "template:" | awk '{ print $2 }')
echo "Skipping $instance because it is a template" if [ "$test" == "1" ] ; then
continue template="true"
else
template="false"
fi fi
# Ping the instance if [ "$onboot" == "false" ]; then
if ! ping -c 1 $IP >/dev/null 2>&1; then echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is set not to boot"
# If the instance can not be pinged, stop and start it skip="true"
if pct status $instance >/dev/null 2>&1; then fi
# It is a container if [ "$template" == "true" ]; then
echo "$(date): CT $instance is not responding, restarting..." echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is a template"
pct stop $instance >/dev/null 2>&1 skip="true"
sleep 5 fi
pct start $instance >/dev/null 2>&1 if [ "$skip" == "false" ] ; then
else # Ping the instance
# It is a virtual machine if ! ping -c 1 $IP >/dev/null 2>&1; then
if qm status $instance | grep -q "status: running"; then if [ $count -le $maxrestartcount ] ; then
echo "$(date): VM $instance is not responding, restarting..." count=$((count + 1))
qm stop $instance >/dev/null 2>&1 # If the instance can not be pinged, stop and start it
sleep 5 if pct status $vmid >/dev/null 2>&1; then
# It is a container
echo "$(date +'%Y-%m-%d %H:%M:%S'): CT $vmid $NAME is not responding, restarting..."
if [ "$mail" == "true" ] ;then
echo "CT $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
pct stop $vmid >/dev/null 2>&1
sleep 5
pct start $vmid >/dev/null 2>&1
else
# It is a virtual machine
test=$(qm status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not responding, restarting..."
if [ "$mail" == "true" ] ;then
echo "VM $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
qm stop $vmid >/dev/null 2>&1
sleep 5
else
echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not running, starting..."
fi
qm start $vmid >/dev/null 2>&1
echo "$count" > /tmp/$vmid.count
fi
else else
echo "$(date): VM $instance is not running, starting..." echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME max restart count $count reached"
if [ "$mail" == "true" ] ;then
echo "VM $vmid $NAME max restart count $count reached" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
fi fi
qm start $instance >/dev/null 2>&1 else
echo "$(date +'%Y-%m-%d %H:%M:%S'): CT/VM $vmid $NAME with ip $IP is pingable..."
echo "0" > /tmp/$vmid.count
fi fi
fi fi
done done
# Wait for 5 minutes. (Edit to your needs) # Wait for 5 minutes. (Edit to your needs)
echo "$(date): Pausing for 5 minutes..." echo "$(date +'%Y-%m-%d %H:%M:%S'): Pausing for 5 minutes..."
sleep 300 sleep 300
done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh
touch /var/log/ping-instances.log touch /var/log/ping-instances.log

Loading…
Cancel
Save