Update monitor-all.sh

Several bugs have been fixed; for example, when no IP address was found for an instance, the IP address of the previously checked VM was reused.
The parameters for "onboot" were not recognized correctly.
Furthermore, the IP search pattern has been moved into a variable (`searchip`).
The log output now also includes the name of the CT/VM.
A maximum of 3 restarts is preset; this limit can be adjusted via the `maxrestartcount` variable. The original script always restarted an unresponsive instance, with no limit.
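Email notifications can now also be sent when an instance is restarted (controlled by the `mail` and `mailaddress` variables; `mail` is preset to "false").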
pull/2074/head
FrodoVDR 2 years ago committed by GitHub
parent 7f6521e3c7
commit 4c93763fa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 109
      misc/monitor-all.sh

@ -26,67 +26,116 @@ while true; do
done done
echo '#!/usr/bin/env bash echo '#!/usr/bin/env bash
# Maximum number of restarts
maxrestartcount=3
# Emails for restarts
mailaddress='admin@domain.tld'
mail="false"
# Search string for IP address, here more precise selection, IP addresses can be excluded.
searchip='192\.168\.|10\.'
# Read excluded instances from command line arguments # Read excluded instances from command line arguments
excluded_instances=("$@") excluded_instances=("$@")
echo "Excluded instances: ${excluded_instances[@]}" echo "$(date +'%Y-%m-%d %H:%M:%S'): Excluded instances: ${excluded_instances[@]}"
while true; do while true; do
for instance in $(pct list | awk '\''{if(NR>1) print $1}'\''; qm list | awk '\''{if(NR>1) print $1}'\''); do for vmid in $(pct list | awk '{if(NR>1) print $1}'; qm list | awk '{if(NR>1) print $1}')
# Skip excluded instances do
if [[ " ${excluded_instances[@]} " =~ " ${instance} " ]]; then IP=
echo "Skipping $instance because it is excluded" skip="false"
continue if [ -f /tmp/$vmid.count ] ; then
count=$(cat /tmp/$vmid.count)
else
count=0
fi fi
# Determine the type of the instance (container or virtual machine) # Determine the type of the instance (container or virtual machine)
if pct status $instance >/dev/null 2>&1; then if pct status $vmid >/dev/null 2>&1; then
# It is a container # It is a container
config_cmd="pct config" config_cmd="pct config"
IP=$(pct exec $instance ip a s dev eth0 | awk '\''/inet / {print $2}'\'' | cut -d/ -f1) test=$(pct status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(pct exec $vmid ip a s dev eth0 | awk '/inet / {print $2}' | cut -d/ -f1)
fi
else else
# It is a virtual machine # It is a virtual machine
config_cmd="qm config" config_cmd="qm config"
IP=$(qm guest cmd $instance network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "192\.|10\." | head -n 1) test=$(qm status $vmid | grep -q "status: running")
if [ $? -eq 0 ] ; then
IP=$(qm guest cmd $vmid network-get-interfaces | egrep -o "([0-9]{1,3}\.){3}[0-9]{1,3}" | grep -E "$searchip" | head -n 1)
fi fi
fi
NAME=$($config_cmd $vmid | grep name: | awk '{ print $2 }')
# Skip instances based on onboot and templates # Skip instances based on onboot and templates
onboot=$($config_cmd $instance | grep onboot | grep -q "onboot: 0" && echo "true" || echo "false") test=$($config_cmd $vmid | grep "onboot" | awk '{ print $2 }')
template=$($config_cmd $instance | grep template | grep -q "template:" && echo "true" || echo "false") if [ "$test" == "1" ] ; then
onboot="true"
if [ "$onboot" == "true" ]; then else
echo "Skipping $instance because it is set not to boot" onboot="false"
continue fi
elif [ "$template" == "true" ]; then test=$($config_cmd $vmid | grep "template:" | awk '{ print $2 }')
echo "Skipping $instance because it is a template" if [ "$test" == "1" ] ; then
continue template="true"
else
template="false"
fi fi
if [ "$onboot" == "false" ]; then
echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is set not to boot"
skip="true"
fi
if [ "$template" == "true" ]; then
echo "$(date +'%Y-%m-%d %H:%M:%S'): Skipping $vmid $NAME because it is a template"
skip="true"
fi
if [ "$skip" == "false" ] ; then
# Ping the instance # Ping the instance
if ! ping -c 1 $IP >/dev/null 2>&1; then if ! ping -c 1 $IP >/dev/null 2>&1; then
if [ $count -le $maxrestartcount ] ; then
count=$((count + 1))
# If the instance can not be pinged, stop and start it # If the instance can not be pinged, stop and start it
if pct status $instance >/dev/null 2>&1; then if pct status $vmid >/dev/null 2>&1; then
# It is a container # It is a container
echo "$(date): CT $instance is not responding, restarting..." echo "$(date +'%Y-%m-%d %H:%M:%S'): CT $vmid $NAME is not responding, restarting..."
pct stop $instance >/dev/null 2>&1 if [ "$mail" == "true" ] ;then
echo "CT $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
pct stop $vmid >/dev/null 2>&1
sleep 5 sleep 5
pct start $instance >/dev/null 2>&1 pct start $vmid >/dev/null 2>&1
else else
# It is a virtual machine # It is a virtual machine
if qm status $instance | grep -q "status: running"; then test=$(qm status $vmid | grep -q "status: running")
echo "$(date): VM $instance is not responding, restarting..." if [ $? -eq 0 ] ; then
qm stop $instance >/dev/null 2>&1 echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not responding, restarting..."
if [ "$mail" == "true" ] ;then
echo "VM $vmid $NAME is not responding, restarting" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
qm stop $vmid >/dev/null 2>&1
sleep 5 sleep 5
else else
echo "$(date): VM $instance is not running, starting..." echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME is not running, starting..."
fi fi
qm start $instance >/dev/null 2>&1 qm start $vmid >/dev/null 2>&1
echo "$count" > /tmp/$vmid.count
fi
else
echo "$(date +'%Y-%m-%d %H:%M:%S'): VM $vmid $NAME max restart count $count reached"
if [ "$mail" == "true" ] ;then
echo "VM $vmid $NAME max restart count $count reached" | mail -s "$(date +'%Y-%m-%d %H:%M:%S'): $(hostname) - $NAME" $mailaddress
fi
fi
else
echo "$(date +'%Y-%m-%d %H:%M:%S'): CT/VM $vmid $NAME with ip $IP is pingable..."
echo "0" > /tmp/$vmid.count
fi fi
fi fi
done done
# Wait for 5 minutes. (Edit to your needs) # Wait for 5 minutes. (Edit to your needs)
echo "$(date): Pausing for 5 minutes..." echo "$(date +'%Y-%m-%d %H:%M:%S'): Pausing for 5 minutes..."
sleep 300 sleep 300
done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh done >/var/log/ping-instances.log 2>&1' >/usr/local/bin/ping-instances.sh
touch /var/log/ping-instances.log touch /var/log/ping-instances.log

Loading…
Cancel
Save