Monitoring Synology DS1511+ with OP5/Nagios


The basics: https://www.nickebo.net/monitoring-a-synology-nas-from-op5/

root@op5-system:~# snmpwalk -c public -v2c 192.168.2.85 SYNOLOGY-SYSTEM-MIB::synoSystem
SYNOLOGY-SYSTEM-MIB::SystemStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::Temperature.0 = INTEGER: 48
SYNOLOGY-SYSTEM-MIB::PowerStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::SystemFanStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::CPUFanStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::ModelName.0 = STRING: "DS1511+"
SYNOLOGY-SYSTEM-MIB::SerialNumber.0 = STRING: "B1J4N00273"
SYNOLOGY-SYSTEM-MIB::Version.0 = STRING: "DSM 4.3-3776"
SYNOLOGY-SYSTEM-MIB::UpgradeAvailable.0 = INTEGER: Checking(3)

snmpwalk -c public -v2c 192.168.2.85 SYNOLOGY-DISK-MIB::synoDisk

root@op5-system:/usr/share/snmp/mibs# grep "OBJECT IDENTIFIER" SYNO*.txt
SYNOLOGY-DISK-MIB.txt:synoDisk OBJECT IDENTIFIER
SYNOLOGY-RAID-MIB.txt:synoRaid OBJECT IDENTIFIER
SYNOLOGY-SPACEIO-MIB.txt:SpaceIO OBJECT IDENTIFIER
SYNOLOGY-STORAGEIO-MIB.txt:StorageIO OBJECT IDENTIFIER
SYNOLOGY-SYSTEM-MIB.txt:synology OBJECT IDENTIFIER
SYNOLOGY-SYSTEM-MIB.txt:synoSystem OBJECT IDENTIFIER ::= { synology 1 }
SYNOLOGY-SYSTEM-MIB.txt:Fan OBJECT IDENTIFIER ::= { synoSystem 4 }
SYNOLOGY-SYSTEM-MIB.txt:DSMInfo OBJECT IDENTIFIER ::= { synoSystem 5 }
SYNOLOGY-UPS-MIB.txt:synoUPS OBJECT IDENTIFIER
SYNOLOGY-UPS-MIB.txt:upsDevice OBJECT IDENTIFIER ::= { synoUPS 1 }
SYNOLOGY-UPS-MIB.txt:upsInfo OBJECT IDENTIFIER ::= { synoUPS 2 }
SYNOLOGY-UPS-MIB.txt:upsInfoMfr OBJECT IDENTIFIER ::= { upsInfo 6 }
SYNOLOGY-UPS-MIB.txt:upsInfoFirmware OBJECT IDENTIFIER ::= { upsInfo 10 }
SYNOLOGY-UPS-MIB.txt:upsInfoLoad OBJECT IDENTIFIER ::= { upsInfo 12 }
SYNOLOGY-UPS-MIB.txt:upsInfoDelay OBJECT IDENTIFIER ::= { upsInfo 14 }
SYNOLOGY-UPS-MIB.txt:upsInfoTimer OBJECT IDENTIFIER ::= { upsInfo 15 }
SYNOLOGY-UPS-MIB.txt:upsInfoTest OBJECT IDENTIFIER ::= { upsInfo 16 }
SYNOLOGY-UPS-MIB.txt:upsInfoPower OBJECT IDENTIFIER ::= { upsInfo 20 }
SYNOLOGY-UPS-MIB.txt:upsInfoRealPower OBJECT IDENTIFIER ::= { upsInfo 21 }
SYNOLOGY-UPS-MIB.txt:upsInfoStart OBJECT IDENTIFIER ::= { upsInfo 25 }
SYNOLOGY-UPS-MIB.txt:upsBattery OBJECT IDENTIFIER ::= { synoUPS 3 }
SYNOLOGY-UPS-MIB.txt:upsBatteryCharge OBJECT IDENTIFIER ::= { upsBattery 1 }
SYNOLOGY-UPS-MIB.txt:upsBatteryVoltage OBJECT IDENTIFIER ::= { upsBattery 2 }
SYNOLOGY-UPS-MIB.txt:upsBatteryRuntime OBJECT IDENTIFIER ::= { upsBattery 6 }
SYNOLOGY-UPS-MIB.txt:upsInput OBJECT IDENTIFIER ::= { synoUPS 4 }
SYNOLOGY-UPS-MIB.txt:upsInputVoltage OBJECT IDENTIFIER ::= { upsInput 1 }
SYNOLOGY-UPS-MIB.txt:upsInputTransfer OBJECT IDENTIFIER ::= { upsInput 2 }
SYNOLOGY-UPS-MIB.txt:upsInputCurrent OBJECT IDENTIFIER ::= { upsInput 5 }
SYNOLOGY-UPS-MIB.txt:upsInputFrequency OBJECT IDENTIFIER ::= { upsInput 6 }
SYNOLOGY-UPS-MIB.txt:upsOutput OBJECT IDENTIFIER ::= { synoUPS 5 }
SYNOLOGY-UPS-MIB.txt:upsOutputVoltage OBJECT IDENTIFIER ::= { upsOutput 1 }
SYNOLOGY-UPS-MIB.txt:upsOutputFrequency OBJECT IDENTIFIER ::= { upsOutput 2 }
SYNOLOGY-UPS-MIB.txt:upsOutputCurrent OBJECT IDENTIFIER ::= { upsOutput 3 }
SYNOLOGY-UPS-MIB.txt:upsAmbient OBJECT IDENTIFIER ::= { synoUPS 6 }
SYNOLOGY-UPS-MIB.txt:upsAmbientTemperature OBJECT IDENTIFIER ::= { upsAmbient 1 }
SYNOLOGY-UPS-MIB.txt:upsAmbientHumidity OBJECT IDENTIFIER ::= { upsAmbient 2 }
SYNOLOGY-UPS-MIB.txt:upsDriver OBJECT IDENTIFIER ::= { synoUPS 7 }
SYNOLOGY-UPS-MIB.txt:upsServer OBJECT IDENTIFIER ::= { synoUPS 8 }

The MIBs are found on the Synology:

synology02> pwd
/usr/syno/share/snmp/mibs
synology02> find . -type f -name 'SYNOLOGY*MIB.txt'
./SYNOLOGY-SPACEIO-MIB.txt
./SYNOLOGY-DISK-MIB.txt
./SYNOLOGY-STORAGEIO-MIB.txt
./SYNOLOGY-SYSTEM-MIB.txt
./SYNOLOGY-UPS-MIB.txt
./SYNOLOGY-RAID-MIB.txt

On the OP5 server, the MIBs have to be copied to /usr/share/snmp/mibs, after which they are available for snmpwalk and check_snmp.

Two additional snmp check commands (which I picked up from http://www.it-slav.net/blogs/2013/12/15/howto-monitor-netgear-readynas-rn104-with-op5-monitor-or-nagios/#more-2314):

# command 'custom_check_snmp_v2c_ranges'
# SNMP v2c check of a numeric OID against warning/critical thresholds.
#   $ARG1$ = OID to query
#   $ARG2$ = warning threshold range
#   $ARG3$ = critical threshold range
#   $ARG4$ = SNMP community
#   $ARG5$ = label prefixed to the plugin output
# The command_line must stay on a single line; a line break inside a
# $ARGn$ macro leaves the macro unterminated.
define command{
    command_name    custom_check_snmp_v2c_ranges
    command_line    $USER1$/check_snmp -H $HOSTADDRESS$ -P 2c -o $ARG1$ -w $ARG2$ -c $ARG3$ -C$ARG4$ -m: -l $ARG5$
}

# command 'custom_check_snmp_v2c_regexp'
# SNMP v2c check that matches the returned value against a regular expression.
#   $ARG1$ = OID to query
#   $ARG2$ = regular expression the value has to match (check_snmp -R)
#   $ARG3$ = SNMP community
#   $ARG4$ = label prefixed to the plugin output
# The command_line must stay on a single line; a line break inside a
# $ARGn$ macro leaves the macro unterminated.
define command{
    command_name    custom_check_snmp_v2c_regexp
    command_line    $USER1$/check_snmp -H $HOSTADDRESS$ -P 2c -o $ARG1$ -R $ARG2$ -C$ARG3$ -m: -l $ARG4$
}

# service 'L0 syno - Disk 1'
# Status of the first disk (index 0); OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Disk 1
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.0!Normal!public!"Disk 1:"
}

# service 'L0 syno - Disk 2'
# Status of the second disk (index 1); OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Disk 2
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.1!Normal!public!"Disk 2:"
}

# service 'L0 syno - Disk 3'
# Status of the third disk (index 2); OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Disk 3
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.2!Normal!public!"Disk 3:"
}

# service 'L0 syno - Disk 4'
# Status of the fourth disk (index 3); OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Disk 4
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.3!Normal!public!"Disk 4:"
}

# service 'L0 syno - Disk 5'
# Status of the fifth disk (index 4); OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Disk 5
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.4!Normal!public!"Disk 5:"
}

# service 'L0 syno - Power Status'
# Power supply status; OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Power Status
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-SYSTEM-MIB::PowerStatus.0!Normal!public!"Power:"
}

# service 'L0 syno - System Status'
# Overall system status; OK while the value matches "Normal".
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - System Status
    check_command           custom_check_snmp_v2c_regexp!SYNOLOGY-SYSTEM-MIB::SystemStatus.0!Normal!public!"Status:"
}

# service 'L0 syno - Temperature Disk 1'
# Temperature of the first disk (index 0); warning above 45, critical above 50.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature Disk 1
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.0!45!50!public!"Disk 1 temperature"
}

# service 'L0 syno - Temperature Disk 2'
# Temperature of the second disk (index 1); warning above 45, critical above 50.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature Disk 2
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.1!45!50!public!"Disk 2 temperature"
}

# service 'L0 syno - Temperature Disk 3'
# Temperature of the third disk (index 2); warning above 45, critical above 50.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature Disk 3
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.2!45!50!public!"Disk 3 temperature"
}

# service 'L0 syno - Temperature Disk 4'
# Temperature of the fourth disk (index 3); warning above 45, critical above 50.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature Disk 4
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.3!45!50!public!"Disk 4 temperature"
}

# service 'L0 syno - Temperature Disk 5'
# Temperature of the fifth disk (index 4); warning above 45, critical above 50.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature Disk 5
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.4!45!50!public!"Disk 5 temperature"
}

# service 'L0 syno - Temperature System'
# System temperature; warning above 50, critical above 55.
define service{
    use                     default-service
    host_name               synology02
    service_description     L0 syno - Temperature System
    check_command           custom_check_snmp_v2c_ranges!SYNOLOGY-SYSTEM-MIB::Temperature.0!50!55!public!"System temperature"
}

Who monitors the monitor?


How do you know that all virtual machines (VMs) in a VMware environment are actually monitored in your monitoring system (read: Nagios, OP5)?

The follow-up question is: is this really important? The answer is: yes. It is important. Of course there might be virtual machines in your environment that you really don’t care about. But there will be a day, when you realize that you wish that you had monitored that one machine in your environment, that just was not.

There are only two ways to know:

  1. Your deployment system/process/whatever of VM’s also adds the new virtual machine to your monitoring system
  2. You make a list of existing virtual machines and compare it to what is monitored

You decide what is easier for you. In most environments (1) just doesn’t happen. So, what if you are left with (2)? How do you do this automatically? In principle, you are not alone. (2) is common, but is a tedious job. I call (2) “meta monitoring”. The monitoring of the monitoring. In my environment I have a set of monitoring checks that are telling me if I am doing my job properly. This is one of them.

Most people are aware that they actually have a handful of virtual machines in their environment that they really don’t want to monitor. You might want to use a temporary VM for a test, a development system under construction. Whatever your reason might be, you might have a valid reason not to monitor a system. The common denominator is usually that you _know_ that you don’t want to monitor it.

The following approach will give you a way of telling what is not monitored in your virtual environment, as well as allowing you to have the occasional test system running in your environment. What I advocate, is an approach which is illegal in business, called “negative confirmation”. Basically, you should give an explanation, and make an active decision if you do not want a virtual machine to be monitored. What I usually do to accomplish this, is to add a custom attribute to the virtual machines in vCenter called noMonitoring, where one should write a note if monitoring is not desired. If this field is empty, it implies that the system should be monitored.

Sounds simple, no?

Given environment:

  • VMWare hypervisor (formerly known as ESXi)
  • VMWare Virtual Center
  • A read-only user in vCenter, in my case “op5”
  • OP5, version 6.0.7 or higher
  • VMware vSphere SDK for Perl installed on your OP5 installation

In vCenter, set up a custom field called noMonitoring (Management->User defined Attributes->Add (Global attribute). I usually also want to keep track of ownership, so I have added two more custom fields; ownerCustomer and ownerTech, so that I know which customer a VM belongs to, and who is responsible for the VM from a technical point of view.

This way, you can use this field to type in information if you don’t want a virtual machine to be monitored. My recommendation is that you use this field such, that if you don’t write anything into it when you have created a virtual machine, you intend for it to be monitored. If you write anything into it, just one character or more, you mean for the virtual machine not to be monitored. The best way to keep track of the whole thing, is to write a short description on why you don’t want the system to be monitored. For example: “2013-05-20, LUM, demo system” or similar. This way other people will know why you don’t want the system to be monitored.

But, then, how do we get this information into OP5?

I have two scripts to do this:

  • getVMsAndCustomAttributes.pl
  • check_metaMonitoring_vmWare

The Perl script connects to a vCenter and reads out all virtual machines and a handful of attributes (of which noMonitoring is one). The attributes are separated by a semicolon ";".

Example:

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./getVMsAndCustomAttributes.pl --server=192.168.2.30 --username=op5 --password=op5
#vm;onHost;dataStore;noMonitoring;ownerCustomer;ownerTech
kmg-guran-0001;192.168.2.204;NFSProd;;;;
kmg-op5-0001;192.168.2.204;NFSProd,Synology02;2013-05-12, LUM, To be decommissioned;;;
kmg-zenLoadbalancer-0001;192.168.2.204;NFSDev,Synology02;2013-02-05, LUM, To be decommissioned;;;
kmg-web-0001;192.168.2.204;NFSDev,Synology02;;;;
kmg-web-0002;192.168.2.204;NFSDev,Synology02;;;;
kmg-jumphost-0002;192.168.2.204;NFSProd,Synology02;;asdf;;
kmg-sandbox-0003;192.168.2.204;NFSProd,Synology02;;;;
kmg-buildbox-0001;192.168.2.204;NFSDev,Synology02;LUM, To be decommissioned;;;
kmg-plex-0001;192.168.2.204;NFSProd,Synology02;;;;
kmg-winxp-0001;192.168.2.204;NFSDev;2012-01-12, Windows client, no monitoring;;;
kmg-op5-0004;192.168.2.204;NFSDev;2013-04-20, Quarantin, to be decommissioned when v6 works well in prod.;;;
kmg-sandbox-0005;192.168.2.204;NFSDev,Synology02;2012-10-01, LUM, To be decommissioned;;;
jira-v001fry;192.168.2.204;NFSDev,Synology02;;;;
proxy-v001fry;192.168.2.204;NFSProd,Synology02;2013-03-20, LUM, Under construction 4;;;
kmg-pfsense-0001;192.168.2.204;datastore1,Synology02;2012-12-20, Quarantin;;;
op5-v005fry;192.168.2.204;NFSProd;;;;
backup-v001fry;192.168.2.204;NFSProd,Synology02;2013-05-02, LUM, Under construction;Maggan;;
guran-v001fry;192.168.2.204;NFSProd,Synology02;2013-05-10, LUM, New server, Under construction 2;;;
vcenter-v001fry;192.168.2.204;NFSProd;;;;
[/cce]

Field number 4 represents my custom field “noMonitoring”.

noMonitoring field

In principle, I just have to check field number 4 of the output, and print field number 1 to get a decent list to check against my monitoring system.

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./getVMsAndCustomAttributes.pl --server=192.168.2.30 --username=op5 --password=op5 | awk -F";" ' $4 == "" {print $1}'
kmg-guran-0001
kmg-web-0001
kmg-web-0002
kmg-jumphost-0002
kmg-sandbox-0003
kmg-plex-0001
jira-v001fry
op5-v005fry
vcenter-v001fry
[/cce]

To check this against my OP5 configuration, I just have to ask my monitoring system if the host is monitored. Had I used an older version of OP5, I would have done this by either using grep on /opt/monitor/etc/hosts (grep host_name /opt/monitor/etc/hosts.cfg | grep kmg-guran-0001 | wc -l) or connecting to the merlin database and issuing a clever sql query (no example).

But now we are on version 6, where OP5 uses MK Livestatus, which in itself deserves some attention. Long story short: instead of parsing text files or updating a database, MK Livestatus hooks into Nagios to keep track of the configuration and the status of the system. The benefit: less disk I/O. Asking your monitoring installation about more or less anything is now very easy, by communicating with MK Livestatus over a Unix socket. In this case, I will make an extremely simple query: give me the host name of a configured host that has the host name xxyy. For more inspirational references, look here: http://mathias-kettner.de/checkmk_livestatus.html.

Example:

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# printf "GET hosts\nColumns: host_name host_address\nFilter: host_name = kmg-guran-0001\n" | unixcat /opt/monitor/var/rw/live
kmg-guran-0001;192.168.2.37
[/cce]

We put this together into a check_script, check_metaMonitoring_vmWare, which I use to keep track of unmonitored systems.

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./check_metaMonitoring_vmWare  2>/dev/null
WARN – H: 19 M: 7 !M: 12 ok!M: 8 nok!M: 4
Hosts:  kmg-plex-0001 jira-v001fry op5-v005fry vcenter-v001fry
| hosts=19 monitored=7 notMonitored=12 okNotMonitored=8 nokNotMonitored=4
[/cce]

I have added this as a service check to my installation (just add the command to checkcommands.cfg and add a service check to your vcenter host in your monitoring), and can see the following:

meta monitoring - service check

In the output you can see the following:

  • H: 19 -> VMs in this installation
  • M: 8 -> Number of monitored VM’s
  • !M: 11 -> Number of VM’s that are not monitored
  • ok!M: 8 -> Non monitored VM’s that are ok (to not be monitored)
  • nok!M: 3 -> Not OK -> This is what we try and catch, VM’s that should be monitored.

What can you do to remedy this? You have two possibilities:

  1. Add the VM’s to your monitoring system
  2. Add a comment in the “noMonitoring” fields in your vCenter

Simple as that. I guess I have to add a few VM’s to my monitoring now.

Here, the sweets:

[1] getVMsAndCustomAttributes.pl

[ccne lines=”-1″]
#!/usr/bin/perl
## -----------------------------------------------
# Script: getVMsAndCustomAttributes
# Author: magnus.luebeck@kmggroup.ch
# Date: 2013-05-20
#
# Description: This script will output a semicolon ";" separated list
#              of VMs from a vCenter, together with the custom
#              attributes:
#              - noMonitoring  - Empty field = VM should be monitored
#              - noMonitoring  - Non empty = good excuse for not monitoring
#              - ownerCustomer
#              - ownerTech
#
# Output:      One header line starting with "#", then one line per VM:
#              vm;onHost;dataStore;noMonitoring;ownerCustomer;ownerTech;
#              (data lines keep their trailing ";"). Several datastores
#              are joined with ",".
#
# Usage: ./getVMsAndCustomAttributes.pl --server=192.168.2.30 --username=USERNAME --password=PASSWORD
## Script inspired by/to large extent copied from Reuben Stump
## (rstump@vmware.com | http://www.virtuin.com)
## http://www.virtuin.com/2012/11/best-practices-for-faster-vsphere-sdk.html
## http://communities.vmware.com/docs/DOC-10220 /
## http://communities.vmware.com/servlet/JiveServlet/download/10220-4-24610/queryVMCustomField.pl
## and http://communities.vmware.com/message/519501
## -----------------------------------------------

use strict;
use warnings;

# Disable SSL hostname verification for vCenter self-signed certificate.
BEGIN {
    $ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;
}

use VMware::VIRuntime;

# Return the value of the custom field $name for one VM, or '' when the field
# is not defined in vCenter or has no value on this VM.
#   $values - hash ref: custom field key  => value (for one VM)
#   $keys   - hash ref: custom field name => key   (from the customFieldsManager)
sub custom_value {
    my ($values, $keys, $name) = @_;
    my $key = $keys->{$name};
    return '' unless defined $key && defined $values->{$key};
    return $values->{$key};
}

Opts::parse();
Opts::validate();

Util::connect();

# Fetch all VirtualMachines from SDK, limiting the property set
my $vm_views = Vim::find_entity_views(
    view_type  => 'VirtualMachine',
    properties => [ 'name', 'runtime.host', 'datastore', 'summary' ],
) || die "Failed to get VirtualMachines: $!";

# Fetch all HostSystems from SDK, limiting the property set
my $host_views = Vim::find_entity_views(
    view_type  => 'HostSystem',
    properties => ['name'],
) || die "Failed to get HostSystems: $!";

# Fetch all Datastores from SDK, limiting the property set
my $datastore_views = Vim::find_entity_views(
    view_type  => 'Datastore',
    properties => ['name'],
) || die "Failed to get Datastores: $!";

# Create hash tables with key = entity.mo_ref.value
my %host_map = map { ( $_->get_property('mo_ref.value'), $_ ) } @{ $host_views      || [] };
my %ds_map   = map { ( $_->get_property('mo_ref.value'), $_ ) } @{ $datastore_views || [] };

#--- The correlation between a custom field ID and its name is only found in
#--- the customFields manager
my $sc              = Vim::get_service_content();
my $customFieldsMgr = Vim::get_view( mo_ref => $sc->customFieldsManager );

# Create hash table with key = field name => value = field key
my %keys_map = map { ( $_->name, $_->key ) } @{ $customFieldsMgr->field || [] };

# Enumerate VirtualMachines
print "#vm;onHost;dataStore;noMonitoring;ownerCustomer;ownerTech\n";
foreach my $vm ( @{ $vm_views || [] } ) {

    # Get HostSystem from the host map; stays undef if the VM has no host
    # reference or the host is not in the map.
    my $host_ref = $vm->get_property('runtime.host');
    my $host     = defined $host_ref ? $host_map{ $host_ref->{'value'} } : undef;

    # Get array of datastore moref values
    my @ds_refs = map { $_->{'value'} } @{ $vm->get_property('datastore') || [] };

    # Get array of datastore entities from the datastore map by slicing
    # %ds_map; references that are not in the map are skipped.
    my @datastores = grep { defined } @ds_map{@ds_refs};

    # Map the custom field values to a hash (field key => value)
    my %cVals = map { ( $_->key, $_->value ) } @{ $vm->summary->customValue || [] };

    printf(
        "%s;%s;%s;%s;%s;%s;\n",
        $vm->get_property('name'),
        defined $host ? $host->get_property('name') : '',
        join( ',', map { $_->get_property('name') } @datastores ),
        custom_value( \%cVals, \%keys_map, 'noMonitoring' ),
        custom_value( \%cVals, \%keys_map, 'ownerCustomer' ),
        custom_value( \%cVals, \%keys_map, 'ownerTech' ),
    );
}

# Close the vCenter session opened by Util::connect().
Util::disconnect();
[/ccne]

[2] kmg# cat check_metaMonitoring_vmWare

[ccne lines=”-1″]
#!/bin/bash

## -----------------------------------------------
# Script: check_metaMonitoring_vmWare
# Author: magnus.luebeck@kmggroup.ch
# Date: 2013-05-20
#
# Description: This script will check if your VMs are monitored
#              in your Op5-environment.
#
# Output:      Nagios plugin output on stdout, per-VM details on stderr.
# Exit codes:  0 OK, 1 WARN, 2 CRIT, 3 UNKNOWN (no VM list received).
#
# NOTE(review): the published listing lost everything between "unixcat <"
# and ">&2" (most likely eaten by HTML tag stripping). The here-document in
# checkHostExist, the loop header, the counter initialisation and the
# "is monitored" branch are RECONSTRUCTED from the surrounding article and
# must be confirmed against the original script.
## -----------------------------------------------

this_dir=$(cd "$(dirname "$0")" && pwd)
live_path=$(awk '/broker_module.*live/ { print $NF }' /opt/monitor/etc/nagios.cfg)

# vCenter connection. NOTE(review): assumed - the defaults are the values
# used in the article's examples; the original values are not visible.
vcenterServer=${VCENTER_SERVER:-192.168.2.30}
vcenterUser=${VCENTER_USERNAME:-op5}
vcenterPassword=${VCENTER_PASSWORD:-op5}

thresholdWarning=0
thresholdCritical=10

numHosts=0
numMonitoredHosts=0
numNotMonitoredHosts=0
numNotMonitoredWithGoodExcuseHosts=0
numNotMonitoredWithoutExcuseHosts=0
hostsToOutput=""

# Ask MK Livestatus for a configured host named $1.
# Prints the host name if it is configured, nothing otherwise.
checkHostExist() {
  local curHost=$1

  unixcat "$live_path" <<EOF
GET hosts
Columns: host_name
Filter: host_name = $curHost
EOF
}

# One line per VM: vm;onHost;dataStore;noMonitoring;ownerCustomer;ownerTech;
# The loop reads from a process substitution (not a pipe) so that the
# counters survive the loop.
while IFS=';' read -r hostName _onHost _dataStore noMonitoring _rest; do
  # skip the "#vm;..." header line and empty lines
  case "$hostName" in
    '' | '#'*) continue ;;
  esac

  (( numHosts += 1 ))
  result=$(checkHostExist "$hostName")

  if [[ -n "$result" ]]; then
    echo "$hostName is monitored" 1>&2
    (( numMonitoredHosts += 1 ))
    continue
  fi

  echo "$hostName is NOT monitored" 1>&2
  (( numNotMonitoredHosts += 1 ))

  #--- the secret sauce - noMonitoring field is empty -> should be monitored
  if [[ -n "$noMonitoring" ]]; then
    echo " - But does not have to: $noMonitoring" 1>&2
    (( numNotMonitoredWithGoodExcuseHosts += 1 ))
  else
    echo " - Should be monitored" 1>&2
    (( numNotMonitoredWithoutExcuseHosts += 1 ))
    hostsToOutput="$hostsToOutput $hostName"
  fi
done < <("$this_dir/getVMsAndCustomAttributes.pl" \
  --server="$vcenterServer" \
  --username="$vcenterUser" \
  --password="$vcenterPassword")

# No VM at all means the vCenter query failed; do not report that as OK.
if (( numHosts == 0 )); then
  echo "UNKNOWN - No virtual machines received from vCenter $vcenterServer"
  exit 3
fi

if (( numNotMonitoredWithoutExcuseHosts > thresholdCritical )); then
  retVal=2
  retPrefix=CRIT
elif (( numNotMonitoredWithoutExcuseHosts > thresholdWarning )); then
  retVal=1
  retPrefix=WARN
else
  retVal=0
  retPrefix=OK
fi

echo "$retPrefix - H: $numHosts M: $numMonitoredHosts !M: $numNotMonitoredHosts ok!M: $numNotMonitoredWithGoodExcuseHosts nok!M: $numNotMonitoredWithoutExcuseHosts"
[[ -n "$hostsToOutput" ]] && echo "Hosts: $hostsToOutput"
echo "| hosts=$numHosts monitored=$numMonitoredHosts notMonitored=$numNotMonitoredHosts okNotMonitored=$numNotMonitoredWithGoodExcuseHosts nokNotMonitored=$numNotMonitoredWithoutExcuseHosts"

exit $retVal
[/ccne]

Sawtooth – The power of a waveform!


Triangles are nice. They are robust, the strongest shape of them all. A triangle will also help you spot anomalies in contextually complex situations. Today we will use this shape to make sure that your backups are running properly, as well as showing you one of the amazing capabilities of the human brain; pattern recognition.

This is just one example of how you can use arithmetic on timestamps to get  more or less anything under control. Here is a good example of something I am trying to achieve today:

The triangle is simple, you know what to expect from it. And that, is the whole point of this blog entry.

Like in any good cooking show, I prepared the dish beforehand. This is an example of what I can see in my OP5 monitoring. At this point it does not matter what the graph shows. Look at it for a few seconds, then answer the following questions:

  1. When did I have a problem with my backups (it did not run)?
  2. When could my monitoring system _not_ get any information from my backup system?

 

You see? You could answer these two questions. If you by any chance could not come up with the answers, you are either tired, or not really the target group of this blog. Without knowing anything about my system you easily could spot the exceptions in the pattern.

The graph shows the age in seconds of the last successful backup of my file share data. My backup policy is to make a backup (incremental, more about that in a different blog entry) every four hours. But even so, it doesn’t really matter what my backup schedule is. Given your inherited human skill of being able to recognize patterns, the two abnormalities just popped out in your face.

If you didn't see it, and still find this blog interesting, the answer is: just before midnight on the 31st of whatever month it displays, my monitoring system could not gather this data (empty spot in the graph). And, all of a sudden, just before midnight between the 1st and the 2nd, my backups stopped running (or failed; remember, the graph shows the age of the last successful backup).

And now over to the long, interesting explanation on how I got there.

THE SETUP

I am using rsnapshot for my backups. There are several reasons and considerations behind this, but the interesting point is that I really want to know that this is working (disclaimer: this type of monitoring does not guarantee anything), and my implementation outputs logs into /var/log/rsnapshot.log where a successful backup looks like this:

[ccne]

[14/Aug/2012:16:27:54] /usr/bin/rsnapshot hourly: completed successfully

[/ccne]

So, basically, since I am interested in the age of the last successful backup, I can simply filter the logfile for this (grep), use the last line of the output (tail -1), and get the timestamp (awk, tr -d "[]").

[ccne]

#--- timestamp (without the brackets) of the last "completed successfully" line
cur_output=$(grep "successfully" /var/log/rsnapshot.log | tail -1 | awk '{print $1}' | tr -d "[]")

[/ccne]

This, of course, has to be scrubbed a bit, since the output is a timestamp that is not really machine readable. And, it is a timestamp, not an age. I know, my way of doing it is a little bit complicated. But it is the way I learned to do it many years ago, and it is hard to teach an old dog how to sit.

[ccne]

#--- the timestamp is in a really weird format; [14/Aug/2012:16:27:54]
curDate=$(echo "$cur_output" | awk -F":" '{print $1}')
curTime=$(echo "$cur_output" | awk -F":" '{print $2":"$3":"$4}')

#--- split the date part into day month year
#--- (ksh runs the last stage of a pipeline in the current shell, so the
#--- variables set by "read" stay visible; this does not work in bash)
echo "$curDate" | awk -F"/" '{print $1, $2, $3}' | read curDay curMonth curYear

#--- get the age of the last successful backup
#--- %s returns the number of seconds since 1.1.1970, the epoch
lastBackupTime=$(date -d "$curMonth $curDay $curYear $curTime" "+%s")
nowTime=$(date "+%s")
(( lastOkBackupAge = nowTime - lastBackupTime ))

[/ccne]

In principle, what I do is convert the time to a Unix timestamp (seconds since the epoch, 1st of January 1970), then subtract this from the current time. This gives me the number of seconds that have passed since the last successful backup until now. I do this through OP5/Nagios every 5 minutes, and the backups are supposed to run every 4 hours. In between backups, the output of my script will show an increasing age of the last successful backup, until just after a new backup, where the age is close to 0 seconds.

So, the configuration for the whole setup is done on:

  • OP5 server
  • The backup server

On the OP5 server, checkcommands.cfg:

[ccne]

# command 'kmgBackup'
# Runs the NRPE command "kmg_backup" on host kmg-sandbox-0003 via check_nrpe.
define command{
command_name kmgBackup
command_line $USER1$/check_nrpe -H kmg-sandbox-0003 -c kmg_backup
}

[/ccne]

On the OP5 server, services.cfg:

[ccne]

# service 'Rsnapshot backup'
# Attaches the kmgBackup check command to host kmg-sandbox-0003.
define service{
use default-service
host_name kmg-sandbox-0003
service_description Rsnapshot backup
check_command kmgBackup
}

[/ccne]

On the backup server, /etc/nrpe.d/kmg_commands.cfg (any filename ending with .cfg will do):

[ccne]

malu@kmg-sandbox-0003:/etc/nrpe.d $cat kmg_commands.cfg
#--- kmg backup: NRPE command "kmg_backup" runs the checkBackup script

command[kmg_backup]=/app/prd/op5/bin/checkBackup

[/ccne]

And, at last, the script that checks the backups:

[ccne lines=”-1″]

malu@kmg-sandbox-0003:/app/prd/op5/bin $cat /app/prd/op5/bin/checkBackup
#!/bin/ksh

# Path of the rsnapshot log file.
logFile=/var/log/rsnapshot.log
# Host name of the backup server (not referenced in the rest of this listing).
logHost=kmg-sandbox-0003.localdomain

#--- getLastComplete
#--- Prints the age, in seconds, of the last successful rsnapshot run.
#---   Reads:   $logFile (falls back to /var/log/rsnapshot.log when unset)
#---   Outputs: the age in seconds on stdout
#---   Returns: 0 on success; 1, with no output, when the log holds no
#---            "successfully" line or its timestamp cannot be parsed
getLastComplete() {
    typeset log stamp rest day month year clock lastBackupTime nowTime

    log=${logFile:-/var/log/rsnapshot.log}

    #--- We are looking for lines like this. Only the timestamp of the last one is interesting
    #--- [14/Aug/2012:16:27:54] /usr/bin/rsnapshot hourly: completed successfully
    stamp=$(grep "successfully" "$log" 2>/dev/null | tail -1 | awk '{print $1}' | tr -d "[]")

    #--- no successful backup logged: say so instead of letting date(1) turn
    #--- an empty string into "midnight today"
    [ -n "$stamp" ] || return 1

    #--- the timestamp is in a really weird format; 14/Aug/2012:16:27:54
    #--- split it with parameter expansion (works the same in ksh and bash)
    day=${stamp%%/*}
    rest=${stamp#*/}
    month=${rest%%/*}
    rest=${rest#*/}
    year=${rest%%:*}
    clock=${rest#*:}

    #--- get the age of the last successful backup
    #--- %s returns the number of seconds since 1.1.1970, the epoch
    lastBackupTime=$(date -d "$month $day $year $clock" "+%s" 2>/dev/null) || return 1
    nowTime=$(date "+%s")

    #--- echo the age of the last successful backup, in seconds
    echo $(( nowTime - lastBackupTime ))
}
backupAge=$(getLastComplete)

#--- without a usable (non-negative integer) age the check must not report OK
case "$backupAge" in
    '' | *[!0-9]*)
        echo "UNKNOWN - Could not determine the age of the last successful backup"
        exit 3
        ;;
esac

#--- hard coded crit and warn
# - 18000 seconds is 5 hours
# - 86400 seconds is 24 hours
retMessage="OK"
returnCode=0
if [ "$backupAge" -gt 86400 ]; then
    retMessage="CRIT"
    returnCode=2
elif [ "$backupAge" -gt 18000 ]; then
    retMessage="WARN"
    returnCode=1
fi

echo "$retMessage - Backup $backupAge seconds old"
echo "| backupAge=$backupAge"

#--- hand the state to NRPE/Nagios; without this the script always exits 0
exit $returnCode

[/ccne]