Monitoring Synology DS1511+ with OP5/Nagios

The basics:

root@op5-system:~# snmpwalk -c public -v2c SYNOLOGY-SYSTEM-MIB::synoSystem
SYNOLOGY-SYSTEM-MIB::SystemStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::PowerStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::SystemFanStatus.0 = INTEGER: Normal(1)
SYNOLOGY-SYSTEM-MIB::SerialNumber.0 = STRING: "B1J4N00273"
SYNOLOGY-SYSTEM-MIB::Version.0 = STRING: "DSM 4.3-3776"
SYNOLOGY-SYSTEM-MIB::UpgradeAvailable.0 = INTEGER: Checking(3)

snmpwalk -c public -v2c SYNOLOGY-DISK-MIB::synoDisk

root@op5-system:/usr/share/snmp/mibs# grep "OBJECT IDENTIFIER" SYNO*.txt
SYNOLOGY-SYSTEM-MIB.txt:synoSystem OBJECT IDENTIFIER ::= { synology 1 }
SYNOLOGY-UPS-MIB.txt:upsInfoMfr OBJECT IDENTIFIER ::= { upsInfo 6 }
SYNOLOGY-UPS-MIB.txt:upsInfoFirmware OBJECT IDENTIFIER ::= { upsInfo 10 }
SYNOLOGY-UPS-MIB.txt:upsInfoLoad OBJECT IDENTIFIER ::= { upsInfo 12 }
SYNOLOGY-UPS-MIB.txt:upsInfoDelay OBJECT IDENTIFIER ::= { upsInfo 14 }
SYNOLOGY-UPS-MIB.txt:upsInfoTimer OBJECT IDENTIFIER ::= { upsInfo 15 }
SYNOLOGY-UPS-MIB.txt:upsInfoTest OBJECT IDENTIFIER ::= { upsInfo 16 }
SYNOLOGY-UPS-MIB.txt:upsInfoPower OBJECT IDENTIFIER ::= { upsInfo 20 }
SYNOLOGY-UPS-MIB.txt:upsInfoRealPower OBJECT IDENTIFIER ::= { upsInfo 21 }
SYNOLOGY-UPS-MIB.txt:upsInfoStart OBJECT IDENTIFIER ::= { upsInfo 25 }
SYNOLOGY-UPS-MIB.txt:upsBatteryCharge OBJECT IDENTIFIER ::= { upsBattery 1 }
SYNOLOGY-UPS-MIB.txt:upsBatteryVoltage OBJECT IDENTIFIER ::= { upsBattery 2 }
SYNOLOGY-UPS-MIB.txt:upsBatteryRuntime OBJECT IDENTIFIER ::= { upsBattery 6 }
SYNOLOGY-UPS-MIB.txt:upsInputVoltage OBJECT IDENTIFIER ::= { upsInput 1 }
SYNOLOGY-UPS-MIB.txt:upsInputTransfer OBJECT IDENTIFIER ::= { upsInput 2 }
SYNOLOGY-UPS-MIB.txt:upsInputCurrent OBJECT IDENTIFIER ::= { upsInput 5 }
SYNOLOGY-UPS-MIB.txt:upsInputFrequency OBJECT IDENTIFIER ::= { upsInput 6 }
SYNOLOGY-UPS-MIB.txt:upsOutputVoltage OBJECT IDENTIFIER ::= { upsOutput 1 }
SYNOLOGY-UPS-MIB.txt:upsOutputFrequency OBJECT IDENTIFIER ::= { upsOutput 2 }
SYNOLOGY-UPS-MIB.txt:upsOutputCurrent OBJECT IDENTIFIER ::= { upsOutput 3 }
SYNOLOGY-UPS-MIB.txt:upsAmbientTemperature OBJECT IDENTIFIER ::= { upsAmbient 1 }
SYNOLOGY-UPS-MIB.txt:upsAmbientHumidity OBJECT IDENTIFIER ::= { upsAmbient 2 }

The MIBs are found on the Synology:

synology02> pwd
synology02> find . -type f -name 'SYNOLOGY*MIB.txt'

On the OP5 server, the MIBs have to be copied to /usr/share/snmp/mibs, after which they are available for snmpwalk and check_snmp.

Two additional snmp check commands (which I picked up elsewhere — the original reference link was lost in formatting):

# command 'custom_check_snmp_v2c_ranges'
# check_snmp with numeric warning/critical ranges:
#   $ARG1$=OID $ARG2$=warn $ARG3$=crit $ARG4$=community $ARG5$=label
define command{
    command_name custom_check_snmp_v2c_ranges
    command_line $USER1$/check_snmp -H $HOSTADDRESS$ -P 2c -o $ARG1$ -w $ARG2$ -c $ARG3$ -C$ARG4$ -m: -l $ARG5$
    }

# command 'custom_check_snmp_v2c_regexp'
# check_snmp matching the returned value against a regular expression:
#   $ARG1$=OID $ARG2$=regexp $ARG3$=community $ARG4$=label
define command{
    command_name custom_check_snmp_v2c_regexp
    command_line $USER1$/check_snmp -H $HOSTADDRESS$ -P 2c -o $ARG1$ -R $ARG2$ -C$ARG3$ -m: -l $ARG4$
    }

# Service checks for the Synology NAS. Disk status / system status / power
# status use the regexp command (value must match "Normal"); temperatures use
# the ranges command (warn at 45/50 C, crit at 50/55 C).
# NOTE(review): the closing braces were stripped by the blog formatting and
# are restored here; smart quotes replaced with plain ASCII quotes.

# service 'L0 syno - Disk 1'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Disk 1
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.0!Normal!public!"Disk 1:"
    }

# service 'L0 syno - Disk 2'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Disk 2
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.1!Normal!public!"Disk 2:"
    }

# service 'L0 syno - Disk 3'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Disk 3
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.2!Normal!public!"Disk 3:"
    }

# service 'L0 syno - Disk 4'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Disk 4
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.3!Normal!public!"Disk 4:"
    }

# service 'L0 syno - Disk 5'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Disk 5
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-DISK-MIB::DiskStatus.4!Normal!public!"Disk 5:"
    }

# service 'L0 syno - Power Status'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Power Status
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-SYSTEM-MIB::PowerStatus.0!Normal!public!"Power:"
    }

# service 'L0 syno - System Status'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - System Status
    check_command       custom_check_snmp_v2c_regexp!SYNOLOGY-SYSTEM-MIB::SystemStatus.0!Normal!public!"Status:"
    }

# service 'L0 syno - Temperature Disk 1'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature Disk 1
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.0!45!50!public!"Disk 1 temperature"
    }

# service 'L0 syno - Temperature Disk 2'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature Disk 2
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.1!45!50!public!"Disk 2 temperature"
    }

# service 'L0 syno - Temperature Disk 3'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature Disk 3
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.2!45!50!public!"Disk 3 temperature"
    }

# service 'L0 syno - Temperature Disk 4'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature Disk 4
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.3!45!50!public!"Disk 4 temperature"
    }

# service 'L0 syno - Temperature Disk 5'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature Disk 5
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-DISK-MIB::DiskTemperature.4!45!50!public!"Disk 5 temperature"
    }

# service 'L0 syno - Temperature System'
define service{
    use                 default-service
    host_name           synology02
    service_description L0 syno - Temperature System
    check_command       custom_check_snmp_v2c_ranges!SYNOLOGY-SYSTEM-MIB::Temperature.0!50!55!public!"System temperature"
    }

Meta monitoring revisited – EC2 meta monitoring

This post is a revisit of a topic I have already blogged about, Who monitors the monitor?

Meta monitoring -> frequently compare an inventory with your monitoring configuration.

In my terminology I call this meta monitoring since it is not actively monitoring a business function or the functionality of an infrastructure item. By using meta monitoring I am making myself aware of the completeness of my monitoring. Meta monitoring should give an answer to the question: Is there is something I am missing?

Well, as most of you will say; we always miss something. I agree. But with meta monitoring, we will aim to limit the unknown to a bare minimum. If you don’t do it, your configuration will be hopelessly out of date within days.


My take on meta monitoring is to make a list of something that could be monitored, filter away known exceptions, then compare it with the monitoring system configuration.

There are plenty of tools on the market that will help you make inventories of more or less every aspect of your infrastructure. They are usually very expensive. And, honestly, to do this yourself is not even hard.

  • Get a list of items from your current infrastructure (may it be vCenter or Amazon Cloud)
  • Remove items that you know should not be monitored
  • Compare this list with your monitoring system.

In an OP5 environment, you can even do this in a “one-liner”, for example:

[ccne lines=”0″]
root@op5-system:~# echo mysql-v001fry magnus monitor synology03 | sed -e 's/ /\n/g' | grep -wv "$(printf "GET hosts\nColumns: name\n" | unixcat /opt/monitor/var/rw/live)" | xargs -I"{}" echo "Host \"{}\" is not configured in OP5"
Host “magnus” is not configured in OP5
Host “synology03” is not configured in OP5


Now, this one-liner is listing all configured hosts in your monitoring environment and using that list to filter away known (monitored) hosts from the list you echo, so it is probably not the most effective way to do it, but it works. See it as an example of how easy it can be.

Now, gathering the complete (or partial) inventory from your infrastructure is also not that hard. In its simplest form you just copy/paste from your favorite excel sheet, or you request it from your infrastructure through an API. Amazon EC2 has a very powerful API. Just create a read-only user with access to your environment, and use a simple ruby script to get the names from EC2. Note that you need to point out which region you would like to list, and optionally add your proxy URI to the script below.


[ccne lines=”0″]
%w[ rubygems aws-sdk ].each { |f| require f }

aws_api = => ‘YOUR_ACCESS_KEY’, :secret_access_key => ‘YOUR_SECRET_KEY’, :region=>’us-west-2′ , :proxy_uri => ”)

aws_api.client.describe_instances[:reservation_set].each do | instance |
instance[:instances_set][0][:tag_set].each do | tag |
puts tag[:value] if tag[:key] == ‘Name’

Running this script will give you a list of your instances in Amazon EC2. I called this script “listEC2Instances.minimal.rb” and put it together with my one-liner:

[ccne lines=”0″]
root@op5-system:/opt/kmg/ec2/bin# ./listEC2Instances.minimal.rb

root@op5-system:/opt/kmg/ec2/bin# ./listEC2Instances.minimal.rb | sed -e 's/ /\n/g' | grep -wv "$(printf "GET hosts\nColumns: name\n" | unixcat /opt/monitor/var/rw/live)" | xargs -I"{}" echo "Host \"{}\" is not configured in OP5"
Host “vpn-v001ec2” is not configured in OP5


Now, you know which hosts in your Amazon Cloud that are not monitored. Do something about it! =)


Who monitors the monitor?

How do you know that all virtual machines (VM's) in a VMWare environment are actually monitored in your monitoring system (read Nagios, Op5)?

The follow-up question is: is this really important? The answer is: yes. It is important. Of course there might be virtual machines in your environment that you really don’t care about. But there will be a day, when you realize that you wish that you had monitored that one machine in your environment, that just was not.

There are only two ways to know:

  1. Your deployment system/process/whatever of VM’s also adds the new virtual machine to your monitoring system
  2. You make a list of existing virtual machines and compare it to what is monitored

You decide what is easier for you. In most environments (1) just doesn’t happen. So, what if you are left with (2)? How do you do this automatically? In principle, you are not alone. (2) is common, but is a tedious job. I call (2) “meta monitoring”. The monitoring of the monitoring. In my environment I have a set of monitoring checks that are telling me if I am doing my job properly. This is one of them.

Most people are aware that they actually have a handful of virtual machines in their environment that they really don’t want to monitor. You might want to use a temporary VM for a test, a development system under construction. Whatever your reason might be, you might have a valid reason not to monitor a system. The common denominator is usually that you _know_ that you don’t want to monitor it.

The following approach will give you a way of telling what is not monitored in your virtual environment, as well as allowing you to have the occasional test system running in your environment. What I advocate, is an approach which is illegal in business, called “negative confirmation”. Basically, you should give an explanation, and make an active decision if you do not want a virtual machine to be monitored. What I usually do to accomplish this, is to add a custom attribute to the virtual machines in vCenter called noMonitoring, where one should write a note if monitoring is not desired. If this field is empty, it implies that the system should be monitored.

Sounds simple, no?

Given environment:

  • VMWare hypervisor (formerly known as ESXi)
  • VMWare Virtual Center
  • A read-only user in vCenter, in my case “op5”
  • OP5, version 6.0.7 or higher
  • VMware vSphere SDK for Perl installed on your OP5 installation

In vCenter, set up a custom field called noMonitoring (Management->User defined Attributes->Add (Global attribute). I usually also want to keep track of ownership, so I have added two more custom fields; ownerCustomer and ownerTech, so that I know which customer a VM belongs to, and who is responsible for the VM from a technical point of view.

This way, you can use this field to type in information if you don’t want a virtual machine to be monitored. My recommendation is that you use this field such, that if you don’t write anything into it when you have created a virtual machine, you intend for it to be monitored. If you write anything into it, just one character or more, you mean for the virtual machine not to be monitored. The best way to keep track of the whole thing, is to write a short description on why you don’t want the system to be monitored. For example: “2013-05-20, LUM, demo system” or similar. This way other people will know why you don’t want the system to be monitored.

But, then, how do we get this information into OP5?

I have two scripts to do this:

  • check_metaMonitoring_vmWare

The perl script connects to a vCenter and reads out all virtual machines and a handful of attributes (of which noMonitoring is one of them). The attributes are separated by a semicolon “;”.


[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./ --server=&lt;vcenter-address&gt; --username=op5 --password=op5
kmg-op5-0001;;NFSProd,Synology02;2013-05-12, LUM, To be decommissioned;;;
kmg-zenLoadbalancer-0001;;NFSDev,Synology02;2013-02-05, LUM, To be decommissioned;;;
kmg-buildbox-0001;;NFSDev,Synology02;LUM, To be decommissioned;;;
kmg-winxp-0001;;NFSDev;2012-01-12, Windows client, no monitoring;;;
kmg-op5-0004;;NFSDev;2013-04-20, Quarantin, to be decommissioned when v6 works well in prod.;;;
kmg-sandbox-0005;;NFSDev,Synology02;2012-10-01, LUM, To be decommissioned;;;
proxy-v001fry;;NFSProd,Synology02;2013-03-20, LUM, Under construction 4;;;
kmg-pfsense-0001;;datastore1,Synology02;2012-12-20, Quarantin;;;
backup-v001fry;;NFSProd,Synology02;2013-05-02, LUM, Under construction;Maggan;;
guran-v001fry;;NFSProd,Synology02;2013-05-10, LUM, New server, Under construction 2;;;

Field number 4 represents my custom field “noMonitoring”.

noMonitoring field

In principle, I just have to check field number 4 of the output, and print field number 1 to get a decent list to check against my monitoring system.

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./ --server=&lt;vcenter-address&gt; --username=op5 --password=op5 | awk -F";" '$4 == "" {print $1}'

To check this against my OP5 configuration, I just have to ask my monitoring system if the host is monitored. Had I used an older version of OP5, I would have done this by either using grep on /opt/monitor/etc/hosts (grep host_name /opt/monitor/etc/hosts.cfg | grep kmg-guran-0001 | wc -l) or connecting to the merlin database and issuing a clever sql query (no example).

But now, we are on version 6, where Op5 are nowadays using MK Livestatus, which in itself deserves some attention. Long story short; instead of parsing text files or updating a database, MK Livestatus is used to hook into Nagios to keep track of the configuration and the status of the system. The benefit: less disk IO. Asking your monitoring installation about more or less anythings is now very easy, communicating with MK Livestatus over a unix socket. In this case, I will make an extremely simple query, give me the host name of a configured host, that has the host name xxyy. For more inspirational references, look here:


[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# printf "GET hosts\nColumns: host_name host_address\nFilter: host_name = kmg-guran-0001\n" | unixcat /opt/monitor/var/rw/live

We put this together into a check_script, check_metaMonitoring_vmWare, which I use to keep track of unmonitored systems.

[cce lines=”-1″]

root@op5-v005fry:/opt/plugins/kmg# ./check_metaMonitoring_vmWare  2>/dev/null
WARN – H: 19 M: 7 !M: 12 ok!M: 8 nok!M: 4
Hosts:  kmg-plex-0001 jira-v001fry op5-v005fry vcenter-v001fry
| hosts=19 monitored=7 notMonitored=12 okNotMonitored=8 nokNotMonitored=4

I have added this as a service check to my installation (just add the command to checkcommands.cfg and add a service check to your vcenter host in your monitoring), and can see the following:

meta monitoring - service check

In the output you can see the following:

  • H: 19 -> VMs in this installation
  • M: 8 -> Number of monitored VM’s
  • !M: 11 -> Number of VM’s that are not monitored
  • ok!M: 8 -> Non monitored VM’s that are ok (to not be monitored)
  • nok!M: 3 -> Not OK -> This is what we try and catch, VM’s that should be monitored.

What can you do to remedy this? You have two possibilities:

  1. Add the VM’s to your monitoring system
  2. Add a comment in the “noMonitoring” fields in your vCenter

Simple as that. I guess I have to add a few VM’s to my monitoring now.

Here, the sweets:


[ccne lines=”-1″]
## -----------------------------------------------
# Script: getVMsAndCustomAtributes
# Author:
# Date: 2013-05-20
# Description: This script will output a semicolon ";" separated list
#              of VMs from a vCenter, together with the custom
#              attributes:
#              - noMonitoring  - Empty field = VM should be monitored
#              - noMonitoring  - Non empty   = good excuse for not monitoring
#              - ownerCustomer
#              - ownerTech
# Usage: ./ --server=VCENTER --username=USERNAME --password=PASSWORD
## Script inspired by/to large extent copied from Reuben Stump
## (the original reference URLs were lost in the blog formatting)
## -----------------------------------------------

use strict;
use warnings;

use VMware::VIRuntime;

# NOTE(review): the original listing was mangled by blog formatting (smart
# quotes, a stripped property name, missing session setup and the output
# printf). The standard vSphere SDK for Perl session boilerplate is restored
# below — confirm against the original script.
Opts::parse();
Opts::validate();
Util::connect();

# Fetch all VirtualMachines from SDK, limiting the property set.
# NOTE(review): the second property name was stripped in the paste; the host
# lookup further down reads it per VM, so 'runtime.host' is assumed — confirm.
my $vm_views = Vim::find_entity_views(view_type => "VirtualMachine",
    properties => ['name', 'runtime.host', 'datastore', 'summary']) ||
    die "Failed to get VirtualMachines: $!";

# Fetch all HostSystems from SDK, limiting the property set
my $host_views = Vim::find_entity_views(view_type => "HostSystem",
    properties => ['name']) ||
    die "Failed to get HostSystems: $!";

# Fetch all Datastores from SDK, limiting the property set
my $datastore_views = Vim::find_entity_views(view_type => "Datastore",
    properties => ['name']) ||
    die "Failed to get Datastores: $!";

# Create hash tables with key = entity.mo_ref.value
my %host_map = map { $_->get_property('mo_ref.value') => $_ } @{ $host_views || [] };
my %ds_map   = map { $_->get_property('mo_ref.value') => $_ } @{ $datastore_views || [] };

#--- The correlation between custom field ID and its name is only found in
#--- the customFields manager
my $sc = Vim::get_service_content();
my $customFieldsMgr = Vim::get_view( mo_ref => $sc->customFieldsManager );

# Create hash table with key = keyName => value
my %keys_map = map { $_->name => $_->key } @{ $customFieldsMgr->field || [] };

# Enumerate VirtualMachines
printf("#vm;onHost;dataStore;noMonitoring;ownerCustomer;ownerTech\n");
foreach my $vm ( @{$vm_views || []} ) {
    # Get HostSystem from the host map
    my $host_ref = $vm->get_property('runtime.host')->{'value'};
    my $host = $host_map{$host_ref};

    # Get array of datastore moref values
    my @ds_refs = map($_->{'value'}, @{$vm->get_property('datastore') || []});

    # Get array of datastore entities from the datastore map by slicing %ds_map
    my @datastores = @ds_map{@ds_refs};

    # Map the custom field values to a hash
    my %cVals = map { $_->key => $_->value } @{$vm->summary->customValue || []};

    my $noMonitoring  = "";
    my $ownerCustomer = "";
    my $ownerTech     = "";

    $noMonitoring  = $cVals{$keys_map{"noMonitoring"}}  if (defined($cVals{$keys_map{"noMonitoring"}}));
    $ownerCustomer = $cVals{$keys_map{"ownerCustomer"}} if (defined($cVals{$keys_map{"ownerCustomer"}}));
    $ownerTech     = $cVals{$keys_map{"ownerTech"}}     if (defined($cVals{$keys_map{"ownerTech"}}));

    # NOTE(review): the output statement was missing from the original paste;
    # reconstructed from the sample output, where every field is followed by
    # a semicolon ("name;host;datastores;noMonitoring;ownerCustomer;ownerTech;").
    printf("%s;%s;%s;%s;%s;%s;\n",
        $vm->get_property('name'),
        (defined($host) ? $host->get_property('name') : ''),
        join(',', map($_->get_property('name'), @datastores)),
        $noMonitoring,
        $ownerCustomer,
        $ownerTech);
}

Util::disconnect();


# Disable SSL hostname verification for vCenter self-signed certificate

[2] kmg# cat check_metaMonitoring_vmWare

[ccne lines=”-1″]

## -----------------------------------------------
# Script: check_metaMonitoring_vmWare
# Author:
# Date: 2013-05-20
# Description: This script will check if your VMs are monitored
#              in your Op5-environment.
#
# NOTE(review): this listing was truncated/garbled by the blog formatting:
#  - the line starting with "unixcat <" below fuses the start of the MK
#    Livestatus query with the tail of the per-host loop body; the loop
#    header that reads the getVMsAndCustomAtributes output, the counter
#    initialisation ($numHosts etc.) and the $thresholdWarning /
#    $thresholdCritical assignments are missing from the paste.
#  - the smart quotes and en-dashes below must be straight ASCII quotes
#    and minus signs in the real script.
## -----------------------------------------------

# Locate this script's directory and the MK Livestatus socket path, which is
# read from the broker_module line in nagios.cfg.
this_dir=$(cd `dirname $0`;pwd)
live_path=$(awk ‘/broker_module.*live/ { print $NF}’ /opt/monitor/etc/nagios.cfg)




# Per-host classification (truncated — see NOTE above): count monitored vs.
# not-monitored hosts, writing progress to stderr so only the plugin summary
# goes to stdout.
unixcat <&2 ; (( numMonitoredHosts += 1 )) ; }
[ -z “$result” ] && { echo “$hostName is NOT monitored” 1>&2 ; (( numNotMonitoredHosts += 1 )) ; }

#— the secret sauce – noMonitoring field is empty -> should be monitored
[[ -n “$noMonitoring” && -z “$result” ]] && { echo ” – But does not have to: $noMonitoring” 1>&2 ; (( numNotMonitoredWithGoodExcuseHosts += 1 )) ; }
[[ -z “$noMonitoring” && -z “$result” ]] && { echo ” – Should be monitored” 1>&2 ; (( numNotMonitoredWithoutExcuseHosts += 1 )) ; hostsToOutput=”$hostsToOutput $hostName” ; }

# Map the count of hosts that should be monitored (but are not) onto Nagios
# exit codes: 0/OK, 1/WARN, 2/CRIT. The crit test runs last so it overrides
# the warn assignment when both thresholds are exceeded.
[ $numNotMonitoredWithoutExcuseHosts -le $thresholdWarning ] && { retVal=0 ; retPrefix=OK ; }
[ $numNotMonitoredWithoutExcuseHosts -gt $thresholdWarning ] && { retVal=1 ; retPrefix=WARN ; }
[ $numNotMonitoredWithoutExcuseHosts -gt $thresholdCritical ] && { retVal=2 ; retPrefix=CRIT ; }

# Plugin output: one-line summary, the offending host list, then perfdata.
echo “$retPrefix – H: $numHosts M: $numMonitoredHosts !M: $numNotMonitoredHosts ok!M: $numNotMonitoredWithGoodExcuseHosts nok!M: $numNotMonitoredWithoutExcuseHosts”
[ -n “$hostsToOutput” ] && echo “Hosts: $hostsToOutput”
echo “| hosts=$numHosts monitored=$numMonitoredHosts notMonitored=$numNotMonitoredHosts okNotMonitored=$numNotMonitoredWithGoodExcuseHosts nokNotMonitored=$numNotMonitoredWithoutExcuseHosts”

exit $retVal

Sawtooth – The power of a waveform!

Triangles are nice. They are robust, the strongest shape of them all. A triangle will also help you spot anomalies in contextually complex situations. Today we will use this shape to make sure that your backups are running properly, as well as showing you one of the amazing capabilities of the human brain; pattern recognition.

This is just one example of how you can use arithmetic on timestamps to get  more or less anything under control. Here is a good example of something I am trying to achieve today:

The triangle is simple, you know what to expect from it. And that, is the whole point of this blog entry.

Like in any good cooking show, I prepared the dish in beforehand. This is an example of what I can see in my OP5 monitoring. At this point it does not matter what the graph shows. Look at it for a few seconds, then answer the following questions:

  1. When did I have a problem with my backups (it did not run)?
  2. When could my monitoring system _not_ get any information from my backup system?


You see? You could answer these two questions. If you by any chance could not come up with the answers, you are either tired, or not really the target group of this blog. Without knowing anything about my system you easily could spot the exceptions in the pattern.

The graph shows the age in seconds of the last successful backup of my file share data. My backup policy is to make a backup (incremental, more about that in a different blog entry) every four hours. But even so, it doesn’t really matter what my backup schedule is. Given your inherited human skill of being able to recognize patterns, the two abnormalities just popped out in your face.

If you didn’t see it, and still find this blog interesting, the answer is: Just before midnight the 31st of whatever month it displays, my monitoring system could not gather this data (empty spot in the graph). And, all by a sudden, just before midnight between the 1st and the second, my backups stopped running (or failed, remember the graph shows the age of the last successful backup).

And now over to the long, interesting explanation on how I got there.


I am using rsnapshot for my backups. There are several reasons and considerations behind this, but the interesting point is that I really want to know that this is working (disclaimer: this type of monitoring does not guarantee anything), and my implementation outputs logs into /var/log/rsnapshot.log where a successful backup looks like this:


[14/Aug/2012:16:27:54] /usr/bin/rsnapshot hourly: completed successfully


So, basically, since I am interested in the age of the last successful backup, I can simply filter the logfile for this (grep), use the last line of the output (tail -1), and get the timestamp (awk, tr -d”[]”).


cur_output=$(grep “successfully” /var/log/rsnapshot.log | tail -1 | awk ‘{print $1}’ | tr -d “[]” )


This, of course, has to be scrubbed a bit, since the output is a timestamp that is not really machine readable. And, it is a timestamp, not an age. I know, my way of doing it is a little bit complicated. But it is the way I learned to do it many years ago, and it is hard to teach an old dog how to sit.


#--- the timestamp is in a really weird format; [14/Aug/2012:16:27:54]
curDate=$(echo "$cur_output" | awk -F":" '{print $1}')
curTime=$(echo "$cur_output" | awk -F":" '{print $2":"$3":"$4}')

#--- split the date part into day month year
#--- NOTE(review): the original piped echo|awk into `read`; under bash the
#--- `read` runs in a subshell and the variables are lost. A here-string
#--- keeps the read in the current shell.
read -r curDay curMonth curYear <<< "$(echo "$curDate" | awk -F"/" '{print $1, $2, $3}')"

#--- get the age of the last successful backup
#--- %s returns the number of seconds since 1.1.1970, epoch
lastBackupTime=$(date -d "$curMonth $curDay $curYear $curTime" "+%s")
nowTime=$(date "+%s")
(( lastOkBackupAge = nowTime - lastBackupTime ))


In principle what I do, is to convert the time to unix_timestamp (seconds from epoc, 1st of January, 1970), then subtract this from the current time. This gives me the number of seconds that has passed since the last successful backup until now. I do this through OP5/Nagios every 5 minutes, and the backups are supposed to run every 4 hours. In between backups, the output of my script will show an increasing age of the last successful backup, until just after a new backup, where the age is close to 0 seconds old.

So, the configuration for the whole setup is done on:

  • OP5 server
  • The backup server

On the OP5 server, checkcommands.cfg:


# command 'kmgBackup'
# Runs the kmg_backup NRPE command on the backup server kmg-sandbox-0003.
define command{
    command_name kmgBackup
    command_line $USER1$/check_nrpe -H kmg-sandbox-0003 -c kmg_backup
    }

On the OP5 server, services.cfg:


# service 'Rsnapshot backup'
# Age-of-last-successful-backup check, executed via NRPE on the backup host.
define service{
    use                 default-service
    host_name           kmg-sandbox-0003
    service_description Rsnapshot backup
    check_command       kmgBackup
    }


On the backup server, /etc/nrpe.d/kmg_commands.cfg (any filename ending with .cfg will do):


malu@kmg-sandbox-0003:/etc/nrpe.d $cat kmg_commands.cfg
#--- kmg backup
# NOTE: the NRPE command definition was lost in the blog formatting; it wires
# the kmg_backup command (called by check_nrpe above) to the checkBackup
# script shown further down:
command[kmg_backup]=/app/prd/op5/bin/checkBackup

And, at last, the script that checks the backups:

[ccne lines=”-1″]

malu@kmg-sandbox-0003:/app/prd/op5/bin $cat /app/prd/op5/bin/checkBackup


# Print the age (in seconds) of the last successful rsnapshot backup.
getLastComplete() {

  #--- We are looking for lines like this. Only the timestamp of the last one is interesting
  #--- [14/Aug/2012:16:27:54] /usr/bin/rsnapshot hourly: completed successfully
  cur_output=$(grep "successfully" /var/log/rsnapshot.log | tail -1 | awk '{print $1}' | tr -d "[]")

  #--- the timestamp is in a really weird format; [14/Aug/2012:16:27:54]
  curDate=$(echo "$cur_output" | awk -F":" '{print $1}')
  curTime=$(echo "$cur_output" | awk -F":" '{print $2":"$3":"$4}')

  #--- split the date part into day month year (here-string keeps the
  #--- variables in the current shell; a pipe into `read` would not, in bash)
  read -r curDay curMonth curYear <<< "$(echo "$curDate" | awk -F"/" '{print $1, $2, $3}')"

  #--- get the age of the last successful backup
  #--- %s returns the number of seconds since 1.1.1970, epoch
  lastBackupTime=$(date -d "$curMonth $curDay $curYear $curTime" "+%s")
  nowTime=$(date "+%s")
  (( lastOkBackupAge = nowTime - lastBackupTime ))

  #--- echo the age of the last successful backup, in seconds
  echo $lastOkBackupAge
}

# NOTE(review): the tail of the original listing was truncated by the blog
# formatting (the bodies of the two `&& {` threshold blocks were lost). The
# wiring below is reconstructed from the visible fragments — the hard-coded
# threshold comments, the two output lines, and the retVal/retPrefix pattern
# this author uses in check_metaMonitoring_vmWare. Confirm against the
# original script.
backupAge=$(getLastComplete)

#--- hard coded crit and warn
# - 18000 seconds is 5 hours
# - 86400 seconds is 24 hours
retVal=0 ; retMessage=OK
[ "$backupAge" -gt 18000 ] && { retVal=1 ; retMessage=WARN ; }
[ "$backupAge" -gt 86400 ] && { retVal=2 ; retMessage=CRIT ; }

echo "$retMessage - Backup $backupAge seconds old"
echo "| backupAge=$backupAge"

exit $retVal