Nagios/checks/solaris CAM

Define Service
This adds a new service and service group (the actual check definition). Add it to the local services folder on the Nagios server:

'/opt/nagios/etc/local/services'

define service {
    service_description    solaris_cam
    display_name           solaris common array manager
    # the CAM software runs on host hundsglump.muc
    check_command          check_nrpe!check_sstcam
    is_volatile            1
    max_check_attempts     1
    servicegroups          solaris_CAM
    hostgroup_name         solaris_CAM
}
(a service group to look at all CAM monitoring)
 * 1) cat solaris-cam.cfg

Define a Service group
/opt/nagios/etc/local/servicegroups

define servicegroup {
    servicegroup_name      solaris_CAM
    alias                  solaris common array manager
}
 * 1) cat solaris-cam.cfg

=
=========================================

Define a host-template
/opt/nagios/etc/local/host-templates

cat solaris-cam.cfg
define host {
    name           solaris_CAM
    hostgroups     +solaris_CAM
    register       0
}

/opt/nagios/etc/local/hostgroups

cat solaris-cam.cfg
define hostgroup {
    hostgroup_name solaris_CAM
    alias          solaris common array manager
}

Add check to host
define host { use            solaris_CAM x x x

}

Check config
/opt/nagios/bin/nagios -v /opt/nagios/etc/nagios.cfg

NRPE Command definition
Add this to the client in the commands folder '/opt/nagios/nrpe/nrpe-commands'

command[check_sstcam]=/opt/nagios/plugins/libexec/check_sstcam
 * 1) cat > sstcam.cfg

add the check script to plugins folder

'/opt/nagios/plugins/libexec/'

then make it executable

chmod u+x /opt/nagios/plugins/libexec/check_sstcam

and a quick `chown -R nagios:nagios /opt/nagios` never did any harm

then restart nrpe '/etc/init.d/nrpe restart'

Check script [check_sstcam]
#! /usr/bin/perl -w
#
# check_sstcam - nagios plugin which checks for alerts generated by the
#                solaris storage tek common array manager
#
# How it works:
#   * every readable file named alarm<digits> in $ALARMDIR is parsed into
#     one event (a hash of fields keyed by the event id),
#   * the set of events seen by the previous run is loaded from a status
#     file in $STATUSDIR,
#   * events that are present now but were not present in the previous run
#     are reported, grouped by device name,
#   * the current set of events is written back to the status file.
#
# With -p the saved state is ignored, so every existing alarm file counts
# as new on every run and the plugin stays non-OK while alarm files exist.
#
use strict;
use Data::Dumper;
use File::Spec;
use IO::File;
use Getopt::Long qw(:config no_ignore_case getopt_compat);
use vars qw($PROGNAME $REVISION $TIMEOUT $ALARMDIR $STATUSDIR
    $opt_V $opt_h $opt_t $opt_v $opt_n $opt_f $opt_p);

# Nagios plugin return codes, by name and by number.
my %ERRORS = ( OK => 0, WARNING => 1, CRITICAL => 2, UNKNOWN => 3 );
my %ERRORCODES = ( 0 => 'OK', 1 => 'WARNING', 2 => 'CRITICAL', 3 => 'UNKNOWN' );

$PROGNAME  = "check_sstcam";
$REVISION  = '$Revision: 1.2 $';
$TIMEOUT   = 10;
$STATUSDIR = "/opt/nagios/etc/logchecks";
$ALARMDIR  = "/var/opt/SUNWsefms/store/Alarms";

# Print the short command line synopsis and the option list to STDOUT.
sub print_usage {
    print "Usage:\n";
    print " $PROGNAME [-t <timeout>] [-n <devicename> | -f <configfile>]\n";
    print " $PROGNAME [-h | --help]\n";
    print " $PROGNAME [-V | --version]\n";
    print "\n\nOptions:\n";
    print " -t, --timeout\n";
    print "    The number of seconds after which the plugin will abort\n";
    print " -p, --persistent\n";
    print "    Stay critical as long as there are alarm files\n";
    print " -h, --help\n";
    print "    Print detailed help screen\n";
    print " -V, --version\n";
    print "    Print version information\n\n";
    return;
}

# Print the detailed help screen: copyright, usage, description, support.
sub print_help {
    print "Copyright (c) 2007 Gerhard Lausser\n\n";
    print_usage();
    print "\n";
    print " Check the solaris storage tek common array manager alert files\n";
    print "\n";
    print "The default is to check all devices managed by the cam.\n";
    print "By providing a certain device's name with the -n option, you can\n";
    print "limit checking for this device only.\n";
    print "By using a config file you can name more than just one devices to\n";
    print "monitor. Provide the filename with the -f option and specify the\n";
    print "device names in the config file with a line like:\n";
    print "\@devicenames = qw(devicename-1 devicename-2 devicename-3);\n";
    print "\n";
    # Explicit call parentheses: support() is defined further down, and a
    # bare "support;" is rejected as a bareword under "use strict".
    support();
    return;
}

# Print "<name> <revision>" followed by the warranty notice.
#   $commandName    - program name to print
#   $pluginRevision - RCS style '$Revision: x.y $' string; the keyword
#                     wrapper is stripped before printing
sub print_revision {
    my ($commandName, $pluginRevision) = @_;
    $pluginRevision =~ s/^\$Revision: //;
    $pluginRevision =~ s/ \$\s*$//;
    print "$commandName $pluginRevision\n";
    print "This nagios plugin comes with ABSOLUTELY NO WARRANTY. You may redistribute\ncopies of this plugin under the terms of the GNU General Public License.\n";
    return;
}

# Print the support contact text.
sub support {
    # Single quoted on purpose: the literal two-character sequences \n are
    # turned into real newlines by the substitution below.
    my $support = 'Send email to gerhard.lausser@consol.de if you have questions\nregarding use of this software. \nPlease include version information with all correspondence (when possible,\nuse output from the --version option of the plugin itself).\n';
    $support =~ s/@/\@/g;
    $support =~ s/\\n/\n/g;
    print $support;
    return;
}

# Append a timestamped line to /tmp/<progname>.trace.
# Tracing is enabled simply by creating that file; if it does not exist
# nothing is written. Arguments are a printf format and its values.
sub trace {
    my $format = shift;
    my $tracefile = "/tmp/".$PROGNAME.".trace";
    return unless -f $tracefile;
    my $logfh = IO::File->new;
    if ($logfh->open($tracefile, "a")) {
        $logfh->printf("%s: ", scalar localtime);
        $logfh->printf($format, @_);
        $logfh->printf("\n");
        $logfh->close;
    }
    return;
}

# Read every alarm file in $ALARMDIR and return a hash reference
#   { <eventid> => { eventid => ..., devicename => ..., severity => ... } }
# Files that cannot be read, or that yield no event id, are skipped (and
# mentioned in the trace file).
sub getcurrentevents {
    my $tmpevents = {};

    # FIXME(review): the '#' delimiter and the /s flag on 'description'
    # indicate that each pattern originally wrapped the capture group in an
    # opening and a closing XML element (e.g. <X>(.*?)</X>). That markup was
    # lost when this script was pasted into the page, and the element names
    # cannot be recovered from here. The patterns are kept exactly as found;
    # restore them from an original copy of check_sstcam, or from a real
    # alarm file, before relying on this plugin.
    my $patterns = {
        'eventid'     => qr#(.*?) #,
        'devicename'  => qr#(.*?) #,
        'id'          => qr#(.*?) #,
        'devicetype'  => qr#(.*?) #,
        'description' => qr#(.*?) #s,
        'severity'    => qr#(.*?) #,
        'devicekey'   => qr#(.*?) #,
        'datecreated' => qr#(.*?) #,
        'state'       => qr#(.*?) #,
    };

    trace("looking for alarm files");
    foreach my $alarmfile (glob $ALARMDIR.'/alarm*') {
        # Only files named exactly alarm<digits> are alarm files.
        next if $alarmfile !~ /^.*\/alarm\d+$/;
        my $fh;
        if (-r $alarmfile && open($fh, '<', $alarmfile)) {
            trace("opened alarm file %s", $alarmfile);
            my $xml = do { local $/; <$fh> };   # slurp the whole file
            close $fh;
            $xml = '' unless defined $xml;      # empty file
            my $tmpevent = {};
            foreach my $field (keys %{$patterns}) {
                if ($xml =~ /$patterns->{$field}/) {
                    $tmpevent->{$field} = $1;
                    # Description may be multiline; flatten it.
                    $tmpevent->{$field} =~ s#\n# #g;
                }
            }
            if (defined $tmpevent->{eventid}) {
                $tmpevents->{$tmpevent->{eventid}} = $tmpevent;
            } else {
                # Without an id the event cannot be tracked between runs.
                trace("alarm file %s contains no event id, skipped", $alarmfile);
            }
        } else {
            trace("cannot open alarm file %s", $alarmfile);
        }
    }
    return $tmpevents;
}

# Load the events saved by the previous run and return them as a hash
# reference (empty if there is no status file or it cannot be loaded).
sub loadevents {
    my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
    # Package variable on purpose: the status file is Perl code written by
    # saveevents() which assigns to $events when it is loaded.
    our $events = {};
    if (-f $statusfile) {
        trace("loading saved events");
        eval { require $statusfile; };
        # Best effort: a damaged status file only means that all current
        # events are treated as new. Leave a hint in the trace file.
        trace("cannot load saved events: %s", $@) if $@;
    }
    $events = {} unless ref($events) eq 'HASH';
    return $events;
}

# Write the given events (hash reference) to the status file as a
# Data::Dumper dump that loadevents() can load again.
sub saveevents {
    my $events = shift;
    my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
    trace("saving current events");
    local $Data::Dumper::Indent = 1;
    my $dump = Data::Dumper->Dump([$events], [qw(events)]);
    if (open my $snap, '>', $statusfile) {
        printf {$snap} "%s\n", $dump;
        close $snap
            or trace("cannot write status file %s: %s", $statusfile, $!);
    } else {
        # Not fatal, but the same events will be reported again next run.
        trace("cannot open status file %s: %s", $statusfile, $!);
    }
    return;
}

my $exitcode = $ERRORS{UNKNOWN};
my $exitmessage = "you should never see this message";
my @warnings = ();
my @criticals = ();

# chdir without an argument changes to the home directory; a relative
# config file name given with -f is therefore resolved from there.
chdir;

if (! GetOptions(
    "t|timeout=i"  => \$opt_t,
    "n|name=s"     => \$opt_n,
    "f|config=s"   => \$opt_f,
    "p|persistent" => \$opt_p,
    "V|version"    => \$opt_V,
    "h|help"       => \$opt_h,
    "v|verbose"    => \$opt_v,
)) {
    print_help();
    exit $ERRORS{UNKNOWN};
}

if ($opt_t) {
    $TIMEOUT = $opt_t;
}

# Abort with UNKNOWN if the check takes longer than $TIMEOUT seconds.
$SIG{'ALRM'} = sub {
    printf "UNKNOWN - %s timed out after %d seconds\n", $PROGNAME, $TIMEOUT;
    exit $ERRORS{UNKNOWN};
};
alarm($TIMEOUT);

if ($opt_V) {
    print_revision($PROGNAME, $REVISION);
    exit $ERRORS{OK};
}

if ($opt_h) {
    print_help();
    exit $ERRORS{OK};
}

#
# read a configfile with whitelisted devices
#
# Package variable on purpose: the config file is Perl code that assigns
# to @devicenames when it is loaded.
our @devicenames = ();
if ($opt_n) {
    @devicenames = ($opt_n);
} elsif ($opt_f) {
    $opt_f = (-f $opt_f.'.cfg') ? $opt_f.'.cfg' : $opt_f;
    if (-f $opt_f) {
        # require searches @INC for relative names, and the current
        # directory is not part of @INC on current perls; load the file
        # by its absolute path.
        my $configfile = File::Spec->rel2abs($opt_f);
        eval { require $configfile; };
        if ($@) {
            printf "syntax errors in config file %s: %s\n", $opt_f, $@;
            exit $ERRORS{UNKNOWN};
        }
    } else {
        printf STDERR "cannot open config file %s\n", $opt_f;
        exit $ERRORS{UNKNOWN};
    }
}

#
# todo: decide wether this is the active node of a clustered installation
#
#
# todo: check for processes and alert if cam is not running
#
#
# check for alarm files
#
if (-d $ALARMDIR) {
    my $currentevents = getcurrentevents();
    # In persistent mode nothing counts as already known.
    my $savedevents = $opt_p ? {} : loadevents();
    my @neweventids = ();

    if (! @devicenames) {
        # empty devicenames means: monitor all known devices
        my %seen = ();
        @devicenames = grep { ! $seen{$_}++ }
            grep { defined }
            map { $currentevents->{$_}->{devicename} }
            keys %{$currentevents};
    }

    # Events from the previous run: still present or cleared meanwhile.
    # This is informational only (trace file / verbose output).
    foreach my $eventid (keys %{$savedevents}) {
        if (exists $currentevents->{$eventid}) {
            trace("already known event %s", $eventid);
            printf STDERR "already known event %s\n", $eventid if $opt_v;
            # TODO: the severity may have increased in the meantime; this
            # is not checked yet.
        } else {
            trace("cleared event %s", $eventid);
            printf STDERR "cleared event %s\n", $eventid if $opt_v;
        }
    }

    # Events that were not present in the previous run.
    foreach my $eventid (keys %{$currentevents}) {
        if (! exists $savedevents->{$eventid}) {
            push(@neweventids, $eventid);
            trace("new event %s", $eventid);
            printf STDERR "new event %s\n", $eventid if $opt_v;
        }
    }

    foreach my $device (@devicenames) {
        my @devcriticals = ();
        my @devwarnings = ();
        foreach my $eventid (@neweventids) {
            my $event = $currentevents->{$eventid};
            next unless defined $event->{devicename}
                && $event->{devicename} eq $device;
            # Events without a numeric severity cannot be classified.
            next unless defined $event->{severity}
                && $event->{severity} =~ /^\s*\d+\s*$/;
            my $description = defined $event->{description}
                ? $event->{description} : '';
            # Severity 2 is reported as a warning, 3 and 4 as critical;
            # any other severity is ignored.
            if ($event->{severity} == 2) {
                push(@devwarnings, $description);
            } elsif ($event->{severity} == 3 || $event->{severity} == 4) {
                push(@devcriticals, $description);
            }
        }

        # e.g. "1 error, 2 warnings"
        my @counts = ();
        push(@counts, sprintf("%d error%s", scalar(@devcriticals),
            scalar(@devcriticals) == 1 ? "" : "s")) if @devcriticals;
        push(@counts, sprintf("%d warning%s", scalar(@devwarnings),
            scalar(@devwarnings) == 1 ? "" : "s")) if @devwarnings;
        my $statistics = join(", ", @counts);

        if (@devcriticals) {
            push(@criticals, sprintf "Storage %s (%s): %s", $device,
                $statistics, join(", ", (@devcriticals, @devwarnings)));
        } elsif (@devwarnings) {
            push(@warnings, sprintf "Storage %s (%s): %s", $device,
                $statistics, join(", ", @devwarnings));
        }
    }

    saveevents($currentevents);

    if (@criticals) {
        $exitmessage = sprintf "%s", join(" // ", @criticals, @warnings);
        $exitcode = $ERRORS{CRITICAL};
    } elsif (@warnings) {
        $exitmessage = sprintf "%s", join(" // ", @warnings);
        $exitcode = $ERRORS{WARNING};
    } else {
        $exitmessage = sprintf "cam detected no new errors";
        $exitcode = $ERRORS{OK};
    }
} else {
    $exitmessage = "Alarm directory does not exist";
    $exitcode = $ERRORS{UNKNOWN};
}

printf "%s - %s\n", $ERRORCODES{$exitcode}, $exitmessage;
exit $exitcode;
# cat check_sstcam
#! /usr/bin/perl -w
#
# check_sstcam - nagios plugin which checks for alerts generated by the
#                solaris storage tek common array manager
#
# Copyright (C) 2007 Gerhard Lausser, gerhard.lausser@consol.de
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# Report bugs to:  gerhard.lausser@consol.de
#
# 2007-02-13 1.0        initial release
# 2007-02-19 1.1        added -n and -f options to check only certain devices.
# 2008-01-17 1.2        added -p for persistency. bugfix with multiline descr.
#
# check_sstcam watches the alertfiles generated by the common array manager.
# any changes will be reported immediately. in the service definition
# is_volatile and max_check_attempts must be set to 1.
# if -p is used, the usual way with is_volatile 0 and max_check_attempts > 1
# is also possible.
#
# read a configfile with whitelisted devices
#
# todo: decide whether this is the active node of a clustered installation
#
# todo: check for processes and alert if cam is not running
#
# check for alarm files