#!/opt/vdops/bin/perl

# This script queries status variables in NETWORK-APPLIANCE-MIB and
# produces a report identifying salient issues.

# V     Who      When       What
# ---------------------------------------------------------------------------
# 1.3.1 skendric 2011-02-25 Ignore miscGlobalStatus.0
# 1.3.0 skendric 2011-02-21 Upgrade to Netops 1.4.0
# 1.2.4 skendric 2010-12-30 Add @insane to report
# 1.2.3 skendric 2010-12-17 Futz with owner/owner_backup
# 1.2.2 skendric 2010-05-23 Change fullCharged to fullyCharged
# 1.2.1 skendric 2010-04-16 Fix bug in use of smart matching
# 1.2.0 skendric 2010-02-01 Upgrade to perl 5.10.1
# 1.1.2 skendric 2009-04-19 Distinguish between silent and unresponsive
# 1.1.1 skendric 2009-04-18 Ignore perennially full filers
# 1.1.0 skendric 2009-04-06 Refine how we alarm on disk issues
# 1.0.9 skendric 2009-03-18 Consider nearlyFull volumes to be ok
# 1.0.8 skendric 2009-03-01 Consider scrubbingInProgress to be ok
# 1.0.7 skendric 2007-12-07 Add owner
# 1.0.6 skendric 2007-08-25 Log all disk query results
# 1.0.5 skendric 2007-06-18 Consider scrubbingInProgress to be ok
# 1.0.4 skendric 2007-06-07 Report fiddles
# 1.0.3 skendric 2007-03-21 Stylistic mods
# 1.0.2 skendric 2007-02-15 Support non-clustered configurations
# 1.0.1 skendric 2007-02-08 Log results of each query
# 1.0.0 skendric 2007-01-23 First Version
#
# Author: Stuart Kendrick, sbk {put at sign here} skendric {put dot here} com
#
# Source: http://www.skendric.com/device
#
# This software is available under the GNU GENERAL PUBLIC LICENSE, see
# http://www.fsf.org/licenses/gpl.html
#
#
# This script takes the following approach:
#  -Parses the hosts table for a list of targets (or accepts a command-
#   line list)
#  -Queries a bunch of NetApp-specific variables
#  -Produces a report
#
#
# Requirements:
#  -The target(s) must be pingable
#
#  -PERL modules:  the WI::Netops collection
#
#
# Assumptions:
#
#
# Tested on:
#  -FAS3050 running 7.1.1, FAS3020 running 7.2.2
#  -perl-5.12.2
#  -net-snmp-5.6
#
#
# Instructions:
#  -Customize the script for your site:  find the 'user-configurable
#   variables' section and modify as appropriate
#  -Type "netapp-alarm" to see the command-line options
#  -Try it out
#
#
#
# Caveats:
#
#
# Known Bugs:
#
#
# To do:
#

# Begin script

# Load modules
use strict;
use warnings;
use feature 'say';
use feature 'switch';
use Carp qw(carp cluck croak confess);
use Data::Dumper;
use English qw( -no_match_vars );
use Getopt::Std;
use List::MoreUtils qw(any uniq);
use List::Util qw(sum);
use WI::Netops::HostTools 1.0.4;
use WI::Netops::NetopsTools 2.2.3;
use WI::Netops::NetopsData 1.4.0;
use WI::Netops::PingTools 1.1.7;
use WI::Netops::SNMPTools 1.5.3;
use WI::Netops::Utilities 1.4.4;

# Declare global variables
# Each per-target hash below is keyed by target name; the trailing comment
# names the MIB variable whose verdict it holds.
my %global;                  # miscGlobalStatus.0
my %failover;                # cfState.0
my %interconnect;            # cfInterconnectStatus.0
my %temperature;             # envOverTemperature.0
my %fan;                     # envFailedFanCount.0
my %power;                   # envFailedPowerSupplyCount.0
my %nvram;                   # nvramBatteryStatus.0
my %autosupport;             # autosupportStatus.0
my %space;                   # fsOverallStatus.0
my %failed_disks;            # diskFailedCount.0
my %spare_disks;             # diskSpareCount.0
my %stale_disks;             # outOfDateDiskCount.0
my %storage;                 # Roll-up of many variables
my $spare_disk_threshhold;   # Number of spare disks below which I will
                             # complain
my @typically_full;          # List of targets whose file systems are often
                             # full ...
# so often, that we don't want to hear
# about them anymore

# Define global variables
$program_name = 'netapp-alarm';
$usage        = 'Usage: netapp-alarm -s {yes|no} [-d {integer}] [-r] [-a | -e {expr} | -f {filename} | target1 target2 target3 ...]';
$version      = '1.3.1';

# Ignore space alarms
@typically_full = qw/cadmium horton neon netappa9 netappb9/;

# Thresholds
$spare_disk_threshhold = 1;

# Grab arguments
getopts('ad:e:f:rs:', \%option);
@target = @ARGV;

# Set mode
if    ($option{r}) { $mode = 'report' }
elsif (-t STDIN)   { $mode = 'interactive' }
else               { $mode = 'batch' }


### Begin Main Program ###############################################

{
    check_args();        # Check arguments
    read_config();       # Read Netops config file
    compile_mibs();      # Compile MIB files
    build_target();      # Populate @target
    target_check();      # Look for errors in @target
    basic_info();        # Gather information
    sanity_check();      # Check for major errors
    do_the_work();       # Do the work
    identify_alarms();   # Count devices with alarms
    write_alarm_log();   # Record issues
    print_report();      # Print report
    notify_staff();      # Mail report
}

##### End Main Program ###############################################


########################################################################
# Helpers for do_the_work
########################################################################

# Fetch one scalar OID from $target.  Returns the agent's value, or
# $QUERY when snmpGet returns undef, so callers never handle undef.
sub _get_scalar {
    my ($target, $oid) = @_;

    my $val = snmpGet( {host => $target, oid => $oid} );
    return $val // $QUERY;
}

# Return 'ok' when $val is string-equal to one of @ok_values, else 'bad'.
sub _verdict {
    my ($val, @ok_values) = @_;

    return (any { $val eq $_ } @ok_values) ? 'ok' : 'bad';
}

# Return 'ok' only when $val is an unsigned integer equal to zero.
# Anything else, including a non-numeric value, is 'bad'; this avoids
# the numeric coercion that lets a non-numeric string compare equal to 0.
sub _count_verdict {
    my ($val) = @_;

    return 'bad' unless $val =~ /\A\s*\d+\s*\z/;
    return $val == 0 ? 'ok' : 'bad';
}

# Store $answer for $target in the hash referenced by $store and bump the
# target's alarm count when the answer is 'bad'.
sub _record_verdict {
    my ($store, $target, $answer) = @_;

    $store->{$target} = $answer;
    $alarm_count{$target}++ if $answer eq 'bad';
    return $answer;
}

# Walk the table described by $walk_args (a hash ref handed to snmpWalk)
# and return the number of rows whose value is not one of @ok_states.
# Double quotes are stripped from each value before comparing.  A walk
# that does not return an array ref is treated as an empty table rather
# than being dereferenced.
sub _count_abnormal_rows {
    my ($walk_args, @ok_states) = @_;

    my $vb = snmpWalk($walk_args);
    return 0 unless ref $vb eq 'ARRAY';

    my $abnormal = 0;
    for my $row (@{$vb}) {
        my $state = $row->{val} // q{};
        $state =~ s/"//g;
        $abnormal++ unless any { $state eq $_ } @ok_states;
    }
    return $abnormal;
}


########################################################################
# Do the work
########################################################################

# Query every target in @target for its NetApp health variables, record a
# verdict per variable in the per-variable hashes (%global, %failover,
# ...), and count alarm conditions per target in %alarm_count.
#
# Does nothing except pause for $short seconds when $dome is false.
# Always returns 1.
sub do_the_work {

    # How fsOverallStatus values are abbreviated for the report
    my %space_verdict_for = (
        ok         => 'ok',
        nearlyFull => 'nf',
        full       => 'full',
    );

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    print_it('Querying targets...');
    unless ($dome) {
        sleep $short;
        return 1;
    }

    # Loop through targets
    for my $target (@target) {
        my $val;       # Raw value from the agent, or $QUERY
        my $answer;    # Verdict derived from $val

        # Acquire miscGlobalStatus.  Recorded but deliberately not counted
        # as an alarm (see 1.3.1 in the change log).
        $val = _get_scalar($target, 'miscGlobalStatus.0');
        log_it("$target miscGlobalStatus = $val");
        $answer = _verdict($val, 'ok', 'nonCritical');
        $global{$target} = $answer;
        log_it("For $target, miscGlobalStatus = $val") unless $val eq 'ok';
        #$alarm_count{$target}++ if $answer eq 'bad';

        # Acquire cfState.  No answer is shown as $DASH instead of 'bad'.
        $val    = _get_scalar($target, 'cfState.0');
        $answer = $val eq $QUERY ? $DASH : _verdict($val, 'canTakeover');
        log_it("For $target cfState = $val") unless $answer eq 'ok';
        _record_verdict(\%failover, $target, $answer);

        # Acquire cfInterconnectStatus (same no-answer handling as cfState)
        $val    = _get_scalar($target, 'cfInterconnectStatus.0');
        $answer = $val eq $QUERY ? $DASH : _verdict($val, 'up');
        log_it("For $target cfInterconnectStatus = $val")
            unless $answer eq 'ok';
        _record_verdict(\%interconnect, $target, $answer);

        # Acquire envOverTemperature
        $val    = _get_scalar($target, 'envOverTemperature.0');
        $answer = _verdict($val, 'no');
        log_it("For $target, envOverTemperature = $val")
            unless $answer eq 'ok';
        _record_verdict(\%temperature, $target, $answer);

        # Acquire envFailedFanCount
        $val = _get_scalar($target, 'envFailedFanCount.0');
        log_it("$target envFailedFanCount = $val");
        _record_verdict(\%fan, $target, _count_verdict($val));

        # Acquire envFailedPowerSupplyCount
        $val    = _get_scalar($target, 'envFailedPowerSupplyCount.0');
        $answer = _count_verdict($val);
        log_it("For $target envFailedPowerSupplyCount = $val")
            unless $answer eq 'ok';
        _record_verdict(\%power, $target, $answer);

        # Acquire nvramBatteryStatus
        $val    = _get_scalar($target, 'nvramBatteryStatus.0');
        $answer = _verdict($val, 'ok', 'fullyCharged');
        log_it("For $target nvramBatteryStatus = $val")
            unless $answer eq 'ok';
        _record_verdict(\%nvram, $target, $answer);

        # Acquire autosupportStatus
        $val    = _get_scalar($target, 'autosupportStatus.0');
        $answer = _verdict($val, 'ok');
        log_it("For $target autosupportStatus = $val")
            unless $answer eq 'ok';
        _record_verdict(\%autosupport, $target, $answer);

        # Acquire fsOverallStatus.  Anything but 'ok' counts as an alarm,
        # except on the targets listed in @typically_full.
        $val    = _get_scalar($target, 'fsOverallStatus.0');
        $answer = $space_verdict_for{$val} // 'bad';
        log_it("For $target fsOverallStatus = $val") unless $val eq 'ok';
        $space{$target} = $answer;
        unless ($answer eq 'ok') {
            $alarm_count{$target}++
                unless any { $_ eq $target } @typically_full;
        }

        # Acquire diskFailedCount.0
        $val    = _get_scalar($target, 'diskFailedCount.0');
        $answer = _count_verdict($val);
        log_it("For $target diskFailedCount = $val") unless $answer eq 'ok';
        _record_verdict(\%failed_disks, $target, $answer);

        # Acquire diskSpareCount.  The raw count is stored.  An alarm is
        # raised only when the agent returned a number below the
        # threshold; a non-numeric value is logged but not alarmed.
        $val = _get_scalar($target, 'diskSpareCount.0');
        my $spares_known = $val =~ /\A\s*\d+\s*\z/;
        my $spares_low   = $spares_known && $val < $spare_disk_threshhold;
        log_it("For $target diskSpareCount = $val")
            if !$spares_known || $spares_low;
        $spare_disks{$target} = $val;
        $alarm_count{$target}++ if $spares_low;

        # Acquire outOfDateDiskCount
        $val    = _get_scalar($target, 'outOfDateDiskCount.0');
        $answer = _count_verdict($val);
        log_it("For $target outOfDateDiskCount = $val")
            unless $answer eq 'ok';
        _record_verdict(\%stale_disks, $target, $answer);

        # Walk aggrState
        my $aggrState = _count_abnormal_rows(
            {host => $target, oid => 'aggrState'}, 'online');
        unless ($aggrState == 0) {
            $alarm_count{$target}++;
            log_it("$target in alarm because aggrState is not online");
        }

        # Walk raidStatus
        my $raidStatus = _count_abnormal_rows(
            {host => $target, oid => 'raidStatus'},
            'active', 'scrubbingInProgress');
        unless ($raidStatus == 0) {
            $alarm_count{$target}++;
            log_it("$target in alarm because raidStatus is neither active nor scrubbingInProgress");
        }

        # Walk raidVStatus
        my $raidVStatus = _count_abnormal_rows(
            {host => $target, oid => 'raidVStatus'},
            'active', 'scrubbingInProgress');
        unless ($raidVStatus == 0) {
            $alarm_count{$target}++;
            log_it("$target in alarm because raidVStatus is neither active nor scrubbingInProgress");
        }

        # Walk raidPStatus
        my $raidPStatus = _count_abnormal_rows(
            {host => $target, oid => 'raidPStatus'},
            'active', 'scrubbingInProgress');
        unless ($raidPStatus == 0) {
            $alarm_count{$target}++;
            log_it("$target is in alarm because raidPStatus is neither active nor scrubbingInProgress");
        }

        # Walk plexStatus
        my $plexStatus = _count_abnormal_rows(
            {host => $target, oid => 'plexStatus', max_rep => 10, non_rep => 0},
            'online');
        unless ($plexStatus == 0) {
            $alarm_count{$target}++;
            log_it("$target in alarm because plexStatus is not online");
        }

        # Save roll-up variable
        my $total = $aggrState + $raidStatus + $raidVStatus
                  + $raidPStatus + $plexStatus;
        $storage{$target} = $total == 0 ? 'ok' : 'bad';

        # Entertain operator
        print $BANG if $mode eq 'interactive';
    }

    # Debug info.  Targets that never alarmed have no %alarm_count entry.
    if ($debug > 2) {
        for my $target (@target) {
            my $count = $alarm_count{$target} // 0;
            say "alarm_count{$target} = $count" if $count > 0;
        }
    }

    # Make things look pretty
    say "\n" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Tell the operator what I discovered
########################################################################
sub print_report {

    my $handle;
    my $total = @target;
    my $now   = get_now();

    # If we are running in test mode, skip this routine
    unless ($dome) {
        print_it("Running in test mode, cannot print a meaningful report\n");
        return 1;
    }

    # Debug trace
    trace_location('begin') if $debug;

    # Direct output to screen or to file
    if ($mode eq 'interactive') {
        $handle = *STDOUT;
    }
    else {
        open $handle, '>', $report_file or die "Cannot open $report_file: $!\n";
    }

    print {$handle} <