#!/opt/vdops/bin/perl

# This script queries Compaq boxes for hardware alarms and produces a report

#   V      Who       When      What
# ---------------------------------------------------------------------------
# 1.4.0  skendric 2011-02-25 Dig deeper into health alarms
# 1.3.0  skendric 2011-02-21 Upgrade to Netops 1.4.0
# 1.2.3  skendric 2010-12-30 Add @insane to report
# 1.2.2  skendric 2010-12-17 Futz with owner/owner_backup
# 1.2.1  skendric 2010-05-04 Skip non-Compaq hosts
# 1.2.0  skendric 2010-01-23 Upgrade to perl 5.10.1
# 1.1.0  skendric 2009-11-18 Look at power supplies and temperature
# 1.0.3  skendric 2009-11-02 Actually look for unresponsive nodes
# 1.0.2  skendric 2009-04-19 Distinguish between silent and unresponsive
# 1.0.1  skendric 2009-03-20 Add @down_for_maintenance
# 1.0.0  skendric 2009-02-06 First Version
#
#
# Author:  Stuart Kendrick, sbk {put at sign here} skendric {put dot here} com
#
# Source:  http://www.skendric.com/device
#
# This software is available under the GNU GENERAL PUBLIC LICENSE, see
# http://www.fsf.org/licenses/gpl.html
#
#
# This script takes the following approach:
#  -Parses the hosts table for a list of targets (or accepts a command-
#   line list)
#  -Queries a bunch of CPQ***-MIB variables
#  -Produces a report
#
#
# Requirements:
#  -The target(s) must be pingable
#
#  -The following MIB modules stashed in /opt/vdops/share/snmp/mibs,
#   or wherever it is that you store MIB modules:
#
#  -PERL modules:  the WI::Netops collection
#
#
# Assumptions:
#
#
# Tested on:
#  -perl-5.12.2
#  -net-snmp-5.6
#
#
# Instructions:
#  -Customize the script for your site:  find the 'user-configurable
#   variables' section and modify as appropriate
#  -Try it out
#
#
# Caveats:
#
#
# Known Bugs:
#
#
# To do:
#  -Add support for SNMPv3
#

# Begin script

# Load modules
use strict;
use warnings;
use feature 'say';
use feature 'switch';
use Carp qw(carp cluck croak confess);
use Data::Dumper;
use English qw( -no_match_vars );
use Getopt::Std;
use List::MoreUtils qw(any);
use WI::Netops::HostTools 1.0.4;
use WI::Netops::NetopsTools 2.2.3;
use WI::Netops::NetopsData 1.4.0;
use WI::Netops::PingTools 1.1.7;
use WI::Netops::SNMPTools 1.5.3;
use WI::Netops::Utilities 1.4.4;

# Declare global variables
my %health;         # Health MIB status
my %temperature;    # cpqHeThermalCondition.0
my %power;          # cpqHeFltTolPwrSupplyCondition.0
my %stdeq;          # Standard PC equipment status
my %ida;            # Integrated drive array status
my %scsi;           # SCSI status
my %stsys;          # Storage system status
my %ilo;            # Integrated Lights Out status
my %thrsh;          # Threshold status
my %host;           # Host OS status
my %ide;            # IDE status
my %fca;            # FibreChannel status
my %nic;            # NIC status

# Define global variables
$program_name = 'compaq-alarm';
$usage = 'Usage: compaq-alarm -s {yes|no} [-d {integer}] [-r] [-a | -e {expr} | -f {filename} | target1 target2 target3 ...]';

# Keep in step with the newest entry of the change log at the top of the file
$version = '1.4.0';

# Grab arguments
getopts('ad:e:f:rs:', \%option);
@target = @ARGV;

# Set mode
if    ($option{r}) { $mode = 'report' }
elsif (-t STDIN)   { $mode = 'interactive' }
else               { $mode = 'batch' }


### Begin Main Program ###############################################

{
    check_args();         # Check arguments
    read_config();        # Read Netops config file
    compile_mibs();       # Compile MIB files
    build_target();       # Populate @target
    target_check();       # Look for errors in @target
    basic_info();         # Gather information
    sanity_check();       # Check for major errors
    do_the_work();        # Do it
    identify_alarms();    # Count devices with alarms
    write_alarm_log();    # Record issues
    print_report();       # Print report
    notify_staff();       # Mail report
}

##### End Main Program #################################################


########################################################################
# Query variables
#
# For every host in @target, polls the top-level "condition" object of
# each Compaq MIB, stores the (display-normalised) result in the matching
# file-level hash (%health, %temperature, ...) and, when the result is
# not one of the acceptable values, logs it and records an alarm in
# %alarm_count / %alarms.  Always returns 1.
########################################################################
sub do_the_work {

    # Statuses which never raise an alarm
    my @fine = qw/- ok unknown/;

    # One entry per condition object, listed in the order it is queried:
    #   oid    - object to fetch
    #   store  - hash receiving the per-target status
    #   label  - name used in the log message; an entry without a label
    #            is recorded but never raises an alarm
    #   tag    - short name pushed onto @{$alarms{$target}}
    #   dash   - raw values, in addition to 'other', shown as $DASH
    #   chatty - announce the query when $debug > 3
    #   dig    - callback invoked with the target once an alarm is recorded
    my @checks = (
        {   oid   => 'cpqHeMibCondition.0',
            store => \%health,
            label => 'Health',
            tag   => 'health',
            dig   => \&dig_into_health,
        },
        {   oid   => 'cpqHeThermalCondition.0',
            store => \%temperature,
            label => 'Temperature',
            tag   => 'temp',
        },
        {   oid   => 'cpqHeFltTolPwrSupplyCondition.0',
            store => \%power,
            label => 'Power Supply',
            tag   => 'PS',
        },
        {   oid   => 'cpqSeMibCondition.0',
            store => \%stdeq,
            label => 'Standard Equipment',
            tag   => 'Standard Equipment',
        },
        {   oid    => 'cpqDaMibCondition.0',
            store  => \%ida,
            label  => 'Drive Array',
            tag    => 'Drive Array',
            chatty => 1,
        },
        {   oid   => 'cpqScsiMibCondition.0',
            store => \%scsi,
            label => 'SCSI Status',
            tag   => 'SCSI',
        },
        {   oid   => 'cpqSsMibCondition.0',
            store => \%stsys,
            label => 'Storage Status',
            tag   => 'storage',
            dash  => ['0'],
        },

        # Remote Insight / Integrated Lights-Out:  the status is kept for
        # the report, but alarming on it is deliberately switched off
        {   oid   => 'cpqSm2MibCondition.0',
            store => \%ilo,
            dash  => ['0'],
        },
        {   oid   => 'cpqMeMibCondition.0',
            store => \%thrsh,
            label => 'Threshold',
            tag   => 'threshold',
        },
        {   oid   => 'cpqHoMibCondition.0',
            store => \%host,
            label => 'Host OS',
            tag   => 'Host OS',
            dash  => ['unknown'],
        },
        {   oid   => 'cpqIdeMibCondition.0',
            store => \%ide,
            label => 'IDE',
            tag   => 'IDE',
        },
        {   oid   => 'cpqFcaMibCondition.0',
            store => \%fca,
            label => 'Fibre Channel',
            tag   => 'FC',
        },
    );

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    print_it('Querying targets...');

    # In test mode, pretend to work and leave
    unless ($dome) {
        sleep $short;
        return 1;
    }

    # Loop through the list of targets
    for my $target (@target) {
        say "Processing $target" if $debug;

        # Work through the table of simple condition objects
        for my $check (@checks) {
            say "Getting $check->{oid}" if ($check->{chatty} and $debug > 3);

            my $status = _get_condition($target, $check->{oid},
                                        @{ $check->{dash} // [] });

            if (defined $check->{label}) {
                unless (any { $status eq $_ } @fine) {
                    _raise_alarm($target, $check->{label}, $check->{tag},
                                 $status);
                    $check->{dig}->($target) if $check->{dig};
                }
            }

            $check->{store}->{$target} = $status;
        }

        # Check NIC Status.  This one is handled separately:  the overall
        # condition never alarms by itself
        say 'Getting cpqNicMibCondition.0' if $debug > 3;
        $nic{$target} = _get_condition($target, 'cpqNicMibCondition.0');

        # If NIC status is 'unknown', dig a little deeper
        if ($nic{$target} eq 'unknown') {
            my $status = find_bad_nic($target);
            unless (any { $status eq $_ } @fine) {
                _raise_alarm($target, 'NIC', 'NIC', $status);
            }
            $status = $DASH if $status eq 'other';
            $nic{$target} = $status;
        }

        # Entertain operator
        print $BANG if $mode eq 'interactive';
    }

    # Debug info
    if ($debug > 2) {
        for my $target (@target) {
            if (defined $alarm_count{$target} and $alarm_count{$target} > 0) {
                say "alarm_count{$target} = $alarm_count{$target}";
            }
        }
    }

    # Make things look pretty
    say "\n" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Fetch one condition object and normalise it for display
#
# Arguments:  target host, OID, optional list of additional raw values
#             which should be displayed as $DASH
# Returns:    $QUERY when snmpGet returned undef, $DASH when the value
#             is 'other' or one of the additional values, otherwise the
#             value as returned by snmpGet
########################################################################
sub _get_condition {
    my ($target, $oid, @also_dash) = @_;

    my $status = snmpGet( {host => $target, oid => $oid} );
    $status //= $QUERY;
    $status = $DASH if any { $status eq $_ } ('other', @also_dash);

    return $status;
}


########################################################################
# Log an alarm and record it against the target
#
# Arguments:  target host, label for the log message, tag stored in
#             %alarms, offending status
########################################################################
sub _raise_alarm {
    my ($target, $label, $tag, $status) = @_;

    log_it("For $target, $label = $status");
    $alarm_count{$target}++;
    push @{$alarms{$target}}, $tag;

    return 1;
}


########################################################################
# Dig into CPQHLTH-MIB details
#
# Called once the overall health condition of a host has raised an
# alarm.  Queries the individual health objects and logs every one whose
# value is not acceptable, so the log shows what is behind the alarm.
# Takes the host name, dies when it is missing or empty, returns 1.
########################################################################
sub dig_into_health {
    my $host = shift;

    # Values of the scalar objects which are not worth logging
    my @fine = qw/0 enabled ok other unknown/;

    # Objects to examine, in query order.  'get' fetches instance .0 and
    # compares it with @fine; 'walk' visits every instance of a column
    # and logs each one which is not 'ok'
    my @probes = (
        [ get  => 'cpqHeCritLogCondition'        ],
        [ get  => 'cpqHeCorrMemLogCondition'     ],
        [ get  => 'cpqHeCorrMemTotalErrs'        ],
        [ get  => 'cpqHeAsrStatus'               ],
        [ get  => 'cpqHeAsrCondition'            ],
        [ get  => 'cpqHeThermalTempStatus'       ],
        [ get  => 'cpqHeThermalSystemFanStatus'  ],
        [ get  => 'cpqHeThermalCpuFanStatus'     ],
        [ walk => 'cpqHeFltTolFanCondition'      ],
        [ walk => 'cpqHeTemperatureCondition'    ],
        [ get  => 'cpqHeEventLogCondition'       ],
        [ get  => 'cpqHePowerConverterCondition' ],
        [ get  => 'cpqHeResilientMemCondition'   ],
    );

    # Debug trace
    trace_location('begin') if $debug;

    # Sanity check
    die 'No argument' unless (defined $host and $host ne $EMPTY_STR);

    PROBE:
    for my $probe (@probes) {
        my ($how, $object) = @$probe;

        if ($how eq 'get') {
            my $status = snmpGet( {host => $host, oid => "$object.0"} );
            $status //= $QUERY;
            unless (any { $status eq $_ } @fine) {
                log_it("For $host, $object.0 = $status");
            }
            next PROBE;
        }

        # Table column:  examine every instance
        my $vb = snmpWalk( {host => $host, oid => $object} );
        for my $varbind (@{ $vb // [] }) {

            # A missing value is reported as $QUERY, as for the scalars
            my $val = $varbind->{val} // $QUERY;
            unless ($val eq 'ok') {
                my $iid = $varbind->{iid};
                log_it("For $host, $object.$iid = $val");
            }
        }
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Poke through NIC variables, return status of first non-good NIC
#
# Takes the host name and dies when it is missing or empty.  Returns the
# first walked value which is neither 'other' nor 'ok', or 'ok' when
# there is none.
########################################################################
sub find_bad_nic {
    my $bad;
    my $host = shift;
    my $vb;

    # Debug trace
    trace_location('begin') if $debug;

    # Sanity check
    die 'No argument' unless (defined $host and $host ne $EMPTY_STR);

    # Walk cpqNicIfLogMapCondition
    say 'Walking cpqNicIfLogMapCondition' if $debug > 3;
    $vb = snmpWalk( { host => $host,
                      oid  => '.1.3.6.1.4.1.232.18.2.2.1.1.10'} );

    # Pull out the first non-good state
    VB:
    for my $varbind (@{ $vb // [] }) {
        my $val = $varbind->{val};

        # A varbind without a value says nothing about the NIC; skipping
        # it keeps it from ending the search and hiding a bad NIC which
        # appears later in the walk
        next VB unless defined $val;
        next VB if ($val eq 'other' or $val eq 'ok');

        $bad = $val;
        last VB;
    }

    # Set bad to 'ok' if undefined
    $bad //= 'ok';

    # Debug trace
    trace_location('end') if $debug;

    return $bad;
}


########################################################################
# Tell the operator what I discovered
########################################################################
sub print_report {
    my $handle;
    my $total = @target;
    my $now = get_now();

    # If we are running in test mode, skip this routine
    unless ($dome) {
        print_it("Running in test mode, cannot print a meaningful report\n");
        return 1;
    }

    # Debug trace
    trace_location('begin') if $debug;

    # Direct output to screen or to file
    if ($mode eq 'interactive') {
        $handle = *STDOUT;
    }
    else {
        open $handle, '>', $report_file or die "Cannot open $report_file: $!\n";
    }

    print {$handle} <