#!/opt/vdops/bin/perl

# This script queries combined status variables in MIB-Dell-10892 and
# produces a report identifying salient issues.

#   V      Who       When        What
# ---------------------------------------------------------------------------
# 1.3.0  skendric  2010-01-23  Upgrade to perl 5.10.1
# 1.2.0  skendric  2009-09-03  Log problems, examine more variables
# 1.1.3  skendric  2009-04-19  Distinguish between silent and unresponsive
# 1.1.2  skendric  2009-04-16  Hand controller subroutine an argument
# 1.1.1  skendric  2009-03-22  Compare controller firmware version against
#                              minimum required
# 1.1.0  skendric  2009-03-05  Examine Dell NIC status
# 1.0.3  skendric  2007-12-07  Add owner
# 1.0.2  skendric  2007-03-21  Stylistic mods
# 1.0.1  skendric  2007-01-22  Add support for disk and NICs
# 1.0.0  skendric  2007-01-14  First Version
#
# Author: Stuart Kendrick, sbk {put at sign here} skendric {put dot here} com
#
# Source: http://www.skendric.com/device
#
# This software is available under the GNU GENERAL PUBLIC LICENSE, see
# http://www.fsf.org/licenses/gpl.html
#
#
# This script takes the following approach:
#  -Parses the hosts table for a list of targets (or accepts a command-
#   line list)
#  -Queries a bunch of Dell-specific variables
#  -Produces a report
#
#
# Requirements:
#  -The target(s) must be pingable
#
#  -PERL modules:  the FHCRC::Netops collection
#
#
# Assumptions:
#
#
# Tested on:
#  -PowerEdge 6650
#  -perl-5.10.1
#  -net-snmp-5.5
#
#
# Instructions:
#  -Customize the script for your site:  find the 'user-configurable
#   variables' section and modify as appropriate
#  -Type "dell-alarm" to see the command-line options
#  -Try it out
#
#
#
# Caveats:
#
#
# Known Bugs:
#
#
# To do:
#

# Begin script

# Load modules
use strict;
use warnings;
use feature 'say';
use feature 'switch';
use Carp qw(carp cluck croak confess);
use Data::Dumper;
use English qw( -no_match_vars );
use Getopt::Std;
use List::MoreUtils qw(any);
use FHCRC::Netops::DellTools   1.0.0;
use FHCRC::Netops::HostTools   1.0.3;
use FHCRC::Netops::NetopsTools 2.0.7;
use FHCRC::Netops::NetopsData  1.3.0;
use FHCRC::Netops::PingTools   1.1.5;
use FHCRC::Netops::SNMPTools   1.3.9;
use FHCRC::Netops::Utilities   1.3.9;

# Declare global variables
# Unless noted otherwise, each hash below is keyed by target and holds the
# value returned for the SNMP variable named in the trailing comment.
my %alarms;                     # Hash of array refs, listing problems
my %battery;                    # systemStateBatteryStatusCombined.1
my %battery_status;             # batteryStatus.1.1
my %battery_roll_up_status;     # batteryRollUpStatus.1
my %base_board;                 # baseBoardStatus.1.1
my %bios;                       # systemBIOSStatus.1.1
my %chassis;                    # systemStateChassisStatus.1
my %cooling_device;             # systemStateCoolingDeviceStatusCombined.1
my %cooling_unit;               # systemStateCoolingUnitStatusCombined.1
my %controller_battery_state;   # batteryState.1
my %controller_firmware_fresh;  # Boolean identifying whether or not the
                                # controller firmware meets minimum requirements
my %disk;                       # agentGlobalSystemStatus.0
my @fine;                       # List of responses which we consider good
my $log_file;                   # Location of log file
my %memory;                     # systemStateMemoryDeviceStatusCombined.1
my %memory_array;               # physicalMemoryArrayStatus.1.1
my %nics;                       # ansTeamMemberState
my %os_status;                  # operatingSystemStatus.1
my %os_memory_status;           # operatingSystemMemoryStatus.1
my %power;                      # systemStatePowerSupplyStatusCombined.1
my %power_unit;                 # powerUnitStatus.1.1
my %processor;                  # systemStateProcessorDeviceStatusCombined.1
my %remote_flash_bios;          # remoteFlashBIOSStatus.1.1
my %system;                     # systemStateGlobalSystemStatus.1
my %temperature;                # systemStateTemperatureStatusCombined.1
my %temperature_probe;          # temperatureProbeStatus.1.1
my %voltage;                    # systemStateVoltageStatusCombined.1

# Define global variables
# (Variables assigned below without 'my' are not declared in this file;
# presumably they are exported by the FHCRC::Netops modules -- verify.)
$debug = 0;            # 10 = Logging
                       #  9 = Database SELECT operations
                       #  8 = Per IP/MAC/Port processing
                       #  7 = Database INSERT/UPDATE/DELETE
                       #  6 = Dump SNMP var
                       #  5 = Dump snmp_packets
                       #  4 = Grody:   print big var
                       #  3 = Verbose: print mid var
                       #  2 = Simple:  print small var
                       #  1 = Basic:   subroutine trace
                       #  0 = Disable debugging
$program_name = 'dell-alarm';
$usage        = 'Usage: dell-alarm -s yes|no [-d {integer}] [-r] '
              . '[-a | -e {expr} | -f {filename} | target1 target2 target3 ...]';
# NOTE(review): the newest changelog entry above is 1.3.0 -- confirm which
# of the two version numbers is current.
$version      = '1.4.0';

# Define user-configurable variables

# Binaries
$grab_hosts = '/bin/cat /etc/hosts';

# Fine
@fine = qw/- ok 0/;

# Report stuff
$institution       = 'Widgets International';
$log_file          = '/home/netops/logs/dell-alarm.log';
$owner             = 'Jason Burdullis';
$owner_backup      = 'Patrick Hirayama';
$report_file       = '/home/netops/rpts/dell-alarm.txt';
$report_queries    = 'bsmith@widgets.com';
$report_recipients = 'sopsops@fhcrc.org,skendric@fhcrc.org';
$report_subject    = 'Dell Alarm Report';

# Pause parameters
$long  = 30;
$mid   = 10;
$short = 5;

# Ping Stuff
$ping_count   = 3;
$ping_timeout = 1;

# SNMP Stuff
# Optimize performance by sorting your community strings and SNMP version
# list, most frequently used to the left, least frequently used to the right
@snmp_read_list    = qw/public public/;
@snmp_version_list = qw/2/;
$snmp_port         = 161;
$snmp_retries      = 6;
$snmp_timeout      = 33000000;

# Syslog stuff
$syslog_facility = 'local5';
$syslog_host     = 'localhost';
$syslog_port     = 514;
$syslog_priority = 'info';
$syslog_socket   = 'unix';    # Other possibilities include 'udp' and
                              # 'stream'; depending on the flavor of Unix,
                              # I've employed each of these

# Target details
@down_for_maintenance = qw//;
@skip_name            = qw//;

# Grab arguments
getopts('ad:e:f:rs:', \%option);
@target = @ARGV;

# Set mode
if    ($option{r}) { $mode = 'report' }
elsif (-t STDIN)   { $mode = 'interactive' }
else               { $mode = 'batch' }


### Begin Main Program ###############################################
{
    check_args();                         # Check arguments
    compile_mibs();                       # Compile MIB files
    build_target();                       # Populate @target
    push @silent, target_check();         # Look for errors in @target
    push @unresponsive, basic_info();     # Gather information
    sanity_check();                       # Check for major errors
    do_the_work();                        # Do the work
    identify_alarms();                    # Count devices with alarms
    write_log();                          # Record issues (once per run, so
                                          # log entries are not duplicated)
    print_report();                       # Print report
    notify_staff();                       # Mail report
}
##### End Main
# Program ###############################################

########################################################################
# Do the work
#
# For every entry in @target, fetch a fixed series of Dell status
# variables via snmpGet, remember each value in its per-variable hash,
# and record an alarm for any value not listed in @fine.  Storage
# controller battery variables are fetched only when
# agentGlobalSystemStatus.0 is anything other than 'ok'.  Finally the
# target's NICs are examined (Intel MIB if the target answers for it,
# Dell MIB otherwise).
#
# When $dome is false, nothing is queried: the routine sleeps $short
# seconds and returns.
#
# Takes no arguments.  Always returns 1.
########################################################################
sub do_the_work {

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    print_it('Querying targets...');
    unless ($dome) {
        sleep $short;
        return 1;
    }

    # Checks performed on every target, in this order.
    #   name  -- variable name, used in the debug and log messages
    #   oid   -- numeric OID handed to snmpGet
    #   store -- hash (keyed by target) receiving the value
    #   label -- text pushed onto @{$alarms{$target}} on a bad value;
    #            checks without a label bump the alarm count and are
    #            logged, but add nothing to %alarms
    my @checks = (
        { name  => 'systemStateGlobalSystemStatus.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.2.1',
          store => \%system,            label => 'global status' },
        { name  => 'systemStateChassisStatus.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.4.1',
          store => \%chassis,           label => 'chassis status' },
        { name  => 'systemStateBatteryStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.52.1',
          store => \%battery,           label => 'battery' },
        { name  => 'systemStatePowerSupplyStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.9.1',
          store => \%power,             label => 'power supply' },
        { name  => 'systemStateVoltageStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.12.1',
          store => \%voltage,           label => 'voltage' },
        { name  => 'systemStateCoolingDeviceStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.21.1',
          store => \%cooling_device,    label => 'cooling device' },
        { name  => 'systemStateCoolingUnitStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.44.1',
          store => \%cooling_unit,      label => 'cooling unit' },
        { name  => 'systemStateTemperatureStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.24.1',
          store => \%temperature,       label => 'temperature' },
        { name  => 'systemStateMemoryDeviceStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.27.1',
          store => \%memory,            label => 'RAM' },
        { name  => 'systemStateProcessorDeviceStatusCombined.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.200.10.1.50.1',
          store => \%processor,         label => 'CPU' },
        { name  => 'systemBIOSStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.300.50.1.5.1.1',
          store => \%bios },
        { name  => 'baseBoardStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.300.80.1.5.1.1',
          store => \%base_board },
        { name  => 'operatingSystemStatus.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.400.10.1.4.1',
          store => \%os_status },
        { name  => 'operatingSystemMemoryStatus.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.400.20.1.4.1',
          store => \%os_memory_status },
        { name  => 'powerUnitStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.600.10.1.8.1.1',
          store => \%power_unit,        label => 'power unit' },
        # NOTE(review): this OID is under 674.10890, whereas every
        # neighbouring check is under 674.10892 -- confirm against the MIB.
        { name  => 'batteryStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10890.5.25.2.1.8.1.1',
          store => \%battery_status,    label => 'battery' },
        { name  => 'temperatureProbeStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.700.20.1.5.1.1',
          store => \%temperature_probe, label => 'thermometer' },
        { name  => 'remoteFlashBIOSStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.900.10.1.5.1.1',
          store => \%remote_flash_bios, label => 'remote BIOS' },
        { name  => 'physicalMemoryArrayStatus.1.1',
          oid   => '.1.3.6.1.4.1.674.10892.1.1300.10.1.5.1.1',
          store => \%memory_array,      label => 'memory array' },
    );

    # Storage roll-up.  A literal '0' answer is stored as $DASH.
    my %disk_check = (
        name        => 'agentGlobalSystemStatus.0',
        oid         => '.1.3.6.1.4.1.674.10893.1.20.110.13.0',
        store       => \%disk,
        zero_is_dash => 1,
    );

    # Checks performed only when the storage roll-up is not 'ok'
    my @controller_checks = (
        { name     => 'batteryState.1',
          log_name => 'StorageManagement-MIB::batteryState.1',
          oid      => '.1.3.6.1.4.1.674.10893.1.20.130.15.1.4.1',
          store    => \%controller_battery_state,
          label    => 'controller battery' },
        { name     => 'batteryRollUpStatus.1',
          log_name => 'StorageManagement-MIB::batteryRollUpStatus.1',
          oid      => '.1.3.6.1.4.1.674.10893.1.20.130.15.1.5.1',
          store    => \%battery_roll_up_status,
          label    => 'battery roll-up' },
    );

    # Loop through targets
    for my $target (@target) {

        # Acquire the per-chassis status variables
        _poll_status($target, $_) for @checks;

        # Acquire agentGlobalSystemStatus.0; on a bad value also compare
        # the controller firmware against the minimum required
        unless (_poll_status($target, \%disk_check)) {
            $controller_firmware_fresh{$target} =
                compare_dell_controller_firmware($target);
        }

        # If agentGlobalSystemStatus.0 is unhappy, then something is wrong
        # with storage.  Dig deeper.  (Note that this test is stricter than
        # @fine: a stored $DASH also triggers the extra queries.)
        unless ($disk{$target} eq 'ok') {
            _poll_status($target, $_) for @controller_checks;
        }

        # If this target supports INTEL-LAN-ADAPTERS-MIB, then examine its
        # NICs
        say 'Getting company.0' if $debug > 3;
        if (snmpGet( {host => $target, oid => '.1.3.6.1.4.1.343.2.7.2.1.1.0'} )) {
            $nics{$target} = examine_intel_nics($target);
        }
        else {
            $nics{$target} = examine_dell_nics($target);
        }
        unless ($nics{$target} eq 'ok') {
            $alarm_count{$target}++;
            push @{$alarms{$target}}, 'NIC';
            log_it('Bad NIC');
        }

        # Entertain operator
        print $BANG if $mode eq 'interactive';
    }

    # Debug info
    if ($debug > 2) {
        for my $target (@target) {
            if (defined $alarm_count{$target} and $alarm_count{$target} > 0) {
                say "alarm_count{$target} = $alarm_count{$target}";
            }
        }
    }

    # Make things look pretty
    say "\n" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}

########################################################################
# Private helper for do_the_work: fetch one status variable from one
# target, store it, and record an alarm if the value is not in @fine.
#
# Arguments:
#   $target -- host to query
#   $check  -- hash ref with keys name, oid, store and, optionally,
#              label, log_name (defaults to name) and zero_is_dash
#
# An undefined answer is stored as $DASH.
#
# Returns 1 if the stored value is listed in @fine, 0 otherwise.
########################################################################
sub _poll_status {
    my ($target, $check) = @_;

    say "Getting $check->{name}" if $debug > 3;

    my $val = snmpGet( {host => $target, oid => $check->{oid}} );
    $val //= $DASH;
    $val = $DASH if $check->{zero_is_dash} and $val eq '0';
    $check->{store}{$target} = $val;

    return 1 if any { $_ eq $val } @fine;

    # Bad value: count it, describe it (when a label exists), log it
    $alarm_count{$target}++;
    push @{$alarms{$target}}, $check->{label} if defined $check->{label};
    my $log_name = $check->{log_name} // $check->{name};
    log_it("$log_name = $val");

    return 0;
}
########################################################################
# Examine NICs using Dell's MIB, returning status for the worst NIC, where
# 'worst' is not well-defined but is, minimally, worse than 'ok'
#
# Argument: the host to query.
# Returns:  the first walked status that is not 'ok', or 'ok' when every
#           defined status is 'ok' (or nothing was returned).
########################################################################
sub examine_dell_nics {
    my $host = shift;
    my $status;

    # Debug trace
    trace_location('begin') if $debug;

    # Walk networkInterfaceControlStatus
    say 'Walking networkInterfaceControlStatus' if $debug > 3;
    my $val = snmpWalk( {host => $host,
                         oid  => '.1.3.6.1.4.1.674.10892.1.1400.80.1.5'} );

    # Pull out first non-good status.  Entries without a value are
    # skipped, so that an undefined entry cannot end the scan early and
    # then be reported as 'ok'.
    VARBIND:
    for my $varbind (@{ $val // [] }) {
        next VARBIND unless defined $varbind->{val};
        $status = $varbind->{val};
        last VARBIND unless $status eq 'ok';
    }

    # Set status to 'ok' if undefined
    $status //= 'ok';

    # Debug trace
    trace_location('end') if $debug;

    return $status;
}

########################################################################
# Examine Intel NICs, looking for problems
#
# Argument: the host to query.
# Walks ansMemberIndex to learn the instance ids, then fetches
# ansTeamMemberState for each.  A 'disabled' member yields 'critical',
# a $DASH answer yields 'unk'; each such member is logged and bumps
# $alarm_count{$host}.  When several members are bad, the answer
# reflects the last one examined.
# Returns:  'ok', 'critical' or 'unk'.
########################################################################
sub examine_intel_nics {
    my $host   = shift;
    my $answer = 'ok';

    # Debug trace
    trace_location('begin') if $debug;

    # Walk ansMemberIndex, collecting instance ids.  An undefined walk
    # result is treated as an empty list.
    say 'Walking ansMemberIndex' if $debug > 3;
    my $walk = snmpWalk( {host => $host,
                          oid  => '.1.3.6.1.4.1.343.2.7.2.2.5.1.1.1'} );
    my @iids = map { $_->{iid} } @{ $walk // [] };

    # Acquire ansTeamMemberState
    MEMBER:
    for my $iid (@iids) {
        say "Getting ansTeamMemberState.$iid" if $debug > 3;
        my $state = snmpGet( {host => $host,
                              oid  => ".1.3.6.1.4.1.343.2.7.2.2.5.2.1.1.$iid"} );

        # An undefined answer matches neither case below
        next MEMBER unless defined $state;

        if ($state eq 'disabled') {
            log_it("For $host, ansTeamMemberState is disabled");
            $alarm_count{$host}++;
            $answer = 'critical';
        }
        elsif ($state eq $DASH) {
            log_it("For $host, ansTeamMemberState is unknown");
            $alarm_count{$host}++;
            $answer = 'unk';
        }
    }

    # Debug trace
    trace_location('end') if $debug;

    return $answer;
}

########################################################################
# Tell the operator what I discovered
#
# Writes the report to STDOUT in interactive mode, otherwise to
# $report_file (dying if that file cannot be opened).  Does nothing
# beyond a notice when $dome is false.  Returns 1.
#
# NOTE(review): in this copy of the file the statement beginning
# 'print {$handle} <' is damaged -- the here-document that followed it,
# and the opening of the test on @silent, appear to be missing.  The
# surviving text is kept exactly as found; restore the lost lines from a
# good copy before running.
########################################################################
sub print_report {
    my $handle;
    my $total = @target;
    my $now   = get_now();

    # If we are running in test mode, skip this routine
    unless ($dome) {
        print_it("Running in test mode, cannot print a meaningful report\n");
        return 1;
    }

    # Debug trace
    trace_location('begin') if $debug;

    # Direct output to screen or to file
    if ($mode eq 'interactive') {
        $handle = *STDOUT;
    }
    else {
        open $handle, '>', $report_file
            or die "Cannot open $report_file: $!\n";
    }

    print {$handle} < 0) {
        @silent = sort @silent;
        for my $silent (@silent) {
            printf {$handle} "%-15s Not answering pings\n", $silent;
        }
    }

    # Add unresponsive devices to the report
    if (@unresponsive > 0) {
        @unresponsive = sort @unresponsive;
        for my $unresponsive (@unresponsive) {
            printf {$handle} "%-15s Not answering SNMP GETs\n", $unresponsive;
        }
    }

    # Clean up: close the handle unless it is STDOUT
    unless ($handle =~ /STDOUT/) {
        close $handle or warn "Cannot close $report_file: $!\n";
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}

########################################################################
# Sanity check
#
# Collects every target whose %manufacturer entry is not 'Dell' (a
# missing entry counts as not Dell) and hands the list to prune_basic.
# Returns 1.
########################################################################
sub sanity_check {
    my @remove;

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    print_it('Sanity check...');

    # Loop through targets, removing non-Dell devices
    SANITY:
    for my $target (@target) {

        # Identify manufacturer
        if (($manufacturer{$target} // '') ne 'Dell') {
            say "\nManufacturer of $target is not Dell, ignoring" if $debug;
            push @remove, $target;
            next SANITY;
        }

        # Entertain operator
        print $BANG if $mode eq 'interactive';
    }

    # Remove entries which failed checks
    prune_basic(@remove);

    # Make things look pretty
    say "\n" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}
########################################################################
# Write summary data to a log file
#
# When $dome is true and $shit_happens is positive, appends one line per
# key of %alarms to $log_file, in the form
#     <date> TAB <target> TAB <alarm>,<alarm>,...
# Failure to open or close the log file produces a warning, not a fatal
# error.  Returns 1.
########################################################################
sub write_log {
    my $date = get_date();

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    print_it('Writing log...');

    # Write data
    if ($dome and $shit_happens > 0) {
        if (open my $log, '>>', $log_file) {
            for my $target (sort keys %alarms) {

                # $alarms{$target} is an array ref: dereference it so the
                # alarm names themselves are joined, and keep the newline
                # outside the join so no separator precedes it
                print {$log} "$date\t$target\t",
                    join($COMMA, @{ $alarms{$target} }), "\n";
            }
            close $log or warn "Cannot close $log_file: $!";
        }
        else {
            warn "Cannot open $log_file: $!";
        }
    }

    # Make things look pretty
    say('') if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}

########################################################################
# Output help
########################################################################
sub HELP_MESSAGE { print <