#!/opt/vdops/bin/perl

# This script reboots a list of devices in sequence, waiting for each one
# to return to life before continuing to the next.  After a suitable wait,
# if a device fails to return pings, the script bails.  Also, after a device
# reboots, the script checks various hardware status variables, and if the
# hardware status has degraded, the script bails

# V     Who      When        What
# ---------------------------------------------------------------------------
# 1.7.0 skendric 04-02-2010  Upgrade to perl 5.10.1
# 1.6.3 skendric 01-26-2008  Support new get_time function
# 1.6.2 skendric 12-29-2008  Support new snmpSet format
# 1.6.1 skendric 05-12-2008  Simplify handling of IOS devices
# 1.6.0 skendric 11-23-2007  Check for hardware alarms before continuing
# 1.5.3 skendric 03-21-2007  Stylistic mods
# 1.5.2 skendric 11-19-2006  Replace Object Values with OIDs
# 1.5.1 skendric 11-05-2005  Upgrade to new FHCRC::VDOPS module structure
# 1.5.0 skendric 05-09-2005  Support Netops.pm-1.2
# 1.4.1 skendric 11-11-2004  Refine interactive output
# 1.4.0 skendric 05-09-2004  Migrate common functions to Netops.pm
# 1.3.0 skendric 04-30-2004  Enhance command-line options
# 1.2.2 skendric 11-16-2003  Use Net::Ping::External
# 1.2.1 skendric 08-10-2003  Minor bug fixes
# 1.2.0 skendric 08-09-2003  Added the 'all' option
# 1.1.0 skendric 03-28-2003  Numerous minor updates
# 1.0.0 skendric 01-26-2003  First Version

# Author: Stuart Kendrick, sbk@skendric.com
#
# Source: http://www.skendric.com/device
#
# This software is available under the GNU GENERAL PUBLIC LICENSE, see
# http://www.fsf.org/licenses/gpl.html
#
#
# This script takes the following high-level approach:
#  -Cycle through a list of devices, rebooting each in turn,
#   pausing between each reboot
#  -If any device fails to return to life, bail
#
#
# Requirements:
#  -The target(s) must be pingable
#
#  -The target(s) must reboot when sysReset.0 is set to "2" (CatOS)
#   or when tsMsgSend.0 is set to "2" (IOS).
#
#  -IOS devices must have the line "snmp system-shutdown" in their
#   running config files
#
#  -The following MIB modules stashed in /opt/local/share/snmp/mibs,
#   or wherever it is that you store MIB modules:
#     CISCO-PRODUCTS-MIB.my
#
#  -PERL modules: the FHCRC::Netops collection
#
#
# Assumptions:
#
#
# Tested on:
#  -perl-5.10.1
#  -net-snmp-5.5
#
#
# Instructions:
#  -Customize the script for your site: find the 'user-configurable
#   variables' section and modify as appropriate
#  -Create $nodes, if desired
#  -Play with the script in "firing blanks" mode, to get a feel for
#   what it does
#  -When you are ready to do damage, run it
#
#
# Caveats:
#
#
# Known Bugs:
#
#
# To do:
#  -Add support for SNMPv3
#


# Begin script

# Load modules
use strict;
use warnings FATAL => qw(all);
use feature 'say';
use feature 'switch';
use Carp qw(carp cluck croak confess);
use Data::Dumper;
use English qw( -no_match_vars );
use Getopt::Std;
use Net::Ping::External qw(ping);
use FHCRC::Netops::CiscoTools  1.3.1;
use FHCRC::Netops::HostTools   1.0.3;
use FHCRC::Netops::NetopsTools 2.0.7;
use FHCRC::Netops::NetopsData  1.3.0;
use FHCRC::Netops::PingTools   1.1.5;
use FHCRC::Netops::SNMPTools   1.3.9;
use FHCRC::Netops::Utilities   1.3.9;


# Declare global variables
my @devices_in_alarm;   # List of devices which developed a hardware
                        # alarm after being rebooted
my $pause;              # Seconds to pause between devices
my %phoenix;            # Did it survive the reboot?
my %reboot_oids;        # SNMP OIDs for rebooting devices
my $wait;               # Seconds to wait for devices to reboot


# Define global variables
# NOTE(review): the variables assigned below without 'my' are not declared
# in this file; presumably they are exported by the FHCRC::Netops modules
# loaded above -- confirm before renaming or re-scoping any of them.
$debug = 0;    # 10 = Logging
               #  9 = Database SELECT operations
               #  8 = Per IP/MAC/Port processing
               #  7 = Database INSERT/UPDATE/DELETE
               #  6 = Dump SNMP var
               #  5 = Dump snmp_packets
               #  4 = Grody:  print big var
               #  3 = Verbose:  print mid var
               #  2 = Simple:  print small var
               #  1 = Basic:  subroutine trace
               #  0 = Disable debugging
$program_name = 'seq-reboot';
$usage        = 'Usage: seq-reboot -s {yes|no} [-d {integer}] [-r] [-a | -e {expr} | -f {filename} | target1 target2 target3 ...]';
$version      = '1.7.0';


# Define user-configurable variables

# Binaries
$grab_hosts = '/bin/cat /etc/hosts';

# Pause parameters
$long  = 30;
$mid   = 10;
$short = 5;

# Ping Stuff
$ping_count   = 3;
$ping_timeout = 1;

# SNMP Stuff
# Optimize performance by sorting your community strings and SNMP version
# list, most frequently used to the left, least frequently used to the right
@mib_dir           = qw(/opt/vdops/share/snmp/mibs);
@mib_file          = qw/ALL/;
@snmp_read_list    = qw/public/;
@snmp_write_list   = qw/secret/;
@snmp_version_list = qw/2 1/;

# Syslog stuff
$syslog_facility = 'local5';
$syslog_host     = 'localhost';
$syslog_port     = 514;
$syslog_priority = 'info';
$syslog_socket   = 'unix';   # Other possibilities include 'udp' and
                             # 'stream'; depending on the flavor of Unix,
                             # I've employed each of these

# Target details
@skip_name = qw/swamp /;
@suffixes  = qw/-agw -dgw -esx -nat -rtr -vpn/;

# Timers
$pause = 130;   # This is how long in seconds I'll pause before
                # moving on to the next device in the list
$wait  = 420;   # This is how long in seconds I'll wait for a device
                # to answer pings after being rebooted before
                # declaring it dead and aborting this script

# These are the OIDs for the devices which I know how to reboot.
# Each OID includes its trailing instance number ('.0'); do_the_work
# splits every entry into a branch and a leaf at the last dot before
# handing it to snmpSet.
%reboot_oids = (
  AT    => '.1.3.6.1.4.1.207.8.4.4.3.2.0',       # Allied Telesyn restart
  HP    => '.1.3.6.1.4.1.11.2.14.11.1.4.1.0',    # Hewlett-Packard hpicfReset
  CatOS => '.1.3.6.1.4.1.9.5.1.1.9.0',           # Cisco CatOS sysReset
  IOS   => '.1.3.6.1.4.1.9.2.9.9.0',             # Cisco IOS tsMsgSend
  TAOS  => '.1.3.6.1.4.1.529.9.8.0.0'            # Ascend sysReset
);


# Grab arguments
# The switch list must carry every flag named in $usage: 'r' (report mode)
# is a plain flag and is tested below, so it has to be declared here or
# getopts rejects it and $option{r} is never set.  getopts returns false
# when it meets a switch it does not know; stop right there rather than
# carry on towards rebooting devices with a half-understood command line.
getopts('ad:e:f:rs:', \%option) or die "$usage\n";
@target = @ARGV;

# Set mode
#  report      -- requested explicitly with -r
#  interactive -- STDIN is attached to a terminal
#  batch       -- everything else
if    ($option{r}) { $mode = 'report' }
elsif (-t STDIN)   { $mode = 'interactive' }
else               { $mode = 'batch' }



### Begin Main Program ###############################################
{
  check_args();        # Check arguments
  compile_mibs();      # Compile MIB files
  build_target();      # Populate @target
  target_check();      # Look for errors in @target
  basic_info();        # Gather information
  print_before();      # Tell operator what I will do
  do_the_work();       # Go for it
  print_report();      # Tell the operator what I did
}
##### End Main Program ###############################################



########################################################################
# Reboot each device in sequence, pausing after each reboot, then
# testing for pingability.  If at any point a box fails to answer a ping,
# bail.  This routine basically handles four cases:  table built from the
# four possible combinations of $dome (yes | no) and $mode (interactive |
# batch).  See the if/elsif construct in the middle to see how I handle
# these four cases.
########################################################################
sub do_the_work {

  # Walks the file-level @target list.  For each target:
  #   - looks up the reboot OID for the target's OS flavor
  #   - records the hardware alarm status
  #   - sets the reboot OID to INTEGER 2 via snmpSet (only when $dome is
  #     true; otherwise just pretends)
  #   - calls poll() and stores its verdict in %phoenix
  #   - compares the alarm status before and after, remembering devices
  #     which newly report an alarm in @devices_in_alarm
  # Returns 1 once every target has been handled.  Dies if a target has
  # no known reboot OID or if poll() reports that it did not come back.

  # Debug trace
  trace_location('begin') if $debug;

  # Loop through devices
  TARGET:
  for my $target (@target) {

    # Find the reboot OID for this flavor of device and split it into
    # branch and leaf at the last dot.  Complain clearly if I do not
    # know how to reboot this box, rather than tripping over an
    # uninitialized value further down.
    my $flavor = $os_flavor{$target};
    my $oid    = defined $flavor ? $reboot_oids{$flavor} : undef;
    die "I do not know how to reboot $target: no reboot OID\n"
      if not defined $oid;
    my ($branch, $leaf) = ($oid =~ /^(.*)\.(\d+)\Z/);
    my @varbind = ($branch, $leaf, 2, 'INTEGER');

    # Record the alarm status
    my $beginning_alarm_status = acquire_cisco_alarm_status($target);

    # Grab time
    my $time = get_time();

    # If we are playing for keeps, do it
    if ($dome) {
      print "Rebooting $target at $time ... " if $mode eq 'interactive';
      say('') if $debug;
      log_it("Rebooting $target at $time");
      my $result = snmpSet( { host => $target, varbind => \@varbind } );

      # Grab time
      $time = get_time();

      # Figure out what happened
      if ($result) {
        say "reset at $time" if $mode eq 'interactive';
        log_it("$target reset");
      }
      else {
        say "failed to reset at $time" if $mode eq 'interactive';
        log_it("$target failed to reset");
      }
    }

    # Otherwise just pretend.  Nothing was sent, so there is no outcome
    # to report: in particular, do not print or log 'failed to reset'
    # for a device which was never touched.
    else {
      print "Rebooting $target at $time ... " if $mode eq 'interactive';
      sleep $short;
      say 'just kidding' if $mode eq 'interactive';
    }

    # Wait a little to give the SNMP agent time to reboot the device
    sleep $mid;

    # Did the device survive the experience?
    $phoenix{$target} = poll($target);

    # If the device survived, continue; otherwise, die
    if ($phoenix{$target}) {
      print_it("$target is answering pings, continuing\n");
    }
    else {
      print_it("$target did not return to life, bailing\n");
      die "$target did not return to life\n";
    }

    # Record the alarm status
    my $ending_alarm_status = acquire_cisco_alarm_status($target);

    # If the device changed alarm state across the reboot, log the change.
    # These messages end in a newline, like the other print_it calls in
    # this routine.
    if ($beginning_alarm_status == 1 and $ending_alarm_status == 0) {
      print_it("Cleared hardware alarm on $target\n");
    }
    elsif ($beginning_alarm_status == 0 and $ending_alarm_status == 1) {
      print_it("$target is now reporting a hardware alarm\n");
      push @devices_in_alarm, $target;
    }
    elsif ($beginning_alarm_status == 1 and $ending_alarm_status == 1) {
      print_it("$target is still reporting a hardware alarm\n");
    }
  }

  # Debug trace
  trace_location('end') if $debug;

  return 1;
}



########################################################################
# Did the device survive the reboot?  Ping it until it answers ... or
# until I exceed $wait
#
# Takes the name of the target.  Returns 1 if the target answered three
# pings within $wait seconds and then missed fewer than three pings
# during the $pause seconds which follow; returns 0 otherwise.
########################################################################
sub poll {

  my $target = shift;

  # Debug trace
  trace_location('begin') if $debug;

  # Phase one:  wait for the device to come back
  my $start = time();
  my $ticks = int( $wait / $short );
  my $hit   = 0;

  # Ping target every few seconds.  A hit bumps the score, a miss
  # knocks it back down (never below zero).
  POLL:
  for my $tick (1 .. $ticks) {
    sleep $short;
    if (ping_it($target)) {
      print $BANG if $mode eq 'interactive';
      $hit++;
      sleep $short;
    }
    else {
      $hit-- if $hit > 0;
      print $DOT if $mode eq 'interactive';
    }

    # If we have hit three pings, move on
    last POLL if $hit > 2;

    # If we have been waiting for longer than $wait, then bail
    last POLL if (time() - $start) > $wait;
  }

  # The device never collected its three pings within $wait:  declare it
  # dead now, rather than spending another $pause seconds pinging a box
  # which has not come back
  if ($hit < 3) {
    say('') if $mode eq 'interactive';
    trace_location('end') if $debug;
    return 0;
  }

  # Phase two:  I don't want to charge ahead here and reboot the next
  # device ... rather, I want to wait a little ... to let things like
  # routing tables and HSRP Active/Standby status to stabilize
  my $miss = 0;
  $ticks = int( $pause / $short );
  for my $tick (1 .. $ticks) {
    if (ping_it($target)) {
      print $BANG if $mode eq 'interactive';
    }
    else {
      print $DOT if $mode eq 'interactive';
      $miss++;
    }
    sleep $short;
  }
  say('') if $mode eq 'interactive';

  # Debug trace
  trace_location('end') if $debug;

  # Return result
  # If we have missed more than two pings during $pause, bail
  return $miss < 3 ? 1 : 0;
}



########################################################################
# Tell the operator what I did
########################################################################
sub print_report {

  my $handle;
  my $now = get_now();

  # Debug trace
  trace_location('begin') if $debug;

  # Direct output to screen or to file
  if ($mode eq 'interactive') {
    $handle = *STDOUT;
  }
  else {
    open $handle, '>>', $report_file or die "Cannot open $report_file: $!\n";
  }

  # Print report
  if ($dome) {
    print {$handle} "\n# Here are the boxes I rebooted\n";
  }
  else {
    print {$handle} "\n# Here are the boxes I would have rebooted, had you been serious\n";
  }

  # If devices are in alarm, list them
  if (@devices_in_alarm > 0) {
    print {$handle} "#\n";
    print {$handle} "# The following devices are reporting hardware alarms:\n";
    print {$handle} "# @devices_in_alarm\n";
    print {$handle} "#\n";
  }

  print <