#!/opt/vdops/bin/perl # This script reboots a list of devices in sequence, waiting for each one # to return to life before continuing to the next. In addition, while a # device is rebooting, it pings a specified 'canary' device, tracking # whether or not the canary is hitting or missing pings. If either the # original target or the associated canary fail to recover, the script # bails and notifies the specified alpha pagers. At the end of a # successful run, the script produces a report. # V Who When What # --------------------------------------------------------------------------- # 1.7.6 skendric 2009-11-30 Load String::Similarity # 1.7.5 skendric 2009-10-28 Prettify the report when we bail due to failed # device # 1.7.4 skendric 2009-02-01 Replace Thread::Running with Thread::State # 1.7.3 skendric 2008-12-31 Support new snmpSet format # 1.7.2 skendric 2008-11-03 Log results of canary_check # 1.7.1 skendric 2008-02-05 Update command-line options # 1.7.0 skendric 2007-11-21 Check for hardware alarms in redundant partners # before rebooting # 1.6.2 skendric 2007-10-02 Dawdle after rebooting pokey devices # 1.6.1 2007-09-03 Filter target list using @skip_name # 1.6.0 skendric 2007-03-07 Prune target if associated canary starts off # dead # 1.5.9 skendric 2007-03-21 Stylistic mods # 1.5.8 skendric 2007-03-06 Simplify canary_check # 1.5.7 skendric 2007-01-15 Fix bug in bailing/paging routine # 1.5.6 skendric 2007-01-09 Require consecutive hits before declaring a # a target alive after a reboot # 1.5.5 skendric 2006-12-01 Replace Object Values with OIDs # 1.5.4 skendric 2006-09-05 Fix a couple warnings # 1.5.3 skendric 2006-06-04 Repeat paging loop when bailing # 1.5.2 skendric 2006-03-20 Add 'slosh' to the watch_canary algorithm # Fixed bug in canary_check which skipped last # target. 
Fixed bug in reporting which # prevented report file from being updated # 1.5.1 skendric 2005-11-05 Upgrade to new FHCRC::VDOPS module structure # 1.2.6 skendric 2005-10-10 Add time stamps to progress messages # 1.2.5 skendric 2005-08-22 More debugging, work around thread/return bug # 1.2.4 skendric 2005-08-08 More debugging # 1.2.3 skendric 2005-08-01 Fiddled with watch_canary algorithm # 1.2.2 skendric 2005-07-04 Fix parameters to snmpSet in do_the_work # 1.2.1 skendric 2005-06-08 Escalate all warnings to exceptions # 1.2.0 skendric 2005-05-09 Support Netops.pm-1.2 # 1.1.0 skendric 2005-05-08 Fix bug in watch_canary # 1.0.0 skendric 2004-11-12 First Version # Author: Stuart Kendrick, sbk {put at sign here} skendric {put dot here} com # # Source: http://www.skendric.com/device # # This software is available under the GNU GENERAL PUBLIC LICENSE, see # http://www.fsf.org/licenses/gpl.html # # # This script takes the following high-level approach: # -Cycle through a list of devices, rebooting each in turn, # pausing between each reboot # -While it waits for the device to return to life, ping # a specified canary device # -If any device fails to return to life, bail # -For each device which does return to life, log the number # of pings which the canary device missed # # This script is modeled heavily off 'seq-reboot' # # Requirements: # -The target(s) must be pingable # # -The target(s) must reboot when sysReset.0 is set to "2" (CatOS) # or when tsMsgSend.0 is set to "2" (IOS). 
# # -IOS devices must have the line "snmp system-shutdown" in their # running config files # # -The following MIB modules stashed in /opt/vdops/share/snmp/mibs, # or wherever it is that you store MIB modules: # CISCO-PRODUCTS-MIB.my # # -PERL modules: the FHCRC::Netops collection # # # Assumptions: # # # Tested on: # -perl-5.10.1 # -net-snmp-5.5 # # # Instructions: # -Customize the script for your site: find the 'user-configurable # variables' section and modify as appropriate # -Play with the script in "firing blanks" mode, to get a feel for # what it does # -When you are ready to do damage, run it # # # Caveats: # # # Known Bugs: # # # To do: # -Add support for SNMPv3 # -Log canary results to a database # # Begin script # Load modules use strict; use warnings; use feature 'say'; use feature 'switch'; use threads; use threads::shared; use Carp qw(carp cluck croak confess); use Data::Dumper; use English qw( -no_match_vars ); use Getopt::Std; use List::MoreUtils qw(any first_index); use Net::Ping::External qw(ping); use String::Similarity; use Thread::Queue; use Thread::State; use FHCRC::Netops::CiscoTools 1.3.1; use FHCRC::Netops::HostTools 1.0.3; use FHCRC::Netops::NetopsTools 2.0.7; use FHCRC::Netops::NetopsData 1.3.0; use FHCRC::Netops::PingTools 1.1.5; use FHCRC::Netops::SNMPTools 1.3.9; use FHCRC::Netops::Utilities 1.3.9; # Declare global variables my $abort : shared; # Flag which watch_canary() sets to tell the # main thread to abort. 
# watch_canary() sets this flag if a canary goes down and stays down
# during an event
my %canary;            # Nodes behind @target (keyed by target name)
my @devices_in_alarm;  # List of devices which developed a hardware
                       # alarm after being rebooted
my $pause;             # Seconds to pause between devices
my %pings_down;        # A hash of references to arrays (keyed by $target)
                       # tracking the number of pings which the relevant
                       # canary missed for each event (typically, a two
                       # element array: the first element being the
                       # 'down' event and the second element the 'up' event)
my %more : shared;     # Boolean identifying whether or not a canary
                       # experienced more than two missed ping windows
my @pokey;             # Some devices need more time than others to recover;
                       # if a device belongs to this list, wait an extra
                       # ten minutes before continuing
my %reborn;            # Did it survive the reboot?
my @recipients;        # List of alpha pager identities to contact
                       # in event of issues
my %time_down;         # A hash of references to arrays (keyed by $target)
                       # tracking the number of seconds during which the
                       # relevant canary was missing pings for each event
                       # (typically, a two element array: the first
                       # element being the 'down' event and the second
                       # element the 'up' event)
my $tolerance;         # Once a target has been rebooted and has started
                       # to answer pings, how many hit pings do I require
                       # before declaring it up?  And while I'm waiting
                       # for routing tables and HSRP processes to recover,
                       # how many missed pings will I tolerate before I get
                       # nervous and bail?
my $wait;              # Seconds to wait for devices to reboot

# Define global variables
# NOTE(review): the variables assigned below without 'my' are not declared
# in this file; presumably they are exported by one of the FHCRC::Netops
# modules loaded above -- confirm.
$debug = 0;   # 10 = Logging
              #  9 = Database SELECT operations
              #  8 = Per IP/MAC/Port processing
              #  7 = Database INSERT/UPDATE/DELETE
              #  6 = Dump SNMP var
              #  5 = Dump snmp_packets
              #  4 = Grody: print big var
              #  3 = Verbose: print mid var
              #  2 = Simple: print small var
              #  1 = Basic: subroutine trace
              #  0 = Disable debugging
$program_name = 'red-reboot';
$usage = 'Usage: red-reboot -s {yes|no} [-d {integer}] [-r] -f {filename}';
$version = '1.7.6';

# Define user-configurable variables

# Binaries
$grab_hosts = '/bin/cat /etc/hosts';

# Notification
@recipients = qw/skendric/;
$snpp_host = 'snpp.fhcrc.org';

# Pause parameters (seconds); $short and $mid are used as sleep intervals
# by the reboot and polling routines
$long = 30;
$mid = 10;
$short = 5;

# Ping Stuff
$ping_count = 3;
$ping_timeout = 1;

# Report stuff
$report_file = '/home/netops/rpts/red-reboot.txt';
$report_subject = 'Redundancy Testing Report';

# Target details
@skip_name = qw//;

# Timers
$pause = 120;     # This is how long in seconds I'll pause before
                  # moving on to the next device in the list
$tolerance = 3;   # This is how many hit pings I require before declaring
                  # a target alive after a reboot ...
# or sick during pause time
$wait = 540;      # This is how long in seconds I'll wait for a device
                  # to answer pings after I've rebooted it before
                  # declaring it dead and aborting this script
@pokey = qw/dfsr-a-esx gbsr-a-esx j4sr-a-esx m1sr-a-esx m2sr-a-esx
            m3sr-a-esx m4sr-a-esx/;

# SNMP Stuff
# Optimize performance by sorting your community strings and SNMP version
# list, most frequently used to the left, least frequently used to the right
@mib_dir = qw(/opt/vdops/share/snmp/mibs);
@mib_file = qw/ALL/;
@snmp_read_list = qw/public/;
@snmp_write_list = qw/secret/;
@snmp_version_list = qw/2/;

# Syslog stuff
$syslog_facility = 'local5';
$syslog_host = 'localhost';
$syslog_port = 514;
$syslog_priority = 'info';
$syslog_socket = 'unix';   # Other possibilities include 'udp' and
                           # 'stream'; depending on the flavor of Unix,
                           # I've employed each of these

# Grab arguments
getopts('d:f:rs:', \%option);
die "Must specify a target list via -f\n" unless $option{f};

# Set mode: -r forces report mode; otherwise a terminal on STDIN means
# interactive, anything else means batch
if    ($option{r}) { $mode = 'report' }
elsif (-t STDIN)   { $mode = 'interactive' }
else               { $mode = 'batch' }

### Begin Main Program ###############################################
{
  check_args();      # Check arguments
# compile_mibs();    # Compile MIB files
  read_files();      # Populate @target and @canary
  target_check();    # Look for errors in @target
  canary_check();    # Look for errors in @canary
  basic_info();      # Gather information
  sanity_check();    # Sanity check
  print_before();    # Tell operator what I will do
  do_the_work();     # Go for it
  print_report();    # Tell the operator what I did
}
##### End Main Program ###############################################


########################################################################
# Something is wrong: talk about it, produce the report, and then die
#
# Argument: the name of the target being processed when things went wrong.
#
# Pages @recipients twice (five minutes apart), fills any missing
# per-target statistics with $DASH so that print_report() has something
# to print, emits the report, and then dies with "Bailing\n".  This
# routine never returns.
########################################################################
sub bail_gracefully {
  my $target = shift;

  # Debug trace
  trace_location('begin') if $debug;

  # Work out why we were called.  Either or both conditions may hold.
  my $target_died = (defined $reborn{$target} and $reborn{$target} == 0);
  my $canary_died = defined $abort;

  # Build the text
  my $text = '';
  $text .= "$target did not recover from a scheduled reboot. "
    if $target_died;
  $text .= "$canary{$target} has quit answering pings. "
    if $canary_died;
  $text .= '--red-reboot';

  # Build argument hash
  my %page_arg = (
    host      => $snpp_host,
    message   => $text,
    recipient => \@recipients,
  );

  # Notify relevant people.  Do it twice ... just to be sure.  Wait
  # between the pages only; there is no point in stalling the report
  # for another five minutes after the final page has gone out.
  my $pages = 2;
  for my $attempt (1 .. $pages) {
    send_page(\%page_arg);
    sleep 300 if $attempt < $pages;
  }

  # Populate the data hashes with dashes, as needed, so that targets we
  # never reached still show up in the report
  for my $name (@target) {
    $pings_down{$name}->[0] //= $DASH;
    $pings_down{$name}->[1] //= $DASH;
    $time_down{$name}->[0]  //= $DASH;
    $time_down{$name}->[1]  //= $DASH;
    $more{$name}            //= $DASH;
  }

  # Produce report
  print_report();

  # Tell the operator (and syslog, if we were playing for keeps) who died
  if ($target_died) {
    say "$target did not survive";
    log_it("$target did not survive") if $dome;
  }
  if ($canary_died) {
    say "$canary{$target} did not survive";
    log_it("$canary{$target} did not survive") if $dome;
  }

  # Debug trace (must precede die, otherwise it is never reached)
  trace_location('end') if $debug;

  # Die
  die "Bailing\n";
}


########################################################################
# Reboot each device in sequence, pausing after each reboot, then
# testing for pingability.  If at any point a box fails to answer a ping,
# bail.  This routine basically handles four cases: table built from the
# four possible combinations of $dome (yes | no) and $mode (interactive |
# batch).  See the if/elsif construct in the middle to see how I handle
# these four cases.
########################################################################
sub do_the_work {
  # Walks @target in order.  For each target: skip it if it looks like the
  # redundant partner of a device already in alarm, verify that its canary
  # answers pings, start a watch_canary() thread, reboot the target (only
  # when $dome is set), poll() it back to life, collect the canary
  # statistics from the thread, and compare the hardware-alarm status
  # before and after.  Calls bail_gracefully() (which dies) when the
  # canary or the target fails.  Returns 1 after the last target.
  my $data_queue;    # Queue for communicating with watch_canary
  my $thr;           # The watch_canary thread for the current target

  # Number of missed pings / seconds reported by watch_canary.
  # -2 means "never set"; -1 (below) means "thread could not be joined".
  my $pings = -2;
  my $time  = -2;

  # Debug trace
  trace_location('begin') if $debug;

  # Loop through devices
  TARGET:
  for my $target (@target) {
    my $beginning_alarm_status;
    my $ending_alarm_status;

    # If this device's partner is reporting a hardware alarm, skip this
    # device.  Partnership is guessed from name similarity.
    for my $device (@devices_in_alarm) {
      if ( similarity($target, $device) > .8 ) {
        log_it("I believe that $target and $device belong to the same redundant set, and I see that $device is reporting a hardware alarm, so I will skip $target");
        next TARGET;
      }
    }

    # Verify that the canary is still alive
    unless (ping_it($canary{$target})) {
      say "$canary{$target} is not answering pings, bailing";
      log_it("$canary{$target} is not answering pings, bailing") if $dome;
      $abort = 1;
      bail_gracefully($target);
    }

    # Record the alarm status
    $beginning_alarm_status = acquire_cisco_alarm_status($target);

    # Start pinging canary.  The thread is created in list context, as in
    # the original code.
    $data_queue = Thread::Queue->new;
    ($thr) = threads->new(\&watch_canary, $target, $data_queue);
    sleep $short;

    # If we are playing for keeps, do it
    if ($dome) {
      my ($result, @varbind);
      my $flavor = $os_flavor{$target} // '';

      if ($flavor eq 'CatOS') {
        say 'Setting sysReset.0 i 2 (reset)' if $debug > 3;
        push @varbind, '.1.3.6.1.4.1.9.5.1.1.9.0', 'INTEGER', 2;
        $result = snmpSet( {host => $target, varbind => \@varbind} );
      }
      elsif ($flavor eq 'IOS') {
        say 'Setting tsMsgSend.0 i 2 (reload)' if $debug > 3;
        push @varbind, '.1.3.6.1.4.1.9.2.9.9.0', 'INTEGER', 2;
        $result = snmpSet( {host => $target, varbind => \@varbind} );
      }
      else {
        say "I don't know how to reboot $target ... ";
        log_it("I don't know how to reboot $target") if $dome;
      }

      # Notify operator.  An undefined result (unknown OS flavor, or
      # snmpSet returned nothing) is reported as a problem rather than
      # being mistaken for 0.
      if (defined $result and $result eq '0') {
        say "Rebooting $target at " . scalar(localtime()) . ' ... failed';
        log_it("Rebooting $target ... failed");
      }
      elsif (defined $result and $result eq '1') {
        say "Rebooting $target at " . scalar(localtime()) . ' ... success';
        log_it("Rebooting $target ... success");
      }
      else {
        $result = 'undefined' unless defined $result;
        say "Problem rebooting $target at " . scalar(localtime())
          . "; result = $result";
        log_it("Problem rebooting $target; result = $result");
      }
    } # End 'Playing for keeps'

    # Otherwise we are just having fun
    else {
      print_it("Rebooting $target ... just kidding");
      sleep $short;
    } # End 'just having fun'

    # Wait a little to give the SNMP agent time to reboot the device
    sleep $mid;

    # Did the device survive the experience?
    $reborn{$target} = poll($target);

    # Now, tell watch_canary to stop and record the number of missed
    # pings.  watch_canary returns a single array reference holding
    # (sum of pings, sum of seconds, \@pings_per_event, \@secs_per_event).
    $data_queue->enqueue('quit');
    sleep $short;
    if ($thr->is_joinable) {
      my $results = $thr->join;
      ($pings, $time, $pings_down{$target}, $time_down{$target})
        = @{$results}[0 .. 3];
    }
    else {
      say "For $target, watch_canary($canary{$target}) isn't ready to join";
      log_it("For $target, watch_canary($canary{$target}) isn't ready to join") if $dome;
      $pings = -1;
      $time  = -1;
    }
    say "Summary: $canary{$target} missed $pings pings over $time seconds";
    log_it("Summary: $canary{$target} missed $pings pings over $time seconds") if $dome;

    # If watch_canary() is telling us to bail, then quit
    bail_gracefully($target) if $abort;

    # If the device survived, continue; otherwise, quit
    if ($reborn{$target} == 1) {
      say "Target $target is answering pings, continuing at "
        . scalar(localtime());
      log_it("Target $target is answering pings, continuing") if $dome;
    }
    elsif ($reborn{$target} == 0) {
      say "Target $target did not survive the reboot, bailing";
      log_it("Target $target did not survive the reboot, bailing") if $dome;
      bail_gracefully($target);
    }

    # Record the alarm status
    $ending_alarm_status = acquire_cisco_alarm_status($target);

    # If the device changed alarm state across the reboot, log the change
    if ($beginning_alarm_status == 1 and $ending_alarm_status == 0) {
      log_it("Cleared hardware alarm on $target");
    }
    elsif ($beginning_alarm_status == 0 and $ending_alarm_status == 1) {
      log_it("$target is now reporting a hardware alarm");
      push @devices_in_alarm, $target;
    }
    elsif ($beginning_alarm_status == 1 and $ending_alarm_status == 1) {
      log_it("$target is still reporting a hardware alarm");
    }
  } # End 'Loop through devices'

  # Debug trace
  trace_location('end') if $debug;

  return 1;
}


########################################################################
# Check @target and @canary for problems
#
# Every canary named in %canary must be a resolvable hostname (not an IP
# address) and must answer pings.  Targets whose canary fails any of
# these tests are removed from @target, and their %canary entries are
# deleted.  Dies with 'Canary integrity error' if a target is left
# without a canary.  Returns 1.
########################################################################
sub canary_check {
  my $alive_ref;     # List of canaries which answered a ping
  my $dead_ref;      # List of canaries which didn't answer a ping
  my $error_ref;     # List of canaries for which the pinger encountered
                     # an unspecified error
  my $unknown_ref;   # List of canaries which the pinger couldn't resolve
                     # into addresses
  my @resolved;      # Canaries which passed the name tests
  my @dead;          # Canaries which missed both ping attempts
  my @nuke;          # All problematic canaries

  # Debug trace
  trace_location('begin') if $debug;

  # Several targets may share a canary; test each canary only once
  my %seen;
  my @candidates = grep { not $seen{$_}++ } values %canary;

  # Notify operator
  print_it('Resolving canary list to IP addresses...');

  # Loop through canaries, checking for errors
  CANARY:
  for my $canary (@candidates) {

    # Initialize %error
    $error{$canary} = $EMPTY_STR;

    # If host is an IP address, remove from list.  The pattern is anchored
    # so that hostnames which merely contain dotted digits are not rejected.
    if ($canary =~ /\A\d{1,3}(?:\.\d{1,3}){3}\z/) {
      print_it("\n$canary must be a hostname, not an IP address, ignoring");
      push @nuke, $canary;
      next CANARY;
    }

    # If host is not resolvable to an IP address, remove from list
    unless (defined(gethostbyname($canary))) {
      print_it("\n'$canary' is not a known hostname, ignoring");
      push @nuke, $canary;
      next CANARY;
    }

    # Record successes
    push @resolved, $canary;

    # Entertain operator
    print $BANG if $mode eq 'interactive';
  }
  say "\n" if $mode eq 'interactive';

  # Notify operator
  say 'Pinging canary list...' if $mode eq 'interactive';

  # Ping the canaries which survived name resolution
  ($alive_ref, $dead_ref, $unknown_ref, $error_ref) = ping_list(\@resolved);

  # Give the dead hosts a second chance
  for my $host (@$dead_ref) {
    if (ping_it($host)) {
      say "$host returned a ping on second chance" if $debug > 3;
    }
    else {
      push @dead, $host;
    }
  }

  # Whine about dead canaries.  Only canaries which really missed their
  # pings are listed here; name problems were reported above.
  for my $dead (@dead) {
    print_it("$dead is not returning pings");
  }
  for my $error (@$error_ref) {
    print_it("Encountered an error when pinging $error");
  }
  for my $unknown (@$unknown_ref) {
    print_it("Encountered an unknown problem when pinging $unknown");
  }

  # Toss dead canaries, unknowns and errors
  push @nuke, @dead, @$unknown_ref, @$error_ref;

  # Notify operator
  say "\nRemoving @nuke\n" if ($mode eq 'interactive' and @nuke > 0);

  # Prune @target and %canary.  keys() returns a copy of the key list, so
  # deleting from %canary inside the loop is safe.
  for my $nuke (@nuke) {
    for my $target (keys %canary) {
      next unless $canary{$target} eq $nuke;

      # Remove every occurrence of a target which relies on a dead canary
      @target = grep { $_ ne $target } @target;

      # Remove dead canaries
      delete $canary{$target};
      print_it("Removing $target from list of devices to be rebooted");
    }
  }

  # Check that @target and %canary are consistent
  for my $target (@target) {
    unless (defined $canary{$target}) {
      print_it("$target exists in \@target, but \$canary{\$target} does not exist, bailing");
      die 'Canary integrity error';
    }
  }

  # Print debugging info
  if ($debug > 1) {
    for my $target (@target) {
      say "$target --> $canary{$target}";
    }
  }

  # Debug trace
  trace_location('end') if $debug;

  return 1;
}


########################################################################
# Did the device survive the reboot?  Ping it until it answers
# $tolerance times in succession ... or until I exceed $wait
#
# Argument: target name.
# Returns:  1 if the target came back and then missed no more than
#           $tolerance pings during the pause which follows;
#           0 if it never came back within $wait seconds, or if it missed
#           more than $tolerance pings during the pause.
########################################################################
sub poll {
  my $target = shift;
  my $alive  = 0;    # Did the target reach $tolerance consecutive hits?
  my $hit    = 0;    # Number of consecutive pings which the target has hit
  my $miss   = 0;    # Number of pings which the target missed while pausing
  my $start;         # Start time
  my $ticks;         # Number of ticks to wait or pause

  # Debug trace
  trace_location('begin') if $debug;

  # Define local variables
  $start = time();
  $ticks = int $wait/$short;

  # Notify operator
  say "Waiting on $target";
  log_it("Waiting on $target") if $dome;

  # Ping target every few seconds
  POLL:
  for my $tick (1 .. $ticks) {
    sleep $short;
    if ( ping_it($target) ) {
      print $BANG if $mode eq 'interactive';
      $hit++;
      if ($hit >= $tolerance) {
        $alive = 1;
        last POLL;
      }
    }
    else {
      print $DOT if $mode eq 'interactive';
      $hit = 0;      # Require the hits to be consecutive
    }

    # If we have been waiting for longer than $wait, then bail
    if (time() - $start > $wait) {
      say "\nExceeded wait time of $wait for target $target";
      log_it("Exceeded wait time of $wait for target $target") if $dome;
      last POLL;
    }
  }

  # Make things look pretty
  say('') if $mode eq 'interactive';

  # A target which never answered is dead: report that right away rather
  # than sitting through the pause (and the pokey delay) first, which
  # would only postpone the page to the operators.
  unless ($alive) {
    say "Target $target did not answer $tolerance consecutive pings";
    log_it("Target $target did not answer $tolerance consecutive pings")
      if $dome;
    trace_location('end') if $debug;
    return 0;
  }

  # Wait a bit
  sleep $mid;

  # Calculate $ticks
  $ticks = int $pause/$short;

  # Notify operator
  say "\nPausing on $target";
  log_it("Pausing on $target") if $dome;

  # I don't want to charge ahead here and reboot the next device ...
  # rather, I want to wait a little ... to let things like routing
  # tables and HSRP Active/Standby status to stabilize
  for my $tick (1 .. $ticks) {
    if ( ping_it($target) ) {
      print $BANG if $mode eq 'interactive';
    }
    else {
      print $DOT if $mode eq 'interactive';
      $miss++;
    }
    sleep $short;
  }

  # If device needs more time to recover from a reboot than your average
  # bear, wait
  sleep 600 if any { $_ eq $target } @pokey;

  # Make things look pretty
  say('') if $mode eq 'interactive';

  # Report to operator
  if ($miss > 0) {
    say "\nTarget $target missed $miss pings during pause time";
    log_it("Target $target missed $miss pings during pause time") if $dome;
  }

  # Debug trace
  trace_location('end') if $debug;

  # Return answer
  return $miss <= $tolerance ? 1 : 0;
}


########################################################################
# Tell the operator what I did
#
# Writes the report to STDOUT in interactive mode, otherwise appends it
# to $report_file.  Returns 1.
#
# NOTE(review): part of this routine (the report heading heredoc and the
# start of the per-target output loop) was lost when this copy of the file
# was extracted; the damaged statement below is reproduced exactly as
# found and must be restored from the upstream source before this file
# will compile.
# NOTE(review): if open() fails, only a warning is issued and $handle is
# then used anyway -- confirm whether that is intended.
########################################################################
sub print_report {
  my $handle;
  my $now = get_now();

  # Debug trace
  trace_location('begin') if $debug;

  # Direct output to screen or to file
  if ($mode eq 'interactive') {
    $handle = *STDOUT;
  }
  else {
    open $handle, '>>', $report_file or warn "Cannot open $report_file: $!\n";
  }

  # If devices are in alarm, list them
  if (@devices_in_alarm > 0) {
    print {$handle} "#\n";
    print {$handle} "# The following devices are reporting hardware alarms:\n";
    print {$handle} "# @devices_in_alarm\n";
    print {$handle} "#\n";
  }

  # (damaged in extraction -- see NOTE above)
  print {$handle} <[0], $time_down{$target}->[0], $pings_down{$target}->[1], $time_down{$target}->[1], $more{$target}; }

  print {$handle} "\n\n\n\n";

  # Only close handles we opened ourselves; the STDOUT glob stringifies
  # to a name containing 'STDOUT'
  unless ($handle =~ /STDOUT/) {
    close $handle or warn "Cannot close $report_file: $!\n";
  }

  # Make things look pretty
  say "Ending $PROGRAM_NAME" if $mode eq 'interactive';
  log_it("Ending $PROGRAM_NAME");

  # Debug trace
  trace_location('end') if $debug;

  return 1;
}


########################################################################
# Tell the operator what I will do
########################################################################
sub print_before {
  # Returns 1 straight away in batch mode; otherwise prints a message for
  # the operator.
  #
  # NOTE(review): this copy of the file is damaged here.  Everything from
  # the heredoc which print_before() prints, through the end of
  # print_before(), the banner and 'sub watch_canary {' line, and the
  # declarations at the top of watch_canary(), was lost in extraction.
  # The damaged statement is reproduced exactly as found; restore the
  # missing text from the upstream source.  The code which follows it is
  # the body of watch_canary(), which do_the_work() starts as a thread
  # with the arguments ($target, $data_queue).

  # Debug trace
  trace_location('begin') if $debug;

  return 1 if $mode eq 'batch';

  print < $wait/2;
  say "partWay = $partWay" if $debug > 1;

  # Initialize variables
  $missed = 0;                    # Pings missed in the current event
  $more{$target} = $EMPTY_STR;    # Set to 'yes' below on a third event
  $slosh = 0;                     # Hits seen since the canary reappeared
  undef $begin;                   # Start time of the current event
  undef $end;                     # End time of the current event
  undef $signal;                  # Message from the main thread, if any

  # Loop for a long time: at most $wait iterations, one ping_it() call
  # per iteration
  PING:
  for ($t = 0; $t < $wait; $t++) {

    # If the device answers the ping
    if ( ping_it($canary) ) {

      # If $begin is not defined, but we have reached the part-way
      # point, then the 'down' event was a non-issue: the canary
      # didn't miss any pings.  Record 0 in the relevant data
      # structures and keep going.
      if (not defined $begin and $t == $partWay) {
        print $HASH;
        push @timeDn, 0;
        push @pingsDn, 0;
        say "\nDown event was a non-issue" if $debug > 1;
      }

      # If $begin is defined, but $end is not; then the canary has been
      # missing pings; and right this moment, it started to hit them
      # again.  I could declare it up ... but it turns out that canaries
      # sometimes waver at this point.  So wait until we have hit more
      # than five pings before declaring victory.  At that point, record
      # the event, reset $begin and $end and $missed
      # NOTE(review): $slosh is not reset when a ping is missed (see the
      # else branch below), so the hits counted here need not be
      # consecutive -- confirm whether that is intended.
      elsif (defined $begin and not defined $end) {
        $slosh++;
        if ($slosh > 5) {
          print $HASH;
          $end = time;
          $diff = $end - $begin;
          push @timeDn, $diff;
          push @pingsDn, $missed;
          undef $begin;
          undef $end;
          undef $diff;
          $missed = 0;
          $slosh = 0;
          say "\n$canary{$target} has begun to hit pings" if $debug > 1;
        }
      }

      # Otherwise, just entertain the operator
      else {
        print $HASH;
      }
    }

    # Otherwise, the canary missed the ping: increment $missed.
    # Also, if this is the first missed ping for this event, set $begin
    else {
      print $DASH;
      $missed++;
      unless (defined $begin) {
        say "\n$canary{$target} has begun to miss pings" if $debug > 1;
        $begin = time;
      }
    }

    # Check to see if the main thread would like us to quit.  Any defined
    # item taken from the queue ends the loop.
    $signal = $queue->dequeue_nb;
    last PING if defined $signal;
  } # End wait loop

  # Make things look pretty
  say('') if $mode eq 'interactive';

  # OK, now that we have finished pinging the canary, figure out
  # what happened

  # If we exited the 'for' loop because we reached $wait, something
  # is wrong ... the main thread should have asked us to quit before
  # we reached wait
  if ($t >= $wait) {
    say "\nSomething is wrong: watch_canary() reached wait $wait";
    log_it("Something is wrong: watch_canary() reached wait $wait") if $dome;
  }

  # If the canary isn't answering pings, then we have a problem: close
  # out the open event and set the shared $abort flag, which the main
  # thread checks after joining this thread.
  # (Another way to do this would be to check $missed ... unless
  # it is zero, we have a problem.)
  if (defined $begin and not defined $end) {
    $abort = 1;
    $end = time;
    $diff = $end - $begin;
    push @timeDn, $diff;
    push @pingsDn, $missed;
    say "$canary{$target} is still down, bailing";
    log_it("$canary{$target} is still down, bailing") if $dome;
  }

  # If $begin and $end are both undefined, then the canary was answering
  # pings when we exited the loop.  Figure out which data structures
  # need to be updated.  (Another way to do this would be to check $missed;
  # if it is zero, then we fall into this case.)
  elsif (not defined $begin and not defined $end) {

    # If the 'down' event was a non-issue, then the ($t == $partWay) clause
    # should have caught this and recorded 0 in $pingsDn[0] and $timeDn[0]
    # However, check this; and if it didn't, then record it now
    if (not defined $pingsDn[0]) {
      $pingsDn[0] = 0;
      $timeDn[0] = 0;
      say "\nDown event was a non-issue" if $debug;
    }

    # If the 'up' event was a non-issue, then we haven't yet had
    # time to record that fact ... handle this now
    if (not defined $pingsDn[1]) {
      $pingsDn[1] = 0;
      $timeDn[1] = 0;
      $end = time;
      say "\nUp event as a non-issue" if $debug;
    }
  }

  # Talk about the results: element 0 is the 'down' event, element 1 the
  # 'up' event, anything beyond that is unexpected
  for (my $k = 0; $k < @pingsDn; $k++) {
    if ($k == 0) {
      # 'Down' event
      say "\nWhen $target went down, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds";
      log_it("When $target went down, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds") if $dome;
    }

    # 'Up' event
    elsif ($k == 1) {
      say "When $target came up, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds";
      log_it("When $target came up, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds") if $dome;
    }

    # This means that the canary experienced more than two windows of
    # missed pings ... I don't understand why this happens, but I see it
    # occur regularly.  Record the event and whine
    else {
      say "For k=$k, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds ...";
      log_it("For k=$k, $canary missed $pingsDn[$k] pings over $timeDn[$k] seconds ...") if $dome;
      say "Partway point was $partWay seconds";
      log_it("Partway point was $partWay seconds") if $dome;
      $more{$target} = 'yes';
    }

    # Print additional debug info

    # Sum missed pings and time
    $sumPings += $pingsDn[$k];
    $sumTime += $timeDn[$k];
  }

  # Debug info
  if ($debug) {
    say('');
    say "sumPings = $sumPings";
    say "sumTime = $sumTime";
    say "pingsDn = @pingsDn";
    say "timeDn = @timeDn";
  }

  # As of 8-22-2005, a bug limits the number of arguments I can return to 1.
  # Stuff everything into an array and return a reference to it; the
  # caller (do_the_work) unpacks elements 0 through 3.
  my @results = ($sumPings, $sumTime, \@pingsDn, \@timeDn);

  # Debug trace
  trace_location('end') if $debug;

  # return ($sumPings, $sumTime, \@pingsDn, \@timeDn);
  return \@results;
}


########################################################################
# Output help
########################################################################
sub HELP_MESSAGE { print <