#!/opt/vdops/bin/perl

# This script checks to see if the current time falls within a user-defined
# window.  If so, it disables paging in NodeWatch, runs 'red-reboot', and then
# re-enables paging.  If not, it exits.

# V      Who       When        What
# ---------------------------------------------------------------------------
# 1.6.1  skendric  02-21-2011  Support Netops-1.4.0
# 1.6.0  skendric  02-05-2010  Upgrade to perl 5.10.1
# 1.5.8  skendric  10-02-2009  Remove InterNAP
# 1.5.7  skendric  08-02-2009  Specify timezone when constructing DateTime
#                              object
# 1.5.6  skendric  07-05-2009  Futz with service provider message
# 1.5.5  skendric  04-17-2009  Replace Date::EzDate with DateTime
# 1.5.4  skendric  03-02-2009  Sleep before notifying service providers
# 1.5.3  skendric  02-13-2009  Post begin/end messages to TOC
# 1.5.2  skendric  03-03-2008  Fix bug in mail routines which was skipping
#                              most recipients
# 1.5.1  skendric  01-07-2008  More debugging
# 1.5.0  skendric  12-04-2007  More debugging
# 1.4.2  skendric  12-03-2007  Update PNW GigaPOP IP addresses
# 1.4.1  skendric  11-26-2007  Make e-mail notification more robust
# 1.4.0  skendric  11-19-2007  Add command-line switches to disable/enable
#                              functions
# 1.3.0  skendric  11-05-2007  Update PNW-GigaPOP IP addresses
# 1.2.9  skendric  07-02-2007  Update service provider IP addresses
# 1.2.8  skendric  12-03-2006  Stylistic mods
# 1.2.7  skendric  09-04-2006  Add it-server to baby-sitter hash
# 1.2.6  skendric  06-18-2006  Add e-mail to BlueArc about indigo.fhcrc.org
# 1.2.5  skendric  06-04-2006  Run on Sunday but check to see if the following
#                              day is the first Monday of the month
# 1.2.4  skendric  11-07-2005  Upgrade to new FHCRC::VDOPS module structure
# 1.2.3  skendric  06-04-2005  Updated GigaPOP addresses
# 1.2.2  skendric  05-09-2005  Fixed bug in mail handling
# 1.2.1  skendric  04-24-2005  Enhanced debugging
# 1.2.0  skendric  03-05-2005  Handle case where service provider gear
#                              doesn't survive the testing
# 1.1.1  skendric  09-08-2004  Bug fixes in recipient list
# 1.1.0  skendric  09-01-2004  Send InterNAP and GigaPOP e-mail
# 1.0.1  skendric  08-08-2004  Replace Time::Period with Date::EzDate
# 1.0.0  skendric  07-18-2004  First version
#
#
# Author: Stuart Kendrick, sbk@skendric.com
#
# Source: http://www.skendric.com/device
#
# This software is available under the GNU GENERAL PUBLIC LICENSE, see
# http://www.fsf.org/licenses/gpl.html
#
#
# This script takes the following approach:
#  -Send e-mail to service providers, notifying them of start
#  -Open nodewatch.options, turn 'Node Reports via Action' off
#  -run 'red-reboot'
#  -Open nodewatch.options, turn 'Node Reports via Action' on
#  -Send e-mail to service providers, notifying them of end
#
#
# Requirements:
#
#
# Assumptions:
#
#
# Tested on:
#  -perl-5.12.2
#
#
# Instructions:
#  -Customize the script for your site: find the 'user-configurable
#   variables' section and modify as appropriate
#  -Try it out
#
#
# Caveats:
#
#
# Known Bugs:
#
#
# To do:
#
#

# Begin script

# Load modules
use v5.12.0;
use strict;
use warnings FATAL => 'all';
use feature 'say';
use feature 'switch';
use Data::Dumper;
use DateTime;
use English;
use File::Copy 'cp';
use Getopt::Std;
use Mail::Send;
use Regexp::Common;
use FHCRC::Netops::HostTools 1.0.5;
use FHCRC::Netops::NetopsTools 2.3.1;
use FHCRC::Netops::NetopsData 1.4.4;
use FHCRC::Netops::PingTools 1.2.0;
use FHCRC::Netops::SNMPTools 1.5.4;
use FHCRC::Netops::Utilities 1.4.5;

# Declare global variables
my $baby_sitter;                # 'yes'/'no' (from -b): notify the baby
                                # sitters or not
my %baby_sitter;                # Host monitoring service e-mail addresses,
                                # keyed by name of service provider
my %begin_status;               # Border router status when testing starts
my @border_routers;             # Names of the border routers
my %end_status;                 # Border router status when testing ends
my %monitored_hosts;            # Arrays of host names, keyed by service
                                # provider
my $node_file;                  # Path of the red-reboot node list
my $nodewatch_nodes;            # Path of nodewatch.options
my @operators;                  # The owners/managers of the testing
# function
my $temp_file;                  # Scratch copy of nodewatch.options
my $service_provider;           # 'yes'/'no' (from -p): notify the service
                                # providers or not
my %sp_address;                 # Service provider NOC e-mail addresses,
                                # keyed by service provider
my $subject;                    # Subject line of the operator reminder
my $target_date;                # Day of the week on which testing occurs
my $time_override;              # 'yes'/'no' (from -t): ignore the date/time
                                # filter or not
my $usage;                      # Usage message

# Define global variables
$debug        = 0;
$program_name = 'redundancy-testing';
$usage        = 'Usage: redundancy-testing -b yes|no [-d {integer}] -p yes|no -s yes|no -t yes|no';
$version      = '1.6.1';

# Define user-configurable variables
# Dates
$target_date = 'Monday';

# Files
$nodewatch_nodes = '/opt/vdops/etc/nodewatch/nodewatch.options';
$temp_file       = '/opt/vdops/etc/nodewatch/nodewatch.options.tmp';
$node_file       = '/home/netops/etc/red-reboot.nodes';

# Hosts
%monitored_hosts = (
    'BlueArc' => ['bluedisk.company.com'],
    'NetApp'  => ['netappdisk.company.com']
);

# Mail
%baby_sitter = (
    'BlueArc' => ['support@vendor1.com','sys-admin@company.com'],
    'NetApp'  => ['support@vendor2.com', 'sys-admin@company.com']
);
@operators  = qw/sys-admin@company.com/;
$subject    = 'Analyze results of red-reboot run';
%sp_address = (
    'ISP' => 'noc@isp.net',
);

# Ping Parameters
$ping_count   = 3;
$ping_timeout = 1;

# Target details
@border_routers = qw/isp-a-rtr isp-b-rtr/;

# Grab arguments
getopts('b:d:p:s:t:y:z:', \%option);

# Set debug level
$debug = $option{d} // 0;
unless ($RE{num}{int}->matches($debug)) {
    say '-d {debug level} must be an integer';
    exit;
}

# Set mode: a terminal on STDIN means a human launched us
$mode = (-t STDIN) ? 'interactive' : 'batch';


##### Begin Main Program ###############################################
{
    # Read config file
    read_config();

    # Check arguments
    handle_arguments();

    # Unless the time/date is appropriate, quit
    exit unless is_it_time();

    # Disable NodeWatch Paging
    flop('0') unless $debug;

    # Everything between disabling and re-enabling paging runs inside an
    # eval: if any step dies (warnings are fatal in this script), we must
    # still turn paging back on before giving up.
    my $testing_ok = eval {

        # Determine status
        router_status(\%begin_status);

        # Post to the TOC
        post_to_toc('begin');

        # Notify service providers
        notify_service_providers('begin');

        # Notify host monitoring services
        notify_baby_sitters('begin');

        # Run 'red-reboot'; list form keeps the arguments away from the shell
        my $wait_status = system(
            '/home/netops/bin/red-reboot',
            '-s', $dome, '-d', $debug, '-f', $node_file, '-r',
        );
        log_it("red-reboot ended abnormally (wait status $wait_status)")
            if $wait_status != 0;

        1;
    };
    my $testing_error = $@;

    # Enable NodeWatch Paging
    flop('1') unless $debug;

    # Now that paging is back on, propagate any failure from above
    unless ($testing_ok) {
        log_it("Redundancy testing aborted: $testing_error");
        die $testing_error;
    }

    # Determine status
    router_status(\%end_status);

    # Notify service providers
    notify_service_providers('end');

    # Notify host monitoring services
    notify_baby_sitters('end');

    # Post to the TOC
    post_to_toc('end');

    # Send tickler
    notify_operator();
}
##### End Main Program #################################################


########################################################################
# Change the value of 'Node Reports via Action' in nodewatch.options
#
# Takes the new value (0 or 1).  On the line mentioning 'Node Reports via
# Action', the first occurrence of the opposite digit is replaced by the
# new value.  The file is rewritten through $temp_file and copied back
# over $nodewatch_nodes.  Dies if the argument is not 0 or 1, or if a file
# cannot be opened, written or copied.  Returns 1.
########################################################################
sub flop {
    my $new = shift;

    # Debug trace
    trace_location('begin') if $debug;

    # The old value is simply the other digit
    die "flop() expects 0 or 1\n"
        unless defined $new and $new =~ /\A[01]\z/;
    my $old = $new ? 0 : 1;

    # Open files
    open my $input, '<', $nodewatch_nodes
        or die "Cannot open $nodewatch_nodes: $!";
    open my $output, '>', $temp_file
        or die "Cannot open $temp_file: $!";

    # Find option
    while (my $line = <$input>) {
        $line =~ s/$old/$new/ if $line =~ /Node Reports via Action/i;
        print {$output} $line;
    }

    # Clean up.  A failed close on the write handle means the temp file may
    # be incomplete, so refuse to copy it over the live options file.
    close $input  or warn "Cannot close $nodewatch_nodes: $!";
    close $output or die "Cannot close $temp_file: $!";

    # Copy temp file on top of original file
    cp($temp_file, $nodewatch_nodes)
        or die "Could not copy $temp_file to $nodewatch_nodes:$!";

    # Remove temp file
    unlink $temp_file;

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Handle arguments
#
# Validates the -b, -p, -s and -t switches (each must be 'yes' or 'no';
# anything else is fatal) and stores the switch value itself in
# $baby_sitter, $service_provider, $dome and $time_override.  When -s is
# 'no', mail destinations are redirected to the first operator.
# Returns 1.
########################################################################
sub handle_arguments {

    # Debug trace
    trace_location('begin') if $debug;

    # Baby sitters
    $baby_sitter = _yes_no_option('b', 'notifying baby sitters');

    # Service providers
    $service_provider = _yes_no_option('p', 'notifying service providers');

    # Seriousness
    $dome = _yes_no_option('s', 'serious');

    # Time override
    $time_override = _yes_no_option('t', 'overriding time');

    # If we aren't serious, reset baby sitters and service providers to
    # operators
    # NOTE(review): the keys below are hard-coded and differ from the
    # configured %baby_sitter / %sp_address keys -- confirm this is intended.
    if ($dome eq 'no') {
        print_it("We aren't serious, so set baby-sitter and service provider notification to first operator");
        %baby_sitter = ( 'BlueArc' => [ $operators[0] ] );
        $subject     = 'Analyze results of red-reboot run: dry run';
        %sp_address  = (
            'PNW GigaPOP' => $operators[0],
            'InterNAP'    => $operators[0]
        );
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Validate one yes/no command-line switch
#
# Takes the switch letter and a phrase describing what 'yes' means.  Dies
# unless the switch was given as 'yes' or 'no'; otherwise logs
# "We are <phrase>" or "We are not <phrase>" and returns the switch value.
########################################################################
sub _yes_no_option {
    my ($switch, $activity) = @_;
    my $value = $option{$switch};

    unless (defined $value and ($value eq 'yes' or $value eq 'no')) {
        die "The -$switch option must be either 'yes' or 'no'\n";
    }

    log_it($value eq 'yes' ? "We are $activity" : "We are not $activity");

    return $value;
}
########################################################################
# Is it time to run?
#
# Returns 1 when tomorrow (local time) is a $target_date falling within
# the first seven days of the month, or when $time_override is 'yes';
# returns 0 otherwise.  In interactive mode the verdict is printed.
########################################################################
sub is_it_time {

    # Debug trace
    trace_location('begin') if $debug;

    # Create object
    my $dt = DateTime->now( time_zone => 'local' );

    # This cronjob launches Sunday night ... but we check the following
    # Monday to see if it is the first Monday of the month
    $dt->add( days => 1 );

    # Is tomorrow the first Monday of the month?  A day number below 8 can
    # only be the first occurrence of that weekday in the month.
    my $day_of_week  = $dt->day_name;
    my $day_of_month = $dt->day;
    my $time_is_now
        = ($day_of_week eq $target_date and $day_of_month < 8) ? 1 : 0;

    # If we are in override mode, ignore the result of this calculation
    $time_is_now = 1 if $time_override eq 'yes';

    # Notify operator
    if ($mode eq 'interactive') {
        say $time_is_now ? 'It is time' : 'It is not time';
    }
    print_it("Ending $PROGRAM_NAME") unless $time_is_now;

    # Debug trace
    trace_location('end') if $debug;

    return $time_is_now;
}


########################################################################
# Remind staff to analyze results
#
# Sleeps ten minutes, then mails the contents of $node_file to @operators
# with subject $subject.  If $node_file cannot be opened, the mail is
# still sent and says so.  Returns 1.
########################################################################
sub notify_operator {

    # Debug trace
    trace_location('begin') if $debug;

    # Sleep a while, to let things settle out
    print_it("Sleeping for ten minutes");
    sleep 600;

    # Build message header
    my $msg = Mail::Send->new();
    $msg->to(@operators);
    $msg->subject($subject);

    # Build message body
    my $fh = $msg->open;
    if (open my $report, '<', $node_file) {
        while (my $line = <$report>) {
            print {$fh} $line;
        }
        close $report or warn "Cannot close $node_file: $!";
    }
    else {
        # Reading from a handle that failed to open is a fatal warning in
        # this script, so skip the copy and tell the operators instead.
        my $open_error = $!;
        warn "Cannot open $node_file: $open_error";
        print {$fh} "Cannot open $node_file: $open_error\n";
    }

    # Send message
    $fh->close or log_it("Cannot send $node_file to operators");

    # Make things look pretty
    log_it("Ending $PROGRAM_NAME");
    say "Ending $PROGRAM_NAME" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}
########################################################################
# Notify host monitoring services
#
# Takes the event name ('begin' or 'end').  Unless $baby_sitter is 'yes',
# nothing is sent.  Otherwise every service in %baby_sitter that has at
# least one entry in %monitored_hosts gets one message, copied to
# @operators.  Returns 1.
########################################################################
sub notify_baby_sitters {
    my $event = shift;

    # Debug trace
    trace_location('begin') if $debug;

    # Override: only send when asked to
    if ($baby_sitter eq 'yes') {

        # Walk through host monitoring services, notifying them of event
        BABY_SITTER:
        for my $bs (keys %baby_sitter) {
            my @hosts = @{ $monitored_hosts{$bs} // [] };
            next BABY_SITTER unless @hosts > 0;
            say "Processing $bs" if $debug;

            my $heading;
            my @body;

            # Beginning of maintenance work
            if ($event eq 'begin') {
                say 'Processing begin' if $debug;
                log_it("Sending mail to $bs at beginning");
                $heading = "FHCRC / Beginning Maintenance Work / $bs";
                @body    = (
                    "Dear $bs,\n",
                    "\n",
                    "Over the next handful of hours, we will reboot the\n",
                    "Ethernet switches feeding @hosts.\n",
                    "\n",
                    "\n",
                    "Please ignore alarms related to Ethernet NICs. We\n",
                    "will send another message when we finish our work.\n",
                    "\n",
                    " --Fred Hutchinson Cancer Research Center",
                    " Information Technology (it-server\@fhcrc.org)\n",
                );
            }    # End if event eq 'begin'

            # End of maintenance work
            elsif ($event eq 'end') {
                say 'Processing end' if $debug;
                log_it("Sending mail to $bs at end");
                $heading = "FHCRC / Ending Maintenance Work / $bs";
                @body    = (
                    "Dear $bs,\n",
                    "\n",
                    "We have finished our maintenance work, during which\n",
                    "we rebooted the Ethernet switches feeding ",
                    "@hosts\n",
                    "Please treat alarms following this message as an actual ",
                    "event.\n",
                    "\n",
                    " --Fred Hutchinson Cancer Research Center",
                    " Information Technology (it-server\@fhcrc.org)\n",
                );
            }    # End if event eq 'end'

            # Anything else has no message to send
            else {
                log_it("Unknown event '$event', not notifying $bs");
                next BABY_SITTER;
            }

            # Build message header
            my $msg = Mail::Send->new();
            $msg->to(@{ $baby_sitter{$bs} });
            $msg->cc(@operators);
            $msg->subject($heading);

            # Build message body
            my $fh = $msg->open;
            print {$fh} @body;

            # Send message
            $fh->close or log_it("Cannot send $event message to $bs");
        }    # End finish walking %baby_sitter
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Notify service providers of reboots
#
# Takes the event name ('begin' or 'end').  Unless $service_provider is
# 'yes', nothing is sent.  A provider is skipped when one of its routers
# was already down in %begin_status.  At 'end', the message differs
# depending on whether a router that was up at the start is now down.
# In debug mode the provider address is left off and only @operators are
# copied.  Returns 1.
########################################################################
sub notify_service_providers {
    my $event = shift;

    # Debug trace
    trace_location('begin') if $debug;

    # Override: only send when asked to
    if ($service_provider eq 'yes') {

        # Walk through service providers, notifying them of event
        SERVICE_PROVIDER:
        for my $sp (keys %sp_address) {
            say "Processing $sp" if $debug;
            my $heading;
            my @body;

            # Beginning of maintenance work
            if ($event eq 'begin') {
                say 'Processing begin' if $debug;

                # If any of our routers feeding this service provider is
                # already down, say nothing to them
                my @already_down = grep { $begin_status{$_} == 0 }
                    _routers_feeding($sp, @border_routers);
                next SERVICE_PROVIDER if @already_down;

                log_it("Sending mail to $sp at beginning");
                $heading = "FHCRC / Beginning Maintenance Work / $sp";
                @body    = (
                    "Dear NOC,\n",
                    "\n",
                    "Over the next couple hours, we will reboot the ",
                    "routers terminating our connections with you.\n",
                    "\n",

                    # Service provider specific information
                    "fhcrc-wes-pnw-.*.ptcpnt.pnw-gigapop.net (209.124.188.150, 209.124.190.150, 209.124.191.150): icarsttlwa01-02 and icarsttlwa45-01\n",
                    "fhcrc-ads-pnw-.*.ptcpnt.pnw-gigapop.net (209.124.188.152, 209.124.190.152, 209.124.191.152)\n",
                    "\n",
                    "Please ignore. We will send another message when we ",
                    "finish our work.\n",
                    "\n",
                    " --FHCRC Information Technology (netops\@fhcrc.org)\n",
                );
            }    # End if event eq 'begin'

            # End of maintenance work
            elsif ($event eq 'end') {
                say 'Processing end' if $debug;

                # Figure out whether any of our routers feeding this
                # service provider changed from up to down, or was down
                # before we even started
                my $lost = 0;
                my $skip = 'no';
                for my $router (_routers_feeding($sp, keys %begin_status)) {
                    if (    $begin_status{$router} == 1
                        and $end_status{$router} == 0)
                    {
                        $lost++;
                    }
                    elsif ($begin_status{$router} == 0) {
                        $skip = 'yes';
                    }
                }
                next SERVICE_PROVIDER if $skip eq 'yes';

                log_it("Sending mail to $sp at end");
                $heading = "FHCRC / Ending Maintenance Work / $sp";

                # All routers still up: tell them that things are fine
                if ($lost == 0) {
                    log_it("Message to $sp says that maintenance was successful");
                    @body = (
                        "Dear NOC,\n",
                        "\n",
                        "We have finished our maintenance work. From here on, ",
                        "please treat a loss of connectivity to one of these ",
                        "routers as an actual event.\n",
                        "\n",
                    );
                }

                # Otherwise, tell them that we have a problem and are
                # working on it
                else {
                    log_it("Message to $sp says that we have problems");
                    @body = (
                        "Dear NOC,\n",
                        "\n",
                        "We have finished our maintenance work. However, ",
                        "as you have no doubt noticed, some of our gear did ",
                        "not survive the experience. We are working on the ",
                        "issue and will send you another message when we ",
                        "have resolved the problem.",
                        "\n",
                    );
                }

                # Add service provider specific information
                push @body,
                    "fhcrc-wes-pnw-.*.ptcpnt.pnw-gigapop.net (209.124.188.150, 209.124.190.150, 209.124.191.150)\n",
                    "fhcrc-ads-pnw-.*.ptcpnt.pnw-gigapop.net (209.124.188.152, 209.124.190.152, 209.124.191.152)\n",
                    "\n",
                    "Thank you,\n",
                    "\n",
                    " --FHCRC Information Technology (netops\@fhcrc.org)\n";
            }    # End if event eq 'end'

            # Anything else has no message to send
            else {
                log_it("Unknown event '$event', not notifying $sp");
                next SERVICE_PROVIDER;
            }

            # Build message header
            my $msg = Mail::Send->new();
            $msg->to($sp_address{$sp}) unless $debug;
            $msg->cc(@operators);
            $msg->subject($heading);

            # Build message body
            my $fh = $msg->open;
            print {$fh} @body;

            # Send message.  We only get here when a message was opened, so
            # the handle is always valid.
            $fh->close or log_it("Cannot send $event message to $sp");
        }    # End finish walking %sp_address
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Pick the routers which feed a service provider
#
# Takes a service provider name and a list of router names.  A router
# named '<word>-<char>-rtr' belongs to the provider when <word> appears,
# case-insensitively, in the provider name.  Routers which do not follow
# that naming pattern are ignored.  Returns the matching router names.
########################################################################
sub _routers_feeding {
    my ($sp, @routers) = @_;
    my @feeding;

    for my $router (@routers) {
        my ($name) = $router =~ /(\w+)-\w-rtr/;
        next unless defined $name;
        push @feeding, $router if $sp =~ /\Q$name\E/i;
    }

    return @feeding;
}


########################################################################
# Post a message to the TOC
#
# Takes the event name; 'begin' and 'end' each log one fixed 'apager'
# string, any other value logs nothing.  Returns 1.
########################################################################
sub post_to_toc {
    my $event = shift;

    # Debug trace
    trace_location('begin') if $debug;

    # Send a string to syslog which the TOC will pick up and post to the
    # 'apager' frame
    if ($event eq 'begin') {
        log_it("apager Duty: Beginning high-availability testing of switches/routers, NodeWatch paging suppressed --sk");
    }
    elsif ($event eq 'end') {
        log_it("apager Duty: Ending high-availability testing of switches/routers, NodeWatch paging re-enabled --sk");
    }

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Figure out whether or not any of our border routers are down
#
# Takes a hash reference and stores the result of ping_it() for every
# entry of @border_routers in it, keyed by router name.  A result of 0 is
# logged as the router being down.  Returns 1.
########################################################################
sub router_status {
    my $status = shift;

    # Debug trace
    trace_location('begin') if $debug;

    # Notify operator
    say 'Pinging border routers...' if $mode eq 'interactive';

    # Ping the targets
    for my $router (@border_routers) {
        $status->{$router} = ping_it($router);
        log_it("$router is down") if $status->{$router} == 0;
    }

    # Make things look nice
    say "\n" if $mode eq 'interactive';

    # Debug trace
    trace_location('end') if $debug;

    return 1;
}


########################################################################
# Output help
########################################################################
sub HELP_MESSAGE { print <