#!/opt/vdops/bin/perl # This program stuffs a variable into a shared memory segment, sends a # pre-defined message to syslog, waits a little, and then checks the # value of the variable. If it has changed, it surmises that swatch # successfully launched its counter-part, 'pong-swatch', and that all is # well. If the variable hasn't changed, then it surmises that swatch # isn't well, and it pages duty with a warning message # V Who When What # --------------------------------------------------------------------------- # 1.6.0 skendric 2010-11-14 Upgrade to perl-5.12.2 # 1.5.4 skendric 2009-05-21 Comment out __DIE__ assignment # 1.5.3 skendric 2009-04-09 Replace get_date_and_time with get_date/time # 1.5.2 skendric 2008-09-04 Give pong-swatch more time to respond # 1.5.1 skendric 2008-07-17 Log to syslog while checking for growth # 1.5.0 skendric 2008-07-11 Manage both swatch and toclogd # 1.4.1 skendric 2008-06-30 Encapsulate warnings in talk_about_it # 1.4.0 skendric 2008-06-30 Record aberrant activity in a log # 1.3.1 skendric 2008-06-20 Restrict pages to wakeful hours # 1.3.0 skendric 2008-04-28 Add command-line parameters to control # paging and mailing behavior # 1.2.1 skendric 2008-04-16 Verify that we can read syslog; replace # manual interactivity check with IO::Interactive # 1.2.0 skendric 2008-03-25 Check that syslog is growing # 1.1.5 skendric 2008-02-27 Replace shriek_to_operator with send_page # 1.1.4 skendric 2008-02-27 Notify watchers via mail of significant events # 1.1.3 skendric 2007-11-19 Increase logging # 1.1.2 skendric 2007-11-09 Don't page if running interactively # 1.1.1 skendric 2007-06-21 Fiddle with log messages # 1.1.0 skendric 2007-05-04 Restart swatch as needed # 1.0.0 skendric 2006-12-16 First version # Load modules use strict; use warnings; use feature 'say'; use feature 'switch'; use Carp qw(carp cluck croak confess); use English; use Getopt::Std; use IO::Interactive qw(interactive is_interactive); use IPC::Shareable; use List::MoreUtils qw(any); use Proc::ProcessTable; use Sys::Hostname; use Time::Period; use FHCRC::Netops::NetopsData; use FHCRC::Netops::Utilities; # Declare global variables my $alive; # Boolean telling us whether or not swatch is alive my $alive_knot; # IPC::Shareable object my $glue = 'pisw'; # IPC::Shareable memory segment identifier my %knot_options; # Options to hand to IPC::Shareable my $logfile; # Where we record aberrant activity my $mail; # Boolean controlling whether or not we send mail # when we encounter difficulties my $max_proc; # Maximum number of swatch-related processes # we allow before intervening my $normal_swatch_pid_count; # The number of swatch processes in the table # under normal conditions my $normal_toclogd_pid_count; # The number of toclogd processes in the table # under normal conditions my $nodename; # Nodename of this host my @operators; # List of pager names to notify if swatch is dead my $owner; # The username of swatch's owner my $page; # Boolean: determines whether or not we'll page if # we are unable to get swatch working my %page_arg; # Arg hash for the send_page routine my $page_window; # A Time::Period window during which the operators # want to receive malfunctioning pages my $pause; # Seconds to wait for pong-swatch to change $alive my @process_names; # List of process names related to swatch my $shriek; # Msg to send to @operators if swatch is dead my $start_swatch; # Command to use for restarting swatch my $start_toclogd; # Command to use for restarting swatch my $syslog; # Location of syslog my @watchers; # List of e-mail addresses wanting to hear about # events # Define global variables $debug = 0; # 4 = Enable grody debugging # 3 = Enable verbose debugging # 2 = Enable simple debugging # 1 = Enable basic debugging # 0 = Disable debugging # The shared variable through which we communicate with pong-swatch %knot_options = ( key => $glue, create => 1, exclusive => 0, mode => 0664, destroy => 1, ); # Location of files $logfile = '/home/tocops/rpts/swatch-restart-history'; $syslog = '/var/log/syslog'; # Miscellaneous $nodename = hostname; $pause = 15; $program_name = 'ping-swatch'; $usage = 'ping-swatch [-s yes|no] [-p yes|no] [-m yes|no]'; $version = '1.6.0'; # Process details $max_proc = 20; $normal_swatch_pid_count = 3; $normal_toclogd_pid_count = 1; $owner = 'tocops'; @process_names = qw/swatch toclogd/; $start_swatch = '/etc/init.d/swatch start'; $start_toclogd = '/etc/init.d/toclogd start'; # Notification stuff #@operators = qw/duty/; @operators = qw/skendric/; $page_window = 'hr 5am-7pm'; $snpp_host = 'localhost'; @watchers = qw/skendric@fhcrc.org/; # Syslog stuff $syslog_facility = 'local5'; $syslog_host = 'localhost'; $syslog_port = 514; $syslog_priority = 'info'; $syslog_socket = 'unix'; # Other possibilites include 'udp' and # 'stream'; depending on the flavor of Unix, # I've employed each of these # Grab arguments getopts('d:m:p:s:', \%option); $debug = defined $option{d} ? $option{d} : 0 ; given ($option{s}) { when ('yes') { $dome = 1 } when ('no') { $dome = 0 } when (undef) { $dome = 1 } default { die '-s must be either yes or no' } } given ($option{p}) { when ('yes') { $page = 1 } when ('no') { $page = 0 } when (undef) { $page = 0 } default { die '-p must be either yes or no' } } given ($option{m}) { when ('yes') { $mail = 1 } when ('no') { $mail = 0 } when (undef) { $mail = 0 } default { die '-p must be either yes or no' } } ### Begin Main Program ############################################### { lay_ground_work(); # Initialization check_process_table(); # If the process table is overflowing with # swatch-related entries, kick swatch et al look_for_toclogd(); # Look for exactly one toclogd process; # else fix the issue tickle_syslog(); # Send tickler message to syslog respond_to_success(); # If swatch is alive, quit check_syslog(); # If syslog is frozen, whine restart_process('swatch'); # Stop and start swatch tickle_syslog(); # Send tickler message to syslog grin_or_wail(); # Quit quietly or notify operator of failure } ##### End Main Program ############################################### ######################################################################## # Given a string, add a date & time and append the line to the logfile ######################################################################## sub append_to_logfile { my $date; my $msg = shift; my $time; # Debug trace trace_location('begin') if $debug; # Get time $date = get_date(); $time = get_time(); # Prepend time stamp $msg = $date . $SPACE . $time . $SPACE . $SPACE . $msg; # Append to log file if (open my $log, '>>', $logfile) { print {$log} "$msg\n"; close $log or warn "Cannot close $logfile: $!"; } else { warn "Cannot open $logfile: $!"; log_it($msg); } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Populate %SIG # I've forgotten why I mess with signal handlers. Recently, # I found that assigning the __DIE__ handler lead to the following error # message: Can't locate DateTime/TimeZone/Local/linux.pm . So, I've # commented out the __DIE__ assigment 2009-05-21 --sk ######################################################################## sub assign_signals { # The stop process signals that we'll use to run shut_down() # $SIG{__DIE__} = \&shut_down; $SIG{HUP} = \&shut_down; $SIG{INT} = \&shut_down; $SIG{KILL} = \&shut_down; $SIG{STOP} = \&shut_down; $SIG{TERM} = \&shut_down; return 1; } ######################################################################## # Extract pids from the process table which are owned by $owner. If # this number exceeds a threshold, restart relevant processes ######################################################################## sub check_process_table { my $num_proc; # Number of processes owned by $owner my $table = Proc::ProcessTable->new(); # Debug trace trace_location('begin') if $debug; # Walk the process table, looking for likely prospects for my $p ( @{$table->table} ) { $num_proc++ if getpwuid($p->uid) eq $owner; } # If this number exceeds the threshold, restart related processes if ($num_proc > $max_proc) { my $subject = "Too many $owner processes in process table: $num_proc"; my $body = "Restarting @process_names"; talk_about_it($subject, $body, 2); for my $process (@process_names) { restart_process($process); } } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Verify that syslog is growing. If it isn't, whine ######################################################################## sub check_syslog { my $begin; # Size of syslog when we begin this routine my $end; # Size of syslog when we finish this routine # Debug trace trace_location('begin') if $debug; # Gather syslog size $begin = -s $syslog; log_it("Beginning syslog size = $begin bytes"); say "Pausing for $pause seconds" if $debug; sleep $pause; $end = -s $syslog; log_it("Ending syslog size = $end bytes"); # If syslog is frozen, whine and quit if ($begin == $end) { my $subject = "$nodename sees syslog as frozen"; my $body = "Size of $syslog equals $begin --ping-swatch"; talk_about_it($subject, $body, 5); } else { say 'Syslog is growing' if $debug; } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Extract processes from the process table which are likely associated # with the named process. Return a hash of entries keyed by pid ######################################################################## sub find_processes { my %likely; my $process_name = shift; my $table = Proc::ProcessTable->new(); # Debug trace trace_location('begin') if $debug; # Sanity check confess 'Must receive a process_name' unless defined $process_name; confess "Unknown process $process_name" unless any { $_ eq $process_name } @process_names; # Walk the process table, looking for likely prospects for my $p ( @{$table->table} ) { if (getpwuid($p->uid) eq $owner) { given ($process_name) { when ('swatch') { if ( looks_like_swatch($p->cmndline) ) { $likely{$p->pid} = $p->cmndline; } } when ('toclogd') { if ( looks_like_toclogd($p->cmndline) ) { $likely{$p->pid} = $p->cmndline; } } } # End 'given' } # End 'If process owned by $owner' } # End 'Walk the process table' # Debug info if ($debug) { say 'Processes are:'; for my $pid (keys %likely) { say " $pid: $likely{$pid}"; } } # Debug trace trace_location('end') if $debug; return \%likely; } ######################################################################## # If swatch is alive, beam with satisfaction and quit. Otherwise, # notify the operator of failure ######################################################################## sub grin_or_wail { my $body; my $subject; # Debug trace trace_location('begin') if $debug; # Figure out whether or not pong-swatch updated $alive given ($alive) { when (1) { # Log success message $subject = 'Swatch has been revived'; talk_about_it($subject, '', 2); } when (0) { # Shriek $subject = "Swatch on $nodename is dead"; $body = "Please restart swatch; see http://swatch.fhcrc.org for details --ping-swatch"; talk_about_it($subject, $body, 5); } when (undef) { $subject = 'Unexpected ping-swatch undef error'; $body = 'alive = undef, which does not make sense --ping-swatch'; talk_about_it($subject, $body, 4); } default { # This shouldn't happen $subject = 'Unexpected ping-swatch error'; $body = "alive = $alive, which does not make sense --ping-swatch"; talk_about_it($subject, $body, 4); } } # Leave tracks say "Ending $PROGRAM_NAME v$version" if is_interactive; log_it("Ending $PROGRAM_NAME v$version"); # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Given a reference to a hash of process names keyed by pids , kill them. # Use the second argument as a string for logging ######################################################################## sub kill_processes { my $pid_ref = shift; my $process_name = shift; my $body; my $result; my $subject; # Debug trace trace_location('begin') if $debug; # Sanity check confess 'Must receive a pid_ref' unless defined $pid_ref; confess 'Must receive a process_name' unless defined $process_name; confess 'Argument wrong type' unless ref $pid_ref eq 'HASH'; confess "Unknown process $process_name" unless any { $_ eq $process_name } @process_names; # Try killing them using SIGTERM if (keys %$pid_ref > 0) { my (@pids, @processes); # Notify operator say "Sending SIGTERM to $process_name processes..." if is_interactive; # Send SIGTERM @pids = keys %$pid_ref; @processes = values %$pid_ref; $subject = 'Sending SIGTERM to the following processes'; $body = join $CR, @processes; if ($dome) { say $subject if $debug; say $body if $debug; $result = kill 'TERM', @pids; say "Sent SIGTERM to $result pids" if $debug; $subject = "SIGTERMed $result pids"; talk_about_it($subject, '', 3); } else { print_it("$subject ... just kidding"); } } # End 'Try killing the with SIGTERM' # Refresh list of processes given ($process_name) { when ('swatch') { $pid_ref = find_processes('swatch') } when ('toclogd') { $pid_ref = find_processes('toclogd') } } # Try killing them using SIGKILL if (keys %$pid_ref > 0) { my (@pids, @processes); # Notify operator say "Sending SIGKILL to $process_name processes..." if is_interactive; # Send SIGKILL @pids = keys %$pid_ref; @processes = values %$pid_ref; $subject = 'Sending SIGKILL to the following processes'; $body = join $CR, @processes; if ($dome) { say $subject if $debug; say $body if $debug; $result = kill 'KILL', @pids; say "Sent SIGKILL to $result pids" if $debug; $subject = "SIGKILLed $result pids"; talk_about_it($subject, '', 3); } else { print_it("$subject ... just kidding"); } } # End 'Trying kill them with SIGKILL' # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Initialize stuff ######################################################################## sub lay_ground_work { # Debug trace trace_location('begin') if $debug; # Check for existence and readability of syslog die "Cannot read $syslog" unless -r $syslog; # Check for existenace and writeability of logfile warn "Cannot write to $logfile" unless -w $logfile; # Leave tracks log_it("Starting $PROGRAM_NAME v$version"); say "Starting $PROGRAM_NAME v$version" if is_interactive; # Assign signals assign_signals(); say 'Assigned signals' if $debug; # Create shared memory segment eval { $alive_knot = tie $alive, 'IPC::Shareable', $glue, { %knot_options } }; if ($@ ne $EMPTY_STR) { shut_down("Bailing from $PROGRAM_NAME v$version: unable to tie 'alive' via IPC::Shareable: $@"); } $alive = 0; say 'Created alive_knot' if $debug; # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Look for toclogd in the process table. If it isn't there, start it ######################################################################## sub look_for_toclogd { my $num; my $pid_ref; # Debug trace trace_location('begin') if $debug; # Retrieve a hash of pids $pid_ref = find_processes('toclogd'); # Count processes $num = keys %$pid_ref; # Respond to count given ($num) { when (1) { # Do nothing; this is normal } default { restart_process('toclogd'); } } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Given a string, return 1 if the line looks like it might belong to a # swatch process, 0 otherwise ######################################################################## sub looks_like_swatch { my $answer; my $string = shift; # Debug trace trace_location('begin') if $debug == 4; # Do the work given($string) { when (/swatch.conf.*syslog.*awk/) { $answer = 1 } when (/script/) { $answer = 1 } when (/tail/) { $answer = 1 } default { $answer = 0 } } # Debug info say "Process = $string" if $debug > 2; say ' ==>This is a swatch process' if ($answer and $debug > 2); # Debug trace trace_location('end') if $debug == 4; return $answer; } ######################################################################## # Given a string, return 1 if the line looks like it might belong to a # toclogd process, 0 otherwise ######################################################################## sub looks_like_toclogd { my $answer; my $string = shift; # Debug trace trace_location('begin') if $debug == 4; # Do the work given($string) { when (/toclogd/) { $answer = 1 } default { $answer = 0 } } # Debug info say "Process = $string" if $debug > 2; say ' ==>This is a toclogd process' if ($answer and $debug > 2); # Debug trace trace_location('end') if $debug == 4; return $answer; } ######################################################################## # If swatch is alive, beam with satisfaction and quit ######################################################################## sub respond_to_success { # Debug trace trace_location('begin') if $debug; # If $alive has been updated, log success message if ($alive == 1) { log_it('Swatch is alive'); say 'Swatch is alive' if is_interactive; say "Ending $PROGRAM_NAME v$version" if is_interactive; log_it("Ending $PROGRAM_NAME v$version"); exit 1; } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Given a process name, kill associated pids and restart the process ######################################################################## sub restart_process { my $body; my $subject; my $pid_ref; my $process_name = shift; # Debug trace trace_location('begin') if $debug; # Sanity check confess 'Must provide a process name' unless defined $process_name; # Look for processes $pid_ref = find_processes($process_name); # If we found swatch in the process table, kill it if (keys %$pid_ref > 0) { $subject = "$process_name in process table"; $body = "Killing $process_name"; talk_about_it($subject, $body, 3); kill_processes($pid_ref, $process_name); # Look for processes $pid_ref = find_processes($process_name); # If this process is still in the process table, shriek and give up if (keys %$pid_ref > 0) { $body = "Unable to kill $process_name processes --ping-swatch"; talk_about_it($subject, $body, 5); } } # Otherwise, this process isn't in the process table, note that fact and # continue else { $subject= "$process_name is not in the process table"; $body = 'continuing'; talk_about_it($subject, $body, 3); } # Restart process given ($process_name) { when ('swatch') { $subject = "Invoking $start_swatch"; system($start_swatch); } when ('toclogd') { $subject = "Invoking $start_toclogd"; system($start_toclogd); } } talk_about_it($subject, '', 3); say "Pausing for $pause seconds" if $debug; sleep $pause; # Look for processes $pid_ref = find_processes($process_name); # If this process isn't in the process table, shriek and bail given ($process_name) { when ('swatch') { unless (keys %$pid_ref == $normal_swatch_pid_count) { $subject = 'Swatch is not in the process table'; $body = 'Despite repeated attempts to restart it --ping-swatch'; talk_about_it($subject, $body, 5); } } when ('toclogd') { unless (keys %$pid_ref == $normal_toclogd_pid_count) { $subject = 'toclogd is not in the process table'; $body = 'Despite repeated attempts to restart it --ping-swatch'; talk_about_it($subject, $body, 5); } } } # Debug info trace_location('end') if $debug; return 1; } ######################################################################## # Destroy shared memory segment ######################################################################## sub shut_down { my $msg = shift; say $msg if is_interactive; log_it($msg); IPC::Shareable->clean_up(); exit 1; } ######################################################################## # Given several message strings and a severity level, talk about what # happened ######################################################################## sub talk_about_it { my $subject = shift; my $body = shift; my $level = shift; my %mail_arg; my %page_arg; # Debug trace trace_location('begin') if $debug; # Sanity check confess 'Must provide a subject' unless defined $subject; confess 'Must provide a body' unless defined $body; confess 'Must provide a level' unless defined $level; confess "Level $level must be an integer" unless $INTEGER->matches($level); # Consider speaking, logging, reporting, mailing, and paging given ($level) { when (1) { print {interactive} "$body\n"; # Progress to STDOUT } when (2) { print {interactive} "$body\n"; # Progress to STDOUT log_it("$subject $body"); # Log aberrant behavior to syslog } when (3) { print {interactive} "$body\n"; # Progress to STDOUT log_it("$subject $body"); # Log aberrant behavior to syslog append_to_logfile("$subject: $body"); # Record aberrant behavior } when (4) { print {interactive} "$body\n"; # Progress to STDOUT log_it("$subject $body"); # Log aberrant behavior to syslog append_to_logfile("$subject: $body"); # Record aberrant behavior # Mail watchers if ($dome and $mail) { %mail_arg = ( body => $body, subject => $subject, recipient => \@watchers ); send_mail(\%mail_arg); # Notify watchers via mail } } when (5) { print {interactive} "$body\n"; # Progress to STDOUT log_it("$subject $body"); # Log aberrant behavior to syslog append_to_logfile("$subject: $body"); # Record aberrant behavior # Mail watchers if ($dome and $mail) { %mail_arg = ( body => $body, subject => $subject, recipient => \@watchers ); send_mail(\%mail_arg); # Notify watchers via mail } # Page operators if ($dome and $page) { my $message = 'Duty: ' . $subject . $SPACE . $body; %page_arg = ( host => $snpp_host, message => $message, recipient => \@operators ); send_page(\%page_arg); # Notify operators via page } } } # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Given a string, return 1 if the line looks like it might belong to a # swatch process, 0 otherwise ######################################################################## sub tickle_syslog { # Debug trace trace_location('begin') if $debug; # Send 'ping' message to syslog log_it("Pinging swatch: are you there?"); say 'Sent ping message to syslog' if $debug; # Wait for swatch to do its work say "Pausing for $pause seconds" if $debug; sleep $pause; # Debug trace trace_location('end') if $debug; return 1; } ######################################################################## # Output help ######################################################################## sub HELP_MESSAGE { print <