#!/usr/bin/perl =head1 Black hole detection =encoding utf8 This feeds from Torque logs and decides if there is a black hole =head2 PRE What is a black hole? It's difficult to say, but it can be something like this: - A worker node that kills jobs There are two kind of black holes - Fast BH, that kill the job very quickly - Slow BH, that has something less visible, that kills the job after some time =head3 Black Hole Scenarios 1- A single worker node finishes a high percentage of FAILED jobs in a short time 2- A single worker node finishes a many jobs DONE in a short time, but the jobs seem to be returning no error (and they actually fail) 3- A few worker nodes finish a high percentage of jobs like in cases 1 and 2. =head3 NO Black Hole Scenarios 4- Many worker nodes finish a high number of DONE jobs in a short time, because they are short jobs. This does not have to happen simultaneously. 5- Many worker nodes finish a high percentage of FAILED jobs. The jobs are probably bogus. =head3 Terminology (this is completely arbitrary, based just on personal experience): - A high percentage of FAILED jobs is more than 80% in the last 10 jobs - 'Many worker nodes' means more than 5 - A high number of DONE jobs is more than 10 jobs in one hour =head3 Other considerations - Every time there is a black hole we have to at least send an email - Every day (or any pre-defined period) we should also send an email with a list of WN that fail jobs. It's a way that humans can catch Slow BH. - This could be a good place to gather also some general statistics. =head2 DESIGN We should keep track when a job finishes (we don't need to know when it started). This is what's needed at least from that job: - Finish time - WN name - Exit value (we may not even need this) The black hole detection is done on every worker node, so we should keep track of every WN separately. The most simple data structure would be a set of (time, exit value) for every WN. =head3 Information from Torque The files in /var/spool/pbs/server_priv/accounting/ gives a line every time a job changes status. This is there kind of line we are looking for (state:E) 04/26/2010 00:19:03;E;3804127.ce01.lcg.cscs.ch;user=atlasprd group=atlas jobname=ANA_6483b4df_79 queue=atlas ctime=1272231881 qtime=1272231881 etime=1272231881 start=1272232376 owner=atlasprd@arc01.lcg.cscs.ch exec_host=wn24.lcg.cscs.ch/9 Resource_List.cput=25:25:23 Resource_List.mem=2000mb Resource_List.neednodes=1 Resource_List.nodect=1 Resource_List.nodes=1 Resource_List.walltime=25:25:23 session=13638 end=1272233943 Exit_status=0 resources_used.cput=00:22:19 resources_used.mem=598580kb resources_used.vmem=784920kb resources_used.walltime=00:26:07 =cut use strict; use warnings; use vars qw(%BYHOST %BYUSER %BANLIST $VERBOSE $DEBUG $ENABLE_EMAIL $ACTIVATE_BAN); # CONSTANTS about TRUE black holes use constant TRUE_BH_CHECK_TIME => 600; # Seconds to check back for true black holes use constant TRUE_BH_MIN_JOBS => 10; # The number of jobs to start caring about use constant TRUE_BH_FAIL_RATE => 0.9; # The rate of failures we don't allow use constant TRUE_BH_MAX_JOBS => 45; # The number of jobs we will never allow to finish (it was 30 with 8 slots) use constant TRUE_BH_MAX_BANNED => 3; # The number of nodes allowed to be banned # CONSTANT about POSSIBLE black holes use constant POSSIBLE_BH_CHECK_TIME => 7200; # Seconds to check back for possible black holes =item SendEmail (To, Subject, Body) This is a simple email sender sub. Source: http://perl.about.com/od/email/a/perlemailsub.htm Example: sendEmail("toemail\@mydomain.com", "fromemail\@mydomain.com", "Simple email.", "This is a test of the email function."); =cut sub SendEmail { my ($subject, $message) = @_; open(MAIL, "|/bin/mail -s '$subject' lcgroot\@cscs.ch"); print MAIL "$message\n"; close(MAIL); } # END of SendEmail =item ParseExitLine (text) An exit line is a line from Torque with a ;E; status. This sub parses what is after the last semicolon on that line =cut sub ParseExitLine { my $text = shift; my %ITEM; chop ($text); # Remove last new_line # Create a HASH with all entries foreach ( split (/ /,$text) ) { (my $key, my $var) = split (/=/,$_); $ITEM{$key}=$var; } # And return, in order, only the ones we need my $hostname = $ITEM{'exec_host'}; $hostname =~ s/\/.*//; # Remove the slash and everything afterwards return $hostname, $ITEM{'Exit_status'}, $ITEM{'end'}, $ITEM{'end'} - $ITEM{'start'}, # Total time in seconds $ITEM{'user'}; } # END of ParseExitLine =item PrintStatistics (minutes) This prints statistics from the last $minutes. 0 means no limit. =cut sub PrintStatistics { my $minutes = shift; my $timelimit; # If the parameter is 0, there should be no time limit if ($minutes > 0) { $timelimit = time() - ($minutes * 60); } else { $timelimit = 0; } my @result_matrix; # Stats by host my $jobcount; my $joberrors; my $totalexectime; my $errorrate; while( my ($hostname, $jobs) = each %BYHOST) { $jobcount = 0; $joberrors = 0; $totalexectime = 0; $errorrate = 0; # Gather info while( my ($jobname, $job) = each %$jobs) { # We only count it if it is after the time limit if ($job->{'endtime'} > $timelimit) { $jobcount++; if ( $job->{'exitvalue'} != 0 ) { $joberrors++; } $totalexectime += $job->{'totaltime'}; } } if ($jobcount > 0) { # Include the results into the matrix $errorrate = sprintf ("%.2f",$joberrors * 100 / $jobcount); unshift (@result_matrix, [$hostname, $jobcount, $joberrors, $errorrate, $totalexectime] ); }; } # We can sort the matrix as we want # @result_matrix = sort {$a->[1] <=> $b->[1]} @result_matrix; @result_matrix = sort {$a->[0] cmp $b->[0]} @result_matrix; # We gather general stats to compare with individual ones my $average_jobs = 0; my $average_error_rate = 0; my $average_exec_time = 0; foreach my $item (@result_matrix) { $average_jobs += $item->[1]; $average_error_rate += $item->[3]; $average_exec_time += $item->[4]; } if (scalar (@result_matrix) > 0) { $average_jobs /= scalar (@result_matrix); $average_error_rate /= scalar (@result_matrix); $average_exec_time /= scalar (@result_matrix); } # Print stats my $output = "*** Job Statistics ***\n"; $output .= " Total WN with jobs: ".scalar (@result_matrix)."\n"; $output .= " Average jobs per WN: ".sprintf ("%.2f",$average_jobs)."\n"; $output .= " Average error rate: ".sprintf ("%.2f",$average_error_rate)." %\n"; $output .= " Average execution time: ".int($average_exec_time)." s \n"; if (scalar (keys %BANLIST) > 0) { $output .= "*** Banned nodes ***\n"; foreach (keys %BANLIST) { $output .= " $_\n";} } $output .= "*** Nodes with (> average_jobs * 2) OR (< average_jobs / 3) OR (> 20% error_rate) :\n"; foreach my $item (@result_matrix) { #Only print 'interesting' information if ( ($item->[1] > $average_jobs * 2) # More than double of the average jobs or ($item->[1] < $average_jobs / 3) # Less than a third of the average jobs or ($item->[3] > 20) # More than 20% of the jobs failed ) { $output .= " $item->[0] JobCount:$item->[1] Errors:$item->[2] Error_rate:$item->[3]% TotalExecTime:$item->[4]\n"; } } print $output; if ($ENABLE_EMAIL) { SendEmail ("Node Statistics", "$output\n" );} } # END of PrintStatistics =item BanNode ($hostname, $reason); This bans a node from the Torque resource list. - We don't try to ban a node that was already banned. - We don't ban more than X nodes, just to prevent bugs in this program - We always inform the admins in any case =cut sub BanNode { my $hostname = shift; my $reason = shift; # Check if the node is already banned if ( defined $BANLIST{ "$hostname" } ) { print "NOT BANNING $hostname, already banned: $reason\n"; if ($ENABLE_EMAIL) { SendEmail ("Black Hole detected (no action taken)", "NOT BANNING $hostname, already banned: $reason\n"); } } elsif (keys (%BANLIST) >= TRUE_BH_MAX_BANNED){ print "NOT BANNING $hostname, max number of banned nodes reached (".TRUE_BH_MAX_BANNED."): $reason\n"; if ($ENABLE_EMAIL) { SendEmail ("Black Hole detected (no action taken)", "NOT BANNING $hostname, max number of banned nodes reached (".TRUE_BH_MAX_BANNED."): $reason\n"); } } else { print "BANNING $hostname: $reason\n"; $BANLIST{"$hostname"} = time(); if ($ACTIVATE_BAN) { system "/usr/bin/pbsnodes -o $hostname"; } if ($ENABLE_EMAIL) { SendEmail ("Black Hole detected", "BANNING $hostname: $reason\n" ); } } } # END of BanNode =item LookForTrueBlackHoles ($hostname, $timenow); This looks for black holes in the hostname starting from end_time backwards that we know are true black holes. We must be sure of that. We ban the node if that's the case. - There is another function to search for possible black holes - The time limit to search back is defined by the constant TRUE_BH_CHECK_TIME =cut sub LookForTrueBlackHoles { # Parameters my $hostname = shift; my $timenow = shift; # Other vars my $jobcount = 0; # Counts the number of jobs my $failcount = 0; # The number of fails my $failrate = 0; # It's fails/total. my $tmp_hash = $BYHOST{"$hostname"}; # Gather relevant information while( my ($job, $jobdata) = each %$tmp_hash) { # See if we are in timeframe AND the job was not cancelled by the user if (($jobdata->{'endtime'} > $timenow - TRUE_BH_CHECK_TIME ) && ($jobdata->{'exitvalue'} != 271)) { $jobcount ++; # Count only if exited with error if ($jobdata->{'exitvalue'} != 0) { $failcount ++; } } } # Calculate the rest of the decission variables if ($jobcount > 0) { $failrate = $failcount / $jobcount; } # Decide if it is a black hole if ($DEBUG) { print "Deciding about $hostname. This period I found $jobcount jobs with $failcount failures \n"; } if ( ($jobcount > TRUE_BH_MIN_JOBS) && ($failrate > TRUE_BH_FAIL_RATE) ) { # We see too many jobs failing BanNode ($hostname, "Many jobs failing ($failcount/$jobcount)"); } elsif ($jobcount > TRUE_BH_MAX_JOBS) { # We just see too many jobs BanNode ($hostname, "Too many jobs finished ($jobcount) in last ".TRUE_BH_CHECK_TIME." seconds"); } } # END of LookForTrueBlackHoles =item SIGUSR1 () We want to capture this signal to print statistics when we want =cut sub SigUsr1Handler { PrintStatistics (0); } use sigtrap 'handler', \&SigUsr1Handler, 'USR1'; =item main () This is the beginning. We have to read lines from STDIO, and if it is an Exit line, perform a series of operations: - Parse the line and get the working elements - Insert the elements in the Memory arrays - Detect if the new element represents a Black Hole - Cleanup =cut $VERBOSE = 0; $DEBUG = 0; $ENABLE_EMAIL = 1; $ACTIVATE_BAN = 1; if ($ENABLE_EMAIL) { SendEmail ("Enabling Black Hole Detection", "Program started"); } while (<>) { my @inputline = split (/;/,$_); if ($inputline[1] eq 'E') { my $jobname = $inputline[2]; my ($host_name, $exit_value, $end_time, $total_time, $user_name) = ParseExitLine $inputline[3]; if ($DEBUG) { print "Job finished $jobname in Host:$host_name ExitValue:$exit_value EndTime:$end_time TotalTime:$total_time User:$user_name\n"; } # Now we feed the global variables with the new job information $BYHOST{ "$host_name" } { "$jobname" } = { exitvalue => "$exit_value", endtime => "$end_time", totaltime => $total_time, username => "$user_name" }; # $BYUSER{ "$user_name" } { "$jobname" } = { exitvalue => "$exit_value", endtime => "$end_time", totaltime => $total_time, hostname => "$host_name" }; # We consider "NOW" as the output time from last jobcount to look for black holes # Every time a new job ends we search for black holes on that node LookForTrueBlackHoles ($host_name, $end_time); # LookForPossibleBlackHoles ($host_name, $end_time); }; } PrintStatistics (0); =item while( my ($k, $v) = each %BYHOST ) { print "key: $k, value: $v.\n"; } foreach my $key (keys %{ $BYHOST {'wn31.lcg.cscs.ch'} }) { print "$key\n"; } print $BYHOST {'wn31.lcg.cscs.ch'}{'3808975.ce01.lcg.cscs.ch'}{'endtime'}; =cut