#!/bin/ksh
#
#  
#
# File        : mon_runaway.sh
#
# Objective   : report possible (but plausible) runaway processes
#
# Argument    : -
#
# Returns 1st line : -1      Error detected
#                     0      No runaway processes
#                     1      Runaway processes
#         2nd line : Error or process information
#             
#
# Date (yyyymmdd)   Who              What
# ---------------   --------------   -------------------------------------------------
# 2006 10 20        Peter van Nes    Initial release

# Declare variables
# The values below are defaults. A value that is already set (and non-empty)
# in the environment takes precedence, so the thresholds can be tuned per
# host without editing this file.
# SCHEDPRIO  Threshold to indicate possible suspect processes.
#            Processes whose value in the ps "cpu" column is higher than
#            the threshold are marked as suspect processes.
#            Lowering the value will increase the possibility that a process
#            will be marked as a suspect process.
# SAMPTIME   Sample time in seconds. Specifies the interval between the
#            snapshots of the process list, which are used to calculate
#            the cumulative CPU time.
# MAXCPUPERC Threshold to mark processes as runaway. Variable specifies
#            the max. cumulative CPU time as a percentage of the sample
#            time.
# The variables are exported because the awk program in detect_runaway
# reads them through ENVIRON.
export SCHEDPRIO="${SCHEDPRIO:-10}"
export SAMPTIME="${SAMPTIME:-10}"
export MAXCPUPERC="${MAXCPUPERC:-75}"


# function detect_runaway
#
# Function can be run in two modes:
# a) Detect suspect processes ; run function without an argument
# b) Detect runaway processes ; run function with the process list from mode a
#                               as argument(s)
#
# Mode a; In this mode the function determines which processes might be runaway
#         processes. Processes whose value in the ps "cpu" column is higher than
#         SCHEDPRIO are reported as candidates. The function returns a list of
#         processes indicated by the PID and the cumulative CPU time (in seconds)
#         since the start of the process.
#         The format is {PID}+{CPU} {PID}+{CPU} {PI....
# Mode b; In this mode the function calculates which processes are detected as
#         runaway. Processes that are still above SCHEDPRIO AND have consumed
#         more CPU time since the mode a snapshot than the percentage of SAMPTIME
#         specified in MAXCPUPERC are reported as runaway. Only processes
#         specified in the argument, generated by running the function in mode a,
#         are investigated. The function returns a list of runaway processes in
#         the format {PID}({%CPU}%) {PID}({%CPU}%) ..
#
# Arguments : the mode a list, either as one quoted string or as separate
#             {PID}+{CPU} words (all arguments are joined)
# Reads     : SCHEDPRIO, SAMPTIME, MAXCPUPERC (must be exported; awk reads
#             them through ENVIRON)
# Output    : one line on stdout with the list (empty when nothing matched)
# Returns   : 1 when ps fails, otherwise the exit status of the pipeline
#
function detect_runaway {
  # typeset keeps these names local to the function
  typeset susp pslist awk_prog

  # The suspect list is space separated, so an unquoted caller hands it over
  # as several words. Joining all arguments handles both calling styles.
  susp="$*"

  # Take the snapshot first so that a failing ps is reported through the
  # return status instead of looking like an empty process list.
  pslist=$(ps -efo cpu,pid,time,args) || return 1

  # NOTE: the awk program is single-quoted; keep apostrophes out of it.
  awk_prog='
    # Convert a ps CPU time of the form [dd-][hh:]mm:ss into seconds.
    function cpusecs(t,    days, dash, n, part, i, s) {
      days = 0
      dash = index(t, "-")
      if (dash > 0) {
        days = substr(t, 1, dash - 1)
        t = substr(t, dash + 1)
      }
      n = split(t, part, ":")
      s = 0
      for (i = 1; i <= n; i++)
        s = s * 60 + part[i]
      return days * 86400 + s
    }

    BEGIN {
      prio     = ENVIRON["SCHEDPRIO"] + 0
      samptime = ENVIRON["SAMPTIME"] + 0
      maxperc  = ENVIRON["MAXCPUPERC"] + 0

      # Mode b: remember the CPU seconds of every suspect from the snapshot
      n = split(susp, entry, " ")
      for (i = 1; i <= n; i++)
        if (split(entry[i], pair, "+") == 2)
          base[pair[1]] = pair[2]
    }

    {
      # Input is sorted on cpu, highest first: nothing of interest follows
      if ($1 + 0 <= prio)
        exit

      secs = cpusecs($3)
      if (susp == "") {
        # Mode a: report the candidate with its cumulative CPU seconds
        outp = outp $2 "+" secs " "
      }
      else if ($2 in base) {
        # Mode b: CPU seconds used since the snapshot, as % of the sample time
        runsec = secs - base[$2]
        pct = runsec * 100 / samptime
        if (pct > maxperc)
          outp = outp $2 "(" pct "%) "
      }
    }

    END { print outp }
  '

  # sed drops the ps header line; sort puts the highest cpu values first so
  # awk can stop at the first process that is not above the threshold.
  sed '1d' <<EOF | sort -nr -k 1 | awk -v susp="$susp" "$awk_prog"
$pslist
EOF
}


# Main flow: take a first snapshot, wait SAMPTIME seconds, then check how much
# CPU time the suspects consumed in between. The result is reported on stdout
# as two lines: a status (-1, 0 or 1) followed by an explanation.

# Detect runaway candidates (first snapshot of the process list)
SPROC=$(detect_runaway)
RC=$?

if [[ $RC -ne 0 ]]
then
  # The snapshot failed: report an error instead of a false "all clear"
  echo "-1"
  echo "Error: unable to collect process information"
elif [[ -n "$SPROC" ]]
then
  # There are runaway candidates; wait some time to calculate the cpu usage
  sleep "$SAMPTIME"

  # Detect runaway processes based on the suspect list
  RUNAWAY=$(detect_runaway "$SPROC")
  RC=$?

  if [[ $RC -ne 0 ]]
  then
    # The second snapshot failed
    echo "-1"
    echo "Error: unable to collect process information"
  elif [[ -n "$RUNAWAY" ]]
  then
    # Output is not empty, so there are runaways
    echo "1"
    echo "$RUNAWAY"
  else
    # There are no runaways
    echo "0"
    echo "No runaway processes"
  fi
else
  # There are no suspect processes
  echo "0"
  echo "No suspect processes"
fi



