gridengine/flex-grid/site/qloadsensor

326 lines
13 KiB
Plaintext
Raw Normal View History

#!/bin/bash
# $Id: qloadsensor 180 2010-09-17 15:46:41Z kasper $
#
# qloadsensor:
# load sensor for particular file systems and floating licenses
#
# NB:
# 1) add the new complexes (via qconf -mc) for the following:
# * complex configurations managed in the shell script
# eg, 'perl -x qloadsensor'
# * complex consumables managed global
# eg, 'qlicserver -c'
# 2) initialize the global complex consumables to be managed
# eg, 'qlicserver -C'
#
# copyright (c) 2003-10 <Mark.Olesen@faurecia.com>
#
# Licensed and distributed under the Creative Commons
# Attribution-NonCommercial-ShareAlike 3.0 License.
# http://creativecommons.org/licenses/by-nc-sa/3.0
# -----------------------------------------------------------------------------
#
# impose default GridEngine environment + ascertain the binary architecture
#
# you likely don't need to adjust these values, since the loadsensor is called
# from sge_execd, which in turn is started from /etc/init.d/n1ge and
# these variables should be correctly exported there
#
# Bail out early when SGE_ROOT is unusable.  The message goes to stderr:
# stdout of this script carries the load report (begin ... end), so it must
# not be polluted with diagnostics.
if [ ! -d "$SGE_ROOT" ]
then
    echo "Error: SGE_ROOT=$SGE_ROOT not found" >&2
    exit 1
fi
# fall back to the 'default' cell when none was exported
: "${SGE_CELL:=default}"
# only ask util/arch when the architecture was not already exported
[ -n "$SGE_ARCH" ] || SGE_ARCH=$("$SGE_ROOT/util/arch")
export SGE_ROOT SGE_CELL SGE_ARCH
# -----------------------------------------------------------------------------
# this script should run as the 'admin_user' registered in 'bootstrap'
#
# When started as root, look up 'admin_user' in the cell's bootstrap file and
# re-execute this script as that user via adminrun.  Nothing happens when the
# entry is missing, is 'root', or is 'none' (compared case-insensitively).
if [ "$UID" -eq 0 ]
then
    admin_user=$(sed -ne 's/^admin_user *//p' "$SGE_ROOT/$SGE_CELL/common/bootstrap")
    : "${admin_user:=root}"
    case "$admin_user" in
    root | [Nn][Oo][Nn][Ee])
        # keep running with the current (root) identity
        ;;
    *)
        # replace this process; any script arguments are handed through
        exec "$SGE_ROOT/utilbin/$SGE_ARCH/adminrun" "$admin_user" "$0" "$@"
        ;;
    esac
fi
# <settings>
# ========================================================================
# now that we are the admin_user, we can source our standard settings
# - customize *all* settings there (eg, license server settings)
# - ENSURE THAT '$SGE_site' IS DEFINED !!!
#
# source the optional site-wide environment file
if [ -f "$SGE_ROOT/$SGE_CELL/site/environ" ]
then
    . "$SGE_ROOT/$SGE_CELL/site/environ"
fi
# define (unique) cluster name if not already defined:
# take it from the cell's 'cluster_name' file when that is readable,
# otherwise fall back to 'default'
if [ -z "$SGE_CLUSTER_NAME" ] && [ -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ]
then
    SGE_CLUSTER_NAME=$(cat "$SGE_ROOT/$SGE_CELL/common/cluster_name" 2>/dev/null)
fi
: "${SGE_CLUSTER_NAME:=default}"
export SGE_CLUSTER_NAME
# NB: assigned unconditionally, i.e. this overrides any SGE_site value that
# the environ file sourced above may have provided
SGE_site="$SGE_ROOT/flex-grid/site"
#
# ========================================================================
# </settings>
###############################################################################
###############################################################################
# CUSTOMIZE THESE SETTINGS - iff. required
# Command line for the license query and path of the disk monitor.
# In this file both are referenced only from commented-out lines.
# (each continuation line starts with the single blank that separates the
# arguments; the backslash-newline itself is removed by the shell)
qlicserver="$SGE_site/qlicserver\
 config=$SGE_site/../config/local_licenses.conf\
 dir=$SGE_ROOT/flex-grid/cache\
 output=$SGE_ROOT/flex-grid/cache/qlicserver.xml\
 qhost=qhost.xml\
 qstat=qstat.xml"
diskmon="${SGE_site}/diskmon.pl"
# END OF CUSTOMIZE SETTINGS
###############################################################################
###############################################################################
#
# the real (not compiled in) architecture
#
# architecture as reported by util/arch at run time
os_arch=$("$SGE_ROOT/util/arch")
SGE_utilbin="$SGE_ROOT/utilbin/$os_arch"
#
# set some constants
#
# HOST:   host name as printed by 'gethostname -aname'
# UQHOST: HOST cut at the first '.', used as prefix of every load value
HOST=$("$SGE_utilbin/gethostname" -aname)
UQHOST=${HOST%%.*}
SGE_qmaster=unknown
export SGE_qmaster
# -----------------------------------------------------------------------------
# act_qmaster
#
# print the unqualified host name (everything before the first '.') found
# in the "act_qmaster" file, or 'unknown' when the file cannot be read or
# is empty
#
# the file is looked up in $SGE_common when that is set, otherwise in
# $SGE_ROOT/$SGE_CELL/common
#
act_qmaster()
{
    local common_dir=${SGE_common:-$SGE_ROOT/$SGE_CELL/common}
    local name

    name=$(cat "$common_dir/act_qmaster" 2>/dev/null)
    name=${name%%.*}                # drop the domain part, if any
    echo "${name:-unknown}"
}
# -----------------------------------------------------------------------------
# df_info
#
# usage: df_info TAG MOUNT
#
# echo the TAG_{total,used,free} space on filesystem MOUNT as three load
# values prefixed with $UQHOST, in units of 1024 bytes
#
# gridengine uses the suffixes
# 'k' => blocksize 1000
# 'K' => blocksize 1024
#
# all three values are reported as 0K if MOUNT is missing, is not a
# directory, or 'df' fails / prints something that is not numeric
df_info()
{
    local tag=$1 mount=$2
    local total=0 used=0 avail=0
    local row fs_total fs_used fs_avail field numeric=yes

    if [ -n "$mount" ] && [ -d "$mount" ]
    then
        # POSIX output format in 1024-byte blocks; the last line holds
        #   filesys total used avail used% mount
        row=$(df -k -P "$mount" 2>/dev/null | tail -1)
        read -r _ fs_total fs_used fs_avail _ <<< "$row"

        # accept the row only if all three sizes are plain integers
        for field in "$fs_total" "$fs_used" "$fs_avail"
        do
            case "$field" in
            '' | *[!0-9]*) numeric=no ;;
            esac
        done
        if [ "$numeric" = yes ]
        then
            total=$fs_total
            used=$fs_used
            avail=$fs_avail
        fi
    fi

    echo "$UQHOST:${tag}_total:${total}K"
    echo "$UQHOST:${tag}_used:${used}K"
    # NB: a variant that reported '0' free space for a non-writable MOUNT
    # ( [ -w "$mount" ] ) existed here but is disabled
    echo "$UQHOST:${tag}_free:${avail}K"
}
# invariant values
# mips: the largest value found on any /proc/cpuinfo line mentioning 'mips'
# (matched case-insensitively, so both 'bogomips' and 'BogoMIPS' count),
# rounded to an integer because the 'mips' complex is declared as INT.
# 0 when /proc/cpuinfo is absent or has no such line.
mips=0
if [ -e "/proc/cpuinfo" ]
then
    mips=$(awk '
        BEGIN { best = 0 }
        tolower($0) ~ /mips/ { val = $NF + 0; if (val > best) best = val }
        END { printf "%.0f\n", best }
    ' /proc/cpuinfo)
fi
: "${mips:=0}"
unset os_name
# os_name: the output of 'lsb_release -ircs' joined with '_';
# 'unknown' when lsb_release is not installed (this string used to be
# misspelled 'unkown'), 'NONE' when lsb_release printed nothing
if [ -e "/usr/bin/lsb_release" ]
then
    os_name=$(/usr/bin/lsb_release -ircs | xargs echo | sed 's/ /_/g')
else
    os_name='unknown'
fi
: "${os_name:=NONE}"
# -----------------------------------------------------------------------------
# host_info
#
# report host specific information about filesystems, logins,
# special hardware extensions, etc.
#
host_info()
{
    # one "host:name:value" line per load value; the values themselves are
    # computed once at start-up (os_arch, os_name, mips)
    printf '%s:arch:%s\n' "$UQHOST" "$os_arch"
    printf '%s:os:%s\n' "$UQHOST" "$os_name"
    # disk usage: only /scratch is reported ('df_info tmp /tmp' is disabled)
    df_info scratch /scratch
    printf '%s:mips:%s\n' "$UQHOST" "$mips"
}
# -----------------------------------------------------------------------------
# iidle_info()
# report a machine's idle time
#
# parse the contents from /proc/interrupts, which looks like the following:
#
# CPU0
# 0: 23024789 XT-PIC timer
# 1: 13 XT-PIC keyboard
# 2: 0 XT-PIC cascade
# 5: 0 XT-PIC usb-uhci
# 8: 2 XT-PIC rtc
# 9: 0 XT-PIC acpi
# 10: 0 XT-PIC ehci-hcd, usb-uhci
# 11: 16687253 XT-PIC eth0, usb-uhci, Intel 82801DB-ICH4, nvidia
# 12: 20 XT-PIC PS/2 Mouse
# 14: 77178 XT-PIC ide0
# 15: 2 XT-PIC ide1
# NMI: 0
# LOC: 0
# ERR: 0
# MIS: 0
#
# or,
#
# CPU0 CPU1
# 0: 12820049 12818168 IO-APIC-edge timer
# 1: 42889 43309 IO-APIC-edge keyboard
# 2: 0 0 XT-PIC cascade
# 8: 2 0 IO-APIC-edge rtc
# 9: 0 0 IO-APIC-edge acpi
# 12: 287235 296531 IO-APIC-edge PS/2 Mouse
# 14: 47423 40923 IO-APIC-edge ide0
# 15: 2 3 IO-APIC-edge ide1
# 16: 7733868 7737081 IO-APIC-level nvidia
# 17: 159 156 IO-APIC-level Intel ICH 82801AA
# 19: 2155710 2159943 IO-APIC-level e100, usb-uhci
# NMI: 0 0
# LOC: 25641034 25641033
# ERR: 0
# MIS: 0
#
# Thus, we need the [-1, 1..$ncpu] fields for the following sources:
# keyboard, Mouse, serial
#
# NB: adding 'usb-uhci' gives problems, since this is sometimes
# attached to the ethernet card
#
# report the load value 'iidle': the time (seconds) since the summed
# interrupt count was last seen to change, or 0 if it changed since the
# previous call
#
# state carried between calls: "<time of last observed change> <count>"
last="0 -1";
# iidle_info
#
# echo "$UQHOST:iidle:<seconds>" and update the global 'last'.
#
# The perl helper sums the per-CPU counters (fields 1..ncpu) of the
# /proc/interrupts lines ending in keyboard, Mouse or serial.  If the sum
# equals the count saved in 'last', the idle time is "now - saved time";
# otherwise idle is 0 and the state becomes "now <sum>".  It prints
# "<idle> <time> <count>".
#
# If the helper prints fewer than three fields (e.g. perl cannot be run),
# an idle time of 0 is reported and 'last' is left untouched, so the report
# line always carries a value and the state survives the failure.
iidle_info()
{
    local report

    # $last is deliberately unquoted: it must be split into two arguments
    report=$(
        perl -e '
            my @last = @ARGV;
            @ARGV = "/proc/interrupts";
            $_ = <>;
            my $ncpu = s/\s*CPU\d+//g || 0;
            my ( $iidle, $int, $now ) = ( 0, 0, time );
            $int += $_
              for
              map { /\s+(keyboard|Mouse|serial)$/ ? (split)[ 1 .. $ncpu ] : (); }
              <>;
            if ( $int == $last[-1] ) {    # no interactivity since last round
                $iidle = ( $now - $last[0] );
            }
            else {
                @last = ( $now, $int );
            }
            print "$iidle @last\n";
        ' $last
    )

    set -- $report
    if [ "$#" -ge 3 ]
    then
        echo "$UQHOST:iidle:$1"
        shift
        last="$*"               # save for later
    else
        echo "$UQHOST:iidle:0"
    fi
}
# -----------------------------------------------------------------------------
#
# The execd running on the qmaster queries the license server
# The contents of 'act_qmaster' should suffice to migrate the load sensor
# for a controlled migration.
#
# locations below $SGE_ROOT/flex-grid used by the license queries
flexgrid_site="$SGE_ROOT/flex-grid/site"
flexgrid_conf="$SGE_ROOT/flex-grid/config"
flexgrid_cache="$SGE_ROOT/flex-grid/cache"
# host on which the license queries run; 'minos19' unless overridden by
# exporting QLOADSENSOR_LICENSE_HOST
license_host=${QLOADSENSOR_LICENSE_HOST:-minos19}

# run_qlicserver NAME [ARG...]
# query the licenses configured in NAME_licenses.conf and write the result
# to qlicserver_NAME.xml in the cache; ARG... (eg, timeout=60) are passed
# between the config= and output= arguments
run_qlicserver()
{
    local name=$1
    shift
    "$flexgrid_site/qlicserver" \
        "config=$flexgrid_conf/${name}_licenses.conf" \
        "$@" \
        "output=$flexgrid_cache/qlicserver_${name}.xml"
}

# resource_entries NAME
# print qlicserver_NAME.xml without everything up to and including the
# first <resources> line, and without any line containing 'qlicserver'
# or 'resources'
resource_entries()
{
    sed '1,/<resources>/d' "$flexgrid_cache/qlicserver_$1.xml" |
        grep -v qlicserver |
        grep -v resources
}

# one load report per line read from stdin; 'quit' ends the sensor with
# status 0, end-of-input with status 1
while :
do
    read -r input || exit 1         # wait for input
    [ "$input" = quit ] && exit 0
    echo begin                      # begin load report
    host_info                       # host information
    iidle_info                      # machine's idle time
    echo end                        # end load report

    # let the license query run between load reports
    # (disabled alternative: SGE_qmaster=$(act_qmaster) and compare $HOST
    #  against $SGE_qmaster instead of a fixed host name)
    if [ "$HOST" = "$license_host" ]
    then
        # (disabled: $qlicserver 2>> qloadsensor.err)
        run_qlicserver local
        run_qlicserver abaqus timeout=60
        # run_qlicserver trelis timeout=60        # disabled
        # NB: the comsol config used to be read from the fixed path
        # /opt/SGE/flex-grid/config; it now follows $SGE_ROOT like the others
        run_qlicserver comsol timeout=60
        run_qlicserver matlab timeout=60

        # merge the individual results into a single qlicserver.xml:
        #   abaqus - everything before its first </resources> line
        #   local, comsol - the entries only
        #   matlab - everything after its first <resources> line
        # NOTE(review): 'lockfile' is presumably procmail's lockfile(1),
        # which waits until it can create the lock - confirm
        lockfile "$flexgrid_cache/qlicserver.xml.lock"
        {
            sed '/<\/resources>/,$ d' "$flexgrid_cache/qlicserver_abaqus.xml"
            resource_entries local
            resource_entries comsol
            sed '1,/<resources>/d' "$flexgrid_cache/qlicserver_matlab.xml"
        } > "$flexgrid_cache/qlicserver.xml"
        rm -f "$flexgrid_cache/qlicserver.xml.lock"

        # (disabled: $diskmon -m 2>> qloadsensor.err)
        # (disabled: force rescheduling of express jobs,
        #  $SGE_site/qxprs >/dev/null 2>&1)
    # (disabled: else $diskmon 2>> qloadsensor.err)
    fi
done
exit 0  # we never get here, but just in case
#------------------------------------------------------------------------------
# feed via 'perl -x' to extract the 'host' complex configuration
# ('perl -x' discards everything above the '#!...perl' line below, so only
# the two-line perl program and its __DATA__ section are seen by perl)
#!/usr/bin/perl -w
# print every line that follows the __DATA__ marker, unchanged
print <DATA>
__DATA__
#
# host complex configuration
#
#name shortcut type relop requestable consumable default urgency
#---------------------------------------------------------------------------
tmp_total tmpt MEMORY <= YES NO 0 0
tmp_used tmpu MEMORY >= NO NO 0 0
tmp_free tmpf MEMORY <= YES NO 0 0
iidle iidle INT <= YES NO 0 0
mips mips INT <= YES NO 0 0
os os RESTRING == YES NO NONE 0
abaqus abaqus DOUBLE <= YES YES 0 0
cae cae DOUBLE <= YES YES 0 0
comsol comsol DOUBLE <= YES YES 0 0
hyper hyper DOUBLE <= YES YES 0 0
ifort ifort DOUBLE <= YES YES 0 0
matlab matlab DOUBLE <= YES YES 0 0
mcc mcc DOUBLE <= YES YES 0 0
multiphysics multiphysics DOUBLE <= YES YES 0 0
trelis trelis DOUBLE <= YES YES 0 0
scratch_free scratch_free MEMORY <= YES YES 0 0
scratch_total scratch_total MEMORY <= YES NO 0 0
scratch_used scratch_used MEMORY >= NO NO 0 0
# -----------------------------------------------------------------------------