#!/bin/bash
# $Id: qloadsensor 180 2010-09-17 15:46:41Z kasper $
#
# qloadsensor:
#   load sensor for particular file systems and floating licenses
#
# NB:
#   1) add the new complexes (via qconf -mc) for the following:
#      * complex configurations managed in the shell script
#        eg, 'perl -x qloadsensor'
#      * complex consumables managed globally
#        eg, 'qlicserver -c'
#   2) initialize the global complex consumables to be managed
#      eg, 'qlicserver -C'
#
# copyright (c) 2003-10 <Mark.Olesen@faurecia.com>
#
# Licensed and distributed under the Creative Commons
# Attribution-NonCommercial-ShareAlike 3.0 License.
# http://creativecommons.org/licenses/by-nc-sa/3.0
# -----------------------------------------------------------------------------

#
# impose default GridEngine environment + ascertain the binary architecture
#
# you likely don't need to adjust these values, since the loadsensor is called
# from sge_execd, which in turn is started from /etc/init.d/n1ge and
# these variables should be correctly exported there
#
# NB: the error message goes to stderr, since stdout carries the load report
#
[ -d "$SGE_ROOT" ] || { echo "Error: SGE_ROOT=$SGE_ROOT not found" >&2; exit 1; }
: "${SGE_CELL:=default}"
: "${SGE_ARCH:=$("$SGE_ROOT/util/arch")}"

export SGE_ROOT SGE_CELL SGE_ARCH

# -----------------------------------------------------------------------------
# this script should run as the 'admin_user' registered in 'bootstrap'
#
# When started as root, look up the 'admin_user' entry and re-exec this script
# under that account via 'adminrun' - unless the entry is missing, is 'root',
# or is 'none' (the latter compared case-insensitively).
#
if [ "$UID" -eq 0 ]
then
    # second field of the 'admin_user' line; empty if there is no such line
    admin_user=$(awk '$1 == "admin_user" { print $2; exit }' \
        "$SGE_ROOT/$SGE_CELL/common/bootstrap")
    : "${admin_user:=root}"

    admin_user_lc=$(printf '%s\n' "$admin_user" | tr 'A-Z' 'a-z')
    if [[ "$admin_user" != root && "$admin_user_lc" != none ]]
    then
        exec "$SGE_ROOT/utilbin/$SGE_ARCH/adminrun" "$admin_user" "$0"
    fi
fi

# <settings>
# ========================================================================
# now that we are the admin_user, we can source our standard settings
# - customize *all* settings there (eg, license server settings)
# - ENSURE THAT '$SGE_site' IS DEFINED !!!
#
site_environ="$SGE_ROOT/$SGE_CELL/site/environ"
if [ -f "$site_environ" ]
then
    . "$site_environ"
fi

# define (unique) cluster name if not already defined
cluster_name_file="$SGE_ROOT/$SGE_CELL/common/cluster_name"
if [ -z "$SGE_CLUSTER_NAME" ] && [ -r "$cluster_name_file" ]
then
    SGE_CLUSTER_NAME=$(cat "$cluster_name_file" 2>/dev/null)
fi
: "${SGE_CLUSTER_NAME:=default}"
export SGE_CLUSTER_NAME

# NB: assigned unconditionally, ie, this replaces any SGE_site value that
# the site environ file may have provided
SGE_site="$SGE_ROOT/flex-grid/site"

#
# ========================================================================
# </settings>

###############################################################################
###############################################################################
# CUSTOMIZE THESE SETTINGS - iff. required

# license query command line: the program followed by its key=value arguments
qlicserver="$SGE_site/qlicserver config=$SGE_site/../config/local_licenses.conf dir=$SGE_ROOT/flex-grid/cache output=$SGE_ROOT/flex-grid/cache/qlicserver.xml qhost=qhost.xml qstat=qstat.xml"

# disk monitor script
diskmon="$SGE_site/diskmon.pl"

# END OF CUSTOMIZE SETTINGS
###############################################################################
###############################################################################

#
# the real (not compiled in) architecture
#
os_arch=$("$SGE_ROOT/util/arch")
SGE_utilbin="$SGE_ROOT/utilbin/$os_arch"

#
# set some constants
#
HOST=$("$SGE_utilbin/gethostname" -aname)
UQHOST=${HOST%%.*}      # unqualified name: everything before the first '.'
SGE_qmaster=unknown
export SGE_qmaster

# -----------------------------------------------------------------------------
# act_qmaster
#
# echo the host name recorded in the "act_qmaster" file (the first word of
# its first line), or 'unknown' if the file is missing, unreadable or empty
#
# the file is looked up in $SGE_common; when that variable is not set, the
# directory defaults to $SGE_ROOT/$SGE_CELL/common
#
# NB: the name is passed through as-is, ie, a domain part is not stripped
#
act_qmaster()
{
    local common_dir=${SGE_common:-$SGE_ROOT/$SGE_CELL/common}
    local name=

    if [ -r "$common_dir/act_qmaster" ]
    then
        # 'read' still assigns when the last line lacks a trailing newline
        read -r name _ < "$common_dir/act_qmaster"
    fi

    echo "${name:-unknown}"
}

# -----------------------------------------------------------------------------
# df_info
#
# echo the $1_{total,used,free} space on filesystem $2
#
# gridengine uses the suffixes
#   'k' => blocksize 1000
#   'K' => blocksize 1024
#
# all three values are reported as 0K if $2 is missing, is not a directory,
# or if 'df' fails
#
# Globals:   UQHOST (read)
# Arguments: $1 - tag used as prefix of the reported names
#            $2 - directory on the filesystem to inspect
# Outputs:   three "host:name:value" lines on stdout
#
df_info()
{
    local tag=$1 mount=${2-}
    local df_line fs total used avail rest

    # 'df -P' columns: filesystem, total, used, available, capacity, mount
    if [ -d "$mount" ]
    then
        df_line=$(df -k -P -- "$mount" 2>/dev/null | tail -n 1)
        read -r fs total used avail rest <<< "$df_line"
    fi

    #!# we could add a check for a mount point mismatch, ie, compare
    #!# "$mount" against the last 'df' column

    # fall back to zeros unless 'df' supplied all three sizes
    if [ -z "$total" ] || [ -z "$used" ] || [ -z "$avail" ]
    then
        total=0 used=0 avail=0
    fi

    echo "$UQHOST:${tag}_total:${total}K"
    echo "$UQHOST:${tag}_used:${used}K"

    # NB: the free space is reported regardless of whether "$mount" is writable
    echo "$UQHOST:${tag}_free:${avail}K"
}

# invariant values
#
# mips: the largest numeric value found in the last field of the '*mips*'
# lines of /proc/cpuinfo, rounded to a whole number since the 'mips' complex
# is declared with type INT; 0 if /proc/cpuinfo does not exist
if [ -e "/proc/cpuinfo" ]
then
    mips=$(awk '
        /mips/ { if ($NF + 0 > best) best = $NF + 0 }
        END    { printf "%.0f\n", best }
    ' /proc/cpuinfo)
else
    mips=0
fi
: "${mips:=0}"      # awk produced nothing

unset os_name
# extract lsb_release: the lines printed by 'lsb_release -ircs' are joined
# into a single word, with '_' replacing the blanks
# - 'unknown' if lsb_release is not available
# - 'NONE' if lsb_release did not print anything
if [ -x "/usr/bin/lsb_release" ]
then
    os_name=$(/usr/bin/lsb_release -ircs | xargs echo | sed 's/ /_/g')
else
    os_name='unknown'
fi
: "${os_name:=NONE}"

# -----------------------------------------------------------------------------
# host_info
#
# report host specific information: architecture, operating system,
# space on the /scratch filesystem and the mips value
#
# Globals: UQHOST, os_arch, os_name, mips (all read)
# Outputs: "host:name:value" lines on stdout
#
host_info()
{
    printf '%s:arch:%s\n' "$UQHOST" "$os_arch"
    printf '%s:os:%s\n' "$UQHOST" "$os_name"
    # df_info tmp /tmp
    df_info scratch /scratch
    printf '%s:mips:%s\n' "$UQHOST" "$mips"
}

# -----------------------------------------------------------------------------
# iidle_info()
# report a machine's idle time
#
# parse the contents from /proc/interrupts, which looks like the following:
#
#            CPU0
#   0:   23024789   XT-PIC  timer
#   1:         13   XT-PIC  keyboard
#   2:          0   XT-PIC  cascade
#   5:          0   XT-PIC  usb-uhci
#   8:          2   XT-PIC  rtc
#   9:          0   XT-PIC  acpi
#  10:          0   XT-PIC  ehci-hcd, usb-uhci
#  11:   16687253   XT-PIC  eth0, usb-uhci, Intel 82801DB-ICH4, nvidia
#  12:         20   XT-PIC  PS/2 Mouse
#  14:      77178   XT-PIC  ide0
#  15:          2   XT-PIC  ide1
#  NMI:         0
#  LOC:         0
#  ERR:         0
#  MIS:         0
#
# or,
#
#            CPU0       CPU1
#   0:   12820049   12818168   IO-APIC-edge   timer
#   1:      42889      43309   IO-APIC-edge   keyboard
#   2:          0          0   XT-PIC         cascade
#   8:          2          0   IO-APIC-edge   rtc
#   9:          0          0   IO-APIC-edge   acpi
#  12:     287235     296531   IO-APIC-edge   PS/2 Mouse
#  14:      47423      40923   IO-APIC-edge   ide0
#  15:          2          3   IO-APIC-edge   ide1
#  16:    7733868    7737081   IO-APIC-level  nvidia
#  17:        159        156   IO-APIC-level  Intel ICH 82801AA
#  19:    2155710    2159943   IO-APIC-level  e100, usb-uhci
#  NMI:         0          0
#  LOC:  25641034   25641033
#  ERR:         0
#  MIS:         0
#
# Thus, we need the [-1, 1..$ncpu] fields for the following sources:
#   keyboard, Mouse, serial
#
# NB: adding 'usb-uhci' gives problems, since this is sometimes
#     attached to the ethernet card
#
# echo the 'iidle' load value: the number of seconds since the summed
# interrupt count of these sources was last seen to change, or 0 if it
# changed since the previous call
#
# the state needed for the next call is kept in the global 'last', which
# holds two words: "<time of last change> <interrupt count>"
#
# if perl does not print anything, 0 is reported and 'last' is left untouched
#
last="0 -1"
iidle_info()
{
    local report idle state

    # $last is deliberately left unquoted: perl receives its two words
    # as separate arguments
    report=$(
        perl -e '
            my @last = @ARGV;
            @ARGV = "/proc/interrupts";
            $_ = <>;

            my $ncpu = s/\s*CPU\d+//g || 0;
            my ( $iidle, $int, $now ) = ( 0, 0, time );

            $int += $_
              for
              map { /\s+(keyboard|Mouse|serial)$/ ? (split)[ 1 .. $ncpu ] : (); }
              <>;

            if ( $int == $last[-1] ) {    # no interactivity since last round
                $iidle = ( $now - $last[0] );
            }
            else {
                @last = ( $now, $int );
            }

            print "$iidle @last\n";
        ' $last
    )

    # first word: idle time; remainder: state to save for later
    read -r idle state <<< "$report"

    echo "$UQHOST:iidle:${idle:-0}"

    if [ -n "$state" ]
    then
        last=$state
    fi
}
# -----------------------------------------------------------------------------
#
# The execd running on the qmaster queries the license server
# The contents of 'act_qmaster' should suffice to migrate the load sensor
# for a controlled migration.
#
# NB: the comparison against the qmaster is currently disabled; the license
#     queries run on the host named in 'license_host' instead
#
flex_grid="$SGE_ROOT/flex-grid"
license_host=minos19

# query_licenses NAME [ARG ...]
#   run qlicserver with config/NAME_licenses.conf, passing any extra ARGs,
#   and let it write cache/qlicserver_NAME.xml
query_licenses()
{
    local name=$1
    shift
    "$flex_grid/site/qlicserver" \
        "config=$flex_grid/config/${name}_licenses.conf" \
        "$@" \
        "output=$flex_grid/cache/qlicserver_${name}.xml"
}

# resource_entries NAME
#   print the lines of cache/qlicserver_NAME.xml that follow the first
#   '<resources>' line, leaving out every line that contains 'qlicserver'
#   or 'resources'
resource_entries()
{
    sed '1,/<resources>/d' "$flex_grid/cache/qlicserver_$1.xml" |
        grep -v -e qlicserver -e resources
}

while :
do
    read -r input || exit 1             # wait for input
    [ "$input" = quit ] && exit 0

    echo begin                          # begin load report
    host_info                           # host information
    iidle_info                          # machine's idle time
    echo end                            # end load report

    # let the license query run between load reports
    # SGE_qmaster=$(act_qmaster)        # refresh the name of the qmaster
    # if [ "$HOST" = "$SGE_qmaster" ]
    if [ "$HOST" = "$license_host" ]
    then
        # $qlicserver 2>> qloadsensor.err
        query_licenses local
        query_licenses abaqus timeout=60
        # query_licenses trelis timeout=60      # currently disabled
        query_licenses comsol timeout=60
        query_licenses matlab timeout=60

        # merge the individual results into a single qlicserver.xml:
        #   head    - abaqus, everything before '</resources>'
        #   entries - local, comsol (trelis is currently left out)
        #   tail    - matlab, everything after '<resources>'
        #
        # NOTE(review): lockfile presumably retries without limit, so a
        # stale lock file would block the load sensor - confirm
        lockfile "$flex_grid/cache/qlicserver.xml.lock"
        {
            sed '/<\/resources>/,$ d' "$flex_grid/cache/qlicserver_abaqus.xml"
            resource_entries local
            resource_entries comsol
            sed '1,/<resources>/d' "$flex_grid/cache/qlicserver_matlab.xml"
        } > "$flex_grid/cache/qlicserver.xml"
        rm -f "$flex_grid/cache/qlicserver.xml.lock"

        # $diskmon -m 2>> qloadsensor.err
        # force rescheduling of express jobs
        # $SGE_site/qxprs >/dev/null 2>&1
    # else
        # $diskmon 2>> qloadsensor.err
    fi
done

exit 0  # we never get here, but just in case

#------------------------------------------------------------------------------
# feed via 'perl -x' to extract the 'host' complex configuration
# (everything below __DATA__ is printed verbatim)

#!/usr/bin/perl -w
print <DATA>
__DATA__
#
# host complex configuration
#
#name shortcut type relop requestable consumable default urgency
#---------------------------------------------------------------------------
tmp_total tmpt MEMORY <= YES NO 0 0
tmp_used tmpu MEMORY >= NO NO 0 0
tmp_free tmpf MEMORY <= YES NO 0 0
iidle iidle INT <= YES NO 0 0
mips mips INT <= YES NO 0 0
os os RESTRING == YES NO NONE 0
abaqus abaqus DOUBLE <= YES YES 0 0
cae cae DOUBLE <= YES YES 0 0
comsol comsol DOUBLE <= YES YES 0 0
hyper hyper DOUBLE <= YES YES 0 0
ifort ifort DOUBLE <= YES YES 0 0
matlab matlab DOUBLE <= YES YES 0 0
mcc mcc DOUBLE <= YES YES 0 0
multiphysics multiphysics DOUBLE <= YES YES 0 0
trelis trelis DOUBLE <= YES YES 0 0
scratch_free scratch_free MEMORY <= YES YES 0 0
scratch_total scratch_total MEMORY <= YES NO 0 0
scratch_used scratch_used MEMORY >= NO NO 0 0
# -----------------------------------------------------------------------------