#!/bin/bash
# $Id: qloadsensor 180 2010-09-17 15:46:41Z kasper $
#
# qloadsensor:
#     load sensor for particular file systems and floating licenses
#
# NB:
#  1) add the new complexes (via qconf -mc) for the following:
#     * complex configurations managed in the shell script
#       eg, 'perl -x qloadsensor'
#     * complex consumables managed global
#       eg, 'qlicserver -c'
#  2) initialize the global complex consumables to be managed
#       eg, 'qlicserver -C'
#
# copyright (c) 2003-10
#
# Licensed and distributed under the Creative Commons
# Attribution-NonCommercial-ShareAlike 3.0 License.
# http://creativecommons.org/licenses/by-nc-sa/3.0
# -----------------------------------------------------------------------------
#
# impose default GridEngine environment + ascertain the binary architecture
#
# you likely don't need to adjust these values, since the loadsensor is called
# from sge_execd, which in turn is started from /etc/init.d/n1ge and
# these variables should be correctly exported there
#
# The diagnostic goes to stderr: stdout of a load sensor carries the load
# report itself and must not be polluted with error text.
[ -d "$SGE_ROOT" ] || { echo "Error: SGE_ROOT=$SGE_ROOT not found" >&2; exit 1; }
: "${SGE_CELL:=default}"
: "${SGE_ARCH:=$("$SGE_ROOT/util/arch")}"
export SGE_ROOT SGE_CELL SGE_ARCH

# -----------------------------------------------------------------------------
# this script should run as the 'admin_user' registered in 'bootstrap'
#
if [ "$UID" -eq 0 ]
then
    admin_user=$(sed -ne 's/^admin_user *//p' "$SGE_ROOT/$SGE_CELL/common/bootstrap")
    : "${admin_user:=root}"

    # re-execute through adminrun unless the admin user is root or 'none'
    # (the 'none' comparison is case-insensitive)
    admin_user_lc=$(echo "$admin_user" | tr "A-Z" "a-z")
    if [ "$admin_user" != root ] && [ "$admin_user_lc" != none ]
    then
        exec "$SGE_ROOT/utilbin/$SGE_ARCH/adminrun" "$admin_user" "$0"
    fi
fi

#
# ========================================================================
# now that we are the admin_user, we can source our standard settings
# - customize *all* settings there (eg, license server settings)
# - ENSURE THAT '$SGE_site' IS DEFINED !!!
#
for i in "$SGE_ROOT/$SGE_CELL/site/environ"
do
    [ -f "$i" ] && . "$i"
done

# define (unique) cluster name if not already defined
if [ -z "$SGE_CLUSTER_NAME" ] && [ -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ]
then
    SGE_CLUSTER_NAME=$(cat "$SGE_ROOT/$SGE_CELL/common/cluster_name" 2>/dev/null)
fi
: "${SGE_CLUSTER_NAME:=default}"
export SGE_CLUSTER_NAME

# NB: this assignment is unconditional, so it replaces any SGE_site value
# that the sourced 'environ' file may have provided
SGE_site="$SGE_ROOT/flex-grid/site"
#
# ========================================================================
#

###############################################################################
###############################################################################
# CUSTOMIZE THESE SETTINGS - iff. required

# command line for the license query; the value holds several
# whitespace-separated words and is meant to be expanded unquoted
qlicserver="$SGE_site/qlicserver
    config=$SGE_site/../config/local_licenses.conf
    dir=$SGE_ROOT/flex-grid/cache
    output=$SGE_ROOT/flex-grid/cache/qlicserver.xml
    qhost=qhost.xml
    qstat=qstat.xml"

diskmon="$SGE_site/diskmon.pl"

# END OF CUSTOMIZE SETTINGS
###############################################################################
###############################################################################

#
# the real (not compiled in) architecture
#
os_arch=$("$SGE_ROOT/util/arch")
SGE_utilbin=$SGE_ROOT/utilbin/$os_arch

#
# set some constants
#
HOST=$("$SGE_utilbin/gethostname" -aname)
UQHOST=${HOST%%.*}      # host name up to (not including) the first '.'
SGE_qmaster=unknown; export SGE_qmaster

# -----------------------------------------------------------------------------
# act_qmaster
#
# echo the content of the "act_qmaster" file, or 'unknown' if the file
# cannot be read or is empty
#
# The file is looked up in $SGE_common when that variable is set,
# otherwise in $SGE_ROOT/$SGE_CELL/common.
#
act_qmaster()
{
    local content
    content=$(cat "${SGE_common:-$SGE_ROOT/$SGE_CELL/common}/act_qmaster" 2>/dev/null)
    echo "${content:-unknown}"
}


# -----------------------------------------------------------------------------
# df_info
#
# echo the $1_{total,used,free} space on filesystem $2
#
# gridengine uses the suffixes
#    'k' => blocksize 1000
#    'K' => blocksize 1024
#
# Arguments: $1 - tag used as prefix of the reported values
#            $2 - directory on the filesystem to be queried
# Globals:   UQHOST (read)
# Outputs:   three "host:name:value" lines on stdout; all three values are
#            reported as 0K if $2 is not a directory or 'df' fails
#
df_info()
{
    # after the 'set', the positional parameters are:
    # 1:tag 2:mount 3:filesys 4:total 5:Used 6:Avail 7:Used% 8:Mount
    if [ -d "$2" ]
    then
        # word splitting of the last 'df' output line is intended here
        # shellcheck disable=SC2046
        set -- "$1" "$2" $(df -k -P "$2" 2>/dev/null | tail -1)
    fi

    #!# we could add the following check:
    #!#   [ "$2" != "$8" ] && set -- $1 $2;   # mount point mismatch?

    # fewer than 6 parameters: 'df' gave nothing usable, report zeros
    [ "$#" -ge 6 ] || set -- "$1" "$2" filesystem 0 0 0

    echo "$UQHOST:$1_total:$4K"
    echo "$UQHOST:$1_used:$5K"
    #if [ -w "$2" ]
    #then
    echo "$UQHOST:$1_free:$6K"
    #else
    #    echo "$UQHOST:$1_free:0"
    #fi
}


# invariant values
if [ -e "/proc/cpuinfo" ]
then
    # mips=$(awk '{if (/mips/) printf "%.0f\n", $NF}' /proc/cpuinfo | tail -1)
    # largest last-field value over all lines containing 'mips'
    mips=$(awk 'BEGIN {mips=0} /mips/ {if ($NF > mips) mips=$NF }; END {print mips}' /proc/cpuinfo)
else
    mips=0
fi

unset os_name
# extract lsb_release
if [ -e "/usr/bin/lsb_release" ]
then
    # join the id/release/codename output into one '_'-separated word
    os_name=$(/usr/bin/lsb_release -ircs | xargs echo | sed 's/ /_/g')
else
    # NB: the spelling is kept as-is, since this string is an emitted
    # load value that existing resource requests may match
    os_name='unkown'
fi
: "${os_name:=NONE}"

# -----------------------------------------------------------------------------
# host_info
#
# report host specific information about filesystems, logins,
# special hardware extensions, etc.
# host_info() { echo "$UQHOST:arch:$os_arch" echo "$UQHOST:os:$os_name" # df_info tmp /tmp df_info scratch /scratch echo "$UQHOST:mips:$mips" } # ----------------------------------------------------------------------------- # iidle_info() # report a machine's idle time # # parse the contents from /proc/interrupts, which looks like the following: # # CPU0 # 0: 23024789 XT-PIC timer # 1: 13 XT-PIC keyboard # 2: 0 XT-PIC cascade # 5: 0 XT-PIC usb-uhci # 8: 2 XT-PIC rtc # 9: 0 XT-PIC acpi # 10: 0 XT-PIC ehci-hcd, usb-uhci # 11: 16687253 XT-PIC eth0, usb-uhci, Intel 82801DB-ICH4, nvidia # 12: 20 XT-PIC PS/2 Mouse # 14: 77178 XT-PIC ide0 # 15: 2 XT-PIC ide1 # NMI: 0 # LOC: 0 # ERR: 0 # MIS: 0 # # or, # # CPU0 CPU1 # 0: 12820049 12818168 IO-APIC-edge timer # 1: 42889 43309 IO-APIC-edge keyboard # 2: 0 0 XT-PIC cascade # 8: 2 0 IO-APIC-edge rtc # 9: 0 0 IO-APIC-edge acpi # 12: 287235 296531 IO-APIC-edge PS/2 Mouse # 14: 47423 40923 IO-APIC-edge ide0 # 15: 2 3 IO-APIC-edge ide1 # 16: 7733868 7737081 IO-APIC-level nvidia # 17: 159 156 IO-APIC-level Intel ICH 82801AA # 19: 2155710 2159943 IO-APIC-level e100, usb-uhci # NMI: 0 0 # LOC: 25641034 25641033 # ERR: 0 # MIS: 0 # # Thus, we need the [-1, 1..$ncpu] fields for the following sources: # keyboard, Mouse, serial # # NB: adding 'usb-uhci' gives problems, since this is sometimes # attached to the ethernet card # # set the variable 'iidle' to the idle time (seconds) since the last call # last="0 -1"; iidle_info() { set -- $( perl -e ' my @last = @ARGV; @ARGV = "/proc/interrupts"; $_ = <>; my $ncpu = s/\s*CPU\d+//g || 0; my ( $iidle, $int, $now ) = ( 0, 0, time ); $int += $_ for map { /\s+(keyboard|Mouse|serial)$/ ? (split)[ 1 .. 
$ncpu ] : (); } <>; if ( $int == $last[-1] ) { # no interactivity since last round $iidle = ( $now - $last[0] ); } else { @last = ( $now, $int ); } print "$iidle @last\n"; ' $last ); echo "$UQHOST:iidle:$1"; shift; last="$@"; # save for later } # ----------------------------------------------------------------------------- # # The execd running on the qmaster queries the license server # The contents of 'act_qmaster' should suffice to migrate the load sensor # for a controlled migration. # while : do read input || exit 1 # wait for input [ "$input" = quit ] && exit 0 echo begin # begin load report host_info # host information iidle_info # machine's idle time echo end # end load report # let the license query run between load reports # SGE_qmaster=`act_qmaster` # refresh the name of the qmaster # if [ "$HOST" = "$SGE_qmaster" ] if [ "$HOST" = "minos19" ] then # $qlicserver 2>> qloadsensor.err $SGE_ROOT/flex-grid/site/qlicserver config=$SGE_ROOT/flex-grid/config/local_licenses.conf output=$SGE_ROOT/flex-grid/cache/qlicserver_local.xml $SGE_ROOT/flex-grid/site/qlicserver config=$SGE_ROOT/flex-grid/config/abaqus_licenses.conf timeout=60 output=$SGE_ROOT/flex-grid/cache/qlicserver_abaqus.xml # $SGE_ROOT/flex-grid/site/qlicserver config=/opt/SGE/flex-grid/config/trelis_licenses.conf timeout=60 output=$SGE_ROOT/flex-grid/cache/qlicserver_trelis.xml $SGE_ROOT/flex-grid/site/qlicserver config=/opt/SGE/flex-grid/config/comsol_licenses.conf timeout=60 output=$SGE_ROOT/flex-grid/cache/qlicserver_comsol.xml $SGE_ROOT/flex-grid/site/qlicserver config=$SGE_ROOT/flex-grid/config/matlab_licenses.conf timeout=60 output=$SGE_ROOT/flex-grid/cache/qlicserver_matlab.xml lockfile $SGE_ROOT/flex-grid/cache/qlicserver.xml.lock # (sed '/<\/resources>/,$ d' $SGE_ROOT/flex-grid/cache/qlicserver_abaqus.xml ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_trelis.xml | grep -v qlicserver | grep -v resources ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_local.xml | grep -v qlicserver | 
grep -v resources ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_matlab.xml;) > $SGE_ROOT/flex-grid/cache/qlicserver.xml # (sed '/<\/resources>/,$ d' $SGE_ROOT/flex-grid/cache/qlicserver_abaqus.xml ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_local.xml | grep -v qlicserver | grep -v resources ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_matlab.xml;) > $SGE_ROOT/flex-grid/cache/qlicserver.xml (sed '/<\/resources>/,$ d' $SGE_ROOT/flex-grid/cache/qlicserver_abaqus.xml ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_local.xml | grep -v qlicserver | grep -v resources; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_comsol.xml | grep -v qlicserver | grep -v resources ; sed '1,//d' $SGE_ROOT/flex-grid/cache/qlicserver_matlab.xml;) > $SGE_ROOT/flex-grid/cache/qlicserver.xml rm -f $SGE_ROOT/flex-grid/cache/qlicserver.xml.lock # $diskmon -m 2>> qloadsensor.err # force rescheduling of express jobs # $SGE_site/qxprs >/dev/null 2>&1 # else # $diskmon 2>> qloadsensor.err fi done exit 0 # we never get here, but just in case #------------------------------------------------------------------------------ # feed via 'perl -x' to extract the 'host' complex configuration #!/usr/bin/perl -w print __DATA__ # # host complex configuration # #name shortcut type relop requestable consumable default urgency #--------------------------------------------------------------------------- tmp_total tmpt MEMORY <= YES NO 0 0 tmp_used tmpu MEMORY >= NO NO 0 0 tmp_free tmpf MEMORY <= YES NO 0 0 iidle iidle INT <= YES NO 0 0 mips mips INT <= YES NO 0 0 os os RESTRING == YES NO NONE 0 abaqus abaqus DOUBLE <= YES YES 0 0 cae cae DOUBLE <= YES YES 0 0 comsol comsol DOUBLE <= YES YES 0 0 hyper hyper DOUBLE <= YES YES 0 0 ifort ifort DOUBLE <= YES YES 0 0 matlab matlab DOUBLE <= YES YES 0 0 mcc mcc DOUBLE <= YES YES 0 0 multiphysics multiphysics DOUBLE <= YES YES 0 0 trelis trelis DOUBLE <= YES YES 0 0 scratch_free scratch_free MEMORY <= YES YES 0 0 scratch_total scratch_total MEMORY <= 
YES NO 0 0 scratch_used scratch_used MEMORY >= NO NO 0 0 # -----------------------------------------------------------------------------