#!/bin/sh
#
#
# SGE startup script
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__
#
# This script can be called with the following arguments:
#
#     start        start qmaster or shadowd
#     stop         Terminates qmaster if we are on the master machine.
#     restart      equivalent to stop followed by start
#     status       check if daemon(s) running (obeys -qmaster, -shadowd)
#     -qmaster     only act on qmaster
#     -shadowd     only act on shadowd if applicable
#     -migrate     shuts down qmaster if it is running
#                  on another host and start the daemons on this host
#     -nosmf       force no SMF
#
# If the file "primary_qmaster" in the $SGE_ROOT/$SGE_CELL/common
# exists and it contains the hostname of the current machine and qmaster
# is running on another host it will be shut down and started on this host
#
# Unix commands which may be used in this script:
#    cat cut tr ls grep awk sed basename
#
# This script requires the script $SGE_ROOT/util/arch

# Customization can be placed in /etc/default/sgemaster or
# /etc/sysconfig/sgemaster (according to OS conventions), which is sourced
# after other setup.

PATH=/bin:/usr/bin:/sbin:/usr/sbin

#---------------------------------------------------------------------------
# The following lines provide the necessary info for adding a startup script
# according to the Linux Standard Base Specification (LSB) which can
# be found at:
#
#    http://www.linuxfoundation.org/spec/booksets/LSB-Core-generic/LSB-Core-generic/initscrcomconv.html
#
### BEGIN INIT INFO
# Provides:       SGEMASTER
# Required-Start: $network $remote_fs
# Required-Stop: $network $remote_fs
# Default-Start:  3 5
# Default-Stop: 0 1 2 6
# Description:  start Grid Engine qmaster, shadowd
### END INIT INFO
# chkconfig: 35 95 3
#---------------------------------------------------------------------------

SGE_ROOT=/opt/SGE; export SGE_ROOT
SGE_CELL=default; export SGE_CELL
unset SGE_QMASTER_PORT
unset SGE_EXECD_PORT

ARCH=`$SGE_ROOT/util/arch`

# library path setting required only for architectures where RUNPATH is not supported
[ -d $SGE_ROOT/lib/$ARCH ] && case $ARCH in
sol*|lx*)
   ;;
*)
   # "arch -lib" prints the NAME of the shared library path variable;
   # append our lib directory to whatever that variable already holds.
   shlib_path_name=`$SGE_ROOT/util/arch -lib`
   old_value=`eval echo '$'$shlib_path_name`
   if [ "x$old_value" = x ]; then
      eval $shlib_path_name=$SGE_ROOT/lib/$ARCH
   else
      eval $shlib_path_name=$old_value:$SGE_ROOT/lib/$ARCH
   fi
   export $shlib_path_name
   ;;
esac

#Include SMF if available
# NO_SMF stays non-zero unless smf_include.sh exists and smf_present succeeds.
NO_SMF=1
if [ -f /lib/svc/share/smf_include.sh ]; then
   . /lib/svc/share/smf_include.sh
   smf_present
   NO_SMF=$?
fi

[ -f /etc/default/sgemaster ] && . /etc/default/sgemaster
[ -f /etc/sysconfig/sgemaster ] && . /etc/sysconfig/sgemaster

#---------------------------------------------------------------------------
# Shutdown
#    Send SIGTERM to process name $1 with pid in file $2
#
#    $1 ... process name, as checked by "checkprog"
#    $2 ... pid file
#
#    SIGTERM is re-sent every 2 seconds (at most 20 times) for as long as
#    checkprog still reports the process. Returns 0 as soon as the process
#    is gone. If it survives all retries it gets SIGKILL and the exit status
#    of that kill is returned. Does nothing if the pid file does not exist.
#    Uses $utilbin_dir.
#
Shutdown()
{
   name=$1
   pidfile=$2
   if [ -f $pidfile ]; then
      pid=`cat $pidfile`
      maxretries=20
      i=0
      while [ $i -lt $maxretries ]; do
         $utilbin_dir/checkprog $pid $name > /dev/null
         if [ "$?" = 0 ]; then
            #We keep killing Qmaster so that child processes get killed
            kill $pid
         else
            return 0
         fi
         sleep 2
         i=`expr $i + 1`
      done
      # The last SIGTERM may have succeeded during the final sleep. Check
      # once more so we neither report a failure for a process that already
      # exited nor send SIGKILL to a pid that no longer belongs to $name.
      $utilbin_dir/checkprog $pid $name > /dev/null
      if [ "$?" != 0 ]; then
         return 0
      fi
      kill -9 $pid
      return $?
   fi
}

#---------------------------------------------------------------------------
# QmasterSpoolDir
#    Return qmasters spool directory
#    (second field of the "qmaster_spool_dir" line of the bootstrap file)
#
QmasterSpoolDir()
{
   qma_spool_dir=`grep qmaster_spool_dir \
                      $SGE_ROOT/$SGE_CELL/common/bootstrap | \
                  awk '{ print $2 }'`
   echo $qma_spool_dir
}

#---------------------------------------------------------------------------
# HostCompare
#    Compare the host names $1 and $2.
#    echo 0 if they are considered equal, else echo 1
#
#    The domain part is stripped from both names unless the bootstrap file
#    sets "ignore_fqdn" to false. The comparison is case insensitive.
#
HostCompare()
{
   host1=$1
   host2=$2

   ignore_fqdn=true
   if [ -f $SGE_ROOT/$SGE_CELL/common/bootstrap ]; then
      ignore_fqdn_txt=`grep ignore_fqdn $SGE_ROOT/$SGE_CELL/common/bootstrap | awk '{print $2}'`
      case "$ignore_fqdn_txt" in
         [fF][aA][lL][sS][eE])
            ignore_fqdn=false
            ;;
      esac
   fi

   if [ "$ignore_fqdn" = true ]; then
      host1=`echo $host1 | cut -f 1 -d .`
      host2=`echo $host2 | cut -f 1 -d .`
   fi

   #translate hostname to lower, because hostname are case insensitive
   host1=`echo $host1 | tr "[A-Z]" "[a-z]"`
   host2=`echo $host2 | tr "[A-Z]" "[a-z]"`

   if [ "$host1" = "$host2" ]; then
      echo 0
   else
      echo 1
   fi
}

#---------------------------------------------------------------------------
# CheckIfQmasterHost
#    If our hostname given in $1 is the same as in the "act_qmaster" file
#    echo "true" else echo "false"
#
CheckIfQmasterHost()
{
   host=$1
   act_qmaster=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`

   if [ `HostCompare $host $act_qmaster` -eq 0 ]; then
      echo true
   else
      echo false
   fi
}

#---------------------------------------------------------------------------
# CheckIfPrimaryQmasterHost
#    Check if our hostname given in $1 is the same as in the
#    "primary_qmaster" file
#    echo true if there is our hostname else echo false
#    (also echo false if the file does not exist)
#
CheckIfPrimaryQmasterHost()
{
   host=$1

   fname=$SGE_ROOT/$SGE_CELL/common/primary_qmaster

   if [ -f $fname ]; then
      primary_qmaster=`cat $fname`
      if [ `HostCompare $host $primary_qmaster` -eq 0 ]; then
         echo true
      else
         echo false
      fi
   else
      echo false
   fi
}

#---------------------------------------------------------------------------
# CheckIfShadowMasterHost
#    Check if our hostname given in $1 is contained in the
#    "shadow_masters" file
#    Sets the global variable shadow_host to "true" or "false"
#    (nothing is echoed; call it directly, not in a command substitution)
#
CheckIfShadowMasterHost()
{
   host=$1

   fname=$SGE_ROOT/$SGE_CELL/common/shadow_masters

   if [ -f $fname ]; then
      # stdout first, then stderr: both streams are discarded
      grep -i "^${host}$" $fname > /dev/null 2>&1
      if [ $? = 0 ]; then
         shadow_host="true"
      else
         shadow_host="false"
      fi
   else
      shadow_host="false"
   fi
}

#---------------------------------------------------------------------------
# GetPathToBinaries
#    echo the name of the bin_dir on this system
#    The check is fulfilled if we can access the qstat binary
#    echo "none" if the bootstrap file does not exist
#
#    The "binary_path" entry of the bootstrap file is used as is if it
#    contains qstat, or with the architecture appended if qstat is found
#    there. If qstat is found in neither place the plain "binary_path"
#    value is echoed.
#
GetPathToBinaries()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap

   base=none

   if [ -f $cfgname ]; then
      base=`grep binary_path $cfgname | awk '{ print $2 }'`
      if [ -f $base/qstat ]; then
         :
      elif [ -f $SGE_ROOT/util/arch ]; then
         arch=`$SGE_ROOT/util/arch`
         if [ -f $base/$arch/qstat ]; then
            base=$base/$arch
         fi
      fi
   fi

   echo $base
}

#---------------------------------------------------------------------------
# GetAdminUser
#    echo the name of the admin user on this system
#    echo "root" if admin user retrieval fails
#    (bootstrap file missing, no "admin_user" entry, or entry is "none")
#
GetAdminUser()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
   user=none

   if [ -f $cfgname ]; then
      user=`grep admin_user $cfgname | awk '{ print $2 }'`
   fi

   # An empty result must be handled explicitly: an unquoted empty operand
   # would make the test below a syntax error and leave $user empty.
   if [ -z "$user" ]; then
      user=root
   elif [ "`echo $user | tr "[A-Z]" "[a-z]"`" = "none" ]; then
      user=root
   fi
   echo $user
}

#---------------------------------------------------------------------------
# GetPathToUtilbin
#    echo the path to the binaries in utilbin
#    The check is fulfilled if we can access the "gethostname" binary
#    echo "none" if we can't determine the binary path
#
GetPathToUtilbin()
{
   base=none

   if [ -f $SGE_ROOT/util/arch ]; then
      utilbindir=$SGE_ROOT/utilbin

      arch=`$SGE_ROOT/util/arch`
      if [ -f $utilbindir/$arch/gethostname ]; then
         base=$utilbindir/$arch
      fi
   fi

   echo $base
}

#---------------------------------------------------------------------------
# CheckRunningQmaster
#    checks, if sge_qmaster is running
#    In error case the sge_qmaster didn't start, silently
#
#    Returns 0 if qping reaches a qmaster and act_qmaster names this host
#    ($HOST), 1 otherwise (a message is printed in that case).
#    Uses $bin_dir, $utilbin_dir, $ARCH, $HOST and $SGE_QMASTER_PORT.
#
CheckRunningQmaster()
{
   masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`

   if [ "$SGE_QMASTER_PORT" = "" ]; then
      ping_port=`$utilbin_dir/getservbyname -number sge_qmaster`
   else
      ping_port=$SGE_QMASTER_PORT
   fi

   start=`$SGE_ROOT/utilbin/$ARCH/now 2>/dev/null`

   running=false
   retries=0
   qping_timeout=false

   # qping may have a long timeout in case of network or hostname resolution
   # related problems.
   # ensure that the test for a running qmaster does not take too long
   # by limiting the total time and numbers the connection test is repeated
   # NOTE: only the qping result is evaluated here; the qmaster PID file is
   # not checked.
   while [ $retries -le 30 ]; do
      $bin_dir/qping -info $masterhost $ping_port qmaster 1 > /dev/null 2>&1
      if [ $? -eq 0 ]; then
         running=true
         break
      else
         now=`$SGE_ROOT/utilbin/$ARCH/now 2>/dev/null`
         # If the "now" helper produced no output, skip the elapsed time
         # check instead of feeding empty operands to test/expr; the retry
         # counter still bounds the loop.
         if [ -n "$now" -a -n "$start" ]; then
            # guard against the clock having been set back
            if [ "$now" -lt "$start" ]; then
               start=$now
            fi
            elapsed=`expr $now - $start`
            if [ "$elapsed" -gt 60 ]; then
               # the very first qping already consumed the whole time budget
               if [ $retries -eq 0 ]; then
                  qping_timeout=true
               fi
               break
            fi
         fi
         sleep 2
         masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
         retries=`expr $retries + 1`
      fi
   done

   if [ $running = "true" ]; then
      if [ `CheckIfQmasterHost $HOST` = false ]; then
         echo "sge_qmaster is running on another host (${masterhost})"
         return 1
      else
         return 0
      fi
   else
      echo
      echo "sge_qmaster start problem"
      if [ $qping_timeout = true ]; then
         echo "Possibly a network or hostname configuration problem (got timeout)."
      fi
      echo
      return 1
   fi
}

#---------------------------------------------------------------------------
# DetectSMFService - sets service to a mask matching the name
# $1 ... name
#
#    The global variable "service" is left empty if SMF is disabled
#    (-nosmf), not present, or if svcs does not know the service.
#    Exits with $SMF_EXIT_ERR_CONFIG if SMF is in use but the cluster_name
#    file is not readable.
#
DetectSMFService()
{
   name=$1
   service=""

   if [ "$noSMF" = true -o $NO_SMF -ne 0 ]; then
      return
   fi

   #Check we have cluster_name file
   if [ ! -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ]; then
      echo "Error: could not find $SGE_ROOT/$SGE_CELL/common/cluster_name!"
      exit $SMF_EXIT_ERR_CONFIG
   fi

   #Cluster name must be unique
   SGE_CLUSTER_NAME=`cat $SGE_ROOT/$SGE_CELL/common/cluster_name 2>/dev/null`

   service="svc:/application/sge/$name:$SGE_CLUSTER_NAME"

   #Check if service exists
   /usr/bin/svcs $service > /dev/null 2>&1
   if [ $? -ne 0 ]; then
      #No such service found in the system
      service=""
   fi
}

#---------------------------------------------------------------------------
# usage
#    Print the help text and exit with status 1
#
usage()
{
   echo "Grid Engine start/stop script. Valid parameters are:"
   echo ""
   echo " (no parameters): start qmaster and shadow daemon if applicable"
   echo " \"start\" ditto."
   echo " \"stop\" shut down qmaster and shadow daemon if applicable"
   echo " \"restart\" restart (stop and start) daemons"
   echo " \"status\" check whether daemon(s) running"
   echo " \"-qmaster\" only act on qmaster (if applicable)"
   echo " \"-shadowd\" only act on shadowd (if applicable)"
   echo " \"-migrate\" shutdown qmaster if it's running on another"
   echo " host and restart it on this host"
   echo " Migration only works if this host is an admin host"
   echo " \"-nosmf\" force no SMF"
   echo ""
   echo "Only one of \"start\", \"stop\", or \"restart\" is allowed."
   echo "Only one of the parameters beginning with \"-\" is allowed. Does not "
   echo "apply to -nosmf."
   echo
   echo "Default argument is \"start\" for all components."
   echo "Default for \"stop\" is shutting down all components."
   echo
   exit 1
}

#---------------------------------------------------------------------------
# MAIN Procedure
#

if [ "$#" -gt 3 -o "$1" = "-h" -o "$1" = "help" ]; then
   usage
fi

# Defaults: start both daemons
startup=true
qmaster=true
shadowd=true
qstd=false
migrate_qmaster=false
noSMF=false
stop=false
status=false

for i in $*; do
   if [ "$i" = start ]; then
      startup=true
   elif [ "$i" = stop ]; then
      startup=false
      stop=true
   elif [ "$i" = restart ]; then
      stop=true
      startup=true
   elif [ "$i" = status ]; then
      startup=false
      status=true
   elif [ "$i" = -qmaster ]; then
      qmaster=true
      shadowd=false
   elif [ "$i" = -shadowd ]; then
      qmaster=false
      shadowd=true
   elif [ "$i" = -migrate ]; then
      migrate_qmaster=true
      qmaster=true
      shadowd=false
   elif [ "$i" = -nosmf ]; then
      noSMF=true
   else
      usage
   fi
done

bin_dir=`GetPathToBinaries`
if [ "$bin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine binaries"
   exit 5      # LSB compliant exit status - program is not installed
fi

utilbin_dir=`GetPathToUtilbin`
if [ "$utilbin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine utility binaries"
   exit 5      # LSB compliant exit status - program is not installed
fi

qmaster_spool_dir=`QmasterSpoolDir`
qma_run_dir=$qmaster_spool_dir

HOST=`$utilbin_dir/gethostname -aname`
# unqualified host name: derived from HOST instead of asking gethostname twice
UQHOST=`echo $HOST | cut -f1 -d.`
CheckIfShadowMasterHost $HOST

if [ "$stop" = true ]; then
   if [ $shadowd = true -a $shadow_host = true ]; then
      echo " Shutting down Grid Engine shadowd"
      DetectSMFService shadowd
      # Go through svcadm only if a matching SMF service exists and we are
      # not being run by that very service (SMF_FMRI equals the service).
      if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
         svcadm disable -st $service
      else
         # Send SIGTERM to shadowd
         if [ -f $qma_run_dir/shadowd_$UQHOST.pid ]; then
            Shutdown sge_shadowd $qma_run_dir/shadowd_$UQHOST.pid
         elif [ -f $qma_run_dir/shadowd_$HOST.pid ]; then
            Shutdown sge_shadowd $qma_run_dir/shadowd_$HOST.pid
         fi
      fi
   fi

   if [ $qmaster = true ]; then
      if [ `CheckIfQmasterHost $HOST` = true ]; then
         echo " Shutting down Grid Engine qmaster"
         DetectSMFService qmaster
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            svcadm disable -st $service
            exit $?
         else
            # Send SIGTERM to qmaster
            Shutdown sge_qmaster $qma_run_dir/qmaster.pid
            ret=$?
            if [ -f /var/lock/subsys/sgemaster ]; then
               uid=`$utilbin_dir/uidgid -uid`
               if [ "$uid" = "0" -a "$ret" = "0" ]; then
                  rm -f /var/lock/subsys/sgemaster >/dev/null 2>&1
               else
                  echo "Can't shut down qmaster!"
                  exit 1
               fi
            fi
         fi
      fi
   fi
fi

if [ "$startup" = true ]; then

   # qmaster_host=true if the act_qmaster file names this host, i.e. qmaster
   # was running on this host the last time
   qmaster_host=`CheckIfQmasterHost $HOST`
   primary_qmaster_host=`CheckIfPrimaryQmasterHost $HOST`

   if [ $qmaster = true -a $qmaster_host = true -a $migrate_qmaster = true ]; then
      echo " qmaster running on this host. Will not migrate qmaster."
      exit 1
   fi

   if [ $qmaster = true -a $qmaster_host = false -a \
        \( $primary_qmaster_host = true -o $migrate_qmaster = true \) ]; then
      actual_qmaster_host=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
      echo " Shutting down Grid Engine qmaster on host \"$actual_qmaster_host\" ..."
      qconf_output=`$bin_dir/qconf -ks 2>&1 | grep "denied"`
      if [ "$qconf_output" != "" ]; then
         echo " denied: host \"$HOST\" is not an admin host."
         exit 1
      fi
      $bin_dir/qconf -km > /dev/null 2>&1

      qping_count=0
      qping_retries=10
      qping_exit_state=0
      if [ "$SGE_QMASTER_PORT" = "" ]; then
         ping_port=`$utilbin_dir/getservbyname -number sge_qmaster`
      else
         ping_port=$SGE_QMASTER_PORT
      fi
      # wait until the old qmaster no longer answers qping
      while [ $qping_count -lt $qping_retries ]; do
         $bin_dir/qping -info $actual_qmaster_host $ping_port qmaster 1 > /dev/null 2>&1
         qping_exit_state=$?
         if [ $qping_exit_state -ne 0 ]; then
            break
         fi
         sleep 3
         qping_count=`expr $qping_count + 1`
      done

      if [ $qping_exit_state -eq 0 ]; then
         # qmaster is still running
         echo " qmaster on host $actual_qmaster_host still alive. Cannot migrate qmaster."
         exit 1
      fi

      # wait for the lock file of the old qmaster to show up in the spool dir
      lock_file_read_retries=10
      lock_file_read_count=0
      lock_file_found=0
      while [ $lock_file_read_count -lt $lock_file_read_retries ]; do
         if [ -f $qmaster_spool_dir/lock ]; then
            lock_file_found=1
            break
         fi
         sleep 3
         lock_file_read_count=`expr $lock_file_read_count + 1`
      done

      if [ $lock_file_found -eq 0 ]; then
         # old qmaster did not write lock file
         echo " old qmaster did not write lock file. Cannot migrate qmaster."
         echo " Please verify that qmaster on host $actual_qmaster_host is down"
         echo " and make sure that the lock file in qmaster spool directory is"
         echo " read-able."
         exit 1
      fi

      qmaster_host=true
      #If we use SMF, we need to notify the SMF service
      DetectSMFService qmaster
      if [ -n "$service" ]; then
         svccfg -s $service setenv MIGRATE_SMF_STEP true
         if [ $? -ne 0 ]; then
            echo "Migration failed!"
            echo "It seems you do not have permission to modify the $service SMF service."
            exit 1
         else
            svcadm refresh $service
         fi
      fi
   fi

   exit_val=0

   #Need to check if this is a SMF migration
   DetectSMFService qmaster
   if [ -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" -a "$MIGRATE_SMF_STEP" = true ]; then
      qmaster_host=true
   fi

   if [ $qmaster = true -a $qmaster_host = false ]; then
      echo
      echo "sge_qmaster didn't start!"
      echo "This is not a qmaster host!"
      echo "Check your ${SGE_ROOT}/${SGE_CELL}/common/act_qmaster file!"
      echo
      if [ $shadowd = false -o ! -f $SGE_ROOT/$SGE_CELL/common/shadow_masters ]; then
         exit 1
      fi
   elif [ $qmaster = true ]; then
      already_running=false
      #Check if pid file exists
      if [ -s "$qma_run_dir/qmaster.pid" ]; then
         daemon_pid=`cat "$qma_run_dir/qmaster.pid"`
         $utilbin_dir/checkprog $daemon_pid sge_qmaster > /dev/null
         if [ $? -eq 0 ]; then
            already_running=true
         fi
      fi

      # We can't detect pid file race, but we'll catch it most of the time
      if [ "$already_running" = "true" ]; then
         echo
         echo "sge_qmaster with PID $daemon_pid is already running"
         echo
      else
         #We want to use smf
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            echo " Starting Grid Engine qmaster"
            svcadm enable -st $service
            exit_val=$?
         #For -migrate with SMF qmaster_host is not yet set for SMF start (2nd)
         elif [ $qmaster_host = true -o \( -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" \) ]; then
            echo " Starting Grid Engine qmaster"
            $bin_dir/sge_qmaster
            [ $? -eq 0 -a -d /var/lock/subsys ] && touch /var/lock/subsys/sgemaster >/dev/null 2>&1
            CheckRunningQmaster
            exit_val=$?
            if [ $exit_val -eq 0 -a -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" -a "$MIGRATE_SMF_STEP" = true ]; then
               svccfg -s $service unsetenv MIGRATE_SMF_STEP
               if [ $? -ne 0 ]; then
                  echo "Warning: SMF migration cleanup step failed!"
                  echo "It seems you do not have permission to modify the $service SMF service."
                  echo
                  echo "Run following commands manually as root or appropriate user:"
                  echo "svccfg -s $service unsetenv MIGRATE_SMF_STEP"
                  echo "svcadm refresh $service"
               else
                  svcadm refresh $service
               fi
            fi
         fi
         if [ $exit_val -ne 0 ]; then
            echo "sge_qmaster didn't start!"
         fi
      fi
   fi

   if [ $shadowd = true -a $shadow_host = false ]; then
      #Display the message only if we have installed any shadowds
      if [ -f $SGE_ROOT/$SGE_CELL/common/shadow_masters ]; then
         echo
         echo "sge_shadowd didn't start!"
         echo "This is not a shadow master host!"
         echo "Check your ${SGE_ROOT}/${SGE_CELL}/common/shadow_masters file!"
         echo
      elif [ $qmaster = false ]; then
         #Shadow masters file does not exist and we try to start only shadowd
         echo
         echo "sge_shadowd didn't start!"
         echo "File ${SGE_ROOT}/${SGE_CELL}/common/shadow_masters does not exist!"
         echo "No shadowd installed?"
         echo
      fi
      if [ $qmaster_host = false -o $qmaster = false ]; then
         exit 1
      fi
   elif [ $shadowd = true ]; then
      start_shadowd=true
      # the pid file may be named after the qualified or the unqualified host
      UQpidfile=$qma_run_dir/shadowd_$UQHOST.pid
      pidfile=$qma_run_dir/shadowd_$HOST.pid

      if [ -f $pidfile ]; then
         pid=`cat $pidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false
         fi
      fi
      if [ -f $UQpidfile ]; then
         pid=`cat $UQpidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false
         fi
      fi

      if [ $start_shadowd = true ]; then
         DetectSMFService shadowd
         echo " Starting Grid Engine shadowd"
         #We want to use smf
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            svcadm enable -st $service
            res=$?
         else
            $bin_dir/sge_shadowd
            res=$?
         fi
         if [ $res -ne 0 ]; then
            echo " sge_shadowd didn't start correctly!"
            exit $res
         fi
      else
         echo " found running sge_shadowd - not starting"
      fi
   fi

   if [ $exit_val -ne 0 ]; then
      exit $exit_val
   fi
fi

master_not=0
shadow_not=0
if [ "$status" = true ]; then
   if [ "$qmaster" = true ]; then
      if [ -s "$qma_run_dir/qmaster.pid" ]; then
         pid=`cat "$qma_run_dir/qmaster.pid"`
         if $utilbin_dir/checkprog $pid sge_qmaster > /dev/null; then
            echo "qmaster (pid $pid) is running..."
         else
            echo "qmaster (pid $pid) is not running..."
            master_not=1
         fi
      else
         echo "qmaster is not running..."
         master_not=1
      fi
   fi
   if [ "$shadowd" = true ]; then
      UQpidfile=$qma_run_dir/shadowd_$UQHOST.pid
      pidfile=$qma_run_dir/shadowd_$HOST.pid
      pid=""
      running_pid=""
      shadow_running=0
      if [ -s "$UQpidfile" ]; then
         pid=`cat $UQpidfile`
         if $utilbin_dir/checkprog $pid sge_shadowd > /dev/null; then
            shadow_running=1
            running_pid=$pid
         fi
      fi
      if [ -s "$pidfile" ]; then
         pid=`cat $pidfile`
         if $utilbin_dir/checkprog $pid sge_shadowd > /dev/null; then
            shadow_running=1
            running_pid=$pid
         fi
      fi
      if [ -s "$pidfile" ] || [ -s "$UQpidfile" ]; then
         if [ $shadow_running = 1 ]; then
            # report the pid that checkprog confirmed, not merely the pid
            # read from whichever pid file was looked at last
            echo "shadowd (pid $running_pid) is running..."
         else
            echo "shadowd (pid $pid) is not running..."
            shadow_not=1
         fi
      else
         # no pid file at all, so there is no pid to report
         echo "shadowd is not running..."
         shadow_not=1
      fi
   fi
   # fixme: check LSB values
   # exit 1 if any of the checked daemons is not running, else 0
   if [ $master_not$shadow_not -gt 0 ]; then
      exit 1
   else
      exit 0
   fi
fi