#!/bin/sh
#
#
# SGE startup script
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

#
# This script can be called with the following arguments:
#
#       start       start execution daemon
#       stop        Terminates the execution daemon
#                   and the shepherd. This only works if the execution daemon
#                   spool directory is in the default location.
#       softstop    do not kill the shepherd process
#       restart     equivalent to softstop followed by start
#       status      check if execd is running
#
# Unix commands which may be used in this script:
#    cat cut tr ls grep awk sed basename
#
# This script requires the script $SGE_ROOT/util/arch
# Customization can be placed in /etc/default/sgeexecd or
# /etc/sysconfig/sgeexecd (according to OS conventions), which is sourced
# after other setup.

PATH=/bin:/usr/bin:/sbin:/usr/sbin

#---------------------------------------------------------------------------
# The following lines provide the necessary info for adding a startup script
# according to the Linux Standard Base Specification (LSB) which can
# be found at:
#
#    http://www.linuxfoundation.org/spec/booksets/LSB-Core-generic/LSB-Core-generic/initscrcomconv.html
#
### BEGIN INIT INFO
# Provides:       SGEEXEC
# Required-Start: $network $remote_fs
# Required-Stop:  $network $remote_fs
# Default-Start:  3 5
# Default-Stop:   0 1 2 6
# Description:    start Grid Engine execd
### END INIT INFO
# chkconfig: 35 96 2
#---------------------------------------------------------------------------

SGE_ROOT=/opt/SGE; export SGE_ROOT
SGE_CELL=default; export SGE_CELL
unset SGE_QMASTER_PORT
unset SGE_EXECD_PORT

# Poll once per second (121 polls at most) until the SGE_ROOT directory
# exists -- presumably to give a remote file system time to be mounted.
# The script deliberately carries on afterwards even if the directory is
# still missing; the files sourced further down may still adjust the setup.
count=0
while [ ! -d "$SGE_ROOT" ] && [ "$count" -le 120 ]; do
   count=`expr $count + 1`
   sleep 1
done

ARCH=`$SGE_ROOT/util/arch`
# library path setting required only for architectures where RUNPATH is not supported
if [ -d "$SGE_ROOT/lib/$ARCH" ]; then
   case $ARCH in
   sol*|lx*)
      ;;
   *)
      # "arch -lib" prints the NAME of the shared library path variable for
      # this architecture; the variable is then read and extended by name.
      shlib_path_name=`$SGE_ROOT/util/arch -lib`
      if [ -n "$shlib_path_name" ]; then
         # The eval'ed text keeps the values as escaped variable references,
         # so they are expanded only once and whitespace in them is harmless.
         eval "old_value=\$$shlib_path_name"
         if [ -z "$old_value" ]; then
            eval "$shlib_path_name=\$SGE_ROOT/lib/\$ARCH"
         else
            eval "$shlib_path_name=\$old_value:\$SGE_ROOT/lib/\$ARCH"
         fi
         export $shlib_path_name
      fi
      ;;
   esac
fi

# Site customization, sourced after the defaults above so it can override them
[ -f /etc/default/sgeexecd ] && . /etc/default/sgeexecd
[ -f /etc/sysconfig/sgeexecd ] && . /etc/sysconfig/sgeexecd

#---------------------------------------------------------------------------
# DetectSMFService - sets service to a mask matching the name
#    $1 ... name
#
# Globals written:
#    name              copy of $1
#    service           svc:/application/sge/<name>:<cluster name> if SMF is
#                      present and svcs knows that service, otherwise ""
#    SGE_CLUSTER_NAME  content of $SGE_ROOT/$SGE_CELL/common/cluster_name
#                      (only set once SMF was found to be present)
# Globals read:
#    noSMF, SGE_ROOT, SGE_CELL
#
# Exits the script with $SMF_EXIT_ERR_CONFIG (presumably defined by
# smf_include.sh) if SMF is present but the cluster_name file is unreadable.
#
DetectSMFService()
{
   name=$1
   service=""

   # SMF usage was explicitly switched off
   if [ "$noSMF" = true ]; then
      return 0
   fi

   # Otherwise we check whether SMF is available on the system
   if [ ! -f /lib/svc/share/smf_include.sh ]; then
      return 0
   fi
   . /lib/svc/share/smf_include.sh
   if ! smf_present; then
      return 0
   fi

   # Check we have cluster_name file
   if [ ! -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ]; then
      echo "Error: could not find $SGE_ROOT/$SGE_CELL/common/cluster_name!"
      exit $SMF_EXIT_ERR_CONFIG
   fi
   # Cluster name must be unique
   SGE_CLUSTER_NAME=`cat "$SGE_ROOT/$SGE_CELL/common/cluster_name" 2>/dev/null`

   service="svc:/application/sge/$name:$SGE_CLUSTER_NAME"

   # Check if service exists
   if ! /usr/bin/svcs "$service" > /dev/null 2>&1; then
      # No such service found in the system
      service=""
   fi
   return 0
}


#---------------------------------------------------------------------------
# ShutdownSMF
#    Stops the execution daemon through SMF when possible.
#
# Globals read:
#    service   SMF service name as set by DetectSMFService ("" = no SMF)
# Globals written:
#    pid       process id svcs reports for a ".../sge_execd" process
#    usingSMF  set to "true" when the service was disabled via svcadm;
#              left untouched otherwise, so the caller must preset it
#
ShutdownSMF()
{
   if [ -z "$service" ]; then
      # We don't have any such SMF service we use normal Shutdown
      return 0
   fi
   # Second column of the svcs process line ending in "/sge_execd"
   pid=`/usr/bin/svcs -l -p "$service" | grep "/sge_execd$" | awk '{ print $2 }'`
   if [ -n "$pid" ]; then
      usingSMF="true"
      /usr/sbin/svcadm disable -st "$service"
   fi
}


#---------------------------------------------------------------------------
# Shutdown
#    Send SIGTERM (default) or signal $3 to process name $1 with pid in file $2
#
#    $1 ... process name handed to checkprog
#    $2 ... file containing the pid
#    $3 ... optional signal name without the leading "-" (e.g. TSTP)
#
# The signal is only sent if "$utilbin_dir/checkprog <pid> <name>" succeeds.
# Returns the exit status of kill if a signal was sent, 0 if there was
# nothing to signal (no pid file, empty pid file, checkprog failed).
# Globals written: name, pidfile, signal, pid
#
Shutdown()
{
   name=$1
   pidfile=$2
   signal="-TERM"

   if [ $# = 3 ]; then
      signal="-$3"
   fi

   if [ ! -f "$pidfile" ]; then
      return 0
   fi

   pid=`cat "$pidfile"`
   if [ -z "$pid" ]; then
      # empty pid file: nothing we could check or signal
      return 0
   fi

   if "$utilbin_dir/checkprog" "$pid" "$name" > /dev/null; then
      kill $signal "$pid"
      return $?
   fi
   return 0
}


#---------------------------------------------------------------------------
# GetPathToBinaries
#    echo the name of the bin_dir on this system
#    The check is fulfilled if we can access the qstat binary
#    echo "none" if we can't determine the binary path
#
# The path is taken from the "binary_path" entry of
# $SGE_ROOT/$SGE_CELL/common/bootstrap. If qstat is not found directly in
# that directory, the architecture sub directory (as printed by
# $SGE_ROOT/util/arch) is tried. If qstat is found in neither place the
# configured binary_path is echoed unchanged.
# "none" is echoed if the bootstrap file or its binary_path entry is missing.
# Globals written: cfgname, base, arch
#
GetPathToBinaries()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap

   base=none

   if [ -f "$cfgname" ]; then
      # first entry whose key is exactly "binary_path"
      base=`awk '$1 == "binary_path" { print $2; exit }' "$cfgname"`
      if [ -z "$base" ]; then
         # no binary_path configured: report failure instead of echoing ""
         base=none
      elif [ -f "$base/qstat" ]; then
         :
      elif [ -f "$SGE_ROOT/util/arch" ]; then
         arch=`"$SGE_ROOT/util/arch"`
         if [ -f "$base/$arch/qstat" ]; then
            base=$base/$arch
         fi
      fi
   fi

   echo "$base"
}


#---------------------------------------------------------------------------
# GetAdminUser
#    echo the name of the admin user on this system
#    echo "root" if admin user retrieval fails
#
# The name is taken from the "admin_user" entry of
# $SGE_ROOT/$SGE_CELL/common/bootstrap. A missing file, a missing entry or
# the value "none" (in any letter case) all result in "root".
# Globals written: cfgname, user
#
GetAdminUser()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
   user=none

   if [ -f "$cfgname" ]; then
      # first entry whose key is exactly "admin_user"
      user=`awk '$1 == "admin_user" { print $2; exit }' "$cfgname"`
   fi

   # An empty value used to break the test ("[ = none ]") and was echoed as is
   case `echo "$user" | tr "[A-Z]" "[a-z]"` in
   ""|none)
      user=root
      ;;
   esac
   echo "$user"
}

#---------------------------------------------------------------------------
# GetPathToUtilbin
#    echo the path to the binaries in utilbin
#    The check is fulfilled if we can access the "gethostname" binary
#    echo "none" if we can't determine the binary path
#
# The path is $SGE_ROOT/utilbin/<arch>, where <arch> is the output of
# $SGE_ROOT/util/arch.
# Globals written: base, utilbindir, arch
#
GetPathToUtilbin()
{
   base=none

   if [ -f "$SGE_ROOT/util/arch" ]; then
      utilbindir=$SGE_ROOT/utilbin

      arch=`"$SGE_ROOT/util/arch"`
      if [ -f "$utilbindir/$arch/gethostname" ]; then
         base=$utilbindir/$arch
      fi
   fi

   echo "$base"
}


#---------------------------------------------------------------------------
# GetExecdSpoolDir
#    get the execution daemon spooling dir from configuration
#
# Echoes the execd_spool_dir value printed by "qconf -sconf $UQHOST"; if
# that yields nothing, the value printed by "qconf -sconf" without a host
# argument is used instead. The echoed value may be empty if neither query
# delivers one.
# Globals read:    bin_dir, UQHOST
# Globals written: EXECD_SPOOL_DIR
#
GetExecdSpoolDir()
{
   # errors of the host specific query are hidden, the fallback follows
   EXECD_SPOOL_DIR=`"$bin_dir/qconf" -sconf "$UQHOST" 2>/dev/null |
                    awk '/execd_spool_dir/ { print $2 }'`
   if [ -z "$EXECD_SPOOL_DIR" ]; then
      EXECD_SPOOL_DIR=`"$bin_dir/qconf" -sconf |
                       awk '/execd_spool_dir/ { print $2 }'`
   fi
   echo "$EXECD_SPOOL_DIR"
}

#---------------------------------------------------------------------------
# usage
#    Print the list of valid parameters to stdout and exit the script
#    with status 1. Never returns to the caller.
#
usage()
{
   echo "Grid Engine start/stop script. Valid parameters are:"
   echo ""
   echo ' (no parameters): start execution daemon if applicable'
   echo ' "start" ditto.'
   echo ' "stop" shutdown local Grid Engine processes and jobs'
   echo ' "softstop" shutdown local Grid Engine processes (no jobs)'
   echo ' "restart" restart local Grid Engine processes (keeping jobs)'
   echo ' "status" check whether execd is running'
   echo ' "-nosmf" force no SMF'
   echo ""
   echo 'Only one of "start", "stop", "restart", or "softstop" is allowed.'
   echo
   echo 'Default argument is "start" for all components.'
   echo 'Default for "stop" is shutting down all components.'
   echo
   exit 1
}


#---------------------------------------------------------------------------
# MAIN Procedure
#
# Parses the command line, locates the Grid Engine binaries and performs the
# requested actions in this order: stop/softstop, start, status.
# "restart" is a softstop followed by a start.
#
# Exit codes used below: 5 if the binaries cannot be found or sge_shepherd
# does not run, 1 for failed stop/start/status, 0 for a successful start or
# a running execd on "status".
#

if [ "$#" -gt 2 -o "$1" = "-h" -o "$1" = "help" ]; then
   usage
fi

startup=true
execd=true
softstop=false
stop=false
noSMF=false
status=false

for i in "$@"; do
   case "$i" in
   start)
      startup=true
      ;;
   stop)
      stop=true
      startup=false
      ;;
   softstop)
      softstop=true
      startup=false
      ;;
   -nosmf)
      noSMF=true
      ;;
   restart)
      startup=true
      softstop=true
      ;;
   status)
      startup=false
      status=true
      ;;
   *)
      usage
      ;;
   esac
done

bin_dir=`GetPathToBinaries`
if [ -z "$bin_dir" -o "$bin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine binaries"
   exit 5      # LSB compliant exit status - program is not installed
fi

utilbin_dir=`GetPathToUtilbin`
if [ "$utilbin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine utility binaries"
   exit 5      # LSB compliant exit status - program is not installed
fi

# HOST is the aliased name (SGE name)
# UQHOST is the local host name (unqualified name)
HOST=`$utilbin_dir/gethostname -aname`
UQHOST=`$utilbin_dir/gethostname -name | cut -f1 -d.`

execd_run_dir=`GetExecdSpoolDir`/$UQHOST
# Kept in its own variable: Shutdown() overwrites the globals "pidfile" and
# "name" every time it is called (also for the shepherds).
execd_pidfile=$execd_run_dir/execd.pid

DetectSMFService execd

if [ "$stop" = true -o "$softstop" = true ]; then
   # Shutdown execution daemon
   if [ "$execd" = true ]; then
      execd_spool_dir=$execd_run_dir

      usingSMF=false
      echo " Shutting down Grid Engine execution daemon"
      # We try to use SMF
      ShutdownSMF
      # Otherwise we use normal shutdown
      if [ "$usingSMF" != true ]; then
         # Send SIGTERM to execd
         Shutdown sge_execd "$execd_pidfile"
         ret=$?
         if [ -f /var/lock/subsys/sgeexecd ]; then
            uid=`$utilbin_dir/uidgid -uid`
            if [ "$uid" = "0" -a "$ret" = "0" ]; then
               rm -f /var/lock/subsys/sgeexecd >/dev/null 2>&1
            else
               echo "Can't shut down execd!"
               exit 1
            fi
         fi
      fi
      # execution daemon is started on this host!
      if [ "$SGE_EXECD_PORT" = "" ]; then
         ping_port=`$utilbin_dir/getservbyname -number sge_execd`
      else
         ping_port=$SGE_EXECD_PORT
      fi
      # Wait while daemon is up: at most 61 polls, one second apart.
      # retries reaches 61 only if the daemon answered every single poll.
      retries=0
      while [ $retries -lt 61 ]; do
         if $bin_dir/qping -info "$HOST" "$ping_port" execd 1 > /dev/null 2>&1; then
            sleep 1
            retries=`expr $retries + 1`
         else
            break
         fi
      done
      if [ $retries -eq 61 ]; then
         echo "Execd did not stop in 61 seconds!"
         exit 1
      fi
      if [ "$softstop" = false ]; then
         # Send SIGTERM to all shepherds (send SIGTSTP which is converted to SIGTERM by shepherd)
         for jobid in `ls "$execd_spool_dir/active_jobs" 2>/dev/null`; do
            echo " Shutting down Grid Engine shepherd of job $jobid"
            Shutdown sge_shepherd "$execd_spool_dir/active_jobs/$jobid/pid" TSTP
         done
      fi
   fi
fi

if [ "$startup" = true ]; then
   # Ensure the shepherd will run, e.g. not missing hwloc dynamic lib
   if ! $bin_dir/sge_shepherd -help >/dev/null 2>&1; then
      echo "sge_shepherd won't run -- dynamic library missing?"
      exit 5
   fi

   # We want to use smf: if an SMF service exists and we were not started by
   # exactly that service, let SMF start the daemon instead of doing it here
   if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
      svcadm enable -st "$service"
      exit $?
   fi
   # execution daemon is started on this host!
   if [ "$SGE_EXECD_PORT" = "" ]; then
      ping_port=`$utilbin_dir/getservbyname -number sge_execd`
   else
      ping_port=$SGE_EXECD_PORT
   fi
   echo " Starting Grid Engine execution daemon"
   # Only the commands are silenced, not the whole script, so that the
   # timeout message below still reaches the caller.
   $bin_dir/sge_execd > /dev/null 2>&1
   if [ $? -eq 0 -a -d /var/lock/subsys ]; then
      touch /var/lock/subsys/sgeexecd > /dev/null 2>&1
   fi

   # Don't exit until daemon is up: at most 61 polls, one second apart.
   # retries reaches 61 only if the daemon never answered.
   retries=0
   while [ $retries -lt 61 ]; do
      if $bin_dir/qping -info "$HOST" "$ping_port" execd 1 > /dev/null 2>&1; then
         break
      else
         sleep 1
         retries=`expr $retries + 1`
      fi
   done
   if [ $retries -eq 61 ]; then
      echo "Execd did not start in 61 seconds!"
      exit 1
   fi
   exit 0
fi

if [ "$status" = true ]; then
   # The execd is considered running if its pid file exists and checkprog
   # confirms that the pid belongs to an sge_execd process.
   if [ -f "$execd_pidfile" ]; then
      pid=`cat "$execd_pidfile"`
      if $utilbin_dir/checkprog "$pid" sge_execd > /dev/null; then
         echo "execd (pid $pid) is running..."
         exit 0
      else
         echo "execd (pid $pid) is not running..."
         exit 1
      fi
   else
      echo "execd is not running..."
      exit 1
   fi
fi