Compare commits

...

5 Commits

Author SHA1 Message Date
097ccac7bd add scripts for allocating / releasing GPU's 2021-05-27 11:50:46 +02:00
d9ca7c3127 Add support for pe_prolog and pe_epilog files
Send all command line args to the files to be run
Add script to remove empty *.pe* and *.po* files after running pe jobs
Scripts expect these 11 command-line arguments: $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr
2021-05-25 13:22:39 +02:00
d76510b2aa Fixing pro-epilog_wrapper.sh to prevent error
pro-epilog_wrapper exited with an error when a directory contained more than one file
2021-05-25 13:17:42 +02:00
d49fd18668 fixing path to prolog.d, epilog.d, pe_prolog.d, pe_epilog.d; remove echo 2021-05-25 12:14:15 +02:00
02913e042d create modular prolog and epilog setup
all files named [0-9][0-9]* in /opt/SGE/local/scripts/{prolog,epilog,pe_prolog,pe_epilog}.d and in /usr/local/etc/gridengine/{prolog,epilog,pe_prolog,pe_epilog}.d will be executed.
2021-05-25 11:33:08 +02:00
11 changed files with 140 additions and 0 deletions
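
For context, a sketch of how these wrapper scripts are typically wired into Grid Engine. The install path, PE name and editor contents below are assumptions, not taken from this change; the 11 arguments are the ones listed in the commit message above:

    # global (or queue) configuration: point prolog/epilog at the wrappers
    qconf -mconf
        prolog   /opt/SGE/local/bin/prolog
        epilog   /opt/SGE/local/bin/epilog
    # parallel environment: hand the 11 arguments to the PE wrappers
    qconf -mp smp
        start_proc_args  /opt/SGE/local/bin/pe_prolog $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr
        stop_proc_args   /opt/SGE/local/bin/pe_epilog $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr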

1 bin/epilog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

1 bin/pe_epilog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

1 bin/pe_prolog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

20 bin/pro-epilog_wrapper.sh Executable file

@@ -0,0 +1,20 @@
#!/bin/bash
PATH=/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin
FILES=""
SCRIPTNAME=$(basename "$0")
DIRS="/opt/SGE/local/${SCRIPTNAME}.d /usr/local/etc/gridengine/${SCRIPTNAME}.d"
# collect all drop-in scripts named [0-9][0-9]* from both directories
for DIR in ${DIRS} ; do
    if [ -d "${DIR}" ] ; then
        if [ -n "$(ls ${DIR}/[0-9][0-9]* 2>/dev/null)" ] ; then
            FILES="${FILES} $(ls ${DIR}/[0-9][0-9]* 2>/dev/null)"
        fi
    fi
done
# sort the collected paths, one per line, so the scripts run in a deterministic order
FILESSORTED="$(printf '%s\n' ${FILES} | sort)"
# run each script with all arguments SGE passed to the wrapper;
# abort with that script's exit code on the first failure
for FILE in ${FILESSORTED} ; do
    ${FILE} "$@" || exit $?
done
exit 0
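
The wrapper itself contains no site logic; behaviour comes from executable drop-in scripts with a two-digit prefix in one of the two *.d directories. A minimal, hypothetical drop-in (the name 20-log_args.sh is an example, not part of this change):

    #!/bin/bash
    # hypothetical drop-in: /usr/local/etc/gridengine/prolog.d/20-log_args.sh (must be executable)
    # log whatever arguments SGE handed to the prolog wrapper
    logger -t gridengine-prolog "running on $(hostname) with args: $*"
    exit 0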

1 bin/prolog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

17 epilog.d/95-GPU_release.sh Executable file

@@ -0,0 +1,17 @@
#!/bin/bash
###############################################
# release the GPUs allocated to this job
###############################################
### set variables
LOCK_FILE=/tmp/gpu-lockfile
# find all lock files that contain this job's ID
files=$(grep -s -l "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
if [ -n "${files}" ] ; then
    for file in ${files} ; do
        rm -f "${file}" || exit 1
    done
fi
exit 0
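
For illustration: the allocation prolog further down writes one lock file per claimed device containing the job ID, so for a hypothetical job 4711 holding two GPUs the cleanup above would operate on something like:

    $ grep -s -l 4711 /tmp/gpu-lockfile-*
    /tmp/gpu-lockfile-0
    /tmp/gpu-lockfile-3
    $ cat /tmp/gpu-lockfile-0
    4711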

0 pe_epilog.d/.gitkeep Normal file

28 pe_epilog.d/… Executable file

@@ -0,0 +1,28 @@
#!/bin/bash
## Delete the STDOUT and STDERR files (.o and .e) if they are empty
## ( we do not want to delete non-empty files, they may contain useful
## troubleshooting or debug information ... )
##
## input args:
# 1: $pe_hostfile
# 2: $host
# 3: $job_owner
# 4: $job_id
# 5: $job_name
# 6: $pe
# 7: $pe_slots
# 8: $queue
# 9: $stdout_path
# 10: $stderr_path
# 11: $merge_stderr
stdout_path=${9}
stderr_path=${10}
# remove each output file only if it exists, is readable, and has size zero
[ -r "${stdout_path}" -a -f "${stdout_path}" ] && [ ! -s "${stdout_path}" ] && rm -f "${stdout_path}"
[ -r "${stderr_path}" -a -f "${stderr_path}" ] && [ ! -s "${stderr_path}" ] && rm -f "${stderr_path}"
exit 0
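
A quick way to exercise this logic by hand, using a placeholder script name (the real file name is not shown in this diff) and dummy values for the arguments the script ignores; only $9 and ${10} matter:

    touch /tmp/test.o4711 /tmp/test.e4711      # two empty output files
    ./99-remove_empty_output.sh pe_hostfile host owner 4711 name smp 4 all.q \
        /tmp/test.o4711 /tmp/test.e4711 n
    ls /tmp/test.[oe]4711                      # both files should be gone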

0 pe_prolog.d/.gitkeep Normal file

0 prolog.d/.gitkeep Normal file

71 prolog.d/10-GPU_allocate.sh Executable file

@@ -0,0 +1,71 @@
#!/bin/bash
##########################################################
# Allocate requested GPUs:
# step 1: get resource GPU
# step 2: loop over installed GPUs
# step 2a: try to set lock file
# step 2b: set CUDA_VISIBLE_DEVICES
# step 3: add CUDA_VISIBLE_DEVICES to job environment
##########################################################
### set variables
LOCK_FILE=/tmp/gpu-lockfile
# helper for ad-hoc debugging (currently unused)
function debug() {
    echo "$@"
}
### function clean_up
# release any lock files already written for this job, then
# exit with error code
#   0: no error
#  99: reschedule job
# 100: put job in error state
# else: put queue in error state
function clean_up() {
    error_code=${1:-0}
    files=$(grep -s -l "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
    if [ -n "${files}" ] ; then
        for file in ${files} ; do
            rm -f "${file}" || exit 1
        done
    fi
    exit ${error_code}
}
### get requested number of GPUs
# use hard resource list first
NGPUS=$(qstat -j ${JOB_ID} | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
    NGPUS=0
fi
# get the list of installed GPU indices in random order
# (exit without error if nvidia-smi is not available, i.e. no GPUs installed)
[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0
## loop over devices and try to allocate one until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt "0" ] ; then
    for gpu in ${GPU_LIST} ; do
        # a missing lock file means the device is free; claim it by writing our job ID
        if [ ! -f ${LOCK_FILE}-${gpu} ] ; then
            echo ${JOB_ID} > ${LOCK_FILE}-${gpu} || clean_up 99
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
            let "count++"
        fi
        # exit loop when enough GPUs are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi
## add CUDA_VISIBLE_DEVICES to the job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE=$SGE_JOB_SPOOL_DIR/environment
    [ -f ${ENV_FILE} -a -w ${ENV_FILE} ] && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //' | sed 's/ /,/g')" >> ${ENV_FILE} || clean_up 100
else
    # not enough free GPUs: release our locks and ask SGE to reschedule the job
    clean_up 99
fi
# clean exit
exit 0
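
Assuming a consumable gpu complex has been defined and set on the execution hosts (for example via qconf -mc and qconf -me, which is not part of this change), a job would request and see its GPUs roughly like this:

    $ echo 'echo $CUDA_VISIBLE_DEVICES' | qsub -l gpu=2 -cwd
    # the prolog locks two free devices and appends a line such as
    #   CUDA_VISIBLE_DEVICES=2,0
    # to $SGE_JOB_SPOOL_DIR/environment, so the job sees only those devices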