#!/bin/bash
#
# Header of the repository page this script was copied from
# (presumably author, revision and last commit message):
#   Kasper D. Fischer
#   deaa1effd9
#   qstat can not be found on hosts with outdated Debian versions.
#   These hosts do not have GPUs installed.

##########################################################
# Allocate requested GPUs:
# step 1: get resource GPU
# step 2: loop over installed GPUs
# step 2a: try to set lock file
# step 2b: set CUDA_VISIBLE_DEVICES
# step 3: add CUDA_VISIBLE_DEVICES to job environment
##########################################################

### set variables
# Common prefix of the per-GPU lock files; the device index is appended
# as "${LOCK_FILE}-<index>".
LOCK_FILE=/tmp/gpu-lockfile

# Print all arguments on one line of stdout, separated by single blanks.
# printf is used instead of echo so that arguments such as "-n" or "-e"
# are printed literally and not swallowed as echo options.
function debug() {
  # "$*" joins with the first character of IFS; pin it to a blank
  local IFS=' '
  printf '%s\n' "$*"
}

### function clean_up
# Remove every lock file that belongs to the current job, then exit.
# Globals:
#   JOB_ID     (read) id stored in the lock files of this job
#   LOCK_FILE  (read) common prefix of the lock files
# Arguments:
#   $1 - exit code of the script (default: 0)
#          0: no error
#         99: reschedule job
#        100: put job in error state
#       else: put queue in error state
# Never returns: exits with $1, or with 1 if a lock file cannot be removed.
function clean_up() {
  # ":-" instead of ":=": positional parameters cannot be assigned through
  # parameter expansion, so "${1:=0}" fails when no argument is given.
  local error_code=${1:-0}
  local file

  for file in "${LOCK_FILE}"-* ; do
    # an unmatched glob stays literal; skip anything that is not a file
    [ -f "${file}" ] || continue
    # -x -F: a whole line must equal the job id, so that job 12 does not
    # release the locks held by job 123
    if grep -q -s -x -F -e "${JOB_ID}" "${file}" ; then
      rm -f -- "${file}" || exit 1
    fi
  done
  exit "${error_code}"
}

### get requested number of GPUs
# Print the value of the "gpu" request found in the hard resource list that
# "qstat -j" reports for ${JOB_ID}; print nothing if there is no such request.
function requested_gpus() {
  # - stderr of qstat is discarded, so a missing qstat simply yields no output
  # - "gpu=" must follow white space or a comma; otherwise a resource whose
  #   name merely ends in "gpu" would be picked up as well
  # - \{1,\} is the portable spelling of the GNU-only \+
  # - only the first match is kept so the result is a single integer
  qstat -j "${JOB_ID}" 2>/dev/null \
    | sed -n 's/hard resource_list:.*[[:space:],]gpu=\([[:digit:]]\{1,\}\).*/\1/p' \
    | head -n 1
}

# use hard resource list first
NGPUS=$(requested_gpus)
# set NGPUS to zero if empty
NGPUS=${NGPUS:-0}

# get list of installed GPUs (exit without error if nvidia-smi is not
# available, i.e. no GPUs are installed)
if [ ! -f /usr/bin/nvidia-smi ] ; then
  exit 0
fi
# Only lines of the form "GPU <index>: ..." are accepted; any other output
# (e.g. a diagnostic message) must not end up as a device index in a lock
# file name. shuf puts the indices in random order.
# As before, a failing pipeline ends the script without error.
GPU_LIST=$(/usr/bin/nvidia-smi -L \
  | sed -n 's/^GPU \([[:digit:]]\{1,\}\):.*/\1/p' \
  | xargs shuf -e) || exit 0

## loop over devices and try to allocate one until enough GPUs are allocated
# Globals:
#   NGPUS      (read)  number of GPUs to allocate
#   GPU_LIST   (read)  white space separated device indices to try, in order
#   LOCK_FILE  (read)  common prefix of the lock files
#   JOB_ID     (read)  written into every lock file that is taken
#   CUDA_VISIBLE_DEVICES (written) allocated indices, each preceded by a blank
#   count      (written) number of GPUs that were allocated
function allocate_gpus() {
  local gpu lock

  CUDA_VISIBLE_DEVICES=''
  count=0
  if [ "${NGPUS:-0}" -gt 0 ] ; then
    # GPU_LIST is split into words on purpose
    for gpu in ${GPU_LIST} ; do
      lock="${LOCK_FILE}-${gpu}"
      # With noclobber the redirection refuses to overwrite an existing
      # file, so checking for the lock and taking it is one step and two
      # concurrently running scripts cannot both claim the same device.
      # A lock that cannot be created (already taken or not writable) just
      # means this device is skipped.
      if ( set -o noclobber ; printf '%s\n' "${JOB_ID}" > "${lock}" ) 2>/dev/null ; then
        CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
        count=$((count + 1))
      fi
      # exit loop when enough GPUs are allocated
      [ "${count}" -ge "${NGPUS}" ] && break
    done
  fi
  return 0
}

allocate_gpus

## add CUDA_VISIBLE_DEVICES to job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
  ENV_FILE="${SGE_JOB_SPOOL_DIR}/environment"
  # turn the blank separated list (" 0 2") into a comma separated one ("0,2")
  device_list="${CUDA_VISIBLE_DEVICES# }"
  device_list="${device_list// /,}"
  # the environment file must already exist and be writable; otherwise, or
  # if appending fails, the job is put in error state
  if [ -f "${ENV_FILE}" ] && [ -w "${ENV_FILE}" ] ; then
    echo "CUDA_VISIBLE_DEVICES=${device_list}" >> "${ENV_FILE}" || clean_up 100
  else
    clean_up 100
  fi
else
  # fewer GPUs allocated than requested: remove this job's lock files and
  # reschedule the job
  clean_up 99
fi

# clean exit
exit 0