Add scripts for allocating / releasing GPUs
This commit is contained in:
parent
d9ca7c3127
commit
097ccac7bd
17
epilog.d/95-GPU_release.sh
Executable file
17
epilog.d/95-GPU_release.sh
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash

###############################################
# Release the GPUs allocated to the current job.
#
# Every lock file ${LOCK_FILE}-<gpu> whose content is exactly
# ${JOB_ID} is removed, which makes the GPU available again.
# The lock files are created by prolog.d/10-GPU_allocate.sh.
#
# Globals:
#   JOB_ID (read) - id of the job that has finished
# Exit status:
#   0 - all locks of the job were removed (or there were none)
#   1 - a lock file could not be removed
###############################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

# Without a job id there is nothing that could be matched safely.
if [[ -z "${JOB_ID:-}" ]]; then
  exit 0
fi

# -x -F: the whole line must equal the job id literally, so that job 12
#        does not release the locks held by job 123.
# -s:    stay quiet when no lock file exists (unmatched glob).
while IFS= read -r file; do
  rm -f -- "${file}" || exit 1
done < <(grep -s -l -x -F -- "${JOB_ID}" "${LOCK_FILE}"-*)

exit 0
|
71
prolog.d/10-GPU_allocate.sh
Executable file
71
prolog.d/10-GPU_allocate.sh
Executable file
@ -0,0 +1,71 @@
|
|||||||
|
#!/bin/bash

##########################################################
# Allocate the GPUs requested by a job:
#   1. determine the requested amount of the resource "gpu"
#   2. walk over the installed GPUs
#      a. try to set a lock file for the GPU
#      b. add the GPU to CUDA_VISIBLE_DEVICES
#   3. add CUDA_VISIBLE_DEVICES to the job environment
##########################################################

### configuration
# Common prefix of the per-GPU lock files; "-<gpu>" is appended to it.
LOCK_FILE="/tmp/gpu-lockfile"
|
||||||
|
|
||||||
|
### function debug
# Hand all arguments to echo, which prints them to stdout
# separated by single spaces.
debug() {
  echo "$@"
}
|
||||||
|
|
||||||
|
### function clean_up
# Remove every lock file held by the current job, then exit the script.
#
# Globals:
#   JOB_ID    (read) - id of the current job
#   LOCK_FILE (read) - common prefix of the per-GPU lock files
# Arguments:
#   $1 - exit code of the script (optional, defaults to 0)
#          0: no error
#         99: reschedule job
#        100: put job in error state
#       else: put queue in error state
# Exit status:
#   $1, or 1 if a lock file could not be removed
clean_up() {
  # ${1:-0}, not ${1:=0}: positional parameters cannot be assigned to,
  # so the ":=" form aborts the shell when no argument is given.
  local error_code=${1:-0}
  local file

  # Without a job id there is nothing that could be matched safely.
  if [[ -n "${JOB_ID:-}" ]]; then
    # -x -F: the whole line must equal the job id literally, so that
    #        job 12 does not remove the locks held by job 123.
    # -s:    stay quiet when no lock file exists (unmatched glob).
    while IFS= read -r file; do
      rm -f -- "${file}" || exit 1
    done < <(grep -s -l -x -F -- "${JOB_ID}" "${LOCK_FILE}"-*)
  fi

  exit "${error_code}"
}
|
||||||
|
|
||||||
|
### get requested number of GPUs
# Parse "gpu=<n>" from the hard resource list reported by qstat.
NGPUS=$(qstat -j "${JOB_ID}" | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# No gpu request found: treat it as zero requested GPUs.
if [ -z "${NGPUS}" ]; then
  NGPUS=0
fi

### get list of installed GPUs
# Exit without error if nvidia-smi is not available (i.e. no GPUs
# installed) or if the device list cannot be produced.
if [ ! -f /usr/bin/nvidia-smi ]; then
  exit 0
fi
# "GPU <n>: ..." -> "<n>", shuffled so that jobs do not always try the
# same device first.
GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0

### loop over devices and try to allocate one until enough GPUs are allocated
allocated=()
count=0
if [ "${NGPUS}" -gt 0 ]; then
  # GPU_LIST is a whitespace separated list of device indices; word
  # splitting is intended here.
  for gpu in ${GPU_LIST}; do
    lock="${LOCK_FILE}-${gpu}"
    # With noclobber the redirection fails if the lock file already
    # exists, so testing for the lock and taking it is a single step.
    # A separate "test -f" followed by a write would let two prolog
    # scripts running at the same time claim the same GPU.
    if (set -o noclobber; echo "${JOB_ID}" > "${lock}") 2>/dev/null; then
      allocated+=("${gpu}")
      count=$((count + 1))
    elif [ ! -e "${lock}" ]; then
      # The lock is not held by anybody, so the write itself failed:
      # release what was taken so far and reschedule the job.
      clean_up 99
    fi
    # exit loop when enough GPUs are allocated
    if [ "${count}" -ge "${NGPUS}" ]; then
      break
    fi
  done
fi

### add CUDA_VISIBLE_DEVICES to job's environment
if [ "${count}" -lt "${NGPUS}" ]; then
  # not enough free GPUs: release the locks and reschedule the job
  clean_up 99
fi

# comma separated list of the allocated device indices (may be empty)
CUDA_VISIBLE_DEVICES=$(IFS=,; echo "${allocated[*]}")

ENV_FILE="${SGE_JOB_SPOOL_DIR}/environment"
# The environment file must already exist and be writable; otherwise
# the job is put in error state.
if [ ! -f "${ENV_FILE}" ] || [ ! -w "${ENV_FILE}" ]; then
  clean_up 100
fi
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" >> "${ENV_FILE}" || clean_up 100

# clean exit
exit 0
|
Loading…
Reference in New Issue
Block a user