From 097ccac7bd8de8b3ba8cf3f4b64d89a06b05245a Mon Sep 17 00:00:00 2001
From: "Kasper D. Fischer"
Date: Wed, 26 May 2021 19:48:41 +0200
Subject: [PATCH] add scripts for allocating / releasing GPUs

---
 epilog.d/95-GPU_release.sh  | 17 +++++++++
 prolog.d/10-GPU_allocate.sh | 71 +++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100755 epilog.d/95-GPU_release.sh
 create mode 100755 prolog.d/10-GPU_allocate.sh

diff --git a/epilog.d/95-GPU_release.sh b/epilog.d/95-GPU_release.sh
new file mode 100755
index 0000000..cd981fe
--- /dev/null
+++ b/epilog.d/95-GPU_release.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+###############################################
+# release GPUs allocated by this job
+###############################################
+
+### set variables
+LOCK_FILE=/tmp/gpu-lockfile
+
+files=$(grep -s -l -x "${JOB_ID}" ${LOCK_FILE}-* | xargs echo) # -x: whole-line match, so job 12 never matches a lock held by job 123
+if [ -n "${files}" ] ; then
+  for file in ${files} ; do
+    rm -f "${file}" || exit 1
+  done
+fi
+
+exit 0
diff --git a/prolog.d/10-GPU_allocate.sh b/prolog.d/10-GPU_allocate.sh
new file mode 100755
index 0000000..8e7f42c
--- /dev/null
+++ b/prolog.d/10-GPU_allocate.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+##########################################################
+# Allocate requested GPUs:
+# step 1: get requested number of GPUs from the resource list
+# step 2: loop over installed GPUs
+# step 2a: try to create the lock file
+# step 2b: extend CUDA_VISIBLE_DEVICES
+# step 3: add CUDA_VISIBLE_DEVICES to the job environment
+##########################################################
+
+### set variables
+LOCK_FILE=/tmp/gpu-lockfile
+
+function debug() {
+  echo "$@"
+}
+
+### function clean_up
+# release all locks held by this job, then exit with the given code:
+# 0: no error
+# 99: reschedule job
+# 100: put job in error state
+# else: put queue in error state
+function clean_up() {
+  error_code=${1:-0}
+  files=$(grep -s -l -x "${JOB_ID}" ${LOCK_FILE}-* | xargs echo) # -x: whole-line match only
+  if [ -n "${files}" ] ; then
+    for file in ${files} ; do
+      rm -f "${file}" || exit 1
+    done
+  fi
+  exit ${error_code}
+}
+
+### get requested number of GPUs
+# use hard resource list first
+NGPUS=$(qstat -j ${JOB_ID} | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
+# set NGPUS to zero if empty
+if [ -z "${NGPUS}" ] ; then
+  NGPUS=0
+fi
+
+# get shuffled list of installed GPU indices (exit without error if nvidia-smi is not available, i.e. no GPUs installed)
+[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0
+
+## loop over devices and try to allocate each one until enough GPUs are allocated
+CUDA_VISIBLE_DEVICES=''
+count=0
+if [ "${NGPUS}" -gt "0" ] ; then
+  for gpu in ${GPU_LIST} ; do
+    # noclobber makes the redirection fail if the lock file already exists (atomic test-and-set, no race between concurrent prologs)
+    if ( set -o noclobber ; echo "${JOB_ID}" > "${LOCK_FILE}-${gpu}" ) 2>/dev/null ; then
+      CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
+      let "count++"
+    fi
+    # exit loop when enough GPUs are allocated
+    [ "${count}" -ge "${NGPUS}" ] && break
+  done
+fi
+
+## add CUDA_VISIBLE_DEVICES to the job's environment
+if [ "${count}" -ge "${NGPUS}" ] ; then
+  ENV_FILE=${SGE_JOB_SPOOL_DIR}/environment
+  [ -f "${ENV_FILE}" -a -w "${ENV_FILE}" ] && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //;s/ /,/g')" >> "${ENV_FILE}" || clean_up 100
+else
+  clean_up 99
+fi
+
+# clean exit
+exit 0
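
Deployment note: SGE's global configuration takes a single prolog and a
single epilog script (set via "qconf -mconf"), so the prolog.d/ and
epilog.d/ directories added here need a dispatcher that is not part of
this patch. A minimal sketch of such a dispatcher, assuming the scripts
are checked out under /opt/sge/scripts (path and file layout are
assumptions, not part of the patch):

    #!/bin/bash
    # run every part in lexical order (hence the 10-/95- prefixes) and
    # pass a part's non-zero exit code straight back to SGE, which treats
    # 99 as "reschedule job" and 100 as "put job in error state"
    PARTS_DIR=/opt/sge/scripts/prolog.d    # assumed checkout location
    for part in "${PARTS_DIR}"/*.sh ; do
      "${part}" || exit $?
    done
    exit 0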
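How a job consumes the allocation, assuming a "gpu" consumable complex is
defined on the cluster (the prolog's sed pattern expects the request to
appear as "gpu=<n>" in the job's hard resource list). A hypothetical job
script:

    #!/bin/bash
    #$ -l gpu=2
    # the prolog appended CUDA_VISIBLE_DEVICES (e.g. "0,2") to this job's
    # environment file, so CUDA applications see only the allocated devices
    echo "allocated GPUs: ${CUDA_VISIBLE_DEVICES}"

Submitting with "qsub -l gpu=2 job.sh" behaves the same way, since
command-line requests also end up in the hard resource list.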
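The locking protocol is one file per device, /tmp/gpu-lockfile-<index>,
whose content is the owning job's ID; the epilog removes every lock whose
content matches the finished job's $JOB_ID. A quick way to audit a node's
current allocation (illustration only, not part of the patch):

    # print which job holds which GPU on this node
    for lock in /tmp/gpu-lockfile-* ; do
      [ -e "${lock}" ] && echo "GPU ${lock##*-}: job $(cat "${lock}")"
    done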