add scripts for allocating / releasing GPUs
commit 097ccac7bd (parent d9ca7c3127)
epilog.d/95-GPU_release.sh (new executable file)
@@ -0,0 +1,17 @@
#!/bin/bash

###############################################
# release allocated GPUs
###############################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

# find every lock file that carries this job's ID (JOB_ID is set by Grid Engine)
files=$(grep -s -l $JOB_ID ${LOCK_FILE}-* | xargs echo)

# remove the job's lock files so the GPUs become available again
if [ ! -z "${files}" ] ; then
    for file in ${files} ; do
        rm -f ${file} || exit 1
    done
fi

exit 0
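For context, the lock files this epilog removes are created per GPU by the prolog below, each holding the owning job's ID as its only content. A hypothetical node state while job 4242 holds GPUs 0 and 3 (job ID and indices invented for illustration) would look like this, and the job ID inside the files is exactly what the grep -l above matches on:

    $ ls /tmp/gpu-lockfile-*
    /tmp/gpu-lockfile-0  /tmp/gpu-lockfile-3
    $ cat /tmp/gpu-lockfile-0 /tmp/gpu-lockfile-3
    4242
    4242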
prolog.d/10-GPU_allocate.sh (new executable file)
@@ -0,0 +1,71 @@
#!/bin/bash

##########################################################
# Allocate requested GPUs:
# step 1:  get the requested number of GPUs (resource "gpu")
# step 2:  loop over installed GPUs
# step 2a: try to set a lock file
# step 2b: extend CUDA_VISIBLE_DEVICES
# step 3:  add CUDA_VISIBLE_DEVICES to the job environment
##########################################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

# simple helper for debug output
function debug() {
    echo "$@"
}

### function clean_up
# remove this job's lock files and exit with the given error code:
# 0:    no error
# 99:   reschedule the job
# 100:  put the job in error state
# else: put the queue in error state
function clean_up() {
    error_code=${1:-0}
    files=$(grep -s -l $JOB_ID ${LOCK_FILE}-* | xargs echo)
    if [ ! -z "${files}" ] ; then
        for file in ${files} ; do
            rm -f ${file} || exit 1
        done
    fi
    exit ${error_code}
}

### get requested number of GPUs
# use the hard resource list first
NGPUS=$(qstat -j ${JOB_ID} | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
    NGPUS=0
fi

# get a shuffled list of installed GPU indices
# (exit without error if nvidia-smi is not available, i.e. no GPUs installed)
[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0

## loop over devices and try to allocate one until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt "0" ] ; then
    for gpu in ${GPU_LIST} ; do
        if [ ! -f ${LOCK_FILE}-${gpu} ] ; then
            echo ${JOB_ID} > ${LOCK_FILE}-${gpu} || clean_up 99
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
            let "count++"
        fi
        # exit the loop when enough GPUs are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi

## add CUDA_VISIBLE_DEVICES to the job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE=$SGE_JOB_SPOOL_DIR/environment
    [ -f ${ENV_FILE} -a -w ${ENV_FILE} ] && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //' | sed 's/ /,/g')" >> ${ENV_FILE} || clean_up 100
else
    # not enough free GPUs: release any partial allocation and reschedule
    clean_up 99
fi

# clean exit
exit 0
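To make the last step concrete: for a hypothetical job that locked GPUs 3 and 0 (indices invented for illustration), the prolog strips the leading space, turns the remaining spaces into commas and appends a single line to $SGE_JOB_SPOOL_DIR/environment, so the job starts with:

    CUDA_VISIBLE_DEVICES=3,0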
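As a rough sketch of how these scripts could be wired into Grid Engine (not part of this commit; the complex name "gpu" matches what the prolog parses from qstat -j, while the host name, GPU count and wrapper script paths below are placeholders):

    # define a consumable complex "gpu" (qconf -mc adds a line like):
    #   gpu   gpu   INT   <=   YES   YES   0   0
    qconf -mc

    # advertise the number of installed GPUs on each execution host
    qconf -mattr exechost complex_values gpu=2 gpunode01

    # point prolog/epilog at wrapper scripts that run prolog.d/*.sh and epilog.d/*.sh
    #   prolog   root@/opt/sge/scripts/prolog
    #   epilog   root@/opt/sge/scripts/epilog
    qconf -mconf gpunode01

    # jobs then request GPUs via the same resource the prolog looks for
    qsub -l gpu=1 myjob.sh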