Compare commits

...

5 Commits

Author SHA1 Message Date
097ccac7bd add scripts for allocating / releasing GPU's 2021-05-27 11:50:46 +02:00
d9ca7c3127 Add support for pe_prolog and pe_epilog files
Send all command line args to the files to be run
Add script to remove empty *.pe* and *.po* files after running pe jobs
Scripts expect these 11 command-line arguments: $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr
2021-05-25 13:22:39 +02:00
d76510b2aa Fixing pro-epilog_wrapper.sh to prevent error
pro-epilog_wrapper exited with an error when a directory contained more than one file
2021-05-25 13:17:42 +02:00
d49fd18668 fixing path to prolog.d, epilog.d, pe_prolog.d, pe_epilog.d; remove echo 2021-05-25 12:14:15 +02:00
02913e042d create modular prolog and epilog setup
all files named [0-9][0-9]* in /opt/SGE/local/scripts/{prolog,epilog,pe_prolog,pe_epilog}.d and in /usr/local/etc/gridengine/{prolog,epilog,pe_prolog,pe_epilog}.d will be executed.
2021-05-25 11:33:08 +02:00
11 changed files with 140 additions and 0 deletions
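
For context, a sketch of how these wrapper scripts are typically wired into Grid Engine. The install path, PE name and editor contents below are assumptions, not taken from this change; the 11 arguments are the ones listed in the commit message above:

    # global (or queue) configuration: point prolog/epilog at the wrappers
    qconf -mconf
        prolog   /opt/SGE/local/bin/prolog
        epilog   /opt/SGE/local/bin/epilog
    # parallel environment: hand the 11 arguments to the PE wrappers
    qconf -mp smp
        start_proc_args  /opt/SGE/local/bin/pe_prolog $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr
        stop_proc_args   /opt/SGE/local/bin/pe_epilog $pe_hostfile $host $job_owner $job_id $job_name $pe $pe_slots $queue $stdout_path $stderr_path $merge_stderr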

1 bin/epilog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

1 bin/pe_epilog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

1 bin/pe_prolog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

20 bin/pro-epilog_wrapper.sh Executable file

@@ -0,0 +1,20 @@
#!/bin/bash
PATH=/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin
FILES=""
SCRIPTNAME=$(basename "$0")
DIRS="/opt/SGE/local/${SCRIPTNAME}.d /usr/local/etc/gridengine/${SCRIPTNAME}.d"
# collect all drop-in scripts named [0-9][0-9]* from both directories
for DIR in ${DIRS} ; do
    if [ -d "${DIR}" ] ; then
        if [ -n "$(ls ${DIR}/[0-9][0-9]* 2>/dev/null)" ] ; then
            FILES="${FILES} $(ls ${DIR}/[0-9][0-9]* 2>/dev/null)"
        fi
    fi
done
# sort the collected paths, one per line, so the scripts run in a deterministic order
FILESSORTED="$(printf '%s\n' ${FILES} | sort)"
# run each script with all arguments SGE passed to the wrapper;
# abort with that script's exit code on the first failure
for FILE in ${FILESSORTED} ; do
    ${FILE} "$@" || exit $?
done
exit 0
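
The wrapper itself contains no site logic; behaviour comes from executable drop-in scripts with a two-digit prefix in one of the two *.d directories. A minimal, hypothetical drop-in (the name 20-log_args.sh is an example, not part of this change):

    #!/bin/bash
    # hypothetical drop-in: /usr/local/etc/gridengine/prolog.d/20-log_args.sh (must be executable)
    # log whatever arguments SGE handed to the prolog wrapper
    logger -t gridengine-prolog "running on $(hostname) with args: $*"
    exit 0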

1 bin/prolog Symbolic link

@@ -0,0 +1 @@
pro-epilog_wrapper.sh

17 epilog.d/95-GPU_release.sh Executable file

@@ -0,0 +1,17 @@
#!/bin/bash
###############################################
# release the GPUs allocated to this job
###############################################
### set variables
LOCK_FILE=/tmp/gpu-lockfile
# find all lock files that contain this job's ID
files=$(grep -s -l "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
if [ -n "${files}" ] ; then
    for file in ${files} ; do
        rm -f "${file}" || exit 1
    done
fi
exit 0
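
For illustration: the allocation prolog further down writes one lock file per claimed device containing the job ID, so for a hypothetical job 4711 holding two GPUs the cleanup above would operate on something like:

    $ grep -s -l 4711 /tmp/gpu-lockfile-*
    /tmp/gpu-lockfile-0
    /tmp/gpu-lockfile-3
    $ cat /tmp/gpu-lockfile-0
    4711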

0 pe_epilog.d/.gitkeep Normal file

28 pe_epilog.d/… Executable file

@@ -0,0 +1,28 @@
#!/bin/bash
## Delete the STDOUT and STDERR files (.o and .e) if they are empty
## ( we do not want to delete non-empty files, they may contain useful
## troubleshooting or debug information ... )
##
## input args:
# 1: $pe_hostfile
# 2: $host
# 3: $job_owner
# 4: $job_id
# 5: $job_name
# 6: $pe
# 7: $pe_slots
# 8: $queue
# 9: $stdout_path
# 10: $stderr_path
# 11: $merge_stderr
stdout_path=${9}
stderr_path=${10}
# remove each output file only if it exists, is readable, and has size zero
[ -r "${stdout_path}" -a -f "${stdout_path}" ] && [ ! -s "${stdout_path}" ] && rm -f "${stdout_path}"
[ -r "${stderr_path}" -a -f "${stderr_path}" ] && [ ! -s "${stderr_path}" ] && rm -f "${stderr_path}"
exit 0
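
A quick way to exercise this logic by hand, using a placeholder script name (the real file name is not shown in this diff) and dummy values for the arguments the script ignores; only $9 and ${10} matter:

    touch /tmp/test.o4711 /tmp/test.e4711      # two empty output files
    ./99-remove_empty_output.sh pe_hostfile host owner 4711 name smp 4 all.q \
        /tmp/test.o4711 /tmp/test.e4711 n
    ls /tmp/test.[oe]4711                      # both files should be gone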

0 pe_prolog.d/.gitkeep Normal file

0 prolog.d/.gitkeep Normal file

71 prolog.d/10-GPU_allocate.sh Executable file

@@ -0,0 +1,71 @@
#!/bin/bash
##########################################################
# Allocate requested GPUs:
# step 1: get resource GPU
# step 2: loop over installed GPUs
# step 2a: try to set lock file
# step 2b: set CUDA_VISIBLE_DEVICES
# step 3: add CUDA_VISIBLE_DEVICES to job environment
##########################################################
### set variables
LOCK_FILE=/tmp/gpu-lockfile
# helper for ad-hoc debugging (currently unused)
function debug() {
    echo "$@"
}
### function clean_up
# release any lock files already written for this job, then
# exit with error code
#   0: no error
#  99: reschedule job
# 100: put job in error state
# else: put queue in error state
function clean_up() {
    error_code=${1:-0}
    files=$(grep -s -l "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
    if [ -n "${files}" ] ; then
        for file in ${files} ; do
            rm -f "${file}" || exit 1
        done
    fi
    exit ${error_code}
}
### get requested number of GPUs
# use hard resource list first
NGPUS=$(qstat -j ${JOB_ID} | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
    NGPUS=0
fi
# get the list of installed GPU indices in random order
# (exit without error if nvidia-smi is not available, i.e. no GPUs installed)
[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0
## loop over devices and try to allocate one until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt "0" ] ; then
    for gpu in ${GPU_LIST} ; do
        # a missing lock file means the device is free; claim it by writing our job ID
        if [ ! -f ${LOCK_FILE}-${gpu} ] ; then
            echo ${JOB_ID} > ${LOCK_FILE}-${gpu} || clean_up 99
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
            let "count++"
        fi
        # exit loop when enough GPUs are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi
## add CUDA_VISIBLE_DEVICES to the job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE=$SGE_JOB_SPOOL_DIR/environment
    [ -f ${ENV_FILE} -a -w ${ENV_FILE} ] && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //' | sed 's/ /,/g')" >> ${ENV_FILE} || clean_up 100
else
    # not enough free GPUs: release our locks and ask SGE to reschedule the job
    clean_up 99
fi
# clean exit
exit 0
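
Assuming a consumable gpu complex has been defined and set on the execution hosts (for example via qconf -mc and qconf -me, which is not part of this change), a job would request and see its GPUs roughly like this:

    $ echo 'echo $CUDA_VISIBLE_DEVICES' | qsub -l gpu=2 -cwd
    # the prolog locks two free devices and appends a line such as
    #   CUDA_VISIBLE_DEVICES=2,0
    # to $SGE_JOB_SPOOL_DIR/environment, so the job sees only those devices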