#!/bin/bash
##########################################################
# Allocate requested GPUs:
#   step 1:  get requested GPU resource count
#   step 2:  loop over installed GPUs
#   step 2a: try to set lock file
#   step 2b: set CUDA_VISIBLE_DEVICES
#   step 3:  add CUDA_VISIBLE_DEVICES to job environment
##########################################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

function debug() {
    echo "$@"
}

### function clean_up
# remove this job's lock files and exit with an error code:
#     0: no error
#    99: reschedule job
#   100: put job in error state
#  else: put queue in error state
function clean_up() {
    error_code=${1:-0}
    files=$(grep -s -l "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
    if [ -n "${files}" ] ; then
        for file in ${files} ; do
            rm -f "${file}" || exit 1
        done
    fi
    exit ${error_code}
}

### get requested number of GPUs
# use hard resource list first
NGPUS=$(qstat -j ${JOB_ID} 2>/dev/null | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
    NGPUS=0
fi

### get shuffled list of installed GPU indices
# (exit without error if nvidia-smi is not available, i.e. no GPUs installed)
[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0

## loop over devices and try to allocate one until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt "0" ] ; then
    for gpu in ${GPU_LIST} ; do
        if [ ! -f ${LOCK_FILE}-${gpu} ] ; then
            # claim the device by writing our job id into its lock file
            echo ${JOB_ID} > ${LOCK_FILE}-${gpu} || clean_up 99
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
            let "count++"
        fi
        # exit loop when enough GPUs are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi

## add CUDA_VISIBLE_DEVICES to the job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE=${SGE_JOB_SPOOL_DIR}/environment
    [ -f ${ENV_FILE} -a -w ${ENV_FILE} ] \
        && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //' | sed 's/ /,/g')" >> ${ENV_FILE} \
        || clean_up 100
else
    # not enough free GPUs: release what we grabbed and reschedule the job
    clean_up 99
fi

# clean exit
exit 0
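
##########################################################
# Sketch: a race-free alternative for step 2a (not the
# method used above, just an illustration). The
# test-then-write in the loop can race if two prologs
# start at the same time on one host. With bash's
# noclobber option, the redirection itself fails when the
# lock file already exists, so checking and claiming the
# GPU become a single atomic step. The function name
# allocate_gpu is hypothetical; JOB_ID and LOCK_FILE are
# the variables used above. Placed after "exit 0", this
# block is never executed as part of this script.
##########################################################
allocate_gpu() {
    # returns 0 and writes the lock file only if the GPU was still free
    local gpu=$1
    ( set -o noclobber ; echo "${JOB_ID}" > "${LOCK_FILE}-${gpu}" ) 2>/dev/null
}
# possible use inside the loop:
#   allocate_gpu ${gpu} && count=$((count + 1))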