#!/bin/bash
##########################################################
# Allocate requested GPUs:
#  step 1:  get the number of requested GPUs from the job's resource list
#  step 2:  loop over the installed GPUs
#  step 2a: try to acquire a lock file per GPU
#  step 2b: build CUDA_VISIBLE_DEVICES from the locked GPUs
#  step 3:  add CUDA_VISIBLE_DEVICES to the job environment
##########################################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

function debug() {
    echo "$@"
}

### function clean_up
# release this job's lock files and exit with the given error code:
#    0: no error
#   99: reschedule job
#  100: put job in error state
# else: put queue in error state
function clean_up() {
    error_code=${1:-0}
    # find every lock file whose content is exactly this job's id
    files=$(grep -s -l -x "${JOB_ID}" ${LOCK_FILE}-* | xargs echo)
    if [ -n "${files}" ] ; then
        for file in ${files} ; do
            rm -f "${file}" || exit 1
        done
    fi
    exit ${error_code}
}

### get requested number of GPUs
# use the hard resource list first
NGPUS=$(qstat -j ${JOB_ID} 2>/dev/null | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
    NGPUS=0
fi

### get list of installed GPUs
# exit without error if nvidia-smi is not available (i.e. no GPUs installed);
# nvidia-smi -L prints lines like "GPU 0: ...", so the device index is
# extracted and the resulting list is shuffled
[ -f /usr/bin/nvidia-smi ] && GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e 2>/dev/null) || exit 0

### loop over devices and try to allocate them until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt "0" ] ; then
    for gpu in ${GPU_LIST} ; do
        if [ ! -f "${LOCK_FILE}-${gpu}" ] ; then
            # noclobber makes the redirect fail if another prolog created the
            # lock file between the test and the write, so the claim is atomic;
            # on failure simply try the next device
            ( set -o noclobber ; echo "${JOB_ID}" > "${LOCK_FILE}-${gpu}" ) 2>/dev/null || continue
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
            let "count++"
        fi
        # exit the loop when enough GPUs are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi

### add CUDA_VISIBLE_DEVICES to the job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE=${SGE_JOB_SPOOL_DIR}/environment
    [ -f "${ENV_FILE}" -a -w "${ENV_FILE}" ] && echo "CUDA_VISIBLE_DEVICES=$(echo ${CUDA_VISIBLE_DEVICES} | sed 's/^ //' | sed 's/ /,/g')" >> "${ENV_FILE}" || clean_up 100
else
    # not enough free GPUs on this host: release the locks and reschedule
    clean_up 99
fi

# clean exit
exit 0
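
# ---------------------------------------------------------------------------
# Usage note (illustrative, not part of the original prolog): jobs request
# devices with e.g. "qsub -l gpu=2 ...", assuming a consumable complex named
# "gpu" has been defined (qconf -mc) and this script is registered as the
# queue's prolog (qconf -mq <queue>). The prolog only *acquires* the per-GPU
# lock files; releasing them when the job finishes normally is left to a
# matching epilog. A minimal sketch of such an epilog, assuming the same
# LOCK_FILE prefix and that JOB_ID is also exported to the epilog, would be
# saved as a separate script and registered via the queue's "epilog" setting:
#
#   #!/bin/bash
#   LOCK_FILE=/tmp/gpu-lockfile
#   # remove every lock file that carries this job's id
#   for file in $(grep -s -l -x "${JOB_ID}" ${LOCK_FILE}-*) ; do
#       rm -f "${file}"
#   done
#   exit 0
# ---------------------------------------------------------------------------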