Compare commits
5 Commits
a5963b445b
...
097ccac7bd
Author | SHA1 | Date | |
---|---|---|---|
097ccac7bd | |||
d9ca7c3127 | |||
d76510b2aa | |||
d49fd18668 | |||
02913e042d |
1
bin/epilog
Symbolic link
1
bin/epilog
Symbolic link
@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
1
bin/pe_epilog
Symbolic link
1
bin/pe_epilog
Symbolic link
@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
1
bin/pe_prolog
Symbolic link
1
bin/pe_prolog
Symbolic link
@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
20
bin/pro-epilog_wrapper.sh
Executable file
20
bin/pro-epilog_wrapper.sh
Executable file
@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#
# Generic wrapper for Grid Engine prolog/epilog style hooks.
#
# The wrapper derives the hook directories from the name it was invoked
# under (it is meant to be called through symlinks such as prolog, epilog,
# pe_prolog or pe_epilog) and runs every hook named [0-9][0-9]* found in
#   /opt/SGE/local/<name>.d
#   /usr/local/etc/gridengine/<name>.d
# Hooks run in that directory order and in glob (lexical) order within a
# directory.  All arguments of the wrapper are handed to every hook
# unchanged.  The first failing hook stops the run and its exit status
# becomes the exit status of the wrapper; otherwise the wrapper exits 0.

PATH=/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin

# List of hook scripts to run, filled by collect_hooks.
HOOKS=()

#######################################
# Collect the hook scripts of the given directories into HOOKS.
# Globals:   HOOKS (written)
# Arguments: directories to scan; missing directories are skipped
# Returns:   0
#######################################
collect_hooks() {
  local dir hook
  HOOKS=()
  for dir in "$@"; do
    [[ -d "${dir}" ]] || continue
    # Glob instead of parsing ls: an unmatched pattern stays literal and is
    # filtered out by the -f test, as are sub-directories.
    for hook in "${dir}"/[0-9][0-9]*; do
      if [[ -f "${hook}" ]]; then
        HOOKS+=("${hook}")
      fi
    done
  done
  return 0
}

#######################################
# Run every collected hook with the given arguments.
# Globals:   HOOKS (read)
# Arguments: passed through verbatim to each hook
# Returns:   0 if all hooks succeeded, else the status of the first
#            failing hook (later hooks are not run)
#######################################
run_hooks() {
  local hook
  for hook in "${HOOKS[@]}"; do
    # "$@" must stay quoted so arguments containing blanks are not re-split.
    "${hook}" "$@" || return $?
  done
  return 0
}

main() {
  local script_name
  script_name=$(basename -- "$0")
  collect_hooks \
    "/opt/SGE/local/${script_name}.d" \
    "/usr/local/etc/gridengine/${script_name}.d"
  run_hooks "$@"
}

# The status of main (0, or that of the first failing hook) is the exit
# status of the script.
main "$@"
|
1
bin/prolog
Symbolic link
1
bin/prolog
Symbolic link
@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
17
epilog.d/95-GPU_release.sh
Executable file
17
epilog.d/95-GPU_release.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash

###############################################
# Release the GPUs allocated to this job.
#
# Every lock file "${LOCK_FILE}-<suffix>" that contains a line equal to
# $JOB_ID is removed.  Exits 1 if a lock file cannot be removed, else 0.
###############################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

#######################################
# Remove the lock files that belong to one job.
# Arguments: $1 - job id (nothing is removed when empty)
#            $2 - lock file path prefix; files "<prefix>-*" are examined
# Returns:   0 on success (also when there was nothing to remove),
#            1 if a matching lock file could not be removed
#######################################
release_gpu_locks() {
  local job_id=$1
  local prefix=$2
  local lock

  # Without a job id there is nothing that could be ours.
  [[ -n "${job_id}" ]] || return 0

  for lock in "${prefix}"-*; do
    [[ -f "${lock}" ]] || continue
    # -x -F: the whole line must equal the job id, so job 12 never matches
    # a lock held by job 123.
    if grep -q -s -x -F -- "${job_id}" "${lock}"; then
      rm -f -- "${lock}" || return 1
    fi
  done
  return 0
}

# The script exits 0 when the release succeeded (status of the last command).
release_gpu_locks "${JOB_ID:-}" "${LOCK_FILE}" || exit 1
|
0
pe_epilog.d/.gitkeep
Normal file
0
pe_epilog.d/.gitkeep
Normal file
28
pe_epilog.d/99-rm_empty_pe_logs.sh
Executable file
28
pe_epilog.d/99-rm_empty_pe_logs.sh
Executable file
@ -0,0 +1,28 @@
|
||||
#!/bin/bash

## Delete the STDOUT and STDERR files (.o and .e) if they are empty
## ( we do not want to delete non-empty files, they may contain useful
##   troubleshooting or debug information ... )
##

## input args:
#  1: $pe_hostfile
#  2: $host
#  3: $job_owner
#  4: $job_id
#  5: $job_name
#  6: $pe
#  7: $pe_slots
#  8: $queue
#  9: $stdout_path
# 10: $stderr_path
# 11: $merge_stderr

#######################################
# Remove a file if it is a readable, regular, zero-length file.
# Best effort: anything else (empty argument, missing file, directory,
# non-empty file, failed removal) is silently left alone.
# Arguments: $1 - path of the file to check
# Returns:   always 0
#######################################
remove_if_empty() {
  local path=$1

  [[ -n "${path}" ]] || return 0
  # Quoted tests keep paths containing blanks intact.
  if [[ -r "${path}" && -f "${path}" && ! -s "${path}" ]]; then
    rm -f -- "${path}"
  fi
  return 0
}

# remove_if_empty always returns 0, so the script always exits 0.
remove_if_empty "${9:-}"
remove_if_empty "${10:-}"
|
||||
|
0
pe_prolog.d/.gitkeep
Normal file
0
pe_prolog.d/.gitkeep
Normal file
0
prolog.d/.gitkeep
Normal file
0
prolog.d/.gitkeep
Normal file
71
prolog.d/10-GPU_allocate.sh
Executable file
71
prolog.d/10-GPU_allocate.sh
Executable file
@ -0,0 +1,71 @@
|
||||
#!/bin/bash
|
||||
|
||||
##########################################################
|
||||
# Allocate requested GPUs:
|
||||
# step 1: get resource GPU
|
||||
# step 2: loop over installed GPUs
|
||||
# step 2a: try to set lock file
|
||||
# step 2b: set CUDA_VISIBLE_DEVICES
|
||||
# step 3: add CUDA_VISIBLE_DEVICES to job environment
|
||||
##########################################################
|
||||
|
||||
### set variables
# LOCK_FILE is the path prefix of the per-GPU lock files; the script appends
# "-<gpu>" to it and stores the job id in the resulting file.
|
||||
LOCK_FILE=/tmp/gpu-lockfile
|
||||
|
||||
#######################################
# Print a diagnostic message.
# Arguments: the words of the message; they are joined with single blanks
# Outputs:   the message followed by a newline, on stdout
#######################################
debug() {
  # Join on a blank regardless of the caller's IFS.
  local IFS=' '
  # printf instead of echo: words such as -n or -e are printed literally
  # rather than being taken as echo options.
  printf '%s\n' "$*"
}
|
||||
|
||||
### function clean_up
# Remove every lock file held by this job, then exit the script.
# Globals:
#   JOB_ID    (read) id stored in the lock files of this job
#   LOCK_FILE (read) path prefix of the lock files ("<prefix>-*")
# Arguments:
#   $1 - exit code, defaults to 0
#          0: no error
#         99: reschedule job
#        100: put job in error state
#       else: put queue in error state
# Exits with $1, or with 1 if a lock file of this job cannot be removed.
clean_up() {
  # ":-" rather than ":=": a positional parameter cannot be assigned through
  # parameter expansion, so ":=" aborts the shell when no argument is given.
  local error_code=${1:-0}
  local lock

  if [[ -n "${JOB_ID:-}" ]]; then
    for lock in "${LOCK_FILE}"-*; do
      [[ -f "${lock}" ]] || continue
      # -x -F: the whole line must equal the job id, so job 12 never
      # releases a lock held by job 123.
      if grep -q -s -x -F -- "${JOB_ID}" "${lock}"; then
        rm -f -- "${lock}" || exit 1
      fi
    done
  fi

  exit "${error_code}"
}
|
||||
|
||||
### get requested number of GPUs
# use hard resource list first
NGPUS=$(qstat -j "${JOB_ID}" | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
  NGPUS=0
fi

# get list of installed GPUs in random order (exit without error if
# nvidia-smi is not available, i.e. no GPUs installed)
if [ ! -f /usr/bin/nvidia-smi ] ; then
  exit 0
fi
GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0

## loop over devices and try to allocate one until enough GPUs are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt 0 ] ; then
  # GPU_LIST is a whitespace separated list of indices: word splitting is
  # intended here.
  for gpu in ${GPU_LIST} ; do
    # With noclobber the redirection fails if the lock file already exists,
    # so testing for the lock and taking it is one atomic step and two
    # prologs running at the same time cannot both get the same GPU.
    # A GPU whose lock cannot be created (held by another job or not
    # writable) is skipped; too few GPUs leads to a reschedule below.
    if ( set -o noclobber ; echo "${JOB_ID}" > "${LOCK_FILE}-${gpu}" ) 2>/dev/null ; then
      CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
      count=$(( count + 1 ))
    fi
    # exit loop when enough GPUs are allocated
    [ "${count}" -ge "${NGPUS}" ] && break
  done
fi

## add CUDA_VISIBLE_DEVICES to job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
  ENV_FILE="${SGE_JOB_SPOOL_DIR}/environment"
  if [ -f "${ENV_FILE}" ] && [ -w "${ENV_FILE}" ] ; then
    # " 0 1" -> "0,1": drop the leading blank, separate indices by commas
    devices=${CUDA_VISIBLE_DEVICES# }
    devices=${devices// /,}
    # a job without the environment entry must not start: error state
    printf '%s\n' "CUDA_VISIBLE_DEVICES=${devices}" >> "${ENV_FILE}" || clean_up 100
  else
    clean_up 100
  fi
else
  # not enough free GPUs: release what was taken and reschedule the job
  clean_up 99
fi

# clean exit
exit 0
|
Loading…
x
Reference in New Issue
Block a user