move everything to local folder
This commit is contained in:
1
local/bin/epilog
Symbolic link
1
local/bin/epilog
Symbolic link
@@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
||||
1
local/bin/pe_epilog
Symbolic link
1
local/bin/pe_epilog
Symbolic link
@@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
||||
1
local/bin/pe_prolog
Symbolic link
1
local/bin/pe_prolog
Symbolic link
@@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
||||
20
local/bin/pro-epilog_wrapper.sh
Executable file
20
local/bin/pro-epilog_wrapper.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#
# Generic Grid Engine prolog/epilog dispatcher.
#
# The script derives a name from the way it was invoked (basename of $0) and
# runs every hook whose file name starts with two digits from
#   /opt/SGE/local/<name>.d
#   /usr/local/etc/gridengine/<name>.d
# passing its own arguments through unchanged.
#
# Order:   directories as listed above; within a directory in glob order.
# Returns: 0 if every hook succeeded, otherwise the exit status of the first
#          failing hook (later hooks are not run).

PATH=/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin

SCRIPTNAME=$(basename -- "$0")
DIRS=(
  "/opt/SGE/local/${SCRIPTNAME}.d"
  "/usr/local/etc/gridengine/${SCRIPTNAME}.d"
)

# An unmatched glob must expand to nothing, not to the literal pattern.
shopt -s nullglob

FILES=()
for DIR in "${DIRS[@]}"; do
  [[ -d "${DIR}" ]] || continue
  FILES+=("${DIR}"/[0-9][0-9]*)
done

for FILE in "${FILES[@]}"; do
  # Only regular files are hooks; a matching sub-directory is ignored.
  [[ -f "${FILE}" ]] || continue
  # "$@" keeps every argument intact, even if it contains whitespace.
  "${FILE}" "$@" || exit $?
done

exit 0
|
||||
1
local/bin/prolog
Symbolic link
1
local/bin/prolog
Symbolic link
@@ -0,0 +1 @@
|
||||
pro-epilog_wrapper.sh
|
||||
5
local/bin/qlogin_wrapper
Executable file
5
local/bin/qlogin_wrapper
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
# $Id: qlogin_wrapper 175 2010-09-15 15:34:28Z kasper $
#
# Open an ssh session (X11 and agent forwarding, quiet) as the current user.
# Arguments: $1 - host to connect to
#            $2 - port to connect to
HOST=$1
PORT=$2

# Quoted so that empty or unusual values cannot shift ssh's argument list.
/usr/bin/ssh -XAq -p "$PORT" "$USER@$HOST"
|
||||
25
local/bin/suspend.sh
Executable file
25
local/bin/suspend.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/usr/bin/ksh
# $Id: suspend.sh 365 2013-11-18 09:58:17Z kasper $
# This script should be added as the SUSPEND_METHOD in the
# queue definition with a $job_pid, $job_id, and $job_owner arguments.
# e.g. script.sh $job_pid $job_id $job_owner
#
# Sends SIGTSTP to all processes in process group $job_pid. If that group has
# no processes, two lines are appended to qdel_log.log in the job owner's
# home directory and the script exits with status 1.

if [ -z "$3" ]
then
  echo "Usage: $0 \$job_pid \$job_id \$job_owner"
  exit 1
fi

job_pid=$1
job_id=$2
job_owner=$3

# "~$job_owner" would not work: tilde expansion happens before parameter
# expansion, so the path would stay a literal "~<owner>/...". Look the home
# directory up in the passwd database instead; use stderr if it is unknown.
owner_home=$(getent passwd "$job_owner" | cut -d: -f6)
if [ -n "$owner_home" ] && [ -d "$owner_home" ]
then
  log_file="$owner_home/qdel_log.log"
else
  log_file=/dev/stderr
fi

stat=$(pgrep -g "$job_pid")
if [ -n "$stat" ]
then
  /usr/bin/pkill --signal SIGTSTP -g "$job_pid"
else
  {
    echo "Process $job_pid not found for job $job_id"
    echo "Unable to suspend."
  } >> "$log_file"
  exit 1
fi

#uncomment the following for debugging
#echo "Suspending Job $job_id " >> "$log_file"
|
||||
30
local/bin/term.sh
Executable file
30
local/bin/term.sh
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/ksh
# $Id: term.sh 364 2013-11-18 09:55:05Z kasper $
# This script should be added as the TERMINATE_METHOD in the
# queue definition with $job_pid, $job_id, $job_owner, and interval arguments.
# e.g. script.sh $job_pid $job_id $job_owner 90
#
# Escalates through signals 2 (INT), 15 (TERM) and 9 (KILL): while the owner
# still has processes in process group $job_pid, the next signal is sent to
# the group and the script sleeps for <interval> seconds.

if [ -z "$4" ]
then
  echo "Usage: $0 \$job_pid \$job_id \$job_owner interval"
  exit 1
fi

job_pid=$1
job_id=$2
job_owner=$3
interval=$4

# Debugging aid. Note that "~$job_owner" is not tilde-expanded by the shell,
# so resolve the home directory first, e.g.:
#log_file="$(getent passwd "$job_owner" | cut -d: -f6)/qdel_log.log"
#echo "Term script Running on: $USER $job_pid $job_id $job_owner $interval" >> "$log_file"
#pgrep -g "$job_pid" >> "$log_file"

for sig in 2 15 9
do
  stat=$(pgrep -g "$job_pid" -u "$job_owner")
  if [ -n "$stat" ]
  then
    #echo "Sending $sig to $job_pid" >> "$log_file"
    /usr/bin/pkill --signal "$sig" -g "$job_pid"
    sleep "$interval"
  else
    # nothing left to signal
    break
  fi
done

#uncomment the following for debugging
#echo "Job $job_id killed." >> "$log_file"
|
||||
17
local/epilog.d/95-GPU_release.sh
Executable file
17
local/epilog.d/95-GPU_release.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash

###############################################
# release allocated GPUS
###############################################

### set variables
LOCK_FILE=/tmp/gpu-lockfile

# Remove every ${LOCK_FILE}-* file that contains a line equal to $JOB_ID.
# The match is literal and whole-line (-F -x): a plain pattern match would
# let job 12 also release the locks held by jobs 123 or 412.
for lock in "${LOCK_FILE}"-*; do
  # skips the literal pattern when no lock file exists
  [[ -f "${lock}" ]] || continue
  if grep -q -s -x -F -- "${JOB_ID}" "${lock}"; then
    rm -f -- "${lock}" || exit 1
  fi
done

exit 0
|
||||
11
local/epilog.d/99-rm_empty_logs.sh
Executable file
11
local/epilog.d/99-rm_empty_logs.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#! /bin/bash
# $Id: epilog 181 2010-09-17 15:55:28Z kasper $

## Delete the STDOUT and STDERR files (.o and .e) if they are empty
## ( we do not want to delete non-empty files, they may contain useful
## troubleshooting or debug information ... )
##

#######################################
# Remove a file if it is a readable, regular, zero-length file.
# Arguments: $1 - path to check (may be empty, then nothing happens)
# Returns:   0 when nothing had to be removed, otherwise rm's status
#######################################
remove_if_empty() {
  local path=$1
  [[ -n "${path}" && -r "${path}" && -f "${path}" && ! -s "${path}" ]] \
    || return 0
  rm -f -- "${path}"
}

remove_if_empty "${SGE_STDOUT_PATH}"
remove_if_empty "${SGE_STDERR_PATH}"

exit 0
|
||||
81
local/examples/jobs/matlab_script.sh
Executable file
81
local/examples/jobs/matlab_script.sh
Executable file
@@ -0,0 +1,81 @@
|
||||
#!/bin/bash

#############################################################
# This example produces a very simple plot and              #
# saves it as Matlab figure file and as PNG file            #
#############################################################

#############################################################
# set qsub options                                          #
#############################################################
# run in low.q
#$ -l low

# request enough memory
#$ -l h_vmem=8G,memory=8G,h_stack=8M

# request 1 matlab license.
#$ -l matlab=1

# Name the job 'Matlab'
#$ -N Matlab

# send e-mail after job has finished
# use the -M option to define your e-mail address
# #$ -M meine-email@example.org
#$ -m e

# join stdout and stderr in one file
#$ -j y

#############################################################
# output hostname and date (comment out if not needed)      #
#############################################################
echo "Running Matlab on host $(hostname)"
echo "Starting Matlab at $(date)"

#############################################################
# launch matlab                                             #
#############################################################

# run non-interactive Matlab session
# use no display (-nodisplay)
# don't show splash screen at startup (-nosplash)
# don't start the matlab desktop (-nodesktop)
# use software opengl (-softwareopengl)
# only use single threaded computations (limit to use of 1 core, -singleCompThread)
# execute all matlab commands between '<< END' and matching 'END'

# Don't forget to add 'exit' and 'END' after replacing
# the commands with your own!
# (The delimiter is unquoted, so the shell still expands \$variables and
# command substitutions inside the Matlab code.)

/opt/matlab/bin/matlab -nodisplay -nosplash -nodesktop -softwareopengl -singleCompThread << END

% get environment variable JOB_ID
jobid=str2num(getenv('JOB_ID'));
if isempty(jobid)
jobid = 0;
end

% create filenames for the figure
filename=sprintf('matlab_figure_%d', jobid);

% create new empty figure and save figure handle
fh = figure();

% draw plot
plot(-pi:0.01:pi, sin(-pi:0.01:pi));

% save figure as matlab figure and PNG
saveas(fh, filename, 'fig');
saveas(fh, filename, 'png');

% EXIT MATLAB
exit;

END

#############################################################
# output date (comment out if not needed)                   #
#############################################################
echo "Matlab finished at $(date)"
|
||||
75
local/examples/jobs/periodic_sleeper.sh
Executable file
75
local/examples/jobs/periodic_sleeper.sh
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/bin/bash
# This job script takes a nap for 10 seconds (or parameter $2) every 30 minutes (or parameter $1)
#
# After the nap it re-submits itself with qsub so that the next run becomes
# eligible T minutes from now.

# SGE options
#$ -N PSleeper
#$ -l scf=1M,mem=100M,h_vmem=100M
#$ -q normal.q
#$ -cwd

# process args
debug=0
terse="-terse"
debug_flag=""
do_echo=0
T=30
nap=10
while (( $# )); do
  case "$1" in
    -h)
      echo "usage: $0 [-h | [-d] [T] [nap]]"
      echo "periodically take a nap"
      echo ""
      echo "-h print this help and exit"
      echo "-d print debug info"
      echo "T take a nap every T minutes (default: 30)"
      echo "nap take a nap for nap seconds (default: 10)"
      exit 1
      ;;
    -d)
      debug=1
      terse=""
      debug_flag="-d"
      do_echo=1
      ;;
    *)
      # first non-option argument: the rest is positional
      T=${1:-30}
      nap=${2:-10}
      break
      ;;
  esac
  shift
done

# set other variables
# Stop here if T is not usable: an empty start time would shift all of
# qsub's arguments by one.
if ! next=$(date -d "${T} minutes" +%Y%m%d%H%M); then
  echo "invalid interval T='${T}'" >&2
  exit 1
fi
# NOTE(review): the commit adding this file places it under
# local/examples/jobs/ - confirm that this path still resolves.
script=/opt/SGE/examples/jobs/periodic_sleeper.sh

# output some information
if [ "${debug}" -eq 1 ]; then
  echo "T = ${T}, nap=${nap}"
  echo "next run at ${next} (YYYYMMDDhhmm)"
  echo "debug_flag = ${debug_flag}, do_echo = ${do_echo}"
  echo ""
fi

# commands to run in Grid Engine
/opt/SGE/examples/jobs/sleeper.sh "${nap}" "${do_echo}"

# re-submit script to execute in T minutes
# (${var:+...} drops an optional flag entirely when it is empty)
jobid=$(qsub ${terse:+"${terse}"} -a "${next}" "${script}" \
  ${debug_flag:+"${debug_flag}"} "${T}" "${nap}")
exit_code=$?
if [ "${debug}" -eq 1 ]; then
  echo "${jobid}"
fi
if [ "${exit_code}" -ne 0 ]; then
  if [ "${debug}" -eq 1 ]; then
    echo "Ups, something went wrong, check output!"
  fi
  exit "${exit_code}"
fi
|
||||
44
local/examples/jobs/show_available_cuda_devices.sh
Executable file
44
local/examples/jobs/show_available_cuda_devices.sh
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/bin/bash
#############################################################
# use tensorflow to show available GPU devices              #
# optional argument is the conda environment to use         #
# default is tf-gpu                                         #
#############################################################
TF_ENV=${1:-tf-gpu}
# quoted: an argument containing spaces must not break the test
if [ "${TF_ENV}" = "-h" ] ; then
  echo "Usage: $(basename -- "$0") [tensor_flow_env]"
  exit 0
fi

#############################################################
# set qsub options                                          #
#############################################################
#$ -cwd
#$ -N CUDAtest
#$ -l memory=64G,h_vmem=64G

#############################################################
# initialize conda                                          #
#############################################################
__conda_setup="$('/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
  eval "$__conda_setup"
else
  if [ -f "/opt/anaconda3/etc/profile.d/conda.sh" ]; then
    . "/opt/anaconda3/etc/profile.d/conda.sh"
  else
    export PATH="/opt/anaconda3/bin:$PATH"
  fi
fi
unset __conda_setup
# <<< conda initialize <<<

#############################################################
# activate conda env and call python commands               #
#############################################################
# Do not run python in whatever environment happens to be active when the
# requested one cannot be activated.
if ! conda activate "${TF_ENV}"; then
  echo "cannot activate conda environment '${TF_ENV}'" >&2
  exit 1
fi
export TF_CPP_MIN_LOG_LEVEL=3
export TF_FORCE_GPU_ALLOW_GROWTH=true
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-''}."
python3 -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())"
conda deactivate
|
||||
37
local/examples/jobs/simple_conda_test.sh
Executable file
37
local/examples/jobs/simple_conda_test.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#! /bin/bash

#############################################################
# Example job: print the list of available conda            #
# environments                                              #
#############################################################

#############################################################
# qsub options                                              #
#############################################################
# submit to low.q
#$ -l low

# memory request (disabled)
# #$ -l h_vmem=8G,memory=8G,h_stack=8M

# job name is 'Conda-Test'
#$ -N Conda-Test

#############################################################
# conda initialisation                                      #
#############################################################
# Preferred: evaluate the hook printed by conda itself. If that command
# fails, source conda.sh when present, else just put conda on PATH.
if __conda_setup="$('/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"; then
  eval "$__conda_setup"
elif [ -f "/opt/anaconda3/etc/profile.d/conda.sh" ]; then
  . "/opt/anaconda3/etc/profile.d/conda.sh"
else
  export PATH="/opt/anaconda3/bin:$PATH"
fi
unset __conda_setup

#############################################################
# list the conda environments                               #
#############################################################
conda env list
|
||||
33
local/patch/fix_euid_set_bug.patch
Normal file
33
local/patch/fix_euid_set_bug.patch
Normal file
@@ -0,0 +1,33 @@
|
||||
diff --git a/source/daemons/shepherd/shepherd.c b/source/daemons/shepherd/shepherd.c
|
||||
index 5af1463..e7b2831 100644
|
||||
--- a/source/daemons/shepherd/shepherd.c
|
||||
+++ b/source/daemons/shepherd/shepherd.c
|
||||
@@ -299,17 +299,24 @@ static int handle_io_file(const char* file, const char* owner, bool rw) {
|
||||
}
|
||||
}
|
||||
|
||||
- /* reset egid and euid to the stored values */
|
||||
- if (sge_seteuid(old_euid) != 0) {
|
||||
- shepherd_trace("Cannot reset euid %s due to %s", owner, strerror(errno));
|
||||
- SGE_CLOSE(fd);
|
||||
+ /* set effective user-id to root again, because only root is allowed to change
|
||||
+ * the euid to any other than the current user-id. */
|
||||
+ if (sge_seteuid(SGE_SUPERUSER_UID) != 0) {
|
||||
+ shepherd_trace("Cannot become root due to %s", strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
+
|
||||
+ /* reset egid and euid to the stored values (e.g. those of the sgeadmin user) */
|
||||
if (sge_setegid(old_egid) != 0) {
|
||||
shepherd_trace("Cannot reset egid %s due to %s", owner, strerror(errno));
|
||||
SGE_CLOSE(fd);
|
||||
return -1;
|
||||
}
|
||||
+ if (sge_seteuid(old_euid) != 0) {
|
||||
+ shepherd_trace("Cannot reset euid %s due to %s", owner, strerror(errno));
|
||||
+ SGE_CLOSE(fd);
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
return fd;
|
||||
}
|
||||
0
local/pe_epilog.d/.gitkeep
Normal file
0
local/pe_epilog.d/.gitkeep
Normal file
28
local/pe_epilog.d/99-rm_empty_pe_logs.sh
Executable file
28
local/pe_epilog.d/99-rm_empty_pe_logs.sh
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/bin/bash

## Delete the STDOUT and STDERR files (.o and .e) if they are empty
## ( we do not want to delete non-empty files, they may contain useful
## troubleshooting or debug information ... )
##

## input args:
# 1: $pe_hostfile
# 2: $host
# 3: $job_owner
# 4: $job_id
# 5: $job_name
# 6: $pe
# 7: $pe_slots
# 8: $queue
# 9: $stdout_path
# 10: $stderr_path
# 11: $merge_stderr

stdout_path=${9}
stderr_path=${10}

#######################################
# Remove a file if it is a readable, regular, zero-length file.
# Arguments: $1 - path to check (may be empty, then nothing happens)
# Returns:   0 when nothing had to be removed, otherwise rm's status
#######################################
remove_if_empty() {
  local path=$1
  [[ -n "${path}" && -r "${path}" && -f "${path}" && ! -s "${path}" ]] \
    || return 0
  rm -f -- "${path}"
}

remove_if_empty "${stdout_path}"
remove_if_empty "${stderr_path}"

exit 0
|
||||
|
||||
0
local/pe_prolog.d/.gitkeep
Normal file
0
local/pe_prolog.d/.gitkeep
Normal file
0
local/prolog.d/.gitkeep
Normal file
0
local/prolog.d/.gitkeep
Normal file
71
local/prolog.d/10-GPU_allocate.sh
Executable file
71
local/prolog.d/10-GPU_allocate.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/bin/bash
|
||||
|
||||
##########################################################
|
||||
# Allocate requested GPU's:
|
||||
# step 1: get resource GPU
|
||||
# step 2: loop over installed GPU's
|
||||
# step 2a: try to set lock file
|
||||
# step 2b: set CUDA_VISIBLE_DEVICES
|
||||
# step 3: add CUDA_VISIBLE_DEVICES to job environment
|
||||
##########################################################
|
||||
|
||||
### set variables
|
||||
LOCK_FILE=/tmp/gpu-lockfile
|
||||
|
||||
# Print all arguments on one line to stdout (same semantics as echo).
debug() {
  echo "$@"
}
|
||||
|
||||
### function clean_up
|
||||
# exit with error code
|
||||
# 0: no error
|
||||
# 99: reschedule job
|
||||
# 100: put job in error state
|
||||
# else: put queue in error state
|
||||
clean_up() {
  # Release this job's GPU locks, then terminate the script.
  # Globals:   LOCK_FILE (read) - common prefix of the per-GPU lock files
  #            JOB_ID (read)    - id whose locks are removed
  # Arguments: $1 - exit status to terminate with (default 0)
  # Exits:     with $1, or with 1 if a lock file cannot be removed

  # ${1:-0}, not ${1:=0}: a positional parameter cannot be assigned to.
  local error_code=${1:-0}
  local lock
  for lock in "${LOCK_FILE}"-*; do
    # skips the literal pattern when no lock file exists
    [[ -f "${lock}" ]] || continue
    # Literal whole-line match, so that job 12 does not also release the
    # locks held by jobs 123 or 412.
    if grep -q -s -x -F -- "${JOB_ID}" "${lock}"; then
      rm -f -- "${lock}" || exit 1
    fi
  done
  exit "${error_code}"
}
|
||||
|
||||
### get requested number of GPU's
# use hard resource list first
# (head keeps a single value even if more than one line matches)
NGPUS=$(qstat -j "${JOB_ID}" 2>/dev/null \
  | sed -n 's/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p' \
  | head -n 1)
# set NGPUS to zero if empty
if [ -z "${NGPUS}" ] ; then
  NGPUS=0
fi

# get list of installed GPU's in random order (exit without error if
# nvidia-smi is not available (i. e. no GPU's installed) or the listing fails)
[ -f /usr/bin/nvidia-smi ] || exit 0
GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e 2>/dev/null) || exit 0

## loop over devices and try to allocate one until enough GPU's are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt 0 ] ; then
  # GPU_LIST is a whitespace separated list; splitting is intended here
  for gpu in ${GPU_LIST} ; do
    lock="${LOCK_FILE}-${gpu}"
    # With noclobber the redirection fails if the file already exists, so
    # testing for the lock and taking it is a single atomic step and two
    # prologs running at the same time cannot both claim the same GPU.
    if ( set -o noclobber ; echo "${JOB_ID}" > "${lock}" ) 2>/dev/null ; then
      CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES} ${gpu}"
      count=$((count + 1))
    elif [ ! -e "${lock}" ] ; then
      # not "already locked" but a real write failure: reschedule the job
      clean_up 99
    fi
    # exit loop when enough GPUS are allocated
    [ "${count}" -ge "${NGPUS}" ] && break
  done
fi

## add CUDA_VISIBLE_DEVICES to job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
  ENV_FILE=$SGE_JOB_SPOOL_DIR/environment
  if [ -f "${ENV_FILE}" ] && [ -w "${ENV_FILE}" ] ; then
    # " 0 1" -> "0,1"
    devices=${CUDA_VISIBLE_DEVICES# }
    devices=${devices// /,}
    echo "CUDA_VISIBLE_DEVICES=${devices}" >> "${ENV_FILE}" || clean_up 100
  else
    clean_up 100
  fi
else
  # not enough free GPU's: release what was taken and reschedule
  clean_up 99
fi

# clean exit
exit 0
|
||||
Reference in New Issue
Block a user