Compare commits
	
		
			5 Commits
		
	
	
		
			a5963b445b
			...
			097ccac7bd
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 097ccac7bd | |||
| d9ca7c3127 | |||
| d76510b2aa | |||
| d49fd18668 | |||
| 02913e042d | 
							
								
								
									
										1
									
								
								bin/epilog
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								bin/epilog
									
									
									
									
									
										Symbolic link
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pro-epilog_wrapper.sh | ||||||
							
								
								
									
										1
									
								
								bin/pe_epilog
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								bin/pe_epilog
									
									
									
									
									
										Symbolic link
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pro-epilog_wrapper.sh | ||||||
							
								
								
									
										1
									
								
								bin/pe_prolog
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								bin/pe_prolog
									
									
									
									
									
										Symbolic link
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pro-epilog_wrapper.sh | ||||||
							
								
								
									
										20
									
								
								bin/pro-epilog_wrapper.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										20
									
								
								bin/pro-epilog_wrapper.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,20 @@ | |||||||
#!/bin/bash
#
# Generic wrapper for Grid Engine prolog/epilog style hooks.
#
# The wrapper is invoked through a symbolic link (prolog, epilog, pe_prolog,
# pe_epilog all point at this file). The name it was called by selects the
# hook directories "<name>.d" that are searched for hook scripts.
#
# Every entry matching [0-9][0-9]* in those directories is executed with the
# wrapper's own arguments, directory by directory and in glob (name) order
# within each directory. The first hook that fails aborts the run and its
# exit status becomes the wrapper's exit status.

PATH=/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin

SCRIPTNAME=$(basename -- "$0")
DIRS=(
	"/opt/SGE/local/${SCRIPTNAME}.d"
	"/usr/local/etc/gridengine/${SCRIPTNAME}.d"
)

# Collect the hook scripts with a glob instead of parsing `ls` output, so
# that unusual file names survive and no error is printed for empty dirs.
FILES=()
for DIR in "${DIRS[@]}" ; do
	[ -d "${DIR}" ] || continue
	for FILE in "${DIR}"/[0-9][0-9]* ; do
		# An unmatched glob stays literal, and sub-directories are not hooks.
		[ -f "${FILE}" ] || continue
		FILES+=("${FILE}")
	done
done

# "$@" keeps every argument intact even if it contains whitespace.
for FILE in "${FILES[@]}" ; do
	"${FILE}" "$@" || exit $?
done
exit 0
							
								
								
									
										1
									
								
								bin/prolog
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								bin/prolog
									
									
									
									
									
										Symbolic link
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pro-epilog_wrapper.sh | ||||||
							
								
								
									
										17
									
								
								epilog.d/95-GPU_release.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								epilog.d/95-GPU_release.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,17 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | 
 | ||||||
|  | ############################################### | ||||||
|  | # release allocated GPUS | ||||||
|  | ############################################### | ||||||
|  | 
 | ||||||
|  | ### set variables | ||||||
LOCK_FILE=/tmp/gpu-lockfile

#######################################
# Remove every GPU lock file that belongs to the current job.
# A lock file belongs to the job when one of its lines is exactly JOB_ID;
# a substring match would also release the locks of other jobs
# (e.g. job 12 would match the lock of job 123).
# Globals:   JOB_ID (read), LOCK_FILE (read)
# Returns:   0 on success, 1 if a lock file could not be removed
#######################################
release_gpu_locks() {
    local file
    # Without a job id there is nothing that could safely be matched.
    [ -n "${JOB_ID}" ] || return 0
    for file in "${LOCK_FILE}"-* ; do
        # An unmatched glob stays literal; skip anything that is not a file.
        [ -f "${file}" ] || continue
        if grep -q -s -x -F -- "${JOB_ID}" "${file}" ; then
            rm -f -- "${file}" || return 1
        fi
    done
    return 0
}

release_gpu_locks || exit 1
|  | 
 | ||||||
|  | exit 0 | ||||||
							
								
								
									
										0
									
								
								pe_epilog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								pe_epilog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										28
									
								
								pe_epilog.d/99-rm_empty_pe_logs.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										28
									
								
								pe_epilog.d/99-rm_empty_pe_logs.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,28 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | 
 | ||||||
|  | ## Delete the STDOUT and STDERR files (.o and .e) if they are empty | ||||||
|  | ##  ( we do not want to delete non-empty files, they may contain useful | ||||||
|  | ##    troubleshooting or debug information ... ) | ||||||
|  | ## | ||||||
|  | 
 | ||||||
|  | ## input args: | ||||||
|  | #	1: $pe_hostfile | ||||||
|  | #	2: $host | ||||||
|  | #	3: $job_owner | ||||||
|  | #	4: $job_id | ||||||
|  | #	5: $job_name | ||||||
|  | #	6: $pe | ||||||
|  | #	7: $pe_slots | ||||||
|  | #	8: $queue | ||||||
|  | #	9: $stdout_path | ||||||
|  | #	10: $stderr_path | ||||||
|  | #	11: $merge_stderr | ||||||
|  | 
 | ||||||
stdout_path=${9}
stderr_path=${10}

#######################################
# Delete a file only if it is a readable, regular, zero-length file.
# Anything else (missing, non-empty, directory, empty argument) is left
# untouched.
# Arguments: $1 - path of the candidate file
# Returns:   0 if nothing had to be done, otherwise the status of rm
#######################################
remove_if_empty() {
    local path=$1
    # Quoted [[ ]] tests keep paths with whitespace or glob characters intact.
    [[ -n "${path}" && -r "${path}" && -f "${path}" && ! -s "${path}" ]] || return 0
    rm -f -- "${path}"
}

remove_if_empty "${stdout_path}"
remove_if_empty "${stderr_path}"
|  | 
 | ||||||
|  | exit 0 | ||||||
|  | 
 | ||||||
							
								
								
									
										0
									
								
								pe_prolog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								pe_prolog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								prolog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								prolog.d/.gitkeep
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										71
									
								
								prolog.d/10-GPU_allocate.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										71
									
								
								prolog.d/10-GPU_allocate.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,71 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | 
 | ||||||
|  | ########################################################## | ||||||
|  | # Allocate requested GPU's: | ||||||
|  | #   step 1: get resource GPU | ||||||
|  | #   step 2: loop over installed GPU's | ||||||
|  | #   step 2a: try to set lock file | ||||||
|  | #   step 2b: set CUDA_VISIBLE_DEVICES | ||||||
|  | #   step 3: add CUDA_VISIBLE_DEVICES to job environment | ||||||
|  | ########################################################## | ||||||
|  | 
 | ||||||
|  | ### set variables | ||||||
# Common prefix of the per-GPU lock files; the GPU index is appended as "-<n>".
LOCK_FILE="/tmp/gpu-lockfile"

# debug: write all arguments, joined by single spaces, as one line to stdout.
debug() {
    echo "$@"
}
|  | 
 | ||||||
### function clean_up
# Release every GPU lock file held by this job, then terminate the script.
# Arguments:
#   $1 - exit code to terminate with (default: 0)
#          0: no error
#         99: reschedule job
#        100: put job in error state
#       else: put queue in error state
# Globals: JOB_ID (read), LOCK_FILE (read)
# Exits with 1 instead of $1 if a lock file cannot be removed.
function clean_up() {
    # ":-" supplies the default; ":=" cannot assign to a positional parameter.
    local error_code=${1:-0}
    local file
    if [ -n "${JOB_ID}" ] ; then
        for file in "${LOCK_FILE}"-* ; do
            # An unmatched glob stays literal; skip anything that is not a file.
            [ -f "${file}" ] || continue
            # Whole-line, fixed-string match: job 12 must not match job 123.
            if grep -q -s -x -F -- "${JOB_ID}" "${file}" ; then
                rm -f -- "${file}" || exit 1
            fi
        done
    fi
    exit "${error_code}"
}
|  | 
 | ||||||
### get requested number of GPU's
# Parse "gpu=<n>" out of the job's hard resource list as printed by qstat;
# a job without a GPU request yields an empty result, which counts as zero.
NGPUS=$(qstat -j "${JOB_ID}" | sed -n "s/hard resource_list:.*gpu=\([[:digit:]]\+\).*/\1/p")
NGPUS=${NGPUS:-0}

# get list of installed GPU's in random order
# (exit without error if nvidia-smi is not available, i.e. no GPU's installed)
[ -f /usr/bin/nvidia-smi ] || exit 0
GPU_LIST=$(/usr/bin/nvidia-smi -L | cut -f1 -d":" | cut -f2 -d" " | xargs shuf -e) || exit 0

## loop over devices and try to allocate one until enough GPU's are allocated
CUDA_VISIBLE_DEVICES=''
count=0
if [ "${NGPUS}" -gt 0 ] ; then
    # GPU_LIST is a whitespace separated list of indices; splitting is intended.
    for gpu in ${GPU_LIST} ; do
        # With noclobber the redirection fails if the lock file already
        # exists, so testing for and creating the lock is one atomic step and
        # two concurrently running prologs cannot claim the same GPU.
        # A GPU whose lock cannot be created is treated as unavailable.
        if ( set -o noclobber ; echo "${JOB_ID}" > "${LOCK_FILE}-${gpu}" ) 2> /dev/null ; then
            # build the comma separated device list directly
            CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:+${CUDA_VISIBLE_DEVICES},}${gpu}"
            count=$((count + 1))
        fi
        # exit loop when enough GPUS are allocated
        [ "${count}" -ge "${NGPUS}" ] && break
    done
fi

## add CUDA_VISIBLE_DEVICES to job's environment
if [ "${count}" -ge "${NGPUS}" ] ; then
    ENV_FILE="${SGE_JOB_SPOOL_DIR}/environment"
    # The variable is written even when no GPU was requested (empty value).
    if [ -f "${ENV_FILE}" ] && [ -w "${ENV_FILE}" ] ; then
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" >> "${ENV_FILE}" || clean_up 100
    else
        # environment file missing or not writable: put job in error state
        clean_up 100
    fi
else
    # not enough free GPU's: release what was taken and reschedule the job
    clean_up 99
fi

# clean exit
exit 0
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user