testsuite/expect/test1.14 - SchedMD/slurm - Git at Google

 #!/usr/bin/env expect
 ############################################################################
 # Purpose: Test of Slurm functionality
 #          Test exclusive resource allocation for a step (--exclusive option).
 ############################################################################
 # Copyright (C) SchedMD LLC.
 # Copyright (C) 2007 The Regents of the University of California.
 # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 # Written by Morris Jette <jette1@llnl.gov>
 # CODE-OCEC-09-009. All rights reserved.
 #
 # This file is part of Slurm, a resource management program.
 # For details, see <https://slurm.schedmd.com/>.
 # Please also read the included file: DISCLAIMER.
 #
 # Slurm is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 2 of the License, or (at your option)
 # any later version.
 #
 # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along
 # with Slurm; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 ############################################################################
 # Load the shared testsuite definitions. NOTE(review): presumably every
 # helper and variable used below that is not defined in this file
 # (test_dir, srun, subtest, ...) comes from there -- confirm.
 source ./globals

 # Batch script handed to the job, and the file the job's output goes to.
 set file_in         "$test_dir/test.input"
 set file_out        "$test_dir/test.output"
 # Id of the most recently submitted job; stays 0 until a testproc submits one.
 set job_id           0

 # NOTE(review): presumably check_config_select is true when SelectType
 # matches the given plugin name -- confirm against ./globals.
 if {[check_config_select "linear"]} {
     skip "Test requires SelectType != linear"
 }

 proc cleanup {} {
 	# Hand the currently tracked job id to cancel_job.
 	cancel_job $::job_id

 	# Drop the batch script and its output file ...
 	file delete $::file_in $::file_out

 	# ... then the marker files the steps use to signal each other.
 	foreach marker {step1 step2 step3 next} {
 		file delete $::test_dir/$marker
 	}
 }

 #
 # Delete left-over input script
 # Build input script file
 # Run one more step than allocated CPUs and make sure it waits
 #
 proc test_cpus {} {
 	# Submits a batch job sized for two 2-CPU tasks and launches three
 	# 2-CPU --exclusive steps inside it. The test expects step3 to stay
 	# pending until step2 is released via the "next" marker file.
 	#
 	# Marker files in $test_dir:
 	#   step1, step2, step3 - touched by the matching step once it runs
 	#   next                - touched by this proc to let step2 finish
 	global file_in file_out test_dir
 	global bin_touch srun bin_sleep job_id

 	cleanup
 	make_bash_script $file_in "
 		echo starting step1 and step 2
 		$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -c '
 			$bin_touch $test_dir/step1;
 			while \[ \! -f $test_dir/step3 \]
 			do $bin_sleep 0.25;
 			done
 		' &

 		$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -xvc '
 			$bin_touch $test_dir/step2;
 			while \[ \! -f $test_dir/next \]
 			do $bin_sleep 0.25;
 			done
 		' &
 		s2=\$\!

 		echo waiting to verify step1 and step2 are running
 		while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
 		do
 			$bin_sleep 0.25;
 		done

 		echo \'starting step3 (should not be started until step2 ends)\'
 		$srun --cpus-per-task=2 --mem=1 -v --exclusive -n1 bash -c '
 			$bin_touch $test_dir/step3
 		' &
 		wait
 	"
 	# The wait loop in the script above keeps polling while EITHER marker
 	# is still missing, so step3 is only launched once both step1 and
 	# step2 have started.

 	set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]

 	# NOTE(review): the negated conditions below presumably rely on
 	# wait_for_job / wait_for_file returning 0 on success -- confirm.
 	subtest {![wait_for_job -timeout 30 $job_id RUNNING]} "Job ($job_id) should start"
 	subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
 	subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"

 	# Not negated: here the wait is expected to give up without finding step3.
 	subtest { [wait_for_file -timeout 30 $test_dir/step3]} "File ($test_dir/step3) shouldn't exist yet"

 	# Release step2; step3 should then run, which in turn releases step1.
 	run_command -fail "$bin_touch $test_dir/next"
 	subtest {![wait_for_file $test_dir/step3]} "File ($test_dir/step3) should be created by step3"
 	subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
 }

 #
 # Delete left-over input script
 # Build another input script file
 # Run one more step than allocated CPUs with the --immediate option and make sure it aborts
 #
 proc test_cpus_immediate {} {
 	# Same layout as test_cpus, but every step is launched with
 	# --immediate=1. The test expects step3 to fail instead of waiting,
 	# so its marker file must never appear.
 	#
 	# Marker files in $test_dir:
 	#   step1, step2 - touched by the matching step once it runs
 	#   step3        - would be touched by step3; expected to stay absent
 	#   next         - touched by the batch script after step3 has returned,
 	#                  which lets step1 and step2 finish
 	global file_in file_out test_dir
 	global bin_sleep job_id srun bin_touch

 	cleanup
 	make_bash_script $file_in "
 		echo starting step1 and step2
 		$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -c '
 			$bin_touch $test_dir/step1;
 			while \[ \! -f $test_dir/next \]
 			do $bin_sleep 0.25;
 			done
 		' &

 		$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -xvc '
 			$bin_touch $test_dir/step2;
 			while \[ \! -f $test_dir/next \]
 			do $bin_sleep 0.25;
 			done
 		' &
 		s2=\$\!

 		echo waiting to verify step1 and step2 are running
 		while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
 		do
 			$bin_sleep 0.25;
 		done

 		echo submitting step3 with --immediate
 		$srun --cpus-per-task=2 --mem=1 -v --immediate=1 --exclusive -n1 bash -c '
 			$bin_touch $test_dir/step3
 		' &
 		s3=\$\!
 		echo waiting for step3 to fail
 		wait \$s3
 		echo step3: \$?

 		$bin_touch $test_dir/next

 		wait
 	"
 	# The wait loop in the script above keeps polling while EITHER marker
 	# is still missing, so step3 is only submitted once both step1 and
 	# step2 have started.

 	set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]

 	# NOTE(review): the negated conditions below presumably rely on
 	# wait_for_job / wait_for_file returning 0 on success -- confirm.
 	subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
 	subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
 	subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"
 	subtest {![wait_for_file $test_dir/next]} "File ($test_dir/next) should be created after step3"

 	# Not negated: here the wait is expected to give up without finding step3.
 	subtest {[wait_for_file -timeout 5 $test_dir/step3]} "File ($test_dir/step3) should NOT be created, because step3 should fail"
 }

 #
 # Verify that all GPUs and other GRES are allocated with the --exclusive flag
 #
 proc test_gpus {node_name} {
 	# Submits an --exclusive job requesting --gres=gpu on node_name and
 	# passes the job to check_exclusive_gres once it is running.
 	#
 	# node_name - node the job is pinned to with -w
 	#
 	# job_id must be the global one: cleanup cancels the job through the
 	# global variable, so a proc-local job_id would leave this job running.
 	global bin_sleep job_id

 	cleanup

 	# Get the total number of GPUs in the test node.
 	# gpu_tot is not referenced again in this proc; the lookup still raises
 	# an error if the dict returned by count_gres has no "gpu" entry.
 	set gres_node [get_node_param $node_name "Gres"]
 	set gpu_tot   [dict get [count_gres $gres_node] "gpu"]

 	#
 	# Verify that all GPUs and other GRES are allocated with the --exclusive flag
 	#
 	set job_id [submit_job -fail "-n1 -N1 -w $node_name --gres=gpu --exclusive -e none -o none --wrap '$bin_sleep 10'"]
 	subtest {![wait_for_job $job_id RUNNING]} "Job ($job_id) should start"

 	# Check all GRES of the node were allocated on the job
 	# NOTE(review): check_exclusive_gres is defined outside this file --
 	# confirm it records its own subtest results.
 	check_exclusive_gres $job_id $node_name
 }

 # Step-level --exclusive test on CPUs.
 testproc test_cpus

 # The --immediate variant is skipped when SchedulerParameters contains "defer".
 if {[param_contains [get_config_param "SchedulerParameters"] "defer"]} {
 	skip_following_testprocs "Skipping immediate test since SchedulerParameters=defer is set"
 }
 testproc test_cpus_immediate
 # NOTE(review): presumably this re-enables testprocs after the conditional
 # skip above, so the GPU test is decided on its own -- confirm in ./globals.
 run_following_testprocs

 # The GPU test only runs when exactly one entry comes back for a
 # --gres=gpu:2 request and gres/gpu is listed in AccountingStorageTRES.
 set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
 if { [llength $node_name] != 1 } {
 	skip_following_testprocs "This test need to be able to submit jobs with at least --gres=gpu:2"
 }
 if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} {
 	skip_following_testprocs "This test requires AccountingStorageTRES=gres/gpu"
 }

 testproc test_gpus $node_name
	#!/usr/bin/env expect
	############################################################################
	# Purpose: Test of Slurm functionality
	# Test exclusive resource allocation for a step (--exclusive option).
	############################################################################
	# Copyright (C) SchedMD LLC.
	# Copyright (C) 2007 The Regents of the University of California.
	# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
	# Written by Morris Jette <jette1@llnl.gov>
	# CODE-OCEC-09-009. All rights reserved.
	#
	# This file is part of Slurm, a resource management program.
	# For details, see <https://slurm.schedmd.com/>.
	# Please also read the included file: DISCLAIMER.
	#
	# Slurm is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 2 of the License, or (at your option)
	# any later version.
	#
	# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
	# details.
	#
	# You should have received a copy of the GNU General Public License along
	# with Slurm; if not, write to the Free Software Foundation, Inc.,
	# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	############################################################################
	# NOTE(review): from here on the file repeats the same script a second
	# time (same preamble, procs and driver as earlier in the file). This
	# looks like an extraction artifact -- confirm and keep only one copy.
	#
	# Load the shared testsuite definitions. NOTE(review): presumably every
	# helper and variable used below that is not defined in this file
	# (test_dir, srun, subtest, ...) comes from there -- confirm.
	source ./globals

	# Batch script handed to the job, and the file the job's output goes to.
	set file_in "$test_dir/test.input"
	set file_out "$test_dir/test.output"
	# Id of the most recently submitted job; stays 0 until a testproc submits one.
	set job_id 0

	# NOTE(review): presumably check_config_select is true when SelectType
	# matches the given plugin name -- confirm against ./globals.
	if {[check_config_select "linear"]} {
	skip "Test requires SelectType != linear"
	}

	proc cleanup {} {
		# Hand the currently tracked job id to cancel_job.
		cancel_job $::job_id

		# Drop the batch script and its output file ...
		file delete $::file_in $::file_out

		# ... then the marker files the steps use to signal each other.
		foreach marker {step1 step2 step3 next} {
			file delete $::test_dir/$marker
		}
	}

	#
	# Delete left-over input script
	# Build input script file
	# Run one more step than allocated CPUs and make sure it waits
	#
	proc test_cpus {} {
		# Submits a batch job sized for two 2-CPU tasks and launches three
		# 2-CPU --exclusive steps inside it. The test expects step3 to stay
		# pending until step2 is released via the "next" marker file.
		#
		# Marker files in $test_dir:
		#   step1, step2, step3 - touched by the matching step once it runs
		#   next                - touched by this proc to let step2 finish
		global file_in file_out test_dir
		global bin_touch srun bin_sleep job_id

		cleanup
		make_bash_script $file_in "
			echo starting step1 and step 2
			$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -c '
				$bin_touch $test_dir/step1;
				while \[ \! -f $test_dir/step3 \]
				do $bin_sleep 0.25;
				done
			' &

			$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -xvc '
				$bin_touch $test_dir/step2;
				while \[ \! -f $test_dir/next \]
				do $bin_sleep 0.25;
				done
			' &
			s2=\$\!

			echo waiting to verify step1 and step2 are running
			while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
			do
				$bin_sleep 0.25;
			done

			echo \'starting step3 (should not be started until step2 ends)\'
			$srun --cpus-per-task=2 --mem=1 -v --exclusive -n1 bash -c '
				$bin_touch $test_dir/step3
			' &
			wait
		"
		# The wait loop in the script above keeps polling while EITHER marker
		# is still missing, so step3 is only launched once both step1 and
		# step2 have started.

		set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]

		# NOTE(review): the negated conditions below presumably rely on
		# wait_for_job / wait_for_file returning 0 on success -- confirm.
		subtest {![wait_for_job -timeout 30 $job_id RUNNING]} "Job ($job_id) should start"
		subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
		subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"

		# Not negated: here the wait is expected to give up without finding step3.
		subtest { [wait_for_file -timeout 30 $test_dir/step3]} "File ($test_dir/step3) shouldn't exist yet"

		# Release step2; step3 should then run, which in turn releases step1.
		run_command -fail "$bin_touch $test_dir/next"
		subtest {![wait_for_file $test_dir/step3]} "File ($test_dir/step3) should be created by step3"
		subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
	}

	#
	# Delete left-over input script
	# Build another input script file
	# Run one more step than allocated CPUs with the --immediate option and make sure it aborts
	#
	proc test_cpus_immediate {} {
		# Same layout as test_cpus, but every step is launched with
		# --immediate=1. The test expects step3 to fail instead of waiting,
		# so its marker file must never appear.
		#
		# Marker files in $test_dir:
		#   step1, step2 - touched by the matching step once it runs
		#   step3        - would be touched by step3; expected to stay absent
		#   next         - touched by the batch script after step3 has returned,
		#                  which lets step1 and step2 finish
		global file_in file_out test_dir
		global bin_sleep job_id srun bin_touch

		cleanup
		make_bash_script $file_in "
			echo starting step1 and step2
			$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -c '
				$bin_touch $test_dir/step1;
				while \[ \! -f $test_dir/next \]
				do $bin_sleep 0.25;
				done
			' &

			$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -xvc '
				$bin_touch $test_dir/step2;
				while \[ \! -f $test_dir/next \]
				do $bin_sleep 0.25;
				done
			' &
			s2=\$\!

			echo waiting to verify step1 and step2 are running
			while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
			do
				$bin_sleep 0.25;
			done

			echo submitting step3 with --immediate
			$srun --cpus-per-task=2 --mem=1 -v --immediate=1 --exclusive -n1 bash -c '
				$bin_touch $test_dir/step3
			' &
			s3=\$\!
			echo waiting for step3 to fail
			wait \$s3
			echo step3: \$?

			$bin_touch $test_dir/next

			wait
		"
		# The wait loop in the script above keeps polling while EITHER marker
		# is still missing, so step3 is only submitted once both step1 and
		# step2 have started.

		set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]

		# NOTE(review): the negated conditions below presumably rely on
		# wait_for_job / wait_for_file returning 0 on success -- confirm.
		subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
		subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
		subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"
		subtest {![wait_for_file $test_dir/next]} "File ($test_dir/next) should be created after step3"

		# Not negated: here the wait is expected to give up without finding step3.
		subtest {[wait_for_file -timeout 5 $test_dir/step3]} "File ($test_dir/step3) should NOT be created, because step3 should fail"
	}

	#
	# Verify that all GPUs and other GRES are allocated with the --exclusive flag
	#
	proc test_gpus {node_name} {
		# Submits an --exclusive job requesting --gres=gpu on node_name and
		# passes the job to check_exclusive_gres once it is running.
		#
		# node_name - node the job is pinned to with -w
		#
		# job_id must be the global one: cleanup cancels the job through the
		# global variable, so a proc-local job_id would leave this job running.
		global bin_sleep job_id

		cleanup

		# Get the total number of GPUs in the test node.
		# gpu_tot is not referenced again in this proc; the lookup still raises
		# an error if the dict returned by count_gres has no "gpu" entry.
		set gres_node [get_node_param $node_name "Gres"]
		set gpu_tot [dict get [count_gres $gres_node] "gpu"]

		#
		# Verify that all GPUs and other GRES are allocated with the --exclusive flag
		#
		set job_id [submit_job -fail "-n1 -N1 -w $node_name --gres=gpu --exclusive -e none -o none --wrap '$bin_sleep 10'"]
		subtest {![wait_for_job $job_id RUNNING]} "Job ($job_id) should start"

		# Check all GRES of the node were allocated on the job
		# NOTE(review): check_exclusive_gres is defined outside this file --
		# confirm it records its own subtest results.
		check_exclusive_gres $job_id $node_name
	}

	# Step-level --exclusive test on CPUs.
	testproc test_cpus

	# The --immediate variant is skipped when SchedulerParameters contains "defer".
	if {[param_contains [get_config_param "SchedulerParameters"] "defer"]} {
	skip_following_testprocs "Skipping immediate test since SchedulerParameters=defer is set"
	}
	testproc test_cpus_immediate
	# NOTE(review): presumably this re-enables testprocs after the conditional
	# skip above, so the GPU test is decided on its own -- confirm in ./globals.
	run_following_testprocs

	# The GPU test only runs when exactly one entry comes back for a
	# --gres=gpu:2 request and gres/gpu is listed in AccountingStorageTRES.
	set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
	if { [llength $node_name] != 1 } {
	skip_following_testprocs "This test need to be able to submit jobs with at least --gres=gpu:2"
	}
	if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} {
	skip_following_testprocs "This test requires AccountingStorageTRES=gres/gpu"
	}

	testproc test_gpus $node_name