| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test exclusive resource allocation for a step (--exclusive option). |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # Copyright (C) 2007 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
# Batch script handed to sbatch and the file capturing its output; both live
# in the per-test scratch directory.
set file_in  "${test_dir}/test.input"
set file_out "${test_dir}/test.output"

# Id of the most recently submitted job; 0 until a job has been submitted.
set job_id   0

# Step-level exclusivity cannot be exercised with the linear select plugin.
if {[check_config_select linear]} {
	skip "Test requires SelectType != linear"
}
| |
#
# cleanup - cancel the current job and remove every scratch file the test
# procs create: the batch script, its output, and the step1/step2/step3/next
# marker files used to synchronize with the running steps.
#
proc cleanup {} {
	global job_id file_in file_out test_dir

	cancel_job $job_id

	# Gather all paths first, then delete them in one call.
	set leftovers [list $file_in $file_out]
	foreach marker {step1 step2 step3 next} {
		lappend leftovers "$test_dir/$marker"
	}
	file delete {*}$leftovers
}
| |
#
# test_cpus - run one more exclusive step than the allocation has CPUs for
# and make sure the extra step waits.
#
# The batch script starts step1 and step2 in the background, each asking for
# 2 CPUs with --exclusive. step1 keeps running until the "step3" marker file
# appears; step2 keeps running until the "next" marker file appears. Once
# BOTH steps have created their marker files the script launches step3, which
# simply creates the "step3" marker file.
#
# The test then checks that step3's marker does not show up while step2 is
# still running, creates "next" to let step2 finish, and checks that step3
# (and finally the whole job) completes.
#
proc test_cpus {} {
	global file_in file_out test_dir
	global bin_touch srun bin_sleep job_id

	# Remove leftovers (job and marker files) from any previous run
	cleanup

	# NOTE: the wait loop must spin while EITHER marker file is missing.
	# Using "! -f step1 -a ! -f step2" would leave the loop as soon as one
	# of the two steps had started, letting step3 race with the other step
	# for the free CPUs.
	make_bash_script $file_in "
echo starting step1 and step 2
$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -c '
$bin_touch $test_dir/step1;
while \[ \! -f $test_dir/step3 \]
do $bin_sleep 0.25;
done
' &

$srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -xvc '
$bin_touch $test_dir/step2;
while \[ \! -f $test_dir/next \]
do $bin_sleep 0.25;
done
' &

echo waiting to verify step1 and step2 are running
while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
do
$bin_sleep 0.25;
done

echo \'starting step3 (should not be started until step2 ends)\'
$srun --cpus-per-task=2 --mem=1 -v --exclusive -n1 bash -c '
$bin_touch $test_dir/step3
' &
wait
"

	set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]
	subtest {![wait_for_job -timeout 30 $job_id RUNNING]} "Job ($job_id) should start"
	subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
	subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"

	# Here wait_for_file is expected to time out: step3 must still be
	# pending because step1 and step2 hold the CPUs.
	subtest { [wait_for_file -timeout 30 $test_dir/step3]} "File ($test_dir/step3) shouldn't exist yet"

	# Release step2 so that step3 can get its CPUs
	run_command -fail "$bin_touch $test_dir/next"
	subtest {![wait_for_file $test_dir/step3]} "File ($test_dir/step3) should be created by step3"
	subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
}
| |
#
# test_cpus_immediate - run one more exclusive step than the allocation has
# CPUs for, using --immediate, and make sure the extra step aborts instead
# of waiting.
#
# The batch script starts step1 and step2 in the background; both keep
# running until the "next" marker file appears. Once BOTH steps have created
# their marker files the script launches step3 with --immediate=1, waits for
# that srun to exit, reports its exit code, and only then creates "next" so
# that step1 and step2 can finish.
#
# The test checks that the job finishes, that step1, step2 and "next" marker
# files exist, and that the "step3" marker file was never created.
#
proc test_cpus_immediate {} {
	global file_in file_out test_dir
	global bin_sleep bin_touch srun job_id

	# Remove leftovers (job and marker files) from any previous run
	cleanup

	# NOTE: the wait loop must spin while EITHER marker file is missing.
	# Using "! -f step1 -a ! -f step2" would leave the loop as soon as one
	# of the two steps had started, so step3 could be submitted while CPUs
	# were still free and would not fail as expected.
	make_bash_script $file_in "
echo starting step1 and step2
$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -c '
$bin_touch $test_dir/step1;
while \[ \! -f $test_dir/next \]
do $bin_sleep 0.25;
done
' &

$srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -xvc '
$bin_touch $test_dir/step2;
while \[ \! -f $test_dir/next \]
do $bin_sleep 0.25;
done
' &

echo waiting to verify step1 and step2 are running
while \[ \! -f $test_dir/step1 \] || \[ \! -f $test_dir/step2 \]
do
$bin_sleep 0.25;
done

echo submitting step3 with --immediate
$srun --cpus-per-task=2 --mem=1 -v --immediate=1 --exclusive -n1 bash -c '
$bin_touch $test_dir/step3
' &
s3=\$\!
echo waiting for step3 to fail
wait \$s3
echo step3: \$?

$bin_touch $test_dir/next

wait
"

	set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"]
	subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish"
	subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1"
	subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2"
	subtest {![wait_for_file $test_dir/next]} "File ($test_dir/next) should be created after step3"

	# Here wait_for_file is expected to time out: step3 must have been
	# rejected, so its marker file must never appear.
	subtest {[wait_for_file -timeout 5 $test_dir/step3]} "File ($test_dir/step3) should NOT be created, because step3 should fail"
}
| |
#
# test_gpus - verify that all GPUs and other GRES of a node are allocated to
# a job submitted with the --exclusive flag.
#
# node_name: node on which to run the job (passed to sbatch via -w)
#
proc test_gpus {node_name} {
	# job_id must be the global one: cleanup reads the global job_id, so
	# a proc-local variable would leave this job running after the test.
	global bin_sleep job_id

	# Remove leftovers (job and marker files) from the previous testproc
	cleanup

	# Request a single GPU but the whole node (--exclusive)
	set job_id [submit_job -fail "-n1 -N1 -w $node_name --gres=gpu --exclusive -e none -o none --wrap '$bin_sleep 10'"]
	subtest {![wait_for_job $job_id RUNNING]} "Job ($job_id) should start"

	# Check all GRES of the node were allocated on the job
	check_exclusive_gres $job_id $node_name
}
| |
# The basic CPU exclusivity test is always run.
testproc test_cpus

# The --immediate variant is skipped when SchedulerParameters contains "defer".
if {[param_contains [get_config_param "SchedulerParameters"] "defer"]} {
	skip_following_testprocs "Skipping immediate test since SchedulerParameters=defer is set"
}
testproc test_cpus_immediate

# Presumably re-enables testprocs after a possible skip above, so the GPU
# test below is evaluated on its own requirements.
run_following_testprocs

# The GPU test needs a node that can satisfy --gres=gpu:2 and GPU accounting
# (AccountingStorageTRES containing gres/gpu).
set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
if {[llength $node_name] != 1} {
	skip_following_testprocs "This test need to be able to submit jobs with at least --gres=gpu:2"
}
if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} {
	skip_following_testprocs "This test requires AccountingStorageTRES=gres/gpu"
}

testproc test_gpus $node_name