|  | #!/usr/bin/env expect | 
|  | ############################################################################ | 
|  | # Purpose: Test of Slurm functionality | 
|  | #          Test exclusive resource allocation for a step (--exclusive option). | 
|  | ############################################################################ | 
|  | # Copyright (C) SchedMD LLC. | 
|  | # Copyright (C) 2007 The Regents of the University of California. | 
|  | # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | 
|  | # Written by Morris Jette <jette1@llnl.gov> | 
|  | # CODE-OCEC-09-009. All rights reserved. | 
|  | # | 
|  | # This file is part of Slurm, a resource management program. | 
|  | # For details, see <https://slurm.schedmd.com/>. | 
|  | # Please also read the included file: DISCLAIMER. | 
|  | # | 
|  | # Slurm is free software; you can redistribute it and/or modify it under | 
|  | # the terms of the GNU General Public License as published by the Free | 
|  | # Software Foundation; either version 2 of the License, or (at your option) | 
|  | # any later version. | 
|  | # | 
|  | # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | # details. | 
|  | # | 
|  | # You should have received a copy of the GNU General Public License along | 
|  | # with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | ############################################################################ | 
|  | # Load the shared test-suite definitions (helpers and variables used below). | 
|  | source ./globals | 
|  |  | 
|  | # Batch script and job output paths used by the CPU tests, both under $test_dir. | 
|  | set file_in         "$test_dir/test.input" | 
|  | set file_out        "$test_dir/test.output" | 
|  | # Id of the most recently submitted job; 0 until a testproc submits one. | 
|  | # The cleanup proc passes this value to cancel_job. | 
|  | set job_id           0 | 
|  |  | 
|  | # The test is skipped when check_config_select reports the "linear" plugin. | 
|  | if {[check_config_select "linear"]} { | 
|  | skip "Test requires SelectType != linear" | 
|  | } | 
|  |  | 
|  | proc cleanup {} { | 
|  | # Cancel the job recorded in the global job_id and delete every file | 
|  | # this test may have created: the batch script, its output file and | 
|  | # the step1/step2/step3/next marker files under $test_dir. | 
|  | global job_id file_in file_out test_dir | 
|  |  | 
|  | cancel_job $job_id | 
|  |  | 
|  | # Collect the paths in the same order the files were originally removed | 
|  | set leftovers [list $file_in $file_out] | 
|  | foreach marker {step1 step2 step3 next} { | 
|  | lappend leftovers $test_dir/$marker | 
|  | } | 
|  | foreach path $leftovers { | 
|  | file delete $path | 
|  | } | 
|  | } | 
|  |  | 
|  | # | 
|  | # Delete left-over input script | 
|  | # Build input script file | 
|  | # Run one more step than allocated CPUs and make sure it waits | 
|  | # | 
|  | proc test_cpus {} { | 
|  | # Submit a batch job sized for two 2-CPU steps (-n2 --cpus-per-task=2) and | 
|  | # launch three exclusive 2-CPU steps inside it: | 
|  | #   step1 touches "step1" and then spins until "step3" exists, | 
|  | #   step2 touches "step2" and then spins until "next" exists, | 
|  | #   step3 touches "step3". | 
|  | # step3 is expected to stay pending until this proc touches "next", | 
|  | # which lets step2 finish. | 
|  | # | 
|  | # The batch script must not launch step3 before BOTH step1 and step2 have | 
|  | # created their marker files, so its wait loop uses -o (keep waiting while | 
|  | # either file is missing). With -a the loop ended as soon as one marker | 
|  | # appeared and step3 could race step2 for the CPUs. | 
|  | global file_in file_out test_dir | 
|  | global bin_touch srun bin_sleep job_id | 
|  |  | 
|  | cleanup | 
|  | make_bash_script $file_in " | 
|  | echo starting step1 and step 2 | 
|  | $srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -c ' | 
|  | $bin_touch $test_dir/step1; | 
|  | while \[ \! -f $test_dir/step3 \] | 
|  | do $bin_sleep 0.25; | 
|  | done | 
|  | ' & | 
|  |  | 
|  | $srun --cpus-per-task=2 --mem=1 --exclusive -n1 bash -xvc ' | 
|  | $bin_touch $test_dir/step2; | 
|  | while \[ \! -f $test_dir/next \] | 
|  | do $bin_sleep 0.25; | 
|  | done | 
|  | ' & | 
|  | s2=\$\! | 
|  |  | 
|  | echo waiting to verify step1 and step2 are running | 
|  | while \[ \! -f $test_dir/step1 -o \! -f $test_dir/step2 \] | 
|  | do | 
|  | $bin_sleep 0.25; | 
|  | done | 
|  |  | 
|  | echo \'starting step3 (should not be started until step2 ends)\' | 
|  | $srun --cpus-per-task=2 --mem=1 -v --exclusive -n1 bash -c ' | 
|  | $bin_touch $test_dir/step3 | 
|  | ' & | 
|  | wait | 
|  | " | 
|  | set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"] | 
|  | subtest {![wait_for_job -timeout 30 $job_id RUNNING]} "Job ($job_id) should start" | 
|  | subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1" | 
|  | subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2" | 
|  | # Here a non-zero result from wait_for_file is the expected outcome: | 
|  | # step3 must not have produced its file within the 30 second timeout. | 
|  | subtest { [wait_for_file -timeout 30 $test_dir/step3]} "File ($test_dir/step3) shouldn't exist yet" | 
|  | # Release step2 so that step3 can obtain its CPUs | 
|  | run_command -fail "$bin_touch $test_dir/next" | 
|  | subtest {![wait_for_file $test_dir/step3]} "File ($test_dir/step3) should be created by step3" | 
|  | subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish" | 
|  | } | 
|  |  | 
|  | # | 
|  | # Delete left-over input script | 
|  | # Build another input script file | 
|  | # Run one more step than allocated CPUs with immediate option and make sure it aborts | 
|  | # | 
|  | proc test_cpus_immediate {} { | 
|  | # Same layout as test_cpus, but every step is launched with --immediate=1. | 
|  | # step1 and step2 each hold 2 CPUs until the "next" marker exists; step3 | 
|  | # is submitted while they run and is expected to fail rather than wait, | 
|  | # so the "step3" marker must never appear. The batch script itself | 
|  | # touches "next" once step3 has returned. | 
|  | # | 
|  | # The script's wait loop uses -o so that step3 is only submitted after | 
|  | # BOTH step1 and step2 have created their markers; with -a the loop ended | 
|  | # when either marker existed and step3 could take the CPUs from step2. | 
|  | global file_in file_out test_dir | 
|  | global bin_sleep bin_touch | 
|  | global job_id srun | 
|  |  | 
|  | cleanup | 
|  | make_bash_script $file_in " | 
|  | echo starting step1 and step2 | 
|  | $srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -c ' | 
|  | $bin_touch $test_dir/step1; | 
|  | while \[ \! -f $test_dir/next \] | 
|  | do $bin_sleep 0.25; | 
|  | done | 
|  | ' & | 
|  |  | 
|  | $srun --cpus-per-task=2 --mem=1 --exclusive --immediate=1 -n1 bash -xvc ' | 
|  | $bin_touch $test_dir/step2; | 
|  | while \[ \! -f $test_dir/next \] | 
|  | do $bin_sleep 0.25; | 
|  | done | 
|  | ' & | 
|  | s2=\$\! | 
|  |  | 
|  | echo waiting to verify step1 and step2 are running | 
|  | while \[ \! -f $test_dir/step1 -o \! -f $test_dir/step2 \] | 
|  | do | 
|  | $bin_sleep 0.25; | 
|  | done | 
|  |  | 
|  | echo submitting step3 with --immediate | 
|  | $srun --cpus-per-task=2 --mem=1 -v --immediate=1 --exclusive -n1 bash -c ' | 
|  | $bin_touch $test_dir/step3 | 
|  | ' & | 
|  | s3=\$\! | 
|  | echo waiting for step3 to fail | 
|  | wait \$s3 | 
|  | echo step3: \$? | 
|  |  | 
|  | $bin_touch $test_dir/next | 
|  |  | 
|  | wait | 
|  | " | 
|  |  | 
|  | set job_id [submit_job -fail "--cpus-per-task=2 -N1 -n2 -t2 --gres=craynetwork:0 --mem=10 --output=$file_out $file_in"] | 
|  | subtest {![wait_for_job -timeout 30 $job_id DONE]} "Job ($job_id) should finish" | 
|  | subtest {![wait_for_file $test_dir/step1]} "File ($test_dir/step1) should be created by step1" | 
|  | subtest {![wait_for_file $test_dir/step2]} "File ($test_dir/step2) should be created by step2" | 
|  | subtest {![wait_for_file $test_dir/next]} "File ($test_dir/next) should be created after step3" | 
|  | # A non-zero result (file never shows up within 5 seconds) is the pass condition here | 
|  | subtest {[wait_for_file -timeout 5 $test_dir/step3]} "File ($test_dir/step3) should NOT be created, because step3 should fail" | 
|  | } | 
|  |  | 
|  | # | 
|  | # Verify that all GPUs and other GRES are allocated with the --exclusive flag | 
|  | # | 
|  | proc test_gpus {node_name} { | 
|  | # Submit an --exclusive job requesting --gres=gpu on node_name and hand the | 
|  | # running job to check_exclusive_gres for verification. | 
|  | # | 
|  | # node_name: name of the node the job is pinned to (-w). | 
|  | # | 
|  | # job_id is declared global so that the shared cleanup proc cancels the job | 
|  | # submitted here; as a plain local it was invisible to cleanup and the | 
|  | # job was left to run out its sleep. | 
|  | global bin_sleep job_id | 
|  |  | 
|  | cleanup | 
|  |  | 
|  | # Get the total number of GPUs in the test node | 
|  | # NOTE(review): gpu_tot is not used below; the lookup still raises an error | 
|  | # when the counted GRES have no "gpu" entry -- confirm whether that | 
|  | # fail-fast behaviour is intended before removing it. | 
|  | set gres_node [get_node_param $node_name "Gres"] | 
|  | set gpu_tot   [dict get [count_gres $gres_node] "gpu"] | 
|  |  | 
|  | # | 
|  | # Verify that all GPUs and other GRES are allocated with the --exclusive flag | 
|  | # | 
|  | set job_id [submit_job -fail "-n1 -N1 -w $node_name --gres=gpu --exclusive -e none -o none --wrap '$bin_sleep 10'"] | 
|  | subtest {![wait_for_job $job_id RUNNING]} "Job ($job_id) should start" | 
|  |  | 
|  | # Check all GRES of the node were allocated on the job | 
|  | check_exclusive_gres $job_id $node_name | 
|  | } | 
|  |  | 
|  | # Basic case: a third exclusive step has to wait for CPUs. | 
|  | testproc test_cpus | 
|  |  | 
|  | # The --immediate variant is skipped when SchedulerParameters contains "defer". | 
|  | if {[param_contains [get_config_param "SchedulerParameters"] "defer"]} { | 
|  | skip_following_testprocs "Skipping immediate test since SchedulerParameters=defer is set" | 
|  | } | 
|  | testproc test_cpus_immediate | 
|  | # Presumably re-enables testprocs after the conditional skip above, so the | 
|  | # GPU test below is not affected by it -- TODO confirm against the harness. | 
|  | run_following_testprocs | 
|  |  | 
|  | # The GPU test needs exactly one node returned for a --gres=gpu:2 request | 
|  | # and gres/gpu listed in AccountingStorageTRES; otherwise it is skipped. | 
|  | set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"] | 
|  | if { [llength $node_name] != 1 } { | 
|  | skip_following_testprocs "This test need to be able to submit jobs with at least --gres=gpu:2" | 
|  | } | 
|  | if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} { | 
|  | skip_following_testprocs "This test requires AccountingStorageTRES=gres/gpu" | 
|  | } | 
|  |  | 
|  | testproc test_gpus $node_name |