#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test that the core spec option in sbatch allocates the correct
#          number of cores and that tasks spread over multiple nodes
#          when there are not enough resources on one node.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
# Load ./globals; names used below that this file never defines (test_dir,
# sbatch, scontrol, number, ...) presumably come from there.
source ./globals

# Scratch files for this test, all located under $test_dir.
set file_in  "${test_dir}/input"
set file_out "${test_dir}/output"
set spec_in  "${test_dir}/spec_core_script.in"

# Id of the most recently submitted job; 0 means nothing has been submitted.
set job_id 0

# Cancels the job whose id is stored in the global job_id variable.
# Not called anywhere in this file; presumably invoked by the test
# framework loaded from ./globals when the test ends.
proc cleanup {} {
	cancel_job $::job_id
}


#############################################################################
#
# Submits a batch job pinned to a node with a core specialization count,
# then checks that the job reports the requested number of specialized
# cores and that the number of nodes the job uses is correct.
#
# task:      offset added to the number of CPUs left once the specialized
#            cores are removed; the absolute value of the sum (at least 1)
#            is the task count requested with -n
# node:      node name passed to sbatch -w
# core_spec: value passed to sbatch -S and expected in CoreSpec=
# exp_nodes =  0: job must only use the specified node
# exp_nodes =  1: job must use more than the specified node
# exp_nodes = -1: job must fail because the job exceeds the number of cores
#
# The job id is kept in the global job_id, which is the variable cleanup
# hands to cancel_job, so jobs submitted here are covered by cleanup too.
#
#############################################################################
proc core_spec_job {task node core_spec exp_nodes} {
	global sbatch scontrol spec_in file_out number thread_cnt
	global cpu_tot job_id

	set job_id 0
	set num_nodes 0

	# Determine the number of tasks that can be run once the CPUs that
	# belong to the specialized cores are taken away
	set cpu_used_by_spec [expr {$thread_cnt * $core_spec}]
	if {$cpu_tot > $cpu_used_by_spec} {
		set task_limit [expr {$cpu_tot - $cpu_used_by_spec}]
	} else {
		set task_limit 1
	}

	# A task count of zero is never requested; ask for one task instead
	set ntasks [expr {abs($task_limit + $task)}]
	if {$ntasks == 0} {
		set ntasks 1
	}

	set error_chk 0
	spawn $sbatch -t1 -w$node -S$core_spec -n$ntasks -o$file_out $spec_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re "error" {
			if {$exp_nodes != -1} {
				fail "sbatch should not have produced an error"
			}
			set error_chk 1
			exp_continue
		}
		timeout {
			fail "sbatch is not responding"
		}
		eof {
			wait
		}
	}

	if {$job_id == 0 && $error_chk == 0} {
		fail "Job was not submitted"

	} elseif {$exp_nodes == -1 && $job_id != 0} {
		fail "This job should have failed but did not"

	} elseif {$exp_nodes == -1 && $error_chk != 0} {
		log_debug "This error is expected do not worry"

	} else {
		wait_for_job -fail $job_id "RUNNING"
		set core_chk 0
		spawn $scontrol show job $job_id
		expect {
			-re "NumNodes=($number)" {
				set num_nodes $expect_out(1,string)
				exp_continue
			}
			-re "CoreSpec=($number)" {
				# Compare the whole number so that a request
				# for 2 does not pass on CoreSpec=20
				if {$expect_out(1,string) == $core_spec} {
					set core_chk 1
				}
				exp_continue
			}
			timeout {
				fail "scontrol is not responding"
			}
			eof {
				wait
			}
		}

		subtest {$core_chk != 0} "Check that the node uses the correct number of specialized cores"

		wait_for_job -fail $job_id "DONE"
	}

	if {$exp_nodes == 1} {
		subtest {$num_nodes > 1} "Check that the job uses the correct number of nodes" "Job $job_id should use more then 1 node"
	}

	if {$exp_nodes == 0} {
		subtest {$num_nodes == 1} "Check that the job uses the correct number of nodes" "Job $job_id should use only $node"
	}
}

#############################################################################
#
# Tests begin here
#
#############################################################################

# Client-side option validation. These checks run before any configuration
# gating, so they are exercised even when the rest of the test is skipped.
# NOTE(review): the first pattern spells "--thred-spec"; presumably this
# mirrors the message sbatch really prints - confirm before changing it.
set cli_output [run_command_output -xfail -subtest "$sbatch --thread-spec=1 -S1 --wrap='/bin/true'"]
subtest {[regexp "sbatch: fatal: -S/--core-spec and --thred-spec options are mutually exclusive" $cli_output]} "sbatch should fail for both --thread-spec and -S given"

set env_output [run_command_output -xfail -subtest "SBATCH_CORE_SPEC=1 SBATCH_THREAD_SPEC=1 $sbatch --wrap='/bin/true'"]
subtest {[regexp "Both --core-spec and --thread-spec set using environment variables. Those options are mutually exclusive" $env_output]} "sbatch should fail when both SBATCH_CORE_SPEC SBATCH_THREAD_SPEC set in environment and no cli given"

# Configuration gating: skip on setups this test cannot run against
if {[check_config_select "linear"]} {
	skip "This test is incompatible with select/linear"
}

set select_type_params [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_params "CR_SOCKET"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}

if {![get_config_param "AllowSpecResourcesUsage"]} {
	skip "AllowSpecResourcesUsage not configured to permit core specialization"
}

# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in

# Probe script: prints the scontrol node record of the first node of the
# allocation. Inside a Tcl double-quoted string "\ " is a plain space and a
# backslash-newline collapses into a single space, so the stray backslashes
# below only add whitespace to the generated script.
make_bash_script $file_in "
first=\$($scontrol show hostnames \$SLURM_JOB_NODELIST\ | head -n1)\

$scontrol show node \$first\

"
# Script submitted by core_spec_job for every core spec case
make_bash_script $spec_in "sleep 5"

# Submit the probe job on two exclusive nodes and remember its id
spawn $sbatch --exclusive -t1 -N2 -o$file_out $file_in
expect {
	-re "Batch job submission failed" {
		skip "Can't test srun task distribution"
	}
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "sbatch did not submit job"
}

# Let the probe job finish before its output is read, so that a partially
# written node record is never parsed; then wait for the file itself.
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out

# Defaults used when a field is missing from the probe job output
set first_node ""
set core_cnt 0
set cpu_tot 1
set socket_cnt 1
set thread_cnt 1

# Extract the node name and its CPU layout from the probe job output
spawn $bin_cat $file_out
expect {
	-re "NodeName=($re_word_str)" {
		set first_node $expect_out(1,string)
		exp_continue
	}
	-re "CoresPerSocket=($number)" {
		set core_cnt $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set cpu_tot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set socket_cnt $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set thread_cnt $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "cat is not responding"
	}
	eof {
		wait
	}
}

# Without a node name every later submission would pass an empty -w value
if {$first_node eq ""} {
	fail "Unable to find the node name in the job output"
}

# Total cores on the node = cores per socket * sockets
set core_cnt [expr {$core_cnt * $socket_cnt}]
if {$core_cnt == 0} {
	fail "sbatch did not find the number of cores"
}
if {$core_cnt < 4} {
	skip "Core count too low for testing ($core_cnt < 4)"
}

# The environment variable should get overwritten.
set env(SBATCH_THREAD_SPEC) "99"

#
# Using the core spec within the node limits: all but two cores are
# specialized, with a task offset of 0 and then -2
#
log_info "Run within the specified node"
foreach task_offset {0 -2} {
	core_spec_job $task_offset $first_node [expr {$core_cnt - 2}] 0
}

#
# Using core spec with more tasks than the node can handle. This should
# cause the tasks to spread across multiple nodes as needed
#
log_info "Spread job across multiple nodes"
foreach spec_delta {-2 -1} {
	core_spec_job 1 $first_node [expr {$core_cnt + $spec_delta}] 1
}

#
# Using core spec with more cores than the specified node has
#
log_info "Fail by trying to use more cores than exist"
foreach spec_delta {5 7} {
	core_spec_job 1 $first_node [expr {$core_cnt + $spec_delta}] -1
}