| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test of CPU affinity/binding support. |
| ############################################################################ |
| # Copyright (C) 2005 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| set file_prog "$test_name.prog" |
| set job_id 0 |
| |
| # |
| # Preflight checks: skip (rather than fail) when the system configuration |
| # cannot support the CPU bindings exercised by this test. |
| # |
| |
| # The list of affinity types must contain "affinity". |
| if {![param_contains [get_affinity_types] "affinity"]} { |
|     skip "CPU affinity not supported on this system" |
| } |
| log_info "Task affinity plugin installed" |
| |
| # The default partition must not be configured with OverSubscribe=FORCE. |
| set show_partition_output [run_command_output -fail "$scontrol show partition [default_partition]"] |
| if {[regexp {OverSubscribe=FORCE} $show_partition_output]} { |
|     skip "This test is not compatible with OverSubscribe=FORCE" |
| } |
| |
| # Exactly one node must match the request used for the allocation later on. |
| set node_name [get_nodes_by_request "-N1 --exclusive -t5"] |
| if {[llength $node_name] != 1} { |
|     skip "This test need to be able to submit jobs with at least -N1 --exclusive -t5" |
| } |
| |
| # Reject nodes with specialized cores/CPUs. "MISSING" is a string sentinel |
| # (presumably returned when the parameter is absent - verify in globals), |
| # so it is compared with the string operator "ne". |
| if {[get_node_param $node_name "CoreSpecCount"] ne "MISSING"} { |
|     skip "This test is incompatible with nodes that have a CoreSpecCount (e.g. $node_name)" |
| } |
| if {[get_node_param $node_name "CPUSpecList"] ne "MISSING"} { |
|     skip "This test is incompatible with nodes that have a CPUSpecList (e.g. $node_name)" |
| } |
| |
| # Test teardown: cancel the job recorded in job_id, delete the compiled |
| # helper program and, when the test status is STATUS_FAIL, log a hint about |
| # a possible slurm.conf / hardware mismatch. |
| proc cleanup {} { |
|     cancel_job $::job_id |
|     file delete $::file_prog |
| |
|     if {$::test_status == $::STATUS_FAIL} { |
|         log_warn "This test can fail if the node configuration in slurm.conf (sockets, cores, threads) differs from the actual configuration for the default partition" |
|     } |
| } |
| |
| # |
| # Build a test program to report affinity by task |
| # |
| exec $bin_rm -f $file_prog |
| exec $bin_cc -I$build_dir $file_prog.c -o $file_prog |
| exec $bin_chmod 700 $file_prog |
| |
| # |
| # Create an allocation: an exclusive single-node job running an interactive |
| # shell on the node selected above. The remaining job steps are typed into |
| # that shell. |
| # |
| set timeout $max_job_delay |
| spawn $salloc -N1 --exclusive -t5 -w $node_name $bin_bash |
| expect { |
|     -re "Granted job allocation ($number)" { |
|         # Remember the job id so that cleanup can cancel the job |
|         set job_id $expect_out(1,string) |
|         reset_bash_prompt |
|         exp_continue |
|     } |
|     -re $test_prompt { |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     eof { |
|         # Without this branch expect returns silently on EOF and the |
|         # next send would hit a closed spawn id |
|         fail "salloc exited before providing a shell prompt" |
|     } |
| } |
| |
| # |
| # Run a job step to get allocated processor count and affinity. |
| # The number of TASK_ID lines reported here (task_cnt) drives every loop |
| # and every expected mask in the rest of the test. |
| # |
| set task_cnt 0 |
| send "$srun -c1 ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_cnt |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     eof { |
|         fail "salloc shell exited unexpectedly" |
|     } |
|     -re $test_prompt { |
|     } |
| } |
| |
| # With no task reported, all later loops would be empty and all later |
| # mask comparisons would be 0 == 0, i.e. the test would pass vacuously. |
| if {$task_cnt == 0} { |
|     fail "Job step reported no TASK_ID lines, unable to test CPU binding" |
| } |
| |
| if {$task_cnt > 32} { |
|     # Release the allocation before skipping |
|     send "exit\r" |
|     expect { |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|         eof { |
|             wait |
|         } |
|     } |
| |
|     skip "Expect unable to work with more than 32-bit numbers" |
| } |
| |
| # |
| # Run --cpu-bind=map_cpu with a non-numeric (NaN) or hexadecimal (0x0) |
| # value - both commands are expected to fail with a validation error |
| # |
| set nan_output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,map_cpu:NaN hostname"] |
| subtest {[regexp {error: Failed to validate number: NaN, the offending character is N} $nan_output]} "The command should fail because of invalid argument" |
| |
| set hex_output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,map_cpu:0x0 hostname"] |
| subtest {[regexp {error: Failed to validate number: 0x0, the offending character is x} $hex_output]} "The command should fail because of invalid argument" |
| |
| # |
| # Run a job step with verbosity and all tasks on CPU 0. |
| # The reported masks are summed; the subtest expects the sum to equal the |
| # task count (i.e. every task reporting mask 1). |
| # |
| set task_mask 0 |
| send "$srun -c1 --cpu-bind=verbose,map_cpu:0 ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_mask $expect_out(2,string) |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     eof { |
|         fail "salloc shell exited unexpectedly" |
|     } |
|     -re $test_prompt { |
|     } |
| } |
| subtest {$task_mask == $task_cnt} "Affinity mask should be consistent for a job step with verbosity and all tasks on CPU 0" "$task_mask != $task_cnt" |
| |
| # |
| # Run the same job step again, this time counting the verbose binding |
| # messages instead of the masks |
| # |
| set verbose_cnt 0 |
| send "$srun -c1 --cpu-bind=verbose,map_cpu:0 ./$file_prog\r" |
| expect { |
|     -re "cpu-bind=MAP|cpu-bind-cores=MAP|cpu-bind-sockets=MAP|cpu-bind-threads=MAP" { |
|         incr verbose_cnt |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     eof { |
|         fail "salloc shell exited unexpectedly" |
|     } |
|     -re $test_prompt { |
|     } |
| } |
| # Both task/affinity and task/cpu will generate verbose message, |
| # so check for double messages in case both plugins are configured. |
| subtest {$verbose_cnt == $task_cnt || $verbose_cnt == $task_cnt * 2} "Verbose messages count should be consistent" "$verbose_cnt != $task_cnt" |
| |
| # |
| # Run all tasks all bound to the same CPU by specifying a map (for each CPU). |
| # For CPU n every task should report the mask 1 << n, so the sum of the |
| # reported masks is expected to be task_cnt * (1 << n). |
| # |
| for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} { |
|     set mask_sum 0 |
|     set mask [expr {1 << $cpu_cnt}] |
|     send "$srun -c1 --cpu-bind=map_cpu:$cpu_cnt ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             incr mask_sum $expect_out(2,string) |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|         eof { |
|             fail "salloc shell exited unexpectedly" |
|         } |
|         -re $test_prompt { |
|         } |
|     } |
|     subtest {$mask_sum == $task_cnt * $mask} "Affinity mask should be consistent for all tasks bound to the same CPU by specifying a map" "$mask_sum != $task_cnt * $mask" |
| } |
| |
| # |
| # A non-numeric mask_cpu value is expected to be rejected with a |
| # validation error |
| # |
| set bad_mask_output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,mask_cpu:NaN hostname"] |
| subtest {[regexp {error: Failed to validate number: NaN, the offending character is N} $bad_mask_output]} "The command should fail because of invalid argument" |
| |
| # |
| # Run all tasks all bound to the same CPU by specifying a mask (for each CPU). |
| # The mask 1 << n is passed to srun in the form produced by uint2hex; the |
| # sum of the reported masks is expected to be task_cnt * (1 << n). |
| # |
| for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} { |
|     set mask_sum 0 |
|     set mask [expr {1 << $cpu_cnt}] |
|     set mstr [uint2hex $mask] |
|     send "$srun -c1 --cpu-bind=mask_cpu:$mstr ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             incr mask_sum $expect_out(2,string) |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|         eof { |
|             fail "salloc shell exited unexpectedly" |
|         } |
|         -re $test_prompt { |
|         } |
|     } |
|     subtest {$mask_sum == $task_cnt * $mask} "Affinity mask should be consistent for all tasks bound to the same CPU by specifying a mask" "$mask_sum != $task_cnt * $mask" |
| } |
| |
| # |
| # Generate forward, reverse and alternating masks and maps. |
| # |
| # Each result is a comma separated string with one entry per CPU: |
| #   fwd_*: CPU 0 first, highest CPU last |
| #   rev_*: highest CPU first, CPU 0 last |
| #   alt_*: odd CPUs are prepended and even CPUs appended |
| #          (e.g. 3,1,0,2 for four CPUs) |
| # The entries are collected in Tcl lists and joined at the end, so no |
| # special case is needed for the first element. |
| # |
| set fwd_mask_list {} |
| set fwd_map_list  {} |
| set rev_mask_list {} |
| set rev_map_list  {} |
| set alt_mask_list {} |
| set alt_map_list  {} |
| # Sum of all single-CPU masks, i.e. one bit set per task |
| set full_mask [expr {(1 << $task_cnt) - 1}] |
| for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} { |
|     set mask [expr {1 << $cpu_cnt}] |
|     set mstr [uint2hex $mask] |
| |
|     lappend fwd_mask_list $mstr |
|     lappend fwd_map_list  $cpu_cnt |
| |
|     set rev_mask_list [linsert $rev_mask_list 0 $mstr] |
|     set rev_map_list  [linsert $rev_map_list 0 $cpu_cnt] |
| |
|     if {$cpu_cnt % 2} { |
|         set alt_mask_list [linsert $alt_mask_list 0 $mstr] |
|         set alt_map_list  [linsert $alt_map_list 0 $cpu_cnt] |
|     } else { |
|         lappend alt_mask_list $mstr |
|         lappend alt_map_list  $cpu_cnt |
|     } |
| } |
| |
| set fwd_mask [join $fwd_mask_list ","] |
| set fwd_map  [join $fwd_map_list ","] |
| set rev_mask [join $rev_mask_list ","] |
| set rev_map  [join $rev_map_list ","] |
| set alt_mask [join $alt_mask_list ","] |
| set alt_map  [join $alt_map_list ","] |
| |
| log_debug "full_mask: $full_mask" |
| log_debug "fwd_map: $fwd_map" |
| log_debug "fwd_mask: $fwd_mask" |
| log_debug "rev_map: $rev_map" |
| log_debug "rev_mask: $rev_mask" |
| log_debug "alt_map: $alt_map" |
| log_debug "alt_mask: $alt_mask" |
| |
| # |
| # Run all tasks bound to a different CPU, once for each of the forward, |
| # reverse and alternating maps and masks. With every task on its own CPU |
| # the sum of the reported masks is expected to equal full_mask. |
| # |
| |
| # Run one job step in the salloc shell with the given --cpu-bind value and |
| # return the sum of the affinity masks reported by its tasks. |
| # spawn_id and timeout are not declared here: Expect looks them up in the |
| # global scope when they are not defined locally. |
| proc sum_bound_task_masks { cpu_bind } { |
|     global srun file_prog number test_prompt |
| |
|     set mask_total 0 |
|     send "$srun -c1 --cpu-bind=$cpu_bind ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             incr mask_total $expect_out(2,string) |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|         eof { |
|             fail "salloc shell exited unexpectedly" |
|         } |
|         -re $test_prompt { |
|         } |
|     } |
|     return $mask_total |
| } |
| |
| # Pairs of: --cpu-bind value, wording used in the subtest description. |
| # The order matches the order in which the steps have always been run. |
| set bind_cases {} |
| lappend bind_cases "map_cpu:$fwd_map"   "a forward map" |
| lappend bind_cases "map_cpu:$rev_map"   "a reverse map" |
| lappend bind_cases "map_cpu:$alt_map"   "an alternating map" |
| lappend bind_cases "mask_cpu:$fwd_mask" "a forward mask" |
| lappend bind_cases "mask_cpu:$rev_mask" "a reverse mask" |
| lappend bind_cases "mask_cpu:$alt_mask" "an alternating mask" |
| |
| foreach {cpu_bind bind_desc} $bind_cases { |
|     set task_mask [sum_bound_task_masks $cpu_bind] |
|     subtest {$task_mask == $full_mask} "Affinity mask should be consistent for all tasks bound to a different CPU by specifying $bind_desc" "$task_mask != $full_mask" |
| } |
| |
| # |
| # Leave the interactive shell: this terminates the job and frees the |
| # allocation. EOF from salloc is the expected outcome. |
| # |
| send "exit\r" |
| expect { |
|     -re {error} { |
|         fail "Some error occurred" |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     eof { |
|         # Reap the finished salloc process |
|         wait |
|     } |
| } |