#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test of CPU affinity/binding support.
############################################################################
# Copyright (C) 2005 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
set file_prog "${test_name}.prog"
set job_id 0

#
# The test only makes sense when an affinity task plugin is configured
#
if {![param_contains [get_affinity_types] "affinity"]} {
	skip "CPU affinity not supported on this system"
}
log_info "Task affinity plugin installed"

#
# The default partition must not force oversubscription
#
set show_partition_output [run_command_output -fail "$scontrol show partition [default_partition]"]
if {[regexp {OverSubscribe=FORCE} $show_partition_output]} {
	skip "This test is not compatible with OverSubscribe=FORCE"
}

#
# Exactly one node must be available for the exclusive allocation requested later
#
set node_name [get_nodes_by_request "-N1 --exclusive -t5"]
if {[llength $node_name] != 1} {
	skip "This test need to be able to submit jobs with at least -N1 --exclusive -t5"
}

#
# Skip when the node reports either specialization parameter
# (get_node_param yields "MISSING" when the parameter is absent)
#
foreach spec_param {CoreSpecCount CPUSpecList} {
	if {[get_node_param $node_name $spec_param] ne "MISSING"} {
		skip "This test is incompatible with nodes that have a $spec_param (e.g. $node_name)"
	}
}
proc cleanup {} {
	# Cancel the job recorded in job_id and remove the compiled helper
	cancel_job $::job_id
	file delete $::file_prog

	# The configuration hint below is only relevant after a failure
	if {$::test_status != $::STATUS_FAIL} {
		return
	}
	log_warn "This test can fail if the node configuration in slurm.conf (sockets, cores, threads) differs from the actual configuration for the default partition"
}
#
# Compile the helper program that reports affinity by task: remove any
# stale binary, build it from its .c source, then restrict its mode to 700
#
exec $bin_rm -f $file_prog
exec $bin_cc "-I${build_dir}" "${file_prog}.c" -o $file_prog
exec $bin_chmod 700 $file_prog
#
# Create an allocation
#
# salloc is spawned with a shell on the node selected earlier; the job
# steps that follow are typed into that shell with "send". The expect
# block returns once the shell prompt is seen.
#
set timeout $max_job_delay
spawn $salloc -N1 --exclusive -t5 -w $node_name $bin_bash
expect {
-re "Granted job allocation ($number)" {
# Remember the job id so that cleanup can cancel the allocation
set job_id $expect_out(1,string)
# NOTE(review): presumably makes the shell prompt match $test_prompt - confirm in globals
reset_bash_prompt
exp_continue
}
-re $test_prompt {
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
}
#
# Run a job step to get allocated processor count and affinity
#
# Every "TASK_ID:<n>,MASK:<n>" line printed by the step counts as one task.
# The resulting task_cnt is used by the later sections as the number of
# CPUs to iterate over.
#
set mask 0
set task_cnt 0
send "$srun -c1 ./$file_prog\r"
expect {
-re "TASK_ID:($number),MASK:($number)" {
incr task_cnt
# Only the most recently reported mask is kept
set mask $expect_out(2,string)
exp_continue
}
-re "error" {
fail "Some error occurred"
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
-re $test_prompt {
}
}
# With more than 32 tasks, leave the allocation shell, wait for salloc to
# finish and skip the rest of the test
if {$task_cnt > 32} {
send "exit\r"
expect {
-re "error" {
fail "Some error occurred"
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
eof {
wait
}
}
skip "Expect unable to work with more than 32-bit numbers"
}
#
# Run --cpu-bind=map_cpu with a non-numeric value (NaN) or a
# hexadecimal-style value (0x0) - this should fail
#
# Both commands are expected to fail, and the error text must name the
# first offending character of the rejected value.
#
set output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,map_cpu:NaN hostname"]
subtest {[regexp "error: Failed to validate number: NaN, the offending character is N" $output]} "The command should fail because of invalid argument"
set output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,map_cpu:0x0 hostname"]
subtest {[regexp "error: Failed to validate number: 0x0, the offending character is x" $output]} "The command should fail because of invalid argument"
#
# Run a job step with verbosity and all tasks on CPU 0
#
# The masks reported by the tasks are summed into task_mask, and the
# subtest requires that sum to equal the task count.
#
set task_mask 0
send "$srun -c1 --cpu-bind=verbose,map_cpu:0 ./$file_prog\r"
expect {
-re "TASK_ID:($number),MASK:($number)" {
incr task_mask $expect_out(2,string)
exp_continue
}
-re "error" {
fail "Some error occurred"
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
-re $test_prompt {
}
}
subtest {$task_mask == $task_cnt} "Affinity mask should be consistent for a job step with verbosity and all tasks on CPU 0" "$task_mask != $task_cnt"
#
# Run the same verbose step again, this time counting the binding
# messages that contain one of the cpu-bind*=MAP markers
#
set verbose_cnt 0
send "$srun -c1 --cpu-bind=verbose,map_cpu:0 ./$file_prog\r"
expect {
-re "cpu-bind=MAP|cpu-bind-cores=MAP|cpu-bind-sockets=MAP|cpu-bind-threads=MAP" {
incr verbose_cnt
exp_continue
}
-re "error" {
fail "Some error occurred"
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
-re $test_prompt {
}
}
# Both task/affinity and task/cpu will generate verbose message,
# so check for double messages in case both plugins are configured.
# Accepted counts are therefore task_cnt or twice task_cnt.
subtest {$verbose_cnt == $task_cnt || $verbose_cnt == [expr $task_cnt * 2]} "Verbose messages count should be consistent" "$verbose_cnt != $task_cnt"
#
# For each CPU index in turn, run a step with every task bound to that one
# CPU through map_cpu. The masks reported by the tasks are summed and must
# equal the task count times the single-CPU mask (1 << index).
#
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	set mask [expr {1 << $cpu_cnt}]
	set mask_sum 0
	send "$srun -c1 --cpu-bind=map_cpu:${cpu_cnt} ./${file_prog}\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr mask_sum $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re $test_prompt {
		}
	}
	subtest {$mask_sum == $task_cnt * $mask} "Affinity mask should be consistent for all tasks bound to the same CPU by specifying a map" "$mask_sum != $task_cnt * $mask"
}
#
# Check that an invalid mask_cpu value (NaN) is rejected by option
# validation: the command must fail and the error text must name the
# first offending character
#
set output [run_command_output -xfail "$srun -c1 --cpu-bind=verbose,mask_cpu:NaN hostname"]
subtest {[regexp "error: Failed to validate number: NaN, the offending character is N" $output]} "The command should fail because of invalid argument"
#
# For each CPU index in turn, run a step with every task bound to that one
# CPU through mask_cpu, passing the single-CPU mask (1 << index) formatted
# by uint2hex. The masks reported by the tasks are summed and must equal
# the task count times that mask.
#
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	set mask [expr {1 << $cpu_cnt}]
	set mstr [uint2hex $mask]
	set mask_sum 0
	send "$srun -c1 --cpu-bind=mask_cpu:${mstr} ./${file_prog}\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr mask_sum $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re $test_prompt {
		}
	}
	subtest {$mask_sum == $task_cnt * $mask} "Affinity mask should be consistent for all tasks bound to the same CPU by specifying a mask" "$mask_sum != $task_cnt * $mask"
}
#
# Generate forward, reverse and alternating masks and maps
#
# Each result is a comma-separated string with one entry per CPU index
# below task_cnt: the index itself for the maps, the uint2hex form of
# (1 << index) for the masks.
#   fwd_*: ascending index order
#   rev_*: descending index order
#   alt_*: odd indexes prepended, even indexes appended (e.g. 3,1,0,2)
# full_mask has one bit set for every index below task_cnt.
#
# The entries are collected as Tcl lists and joined at the end, so no
# leading or trailing separator ever needs to be patched up.
#
set full_mask [expr {(1 << $task_cnt) - 1}]
set fwd_mask {}
set fwd_map {}
set rev_mask {}
set rev_map {}
set alt_mask {}
set alt_map {}
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	set mask [expr {1 << $cpu_cnt}]
	set mstr [uint2hex $mask]

	lappend fwd_mask $mstr
	lappend fwd_map $cpu_cnt

	set rev_mask [linsert $rev_mask 0 $mstr]
	set rev_map [linsert $rev_map 0 $cpu_cnt]

	if {$cpu_cnt % 2} {
		# Odd index: goes to the front
		set alt_mask [linsert $alt_mask 0 $mstr]
		set alt_map [linsert $alt_map 0 $cpu_cnt]
	} else {
		# Even index: goes to the back
		lappend alt_mask $mstr
		lappend alt_map $cpu_cnt
	}
}
set fwd_mask [join $fwd_mask ","]
set fwd_map [join $fwd_map ","]
set rev_mask [join $rev_mask ","]
set rev_map [join $rev_map ","]
set alt_mask [join $alt_mask ","]
set alt_map [join $alt_map ","]
log_debug "full_mask: $full_mask"
log_debug "fwd_map: $fwd_map"
log_debug "fwd_mask: $fwd_mask"
log_debug "rev_map: $rev_map"
log_debug "rev_mask: $rev_mask"
log_debug "alt_map: $alt_map"
log_debug "alt_mask: $alt_mask"
#
# Run all tasks bound to a different CPU, once for each generated binding
# list: forward, reverse and alternating order, first given as CPU maps
# (map_cpu) and then as CPU masks (mask_cpu).
#
# For every step the masks reported by the tasks are summed into task_mask,
# and the sum must equal full_mask. Each table row below holds the
# --cpu-bind type, the binding list, and the wording used in the subtest
# description.
#
foreach {bind_type bind_list bind_desc} [list \
	map_cpu  $fwd_map  "a forward map" \
	map_cpu  $rev_map  "a reverse map" \
	map_cpu  $alt_map  "an alternating map" \
	mask_cpu $fwd_mask "a forward mask" \
	mask_cpu $rev_mask "a reverse mask" \
	mask_cpu $alt_mask "an alternating mask" \
] {
	set task_mask 0
	send "$srun -c1 --cpu-bind=${bind_type}:${bind_list} ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re $test_prompt {
		}
	}
	subtest {$task_mask == $full_mask} "Affinity mask should be consistent for all tasks bound to a different CPU by specifying $bind_desc" "$task_mask != $full_mask"
}
#
# Terminate the job, free the allocation
#
# Typing "exit" ends the shell spawned under salloc; the expect block then
# waits for end-of-file on the spawned process.
#
send "exit\r"
expect {
-re "error" {
fail "Some error occurred"
}
timeout {
fail "salloc not responding or failure to recognize prompt"
}
eof {
# Wait for the spawned salloc process to finish
wait
}
}