blob: 8f7a6be43239a97b704e8329cbd0b74929e24b72 [file] [log] [blame]
#!/usr/bin/expect
############################################################################
# Purpose: Test of SLURM functionality
# Test of CPU affinity support.
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "WARNING: ..." with an explanation of why the test can't be made, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2005-2006 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# UCRL-CODE-226842.
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.llnl.gov/linux/slurm/>.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Load the shared test-suite definitions, then initialise this test's state.
source ./globals
set test_id	"18.36"
set file_prog	"test$test_id.prog"
set job_id	0
set exit_code	0

print_header $test_id

#
# Skip the test unless the task/affinity plugin shows up in the
# configuration reported by scontrol. Output is hidden while scanning.
#
log_user 0
set affinity 0
spawn $scontrol show config
expect {
	-re "task/affinity" {
		set affinity 1
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1

if {!$affinity} {
	send_user "\nWARNING: task affinity not supported on this system\n"
	exit 0
}
send_user "\ntask affinity plugin installed\n"

#
# Build the helper program that reports affinity by task.
# "make -f /dev/null" supplies an empty makefile, so only make's built-in
# rules are used (presumably compiling $file_prog.c - confirm the source
# file name against the test directory).
#
exec $bin_rm -f $file_prog
exec $bin_make -f /dev/null $file_prog
exec $bin_chmod 700 $file_prog
#
# Create an allocation
#
# salloc is started on one node with a shell as its command; the remaining
# job steps are typed into that shell. The granted job id is stored in
# job_id (the variable every cleanup path checks) so that a later failure
# can cancel the job instead of leaving the allocation behind.
#
set salloc_pid [spawn $salloc -N1 --verbose -t2 $bin_bash]
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
	}
	timeout {
		send_user "\nFAILURE: salloc not responding\n"
		if {$job_id != 0} {
			cancel_job $job_id
		}
		slow_kill [expr 0 - $salloc_pid]
		exit 1
	}
}
# Ask the allocation's shell for the CPU count exported by SLURM.
# available_cpus starts at 0 so that a missing answer (timeout or EOF, both
# of which simply end the expect below) is detected here rather than
# surfacing later as an "undefined variable" Tcl error that would abandon
# the running salloc.
set available_cpus 0
send "env |grep SLURM_JOB_CPUS_PER_NODE\n"
expect {
	-re "SLURM_JOB_CPUS_PER_NODE=($number)" {
		set available_cpus $expect_out(1,string)
	}
}
if {$available_cpus == 0} {
	send_user "\nFAILURE: unable to determine SLURM_JOB_CPUS_PER_NODE\n"
	if {$job_id != 0} {
		cancel_job $job_id
	}
	slow_kill [expr 0 - $salloc_pid]
	exit 1
}
#
# Baseline job step without any binding option: count the tasks that
# report in. task_cnt drives every later check; mask keeps the MASK value
# of the last task that reported.
#
# Wait for the shell prompt that follows the previous command first.
expect -re $prompt

set task_cnt 0
set mask 0
send "$slaunch -n $available_cpus $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_cnt
		set mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
#
# Run a job step with affinity
#
# With --cpu_bind=rank the reported masks are summed and compared against
# expected_mask, which has one bit set per task counted in the baseline
# step. The failure message reports the two values that were compared.
#
set expected_mask [expr {(1 << $task_cnt) - 1}]
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=rank $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $expected_mask} {
	send_user "\nFAILURE: affinity mask inconsistency ($task_mask,$expected_mask)\n"
	set exit_code 1
}
#
# Verbose job step with every task mapped to CPU 0.
# The reported masks are summed; the check below expects the sum to equal
# the task count.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=verbose,map_cpu:0 $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {!($task_mask == $task_cnt)} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$task_cnt)\n"
	set exit_code 1
}
# Repeat the verbose map_cpu:0 step, this time counting the
# "cpu_bind=MAP" messages; one is expected for each task.
set verbose_cnt 0
send "$slaunch -n $available_cpus --cpu_bind=verbose,map_cpu:0 $file_prog\n"
expect {
	-re "cpu_bind=MAP" {
		incr verbose_cnt
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$verbose_cnt != $task_cnt} {
	send_user "\nFAILURE: verbose messages count inconsistent ($verbose_cnt,$task_cnt)\n"
	set exit_code 1
}
#
# Run all tasks all bound to the same CPU by specifying a map (for each CPU)
#
# For every CPU index below task_cnt, all tasks are mapped to that one CPU.
# The reported masks are summed and compared with task_cnt times the
# single-bit mask of that CPU.
#
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	set mask_sum 0
	set mask [expr {1 << $cpu_cnt}]
	set expected_sum [expr {$task_cnt * $mask}]
	send "$slaunch -n $available_cpus --cpu_bind=map_cpu:$cpu_cnt $file_prog\n"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr mask_sum $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			# No exp_continue here: it would re-arm the timer and
			# wait forever if the step or the shell is hung.
			send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
			set exit_code 1
		}
		-re $prompt
	}
	if {$mask_sum != $expected_sum} {
		send_user "\nFAILURE: affinity mask inconsistent ($mask_sum,$expected_sum)\n"
		set exit_code 1
	}
}
#
# Run all tasks all bound to the same CPU by specifying a mask (for each CPU)
#
# Same check as the map_cpu loop, but the CPU is selected with mask_cpu
# using the single-bit mask formatted by dec2hex16 (helper from ./globals).
#
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	set mask_sum 0
	set mask [expr {1 << $cpu_cnt}]
	set mstr [dec2hex16 $mask]
	set expected_sum [expr {$task_cnt * $mask}]
	send "$slaunch -n $available_cpus --cpu_bind=mask_cpu:$mstr $file_prog\n"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr mask_sum $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			# No exp_continue here: it would re-arm the timer and
			# wait forever if the step or the shell is hung.
			send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
			set exit_code 1
		}
		-re $prompt
	}
	if {$mask_sum != $expected_sum} {
		send_user "\nFAILURE: affinity mask inconsistent ($mask_sum,$expected_sum)\n"
		set exit_code 1
	}
}
#
# Generate forward, reverse and alternating masks and maps
#
# Each ordering is collected as a Tcl list and joined with commas at the
# end:
#   forward     - CPUs in ascending order
#   reverse     - CPUs in descending order
#   alternating - odd CPUs are put at the front, even CPUs at the back
# full_mask has one bit set for every task.
#
set full_mask [expr {(1 << $task_cnt) - 1}]
set fwd_mask_l {}
set fwd_map_l  {}
set rev_mask_l {}
set rev_map_l  {}
set alt_mask_l {}
set alt_map_l  {}
for {set cpu_cnt 0} {$cpu_cnt < $task_cnt} {incr cpu_cnt} {
	# mask_sum is reset here as in the per-CPU loops; nothing in this
	# loop reads it.
	set mask_sum 0
	set mask [expr {1 << $cpu_cnt}]
	set mstr [dec2hex16 $mask]

	lappend fwd_mask_l $mstr
	lappend fwd_map_l  $cpu_cnt

	set rev_mask_l [linsert $rev_mask_l 0 $mstr]
	set rev_map_l  [linsert $rev_map_l  0 $cpu_cnt]

	if {$cpu_cnt % 2} {
		set alt_mask_l [linsert $alt_mask_l 0 $mstr]
		set alt_map_l  [linsert $alt_map_l  0 $cpu_cnt]
	} else {
		lappend alt_mask_l $mstr
		lappend alt_map_l  $cpu_cnt
	}
}
set fwd_mask [join $fwd_mask_l ","]
set fwd_map  [join $fwd_map_l  ","]
set rev_mask [join $rev_mask_l ","]
set rev_map  [join $rev_map_l  ","]
set alt_mask [join $alt_mask_l ","]
set alt_map  [join $alt_map_l  ","]

# Show the generated values in the test log.
send_user "\n"
send_user "full_mask: $full_mask\n"
send_user "fwd_map: $fwd_map\n"
send_user "fwd_mask: $fwd_mask\n"
send_user "rev_map: $rev_map\n"
send_user "rev_mask: $rev_mask\n"
send_user "alt_map: $alt_map\n"
send_user "alt_mask: $alt_mask\n"
#
# Run all tasks bound to a different CPU by specifying a forward map
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=map_cpu:$fwd_map $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Run all tasks bound to a different CPU by specifying a reverse map
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=map_cpu:$rev_map $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Run all tasks bound to a different CPU by specifying an alternating map
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=map_cpu:$alt_map $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Run all tasks bound to a different CPU by specifying a forward mask
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=mask_cpu:$fwd_mask $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Run all tasks bound to a different CPU by specifying a reverse mask
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=mask_cpu:$rev_mask $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Run all tasks bound to a different CPU by specifying an alternating mask
#
# The reported masks are summed and must equal full_mask.
#
set task_mask 0
send "$slaunch -n $available_cpus --cpu_bind=mask_cpu:$alt_mask $file_prog\n"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		# No exp_continue here: it would re-arm the timer and wait
		# forever if the step or the shell is hung.
		send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n"
		set exit_code 1
	}
	-re $prompt
}
if {$task_mask != $full_mask} {
	send_user "\nFAILURE: affinity mask inconsistent ($task_mask,$full_mask)\n"
	set exit_code 1
}
#
# Leave the shell so salloc releases the allocation, then report the result.
# On timeout the job is cancelled (when its id is known) and salloc is killed.
#
send "exit\n"
expect {
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
	}
	timeout {
		send_user "\nFAILURE: salloc not responding or failure to recognize prompt\n"
		if {$job_id != 0} {
			cancel_job $job_id
		}
		slow_kill [expr {0 - $salloc_pid}]
		set exit_code 1
	}
	eof {
		wait
	}
}

# The helper program is removed only after a successful run.
if {$exit_code != 0} {
	send_user "\nNOTE: This test can fail if the node configuration in slurm.conf \n"
	send_user " (sockets, cores, threads) differs from the actual configuration\n"
} else {
	exec $bin_rm -f $file_prog
	send_user "\nSUCCESS\n"
}
exit $exit_code