#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test of CPU affinity support for multi-core systems.
############################################################################
# Copyright (C) 2005-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Base name of the helper program used by the job steps below
set file_prog "${test_name}.prog"

# Bail out early when SelectTypeParameters contains CR_ONE_TASK_PER_CORE
if {[param_contains [get_config_param "SelectTypeParameters"] "CR_ONE_TASK_PER_CORE"]} {
	skip "This test is incompatible SelectTypeParameters=CR_ONE_TASK_PER_CORE"
}

#
# The test only makes sense when "affinity" is among the reported
# affinity types.
#
if {![param_contains [get_affinity_types] "affinity"]} {
	skip "CPU affinity not supported on this system"
}
log_debug "Task affinity plugin installed"
# Skip when the configured TaskPluginParam value mentions boards, sockets
# or cores (compared case-insensitively).
#
# Note: "string first" takes the text to look for as its FIRST argument and
# the string to search as its SECOND, so the keyword goes first here.
set task_params [string tolower [get_config_param "TaskPluginParam"]]
if {[string first "boards" $task_params] != -1 ||
    [string first "sockets" $task_params] != -1 ||
    [string first "cores" $task_params] != -1} {
	skip "TaskPluginParams=$task_params not supported"
}
# Get Core Cpu Mask -- 1st cpu of each core.
#
# Sends an "srun lscpu" command to the current interactive session and reads
# the "cpu,core,socket" triples it prints. A running line counter selects
# the bit position: each time a triple carries a core id higher than the
# last one seen on the current socket, the bit for that line is added to
# the mask.
#
# Returns: the accumulated mask as an integer.
proc get_core_mask {} {
	global srun test_prompt

	set prev_core   -1
	set prev_socket -1
	set core_mask   0
	set cpu_cnt     0

	send "$srun /usr/bin/lscpu -a --parse=cpu,core,socket -y\r"
	expect {
		-re "(\\d+),(\\d+),(\\d+)" {
			set cpu_id    $expect_out(1,string)
			set core_id   $expect_out(2,string)
			set socket_id $expect_out(3,string)
			log_debug "cpu_cnt:$cpu_cnt cpu:$cpu_id core:$core_id socket:$socket_id"

			# A higher socket id starts a new socket: forget the
			# last core id so core numbering can restart
			if {$prev_socket < $socket_id} {
				set prev_core -1
				set prev_socket $socket_id
			}

			# First line seen for this core: record its bit
			if {$prev_core < $core_id} {
				set prev_core $core_id
				set mask [expr {1 << $cpu_cnt}]
				set core_mask [expr {$core_mask + $mask}]
				log_debug "core_mask:$core_mask mask:$mask"
			}

			incr cpu_cnt 1
			exp_continue
		}
		timeout {
			fail "failed to get core mask"
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $test_prompt
	}

	return $core_mask
}
# Look for OverSubscribe=FORCE in "scontrol show partition" for the default
# partition and skip the test when it is present. Output logging is turned
# off while scontrol's output is scanned and turned back on afterwards.
log_user 0
set force 0
spawn $scontrol show partition [default_partition]
expect {
	-re "OverSubscribe=FORCE" {
		set force 1
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1

if {$force != 0} {
	skip "This test is not compatible with OverSubscribe=FORCE"
}
# Identify a usable node
#
# Runs a one-node exclusive srun in verbose mode and captures the host name
# from its "on host <name>" output; that node is reused for the rest of the
# test.
set node_name ""
set timeout $max_job_delay
spawn $srun -N1 --exclusive --verbose $bin_printenv SLURMD_NODENAME
expect {
	-re "on host ($re_word_str)" {
		set node_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}

if {[string length $node_name] == 0} {
	fail "Failed to get a usable node name"
}
# Determine how many cpus, sockets, cores, and threads the node has
#
# The values are scraped from "scontrol show node"; a counter left at 0
# means the corresponding field was never seen and the test is skipped.
set num_threads 0
set num_cores   0
set num_sockets 0
set num_cputot  0

log_user 0
spawn $scontrol show node $node_name
expect {
	-re "CoresPerSocket=($number)" {
		set num_cores $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set num_cputot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set num_sockets $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set num_threads $expect_out(1,string)
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1

if {$num_cputot == 0 || $num_sockets == 0 || $num_cores == 0 || $num_threads == 0} {
	skip "Could not determine number of CPUs:Sockets:Cores:Threads (saw $num_cputot:$num_sockets:$num_cores:$num_threads)"
}
# Intel KNL nodes with 272 threads generate huge numeric masks
# which are too large for expect integer values to manage
#
# The masks handled below carry one bit per thread, so the test is limited
# to nodes with at most 64 threads. The skip message states the same 64-bit
# limit that the guard enforces.
# The expr arguments are braced so operands are substituted only once.
set total_thread_count [expr {$num_sockets * $num_cores * $num_threads}]
set total_core_count [expr {$num_sockets * $num_cores}]
if {$total_thread_count > 64} {
	skip "Total thread count too large for Expect to process ($total_thread_count > 64). Expect unable to work with more than 64-bit numbers"
}
log_debug "Node config: CPUs=$num_cputot Sockets=$num_sockets Cores=$num_cores Threads=$num_threads"

# A CPU total below sockets*cores*threads means the CPU count was lowered in
# the configuration, which this test cannot handle
if {$num_cputot < $total_thread_count} {
	skip "CPUs=$num_cputot < (Sockets($num_sockets)*Cores($num_cores)*Threads($num_threads)). Test incompatible with lowered CPUs count"
}
# Cleanup hook (not called in this file; presumably invoked by the test
# framework loaded from ./globals -- confirm).
#
# Deletes the compiled helper program. When the test status is STATUS_FAIL
# and SlurmdParameters contains "config_overrides", also logs a warning that
# names likely causes of the failure.
proc cleanup {} {
	global file_prog test_status STATUS_FAIL

	file delete $file_prog

	# Nothing more to report unless the test failed
	if {$test_status != $STATUS_FAIL} {
		return
	}

	if {[param_contains [get_config_param "SlurmdParameters"] "config_overrides"]} {
		log_warn "This test can fail if the node configuration in slurm.conf (sockets, cores, threads) differs from the actual configuration or if using task/cgroup without task/affinity"
	}
}
#
# Build a test program to report affinity by task
#
# The C source is expected next to this script as "<file_prog>.c"; the
# resulting binary is given mode 700.
#
exec $bin_cc "-I$build_dir" "${file_prog}.c" -o $file_prog
exec $bin_chmod 700 $file_prog
#
# Create an allocation
#
# salloc starts an interactive bash on the node found earlier; every later
# job step is typed into that shell with "send". SLURM_CPU_BIND=verbose is
# exported first so that it is part of the environment salloc starts with.
#
global env
set env(SLURM_CPU_BIND) "verbose"

spawn $salloc -w $node_name -N1 --exclusive --verbose -t2 $bin_bash
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		reset_bash_prompt
		exp_continue
	}
	-re $test_prompt {}
	timeout {
		fail "salloc not responding or failure to recognize prompt"
	}
}
#############################################################################
#
# Run a job step to get allocated processor count
#
# Every "TASK_ID:<n>,MASK:<n>" line printed by the helper program counts as
# one task; the total is used to size the job steps that follow.
#
set task_cnt 0
send "$srun -c1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_cnt 1
		exp_continue
	}
	-re "error" {
		fail "Some error occurred"
	}
	timeout {
		fail "salloc not responding or failure to recognize prompt"
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $test_prompt
}
#############################################################################
#
# Run a job step with affinity to verify unique masks
#
# The masks reported by all tasks are summed and compared against a value
# with the low task_cnt bits set.
#
set task_mask 0
set expected_mask [expr {(1 << $task_cnt) - 1}]
send "$srun -c1 -n $task_cnt --cpu-bind=threads ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		fail "Some error occurred"
	}
	timeout {
		fail "srun (from --allocate) not responding or failure to recognize prompt"
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $test_prompt
}

subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with affinity to verify unique masks" "$task_mask != $expected_mask"
#############################################################################
#
# Run a job step with affinity to verify unique masks with min -B 1:1:1
#
# Because -B1:1:1 sets 1 thread-per-core, don't do more tasks than number of
# cores.
# The expected value is the mask returned by get_core_mask.
#
set core_task_cnt [expr {$task_cnt > $total_core_count ? $total_core_count : $task_cnt}]

set expected_mask [get_core_mask]
log_debug "core_mask: $expected_mask"

set task_mask 0
send "$srun -c1 -n $core_task_cnt -B 1:1:1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		fail "Some error occurred"
	}
	timeout {
		fail "srun (from --allocate) not responding or failure to recognize prompt"
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $test_prompt
}

subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with affinity to verify unique masks with min -B 1:1:1" "$task_mask != $expected_mask"
#############################################################################
#
# Run varying number of sockets, verify task count and number of set bits
#
# For each socket count from 1 to num_sockets, both the number of reported
# tasks and the total number of bits set across all reported masks must
# equal sockets * cores * threads for that step.
#
for {set this_cnt 1} {$this_cnt <= $num_sockets} {incr this_cnt 1} {
	set expected_tasks [expr {$this_cnt * $num_cores * $num_threads}]
	set num_tasks 0
	set num_bits 0
	set task_mask 0

	send "$srun -B $this_cnt-$this_cnt:$num_cores:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1

			# count number of set bits
			for {set this_mask $expect_out(2,string)} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} {
				if {$this_mask & 1} {
					incr num_bits 1
				}
			}
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "srun (from --allocate) not responding or failure to recognize prompt"
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $test_prompt
	}

	subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt sockets" "$num_tasks != $expected_tasks"
	subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt sockets" "$num_bits != $expected_tasks"
}
#############################################################################
#
# Run varying number of cores, verify task count and number of set bits
#
# For each cores-per-socket count from 1 to num_cores, both the number of
# reported tasks and the total number of bits set across all reported masks
# must equal sockets * cores * threads for that step.
#
for {set this_cnt 1} {$this_cnt <= $num_cores} {incr this_cnt 1} {
	set expected_tasks [expr {$num_sockets * $this_cnt * $num_threads}]
	set num_tasks 0
	set num_bits 0
	set task_mask 0

	send "$srun -B $num_sockets:$this_cnt-$this_cnt:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1

			# count number of set bits
			for {set this_mask $expect_out(2,string)} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} {
				if {$this_mask & 1} {
					incr num_bits 1
				}
			}
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $test_prompt
	}

	subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt cores" "$num_tasks != $expected_tasks"
	subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt cores" "$num_bits != $expected_tasks"
}
#############################################################################
#
# Run varying number of threads, verify task count and number of set bits
#
# For each threads-per-core count from 1 to num_threads, both the number of
# reported tasks and the total number of bits set across all reported masks
# must equal sockets * cores * threads for that step.
#
for {set this_cnt 1} {$this_cnt <= $num_threads} {incr this_cnt 1} {
	set expected_tasks [expr {$num_sockets * $num_cores * $this_cnt}]
	set num_tasks 0
	set num_bits 0
	set task_mask 0

	send "$srun -B $num_sockets:$num_cores:$this_cnt-$this_cnt ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1

			# count number of set bits
			for {set this_mask $expect_out(2,string)} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} {
				if {$this_mask & 1} {
					incr num_bits 1
				}
			}
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $test_prompt
	}

	subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt threads" "$num_tasks != $expected_tasks"
	subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt threads" "$num_bits != $expected_tasks"
}
#############################################################################
#
# Run varying cpus per task, verify task count and number of set bits
#
# For each -c value from 1 to core_task_cnt exactly one task is expected,
# and the number of bits set in its mask must equal the requested cpus per
# task.
#
for {set this_cnt 1} {$this_cnt <= $core_task_cnt} {incr this_cnt 1} {
	set expected_tasks 1
	set num_tasks 0
	set num_bits 0
	set task_mask 0

	send "$srun -c$this_cnt -B 1:1:1 ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1

			# count number of set bits
			for {set this_mask $expect_out(2,string)} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} {
				if {$this_mask & 1} {
					incr num_bits 1
				}
			}
			exp_continue
		}
		-re "error" {
			fail "Some error occurred"
		}
		timeout {
			fail "salloc not responding or failure to recognize prompt"
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $test_prompt
	}

	subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt cpus per task" "$num_tasks != $expected_tasks"
	subtest {$num_bits == $this_cnt} "Number of set bits is consistent for $this_cnt cpus per task" "$num_bits != $this_cnt"
}
#############################################################################
#
# Run a job step with plane distribution to exercise option
# Automatic binding in slurm version 2.0 will bind one task per core
#
# The masks reported by all tasks are summed and compared against a value
# with the low task_cnt bits set.
#
set task_mask 0
set expected_mask [expr {(1 << $task_cnt) - 1}]
send "$srun -n $task_cnt -m plane=4 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		fail "Some error occurred"
	}
	timeout {
		fail "salloc not responding or failure to recognize prompt"
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $test_prompt
}

subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with plane distribution" "$task_mask != $expected_mask"
#############################################################################
#
# Terminate the job, free the allocation
#
# "exit" is typed into the interactive shell started by salloc; the spawned
# process is then reaped once it reaches end-of-file.
#
send "exit\r"
expect {
	-re "error" {
		fail "Some error occurred"
	}
	timeout {
		fail "salloc not responding or failure to recognize prompt"
	}
	eof {
		wait
	}
}