| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test of CPU affinity support for multi-core systems. |
| ############################################################################ |
| # Copyright (C) 2005-2007 The Regents of the University of California. |
| # Copyright (C) 2008 Lawrence Livermore National Security. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # CODE-OCEC-09-009. All rights reserved. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| # Name of the helper binary that reports per-task affinity. |
| set file_prog "${test_name}.prog" |
| |
| # |
| # This test cannot run with SelectTypeParameters=CR_ONE_TASK_PER_CORE. |
| # |
| set select_type_params [get_config_param "SelectTypeParameters"] |
| if {[param_contains $select_type_params "CR_ONE_TASK_PER_CORE"]} { |
|     skip "This test is incompatible SelectTypeParameters=CR_ONE_TASK_PER_CORE" |
| } |
| |
| # |
| # The reported affinity types must contain "affinity", otherwise skip. |
| # |
| set affinity_types [get_affinity_types] |
| if {![param_contains $affinity_types "affinity"]} { |
|     skip "CPU affinity not supported on this system" |
| } |
| log_debug "Task affinity plugin installed" |
| # |
| # Skip if TaskPluginParam mentions boards, sockets or cores. |
| # Note that "string first" takes the needle first and the haystack second, |
| # so the configured (lower-cased) value has to be the second argument. |
| # |
| set task_params [string tolower [get_config_param "TaskPluginParam"]] |
| if {[string first "boards" $task_params] != -1 || |
|     [string first "sockets" $task_params] != -1 || |
|     [string first "cores" $task_params] != -1} { |
|     skip "TaskPluginParams=$task_params not supported" |
| } |
| |
| # Build the core CPU mask: one bit per core, set for the first CPU that is |
| # reported for that core. |
| # |
| # Sends an "srun lscpu" command to the currently spawned process and parses |
| # every "cpu,core,socket" triple it prints. The bit position used for a CPU |
| # is the running count of triples seen so far. Fails the test on timeout. |
| # |
| # Returns: the accumulated mask as an integer. |
| proc get_core_mask {} { |
|     global srun test_prompt |
| |
|     set core_mask 0 |
|     set cpu_cnt 0 |
|     set prev_core -1 |
|     set prev_socket -1 |
|     send "$srun /usr/bin/lscpu -a --parse=cpu,core,socket -y\r" |
|     expect { |
|         -re "(\\d+),(\\d+),(\\d+)" { |
|             set cpu_id $expect_out(1,string) |
|             set core_id $expect_out(2,string) |
|             set socket_id $expect_out(3,string) |
|             log_debug "cpu_cnt:$cpu_cnt cpu:$cpu_id core:$core_id socket:$socket_id" |
| |
|             # A higher socket id starts a new socket: restart core tracking |
|             if {$socket_id > $prev_socket} { |
|                 set prev_socket $socket_id |
|                 set prev_core -1 |
|             } |
| |
|             # First CPU of a not yet seen core: record its bit |
|             if {$core_id > $prev_core} { |
|                 set prev_core $core_id |
|                 set mask [expr {1 << $cpu_cnt}] |
|                 incr core_mask $mask |
|                 log_debug "core_mask:$core_mask mask:$mask" |
|             } |
|             incr cpu_cnt |
|             exp_continue |
|         } |
|         timeout { |
|             fail "failed to get core mask" |
|         } |
|         -re "$srun" { |
|             # skip over the echo of the srun command line itself |
|             exp_continue |
|         } |
|         -re $test_prompt {} |
|     } |
|     return $core_mask |
| } |
| |
| # Look for OverSubscribe=FORCE on the default partition, with the scontrol |
| # output hidden from the test log. |
| set force 0 |
| log_user 0 |
| spawn $scontrol show partition [default_partition] |
| expect { |
|     -re "OverSubscribe=FORCE" { |
|         set force 1 |
|         exp_continue |
|     } |
|     eof { |
|         wait |
|     } |
| } |
| log_user 1 |
| if {$force} { |
|     skip "This test is not compatible with OverSubscribe=FORCE" |
| } |
| |
| # Find a usable node: run a one-node exclusive job with --verbose and take |
| # the node name from the "on host <name>" text in its output. |
| set node_name "" |
| set timeout $max_job_delay |
| spawn $srun -N1 --exclusive --verbose $bin_printenv SLURMD_NODENAME |
| expect { |
|     -re "on host ($re_word_str)" { |
|         set node_name $expect_out(1,string) |
|         exp_continue |
|     } |
|     eof { |
|         wait |
|     } |
|     timeout { |
|         fail "srun not responding" |
|     } |
| } |
| if {[string length $node_name] == 0} { |
|     fail "Failed to get a usable node name" |
| } |
| |
| # Read the CPU, socket, core and thread counts of the chosen node from |
| # "scontrol show node", with the command output hidden from the test log. |
| set num_threads 0 |
| set num_cores 0 |
| set num_sockets 0 |
| set num_cputot 0 |
| log_user 0 |
| spawn $scontrol show node $node_name |
| expect { |
|     -re "CoresPerSocket=($number)" { |
|         set num_cores $expect_out(1,string) |
|         exp_continue |
|     } |
|     -re "CPUTot=($number)" { |
|         set num_cputot $expect_out(1,string) |
|         exp_continue |
|     } |
|     -re "Sockets=($number)" { |
|         set num_sockets $expect_out(1,string) |
|         exp_continue |
|     } |
|     -re "ThreadsPerCore=($number)" { |
|         set num_threads $expect_out(1,string) |
|         exp_continue |
|     } |
|     eof { |
|         wait |
|     } |
| } |
| log_user 1 |
| # All four values must have been seen with a non-zero count |
| if {!($num_cputot != 0 && $num_sockets != 0 && $num_cores != 0 && $num_threads != 0)} { |
|     skip "Could not determine number of CPUs:Sockets:Cores:Threads (saw $num_cputot:$num_sockets:$num_cores:$num_threads)" |
| } |
| # Intel KNL nodes with 272 threads generate huge numeric masks |
| # which are too large for expect integer values to manage |
| set total_core_count [expr {$num_sockets * $num_cores}] |
| set total_thread_count [expr {$total_core_count * $num_threads}] |
| if {$total_thread_count > 64} { |
|     skip "Total thread count too large for Expect to process ($total_thread_count > 64). Expect unable to work with more than 32-bit numbers" |
| } |
| log_debug "Node config: CPUs=$num_cputot Sockets=$num_sockets Cores=$num_cores Threads=$num_threads" |
| # The checks below need every thread of the node to be a configured CPU |
| if {$total_thread_count > $num_cputot} { |
|     skip "CPUs=$num_cputot < (Sockets($num_sockets)*Cores($num_cores)*Threads($num_threads)). Test incompatible with lowered CPUs count" |
| } |
| |
| # Cleanup procedure (presumably invoked by the test framework at the end |
| # of the test; confirm against ./globals). |
| # |
| # Deletes the compiled helper program. If the test status is STATUS_FAIL |
| # and SlurmdParameters contains config_overrides, logs a warning that |
| # explains a likely cause of the failure. |
| proc cleanup {} { |
|     global file_prog test_status STATUS_FAIL |
| |
|     file delete $file_prog |
| |
|     # Nothing more to report unless the test failed |
|     if {$test_status != $STATUS_FAIL} { |
|         return |
|     } |
|     if {[param_contains [get_config_param "SlurmdParameters"] "config_overrides"]} { |
|         log_warn "This test can fail if the node configuration in slurm.conf (sockets, cores, threads) differs from the actual configuration or if using task/cgroup without task/affinity" |
|     } |
| } |
| |
| # |
| # Compile the helper program that reports affinity by task and make it |
| # executable for the owner only. |
| # |
| exec $bin_cc "-I$build_dir" "${file_prog}.c" -o $file_prog |
| exec $bin_chmod 700 $file_prog |
| |
| # |
| # Create an exclusive one-node allocation on the chosen node running bash. |
| # SLURM_CPU_BIND=verbose is placed in the environment before salloc starts. |
| # |
| set ::env(SLURM_CPU_BIND) "verbose" |
| spawn $salloc -w $node_name -N1 --exclusive --verbose -t2 $bin_bash |
| expect { |
|     -re "Granted job allocation ($number)" { |
|         set job_id $expect_out(1,string) |
|         reset_bash_prompt |
|         exp_continue |
|     } |
|     -re $test_prompt {} |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
| } |
| |
| ############################################################################# |
| # |
| # Run a job step to get allocated processor count. |
| # Every "TASK_ID:<n>,MASK:<n>" line printed by the helper program counts as |
| # one task; the total is kept in task_cnt for the later job steps. |
| # |
| set task_cnt 0 |
| send "$srun -c1 ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_cnt |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
|     -re "$srun" { |
|         # just so we don't grab the srun call |
|         exp_continue |
|     } |
|     -re $test_prompt {} |
| } |
| # A zero count would make the later steps run "srun -n 0" against an |
| # expected mask of 0, so stop here with an explicit message instead. |
| if {$task_cnt == 0} { |
|     fail "Job step reported no tasks, unable to determine allocated processor count" |
| } |
| ############################################################################# |
| # |
| # Run a job step with affinity to verify unique masks. |
| # The sum of all reported task masks must equal a mask with the low |
| # task_cnt bits set. |
| # |
| set task_mask 0 |
| set expected_mask [expr {(1 << $task_cnt) - 1}] |
| send "$srun -c1 -n $task_cnt --cpu-bind=threads ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_mask $expect_out(2,string) |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     -re "$srun" { |
|         # skip over the echo of the srun command line itself |
|         exp_continue |
|     } |
|     -re $test_prompt {} |
|     timeout { |
|         fail "srun (from --allocate) not responding or failure to recognize prompt" |
|     } |
| } |
| subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with affinity to verify unique masks" "$task_mask != $expected_mask" |
| |
| ############################################################################# |
| # |
| # Run a job step with affinity to verify unique masks with min -B 1:1:1 |
| # |
| # -B1:1:1 sets 1 thread-per-core, so the task count is capped at the |
| # number of cores. |
| set core_task_cnt [expr {$task_cnt > $total_core_count ? $total_core_count : $task_cnt}] |
| |
| # The expected mask is the core mask reported by lscpu |
| set expected_mask [get_core_mask] |
| log_debug "core_mask: $expected_mask" |
| |
| set task_mask 0 |
| send "$srun -c1 -n $core_task_cnt -B 1:1:1 ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_mask $expect_out(2,string) |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     -re "$srun" { |
|         # skip over the echo of the srun command line itself |
|         exp_continue |
|     } |
|     -re $test_prompt {} |
|     timeout { |
|         fail "srun (from --allocate) not responding or failure to recognize prompt" |
|     } |
| } |
| subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with affinity to verify unique masks with min -B 1:1:1" "$task_mask != $expected_mask" |
| |
| ############################################################################# |
| # |
| # Run varying number of sockets, verify task count and number of set bits. |
| # For each socket count from 1 to num_sockets the step is expected to run |
| # sockets * cores * threads tasks and to set that many mask bits in total. |
| # |
| for {set this_cnt 1} {$this_cnt <= $num_sockets} {incr this_cnt} { |
|     set expected_tasks [expr {$this_cnt * $num_cores * $num_threads}] |
|     set task_mask 0 |
|     set num_tasks 0 |
|     set num_bits 0 |
|     send "$srun -B $this_cnt-$this_cnt:$num_cores:$num_threads ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             set this_mask $expect_out(2,string) |
|             incr task_mask $this_mask |
|             incr num_tasks |
|             # add every set bit of this mask to the running total |
|             for {} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} { |
|                 incr num_bits [expr {$this_mask & 1}] |
|             } |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         -re "$srun" { |
|             # skip over the echo of the srun command line itself |
|             exp_continue |
|         } |
|         -re $test_prompt {} |
|         timeout { |
|             fail "srun (from --allocate) not responding or failure to recognize prompt" |
|         } |
|     } |
| |
|     subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt sockets" "$num_tasks != $expected_tasks" |
|     subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt sockets" "$num_bits != $expected_tasks" |
| } |
| |
| |
| ############################################################################# |
| # |
| # Run varying number of cores, verify task count and number of set bits. |
| # For each cores-per-socket count from 1 to num_cores the step is expected |
| # to run sockets * cores * threads tasks and to set that many mask bits. |
| # |
| for {set this_cnt 1} {$this_cnt <= $num_cores} {incr this_cnt} { |
|     set expected_tasks [expr {$num_sockets * $this_cnt * $num_threads}] |
|     set task_mask 0 |
|     set num_tasks 0 |
|     set num_bits 0 |
|     send "$srun -B $num_sockets:$this_cnt-$this_cnt:$num_threads ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             set this_mask $expect_out(2,string) |
|             incr task_mask $this_mask |
|             incr num_tasks |
|             # add every set bit of this mask to the running total |
|             for {} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} { |
|                 incr num_bits [expr {$this_mask & 1}] |
|             } |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         -re "$srun" { |
|             # skip over the echo of the srun command line itself |
|             exp_continue |
|         } |
|         -re $test_prompt {} |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|     } |
| |
|     subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt cores" "$num_tasks != $expected_tasks" |
|     subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt cores" "$num_bits != $expected_tasks" |
| } |
| |
| |
| ############################################################################# |
| # |
| # Run varying number of threads, verify task count and number of set bits. |
| # For each threads-per-core count from 1 to num_threads the step is expected |
| # to run sockets * cores * threads tasks and to set that many mask bits. |
| # |
| for {set this_cnt 1} {$this_cnt <= $num_threads} {incr this_cnt} { |
|     set expected_tasks [expr {$num_sockets * $num_cores * $this_cnt}] |
|     set task_mask 0 |
|     set num_tasks 0 |
|     set num_bits 0 |
|     send "$srun -B $num_sockets:$num_cores:$this_cnt-$this_cnt ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             set this_mask $expect_out(2,string) |
|             incr task_mask $this_mask |
|             incr num_tasks |
|             # add every set bit of this mask to the running total |
|             for {} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} { |
|                 incr num_bits [expr {$this_mask & 1}] |
|             } |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         -re "$srun" { |
|             # skip over the echo of the srun command line itself |
|             exp_continue |
|         } |
|         -re $test_prompt {} |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|     } |
| |
|     subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt threads" "$num_tasks != $expected_tasks" |
|     subtest {$num_bits == $expected_tasks} "Number of set bits is consistent for $this_cnt threads" "$num_bits != $expected_tasks" |
| } |
| |
| ############################################################################# |
| # |
| # Run varying cpus per task, verify task count and number of set bits. |
| # Each step is expected to run a single task whose mask has as many bits |
| # set as the requested cpus per task. |
| # |
| for {set this_cnt 1} {$this_cnt <= $core_task_cnt} {incr this_cnt} { |
|     set expected_tasks 1 |
|     set task_mask 0 |
|     set num_tasks 0 |
|     set num_bits 0 |
|     send "$srun -c$this_cnt -B 1:1:1 ./$file_prog\r" |
|     expect { |
|         -re "TASK_ID:($number),MASK:($number)" { |
|             set this_mask $expect_out(2,string) |
|             incr task_mask $this_mask |
|             incr num_tasks |
|             # add every set bit of this mask to the running total |
|             for {} {$this_mask > 0} {set this_mask [expr {$this_mask >> 1}]} { |
|                 incr num_bits [expr {$this_mask & 1}] |
|             } |
|             exp_continue |
|         } |
|         -re "error" { |
|             fail "Some error occurred" |
|         } |
|         -re "$srun" { |
|             # skip over the echo of the srun command line itself |
|             exp_continue |
|         } |
|         -re $test_prompt {} |
|         timeout { |
|             fail "salloc not responding or failure to recognize prompt" |
|         } |
|     } |
| |
|     subtest {$num_tasks == $expected_tasks} "Number of tasks is consistent for $this_cnt cpus per task" "$num_tasks != $expected_tasks" |
|     subtest {$num_bits == $this_cnt} "Number of set bits is consistent for $this_cnt cpus per task" "$num_bits != $this_cnt" |
| } |
| |
| ############################################################################# |
| # |
| # Run a job step with plane distribution to exercise option |
| # Automatic binding in slurm version 2.0 will bind one task per core |
| # The sum of all reported task masks must equal a mask with the low |
| # task_cnt bits set. |
| # |
| set task_mask 0 |
| set expected_mask [expr {(1 << $task_cnt) - 1}] |
| send "$srun -n $task_cnt -m plane=4 ./$file_prog\r" |
| expect { |
|     -re "TASK_ID:($number),MASK:($number)" { |
|         incr task_mask $expect_out(2,string) |
|         exp_continue |
|     } |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     -re "$srun" { |
|         # skip over the echo of the srun command line itself |
|         exp_continue |
|     } |
|     -re $test_prompt {} |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
| } |
| |
| subtest {$task_mask == $expected_mask} "Affinity mask is consistent for a job step with plane distribution" "$task_mask != $expected_mask" |
| |
| ############################################################################# |
| # |
| # Terminate the job, free the allocation. |
| # Leaving the shell should end the spawned salloc; wait for its EOF. |
| # |
| send "exit\r" |
| expect { |
|     -re "error" { |
|         fail "Some error occurred" |
|     } |
|     eof { |
|         wait |
|     } |
|     timeout { |
|         fail "salloc not responding or failure to recognize prompt" |
|     } |
| } |