blob: a3f3f47f6e82d3e6c2b48c7cdca90a3f959b8936 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test accounting for MPS resources with various allocation options
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
source ./globals
set file_in1 "$test_dir/input1"
set file_in2 "$test_dir/input2"
set file_out "$test_dir/output"
set job_id 0
#
# Helper for test_acct: inspect one sacct record and check every gres/mps
# entry found in its ReqTRES and AllocTRES fields.
#
# Arguments:
#	job_tres_dict	Dict keyed by sacct JobID; each value is a dict holding
#			the ReqTRES and AllocTRES strings of that record
#	record_id	JobID key to inspect (e.g. "123", "123.batch", "123.0")
#	record_kind	Word used in the failure message when the record is
#			missing ("job" or "step")
#	expected_mps	MPS count that every matching TRES entry must equal
#	have_mps_types	Non-zero to match typed entries (gres/mps:TYPE=N),
#			zero to match untyped entries (gres/mps=N)
#	test_message	Description handed to subtest for each entry found
#
# Returns the number of gres/mps entries found in the record.
#
proc _test_acct_check_record { job_tres_dict record_id record_kind expected_mps have_mps_types test_message } {
	if {![dict exists $job_tres_dict $record_id]} {
		fail "sacct did not report a record for $record_kind $record_id"
	}

	# Typed and untyped MPS are reported with different TRES names
	if {$have_mps_types} {
		set mps_regex {gres/mps:[^=]+=(\d+)}
	} else {
		set mps_regex {gres/mps=(\d+)}
	}

	set mps_reported_count 0
	foreach tres_value [dict values [dict get $job_tres_dict $record_id]] {
		# regexp -inline returns {full_match capture} pairs
		foreach {full_match mps_count} [regexp -all -inline $mps_regex $tres_value] {
			subtest {$mps_count == $expected_mps} $test_message "$mps_count != $expected_mps"
			incr mps_reported_count
		}
	}
	return $mps_reported_count
}

#
# Validate the job, batch step and step 0 of a job have the proper MPS counts
# No step to test if step_mps == -1
#
# Arguments:
#	job_id		Job to look up in sacct; nothing is checked if 0
#	job_mps		MPS count expected on the job record
#	step_mps	MPS count expected on step 0, or -1 to skip the step
#	req_mps		Per-node MPS request (accepted for the callers' benefit,
#			not used by the checks below)
#	have_mps_types	Non-zero if MPS is reported as gres/mps:TYPE=N
#	batch_mps	MPS count expected on the batch step record
#
# The job record must report MPS twice (the checks scan both ReqTRES and
# AllocTRES), while the batch step and step 0 must each report it once.
#
# NOTE: AllocTRES and ReqTRES values for all steps (including batch step)
# are reported based upon the job specification
#
proc test_acct { job_id job_mps step_mps req_mps have_mps_types batch_mps } {
	global sacct

	if {$job_id == 0} {
		return
	}

	log_debug "Job $job_id Expecting job MPS:$job_mps Step MPS:$step_mps"

	# Wait for ReqTRES to be populated for the job. Job completion records are
	# sent immediately to the database, but job start records get queued and
	# can take longer.
	wait_for_command_match -fail "$sacct -X -n -o ReqTRES --parsable2 -j $job_id" "gres/mps"

	set output [run_command_output -fail "$sacct --job=$job_id --parsable2 --start=now-15minutes --format JobID,ReqTRES,AllocTRES --noheader"]

	# Index the "JobID|ReqTRES|AllocTRES" lines by JobID
	set job_tres_dict [dict create]
	foreach line [split $output "\n"] {
		# A trailing newline yields an empty element; it is not a record
		if {[string trim $line] eq ""} {
			continue
		}
		lassign [split $line "|"] JobID ReqTRES AllocTRES
		dict set job_tres_dict $JobID ReqTRES   $ReqTRES
		dict set job_tres_dict $JobID AllocTRES $AllocTRES
	}

	# Check and count reported mps on the step
	if {$step_mps != -1} {
		set mps_reported_count [_test_acct_check_record $job_tres_dict "$job_id.0" "step" $step_mps $have_mps_types "Verify step MPS count reported by sacct"]
		subtest {$mps_reported_count == 1} "sacct should report step MPS 1 time" "$mps_reported_count != 1"
	}

	# Check and count reported batch mps on the job
	set mps_reported_count [_test_acct_check_record $job_tres_dict "$job_id.batch" "job" $batch_mps $have_mps_types "Batch MPS reported by sacct should be $batch_mps"]
	subtest {$mps_reported_count == 1} "sacct should report batch MPS 1 time" "found $mps_reported_count times"

	# Check and count reported mps on the job
	set mps_reported_count [_test_acct_check_record $job_tres_dict $job_id "job" $job_mps $have_mps_types "Verify job MPS count reported by sacct"]
	subtest {$mps_reported_count == 2} "sacct should report job MPS 2 times" "found $mps_reported_count times"
}
#
# Verify the MPS count reported in the AllocTRES field of a job output file
#
# Arguments:
#	file_out	Path of the output file to scan (waited for first)
#	target		MPS count the file is expected to report
#
# Both the untyped (gres/mps=N) and typed (gres/mps:TYPE=N) forms are
# recognized. Every match overwrites the previous one, so the last value
# seen is the one compared against target; 0 is used if nothing matched.
#
proc test_out_file { file_out target } {
	global bin_cat
	global number re_word_str

	wait_for_file -fail $file_out

	set reported_mps 0
	spawn $bin_cat $file_out
	expect {
		-re "AllocTRES=.*,gres/mps=($number)" {
			# Untyped form: the count is the first capture group
			set reported_mps $expect_out(1,string)
			exp_continue
		}
		-re "AllocTRES=.*,gres/mps:($re_word_str)=($number)" {
			# Typed form: group 1 is the type, group 2 the count
			set reported_mps $expect_out(2,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	subtest {$reported_mps == $target} "Verify MPS accounting" "$reported_mps != $target"
}
#
# Helper function to find $batch_mps from different outputs
#
# Arguments:
#	file_out	File whose content is searched for " Nodes=" and
#			"BatchHost=" lines
#
# Returns the MPS count parsed from the selected " Nodes=" line. Calls fail
# if no "mps:<count>" (optionally "mps:<type>:<count>") can be parsed.
#
proc get_batch_mps { file_out } {
	global bin_cat

	set batch_mps  "unknown"
	set batch_host "unknown"
	set job_info [run_command_output -fail "$bin_cat $file_out"]

	set node_lines [regexp -all -line -inline { Nodes=+.*} $job_info]
	if {[llength $node_lines] > 1} {
		# Output type where nodes are split on 2 lines, pick the line that
		# mentions the BatchHost node
		# BatchHost=74dc179a_n1
		# ...
		# >Nodes=74dc179a_n1 CPU_IDs=0-1 Mem=150 GRES=[[mps:2]](IDX:0-1)<
		# Nodes=74dc179a_n2 CPU_IDs=0-1 Mem=150 GRES=mps:1(IDX:0)
		regexp -all -line {BatchHost=(.*)} $job_info - batch_host
		set node_line [lindex [lsearch -all -inline $node_lines *$batch_host*] 0]
	} else {
		# Zero or one " Nodes=" line: use it as is (empty if none)
		set node_line [lindex $node_lines 0]
	}

	set parsed [regexp {mps:(?:[^:( ]+:)?(\d+)} $node_line - batch_mps]
	if {!$parsed} {
		fail "Unable to get batch_mps"
	}

	return $batch_mps
}
#
# Prerequisites: skip the test unless the configuration tracks gres/mps in
# AccountingStorageTRES, uses select/cons_tres, stores accounting data through
# slurmdbd, and can satisfy a 2-node request for --gres=mps:100
#
set store_tres [string tolower [get_config_param "AccountingStorageTRES"]]
set store_mps [string first "gres/mps" $store_tres]
if {$store_mps == -1} {
	skip "This test requires accounting for MPS"
}
if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test requires AccountingStorageType=accounting_storage/slurmdbd"
}

# Number of nodes returned for the request; used as -N for the test jobs
set nb_nodes [llength [get_nodes_by_request "--gres=mps:100 -N2 -t2"]]
if {$nb_nodes == 0} {
	skip "This test requires being able to submit job with --gres=mps:100 -N2"
}
#
# Cancel the job recorded in the global job_id
# NOTE(review): presumably invoked by the test framework at exit - confirm
# against ./globals
#
proc cleanup {} {
	cancel_job $::job_id
}
#
# Test --gres=mps option by job
#
# The batch script only dumps "scontrol -dd show job"; the MPS total found in
# its AllocTRES field must equal nb_nodes * req_mps, and sacct must agree.
#
log_info "TEST 1: --gres=mps option by job"

make_bash_script $file_in1 "
$scontrol -dd show job \${SLURM_JOBID}
exit 0"

set req_mps 49
# Braced so the expression is parsed once and compiled
set target [expr {$nb_nodes * $req_mps}]

exec $bin_rm -f $file_out
set timeout $max_job_delay
spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}

wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out
set batch_mps [get_batch_mps $file_out]

# Read the allocated MPS count from the job output and note whether MPS is
# reported with a type (gres/mps:TYPE=N); have_mps_types is reused by the
# sacct checks
set have_mps_types 0
set match 0
spawn $bin_cat $file_out
expect {
	-re "AllocTRES=.*,gres/mps=($number)" {
		set match $expect_out(1,string)
		exp_continue
	}
	-re "AllocTRES=.*,gres/mps:($re_word_str)=($number)" {
		# Only fall back to the typed count if no untyped count was seen
		if {$match == 0} {
			set have_mps_types 1
			set match $expect_out(2,string)
		}
		exp_continue
	}
	eof {
		wait
	}
}
if {$match != $target} {
	fail "Failed to account for proper MPS count ($match != $target)"
}

# No job step was launched, so step_mps is -1
test_acct $job_id $target -1 $req_mps $have_mps_types $batch_mps
#
# Test --gres=mps option by step
#
# The batch script launches file_in2 through srun; task 0 of that step dumps
# the job and step information into the job output file.
#
make_bash_script $file_in1 "
$srun $file_in2
exit 0"

make_bash_script $file_in2 "
if \[ \$SLURM_PROCID -eq 0 \]; then
$scontrol -dd show job \${SLURM_JOBID}
$scontrol show step \${SLURM_JOBID}.\${SLURM_STEPID}
fi
exit 0"

log_info "TEST 2: --gres=mps option by step"

set req_mps 51
# Braced so the expression is parsed once and compiled
set target [expr {$nb_nodes * $req_mps}]

exec $bin_rm -f $file_out
set job_id 0
set timeout $max_job_delay
spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}

wait_for_job -fail $job_id "DONE"
# The output file may not be visible the moment the job is DONE; wait for it
# before get_batch_mps reads it
wait_for_file -fail $file_out
set batch_mps [get_batch_mps $file_out]

test_out_file $file_out $target
# have_mps_types is the value determined by the preceding test
test_acct $job_id $target $target $req_mps $have_mps_types $batch_mps