| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test accounting for MPS resources with various allocation options |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. |
| ############################################################################ |
| source ./globals |
| |
# Id of the job currently under test; 0 means no job has been submitted
set job_id   0

# Batch/step scripts and the job output file, all located under $test_dir
set file_in1 "${test_dir}/input1"
set file_in2 "${test_dir}/input2"
set file_out "${test_dir}/output"
| |
#
# Helper for test_acct: return the list of MPS counts that sacct reported for
# one record, scanning its ReqTRES value first and its AllocTRES value second
#
# job_tres_dict  - dict of JobID -> {ReqTRES <value> AllocTRES <value>}
# record         - JobID key to look up (e.g. "123", "123.batch", "123.0")
# record_desc    - description of the record, used in the failure message
# have_mps_types - true to match typed entries (gres/mps:<type>=N),
#                  false to match untyped entries (gres/mps=N)
#
# Fails the test if the record is not present in job_tres_dict
#
proc _acct_mps_counts { job_tres_dict record record_desc have_mps_types } {
	if {![dict exists $job_tres_dict $record]} {
		fail "sacct did not report a record for $record_desc"
	}

	if {$have_mps_types} {
		set mps_re {gres/mps:[^=]+=(\d+)}
	} else {
		set mps_re {gres/mps=(\d+)}
	}

	set mps_counts [list]
	foreach tres_value [dict values [dict get $job_tres_dict $record]] {
		# "regexp -all -inline" returns {full_match count} pairs
		foreach {full_match mps_count} [regexp -all -inline $mps_re $tres_value] {
			lappend mps_counts $mps_count
		}
	}
	return $mps_counts
}

#
# Validate the job, batch step and step 0 of a job have the proper MPS counts
# No step to test if step_mps == -1
#
# job_id         - job to validate; nothing is done if it is 0
# job_mps        - MPS count expected in each TRES field of the job record
# step_mps       - MPS count expected on step 0, or -1 to skip the step check
# req_mps        - requested MPS count (not used by the checks below)
# have_mps_types - true if the TRES are reported as gres/mps:<type>=N
# batch_mps      - MPS count expected on the batch step
#
# NOTE: AllocTRES and ReqTRES values for all steps (including batch step)
#	are reported based upon the job specification
#
proc test_acct { job_id job_mps step_mps req_mps have_mps_types batch_mps } {
	global sacct

	if {$job_id == 0} {
		return
	}

	log_debug "Job $job_id Expecting job MPS:$job_mps Step MPS:$step_mps"

	# Wait for ReqTRES to be populated for the job. Job completion records are
	# sent immediately to the database, but job start records get queued and
	# can take longer.
	wait_for_command_match -fail "$sacct -X -n -o ReqTRES --parsable2 -j $job_id" "gres/mps"

	# Index the ReqTRES/AllocTRES values of every reported record by JobID
	set output [run_command_output -fail "$sacct --job=$job_id --parsable2 --start=now-15minutes --format JobID,ReqTRES,AllocTRES --noheader"]
	set job_tres_dict [dict create]
	foreach line [split $output "\n"] {
		# Drop any carriage return and ignore blank lines so that they
		# do not create a bogus record
		set line [string trim $line]
		if {$line eq ""} {
			continue
		}
		lassign [split $line "|"] JobID ReqTRES AllocTRES
		dict set job_tres_dict $JobID [dict create ReqTRES $ReqTRES AllocTRES $AllocTRES]
	}

	# Check and count reported mps on the step
	if {$step_mps != -1} {
		set step_counts [_acct_mps_counts $job_tres_dict "$job_id.0" "step $job_id.0" $have_mps_types]
		foreach mps_count $step_counts {
			subtest {$mps_count == $step_mps} "Verify step MPS count reported by sacct" "$mps_count != $step_mps"
		}
		set mps_reported_count [llength $step_counts]
		subtest {$mps_reported_count == 1} "sacct should report step MPS 1 time" "found $mps_reported_count times"
	}

	# Check and count reported batch mps on the job
	set batch_counts [_acct_mps_counts $job_tres_dict "$job_id.batch" "job $job_id.batch" $have_mps_types]
	foreach mps_count $batch_counts {
		subtest {$mps_count == $batch_mps} "Batch MPS reported by sacct should be $batch_mps" "$mps_count != $batch_mps"
	}
	set mps_reported_count [llength $batch_counts]
	subtest {$mps_reported_count == 1} "sacct should report batch MPS 1 time" "found $mps_reported_count times"

	# Check and count reported mps on the job (once per TRES field)
	set job_counts [_acct_mps_counts $job_tres_dict $job_id "job $job_id" $have_mps_types]
	foreach mps_count $job_counts {
		subtest {$mps_count == $job_mps} "Verify job MPS count reported by sacct" "$mps_count != $job_mps"
	}
	set mps_reported_count [llength $job_counts]
	subtest {$mps_reported_count == 2} "sacct should report job MPS 2 times" "found $mps_reported_count times"
}
| |
#
# Verify the MPS count reported in the AllocTRES field of a job's output file
#
# file_out - file to scan; the proc waits for it to exist before reading it
# target   - MPS count that the last matching AllocTRES entry must equal
#
# Both the untyped (gres/mps=N) and typed (gres/mps:<type>=N) forms are
# recognized. If nothing matches, the count compared against target is 0.
#
proc test_out_file { file_out target } {
	global re_word_str number bin_cat

	wait_for_file -fail $file_out

	set untyped_re "AllocTRES=.*,gres/mps=($number)"
	set typed_re "AllocTRES=.*,gres/mps:($re_word_str)=($number)"

	# Every match overwrites the previous one, so the last match wins
	set alloc_mps 0
	spawn $bin_cat $file_out
	expect {
		-re $untyped_re {
			set alloc_mps $expect_out(1,string)
			exp_continue
		}
		-re $typed_re {
			# Group 1 is the MPS type, group 2 is the count
			set alloc_mps $expect_out(2,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	subtest {$alloc_mps == $target} "Verify MPS accounting" "$alloc_mps != $target"
}
| |
#
# Helper function to find $batch_mps from different outputs
#
# file_out - file whose content holds the "Nodes=... GRES=mps:..." detail
#            lines (and a BatchHost= line) of a job
#
# Returns the MPS count found on the detail line of the batch host, or on the
# only detail line if there is just one. Fails the test if no count is found.
#
proc get_batch_mps { file_out } {
	global bin_cat

	set batch_mps "unknown"
	set output [run_command_output -fail "$bin_cat $file_out"]
	set nodes [regexp -all -line -inline { Nodes=+.*} $output]
	set node_line [lindex $nodes 0]

	if {[llength $nodes] > 1} {
		# Output type where nodes are split on 2 lines, set $node_line to
		# grab the BatchHost node (first line in this example):
		#   BatchHost=74dc179a_n1
		#   ...
		#   Nodes=74dc179a_n1 CPU_IDs=0-1 Mem=150 GRES=mps:2(IDX:0-1)
		#   Nodes=74dc179a_n2 CPU_IDs=0-1 Mem=150 GRES=mps:1(IDX:0)

		# Capture only the host name: "(.*)" would also swallow a trailing
		# carriage return or spaces and then never match a node line
		set batch_host "unknown"
		regexp -all -line {BatchHost=(\S+)} $output - batch_host

		# Prefer the line whose node name is exactly the batch host, so
		# that host "n1" is not confused with "n10". Fall back to the
		# first line merely containing the host name otherwise.
		set node_line ""
		foreach candidate $nodes {
			if {[regexp { Nodes=+(\S+)} $candidate - node_names] && $node_names eq $batch_host} {
				set node_line $candidate
				break
			}
			if {$node_line eq "" && [string first $batch_host $candidate] != -1} {
				set node_line $candidate
			}
		}
	}

	# Accept both "mps:<count>" and "mps:<type>:<count>"
	if {![regexp {mps:(?:[^:( ]+:)?(\d+)} $node_line - batch_mps]} {
		fail "Unable to get batch_mps"
	}
	return $batch_mps
}
| |
#
# Skip the test unless the configuration supports MPS accounting
#

# gres/mps must be one of the TRES tracked by accounting
set store_tres [string tolower [get_config_param "AccountingStorageTRES"]]
set store_mps [string first "gres/mps" $store_tres]
if {$store_mps == -1} {
	skip "This test requires accounting for MPS"
}

if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}

if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test requires AccountingStorageType=slurmdbd"
}

# Number of nodes usable by a 2-node job requesting MPS
set nb_nodes [llength [get_nodes_by_request "--gres=mps:100 -N2 -t2"]]
if {$nb_nodes == 0} {
	skip "This test requires being able to submit job with --gres=mps:100 -N2"
}
| |
#
# Cancel the job recorded in the global job_id
#
proc cleanup {} {
	cancel_job $::job_id
}
| |
#
# Test --gres=mps option by job
#
# The batch script prints the detailed job record. The MPS count is read from
# its AllocTRES field, compared with the expected total and then checked
# against the accounting records.
#
log_info "TEST 1: --gres=mps option by job"

make_bash_script $file_in1 "
$scontrol -dd show job \${SLURM_JOBID}
exit 0"

# Expected total: the per-node request times the number of nodes
set req_mps 49
set target [expr {$nb_nodes * $req_mps}]

exec $bin_rm -f $file_out
set timeout $max_job_delay
spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}

wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out

set batch_mps [get_batch_mps $file_out]

# Parse the allocated MPS count and detect whether MPS is reported with a
# type (gres/mps:<type>=N). A typed entry is only used when no count has
# been recorded yet.
set have_mps_types 0
set alloc_mps 0
spawn $bin_cat $file_out
expect {
	-re "AllocTRES=.*,gres/mps=($number)" {
		set alloc_mps $expect_out(1,string)
		exp_continue
	}
	-re "AllocTRES=.*,gres/mps:($re_word_str)=($number)" {
		if {$alloc_mps == 0} {
			set have_mps_types 1
			set alloc_mps $expect_out(2,string)
		}
		exp_continue
	}
	eof {
		wait
	}
}
if {$alloc_mps != $target} {
	fail "Failed to account for proper MPS count ($alloc_mps != $target)"
}

# No job step was launched, hence step_mps is -1
test_acct $job_id $target -1 $req_mps $have_mps_types $batch_mps
| |
#
# Test --gres=mps option by step
#
# The batch script launches a step with srun; task 0 of that step prints the
# detailed job record and the step record.
#
# NOTE: $have_mps_types must already be set when this runs; it is assigned
#	while the output of TEST 1 is parsed
#
log_info "TEST 2: --gres=mps option by step"

make_bash_script $file_in1 "
$srun $file_in2
exit 0"

make_bash_script $file_in2 "
if \[ \$SLURM_PROCID -eq 0 \]; then
$scontrol -dd show job \${SLURM_JOBID}
$scontrol show step \${SLURM_JOBID}.\${SLURM_STEPID}
fi
exit 0"

# Expected total: the per-node request times the number of nodes
set req_mps 51
set target [expr {$nb_nodes * $req_mps}]

exec $bin_rm -f $file_out
set job_id 0
set timeout $max_job_delay
spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}

wait_for_job -fail $job_id "DONE"

# Wait for the output file before reading it, as TEST 1 does: the file may
# not be visible yet when the job is reported as done
wait_for_file -fail $file_out
set batch_mps [get_batch_mps $file_out]

test_out_file $file_out $target
test_acct $job_id $target $target $req_mps $have_mps_types $batch_mps