|  | #!/usr/bin/env expect | 
|  | ############################################################################ | 
|  | # Purpose: Test of Slurm functionality | 
|  | #          Test accounting for MPS resources with various allocation options | 
|  | ############################################################################ | 
|  | # Copyright (C) SchedMD LLC. | 
|  | # | 
|  | # This file is part of Slurm, a resource management program. | 
|  | # For details, see <https://slurm.schedmd.com/>. | 
|  | # Please also read the included file: DISCLAIMER. | 
|  | # | 
|  | # Slurm is free software; you can redistribute it and/or modify it under | 
|  | # the terms of the GNU General Public License as published by the Free | 
|  | # Software Foundation; either version 2 of the License, or (at your option) | 
|  | # any later version. | 
|  | # | 
|  | # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | # details. | 
|  | # | 
|  | # You should have received a copy of the GNU General Public License along | 
|  | # with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | # 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA. | 
|  | ############################################################################ | 
|  | source ./globals | 
|  |  | 
|  | # Scratch files used by the tests below: two generated scripts and the | 
|  | # file receiving the batch job's output. | 
|  | set file_in1 ${test_dir}/input1 | 
|  | set file_in2 ${test_dir}/input2 | 
|  | set file_out ${test_dir}/output | 
|  | # Id of the most recently submitted job; 0 means no job was submitted | 
|  | set job_id 0 | 
|  |  | 
|  | # | 
|  | # Helper for test_acct: run one subtest per MPS entry found in the TRES | 
|  | # values (ReqTRES and AllocTRES) of a single sacct record. | 
|  | # | 
|  | # Arguments: | 
|  | #	tres_dict       dict holding the ReqTRES/AllocTRES strings of the record | 
|  | #	have_mps_types  non-zero: match typed entries (gres/mps:<type>=N), | 
|  | #	                zero: match untyped entries (gres/mps=N) | 
|  | #	expected_mps    MPS count every matched entry must report | 
|  | #	description     subtest description to use for each entry | 
|  | # | 
|  | # Returns the number of MPS entries found in the record. | 
|  | # | 
|  | proc _test_acct_tres { tres_dict have_mps_types expected_mps description } { | 
|  | 	if {$have_mps_types} { | 
|  | 		set mps_re {gres/mps:[^=]+=(\d+)} | 
|  | 	} else { | 
|  | 		set mps_re {gres/mps=(\d+)} | 
|  | 	} | 
|  |  | 
|  | 	set mps_reported_count 0 | 
|  | 	foreach tres_value [dict values $tres_dict] { | 
|  | 		# regexp -inline returns {full-match capture} pairs | 
|  | 		foreach {full_match mps_count} [regexp -all -inline -- $mps_re $tres_value] { | 
|  | 			subtest {$mps_count == $expected_mps} $description "$mps_count != $expected_mps" | 
|  | 			incr mps_reported_count | 
|  | 		} | 
|  | 	} | 
|  | 	return $mps_reported_count | 
|  | } | 
|  |  | 
|  | # | 
|  | # Validate the job, batch step and step 0 of a job have the proper MPS counts | 
|  | # as reported by sacct. | 
|  | # | 
|  | # Arguments: | 
|  | #	job_id          job to inspect; nothing is checked when it is 0 | 
|  | #	job_mps         MPS count expected on the job record | 
|  | #	step_mps        MPS count expected on step 0; -1 means no step to test | 
|  | #	req_mps         requested MPS count (not used here, kept for callers) | 
|  | #	have_mps_types  non-zero when sacct entries carry an MPS type | 
|  | #	batch_mps       MPS count expected on the batch step | 
|  | # | 
|  | # The job record must report MPS exactly twice (presumably once in ReqTRES | 
|  | # and once in AllocTRES), the batch step and step 0 exactly once each. | 
|  | # A missing sacct record is a fatal test failure. | 
|  | # | 
|  | # NOTE: AllocTRES and ReqTRES values for all steps (including batch step) | 
|  | #	are reported based upon the job specification | 
|  | # | 
|  | proc test_acct { job_id job_mps step_mps req_mps have_mps_types batch_mps } { | 
|  | 	global sacct | 
|  |  | 
|  | 	if {$job_id == 0} { | 
|  | 		return | 
|  | 	} | 
|  |  | 
|  | 	log_debug "Job $job_id Expecting job MPS:$job_mps  Step MPS:$step_mps" | 
|  |  | 
|  | 	# Wait for ReqTRES to be populated for the job. Job completion records are | 
|  | 	# sent immediately to the database, but job start records get queued and | 
|  | 	# can take longer. | 
|  | 	wait_for_command_match -fail "$sacct -X -n -o ReqTRES --parsable2 -j $job_id" "gres/mps" | 
|  |  | 
|  | 	# Index the ReqTRES/AllocTRES strings by JobID (job, job.batch, job.0, ...) | 
|  | 	set output [run_command_output -fail "$sacct --job=$job_id --parsable2 --start=now-15minutes --format JobID,ReqTRES,AllocTRES --noheader"] | 
|  | 	set job_tres_dict [dict create] | 
|  | 	foreach line [split $output "\n"] { | 
|  | 		# Skip blank lines so they do not create a record with an empty key | 
|  | 		if {[string trim $line] eq ""} { | 
|  | 			continue | 
|  | 		} | 
|  | 		lassign [split $line "|"] JobID ReqTRES AllocTRES | 
|  | 		dict set job_tres_dict $JobID ReqTRES $ReqTRES | 
|  | 		dict set job_tres_dict $JobID AllocTRES $AllocTRES | 
|  | 	} | 
|  |  | 
|  | 	# Check and count reported mps on the step | 
|  | 	if {$step_mps != -1} { | 
|  | 		if {![dict exists $job_tres_dict "$job_id.0"]} { | 
|  | 			fail "sacct did not report a record for step $job_id.0" | 
|  | 		} | 
|  | 		set mps_reported_count [_test_acct_tres [dict get $job_tres_dict "$job_id.0"] $have_mps_types $step_mps "Verify step MPS count reported by sacct"] | 
|  | 		subtest {$mps_reported_count == 1} "sacct should report step MPS 1 time" "$mps_reported_count != 1" | 
|  | 	} | 
|  |  | 
|  | 	# Check and count reported batch mps on the job | 
|  | 	if {![dict exists $job_tres_dict "$job_id.batch"]} { | 
|  | 		fail "sacct did not report a record for job $job_id.batch" | 
|  | 	} | 
|  | 	set mps_reported_count [_test_acct_tres [dict get $job_tres_dict "$job_id.batch"] $have_mps_types $batch_mps "Batch MPS reported by sacct should be $batch_mps"] | 
|  | 	subtest {$mps_reported_count == 1} "sacct should report batch MPS 1 time" "found $mps_reported_count times" | 
|  |  | 
|  | 	# Check and count reported mps on the job | 
|  | 	if {![dict exists $job_tres_dict $job_id]} { | 
|  | 		fail "sacct did not report a record for job $job_id" | 
|  | 	} | 
|  | 	set mps_reported_count [_test_acct_tres [dict get $job_tres_dict $job_id] $have_mps_types $job_mps "Verify job MPS count reported by sacct"] | 
|  | 	subtest {$mps_reported_count == 2} "sacct should report job MPS 2 times" "found $mps_reported_count times" | 
|  | } | 
|  |  | 
|  | # | 
|  | # Verify the MPS count found in the AllocTRES field of a job output file | 
|  | # | 
|  | # Arguments: | 
|  | #	file_out  file to scan (waited for before it is read) | 
|  | #	target    MPS count the file is expected to report | 
|  | # | 
|  | # Both the untyped (gres/mps=N) and the typed (gres/mps:<type>=N) forms are | 
|  | # recognized; the value of the last match is compared against target, and | 
|  | # 0 is used when nothing matched. | 
|  | # | 
|  | proc test_out_file { file_out target } { | 
|  | 	global bin_cat number re_word_str | 
|  |  | 
|  | 	# Stays 0 when no AllocTRES MPS entry is found in the file | 
|  | 	set match 0 | 
|  |  | 
|  | 	wait_for_file -fail $file_out | 
|  |  | 
|  | 	spawn $bin_cat $file_out | 
|  | 	expect { | 
|  | 		-re "AllocTRES=.*,gres/mps=($number)" { | 
|  | 			# Untyped entry: the count is the first capture | 
|  | 			set match $expect_out(1,string) | 
|  | 			exp_continue | 
|  | 		} | 
|  | 		-re "AllocTRES=.*,gres/mps:($re_word_str)=($number)" { | 
|  | 			# Typed entry: capture 1 is the type, capture 2 the count | 
|  | 			set match $expect_out(2,string) | 
|  | 			exp_continue | 
|  | 		} | 
|  | 		eof { | 
|  | 			wait | 
|  | 		} | 
|  | 	} | 
|  |  | 
|  | 	subtest {$match == $target} "Verify MPS accounting" "$match != $target" | 
|  | } | 
|  |  | 
|  | # | 
|  | # Helper function to find $batch_mps from different outputs | 
|  | # | 
|  | # Arguments: | 
|  | #	file_out  file holding the job description to parse | 
|  | # | 
|  | # Returns the MPS count found on the "Nodes=" line of the batch host. | 
|  | # Fails the test when no MPS count can be extracted. | 
|  | # | 
|  | proc get_batch_mps { file_out } { | 
|  | 	global bin_cat | 
|  |  | 
|  | 	set batch_host "unknown" | 
|  | 	set batch_mps  "unknown" | 
|  | 	set output [run_command_output -fail "$bin_cat $file_out"] | 
|  | 	set nodes [regexp -all -line -inline {    Nodes=+.*} $output] | 
|  | 	set node_line [lindex $nodes 0] | 
|  |  | 
|  | 	if {[llength $nodes] > 1} { | 
|  | 		# Output type where nodes are split on several lines: use the line | 
|  | 		# describing the BatchHost node, e.g. | 
|  | 		#  BatchHost=74dc179a_n1 | 
|  | 		#  ... | 
|  | 		#  Nodes=74dc179a_n1 CPU_IDs=0-1 Mem=150 GRES=mps:2(IDX:0-1)   <- wanted | 
|  | 		#  Nodes=74dc179a_n2 CPU_IDs=0-1 Mem=150 GRES=mps:1(IDX:0) | 
|  | 		regexp -all -line {BatchHost=(.*)} $output - batch_host | 
|  | 		# Drop trailing blanks or carriage returns captured up to end of line | 
|  | 		set batch_host [string trim $batch_host] | 
|  |  | 
|  | 		# Prefer an exact "Nodes=<host> " match: a plain substring match | 
|  | 		# would also select e.g. the n10 line when the batch host is n1. | 
|  | 		set node_line "" | 
|  | 		foreach candidate $nodes { | 
|  | 			if {[string first "Nodes=$batch_host " $candidate] != -1} { | 
|  | 				set node_line $candidate | 
|  | 				break | 
|  | 			} | 
|  | 		} | 
|  |  | 
|  | 		# Fall back to the first line merely containing the host name | 
|  | 		if {$node_line eq ""} { | 
|  | 			set node_line [lindex [lsearch -all -inline $nodes *$batch_host*] 0] | 
|  | 		} | 
|  | 	} | 
|  |  | 
|  | 	# Accept both "mps:N" and "mps:<type>:N" | 
|  | 	if {![regexp {mps:(?:[^:( ]+:)?(\d+)} $node_line - batch_mps]} { | 
|  | 		fail "Unable to get batch_mps" | 
|  | 	} | 
|  | 	return $batch_mps | 
|  | } | 
|  |  | 
|  | # MPS must be one of the TRES tracked by accounting | 
|  | set store_tres [string tolower [get_config_param "AccountingStorageTRES"]] | 
|  | set store_mps [string first "gres/mps" $store_tres] | 
|  | if {$store_mps == -1} { | 
|  | 	skip "This test requires accounting for MPS" | 
|  | } | 
|  |  | 
|  | if {![check_config_select "cons_tres"]} { | 
|  | 	skip "This test is only compatible with select/cons_tres" | 
|  | } | 
|  |  | 
|  | # The skip message names the parameter exactly as it is queried above it | 
|  | if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} { | 
|  | 	skip "This test requires AccountingStorageType=slurmdbd" | 
|  | } | 
|  |  | 
|  | # Number of nodes returned for a two-node request with 100 MPS per node | 
|  | set nb_nodes [llength [get_nodes_by_request "--gres=mps:100 -N2 -t2"]] | 
|  | if {$nb_nodes == 0} { | 
|  | 	skip "This test requires being able to submit job with --gres=mps:100 -N2" | 
|  | } | 
|  |  | 
|  | # | 
|  | # Cancel the job whose id is stored in the global job_id | 
|  | # (presumably invoked by the test framework when the test ends) | 
|  | # | 
|  | proc cleanup {} { | 
|  | 	cancel_job $::job_id | 
|  | } | 
|  |  | 
|  | # | 
|  | # Test --gres=mps option by job | 
|  | # | 
|  |  | 
|  | log_info "TEST 1: --gres=mps option by job" | 
|  |  | 
|  | # The batch script only dumps the detailed job description to its output | 
|  | make_bash_script $file_in1 " | 
|  | $scontrol -dd show job \${SLURM_JOBID} | 
|  | exit 0" | 
|  |  | 
|  | set req_mps 49 | 
|  | # Braced so the expression is substituted and parsed only once | 
|  | set target [expr {$nb_nodes * $req_mps}] | 
|  | exec $bin_rm -f $file_out | 
|  | set timeout $max_job_delay | 
|  | spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1 | 
|  | expect { | 
|  | 	-re "Submitted batch job ($number)" { | 
|  | 		set job_id $expect_out(1,string) | 
|  | 		exp_continue | 
|  | 	} | 
|  | 	timeout { | 
|  | 		fail "sbatch not responding" | 
|  | 	} | 
|  | 	eof { | 
|  | 		wait | 
|  | 	} | 
|  | } | 
|  | if {$job_id == 0} { | 
|  | 	fail "Job not submitted" | 
|  | } | 
|  |  | 
|  | wait_for_job -fail $job_id "DONE" | 
|  |  | 
|  | wait_for_file -fail $file_out | 
|  |  | 
|  | set batch_mps [get_batch_mps $file_out] | 
|  |  | 
|  | # Find the allocated MPS count in the job output and detect whether MPS | 
|  | # entries carry a type; have_mps_types is reused by the following test. | 
|  | set have_mps_types 0 | 
|  | set match 0 | 
|  | spawn $bin_cat $file_out | 
|  | expect { | 
|  | 	-re "AllocTRES=.*,gres/mps=($number)" { | 
|  | 		set match $expect_out(1,string) | 
|  | 		exp_continue | 
|  | 	} | 
|  | 	-re "AllocTRES=.*,gres/mps:($re_word_str)=($number)" { | 
|  | 		# Only trust the typed entry when no untyped one was seen first | 
|  | 		if {$match == 0} { | 
|  | 			set have_mps_types 1 | 
|  | 			set match $expect_out(2,string) | 
|  | 		} | 
|  | 		exp_continue | 
|  | 	} | 
|  | 	eof { | 
|  | 		wait | 
|  | 	} | 
|  | } | 
|  | if {$match != $target} { | 
|  | 	fail "Failed to account for proper MPS count ($match != $target)" | 
|  | } | 
|  | # No job step was launched, hence step_mps is -1 | 
|  | test_acct $job_id $target -1 $req_mps $have_mps_types $batch_mps | 
|  |  | 
|  | # | 
|  | # Test --gres=mps option by step | 
|  | # | 
|  | # The batch script launches a step running the second script | 
|  | make_bash_script $file_in1 " | 
|  | $srun $file_in2 | 
|  | exit 0" | 
|  |  | 
|  | # Only task 0 of the step dumps the job and step descriptions | 
|  | make_bash_script $file_in2 " | 
|  | if \[ \$SLURM_PROCID -eq 0 \]; then | 
|  | $scontrol -dd show job \${SLURM_JOBID} | 
|  | $scontrol show step \${SLURM_JOBID}.\${SLURM_STEPID} | 
|  | fi | 
|  | exit 0" | 
|  |  | 
|  | log_info "TEST 2: --gres=mps option by step" | 
|  |  | 
|  | set req_mps 51 | 
|  | # Braced so the expression is substituted and parsed only once | 
|  | set target [expr {$nb_nodes * $req_mps}] | 
|  | exec $bin_rm -f $file_out | 
|  | set job_id 0 | 
|  | set timeout $max_job_delay | 
|  | spawn $sbatch --gres=craynetwork:0 --gres=mps:$req_mps -N$nb_nodes -t1 -o $file_out -J $test_name $file_in1 | 
|  | expect { | 
|  | 	-re "Submitted batch job ($number)" { | 
|  | 		set job_id $expect_out(1,string) | 
|  | 		exp_continue | 
|  | 	} | 
|  | 	timeout { | 
|  | 		fail "sbatch not responding" | 
|  | 	} | 
|  | 	eof { | 
|  | 		wait | 
|  | 	} | 
|  | } | 
|  | if {$job_id == 0} { | 
|  | 	fail "Job not submitted" | 
|  | } | 
|  |  | 
|  | wait_for_job -fail $job_id "DONE" | 
|  |  | 
|  | # The output file may not be visible the instant the job is done; wait for | 
|  | # it before get_batch_mps reads it, as the first test does. | 
|  | wait_for_file -fail $file_out | 
|  | set batch_mps [get_batch_mps $file_out] | 
|  |  | 
|  | test_out_file $file_out $target | 
|  | # have_mps_types was determined by the first test | 
|  | test_acct $job_id $target $target $req_mps $have_mps_types $batch_mps | 