| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test MPS resource limits with various allocation options |
| # |
| # Requires: AccountingStorageEnforce=limits |
| # AccountingStorageTRES=gres/mps |
| # SelectType=select/cons_tres |
| # Administrator permissions |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. |
| ############################################################################ |
| source ./globals |
| source ./globals_accounting |
| |
| # Cluster and user the test account will be tied to |
| set cluster [get_config_param "ClusterName"] |
| set user [get_my_user_name] |
| |
| # Accounting account created (and later removed) by this test |
| set acct "${test_name}_acct" |
| |
| # Batch script and the output file of each submitted job |
| set file_in "${test_dir}/input" |
| set file_out1 "${test_dir}/output1" |
| set file_out2 "${test_dir}/output2" |
| |
| # Job ids remain 0 until the corresponding sbatch reports a job id |
| set job_id1 0 |
| set job_id2 0 |
| |
| # Create the test account and attach the current user to it. |
| # |
| # mps_limit - count placed in the account's maxtres setting as |
| #             "gres/mps=<mps_limit>" |
| # |
| # The account is created on the test cluster with "root" as parent. |
| # Calls fail if either the account or the user association is not added. |
| proc setup { mps_limit } { |
|     global acct cluster user |
| |
|     array set new_acct [list cluster $cluster parent "root" maxtres "gres/mps=$mps_limit"] |
|     array set new_user [list cluster $cluster account $acct] |
| |
|     set acct_rc [add_acct $acct [array get new_acct]] |
|     if { $acct_rc } { |
|         fail "Child account was not added" |
|     } |
| |
|     set user_rc [add_user $user [array get new_user]] |
|     if { $user_rc } { |
|         fail "User was not added to child account" |
|     } |
| } |
| |
| # |
| # Prerequisite checks: skip the test when the configuration cannot support it. |
| # |
| |
| # MPS must be accounted as a plain "gres/mps" TRES. The typed form |
| # ("gres/mps:...") is tested first because the untyped search below would |
| # match it as well. |
| set store_tres [string tolower [get_config_param "AccountingStorageTRES"]] |
| set store_mps [string first "gres/mps:" $store_tres] |
| if {$store_mps != -1} { |
|     skip "This test requires homogeneous MPS accounting (NO Type)" |
| } |
| set store_mps [string first "gres/mps" $store_tres] |
| if {$store_mps == -1} { |
|     skip "This test requires accounting for MPS" |
| } elseif {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} { |
|     skip "This test can't be run without AccountingStorageType=slurmdbd" |
| } elseif {![param_contains [get_config_param "AccountingStorageEnforce"] "limits"]} { |
|     skip "This test can't be run without AccountingStorageEnforce=limits" |
| } elseif {[get_admin_level] ne "Administrator"} { |
|     skip "This test can't be run without being an Accounting administrator" |
| } |
| |
| if {![check_config_select "cons_tres"]} { |
|     skip "This test is only compatible with select/cons_tres" |
| } |
| |
| # Use at most two nodes of the default partition |
| set nb_nodes [get_partition_param [default_partition] "TotalNodes"] |
| log_debug "Default partition node count is $nb_nodes" |
| if {$nb_nodes > 1} { |
|     set nb_nodes 2 |
| } |
| |
| # The limits exercised later need a count of at least 100 from |
| # get_highest_mps_count for the chosen number of nodes |
| set mps_cnt [get_highest_mps_count $nb_nodes] |
| if {$mps_cnt < 100} { |
|     skip "This test requires 100 or more MPS per gpu on $nb_nodes nodes of the default partition" |
| } |
| log_debug "MPS count is $mps_cnt" |
| |
| # Cancel both test jobs and remove the test account. |
| # Reads the globals acct, job_id1 and job_id2; takes no arguments. |
| proc cleanup {} { |
|     global acct job_id1 job_id2 |
| |
|     set test_jobs [list $job_id1 $job_id2] |
|     cancel_job $test_jobs |
|     remove_acct {} $acct |
| } |
| |
| # Clear out any test account left behind by an earlier run |
| cleanup |
| |
| # Create the test account (parent: root) with its MPS limit: |
| # 50 when mps_cnt * nb_nodes exceeds 8, otherwise one less than that product |
| set mps_limit [expr {$mps_cnt * $nb_nodes}] |
| set mps_limit [expr {$mps_limit > 8 ? 50 : $mps_limit - 1}] |
| setup $mps_limit |
| |
| # Batch script: show the mps lines of the job's detailed record, then exit 0 |
| make_bash_script $file_in " |
| $scontrol -dd show job \${SLURM_JOBID} | grep mps |
| exit 0" |
| |
| # |
| # Test --gres=mps option by job (first job over limit, second job under limit) |
| # |
| log_info "TEST 1: --gres=mps option by job (first job over limit, second job under limit)" |
| |
| set timeout $max_job_delay |
| |
| # mps_good_cnt is mps_limit divided by nb_nodes, rounded up. |
| # mps_fail_cnt is one above the limit on a single node, otherwise one |
| # above mps_good_cnt. |
| set mps_good_cnt [expr {($mps_limit + $nb_nodes - 1) / $nb_nodes}] |
| set mps_fail_cnt [expr {$nb_nodes == 1 ? $mps_limit + 1 : $mps_good_cnt + 1}] |
| |
| # Submit the over-limit job (job_id1) first, then the under-limit job (job_id2). |
| # Each triple is: name of the job id variable, mps count, output file. |
| foreach {sub_id_var sub_mps_cnt sub_out} [list job_id1 $mps_fail_cnt $file_out1 job_id2 $mps_good_cnt $file_out2] { |
|     spawn $sbatch --account=$acct --gres=craynetwork:0 --gres=mps:$sub_mps_cnt -N$nb_nodes -t1 -o $sub_out -J $test_name $file_in |
|     expect { |
|         -re "Submitted batch job ($number)" { |
|             set $sub_id_var $expect_out(1,string) |
|             exp_continue |
|         } |
|         eof { |
|             wait |
|         } |
|         timeout { |
|             fail "sbatch not responding" |
|         } |
|     } |
|     if {[set $sub_id_var] == 0} { |
|         fail "Job not submitted" |
|     } |
| } |
| |
| # The under-limit job has to reach DONE |
| wait_for_job -fail $job_id2 "DONE" |
| |
| # The over-limit job must show both JobState=PENDING and a Reason |
| # containing AssocMaxGRESPerJob |
| set match 0 |
| spawn $scontrol show job $job_id1 |
| expect { |
|     -re "JobState=PENDING" { |
|         incr match |
|         exp_continue |
|     } |
|     -re "Reason=.*AssocMaxGRESPerJob" { |
|         incr match |
|         exp_continue |
|     } |
|     eof { |
|         wait |
|     } |
|     timeout { |
|         fail "scontrol not responding" |
|     } |
| } |
| if {$match != 2} { |
|     fail "Job ($job_id1) state is bad" |
| } |