| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test of Slurm functionality |
| # Test GPU resource limits with various allocation options |
| # |
| # Requires: AccountingStorageEnforce=limits |
| # AccountingStorageTRES=gres/gpu |
| # SelectType=select/cons_tres |
| # Administrator permissions |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. |
| ############################################################################ |
| source ./globals |
| source ./globals_accounting |
| |
# Scratch files in the per-test directory: the batch script plus one output
# file for each of the two jobs submitted per test case.
set file_in   "${test_dir}/input"
set file_out1 "${test_dir}/output1"
set file_out2 "${test_dir}/output2"

# Accounting entities used by the test.
set cluster [get_config_param "ClusterName"]
set user    [get_my_user_name]
set acct    "${test_name}_acct"

# Set to 1 further down when SelectTypeParameters contains
# CR_ONE_TASK_PER_CORE.
set one_task_pc 0

# Job ids are 0 until the corresponding job has been submitted.
set job_id1 0
set job_id2 0
| |
# Create the test account under "root" with a GPU limit and associate the
# current user with it.
#
# Arguments:
#	gpu_limit - number placed in the account's maxtres setting as
#	            "gres/gpu=<gpu_limit>"
#
# Globals read: acct, cluster, user.
# Calls fail when add_acct or add_user returns a true value.
proc setup { gpu_limit } {
	global acct cluster user

	# Key/value request lists handed to the accounting helpers.
	set account_request [list \
		cluster $cluster \
		parent  "root" \
		maxtres "gres/gpu=$gpu_limit"]
	set user_request [list \
		cluster $cluster \
		account $acct]

	if {[add_acct $acct $account_request]} {
		fail "Child account was not added"
	}

	if {[add_user $user $user_request]} {
		fail "User was not added to child account"
	}
}
| |
#
# Prerequisite checks: skip the test when the configuration cannot support it.
#

# GPUs must be tracked in accounting, and only as plain "gres/gpu" (a typed
# entry such as "gres/gpu:<type>" is rejected).
set store_tres [string tolower [get_config_param "AccountingStorageTRES"]]
# NOTE: despite its name, store_mps is the index of "gres/gpu:" in the TRES
# list, or -1 when no typed GPU entry is present.
set store_mps [string first "gres/gpu:" $store_tres]
if {$store_mps != -1} {
	skip "This test requires homogeneous GPU accounting (NO Type)"
}
set store_gpu [string first "gres/gpu" $store_tres]
if {$store_gpu == -1} {
	skip "This test requires accounting for GPUs"
} elseif {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without AccountingStorageType=slurmdbd"
} elseif {![param_contains [get_config_param "AccountingStorageEnforce"] "limits"]} {
	skip "This test can't be run without AccountingStorageEnforce=limits"
} elseif {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator"
}

if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}
# Remembered for TEST 3, which may need --ntasks-per-core in this mode.
if {[param_contains [get_config_param "SelectTypeParameters"] "CR_ONE_TASK_PER_CORE"]} {
	set one_task_pc 1
}

# Use at most two nodes of the default partition.
set nb_nodes [get_partition_param [default_partition] "TotalNodes"]
log_debug "Default partition node count is $nb_nodes"
if {$nb_nodes > 1} {
	set nb_nodes 2
}
set gpu_cnt [get_highest_gres_count $nb_nodes "gpu"]
if {$gpu_cnt < 2} {
	skip "This test requires 2 or more GPUs on $nb_nodes nodes of the default partition"
}

# Pick a node able to run a one-GPU job and read its CPU topology.
set node_name [get_nodes_by_request "--gres=gpu:1 -n1 -t1"]
if {[llength $node_name] != 1} {
	skip "This test needs to be able to submit jobs with at least --gres=gpu:1"
}
set cpus_per_node    [get_node_param $node_name "CPUTot"]
set sockets_per_node [get_node_param $node_name "Sockets"]
set cores_per_socket [get_node_param $node_name "CoresPerSocket"]
# Braced so the operands are substituted once, by expr itself.
set cpus_per_socket  [expr {$cpus_per_node / $sockets_per_node}]

log_debug "GPUs per node is $gpu_cnt"
log_debug "Sockets per node is $sockets_per_node"
log_debug "CPUs per socket is $cpus_per_socket"
log_debug "CPUs per node is $cpus_per_node"

if {$cpus_per_node < 3} {
	skip "This test requires 3 or more CPUs per node in the default partition"
}
| |
# Cancel both test jobs and remove the test account.
#
# Globals read: acct, job_id1, job_id2.
proc cleanup {} {
	global acct job_id1 job_id2

	set test_jobs [list $job_id1 $job_id2]
	cancel_job $test_jobs
	remove_acct "" $acct
}
| |
# Remove any vestigial test account
cleanup

# Add the test account (child of root) with its GPU limit.
# The limit is capped at 8; otherwise it is one less than the GPU count
# computed for the nodes in use, so that a request can exceed it.
set gpu_limit [expr {$gpu_cnt * $nb_nodes}]
if {$gpu_limit > 8} {
	set gpu_limit 8
} else {
	incr gpu_limit -1
}
setup $gpu_limit

# Batch script: print the GPU-related lines of the job's own detailed record.
make_bash_script $file_in "
$scontrol -dd show job \${SLURM_JOBID} | grep gpu
exit 0"
| |
#
# Test --gpus option by job (first job over limit, second job under limit)
#
log_info "TEST 1: --gpus option by job (first job over limit, second job under limit)"

set timeout $max_job_delay
file delete $file_out1 $file_out2

# One GPU more than the account limit: this job is expected to stay pending.
set gpu_fail_cnt [expr {$gpu_limit + 1}]
set job_id1 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus=$gpu_fail_cnt -t1 -o $file_out1 -J $test_name $file_in"]
# Exactly at the limit: this job is expected to run to completion.
set job_id2 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus=$gpu_limit -t1 -o $file_out2 -J $test_name $file_in"]

wait_for_job -fail $job_id2 "DONE"

# The over-limit job must show both the PENDING state and the
# AssocMaxGRESPerJob reason, hence two expected matches.
set match 0
spawn $scontrol show job $job_id1
expect {
	-re "JobState=PENDING" {
		incr match
		exp_continue
	}
	-re "Reason=.*AssocMaxGRESPerJob" {
		incr match
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}
if {$match != 2} {
	fail "Job ($job_id1) state is bad"
}
cancel_job $job_id1
| |
#
# Test --gpus-per-node option by job (first job over limit, second job under limit)
#
log_info "TEST 2: --gpus-per-node option by job (first job over limit, second job under limit)"

set timeout $max_job_delay
file delete $file_out1 $file_out2

# Per-node count whose total across nb_nodes stays within the limit.
set gpu_good_cnt [expr {$gpu_limit / $nb_nodes}]
# One more GPU per node pushes the total over the limit. With a single node
# gpu_good_cnt equals gpu_limit, so this is also gpu_limit + 1 and no
# special case for nb_nodes == 1 is needed.
set gpu_fail_cnt [expr {$gpu_good_cnt + 1}]
set job_id1 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus-per-node=$gpu_fail_cnt -N$nb_nodes -t1 -o $file_out1 -J $test_name $file_in"]
set job_id2 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus-per-node=$gpu_good_cnt -N$nb_nodes -t1 -o $file_out2 -J $test_name $file_in"]

wait_for_job -fail $job_id2 "DONE"

# The over-limit job must show both the PENDING state and the
# AssocMaxGRESPerJob reason, hence two expected matches.
set match 0
spawn $scontrol show job $job_id1
expect {
	-re "JobState=PENDING" {
		incr match
		exp_continue
	}
	-re "Reason=.*AssocMaxGRESPerJob" {
		incr match
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}
if {$match != 2} {
	fail "Job ($job_id1) state is bad"
}
cancel_job $job_id1
| |
#
# Test --gpus-per-task option by job (first job over limit, second job under limit)
#
log_info "TEST 3: --gpus-per-task option by job (first job over limit, second job under limit)"

# With CR_ONE_TASK_PER_CORE and more CPUs than cores on the node, request
# --ntasks-per-core set to the CPU/core ratio. Otherwise extra_opt is just
# "-t1", which is also passed explicitly on the command lines below.
set total_cores [expr {$cores_per_socket * $sockets_per_node}]
if {$one_task_pc && $cpus_per_node > $total_cores} {
	set ntasks_per_core [expr {$cpus_per_node / $total_cores}]
	set extra_opt "--ntasks-per-core=$ntasks_per_core"
} else {
	set extra_opt "-t1"
}

set timeout $max_job_delay
file delete $file_out1 $file_out2

# One GPU per task, so the task count is the number of GPUs requested.
set gpu_good_cnt $gpu_limit
set gpu_fail_cnt [expr {$gpu_limit + 1}]
set job_id1 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus-per-task=1 -n$gpu_fail_cnt $extra_opt -t1 -o $file_out1 -J $test_name $file_in"]
set job_id2 [submit_job -fail "--account=$acct --gres=craynetwork:0 --gpus-per-task=1 -n$gpu_good_cnt $extra_opt -t1 -o $file_out2 -J $test_name $file_in"]

wait_for_job -fail $job_id2 "DONE"

# The over-limit job must show both the PENDING state and the
# AssocMaxGRESPerJob reason, hence two expected matches.
set match 0
spawn $scontrol show job $job_id1
expect {
	-re "JobState=PENDING" {
		incr match
		exp_continue
	}
	-re "Reason=.*AssocMaxGRESPerJob" {
		incr match
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}
if {$match != 2} {
	fail "Job ($job_id1) state is bad"
}