#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test that the OverSubscribe option in partitions is being enforced.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set input_file_name "input"
set file_in "$test_dir/$input_file_name"
set test_part_1 "${test_name}_part_1"
set test_part_2 "${test_name}_part_2"
set test_part_3 "${test_name}_part_3"
set num_jobs 0
set cr_core 0
set cr_cpu 0
set cr_socket 0
set cpu_cnt 0
set socket_cnt 0
set thread_cnt 0
set oversubscribe_j_cnt 2
set job_id 0
set node_name ""

# TODO: Temporary variables to debug bug 12729 (remove once fixed)
set config_dir [get_conf_path]

if {[check_config_select "linear"]} {
	skip "Test is not compatible with a config of SelectType=select/linear"
}
if {![is_super_user]} {
	skip "This test can't be run except as SlurmUser"
}
set select_type_parameters [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_parameters "CR_SOCKET*"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}
if {[param_contains $select_type_parameters "CR_CORE*"]} {
	set cr_core 1
}
if {[param_contains $select_type_parameters "CR_CPU*"]} {
	set cr_cpu 1
}
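# Request a tiny per-CPU memory so that memory does not limit how many jobs
# can run; if memory is not a consumable resource, fall back to a harmless
# placeholder option and record the reduced coverage with a subskip.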
set job_mem_opt "--mem-per-cpu=4"
if {![param_contains $select_type_parameters "*MEMORY"]} {
	set job_mem_opt "--comment=no_mem"
	subskip "SelectTypeParameters needs to include MEMORY for better oversubscribe testing"
}

set node_name [get_nodes_by_request "-N1 --exclusive -t1"]
if { [llength $node_name] != 1 } {
	skip "This test needs to be able to submit jobs with at least -N1 --exclusive -t1"
}
if {[get_node_param $node_name "CoreSpecCount"] != "MISSING"} {
	skip "This test is incompatible with nodes that have a CoreSpecCount (e.g. $node_name)"
}
if {[get_node_param $node_name "CPUSpecList"] != "MISSING"} {
	skip "This test is incompatible with nodes that have a CPUSpecList (e.g. $node_name)"
}

proc cleanup {} {
	global job_id scontrol test_part_1 test_part_2 test_part_3

	cancel_job $job_id
	run_command -none "$scontrol delete partitionname=$test_part_1"
	run_command -none "$scontrol delete partitionname=$test_part_2"
	run_command -none "$scontrol delete partitionname=$test_part_3"

	# TODO: Temporary cleanup of debug for bug 12729 (remove once fixed)
	global config_dir
	run_command "$scontrol show node"
	restore_conf $config_dir/slurm.conf
	run_command "$scontrol reconfigure"
}

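# Return the per-node job slot count used by this test: CPUTot/ThreadsPerCore
# when CR_CPU is set, or Sockets*CoresPerSocket when CR_CORE is set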
proc cr_core_cpu { node } {
	global cr_cpu cr_core core_cnt socket_cnt
	global cpu_cnt thread_cnt

	set node_conf [dict get [get_nodes $node] $node]
	set core_cnt [dict get $node_conf "CoresPerSocket"]
	set cpu_cnt [dict get $node_conf "CPUTot"]
	set socket_cnt [dict get $node_conf "Sockets"]
	set thread_cnt [dict get $node_conf "ThreadsPerCore"]

	if { $cr_cpu == 1 } {
		return [expr {$cpu_cnt / $thread_cnt}]
	} elseif { $cr_core == 1 } {
		return [expr {$core_cnt * $socket_cnt}]
	} else {
		fail "Unable to determine the CPU or core count (neither CR_CPU nor CR_CORE is set)"
	}
}


# Get default_queue_depth to see how many jobs will start at one time
proc default_queue_depth { } {
	global number

	set depth 100
	set scheduler_params [get_config_param "SchedulerParameters"]
	regexp "default_queue_depth=($number)" $scheduler_params - depth

	return $depth
}

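# Submit a job array with the given task range to the given partition,
# with or without --oversubscribe, and record its job id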
proc sub_job { job oversubscribe part } {
	global file_in job_id job_mem_opt

	if { $oversubscribe } {
		set job_id [submit_job -fail "-a$job -t1 $job_mem_opt -p$part --oversubscribe -o/dev/null $file_in"]
	} else {
		set job_id [submit_job -fail "-a$job -t1 $job_mem_opt -p$part -o/dev/null $file_in"]
	}
}

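# Verify that exactly exp_num_jobs tasks of the array job end up RUNNING
# (or SUSPENDED), failing if that count is never reached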
proc check_job { exp_num_jobs } {
	global squeue job_id input_file_name

	# Wait for the first array task to start
	wait_for_job -fail ${job_id}_0 "RUNNING"

	# Wait for the expected number of jobs to be running
	if [
		wait_for {$job_cnt == $exp_num_jobs} {
			set job_cnt [
				regexp -all $input_file_name [
					run_command_output -fail "$squeue --job=$job_id --state=RUNNING,SUSPENDED -Oname --noheader"
				]
			]
		}
	] {
		fail "The expected number of running jobs was not reached ($job_cnt != $exp_num_jobs). This could be due to memory or other limits."
	}
}

##################################Test Begins##################################

# TODO: Temporarily increase logging to debug bug 12729 (remove once fixed)
save_conf $config_dir/slurm.conf
run_command -none "$bin_echo SlurmdDebug=debug3 >> $config_dir/slurm.conf"
run_command -none "$bin_echo DebugFlags=cgroup >> $config_dir/slurm.conf"
run_command "$scontrol reconfigure"

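# Create a simple batch script that sleeps long enough for the running
# array tasks to be counted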
make_bash_script $file_in "sleep 10"

####################################
#
# Test partition with oversubscribe=NO
#
####################################
log_info "Test partition with oversubscribe=NO"

# Determine the number of cores or CPUs
set num_jobs [cr_core_cpu $node_name]
set job_start [default_queue_depth]
if { $num_jobs >= $job_start } {
	skip "The core/CPU count must be less than default_queue_depth ($num_jobs >= $job_start), cannot run this test"
}

# Create first test partition
run_command -fail "$scontrol create partitionname=$test_part_1 OverSubscribe=NO nodes=$node_name"

# Submit a job with oversubscribe
sub_job "0-$num_jobs" 1 $test_part_1

# Check that the correct number of jobs are running
check_job $num_jobs
cancel_job $job_id

# Submit a job without oversubscribe
sub_job "0-$num_jobs" 0 $test_part_1
check_job $num_jobs
cancel_job $job_id

# Delete first test partition
run_command -fail "$scontrol delete partitionname=$test_part_1"

####################################
#
# Test partition with oversubscribe=YES:2
#
####################################
set new_job_limit [expr {$num_jobs * 2}]
if {[param_contains [get_config_param "PreemptMode"] "GANG"]} {
	skip "Test partition with oversubscribe=YES:2 incompatible with gang scheduling"
} elseif { $new_job_limit >= $job_start } {
	skip "The desired job count must be less than default_queue_depth ($new_job_limit >= $job_start), cannot run oversubscribe=YES:2"
} else {
	log_info "Test partition with oversubscribe=YES:2"

	# Make a new partition with oversubscribe=yes:2
	run_command -fail "$scontrol create partitionname=$test_part_2 OverSubscribe=YES:$oversubscribe_j_cnt nodes=$node_name"

	# Submit a job with oversubscribe (expect 2 jobs per core/CPU)
	sub_job "0-$new_job_limit" 1 $test_part_2
	check_job $new_job_limit
	cancel_job $job_id

	# Submit a job without oversubscribe (expect 1 job per core/CPU)
	sub_job "0-$num_jobs" 0 $test_part_2
	check_job $num_jobs
	cancel_job $job_id

	# Delete second test partition
	run_command -fail "$scontrol delete partitionname=$test_part_2"
}

########################################
#
# Test partition with oversubscribe=EXCLUSIVE
#
########################################
log_info "Test partition with oversubscribe=EXCLUSIVE"

# Make a new partition with oversubscribe=EXCLUSIVE
run_command -fail "$scontrol create partitionname=$test_part_3 OverSubscribe=EXCLUSIVE nodes=$node_name"

# Submit a job with oversubscribe (expect 1 job per node)
sub_job "0-$num_jobs" 1 $test_part_3
check_job 1
cancel_job $job_id

# Submit a job without oversubscribe (expect 1 job per node)
sub_job "0-$num_jobs" 0 $test_part_3
check_job 1
cancel_job $job_id

# Delete third test partition
run_command -fail "$scontrol delete partitionname=$test_part_3"