#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Validate that preemption by qos is enforced
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Load the shared test-suite definitions. Names used below that are not set
# in this file ($test_name, $test_dir, $number, get_config_param, skip, fail,
# ...) are presumably provided by these two files.
source ./globals
source ./globals_accounting

# User the associations are created for; empty until it is filled in later
# in the script with the invoking user's name.
set user ""

# Accounts and QOS created (and removed) by this test, named after the test
set acct_1 "${test_name}_acct1"
set acct_2 "${test_name}_acct2"
set qos_1 "${test_name}_qos1"
set qos_2 "${test_name}_qos2"

# Batch script submitted by every job in this test
set file_in "$test_dir/job_script"

# Number of nodes returned by get_nodes_by_state; jobs are sized from this
set nodes [llength [get_nodes_by_state]]

set job_id 0
set qos_1_id 0
set qos_2_id 0

# job states
set done_state "DONE"
set pending_state "PENDING"
set preempted_state "PREEMPTED"
set running_state "RUNNING"

# Argument handed to "scontrol update job" to make a job eligible immediately
set eligible_now_job_directive "EligibleTime=now"

#
# Prerequisites: skip the test when the configuration cannot support it
#
if {[get_config_param "PreemptType"] ne "preempt/qos"} {
	skip "This test requires that PreemptType=preempt/qos"
}
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test requires use of Slurmdbd"
}
if {$nodes < 2} {
	skip "Not enough available nodes ($nodes < 2)"
}

# Pull the first token matching $number out of the MinJobAge setting. If
# nothing matches, min_job_age would never be set and the comparison below
# would abort with an unset-variable error, so report that case explicitly.
if {![regexp "($number)" [get_config_param "MinJobAge"] {} min_job_age]} {
	fail "Unable to determine MinJobAge from the configuration"
}
if {$min_job_age < 10} {
	skip "MinJobAge configured too low for this test ($min_job_age < 10)"
}
# Create a QOS, an account tied to that QOS, and an association for $user
# under that account, each through its own sacctmgr invocation.
#
# acct_name - account to create
# qos_name  - QOS to create and attach to the account
# pre_qos   - value for the QOS "preempt=" option (may be empty)
# pre_mode  - value for the QOS "preemptmode=" option
#
# The QOS is created with maxnodes=$nodes. The test fails if sacctmgr stops
# responding, or if the confirmation texts were not seen exactly three times
# in total.
proc acct_setup { acct_name qos_name pre_qos pre_mode } {
	global user nodes sacctmgr

	# Pairs of: confirmation text to look for, command line to run
	set steps [list \
		"Adding QOS" \
		[list $sacctmgr -i create qos $qos_name preempt=$pre_qos preemptmode=$pre_mode maxnodes=$nodes] \
		"Adding Account" \
		[list $sacctmgr -i create account $acct_name qos=$qos_name] \
		"Associations" \
		[list $sacctmgr -i add user $user account=$acct_name] \
	]

	set confirmed 0
	foreach {ack_pattern cmd} $steps {
		spawn {*}$cmd
		expect {
			-re $ack_pattern {
				incr confirmed
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}
	}

	if {$confirmed != 3} {
		fail "Account was not created properly"
	}
}
# Submit $file_in with sbatch as an exclusive job charged to an account and
# return the job id that sbatch reports.
#
# acct1   - account passed to sbatch with -A
# het_job - when true, submit two components separated by ":" (one node,
#           then $nodes - 1 nodes); otherwise one job spanning $nodes nodes
#
# Output is sent to /dev/null. The test fails if sbatch stops responding or
# no "Submitted batch job" line is seen.
proc sub_job { acct1 {het_job false} } {
	global nodes file_in sbatch number

	# Only the resource request differs between the two job shapes
	if { $het_job } {
		set remaining [expr {$nodes - 1}]
		set request [list -N1 -A$acct1 : -N$remaining -A$acct1]
	} else {
		set request [list -N$nodes -A$acct1]
	}

	set job_id 0
	spawn $sbatch -o/dev/null --exclusive {*}$request $file_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "sbatch is not responding"
		}
		eof {
			wait
		}
	}

	if { $job_id != 0 } {
		return $job_id
	}
	fail "sbatch did not submit job"
}
# Remove the accounts and QOS this test creates.
#
# Calls wait_for_account_done on both accounts first. Then, for each
# account/QOS pair: removes $user from the account, deletes the account and
# deletes the QOS. A "Nothing deleted" reply counts the same as a successful
# delete. Logs a warning unless exactly four delete replies were counted.
proc cleanup {} {
	global user acct_1 acct_2 qos_1 qos_2 qos_1_id qos_2_id sacctmgr

	wait_for_account_done $acct_1,$acct_2

	set acked 0
	foreach acct_name [list $acct_1 $acct_2] qos_name [list $qos_1 $qos_2] {
		run_command "$sacctmgr -i remove user $user where account=$acct_name"

		spawn $sacctmgr delete -i account $acct_name
		expect {
			-re "(Deleting accounts|Nothing deleted)" {
				incr acked
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}

		spawn $sacctmgr delete -i qos $qos_name
		expect {
			-re "(Deleting QOS|Nothing deleted)" {
				incr acked
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}
	}

	if { $acked != 4 } {
		log_warn "Unable to clean up accounts and qos"
	}
}
#
# Remove anything a previous run may have left behind before creating the
# accounts and QOS for this run.
# NOTE: $user is still empty at this point (it is set just below), so the
# "remove user" command issued by cleanup carries no user name.
#
log_info "**** Cleanup from previous run ****"
cleanup

make_bash_script $file_in "sleep 30"
set user [get_my_user_name]

# Preempt modes to test
set preempt_mode_cancel "cancel"
set preempt_mode_requeue "requeue"
set preempt_mode_list "$preempt_mode_cancel $preempt_mode_requeue"

# Job types to test for each mode
set job_type_batch "batch"
set job_type_het "het"
set job_type_list "$job_type_batch $job_type_het"

# qos_1 is created with an empty preempt list; qos_2 is created with
# preempt=$qos_1 and preemptmode=cluster.
acct_setup $acct_1 $qos_1 "" $preempt_mode_cancel
acct_setup $acct_2 $qos_2 $qos_1 cluster

foreach preempt_mode $preempt_mode_list {
	foreach job_type $job_type_list {
		set mode_type_string [string toupper "$preempt_mode $job_type"]
		log_info "*** TEST PREEMPT $mode_type_string JOB ***"

		# Switch qos_1 to the preempt mode under test
		set mod_qos_vals(preemptmode) $preempt_mode
		mod_qos $qos_1 [array get mod_qos_vals]

		if { $job_type == $job_type_batch } {
			set have_het_job 0
		} else {
			set have_het_job 1
		}

		# Start the qos_1 job first, then the qos_2 job that is
		# expected to displace it (both request all $nodes nodes).
		set qos_1_id [sub_job $acct_1 $have_het_job]
		wait_for_job -fail $qos_1_id $running_state $have_het_job

		set qos_2_id [sub_job $acct_2]
		wait_for_job -fail $qos_2_id $running_state

		if { $preempt_mode == $preempt_mode_cancel } {
			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $preempted_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $preempted_state state"
		} else {
			# Requeue state
			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $pending_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $pending_state state"

			# Make job eligible to run now to avoid delay
			# NOTE(review): this targets the qos_2 job, which was
			# already seen RUNNING above - confirm intended target.
			run_command "$scontrol update job $qos_2_id $eligible_now_job_directive"
			wait_for_job -fail $qos_2_id $done_state

			# Wait for requeued job to restart
			# Make job eligible to run now to avoid delay
			run_command "$scontrol update job $qos_1_id $eligible_now_job_directive"
			wait_for_job -fail $qos_1_id $running_state $have_het_job

			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $running_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $running_state state"
		}

		# Let every job of both accounts finish before the next round
		wait_for_account_done $acct_1,$acct_2
	}
}

# Poll the "planned" node state up to $max_test times, pausing 10 seconds
# between polls.
# NOTE(review): the loop continues while wait_for_node returns a false
# (zero) value - confirm against wait_for_node's return convention which
# outcome this is meant to wait for.
set max_test 3
set test 0
while {($test < $max_test) &&
       ![wait_for_node -timeout 1 planned 1]} {
	incr test
	# Run the sleep program explicitly. Using "$bin_sleep 10" as a bare
	# command only works when the variable's value happens to name a
	# command known to the interpreter.
	exec $bin_sleep 10
}