#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Validate requeuing of federated jobs.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <http://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
set long_script "$test_dir/long"
set complete_script "$test_dir/bash"
set exit_script "$test_dir/exit"
set exithold_script "$test_dir/exithold"
set fail_script "$test_dir/fail"
set file_out "$test_dir/output"
set prolog_script "$test_dir/prolog"
set prologctl_script "$test_dir/prologctl"
set epilog_script "$test_dir/epilog"
set fed_name "feda"
set long_running_job_id ""
set long_running_job_id2 ""
set job_id 0
set user_name ""
set origin_cluster ""
set non_origin_clusters ""
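# Time to allow slurmdbd to record job state changes (seconds)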
set dbd_delay 10
#
# Check accounting config and bail if not found.
#
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
skip "This test can't be run without a usable AccountStorageType"
}
if {[get_admin_level] ne "Administrator"} {
skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
regexp "($number)" [get_config_param "MinJobAge"] {} min_age
if {$min_age < 10} {
skip "MinJobAge too low for this test ($min_age < 10)"
}
if {![check_federation_setup]} {
skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}
if {![check_federation_up]} {
skip "This test can't be run without all clusters up"
}
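# Cancel all of this user's jobs on every cluster in the federation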
proc cancel_federation_jobs { } {
global bin_sleep scancel user_name fedc1 fedc2 fedc3
spawn $scancel -M$fedc1,$fedc2,$fedc3 --user $user_name
expect {
eof {
wait
}
}
sleep 5
}
proc cleanup { } {
global fed_name
cancel_federation_jobs
delete_federations $fed_name
}
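# Verify that a cluster's local controller reports the job in the given state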
proc check_ctl_state { job_id state cluster } {
global scontrol
set job_state 0
spawn $scontrol -M$cluster -a --local show job $job_id
expect {
-re "JobState=$state" {
set job_state 1
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if {$job_state != 1} {
fail "Job ($job_id) state was not $state on cluster ($cluster)"
}
}
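# Verify that a cluster's local controller has no record of the job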
proc check_missing_job { job_id cluster } {
global scontrol
set matched 0
spawn $scontrol -M$cluster -a --local show job $job_id
expect {
"slurm_load_jobs error: Invalid job id specified" {
set matched 1
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if {$matched != 1} {
fail "Found an actual job ($job_id) on cluster ($cluster). This is not supposed to happen"
}
}
# Count the number of jobs and steps with a specific job ID and state
# NOTE: Skip "extern" job container optionally spawned by "PrologFlags=contain"
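# e.g. a matching "sacct --parsable2" line looks like "123|REQUEUED" or "123.batch|COMPLETED"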
proc check_dbd_states { job_id states cluster min_cnt } {
global sacct
set state_num 0
spawn $sacct -M$cluster --job=$job_id --duplicates --parsable2 --start=now-15minutes --noheader -o JobID,State
expect {
-re "(\[0-9_\\.a-z\]+)\\|($states)" {
if {[string first "extern" $expect_out(1,string)] == -1} {
incr state_num
}
exp_continue
}
timeout {
fail "sacct is not responding"
}
eof {
wait
}
}
if {$state_num < $min_cnt} {
fail "Didn't find expected count $min_cnt (>$state_num) with state '$states' for job ($job_id) on cluster ($cluster)"
}
return 0
}
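# Requeue a job with "scontrol requeue"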
proc requeue_job { id } {
global scontrol
spawn $scontrol requeue $id
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
}
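# Requeue a job and hold it with "scontrol requeuehold"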
proc requeuehold_job { id } {
global scontrol
spawn $scontrol requeuehold $id
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
}
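# Requeue a job and hold it in the SPECIAL_EXIT state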
proc requeuehold_se_job { id } {
global scontrol
spawn $scontrol requeuehold state=specialexit $id
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
}
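# Release a held job with "scontrol release"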
proc release_job { id } {
global scontrol
spawn $scontrol release $id
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
}
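# Verify the job is held: Priority=0 with Reason=job_requeued_in_held_state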
proc check_hold { job } {
global scontrol number
set matches 0
spawn $scontrol -a --local show job $job
expect {
"Priority=0" {
incr matches
exp_continue
}
"Reason=job_requeued_in_held_state" {
incr matches
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if { $matches != 2 } {
fail "Priority was not set to a non zero value after it was released"
}
}
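# Verify the job is held after an exit hold: Priority=0 with Reason=JobHeldUser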
proc check_exit_hold { job } {
global scontrol number
set matches 0
spawn $scontrol -a --local show job $job
expect {
"Priority=0" {
incr matches
exp_continue
}
"Reason=JobHeldUser" {
incr matches
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
if { $matches != 2 } {
fail "Priority was not set to a non zero value after it was released"
}
}
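# Verify that SLURM_RESTART_COUNT in the job's output file matches the given count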
proc check_restart_cnt { file cnt } {
global bin_grep bin_sleep
# wait for prolog, etc. to finish
sleep 5
set match 0
spawn $bin_grep "SLURM_RESTART_COUNT" $file
expect {
"SLURM_RESTART_COUNT=$cnt" {
set match 1
exp_continue
}
timeout {
fail "grep ($file) not responding"
}
eof {
wait
}
}
if {!$match} {
fail "Failed to find SLURM_RESTART_COUNT=$cnt in output file"
}
}
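# Determine the user name running the test (used when cancelling federation jobs)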
spawn $bin_id -un
expect {
-re "($re_word_str)" {
set user_name $expect_out(1,string)
}
eof {
wait
}
}
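# Query a cluster's controller for the SLURM_CONF path it reports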
proc get_slurm_conf { cluster } {
global scontrol
# Suppress expect output while capturing the config (restored below)
log_user 0
set conf ""
spawn $scontrol -M$cluster show config
expect {
-re "SLURM_CONF\\s+=\\s+(\\S+)" {
set conf $expect_out(1,string)
exp_continue
}
timeout {
fail "scontrol not responding"
}
eof {
wait
}
}
log_user 1
return $conf
}
# Remove existing setup
cleanup
# Add clusters to federation
if {[setup_federation $fed_name]} {
fail "Failed to setup federation"
}
set requeue_exit_num [get_config_param "RequeueExit"]
set requeue_exithold_num [get_config_param "RequeueExitHold"]
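# Build the batch scripts exercised by the tests below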
make_bash_script $long_script "sleep 9000"
make_bash_script $complete_script "env; $bin_sleep 25"
make_bash_script $fail_script "BadCommand"
make_bash_script $prolog_script "exit 0"
make_bash_script $prologctl_script "exit 0"
make_bash_script $epilog_script "exit 0"
if {$requeue_exit_num ne "(null)"} {
make_bash_script $exit_script "env; $bin_sleep 25; exit $requeue_exit_num"
} else {
log_warn "Configure RequeueExit=# to test."
}
if {$requeue_exithold_num ne "(null)"} {
make_bash_script $exithold_script "env; $bin_sleep 25; exit $requeue_exithold_num"
} else {
log_warn "Configure RequeueExitHold=# to test."
}
# Get the number of usable nodes in the default partition (assumed equal on each cluster)
set node_count [llength [get_nodes_by_state idle,alloc,comp "[default_partition] --local"]]
set origin_cluster [get_config_param "ClusterName"]
set all_cluster_list [list $fedc1 $fedc2 $fedc3]
set non_origin_cluster_list [lsearch -all -inline -not -exact $all_cluster_list $origin_cluster]
set non_origin_clusters [join $non_origin_cluster_list ","]
log_info "Origin: $origin_cluster non-origins: $non_origin_clusters"
log_info "################################################################"
log_info "Test requeue of a running job on origin cluster"
log_info "################################################################"
# Submit jobs that fill up fedc2 and fedc3
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$fedc2 $long_script
expect {
-re "Submitted batch job ($number)" {
set long_running_job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
set run_cluster [wait_for_fed_job $long_running_job_id RUNNING $fedc2]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$fedc3 $long_script
expect {
-re "Submitted batch job ($number)" {
set long_running_job_id2 $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
set run_cluster [wait_for_fed_job $long_running_job_id2 RUNNING $fedc3]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
# Wait for the job to be in the running state
set run_cluster [wait_for_fed_job $job_id RUNNING $origin_cluster]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
# Requeue the job while it is running
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REQUEUED $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test requeue of a completed job on origin cluster"
log_info "################################################################"
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING ""]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
# Wait for the job to be in the complete state
set done_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$done_cluster eq ""} {
fail "Didn't find cluster with completed job"
}
# Requeue the job when it is complete
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REQUEUED $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test requeue of a failed job on origin cluster"
log_info "################################################################"
set job_id 0
spawn $sbatch -N1 -o /dev/null -e /dev/null -t 1 --requeue $fail_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
# Wait for the job to be in the complete state
set run_cluster [wait_for_fed_job $job_id DONE $origin_cluster]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
# Requeue the job after it has failed
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REQUEUED $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
if {[cancel_job $long_running_job_id]} {
fail "Unable to cancel job ($long_running_job_id)"
}
if {[cancel_job $long_running_job_id2]} {
fail "Unable to cancel job ($long_running_job_id2)"
}
log_info "################################################################"
log_info "Test requeue of running job on sibling cluster"
log_info "################################################################"
# Submit a job that consumes all nodes on the origin cluster
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$origin_cluster $long_script
expect {
-re "Submitted batch job ($number)" {
set long_running_job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
set run_cluster [wait_for_fed_job $long_running_job_id RUNNING $origin_cluster]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
log_info "$run_cluster is running job"
# Make sure the origin has learned that the job is running and has
# revoked its local copy of the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
fail "Origin cluster hasn't revoked job"
}
# Requeue the job while it is running
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Give time for states to be updated in the db.
sleep $dbd_delay
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test requeue on subset of siblings"
log_info "################################################################"
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue -M$fedc1,$fedc2 $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
log_info "$run_cluster is running job"
# Make sure the origin has learned that the job is running and has
# revoked its local copy of the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
fail "Origin cluster hasn't revoked job"
}
# Requeue the job while it is running
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Give time for states to be updated in the db.
sleep $dbd_delay
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_missing_job $job_id $fedc3
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test requeue of a completed job that ran on sibling"
log_info "################################################################"
# The long-running job already occupies the origin cluster, so this job
# should go to another cluster.
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
log_info "$run_cluster is running job"
set ret_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$ret_cluster eq ""} {
fail "Didn't find cluster with completed job"
}
# Requeue the job when it is complete
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Give time for states to be updated in the db.
sleep $dbd_delay
# Check that the job state is PENDING after the requeue;
# the federation requeues the job on all clusters
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# A completed job on a sibling could already be purged from the controller
# and its db_index lost, so the dbd state will remain COMPLETED.
#check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id COMPLETED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
log_info "################################################################"
log_info "Test that SLURM_RESTART_COUNT is set for job requeued on sibling"
log_info "################################################################"
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 1
# Check that the sibling that didn't run the job has a REVOKED state in the db.
set revoked_sib ""
if {$fedc1 ne $origin_cluster && $fedc1 ne $run_cluster} {
set revoked_sib $fedc1
} elseif {$fedc2 ne $origin_cluster && $fedc2 ne $run_cluster} {
set revoked_sib $fedc2
} else {
set revoked_sib $fedc3
}
check_dbd_states $job_id REVOKED $revoked_sib 1
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test requeue of a cancelled job that ran on sibling"
log_info "################################################################"
# The long-running job already occupies the origin cluster, so this job
# should go to another cluster.
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
# Give the origin time to learn that the job has started. If the cancel
# arrives before the origin knows the job started, the origin will cancel
# both the local and the remote job. If it knows the job is running on the
# remote, it will only send the request to the remote cluster and wait for
# it to report back that the job is gone.
sleep 2
log_info "$run_cluster is running job"
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
set ret_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$ret_cluster eq ""} {
fail "Didn't find cluster with completed job"
}
# Requeue the job after it has been cancelled
requeue_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
# Give time for states to be updated in the db.
sleep $dbd_delay
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
#check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 1
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test scontrol requeuehold,release of a running sibling job"
log_info "################################################################"
# The long-running job already occupies the origin cluster, so this job
# should go to another cluster.
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
log_info "$run_cluster is running job"
# Make sure the origin has learned that the job is running and has
# revoked its local copy of the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
fail "Origin cluster hasn't revoked job"
}
# Requeue the job while it is running
requeuehold_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $origin_cluster]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with running job"
}
check_hold $job_id
# Give time for states to be updated in the db.
sleep $dbd_delay
check_ctl_state $job_id PENDING $origin_cluster
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# Release the hold on the job; siblings will be submitted to all clusters
release_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 1
log_info "################################################################"
log_info "Test scontrol requeuehold,release of a completed sibling job"
log_info "################################################################"
# Just let previous job finish and then requeue it.
set run_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$run_cluster eq ""} {
fail "Didn't find cluster with completed job"
}
# Requeue the job when it is complete
requeuehold_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $origin_cluster]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
check_hold $job_id
# Give time for states to be updated in the db.
sleep $dbd_delay
check_ctl_state $job_id PENDING $origin_cluster
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# Release the hold on the job; siblings will be submitted to all clusters
release_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with running job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with running job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with running job"
}
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 2
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
log_info "################################################################"
log_info "Test scontrol requeuehold specialexit,release of a running sibling job"
log_info "################################################################"
# The long-running job already occupies the origin cluster, so this job
# should go to another cluster.
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
log_info "$run_cluster is running job"
# Make sure the origin has learned that the job is running and has
# revoked its local copy of the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
fail "Origin cluster hasn't revoked job"
}
# Requeue and hold the job with a special exit state while it is running
requeuehold_se_job $job_id
set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster]
if {$se_cluster eq ""} {
fail "Didn't find job in SE state"
}
check_hold $job_id
check_ctl_state $job_id SPECIAL_EXIT $origin_cluster
# Give time for states to be updated in the db.
sleep $dbd_delay
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# Release the hold on the job; siblings will be submitted to all clusters
release_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 1
log_info "################################################################"
log_info "Test scontrol requeuehold specialexit,release of a completed sibling job"
log_info "################################################################"
# Just let previous job finish and then requeue it.
set run_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$run_cluster eq ""} {
fail "Didn't find cluster with completed job"
}
# Requeue the job when it is complete
requeuehold_se_job $job_id
set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster]
if {$se_cluster eq ""} {
fail "Didn't find job in SE state"
}
check_hold $job_id
check_ctl_state $job_id SPECIAL_EXIT $origin_cluster
# Give time for states to be updated in the db.
sleep $dbd_delay
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# Release the hold on the job; siblings will be submitted to all clusters
release_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 2
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
if {$requeue_exit_num ne "(null)"} {
log_info "################################################################"
log_info "Test RequeueExit=#"
log_info "################################################################"
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $exit_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
log_info "$run_cluster is running job"
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find job in pending state"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find job in pending state"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find job in pending state"
}
# Give time for states to be updated in the db.
sleep $dbd_delay
check_dbd_states $job_id REQUEUED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
#check_dbd_states $job_id REVOKED $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
check_restart_cnt $file_out 1
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
}
if {$requeue_exithold_num ne "(null)"} {
log_info "################################################################"
log_info "Test RequeueExitHold=#"
log_info "################################################################"
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $exithold_script
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch is not responding"
}
eof {
wait
}
}
if { $job_id == 0 } {
fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
log_info "$run_cluster is running job"
set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster]
if {$se_cluster eq ""} {
fail "Didn't find job in SPECIAL_EXIT state"
}
check_exit_hold $job_id
# Give time for states to be updated in the db.
sleep $dbd_delay
check_ctl_state $job_id SPECIAL_EXIT $origin_cluster
check_dbd_states $job_id REQUEUED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
# Release the hold on the job; siblings will be submitted to all clusters
release_job $job_id
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc1]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc2]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
set pend_cluster [wait_for_fed_job $job_id PENDING $fedc3]
if {$pend_cluster eq ""} {
fail "Didn't find cluster with pending job"
}
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_ctl_state $job_id PENDING $fedc3
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
fail "Didn't find cluster with running job"
}
if {$run_cluster eq $origin_cluster} {
fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}
check_restart_cnt $file_out 1
if {[cancel_job $job_id]} {
fail "Unable to cancel job ($job_id)"
}
}