#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate requeue'ing of federated jobs.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <http://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
# Shared test-suite helper libraries.
source ./globals
source ./globals_accounting
source ./globals_federation

# Batch scripts and output file; all live under the per-test directory.
set long_script          "$test_dir/long"
set complete_script      "$test_dir/bash"
set exit_script          "$test_dir/exit"
set exithold_script      "$test_dir/exithold"
set fail_script          "$test_dir/fail"
set file_out             "$test_dir/output"
set prolog_script        "$test_dir/prolog"
set prologctl_script     "$test_dir/prologctl"
set epilog_script        "$test_dir/epilog"

# Federation name and per-run state filled in as the test progresses.
set fed_name             "feda"
set long_running_job_id  ""
set long_running_job_id2 ""
set job_id               0
set user_name            ""
set origin_cluster       ""
set non_origin_clusters  ""

# Seconds to wait before querying sacct, so the database can catch up.
set dbd_delay            10

#
# Preflight checks: skip the test unless the environment can support it.
#

# Accounting must go through slurmdbd.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}

# MinJobAge must be at least 10.
# NOTE(review): presumably so finished jobs stay known to the controller long
# enough to be requeued -- confirm.
# Guard the regexp: without a match min_age would be undefined and the
# comparison below would abort with a Tcl "no such variable" error.
if {![regexp "($number)" [get_config_param "MinJobAge"] {} min_age]} {
	fail "Unable to parse MinJobAge from the configuration"
}
if {$min_age < 10} {
	skip "MinJobAge too low for this test ($min_age < 10)"
}

if {![check_federation_setup]} {
	skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}

if {![check_federation_up]} {
	skip "This test can't be run without all clusters up"
}

# Cancel all jobs owned by $user_name on the three federation clusters
# ($fedc1, $fedc2, $fedc3), then pause 5 seconds before returning.
proc cancel_federation_jobs { } {
	global bin_sleep scancel user_name fedc1 fedc2 fedc3

	set cluster_names [join [list $fedc1 $fedc2 $fedc3] ","]
	spawn $scancel -M$cluster_names --user $user_name
	expect {
		eof {
			wait
		}
	}
	sleep 5
}

# Tear down test state: cancel the user's jobs on every federation cluster
# and then delete the federation named by $fed_name.
proc cleanup { } {
	global fed_name

	cancel_federation_jobs
	delete_federations $fed_name
}

# Verify through "scontrol --local show job" that a job is in a given state
# on one cluster.
#
# job_id  - job to look up
# state   - expected JobState value (used as a regular expression)
# cluster - cluster whose controller is queried (-M)
#
# Calls fail if scontrol times out or the state is not reported.
proc check_ctl_state { job_id state cluster } {
	global scontrol

	set found 0
	spawn $scontrol -M$cluster -a --local show job $job_id
	expect {
		-re "JobState=$state" {
			set found 1
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}

	if {!$found} {
		fail "Job ($job_id) state was not $state on cluster ($cluster)"
	}
}

# Verify that a job does NOT exist on a cluster: scontrol must answer with
# "Invalid job id specified" for the local job lookup.
#
# job_id  - job that is expected to be unknown
# cluster - cluster whose controller is queried (-M)
#
# Calls fail if scontrol times out or the error text is not seen.
proc check_missing_job { job_id cluster } {
	global scontrol

	set saw_invalid_id 0
	spawn $scontrol -M$cluster -a --local show job $job_id
	expect {
		"slurm_load_jobs error: Invalid job id specified" {
			set saw_invalid_id 1
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}

	if {!$saw_invalid_id} {
		fail "Found an actual job ($job_id) on cluster ($cluster). This is not supposed to happen"
	}
}

# Count the sacct records (jobs and steps) of a job that are in a given
# state and require at least min_cnt of them.
# NOTE: Records whose JobID contains "extern" (the job container optionally
#       spawned by "PrologFlags=contain") are not counted.
#
# job_id  - job to query
# states  - state name(s), inserted into the match regular expression
# cluster - cluster whose accounting records are queried (-M)
# min_cnt - minimum number of matching records
#
# Only records started within the last 15 minutes are considered.
# Calls fail on timeout or when too few records match; returns 0.
proc check_dbd_states { job_id states cluster min_cnt } {
	global sacct

	set hits 0
	spawn $sacct -M$cluster --job=$job_id --duplicates --parsable2 --start=now-15minutes --noheader -o JobID,State
	expect {
		-re "(\[0-9_\\.a-z\]+)\\|($states)" {
			if {![string match "*extern*" $expect_out(1,string)]} {
				incr hits
			}
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "sacct is not responding"
		}
	}

	if {$hits < $min_cnt} {
		fail "Didn't find expected count $min_cnt (>$hits) with state '$states' for job ($job_id) on cluster ($cluster)"
	}

	return 0
}

# Run "scontrol requeue" on job $id and wait for the command to exit.
# Calls fail if scontrol times out.
proc requeue_job { id } {
	global scontrol

	spawn $scontrol requeue $id
	expect {
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}
}


# Run "scontrol requeuehold" on job $id and wait for the command to exit.
# Calls fail if scontrol times out.
proc requeuehold_job { id } {
	global scontrol

	spawn $scontrol requeuehold $id
	expect {
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}
}

# Run "scontrol requeuehold state=specialexit" on job $id and wait for the
# command to exit. Calls fail if scontrol times out.
proc requeuehold_se_job { id } {
	global scontrol

	spawn $scontrol requeuehold state=specialexit $id
	expect {
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}
}

# Run "scontrol release" on job $id and wait for the command to exit.
# Calls fail if scontrol times out.
proc release_job { id } {
	global scontrol

	spawn $scontrol release $id
	expect {
		eof {
			wait
		}
		timeout {
			fail "scontrol is not responding"
		}
	}
}

# Verify that a job is held after a requeuehold: the local "scontrol show job"
# output must contain both "Priority=0" and
# "Reason=job_requeued_in_held_state".
#
# job - job to inspect
#
# Calls fail if scontrol times out or either marker is missing.
proc check_hold { job } {
	global scontrol

	set matches 0
	spawn $scontrol -a --local show job $job
	expect {
		"Priority=0" {
			incr matches
			exp_continue
		}
		"Reason=job_requeued_in_held_state" {
			incr matches
			exp_continue
		}
		timeout {
			fail "scontrol is not responding"
		}
		eof {
			wait
		}
	}

	# Both markers are required; report what was actually being checked.
	if { $matches != 2 } {
		fail "Job ($job) is not held: expected Priority=0 and Reason=job_requeued_in_held_state"
	}
}

# Verify that a job is in a user hold: the local "scontrol show job" output
# must contain both "Priority=0" and "Reason=JobHeldUser".
#
# job - job to inspect
#
# Calls fail if scontrol times out or either marker is missing.
proc check_exit_hold { job } {
	global scontrol

	set matches 0
	spawn $scontrol -a --local show job $job
	expect {
		"Priority=0" {
			incr matches
			exp_continue
		}
		"Reason=JobHeldUser" {
			incr matches
			exp_continue
		}
		timeout {
			fail "scontrol is not responding"
		}
		eof {
			wait
		}
	}

	# Both markers are required; report what was actually being checked.
	if { $matches != 2 } {
		fail "Job ($job) is not held: expected Priority=0 and Reason=JobHeldUser"
	}
}

# Verify that a job output file contains "SLURM_RESTART_COUNT=<cnt>".
#
# file - path of the job's output file (the job script prints its environment)
# cnt  - expected restart count
#
# Sleeps 5 seconds first, then greps the file. Calls fail if grep times out
# or the expected text is not found.
proc check_restart_cnt { file cnt } {
	global bin_grep bin_sleep

	# wait for prolog, etc. to finish
	sleep 5

	set found 0
	spawn $bin_grep "SLURM_RESTART_COUNT" $file
	expect {
		"SLURM_RESTART_COUNT=$cnt" {
			set found 1
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "grep ($file) not responding"
		}
	}
	if {$found == 0} {
		fail "Failed to find SLURM_RESTART_COUNT=$cnt in output file"
	}
}

# Determine the name of the user running the test; it is later passed to
# "scancel --user" when cancelling federation jobs.
spawn $bin_id -un
expect {
	-re "($re_word_str)" {
		set user_name $expect_out(1,string)
	}
	eof {
		wait
	}
}

# Do not continue with an empty user name: it would be handed to
# "scancel --user" unchanged.
if {$user_name eq ""} {
	fail "Unable to determine the current user name"
}

# Return the SLURM_CONF path reported by "scontrol show config" for a
# cluster, or an empty string if the value is not found.
#
# cluster - cluster to query (-M)
#
# Calls fail if scontrol times out.
proc get_slurm_conf { cluster } {
	global scontrol

	# NOTE(review): log_user is set to 1 both before and after the query;
	# the first call may have been intended to silence output -- confirm.
	log_user 1
	set slurm_conf_path ""
	spawn $scontrol -M$cluster show config
	expect {
		-re "SLURM_CONF\\s+=\\s+(\\S+)" {
			set slurm_conf_path $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "scontrol not responding"
		}
	}
	log_user 1

	return $slurm_conf_path
}


# Remove existing setup
cleanup

# Add clusters to federation.
# The condition is braced so it is parsed once as an expression instead of
# being substituted first and then evaluated again by "if".
if {[setup_federation $fed_name]} {
	fail "Failed to setup federation"
}

set requeue_exit_num [get_config_param "RequeueExit"]
set requeue_exithold_num [get_config_param "RequeueExitHold"]

# Create the batch scripts used by the test sections below.
make_bash_script $long_script "sleep 9000"
make_bash_script $complete_script "env; $bin_sleep 25"
make_bash_script $fail_script "BadCommand"
make_bash_script $prolog_script "exit 0"
make_bash_script $prologctl_script "exit 0"
make_bash_script $epilog_script "exit 0"

# The exit/exithold scripts are only created when the matching
# RequeueExit/RequeueExitHold value is configured.
if {$requeue_exit_num ne "(null)"} {
	make_bash_script $exit_script "env; $bin_sleep 25; exit $requeue_exit_num"
} else {
	log_warn "Configure RequeueExit=# to test."
}
if {$requeue_exithold_num ne "(null)"} {
	make_bash_script $exithold_script "env; $bin_sleep 25; exit $requeue_exithold_num"
} else {
	log_warn "Configure RequeueExitHold=# to test."
}



# get number of nodes per cluster
set node_count [llength [get_nodes_by_state idle,alloc,comp "[default_partition] --local"]]

# The origin is the local cluster; every other federation member is a
# non-origin sibling.
set origin_cluster [get_config_param "ClusterName"]
set all_cluster_list [list $fedc1 $fedc2 $fedc3]
set non_origin_cluster_list [lsearch -all -inline -not -exact $all_cluster_list $origin_cluster]
set non_origin_clusters [join $non_origin_cluster_list ","]
log_info "Origin: $origin_cluster non-origins: $non_origin_clusters"


log_info "################################################################"
log_info "Test requeue of a running job on origin cluster"
log_info "################################################################"

# Submit long-running exclusive jobs that take every node of $fedc2 and
# $fedc3.
# NOTE(review): presumably so the test job can only start on the origin
# cluster -- this assumes the origin is $fedc1.
set long_running_job_id ""
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$fedc2 $long_script
expect {
	-re "Submitted batch job ($number)" {
		set long_running_job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}
# Catch a failed submission here instead of waiting on an empty job id.
if {$long_running_job_id eq ""} {
	fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $long_running_job_id RUNNING $fedc2]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

set long_running_job_id2 ""
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$fedc3 $long_script
expect {
	-re "Submitted batch job ($number)" {
		set long_running_job_id2 $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}
if {$long_running_job_id2 eq ""} {
	fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $long_running_job_id2 RUNNING $fedc3]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

# Submit the job under test.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

# Wait for the job to be in the running state
set run_cluster [wait_for_fed_job $job_id RUNNING $origin_cluster]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

# Requeue the job while it is running
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# The database must show a REQUEUED record on the cluster that ran the job,
# and every controller must report the job as PENDING.
check_dbd_states $job_id REQUEUED $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

log_info "################################################################"
log_info "Test requeue of a completed job on origin cluster"
log_info "################################################################"

# Reset job_id so a failed submission is not hidden by the id of the
# previous test job.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $job_id RUNNING ""]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

# Wait for the job to be in the complete state
set done_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$done_cluster eq ""} {
	fail "Didn't find cluster with completed job"
}

# Requeue the job when it is complete
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# The database must show a REQUEUED record on the cluster that ran the job,
# and every controller must report the job as PENDING.
check_dbd_states $job_id REQUEUED $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

log_info "################################################################"
log_info "Test requeue of a failed job on origin cluster"
log_info "################################################################"
set job_id 0
spawn $sbatch -N1 -o /dev/null -e /dev/null -t 1 --requeue $fail_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

# Wait for the job to finish (the script runs a non-existent command).
set run_cluster [wait_for_fed_job $job_id DONE $origin_cluster]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with completed job"
}

# Requeue the job when it is complete
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# The database must show a REQUEUED record on the cluster that ran the job,
# and every controller must report the job as PENDING.
check_dbd_states $job_id REQUEUED $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

# Cancel the test job and both long-running filler jobs.
if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}
if {[cancel_job $long_running_job_id]} {
	fail "Unable to cancel job ($long_running_job_id)"
}
if {[cancel_job $long_running_job_id2]} {
	fail "Unable to cancel job ($long_running_job_id2)"
}

log_info "################################################################"
log_info "Test requeue of running job on sibling cluster"
log_info "################################################################"

# Submit job that consumes all nodes on the origin cluster.
# Reset the id first so a failed submission is not hidden by a stale value.
set long_running_job_id ""
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t999999 --requeue -M$origin_cluster $long_script
expect {
	-re "Submitted batch job ($number)" {
		set long_running_job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}
if {$long_running_job_id eq ""} {
	fail "sbatch did not submit job"
}
set run_cluster [wait_for_fed_job $long_running_job_id RUNNING $origin_cluster]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

# Submit the job under test; with the origin full it has to start on a
# sibling cluster.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

log_info "$run_cluster is running job"

# make sure that the origin has gotten word that the job is running and the
# origin revokes the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
	fail "Origin cluster hasn't revoked job"
}

# Requeue the job while it is running
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# Give time for states to be updated in the db.
sleep $dbd_delay

# The origin must have REVOKED and PENDING records, the cluster that ran the
# job REQUEUED and PENDING records, and every controller must report the job
# as PENDING.
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

log_info "################################################################"
log_info "Test requeue on subset of siblings"
log_info "################################################################"

# Restrict the job to $fedc1 and $fedc2; $fedc3 must never see it.
# Reset job_id so a failed submission is not hidden by the id of the
# previous test job.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o /dev/null -e /dev/null -t1 --requeue -M$fedc1,$fedc2 $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

log_info "$run_cluster is running job"

# make sure that the origin has gotten word that the job is running and the
# origin revokes the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
	fail "Origin cluster hasn't revoked job"
}

# Requeue the job while it is running
requeue_job $job_id

# The requeued job must become PENDING on the two requested clusters only.
foreach pend_sib [list $fedc1 $fedc2] {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# Give time for states to be updated in the db.
sleep $dbd_delay

# The job must be PENDING on the requested clusters after the requeue and
# must not exist at all on $fedc3.
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
check_ctl_state $job_id PENDING $fedc1
check_ctl_state $job_id PENDING $fedc2
check_missing_job $job_id $fedc3

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

log_info "################################################################"
log_info "Test requeue of a completed job that ran on sibling"
log_info "################################################################"

# long running job is already running on origin cluster so job should go to
# other cluster

# Reset job_id so a failed submission is not hidden by the id of the
# previous test job.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

log_info "$run_cluster is running job"

set ret_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$ret_cluster eq ""} {
	fail "Didn't find cluster with completed job"
}

# Requeue the job when it is complete
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# Give time for states to be updated in the db.
sleep $dbd_delay

# Check to see if the job state is PENDING after the requeue
# federation will requeue job on all clusters
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1

# A completed job on a sibling could already be gone from the controller and
# the db_index could be lost so the dbd state will stay as completed.
#check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id COMPLETED $run_cluster 1

check_dbd_states $job_id PENDING $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

log_info "################################################################"
log_info "Test that SLURM_RESTART_COUNT is set for job requeued on sibling"
log_info "################################################################"

# The job requeued by the previous section is still pending; wait for it to
# start again on one of the non-origin clusters.
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

if {$run_cluster eq $origin_cluster} {
	fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}

check_restart_cnt $file_out 1

# check that sibling that didn't run the job has a REVOKED state in the db.
# Pick the first of fedc1/fedc2 that is neither the origin nor the cluster
# running the job; fall back to fedc3 otherwise.
set revoked_sib $fedc3
foreach candidate_sib [list $fedc1 $fedc2] {
	if {$candidate_sib ne $origin_cluster && $candidate_sib ne $run_cluster} {
		set revoked_sib $candidate_sib
		break
	}
}
check_dbd_states $job_id REVOKED $revoked_sib 1

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

log_info "################################################################"
log_info "Test requeue of a cancelled job that ran on sibling"
log_info "################################################################"

# long running job is already running on origin cluster so job should go to
# other cluster

# Reset job_id so a failed submission is not hidden by the id of the
# previous test job.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}
# Give time for origin to know that the job was started. If the cancel comes
# before it knows that the job started then the origin will cancel the local
# and remote job. If it knows that the job is running on the remote, then it
# will only send the request to the remote and wait for it to report back that
# the job is gone.
sleep 2

log_info "$run_cluster is running job"

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

set ret_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$ret_cluster eq ""} {
	fail "Didn't find cluster with completed job"
}

# Requeue the job when it is complete
requeue_job $job_id

# The requeued job must become PENDING on every cluster of the federation.
foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

# Give time for states to be updated in the db.
sleep $dbd_delay

check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1
#check_dbd_states $job_id REQUEUED $run_cluster 1
check_dbd_states $job_id PENDING $run_cluster 1
foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

# The requeued job must start again on a non-origin cluster with its restart
# count set.
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

if {$run_cluster eq $origin_cluster} {
	fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}

check_restart_cnt $file_out 1

if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}


log_info "################################################################"
log_info "Test scontrol requeuehold,release of a running sibling job"
log_info "################################################################"

# long running job is already running on origin cluster so job should go to
# other cluster

# Reset job_id so a failed submission is not hidden by the id of the
# previous test job.
set job_id 0
spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sbatch is not responding"
	}
	eof {
		wait
	}
}

if { $job_id == 0 } {
	fail "sbatch did not submit job"
}

set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

log_info "$run_cluster is running job"

# make sure that the origin has gotten word that the job is running and the
# origin revokes the job.
set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster]
if {$rv_origin_cluster eq ""} {
	fail "Origin cluster hasn't revoked job"
}

# Requeue (and hold) the job while it is running
requeuehold_job $job_id

# While held, the job is expected to be PENDING on the origin cluster.
set pend_cluster [wait_for_fed_job $job_id PENDING $origin_cluster]
if {$pend_cluster eq ""} {
	fail "Didn't find cluster with pending job"
}

check_hold $job_id

# Give time for states to be updated in the db.
sleep $dbd_delay

check_ctl_state $job_id PENDING $origin_cluster
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1


# release hold on job, siblings will be submitted to all clusters
release_job $job_id

foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

# The released job must start again on a non-origin cluster with its restart
# count set.
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

if {$run_cluster eq $origin_cluster} {
	fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}

check_restart_cnt $file_out 1

log_info "################################################################"
log_info "Test scontrol requeuehold,release of a completed sibling job"
log_info "################################################################"

# Just let previous job finish and then requeue it.

set done_cluster [wait_for_fed_job $job_id DONE $run_cluster]
if {$done_cluster eq ""} {
	fail "Didn't find cluster with completed job"
}

# Requeue (and hold) the job when it is complete
requeuehold_job $job_id

# While held, the job is expected to be PENDING on the origin cluster.
set pend_cluster [wait_for_fed_job $job_id PENDING $origin_cluster]
if {$pend_cluster eq ""} {
	fail "Didn't find cluster with pending job"
}

check_hold $job_id

# Give time for states to be updated in the db.
sleep $dbd_delay

check_ctl_state $job_id PENDING $origin_cluster
check_dbd_states $job_id REVOKED $origin_cluster 1
check_dbd_states $job_id PENDING $origin_cluster 1

# release hold on job, siblings will be submitted to all clusters
release_job $job_id

foreach pend_sib $all_cluster_list {
	set pend_cluster [wait_for_fed_job $job_id PENDING $pend_sib]
	if {$pend_cluster eq ""} {
		fail "Didn't find cluster with pending job"
	}
}

foreach pend_sib $all_cluster_list {
	check_ctl_state $job_id PENDING $pend_sib
}

# The released job must start again on a non-origin cluster; this is its
# second restart.
set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters]
if {$run_cluster eq ""} {
	fail "Didn't find cluster with running job"
}

if {$run_cluster eq $origin_cluster} {
	fail "Requeued job ran on origin cluster. Expected to run on a different cluster"
}

check_restart_cnt $file_out 2
if {[cancel_job $job_id]} {
	fail "Unable to cancel job ($job_id)"
}

| log_info "################################################################" |
| log_info "Test scontrol requeuehold specialexit,release of a running sibling job" |
| log_info "################################################################" |
| |
| # long running job is already running on origin cluster so job should go to |
| # other cluster |
| |
| # Clear the id left over from the previous job; otherwise the "== 0" check |
| # below can never detect a failed submission and the stale job would be reused. |
| set job_id 0 |
| spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $complete_script |
| expect { |
|     -re "Submitted batch job ($number)" { |
|         set job_id $expect_out(1,string) |
|         exp_continue |
|     } |
|     timeout { |
|         fail "sbatch is not responding" |
|     } |
|     eof { |
|         wait |
|     } |
| } |
| |
| if { $job_id == 0 } { |
|     fail "sbatch did not submit job" |
| } |
| |
| set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
| if {$run_cluster eq ""} { |
|     fail "Didn't find cluster with running job" |
| } |
| |
| log_info "$run_cluster is running job" |
| |
| # make sure that the origin has gotten word that the job is running and the |
| # origin revokes the job. |
| set rv_origin_cluster [wait_for_fed_job $job_id REVOKED $origin_cluster] |
| if {$rv_origin_cluster eq ""} { |
|     fail "Origin cluster hasn't revoked job" |
| } |
| |
| # Requeue the job (special exit, held) while its sibling is still running |
| requeuehold_se_job $job_id |
| |
| set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster] |
| if {$se_cluster eq ""} { |
|     fail "Didn't find job in SE state" |
| } |
| check_hold $job_id |
| check_ctl_state $job_id SPECIAL_EXIT $origin_cluster |
| |
| # Give time for states to be updated in the db. |
| sleep $dbd_delay |
| |
| check_dbd_states $job_id REVOKED $origin_cluster 1 |
| check_dbd_states $job_id PENDING $origin_cluster 1 |
| |
| |
| # release hold on job, siblings will be submitted to all clusters |
| release_job $job_id |
| |
| # Each of fedc1, fedc2 and fedc3 must report the job as PENDING before |
| # check_ctl_state is run against any of them. |
| foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|     set pend_cluster [wait_for_fed_job $job_id PENDING $sib_cluster] |
|     if {$pend_cluster eq ""} { |
|         fail "Didn't find cluster with pending job" |
|     } |
| } |
| |
| foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|     check_ctl_state $job_id PENDING $sib_cluster |
| } |
| |
| set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
| if {$run_cluster eq ""} { |
|     fail "Didn't find cluster with running job" |
| } |
| |
| if {$run_cluster eq $origin_cluster} { |
|     fail "Requeued job ran on origin cluster. Expected to run on a different cluster" |
| } |
| |
| # The job is deliberately left running: the next section waits for it to |
| # finish and requeues it again. |
| check_restart_cnt $file_out 1 |
| |
| log_info "################################################################" |
| log_info "Test scontrol requeuehold specialexit,release of a completed sibling job" |
| log_info "################################################################" |
| |
| # No new submission here: the job from the previous section is allowed to run |
| # to completion on the cluster that picked it up, then it is requeued. |
| |
| set run_cluster [wait_for_fed_job $job_id DONE $run_cluster] |
| if {[string equal $run_cluster ""]} { |
|     fail "Didn't find cluster with completed job" |
| } |
| |
| # Special-exit requeue (held) of the finished job. |
| requeuehold_se_job $job_id |
| |
| set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster] |
| if {[string equal $se_cluster ""]} { |
|     fail "Didn't find job in SE state" |
| } |
| check_hold $job_id |
| check_ctl_state $job_id SPECIAL_EXIT $origin_cluster |
| |
| # Let the database catch up before inspecting the recorded states. |
| sleep $dbd_delay |
| |
| foreach dbd_state {REVOKED PENDING} { |
|     check_dbd_states $job_id $dbd_state $origin_cluster 1 |
| } |
| |
| |
| # Releasing the hold lets siblings be submitted to all clusters. |
| release_job $job_id |
| |
| # Wait for a pending sibling on fedc1, fedc2 and fedc3 in turn. |
| foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|     set pend_cluster [wait_for_fed_job $job_id PENDING $sib_cluster] |
|     if {[string equal $pend_cluster ""]} { |
|         fail "Didn't find cluster with pending job" |
|     } |
| } |
| |
| foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|     check_ctl_state $job_id PENDING $sib_cluster |
| } |
| |
| set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
| if {[string equal $run_cluster ""]} { |
|     fail "Didn't find cluster with running job" |
| } |
| |
| if {[string equal $run_cluster $origin_cluster]} { |
|     fail "Requeued job ran on origin cluster. Expected to run on a different cluster" |
| } |
| |
| # Second requeue of this job, so a restart count of 2 is checked. |
| check_restart_cnt $file_out 2 |
| if {[cancel_job $job_id]} { |
|     fail "Unable to cancel job ($job_id)" |
| } |
| |
| # Run only when requeue_exit_num holds something other than the literal |
| # "(null)" (presumably meaning RequeueExit is configured; verify where the |
| # variable is set). |
| if {$requeue_exit_num ne "(null)"} { |
|     log_info "################################################################" |
|     log_info "Test RequeueExit=#" |
|     log_info "################################################################" |
| |
|     # Clear the id left over from the previous job; otherwise the "== 0" check |
|     # below can never detect a failed submission and the stale job would be reused. |
|     set job_id 0 |
|     spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $exit_script |
|     expect { |
|         -re "Submitted batch job ($number)" { |
|             set job_id $expect_out(1,string) |
|             exp_continue |
|         } |
|         timeout { |
|             fail "sbatch is not responding" |
|         } |
|         eof { |
|             wait |
|         } |
|     } |
| |
|     if { $job_id == 0 } { |
|         fail "sbatch did not submit job" |
|     } |
| |
|     set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
|     if {$run_cluster eq ""} { |
|         fail "Didn't find cluster with running job" |
|     } |
| |
|     if {$run_cluster eq $origin_cluster} { |
|         fail "Requeued job ran on origin cluster. Expected to run on a different cluster" |
|     } |
| |
|     log_info "$run_cluster is running job" |
| |
|     # No explicit requeue is issued in this section; the job is simply expected |
|     # to come back as PENDING on fedc1, fedc2 and fedc3. |
|     foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|         set pend_cluster [wait_for_fed_job $job_id PENDING $sib_cluster] |
|         if {$pend_cluster eq ""} { |
|             fail "Didn't find job in pending state" |
|         } |
|     } |
| |
|     # Give time for states to be updated in the db. |
|     sleep $dbd_delay |
|     check_dbd_states $job_id REQUEUED $origin_cluster 1 |
|     check_dbd_states $job_id PENDING $origin_cluster 1 |
|     #check_dbd_states $job_id REVOKED $run_cluster 1 |
|     foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|         check_ctl_state $job_id PENDING $sib_cluster |
|     } |
| |
|     set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
|     if {$run_cluster eq ""} { |
|         fail "Didn't find cluster with running job" |
|     } |
| |
|     check_restart_cnt $file_out 1 |
|     if {[cancel_job $job_id]} { |
|         fail "Unable to cancel job ($job_id)" |
|     } |
| } |
| |
| # NOTE(review): this guard tests requeue_exit_num, the same variable that |
| # guards the RequeueExit section, although this section exercises |
| # RequeueExitHold. Confirm whether a separate hold variable should be tested. |
| if {$requeue_exit_num ne "(null)"} { |
|     log_info "################################################################" |
|     log_info "Test RequeueExitHold=#" |
|     log_info "################################################################" |
| |
|     # Clear the id left over from the previous job; otherwise the "== 0" check |
|     # below can never detect a failed submission and the stale job would be reused. |
|     set job_id 0 |
|     spawn $sbatch -N$node_count --exclusive -o $file_out -e /dev/null -t1 --requeue $exithold_script |
|     expect { |
|         -re "Submitted batch job ($number)" { |
|             set job_id $expect_out(1,string) |
|             exp_continue |
|         } |
|         timeout { |
|             fail "sbatch is not responding" |
|         } |
|         eof { |
|             wait |
|         } |
|     } |
| |
|     if { $job_id == 0 } { |
|         fail "sbatch did not submit job" |
|     } |
| |
|     set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
|     if {$run_cluster eq ""} { |
|         fail "Didn't find cluster with running job" |
|     } |
| |
|     if {$run_cluster eq $origin_cluster} { |
|         fail "Requeued job ran on origin cluster. Expected to run on a different cluster" |
|     } |
| |
|     log_info "$run_cluster is running job" |
| |
|     # No explicit requeue is issued in this section; the job is expected to |
|     # reach SPECIAL_EXIT on the origin cluster. |
|     set se_cluster [wait_for_fed_job $job_id SPECIAL_EXIT $origin_cluster] |
|     if {$se_cluster eq ""} { |
|         fail "Didn't find job in SE state" |
|     } |
| |
|     check_exit_hold $job_id |
|     # Give time for states to be updated in the db. |
|     sleep $dbd_delay |
| |
|     check_ctl_state $job_id SPECIAL_EXIT $origin_cluster |
|     check_dbd_states $job_id REQUEUED $origin_cluster 1 |
|     check_dbd_states $job_id PENDING $origin_cluster 1 |
| |
|     # release hold on job, siblings will be submitted to all clusters |
|     release_job $job_id |
| |
|     # Each of fedc1, fedc2 and fedc3 must report the job as PENDING before |
|     # check_ctl_state is run against any of them. |
|     foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|         set pend_cluster [wait_for_fed_job $job_id PENDING $sib_cluster] |
|         if {$pend_cluster eq ""} { |
|             fail "Didn't find cluster with pending job" |
|         } |
|     } |
| |
|     foreach sib_cluster [list $fedc1 $fedc2 $fedc3] { |
|         check_ctl_state $job_id PENDING $sib_cluster |
|     } |
| |
|     set run_cluster [wait_for_fed_job $job_id RUNNING $non_origin_clusters] |
|     if {$run_cluster eq ""} { |
|         fail "Didn't find cluster with running job" |
|     } |
| |
|     if {$run_cluster eq $origin_cluster} { |
|         fail "Requeued job ran on origin cluster. Expected to run on a different cluster" |
|     } |
| |
|     check_restart_cnt $file_out 1 |
| |
|     if {[cancel_job $job_id]} { |
|         fail "Unable to cancel job ($job_id)" |
|     } |
| } |