|  | #!/usr/bin/env expect | 
|  | ############################################################################ | 
|  | # Purpose: Test of Slurm functionality | 
|  | #          Validate that sacct -D shows correct job steps and states | 
|  | #          when a job is requeued | 
|  | ############################################################################ | 
|  | # Copyright (C) SchedMD LLC. | 
|  | # | 
|  | # This file is part of Slurm, a resource management program. | 
|  | # For details, see <https://slurm.schedmd.com/>. | 
|  | # Please also read the included file: DISCLAIMER. | 
|  | # | 
|  | # Slurm is free software; you can redistribute it and/or modify it under | 
|  | # the terms of the GNU General Public License as published by the Free | 
|  | # Software Foundation; either version 2 of the License, or (at your option) | 
|  | # any later version. | 
|  | # | 
|  | # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | # details. | 
|  | # | 
|  | # You should have received a copy of the GNU General Public License along | 
|  | # with Slurm; if not, write to the Free Software Foundation, Inc. | 
|  | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | ############################################################################ | 
|  | source ./globals | 
|  |  | 
# Test-wide state: job_id stays 0 until a job has been submitted; node is the
# single node the job is pinned to and whose state is toggled later.
set job_id 0
set node   [get_nodes_by_request -fail "-t1 --exclusive"]

# Job and step records must both be stored for the sacct checks to work.
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
foreach enforce_flag {nosteps nojobs} {
	if {[param_contains $accounting_storage_enforce $enforce_flag]} {
		skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
		break
	}
}

# Changing node state and requeueing the job need elevated privileges.
if {![is_super_user]} {
	skip "Test can only be run as SlurmUser"
}
|  |  | 
# Test-suite cleanup hook: cancel the job submitted by this test.
# job_id is still 0 when no job was submitted.
# NOTE(review): the node state is not restored here; if the test aborts
# between the "down" and "resume" updates the node may be left down - confirm.
proc cleanup {} {
	cancel_job $::job_id
}
|  |  | 
# Set the state of the test node through "scontrol update".
#
# Arguments:
#   state  - node state to request (this test uses "down" and "resume")
#   reason - text passed as the reason= value
#
# Fails the test when scontrol reports "Invalid node state", except for a
# "resume" request with ReturnToService=2, where that error is tolerated.
proc mod_state { state reason } {
	global scontrol node

	set result [run_command_output "$scontrol update nodename=$node state=$state reason=$reason"]

	# Nothing to do unless scontrol rejected the requested state
	if {![regexp "Invalid node state" $result]} {
		return
	}

	# NOTE(review): presumably with ReturnToService=2 the node has already
	# returned to service on its own, so the resume is rejected - confirm.
	if {$state eq "resume" && [get_config_param "ReturnToService"] == 2} {
		log_warn "This error is expected, no worries"
		return
	}

	fail "Problem changing node state"
}
|  |  | 
# Verify how many batch step records sacct reports for the test job.
#
# Arguments:
#   num - expected number of "batch" entries in the sacct output
#
# Queries sacct with -D (duplicates) for <job_id>.batch and records a subtest
# comparing the number of "batch" occurrences against num.
proc check_step { num } {
	set query "$::sacct --job=${::job_id}.batch -D --start=now-15minutes --noheader --format=jobid -P"
	set listing [run_command_output -fail $query]
	set steps   [regexp -all {batch} $listing]

	subtest {$num == $steps} "Check number of steps" "$steps != $num"
}
|  |  | 
# Count the sacct records (job and step rows) of the test job whose State
# column starts with the given state.
#
# Arguments:
#   states - state text to look for directly after the "JobID|" column; it is
#            inserted into the regular expression as-is and is not anchored
#            at the end, so it matches as a prefix of the State column
#
# Returns the number of matching rows in the sacct output.
proc check_sacct_states {states} {
	global job_id sacct

	# This test will requeue jobs making those jobs be eligible in the
	# future from sacct's perspective.  Since sacct only shows eligible
	# jobs we have to specify end in the future.
	set output [run_command_output -fail "$sacct --job=$job_id --duplicates --parsable2 --start=now-15minutes --end=tomorrow --noheader -o JobID,State"]

	# NOTE: Skip "extern" job container optionally spawned by "PrologFlags=contain"
	#
	# The fixed part of the pattern is brace-quoted so that "\." and "\|"
	# reach the regex engine unchanged.  Inside a double-quoted string Tcl
	# reduces "\." to a bare ".", which would match any character instead
	# of the literal dot that separates the job ID from the step name.
	set pattern {[0-9_]+(\.(?!extern)[a-z]+)*\|}
	append pattern $states

	return [regexp -all -- $pattern $output]
}
|  |  | 
# sacct can only be validated when accounting goes through slurmdbd
set storage_type [get_config_param "AccountingStorageType"]
if {$storage_type ne "accounting_storage/slurmdbd"} {
	skip "Not using accounting_storage/slurmdbd"
}
|  |  | 
# Record a subtest that sacct reports exactly $expected rows for $state.
proc subtest_sacct_state_count { state expected } {
	set count [check_sacct_states $state]
	subtest {$count == $expected} "Test $state count" "$count != $expected"
}

#
# Test 1: submit a requeueable job and force a requeue by downing its node
#
log_info "Test 1"
set job_id [submit_job -fail "-N1 -w$node --exclusive -o/dev/null --requeue --wrap='$bin_sleep 20'"]

wait_for_job -fail $job_id "RUNNING"

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5

# Take the node the job is running on down
mod_state down $test_name

# Let the node state change settle
sleep 5

# Bring the node back
mod_state resume $test_name

#
# Test 2: the job is pending again after the node failure
#
log_info "Test 2"
wait_for_job -fail $job_id "PENDING"

# Let the state changes propagate to the database before querying sacct
sleep 5

# Expected: the original job record is NODE_FAIL, its batch step is
# CANCELLED and the requeued job record is PENDING
foreach {state expected} {NODE_FAIL 1 CANCELLED 1 PENDING 1} {
	subtest_sacct_state_count $state $expected
}

wait_for_job -fail $job_id "RUNNING"

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5

#
# Test 3: the requeued job and its batch step are now both RUNNING
#
log_info "Test 3"
foreach {state expected} {NODE_FAIL 1 CANCELLED 1 RUNNING 2} {
	subtest_sacct_state_count $state $expected
}

# Explicitly requeue the job
run_command -fail "$scontrol requeue $job_id"

# Give the requeue some time, then make sure the job is pending again
sleep 8
wait_for_job -fail $job_id "PENDING"

# Let the state changes propagate to the database before querying sacct
sleep 5

#
# Test 4: a REQUEUE record was added and the first and second batch steps
# both show CANCELLED
#
log_info "Test 4"
foreach {state expected} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 PENDING 1} {
	subtest_sacct_state_count $state $expected
}

wait_for_job -fail $job_id "RUNNING"

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5

# After the requeue there should be 3 batch steps: the first 2 that are
# CANCELLED plus the one that is running now
check_step 3

#
# Test 5: the job and its batch step are RUNNING again
#
log_info "Test 5"
foreach {state expected} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 RUNNING 2} {
	subtest_sacct_state_count $state $expected
}

wait_for_job -fail $job_id "DONE"

# The step count must be unchanged once the job has completed
check_step 3

#
# Test 6: final states after completion
#
log_info "Test 6"
foreach {state expected} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 COMPLETED 2} {
	subtest_sacct_state_count $state $expected
}