| #!/usr/bin/expect |
| ############################################################################ |
| # Purpose: Test of BLUEGENE SLURM functionality |
| # Submit batch jobs requesting blocks of many different sizes |
| # (32 c-nodes up to 32k c-nodes) and verify they run to completion. |
| # |
| # Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR |
| # "FAILURE: ..." otherwise with an explanation of the failure, OR |
| # anything else indicates a failure mode that must be investigated. |
| ############################################################################ |
| # Copyright (C) 2006 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Danny Auble <da@llnl.gov> |
| # UCRL-CODE-226842. |
| # |
| # This file is part of SLURM, a resource management program. |
| # For details, see <http://www.llnl.gov/linux/slurm/>. |
| # |
| # SLURM is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with SLURM; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
source ./globals

# Test identity and bookkeeping
set test_id     "8.6"
set exit_code   0
set file_in     "test$test_id.input"
set job_id      0
set matches     0

# Number of batch jobs to submit for each block size (in c-nodes)
set 32node_block_cnt    16
set 128node_block_cnt   8
set 512node_block_cnt   8
set 1knode_block_cnt    8
set 4knode_block_cnt    8
set 8knode_block_cnt    8
set 16knode_block_cnt   8
set 32knode_block_cnt   8

# Job parameters: every job just sleeps for this many seconds
set sleep_time  5

print_header $test_id

# This test only applies to Blue Gene systems; elsewhere exit without failing
if {[test_bluegene] == 0} {
    send_user "\nWARNING: This test is only compatable with bluegene systems\n"
    exit $exit_code
}

#
# Delete any left-over input script file, then build a fresh one:
# a bash script that sleeps for sleep_time seconds
#
exec $bin_rm -f $file_in
set script_fd [open $file_in w]
puts $script_fd "#!$bin_bash"
puts $script_fd "$bin_sleep $sleep_time "
close $script_fd
exec $bin_chmod 700 $file_in

#
# Allow for the job's own run time on top of the normal job delay
#
set timeout [expr {$max_job_delay + $sleep_time}]
| |
#
# Submit a series of batch jobs, each requesting exactly node_cnt c-nodes.
#
# node_cnt - node count, passed to srun as -N<node_cnt>-<node_cnt>
# job_cnt  - number of jobs to submit
# file_in  - script file that each job runs
#
# Returns  1 if every one of the job_cnt submissions reported a job id,
#          0 if srun stopped responding or fewer than job_cnt jobs were
#            accepted,
#         -1 if srun reported "More processors requested than permitted"
#            (submission stops at the first such report).
#
proc run_batch_jobs { node_cnt job_cnt file_in } {
    global srun number

    set start_cnt 0
    for {set inx 0} {$inx < $job_cnt} {incr inx} {
        set srun_pid [spawn $srun --batch --output=/dev/null -t2 -N$node_cnt-$node_cnt $file_in]
        expect {
            -re "More processors requested than permitted" {
                send_user "This error was expected, no worries\n"
                # Release the spawned srun before leaving; otherwise its
                # spawn id and process slot are never reclaimed.
                catch {close}
                catch {wait -nowait}
                return -1
            }
            -re "jobid ($number) submitted" {
                incr start_cnt
                exp_continue
            }
            -re "Unable to contact" {
                send_user "\nFAILURE: slurm appears to be down\n"
                exp_continue
            }
            timeout {
                send_user "\nFAILURE: srun not responding\n"
                slow_kill $srun_pid
                # Reap the killed srun without blocking on it.
                catch {close}
                catch {wait -nowait}
                return 0
            }
            eof {
                wait
            }
        }
    }

    # Success only if every submission was acknowledged with a job id
    if {$start_cnt != $job_cnt} {
        return 0
    }
    return 1
}
| |
#
# Wait for all jobs whose name matches job_name to leave the queue.
#
# squeue is polled every 15 seconds, at most 60 times (about 900 seconds).
# Polling stops early when two consecutive polls report the same non-zero
# number of matching jobs, i.e. no job finished in between.
#
# job_name - regular expression matched against the job names from squeue
#
# Returns  0 if no matching job remains,
#          the number of matching jobs seen by the last poll otherwise,
#         -1 if squeue stopped responding.
#
proc wait_for_all_jobs { job_name } {
    global squeue bin_sleep

    set last_matches 0
    send_user "Waiting for all jobs to terminate\n"
    for {set inx 0} {$inx < 60} {incr inx} {
        exec $bin_sleep 15
        set matches 0
        log_user 0
        set squeue_pid [spawn $squeue -o %j]
        expect {
            -re "$job_name" {
                incr matches
                exp_continue
            }
            timeout {
                # Without this clause a hung squeue would leave matches at
                # zero and be reported as "All jobs complete".
                log_user 1
                send_user "\nFAILURE: squeue not responding\n"
                slow_kill $squeue_pid
                catch {close}
                catch {wait -nowait}
                return -1
            }
            eof {
                wait
            }
        }
        log_user 1
        if {$matches == 0} {
            send_user "All jobs complete\n"
            return 0
        }
        send_user "  $matches jobs remaining\n"
        # No progress since the previous poll: give up and report the backlog
        if {$matches == $last_matches} {
            return $matches
        }
        set last_matches $matches
    }
    return $matches
}
| |
#
# Submit rounds of batch jobs, stepping the block size up from 32 c-nodes
# to 32k c-nodes, back down to 32, and finally to 512 again.
#
# Each row is: block size, number of jobs, oversize_ok.
# When oversize_ok is 1, a -1 result from run_batch_jobs (srun reported
# "More processors requested than permitted") is not treated as a failure;
# when it is 0, only a result of 1 is accepted.
#
set block_requests [list \
    32   $32node_block_cnt   0 \
    128  $128node_block_cnt  0 \
    512  $512node_block_cnt  0 \
    1k   $1knode_block_cnt   0 \
    4k   $4knode_block_cnt   1 \
    8k   $8knode_block_cnt   1 \
    16k  $16knode_block_cnt  1 \
    32k  $32knode_block_cnt  1 \
    1k   $1knode_block_cnt   0 \
    512  $512node_block_cnt  0 \
    128  $128node_block_cnt  0 \
    32   $32node_block_cnt   0 \
    512  $512node_block_cnt  0 \
]

foreach {block_size block_cnt oversize_ok} $block_requests {
    set result [run_batch_jobs $block_size $block_cnt $file_in]
    if {$result == 1 || ($oversize_ok && $result == -1)} {
        continue
    }
    send_user "\nFAILURE: $block_size cnodes can't be created\n"
    set exit_code 1
}

# Every submitted job must leave the queue before the test can pass
if {[wait_for_all_jobs $file_in] != 0} {
    send_user "\nFAILURE: some submitted jobs failed to terminate\n"
    set exit_code 1
}

# The input script is removed only on success; on failure it is left behind
if {$exit_code == 0} {
    exec $bin_rm -f $file_in
    send_user "\nSUCCESS\n"
}
exit $exit_code