blob: 1834f6d982b4a229268b6a03decc3431799f60f4 [file] [log] [blame]
#!/usr/bin/expect
############################################################################
# Purpose: Test of BLUEGENE SLURM functionality
# Submit batch jobs at a range of node counts and wait for them to finish.
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2006 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble <da@llnl.gov>
# UCRL-CODE-226842.
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.llnl.gov/linux/slurm/>.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Identification and bookkeeping for this test.
set test_id     "8.6"
set exit_code   0
set file_in     "test$test_id.input"
set job_id      0
set matches     0

# Number of jobs submitted at each node count (passed to run_batch_jobs
# as its job_cnt argument).
set 32node_block_cnt 16
set 128node_block_cnt 8
set 512node_block_cnt 8
set 1knode_block_cnt 8
set 4knode_block_cnt 8
set 8knode_block_cnt 8
set 16knode_block_cnt 8
set 32knode_block_cnt 8

# job parameters
set sleep_time 5

print_header $test_id

# Nothing to test unless test_bluegene reports a bluegene system; leave
# with exit_code, which is still 0 at this point.
if {[test_bluegene] == 0} {
	send_user "\nWARNING: This test is only compatable with bluegene systems\n"
	exit $exit_code
}

#
# Delete left-over input script files
# Build input script file
#
exec $bin_rm -f $file_in
exec echo "#!$bin_bash" >$file_in
exec echo "$bin_sleep $sleep_time " >>$file_in
exec $bin_chmod 700 $file_in

#
# Expect timeout: the configured job delay plus the run time of the job
# itself.  The expression is braced so the operands are substituted only
# once, by expr.
#
set timeout [expr {$max_job_delay + $sleep_time}]
#
# Submit job_cnt batch jobs through srun, one spawn per job.
#
# node_cnt - node count, handed to srun as -N<node_cnt>-<node_cnt>
# job_cnt  - number of srun submissions to attempt
# file_in  - script submitted by each srun
#
# Returns  1 if the number of "jobid ... submitted" acknowledgements
#            equals job_cnt,
#          0 if it does not, or if srun stops responding,
#         -1 as soon as srun answers "More processors requested than
#            permitted" (remaining submissions are skipped).
#
proc run_batch_jobs { node_cnt job_cnt file_in } {
	global srun number kill_srun

	set submitted 0
	for {set job 0} {$job < $job_cnt} {incr job} {
		set submit_pid [spawn $srun --batch --output=/dev/null -t2 -N$node_cnt-$node_cnt $file_in]
		expect {
			-re "More processors requested than permitted" {
				send_user "This error was expected, no worries\n"
				return -1
			}
			-re "jobid ($number) submitted" {
				incr submitted
				exp_continue
			}
			-re "Unable to contact" {
				# Reported but not fatal here: the submission simply
				# goes uncounted, so the final comparison fails.
				send_user "\nFAILURE: slurm appears to be down\n"
				exp_continue
			}
			timeout {
				send_user "\nFAILURE: srun not responding\n"
				slow_kill $submit_pid
				return 0
			}
			eof {
				wait
			}
		}
	}

	# 1 when every submission was acknowledged, 0 otherwise.
	return [expr {$submitted == $job_cnt}]
}
#
# Poll squeue every 15 seconds, at most 60 times (900 seconds in all),
# counting how often job_name (used as a regular expression) matches
# the output of "squeue -o %j".
#
# Returns 0 once nothing matches; otherwise the remaining job count,
# either when two consecutive polls see the same count or when the
# polls run out.
#
proc wait_for_all_jobs { job_name } {
	global squeue bin_sleep

	send_user "Waiting for all jobs to terminate\n"
	set previous 0
	for {set poll 0} {$poll < 60} {incr poll} {
		exec $bin_sleep 15

		# Count matches with logging off so squeue output is not echoed.
		set matches 0
		log_user 0
		spawn $squeue -o %j
		expect {
			-re "$job_name" {
				incr matches
				exp_continue
			}
			eof {
				wait
			}
		}
		log_user 1

		if {$matches == 0} {
			send_user "All jobs complete\n"
			return 0
		}
		send_user " $matches jobs remaining\n"

		# No change since the previous poll: stop waiting and report.
		if {$matches == $previous} {
			return $matches
		}
		set previous $matches
	}
	return $matches
}
#
# Submit jobs at each node count, stepping up from 32 to 32k, back down
# to 32, and then 512 once more.  For 32 through 1k any result other
# than 1 is a failure.  For 4k and above only a result of 0 is a
# failure: run_batch_jobs returns -1 when srun rejects the request with
# "More processors requested than permitted", which is tolerated there.
#
if {[run_batch_jobs 32 $32node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 32 cnodes can't be created\n"
	set exit_code 1
}
# Uses the 128-node job count (previously the 32-node count was passed).
if {[run_batch_jobs 128 $128node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 128 cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 512 $512node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 512 cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 1k $1knode_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 1k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 4k $4knode_block_cnt $file_in ] == 0} {
	send_user "\nFAILURE: 4k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 8k $8knode_block_cnt $file_in ] == 0} {
	send_user "\nFAILURE: 8k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 16k $16knode_block_cnt $file_in ] == 0} {
	send_user "\nFAILURE: 16k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 32k $32knode_block_cnt $file_in ] == 0} {
	send_user "\nFAILURE: 32k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 1k $1knode_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 1k cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 512 $512node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 512 cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 128 $128node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 128 cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 32 $32node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 32 cnodes can't be created\n"
	set exit_code 1
}
if {[run_batch_jobs 512 $512node_block_cnt $file_in ] != 1} {
	send_user "\nFAILURE: 512 cnodes can't be created\n"
	set exit_code 1
}

# The submitted jobs are named after the input script, so that name is
# what wait_for_all_jobs looks for in the queue.
if {[wait_for_all_jobs $file_in] != 0} {
	send_user "\nFAILURE: some submitted jobs failed to terminate\n"
	set exit_code 1
}

# Remove the input script only on success; on failure it is left in
# place.  $bin_rm is used for consistency with the earlier cleanup.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in
	send_user "\nSUCCESS\n"
}
exit $exit_code