#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test hdf5 acct_gather_profile (--profile=task)
############################################################################
# Copyright (C) 2013 Bull S. A. S.
# Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois.
#
# Written by Rod Schultz <rod.schultz@bull.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Job id of the srun step under test; 0 until srun reports one.
set job_id    0
# Binary built from "$file_prog.c" and the HDF5 file removed by cleanup.
set file_prog "${test_name}.prog"
set hdf5_file "${test_name}.h5"
# CSV file written by the sh5util extract and parsed at the end of the test.
set file_out  "$test_dir/output"
# cleanup - remove the files this test leaves in the working directory.
#
# Takes no arguments and returns nothing useful. Reads the global
# file_prog and hdf5_file names. Files that do not exist are silently
# ignored by "file delete".
# NOTE(review): presumably invoked by the testsuite framework when the
# test ends - confirm in ./globals.
proc cleanup {} {
	foreach leftover [list $::file_prog $::hdf5_file] {
		file delete $leftover
	}
}
#
# Check if acct_gather_profile/hdf5 is installed.
# The scontrol output is hidden (log_user 0) while it is scanned for the
# plugin name. The test is skipped when the plugin is not configured.
#
set profile 0
log_user 0
spawn $scontrol show config
expect {
	-re "acct_gather_profile/hdf5" {
		set profile 1
		# Keep consuming output so the eof branch runs and the
		# spawned scontrol process is reaped instead of being
		# left behind for the rest of the test.
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$profile == 0} {
	skip "acct_gather_profile/hdf5 not installed on this system"
}
log_info "Acct_gather_profile/hdf5 plugin installed"
log_debug "Note: this test takes 3 minutes to run"

# The range checks made on the extracted samples assume a reasonably long
# polling interval; warn when the configured one is shorter than 30.
set task_freq [get_job_acct_freq]
if {$task_freq < 30} {
	log_warn "jobacct_gather_freq < 30 ($task_freq), results are unreliable"
}
#
# Compile the helper program that puts a known load on the system.
# Any stale binary is removed first; the fresh one is made executable
# for the owner only (mode 700).
#
set prog_source "$file_prog.c"
exec $bin_rm -f $file_prog
exec $bin_cc -I$build_dir $prog_source -lm -o $file_prog
exec $bin_chmod 700 $file_prog
#
# Run the load program under srun with task profiling enabled and record
# the job id from the "SLURM_JobId=<n>" line in the output.
#
set timeout [expr {$max_job_delay + 200}]
spawn $srun --acctg-freq=$task_freq --profile=task -t5 ./$file_prog
expect {
	-re "SLURM_JobId=($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "error: PROFILE: Impossible to create the table" {
		# Use the configured rm, as the build step does
		exec $bin_rm -f $file_prog
		skip "This error is cause by a bug in HDF5 1.10.0, and is fixed in 1.10.1. Please upgrade to run this test"
	}
	-re "error:" {
		fail "Something happened on start of job"
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}
# Without a job id the sh5util steps that follow would silently operate
# on job 0, so stop here with a clear message instead.
if {$job_id == 0} {
	fail "srun did not report a job id"
}
#
# Merge the per-node profile data of the job into $hdf5_file, then extract
# the task time series from that merged file into the CSV file $file_out.
#
# The merged file is named explicitly: sh5util would otherwise write
# ./job_<jobid>.h5, which cleanup does not remove, while the $hdf5_file
# that cleanup does remove would never be created.
#
set timeout 10
spawn $sh5util -j $job_id -o $hdf5_file
expect {
	"Output file generated is empty" {
		fail "sh5util merge didn't make anything"
	}
	timeout {
		fail "sh5util merge not responding"
	}
	eof {
		wait
	}
}

spawn $sh5util -j $job_id -E -i $hdf5_file -l Node:TimeSeries -s Tasks -o $file_out
expect {
	"Output file generated is empty" {
		fail "sh5util extract didn't make anything"
	}
	timeout {
		fail "sh5util extract not responding"
	}
	eof {
		wait
	}
}
#
# Validate the extracted CSV file.
# The first non-blank row is the header, used to locate the ElapsedTime,
# CPUUtilization and ReadMB columns. Every later row is one sample whose
# poll interval, CPU utilization and read volume are range checked.
# Out-of-range values are only counted; the test fails when more than 3
# are seen.
#
set nerr 0
set lno 0
set row 0
set last_et 0
set et_col -1
set cpu_util_col -1
set read_disk_col -1

set fd [open $file_out "r"]
while {[gets $fd line] != -1} {
	# lno is the physical line number (used in messages), row counts
	# only the non-blank lines so a stray empty line cannot be
	# mistaken for the header or for a sample.
	incr lno
	if {[string length [string trim $line]] == 0} {
		continue
	}
	incr row

	set tokens [split $line ","]
	if {$row == 1} {
		set et_col [lsearch $tokens "ElapsedTime"]
		set cpu_util_col [lsearch $tokens "CPUUtilization"]
		set read_disk_col [lsearch $tokens "ReadMB"]
		if {$et_col == -1} {
			fail "No ElapsedTime column found"
		}
		if {$cpu_util_col == -1} {
			fail "No CPUUtilization column found"
		}
		if {$read_disk_col == -1} {
			fail "No ReadMB column found"
		}
		continue
	}

	# Interval covered by this sample = elapsed time delta to the
	# previous sample. Expressions are braced so that file content is
	# only ever treated as a value, never re-parsed as an expression.
	set et [lindex $tokens $et_col]
	set cur_et [expr {$et - $last_et}]
	set last_et $et

	# The first sample only seeds last_et and is not range checked
	# (presumably because it does not span a full poll interval).
	if {$row == 2} {
		continue
	}

	if {$cur_et < $task_freq} {
		log_warn "Poll interval was only $cur_et instead of expected $task_freq on line $lno"
		incr nerr
	}

	set cputil [lindex $tokens $cpu_util_col]
	# The range on cpu utilization is pretty wide.
	# Linux accounting resolution is only to one second, so in a
	# typical 30 second interval an extra second is 3%. The burn loop
	# consumes a bit more than asked for. There is additional time
	# managing the I/O portion. Slurm and linux also consume some
	# cpu.
	if {$cputil < 38.0 || $cputil > 47.0} {
		log_warn "CPU Busy $cputil not near 40% on line $lno"
		incr nerr
	}

	# Expected read volume is 10 MB per unit of elapsed time, with a
	# tolerance of +/- 2.5%.
	set rdmb [lindex $tokens $read_disk_col]
	set exp_rd [expr {10 * $cur_et}]
	set low_rd [expr {0.975 * $exp_rd}]
	set hi_rd [expr {1.025 * $exp_rd}]
	if {$rdmb < $low_rd || $rdmb > $hi_rd} {
		log_warn "Read Megabytes $rdmb not near $exp_rd on line $lno"
		incr nerr
	}
}
close $fd

if {$row == 0} {
	fail "Output file ($file_out) is empty"
}
if {$nerr > 3} {
	fail "Too many values out of range ($nerr)"
}