testsuite/expect/test12.2 - SchedMD/slurm - Git at Google

 #!/usr/bin/env expect
 ############################################################################
 # Purpose: Test of Slurm functionality
 #          Test sacct functionality and accuracy.
 ############################################################################
 # Copyright (C) 2005 The Regents of the University of California.
 # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 # Written by Morris Jette <jette1@llnl.gov>
 # CODE-OCEC-09-009. All rights reserved.
 #
 # This file is part of Slurm, a resource management program.
 # For details, see <https://slurm.schedmd.com/>.
 # Please also read the included file: DISCLAIMER.
 #
 # Slurm is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 2 of the License, or (at your option)
 # any later version.
 #
 # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along
 # with Slurm; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 ############################################################################
 source ./globals

 # Files used by the job, plus job/step bookkeeping
 set file_in   "${test_dir}/input"
 set file_out  "${test_dir}/output"
 set file_err  "${test_dir}/error"
 set file_prog "${test_name}.prog"
 set file_stem $test_name
 set job_id    0
 set step_id   0
 set matches   0

 # Job parameters; memory figures are in KiB, job_mem_limit is in MiB
 set mem_size          204800 ;# 200 MiB
 set file_size         10485760
 set sleep_time        32
 set ret_code          42
 set num_tasks         3
 set max_rss_tolerance 10240 ;# 10 MiB
 set ave_rss_tolerance 20480 ;# 20 MiB
 set max_time_error    10
 set job_mem_limit     [expr {($mem_size + $max_rss_tolerance + $ave_rss_tolerance) / 1024}]

 # State and exit code that sacct is expected to report for the step.
 # When KillOnBadExit is 1 the test expects CANCELLED / 0:15 instead of
 # FAILED / ret_code.
 if {[get_config_param "KillOnBadExit"] == 1} {
 	set expected_state    "CANCELLED"
 	set expected_ret_code "0:15"
 } else {
 	set expected_state    "FAILED"
 	set expected_ret_code $ret_code
 }

 #
 # Check requirements
 #
 set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
 if {
     [param_contains $accounting_storage_enforce "nosteps"] ||
     [param_contains $accounting_storage_enforce "nojobs"]
 } {
 	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
 }
 # Only the linux and cgroup gather plugins are accepted
 if {!(
     [get_config_param JobAcctGatherType] == "jobacct_gather/linux" ||
     [get_config_param JobAcctGatherType] == "jobacct_gather/cgroup"
 )} {
 	skip "Job accounting information not gathered on this system"
 }
 if {[get_config_param AccountingStorageType] eq "accounting_storage/none"} {
 	skip "Job accounting information not stored on this system"
 }

 # Make sure some nodes can satisfy the request before going further
 set nodes [get_nodes_by_request "-n$num_tasks --mem=$job_mem_limit -t2"]
 if {[llength $nodes] == 0} {
 	skip "Unable to test with current node configuration"
 }

 # Remove the compiled helper program and the data files that share the
 # test's file stem (.read and .write).
 # NOTE(review): presumably invoked by the test framework when the test ends;
 # nothing in this file calls it directly - confirm against ./globals.
 proc cleanup {} {
 	global file_prog file_stem

 	set leftovers [list $file_prog "${file_stem}.read" "${file_stem}.write"]
 	file delete {*}$leftovers
 }

 # Query the memory accounting of the job step with $prog (the script passes
 # sstat or sacct) and evaluate it.
 #
 # Arguments:
 #   prog - path of the accounting command to run
 # Returns:
 #   A list of dicts, each holding cond (outcome), desc (what was checked)
 #   and diag (detail for a failure). If the output cannot be parsed, only
 #   the format check is returned.
 proc _get_mem {prog} {
 	global float number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks

 	set checks    [list]
 	set max_rss   -1
 	set mem_task  -1
 	set ave_rss   -1
 	set tool_name [file rootname [file tail $prog]]

 	set query  "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"
 	set output [run_command_output -fail $query]

 	# Three numbers, each separated from the next by one non-digit character
 	set pattern [join [list "($number)" "($number)" "($number)"] "\\D"]
 	set parsed  [regexp $pattern $output - max_rss mem_task ave_rss]
 	lappend checks [dict create \
 		cond $parsed \
 		desc "$tool_name should provide the right output format" \
 		diag $output]
 	if {!$parsed} {
 		return $checks
 	}

 	set max_rss [scale_to_ks $max_rss ""]
 	# AveRSS is effectively int(TotRSS / tasks) in Slurm, so it can lose a
 	# little to truncation. Add 1 before it is multiplied back by num_tasks
 	# below, where the product is expected to be at least max_rss.
 	set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""]

 	lappend checks [dict create \
 		cond [tolerance $mem_size $max_rss $max_rss_tolerance] \
 		desc "MaxRSS is within expected tolerance for $tool_name" \
 		diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)"]

 	# ave_rss should be at least max_rss / num_tasks; the tasks that do not
 	# allocate the test memory are assumed to add no more than
 	# ave_rss_tolerance in total.
 	set total_rss [expr {$ave_rss * $num_tasks}]
 	lappend checks [dict create \
 		cond [tolerance $max_rss $total_rss "+$ave_rss_tolerance"] \
 		desc "AveRSS is within expected tolerance for $tool_name" \
 		diag "Observed AveRSS ($ave_rss) is not within tolerance +[expr {$ave_rss_tolerance / $num_tasks}] of expected value ([expr {$max_rss / $num_tasks}])"]

 	return $checks
 }

 # Check the disk read/write accounting reported for the job step.
 #
 # Arguments:
 #   prog - path of the accounting command to run (the script passes sstat
 #          or sacct)
 # Returns:
 #   A list of dicts, each holding cond (outcome), desc (what was checked)
 #   and diag (detail for a failure). If the output cannot be parsed, only
 #   the format check is returned.
 proc _get_file_size {prog} {
 	global number float job_id step_id file_size

 	set checks         [list]
 	set max_disk_write -1
 	set ave_disk_write -1
 	set max_disk_read  -1
 	set ave_disk_read  -1
 	set tool_name      [file rootname [file tail $prog]]

 	set query  "$prog --noheader -p --job=$job_id.$step_id --format MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask --noconvert"
 	set output [run_command_output -fail $query]

 	# Four sizes (a float followed by an optional run of M/G/T letters) and
 	# then two task numbers; fields are separated by one non-digit character.
 	set size_re "($float)(\[MGT\]*)"
 	set task_re "($number)"
 	set pattern [join [list $size_re $size_re $size_re $size_re $task_re $task_re] "\\D"]
 	set parsed [regexp $pattern $output - \
 		max_disk_write scale1 \
 		ave_disk_write scale2 \
 		max_disk_read  scale3 \
 		ave_disk_read  scale4 \
 		w_task r_task]
 	lappend checks [dict create \
 		cond $parsed \
 		desc "$tool_name should provide the right output format" \
 		diag $output]
 	if {!$parsed} {
 		return $checks
 	}

 	# Bring every size to the same unit using the suffix captured with it
 	foreach {size_var unit} [list \
 		max_disk_write $scale1 \
 		ave_disk_write $scale2 \
 		max_disk_read  $scale3 \
 		ave_disk_read  $scale4] {
 		set $size_var [scale_to_megs [set $size_var] $unit]
 	}

 	lappend checks [dict create \
 		cond [expr {$w_task == 1}] \
 		desc "$tool_name should provide MaxDiskWriteTask equal to 1" \
 		diag "$w_task != 1"]

 	lappend checks [dict create \
 		cond [expr {$r_task == 2}] \
 		desc "$tool_name should provide MaxDiskReadTask equal to 2" \
 		diag "$r_task != 2"]

 	lappend checks [dict create \
 		cond [tolerance $max_disk_write $max_disk_read "0.3"] \
 		desc "$tool_name should provide MaxDiskWrite close to MaxDiskRead with 0.3 tolerance" \
 		diag "$max_disk_write is too different from $max_disk_read"]

 	lappend checks [dict create \
 		cond [tolerance $ave_disk_write $ave_disk_read "0.3"] \
 		desc "$tool_name should provide AveDiskWrite close to AveDiskRead with 0.3 tolerance" \
 		diag "$ave_disk_write is too different from $ave_disk_read"]

 	return $checks
 }

 # Run the check procedure $test against $prog through wait_for until every
 # check it returns passes or the wait_for timeout (15) is reached, then
 # report each check of the last run through subtest.
 #
 # Arguments:
 #   test - name of a procedure returning a list of cond/desc/diag dicts
 #   prog - argument handed to that procedure
 # NOTE(review): relies on wait_for evaluating its condition and body in this
 # procedure's scope - confirm against ./globals.
 proc wait_and_subtest {test prog} {
 	set everything_ok false
 	wait_for -timeout 15 {$everything_ok} {
 		set outcomes [$test $prog]

 		# A single failed check is enough to require another attempt
 		set everything_ok true
 		foreach entry $outcomes {
 			if {![dict get $entry cond]} {
 				set everything_ok false
 				break
 			}
 		}
 	}
 	foreach entry $outcomes {
 		set passed  [dict get $entry cond]
 		set summary [dict get $entry desc]
 		set detail  [dict get $entry diag]
 		subtest $passed $summary $detail
 	}
 }

 #
 # Build the helper program.
 # Compilation is not optimized so that its memset is not skipped.
 #
 run_command -fail "$bin_cc -o $file_prog ${file_prog}.c"
 run_command -fail "$bin_chmod 700 $file_prog"

 make_bash_script $file_in "
     $srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem
 "

 # Create a file to read: one "binary format n" record per counter value
 set fd [open ${file_stem}.read "wb"]
 set i 0
 while {$i < $file_size} {
 	puts -nonewline $fd [binary format n $i]
 	incr i
 }
 close $fd

 #
 # Run a simple job
 # Usage: test12.2.prog <exit_code> <sleep_secs> <mem_kb>
 # <file_size> <file_stem>
 #
 set config_prob 0
 set timeout [expr {$max_job_delay + $sleep_time}]
 set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"]

 #
 # While the job is running, check the live statistics from sstat
 #
 wait_for_job -fail $job_id "RUNNING"
 foreach check {_get_mem _get_file_size} {
 	wait_and_subtest $check $sstat
 }

 #
 # Once the job is done, run the same checks against sacct
 #
 wait_for_job -fail $job_id "DONE"
 foreach check {_get_mem _get_file_size} {
 	wait_and_subtest $check $sacct
 }

 # Command prefix shared by every sacct query on the step below
 set sacct_step "$sacct --noheader -P --job=$job_id.$step_id"

 #
 # Report basic sacct info, once the step is no longer shown as COMPLETING
 #
 set output "COMPLETING"
 wait_for -fail {![regexp "COMPLETING" $output]} {
 	set output [run_command_output "$sacct_step --format jobid,jobname,state,exitcode --starttime=00:00"]
 }
 set step_pattern "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code"
 subtest {[regexp $step_pattern $output]} "sacct should report $step_pattern" "$output"

 #
 # Report the sacct accounting info: Elapsed
 #
 set output [run_command_output "$sacct_step --format elapsed --starttime=00:00"]
 set elapsed_secs [convert_time_str $output "secs"]
 subtest {[tolerance $sleep_time $elapsed_secs +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$elapsed_secs too different from $sleep_time"

 #
 # Report the sacct accounting info: TotalCPU
 #
 set output [run_command_output "$sacct_step --format totalcpu --starttime=00:00"]
 set total_cpu_secs [convert_time_str $output "secs"]
 subtest {$total_cpu_secs <= $sleep_time} "TotalCPU reported by sacct should be less than $sleep_time secs" "$total_cpu_secs > $sleep_time"

 #
 # Report the sacct accounting info: MinCPU
 #
 set output [run_command_output "$sacct_step --format mincpu --starttime=00:00"]
 set min_cpu_secs [convert_time_str $output "secs"]
 subtest {$min_cpu_secs <= $sleep_time} "MinCPU reported by sacct should be less than $sleep_time secs" "$min_cpu_secs > $sleep_time"
	#!/usr/bin/env expect
	############################################################################
	# Purpose: Test of Slurm functionality
	# Test sacct functionality and accuracy.
	############################################################################
	# Copyright (C) 2005 The Regents of the University of California.
	# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
	# Written by Morris Jette <jette1@llnl.gov>
	# CODE-OCEC-09-009. All rights reserved.
	#
	# This file is part of Slurm, a resource management program.
	# For details, see <https://slurm.schedmd.com/>.
	# Please also read the included file: DISCLAIMER.
	#
	# Slurm is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 2 of the License, or (at your option)
	# any later version.
	#
	# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
	# details.
	#
	# You should have received a copy of the GNU General Public License along
	# with Slurm; if not, write to the Free Software Foundation, Inc.,
	# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	############################################################################
	source ./globals

	# Files used by the job, plus job/step bookkeeping
	set file_in   "${test_dir}/input"
	set file_out  "${test_dir}/output"
	set file_err  "${test_dir}/error"
	set file_prog "${test_name}.prog"
	set file_stem $test_name
	set job_id    0
	set step_id   0
	set matches   0

	# Job parameters; memory figures are in KiB, job_mem_limit is in MiB
	set mem_size          204800 ;# 200 MiB
	set file_size         10485760
	set sleep_time        32
	set ret_code          42
	set num_tasks         3
	set max_rss_tolerance 10240 ;# 10 MiB
	set ave_rss_tolerance 20480 ;# 20 MiB
	set max_time_error    10
	set job_mem_limit     [expr {($mem_size + $max_rss_tolerance + $ave_rss_tolerance) / 1024}]

	# State and exit code that sacct is expected to report for the step.
	# When KillOnBadExit is 1 the test expects CANCELLED / 0:15 instead of
	# FAILED / ret_code.
	set expected_state    "FAILED"
	set expected_ret_code $ret_code
	if {[get_config_param "KillOnBadExit"] == 1} {
		set expected_state    "CANCELLED"
		set expected_ret_code "0:15"
	}

	#
	# Check requirements
	#
	set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
	# Either flag makes the test impossible, hence a logical OR
	if {[param_contains $accounting_storage_enforce "nosteps"] || [param_contains $accounting_storage_enforce "nojobs"]} {
		skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
	}
	# Only the linux and cgroup gather plugins are accepted
	if {[get_config_param JobAcctGatherType] != "jobacct_gather/linux" &&
	    [get_config_param JobAcctGatherType] != "jobacct_gather/cgroup"} {
		skip "Job accounting information not gathered on this system"
	}
	if {[get_config_param AccountingStorageType] == "accounting_storage/none"} {
		skip "Job accounting information not stored on this system"
	}

	# Make sure some nodes can satisfy the request before going further
	set nodes [get_nodes_by_request "-n$num_tasks --mem=$job_mem_limit -t2"]
	if {![llength $nodes]} {
		skip "Unable to test with current node configuration"
	}

	# Remove the compiled helper program and the data files that share the
	# test's file stem (.read and .write).
	# NOTE(review): presumably invoked by the test framework when the test ends;
	# nothing in this file calls it directly - confirm against ./globals.
	proc cleanup {} {
		global file_prog file_stem

		set leftovers [list $file_prog "${file_stem}.read" "${file_stem}.write"]
		file delete {*}$leftovers
	}

	# Query the memory accounting of the job step with $prog (the script passes
	# sstat or sacct) and evaluate it.
	#
	# Arguments:
	#   prog - path of the accounting command to run
	# Returns:
	#   A list of dicts, each holding cond (outcome), desc (what was checked)
	#   and diag (detail for a failure). If the output cannot be parsed, only
	#   the format check is returned.
	proc _get_mem {prog} {
		global float number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks

		set checks    [list]
		set max_rss   -1
		set mem_task  -1
		set ave_rss   -1
		set tool_name [file rootname [file tail $prog]]

		set query  "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"
		set output [run_command_output -fail $query]

		# Three numbers, each separated from the next by one non-digit character
		set pattern [join [list "($number)" "($number)" "($number)"] "\\D"]
		set parsed  [regexp $pattern $output - max_rss mem_task ave_rss]
		lappend checks [dict create \
			cond $parsed \
			desc "$tool_name should provide the right output format" \
			diag $output]
		if {!$parsed} {
			return $checks
		}

		set max_rss [scale_to_ks $max_rss ""]
		# AveRSS is effectively int(TotRSS / tasks) in Slurm, so it can lose a
		# little to truncation. Add 1 before it is multiplied back by num_tasks
		# below, where the product is expected to be at least max_rss.
		set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""]

		lappend checks [dict create \
			cond [tolerance $mem_size $max_rss $max_rss_tolerance] \
			desc "MaxRSS is within expected tolerance for $tool_name" \
			diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)"]

		# ave_rss should be at least max_rss / num_tasks; the tasks that do not
		# allocate the test memory are assumed to add no more than
		# ave_rss_tolerance in total.
		set total_rss [expr {$ave_rss * $num_tasks}]
		lappend checks [dict create \
			cond [tolerance $max_rss $total_rss "+$ave_rss_tolerance"] \
			desc "AveRSS is within expected tolerance for $tool_name" \
			diag "Observed AveRSS ($ave_rss) is not within tolerance +[expr {$ave_rss_tolerance / $num_tasks}] of expected value ([expr {$max_rss / $num_tasks}])"]

		return $checks
	}

	# Check the disk read/write accounting reported for the job step.
	#
	# Arguments:
	#   prog - path of the accounting command to run (the script passes sstat
	#          or sacct)
	# Returns:
	#   A list of dicts, each holding cond (outcome), desc (what was checked)
	#   and diag (detail for a failure). If the output cannot be parsed, only
	#   the format check is returned.
	proc _get_file_size {prog} {
		global number float job_id step_id file_size

		set result          [list]
		set max_disk_write  -1
		set ave_disk_write  -1
		set max_disk_read   -1
		set ave_disk_read   -1
		set prog_base [file rootname [file tail $prog]]

		set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask --noconvert"]

		# Four sizes followed by two task numbers, fields separated by one
		# non-digit character. The unit letter after each size is optional
		# (zero or more of M/G/T), so values printed without a suffix are
		# accepted and yield an empty scale.
		dict set subtest cond [regexp "($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($number)\\D($number)" $output \
				       - \
				       max_disk_write scale1 \
				       ave_disk_write scale2 \
				       max_disk_read  scale3 \
				       ave_disk_read  scale4 \
				       w_task r_task]
		dict set subtest desc "$prog_base should provide the right output format"
		dict set subtest diag "$output"
		lappend result $subtest
		if {![dict get $subtest cond]} {
			return $result
		}

		# Bring every size to the same unit using the suffix captured with it
		set max_disk_write [scale_to_megs $max_disk_write $scale1]
		set ave_disk_write [scale_to_megs $ave_disk_write $scale2]
		set max_disk_read  [scale_to_megs $max_disk_read  $scale3]
		set ave_disk_read  [scale_to_megs $ave_disk_read  $scale4]

		dict set subtest cond [expr {$w_task == 1}]
		dict set subtest desc "$prog_base should provide MaxDiskWriteTask equal to 1"
		dict set subtest diag "$w_task != 1"
		lappend result $subtest

		dict set subtest cond [expr {$r_task == 2}]
		dict set subtest desc "$prog_base should provide MaxDiskReadTask equal to 2"
		dict set subtest diag "$r_task != 2"
		lappend result $subtest

		dict set subtest cond [tolerance $max_disk_write $max_disk_read "0.3"]
		dict set subtest desc "$prog_base should provide MaxDiskWrite close to MaxDiskRead with 0.3 tolerance"
		dict set subtest diag "$max_disk_write is too different from $max_disk_read"
		lappend result $subtest

		dict set subtest cond [tolerance $ave_disk_write $ave_disk_read "0.3"]
		dict set subtest desc "$prog_base should provide AveDiskWrite close to AveDiskRead with 0.3 tolerance"
		dict set subtest diag "$ave_disk_write is too different from $ave_disk_read"
		lappend result $subtest

		return $result
	}

	# Run the check procedure $test against $prog through wait_for until every
	# check it returns passes or the wait_for timeout (15) is reached, then
	# report each check of the last run through subtest.
	#
	# Arguments:
	#   test - name of a procedure returning a list of cond/desc/diag dicts
	#   prog - argument handed to that procedure
	# NOTE(review): relies on wait_for evaluating its condition and body in this
	# procedure's scope - confirm against ./globals.
	proc wait_and_subtest {test prog} {
		set everything_ok false
		wait_for -timeout 15 {$everything_ok} {
			set outcomes [$test $prog]

			# A single failed check is enough to require another attempt
			set everything_ok true
			foreach entry $outcomes {
				if {![dict get $entry cond]} {
					set everything_ok false
					break
				}
			}
		}
		foreach entry $outcomes {
			set passed  [dict get $entry cond]
			set summary [dict get $entry desc]
			set detail  [dict get $entry diag]
			subtest $passed $summary $detail
		}
	}

	#
	# Build the helper program.
	# Compilation is not optimized so that its memset is not skipped.
	#
	run_command -fail "$bin_cc -o $file_prog ${file_prog}.c"
	run_command -fail "$bin_chmod 700 $file_prog"

	make_bash_script $file_in "
	$srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem
	"

	# Create a file to read: one "binary format n" record per counter value
	set fd [open ${file_stem}.read "wb"]
	set i 0
	while {$i < $file_size} {
		puts -nonewline $fd [binary format n $i]
		incr i
	}
	close $fd

	#
	# Run a simple job
	# Usage: test12.2.prog <exit_code> <sleep_secs> <mem_kb>
	# <file_size> <file_stem>
	#
	set config_prob 0
	set timeout [expr {$max_job_delay + $sleep_time}]
	set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"]

	#
	# While the job is running, check the live statistics from sstat
	#
	wait_for_job -fail $job_id "RUNNING"
	foreach check {_get_mem _get_file_size} {
		wait_and_subtest $check $sstat
	}

	#
	# Once the job is done, run the same checks against sacct
	#
	wait_for_job -fail $job_id "DONE"
	foreach check {_get_mem _get_file_size} {
		wait_and_subtest $check $sacct
	}

	# Command prefix shared by every sacct query on the step below
	set sacct_step "$sacct --noheader -P --job=$job_id.$step_id"

	#
	# Report basic sacct info, once the step is no longer shown as COMPLETING
	#
	set output "COMPLETING"
	wait_for -fail {![regexp "COMPLETING" $output]} {
		set output [run_command_output "$sacct_step --format jobid,jobname,state,exitcode --starttime=00:00"]
	}
	set step_pattern "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code"
	subtest {[regexp $step_pattern $output]} "sacct should report $step_pattern" "$output"

	#
	# Report the sacct accounting info: Elapsed
	#
	set output [run_command_output "$sacct_step --format elapsed --starttime=00:00"]
	set elapsed_secs [convert_time_str $output "secs"]
	subtest {[tolerance $sleep_time $elapsed_secs +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$elapsed_secs too different from $sleep_time"

	#
	# Report the sacct accounting info: TotalCPU
	#
	set output [run_command_output "$sacct_step --format totalcpu --starttime=00:00"]
	set total_cpu_secs [convert_time_str $output "secs"]
	subtest {$total_cpu_secs <= $sleep_time} "TotalCPU reported by sacct should be less than $sleep_time secs" "$total_cpu_secs > $sleep_time"

	#
	# Report the sacct accounting info: MinCPU
	#
	set output [run_command_output "$sacct_step --format mincpu --starttime=00:00"]
	set min_cpu_secs [convert_time_str $output "secs"]
	subtest {$min_cpu_secs <= $sleep_time} "MinCPU reported by sacct should be less than $sleep_time secs" "$min_cpu_secs > $sleep_time"