############################################################################
# Copyright (C) SchedMD LLC.
############################################################################
import atf
import pytest


node_count = 2
slurm_user = atf.properties["slurm-user"]


# Setup
@pytest.fixture(scope="module", autouse=True)
def setup():
    atf.require_nodes(node_count)
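    # A 5 second SlurmdTimeout makes slurmctld mark unresponsive nodes DOWN quickly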
    atf.require_config_parameter("SlurmdTimeout", 5)
    atf.require_slurm_running()


@pytest.fixture(scope="function")
def no_kill_job(request):
    """Submit a job with or without --no-kill, and resume the downed node afterwards"""

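    # no_kill is set via indirect parametrization in test_no_kill below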
    no_kill = request.param
    if no_kill:
        no_kill_arg = "--no-kill "
    else:
        no_kill_arg = ""
    job_id = atf.submit_job_sbatch(
        f"{no_kill_arg} -N{node_count} --wrap 'sleep 300'", fatal=True
    )
    atf.wait_for_job_state(job_id, "RUNNING", fatal=True)

    # Get an allocated node that is not the BatchHost before it is removed
    # from the job, so we can clean it up
    batch_node = atf.get_job_parameter(job_id, "BatchHost")
    node_list = atf.node_range_to_list(atf.get_job_parameter(job_id, "NodeList"))
    node = next((n for n in node_list if n != batch_node), None)
    if not node:
        pytest.fail(
            "Unable to find a node different from BatchHost, this shouldn't happen"
        )

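    # Hand the job id, the chosen non-BatchHost node, and the no_kill flag to the test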
    yield job_id, node, no_kill

    # Return the node to the idle state
    atf.cancel_jobs([job_id])
    atf.run_command(f"scontrol update nodename={node} state=resume", user=slurm_user)


@pytest.mark.parametrize("no_kill_job", [True, False], indirect=True)
def test_no_kill(no_kill_job):
    """Verify a job with --no-kill survives a node failure, while one without it is requeued"""

    job_id, node, no_kill = no_kill_job

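    # Record the job's initial allocation, then bring the non-BatchHost node down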
    job_nodes = atf.node_range_to_list(atf.get_job_parameter(job_id, "NodeList"))
    atf.run_command(
        f"scontrol update nodename={node} state=down reason=test_nokill",
        user=slurm_user,
    )
    atf.wait_for_node_state(node, "DOWN", fatal=True)
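    # Re-read the job's node list now that one of its nodes is DOWN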
    job_nodes_post_node_down = atf.node_range_to_list(
        atf.get_job_parameter(job_id, "NodeList")
    )

    if no_kill:
        # With --no-kill: the job should keep running without the down node
        assert (
            len(job_nodes_post_node_down) == len(job_nodes) - 1
        ), "The job with --no-kill should lose only the node that was brought down"
        assert (
            atf.get_job_parameter(job_id, "JobState") == "RUNNING"
        ), "The job should keep running with --no-kill set"
        assert (
            atf.get_job_parameter(job_id, "Restarts") == 0
        ), "The job should not be restarted/requeued with --no-kill set"
    else:
        # Without --no-kill: the job should be requeued
        assert (
            atf.get_job_parameter(job_id, "JobState") != "RUNNING"
        ), "The job should not keep running without --no-kill set"

        # Wait until the initial job is completed and restarted/requeued
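        # A requeued job typically pends with Reason=BeginTime until its new begin time passes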
        requeued = atf.repeat_until(
            lambda: atf.get_jobs()[job_id],
            lambda job: job["JobState"] == "PENDING"
            and job["Restarts"] == 1
            and job["Reason"] == "BeginTime",
        )
        assert (
            requeued
        ), f"Job should be requeued when run without --no-kill, but got: {atf.get_jobs()[job_id]}"