Merge branch 'megan/t23653-fix_double_free' into 'master'
See merge request SchedMD/dev/slurm!2068
diff --git a/src/common/job_record.c b/src/common/job_record.c
index 5e24654..dc28d44 100644
--- a/src/common/job_record.c
+++ b/src/common/job_record.c
@@ -1371,7 +1371,8 @@
return slurm_sort_int_list_asc(&node1->node_index, &node2->node_index);
}
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr)
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+ uint16_t protocol_version)
{
uint16_t *arbitrary_tasks_np = NULL;
int rc = SLURM_SUCCESS;
@@ -1380,6 +1381,7 @@
char *host, *prev_host = NULL;
node_inx_cnt_t *node_inx_cnts;
hostlist_t *hl = hostlist_create(job_ptr->details->req_nodes);
+ int num_names = hostlist_count(hl);
hostlist_sort(hl);
arbitrary_tasks_np = xcalloc(num_nodes, sizeof(uint16_t));
@@ -1417,6 +1419,33 @@
goto cleanup;
}
+ if (num_names != job_ptr->details->num_tasks) {
+ error("Task count (%u) for %pJ is not equal to the number of nodes in the requested arbitrary node list (%s)",
+ job_ptr->details->num_tasks, job_ptr,
+ job_ptr->details->req_nodes);
+
+ /* Reject arbitrary jobs with bad task counts in 25.11+. */
+ if (protocol_version >= SLURM_25_11_PROTOCOL_VERSION) {
+ free(prev_host);
+ rc = ESLURM_BAD_TASK_COUNT;
+ goto cleanup;
+ }
+
+ /*
+ * Allow existing jobs from an older version's save state to be
+ * loaded, so that all jobs are not lost during an upgrade.
+ * Reallocate the arbitrary_tasks_np array to prevent an invalid
+ * read if the number of tasks > num_names. This can be removed
+ * once upgrading from SLURM_25_05_PROTOCOL_VERSION is no longer
+ * supported, at which point the protocol_version parameter to
+ * this function can also be removed.
+ */
+ if (num_names < job_ptr->details->num_tasks)
+ xrecalloc(arbitrary_tasks_np,
+ job_ptr->details->num_tasks,
+ sizeof(arbitrary_tasks_np[0]));
+ }
+
node_inx_cnts[cur_node].node_index = node_name_get_inx(prev_host);
free(prev_host);
@@ -1846,7 +1875,8 @@
job_ptr->details->work_dir = work_dir;
if (((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
- SLURM_DIST_ARBITRARY) && job_record_calc_arbitrary_tpn(job_ptr))
+ SLURM_DIST_ARBITRARY) &&
+ job_record_calc_arbitrary_tpn(job_ptr, job_ptr->start_protocol_ver))
return SLURM_ERROR;
return SLURM_SUCCESS;
diff --git a/src/common/job_record.h b/src/common/job_record.h
index 52b2bea..3db8030 100644
--- a/src/common/job_record.h
+++ b/src/common/job_record.h
@@ -705,7 +705,8 @@
extern int load_step_state(job_record_t *job_ptr, buf_t *buffer,
uint16_t protocol_version);
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr);
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+ uint16_t protocol_version);
extern void job_record_pack_details_common(
job_details_t *detail_ptr, buf_t *buffer, uint16_t protocol_version);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e4dcffb..26866b7 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -8645,12 +8645,14 @@
detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
detail_ptr->x11_target = xstrdup(job_desc->x11_target);
detail_ptr->x11_target_port = job_desc->x11_target_port;
+ if (job_desc->num_tasks != NO_VAL)
+ detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->req_nodes) {
if ((job_desc->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
detail_ptr->req_nodes = xstrdup(job_desc->req_nodes);
- if ((error_code =
- job_record_calc_arbitrary_tpn(job_ptr)))
+ if ((error_code = job_record_calc_arbitrary_tpn(
+ job_ptr, SLURM_PROTOCOL_VERSION)))
return error_code;
} else {
detail_ptr->req_nodes =
@@ -8716,8 +8718,6 @@
detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
if (job_desc->overcommit != NO_VAL8)
detail_ptr->overcommit = job_desc->overcommit;
- if (job_desc->num_tasks != NO_VAL)
- detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->ntasks_per_node != NO_VAL16) {
detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
if ((detail_ptr->overcommit == 0) &&
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index ebc03a7..62d8202 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -2101,12 +2101,14 @@
msg->flags |= SLURM_NO_AUTH_CRED;
slurm_send_rc_msg(msg, SLURM_PROTOCOL_AUTHENTICATION_ERROR);
slurm_free_msg(msg);
+ conmgr_queue_close_fd(con);
return SLURM_SUCCESS;
} else if (unpack_rc) {
error("%s: [%s] rejecting malformed RPC and closing connection: %s",
__func__, conmgr_fd_get_name(con),
slurm_strerror(unpack_rc));
slurm_free_msg(msg);
+ conmgr_queue_close_fd(con);
return unpack_rc;
}
diff --git a/testsuite/README b/testsuite/README
index c11087e..d353706 100644
--- a/testsuite/README
+++ b/testsuite/README
@@ -45,6 +45,11 @@
SlurmInstallDir: Slurm Installation Directory (value of --prefix configure option)
SlurmConfigDir: Slurm Configuration Directory (compiled-in location of slurm.conf)
+If you are running in local-config and the user running the tests is SlurmUser or
+has AdminLevel, you should specify an unprivileged user to run the test commands as:
+
+SlurmTestUser: Linux user without AdminLevel in the local-config Slurm.
+
Tips:
* the include and exclude filters use (perl 5) regular expressions
diff --git a/testsuite/python/conftest.py b/testsuite/python/conftest.py
index e6c929a..dab3d0c 100644
--- a/testsuite/python/conftest.py
+++ b/testsuite/python/conftest.py
@@ -14,7 +14,7 @@
# import shutil
import sys
-# from pathlib import Path
+from pathlib import Path
sys.path.append(sys.path[0] + "/lib")
import atf
@@ -125,7 +125,7 @@
color_log_level(logging.TRACE, purple=True, bold=True)
-def update_tmp_path_exec_permissions():
+def update_tmp_path_exec_permissions(path):
"""
For pytest versions 6+ the tmp path it uses no longer has
public exec permissions for dynamically created directories by default.
@@ -140,15 +140,22 @@
Bug 16568
"""
- user_name = atf.get_user_name()
- path = f"/tmp/pytest-of-{user_name}"
-
if os.path.isdir(path):
os.chmod(path, 0o777)
for root, dirs, files in os.walk(path):
for d in dirs:
os.chmod(os.path.join(root, d), 0o777)
+ # Ensure access for parent dirs too
+ path = path.resolve()
+ if not path.is_relative_to("/tmp"):
+ pytest.fail(f"Unexpected tmp path outside /tmp: {path}")
+
+ subdir = Path("/tmp")
+ for part in path.relative_to("/tmp").parts:
+ subdir = subdir / part
+ os.chmod(subdir, 0o777)
+
@pytest.fixture(scope="module", autouse=True)
def module_setup(request, tmp_path_factory):
@@ -179,7 +186,7 @@
name = name[:30]
atf.properties["test_name"] = name
atf.module_tmp_path = tmp_path_factory.mktemp(name, numbered=True)
- update_tmp_path_exec_permissions()
+ update_tmp_path_exec_permissions(atf.module_tmp_path)
# Module-level fixtures should run from within the module_tmp_path
os.chdir(atf.module_tmp_path)
@@ -318,6 +325,7 @@
logging.info(request.function.__doc__)
# Start each test inside the tmp_path
+ update_tmp_path_exec_permissions(tmp_path)
monkeypatch.chdir(tmp_path)
diff --git a/testsuite/python/lib/atf.py b/testsuite/python/lib/atf.py
index 95ba0a2..99bcc0e 100644
--- a/testsuite/python/lib/atf.py
+++ b/testsuite/python/lib/atf.py
@@ -182,6 +182,10 @@
if env_vars is not None:
command = env_vars.strip() + " " + command
+ # If user is not specified but test-user is set, then set user to test-user
+ if not user and properties["test-user-set"]:
+ user = properties["test-user"]
+
start_time = time.time()
invocation_message = "Running command"
if user is not None:
@@ -189,7 +193,7 @@
invocation_message += f": {command}"
logging.log(log_command_level, invocation_message)
try:
- if user is not None and user != properties["test-user"]:
+ if user is not None:
if not properties["sudo-rights"]:
pytest.skip(
"This test requires the test user to have unprompted sudo rights",
@@ -1278,7 +1282,9 @@
slurm_prefix = f"{properties['slurm-sbin-dir']}/.."
version_str = (
- run_command_output(f"sudo {slurm_prefix}/{component} -V", quiet=True)
+ run_command_output(
+ f"{slurm_prefix}/{component} -V", quiet=True, user="root"
+ )
.strip()
.replace("slurm ", "")
)
@@ -2481,7 +2487,7 @@
False
"""
- user_name = get_user_name()
+ user_name = properties["test-user"]
run_command(f"scancel -u {user_name}", fatal=fatal, quiet=quiet)
@@ -4125,7 +4131,9 @@
augmentation_dict[parameter_name] = parameter_value
elif parameter_name == "Features":
required_features = set(parameter_value.split(","))
- node_features = set(lower_node_dict.get("features", "").split(","))
+ features = lower_node_dict.get("features", [])
+ features = features[0] if features else ""
+ node_features = set(features.split(","))
if not required_features.issubset(node_features):
if node_qualifies:
node_qualifies = False
@@ -4224,7 +4232,16 @@
with open(script_name, "w") as f:
f.write("#!/bin/bash\n")
f.write(script_contents)
- os.chmod(script_name, 0o0700)
+
+ run_command(f"chmod 777 {script_name}", user="root", fatal=True, quiet=True)
+
+ if properties["test-user-set"]:
+ run_command(
+ f"chown {properties['test-user']} {script_name}",
+ user="root",
+ fatal=True,
+ quiet=True,
+ )
def wait_for_file(file_name, **repeat_until_kwargs):
@@ -4883,7 +4900,7 @@
if match := re.search(r"^\s*(?i:SlurmUser)\s*=\s*(.*)$", line):
properties["slurm-user"] = match.group(1)
else:
- # slurm.conf is not readable as test-user. We will try reading it as root
+ # slurm.conf is not readable, so we will try reading it as root
results = run_command(
f"grep -i SlurmUser {slurm_config_file}", user="root", quiet=True
)
@@ -4894,7 +4911,16 @@
properties["slurm-user"] = match.group(1)
properties["submitted-jobs"] = []
-properties["test-user"] = pwd.getpwuid(os.getuid()).pw_name
+if "slurmtestuser" in testsuite_config:
+ properties["test-user"] = testsuite_config["slurmtestuser"]
+ properties["test-user-set"] = True
+else:
+ properties["test-user"] = pwd.getpwuid(os.getuid()).pw_name
+ properties["test-user-set"] = False
+
+properties["test-user-uid"] = pwd.getpwnam(properties["test-user"]).pw_uid
+properties["test-user-gid"] = pwd.getpwnam(properties["test-user"]).pw_gid
+
properties["auto-config"] = False
properties["allow-slurmdbd-modify"] = False
properties["slurmrestd-started"] = False
diff --git a/testsuite/python/tests/test_100_2.py b/testsuite/python/tests/test_100_2.py
index 5179193..456c3c4 100644
--- a/testsuite/python/tests/test_100_2.py
+++ b/testsuite/python/tests/test_100_2.py
@@ -281,7 +281,6 @@
"-N1 --qos=qos1",
"false",
fatal=True,
- user=atf.properties["test-user"],
)
)
jobs.append(atf.submit_job("sbatch", "-N2 --qos=qos2", "false", fatal=True))
@@ -295,7 +294,6 @@
"-N1 --qos=qos1",
"sleep 300",
fatal=True,
- user=atf.properties["test-user"],
)
)
jobs.append(atf.submit_job("sbatch", "-N2 --qos=qos2", "sleep 300", fatal=True))
diff --git a/testsuite/python/tests/test_102_1.py b/testsuite/python/tests/test_102_1.py
index 73647d5..bfc2aea 100644
--- a/testsuite/python/tests/test_102_1.py
+++ b/testsuite/python/tests/test_102_1.py
@@ -54,6 +54,15 @@
)
+@pytest.fixture(scope="module", autouse=True)
+def clean_clusters():
+ yield
+ atf.run_command(
+ f"sacctmgr -i delete cluster {cluster_string}",
+ user=atf.properties["slurm-user"],
+ )
+
+
@pytest.fixture(scope="function")
def no_federations():
atf.run_command(
diff --git a/testsuite/python/tests/test_105_5.py b/testsuite/python/tests/test_105_5.py
index 9ba1ccc..ad39540 100644
--- a/testsuite/python/tests/test_105_5.py
+++ b/testsuite/python/tests/test_105_5.py
@@ -8,7 +8,7 @@
import pytest
import re
-user_name = atf.get_user_name()
+user_name = atf.properties["test-user"]
@pytest.fixture(scope="module", autouse=True)
diff --git a/testsuite/python/tests/test_115_1.py b/testsuite/python/tests/test_115_1.py
index 9a3d37b..c0cbb10 100644
--- a/testsuite/python/tests/test_115_1.py
+++ b/testsuite/python/tests/test_115_1.py
@@ -201,10 +201,10 @@
# Verify they were all populated with an id
for user in user1, user2:
if user not in user_account_id:
- atf.log_die(f"Account association for {user} was not created")
+ pytest.fail(f"Account association for {user} was not created")
for account in account1, account2, account3:
if account not in user_account_id[user]:
- atf.log_die(f"Association for {user} and {account} was not created")
+ pytest.fail(f"Association for {user} and {account} was not created")
# Populate user_wckey_id dictionary with user-wckey association ids
output = atf.run_command_output(
@@ -221,9 +221,9 @@
# Verify they were all populated with an id
for user in user1, user2:
if user not in user_wckey_id:
- atf.log_die(f"WCKey association for {user} was not created")
+ pytest.fail(f"WCKey association for {user} was not created")
if wckey1 not in user_wckey_id[user]:
- atf.log_die(f"Association for {user} and {wckey1} was not created")
+ pytest.fail(f"Association for {user} and {wckey1} was not created")
@pytest.fixture(scope="module")
@@ -284,17 +284,17 @@
rf"{cluster}\|{account1}\|{user_account_id[user1][account1]}\|{wckey1}\|{user_wckey_id[user1][wckey1]}\|{job1_start_string}\|{job1_end_string}\|{job1_duration_string}",
output,
):
- atf.log_die("The job accounting data was not loaded correctly for job1")
+ pytest.fail("The job accounting data was not loaded correctly for job1")
if not re.search(
rf"{cluster}\|{account3}\|{user_account_id[user2][account3]}\|{wckey1}\|{user_wckey_id[user2][wckey1]}\|{job2_start_string}\|{job2_end_string}\|{job2_duration_string}",
output,
):
- atf.log_die("The job accounting data was not loaded correctly for job2")
+ pytest.fail("The job accounting data was not loaded correctly for job2")
if not re.search(
rf"{cluster}\|{account2}\|{user_account_id[user1][account2]}\|{wckey1}\|{user_wckey_id[user1][wckey1]}\|{job3_start_string}\|{job3_end_string}\|{job3_duration_string}",
output,
):
- atf.log_die("The job accounting data was not loaded correctly for job3")
+ pytest.fail("The job accounting data was not loaded correctly for job3")
# Use sacctmgr to see if the node event loaded
output = atf.run_command_output(
@@ -306,14 +306,14 @@
rf"{cluster}\|\|{period_start_string}\|{period_end_string}\|{cluster_cpus}",
output,
):
- atf.log_die(
+ pytest.fail(
"The event accounting data was not loaded correctly for the cluster"
)
if not re.search(
rf"{cluster}\|{node0}\|{node0_start_string}\|{node0_end_string}\|{node0_cpus}",
output,
):
- atf.log_die("The event accounting data was not loaded correctly for node0")
+ pytest.fail("The event accounting data was not loaded correctly for node0")
# Use sacctmgr to roll up the time period
atf.run_command_output(
diff --git a/testsuite/python/tests/test_116_12.py b/testsuite/python/tests/test_116_12.py
index 3014cd0..619a207 100644
--- a/testsuite/python/tests/test_116_12.py
+++ b/testsuite/python/tests/test_116_12.py
@@ -126,7 +126,7 @@
fpc.remove_file(file_err)
# Test %u puts the user name in the file name
- user_name = atf.get_user_name()
+ user_name = atf.properties["test-user"]
file_out = fpc.create_file_path("u")
atf.run_job(f"--output={file_out} -N1 -O id")
file_out = fpc.get_tmp_file()
@@ -175,7 +175,6 @@
srun -O --output={file_out} true
done""",
)
- os.chmod(file_in, 0o0777)
job_id = atf.submit_job_sbatch(f"-N{node_count} --output /dev/null {str(file_in)}")
atf.wait_for_job_state(job_id, "DONE")
tmp_dir_list = os.listdir(tmp_path)
@@ -192,7 +191,6 @@
srun -O --error={file_err} true
done""",
)
- os.chmod(file_in, 0o0777)
job_id = atf.submit_job_sbatch(f"-N{node_count} --output /dev/null {str(file_in)}")
atf.wait_for_job_state(job_id, "DONE")
tmp_dir_list = os.listdir(tmp_path)
diff --git a/testsuite/python/tests/test_116_44.py b/testsuite/python/tests/test_116_44.py
index c58d8ff..8c5e007 100644
--- a/testsuite/python/tests/test_116_44.py
+++ b/testsuite/python/tests/test_116_44.py
@@ -25,7 +25,7 @@
num_tasks = 4
node_count = 1
- my_uid = os.getuid()
+ test_uid = atf.properties["test-user-uid"]
atf.make_bash_script(
task_prolog,
@@ -65,10 +65,10 @@
atf.wait_for_file(file_out_pre)
output = atf.run_command_output(f"cat {file_out_pre}")
- match_uid = re.findall(rf"uid={my_uid}", output)
+ match_uid = re.findall(rf"uid={test_uid}", output)
assert len(match_uid) == num_tasks, "Task prolog output is missing or uid mismatch"
atf.wait_for_file(file_out_post)
output = atf.run_command_output(f"cat {file_out_post}")
- match_uid = re.findall(rf"uid={my_uid}", output)
+ match_uid = re.findall(rf"uid={test_uid}", output)
assert len(match_uid) == num_tasks, "Task epilog output is missing or uid mismatch"
diff --git a/testsuite/python/tests/test_116_46.py b/testsuite/python/tests/test_116_46.py
index 98d23ea..058507d 100644
--- a/testsuite/python/tests/test_116_46.py
+++ b/testsuite/python/tests/test_116_46.py
@@ -2,7 +2,6 @@
# Copyright (C) SchedMD LLC.
############################################################################
import atf
-import os
import pytest
import re
@@ -31,7 +30,6 @@
sleep 20
""",
)
- os.chmod(file_in, 0o0755)
run_error = atf.run_command_error(
f"srun -n{task_count} -N{node_count} -O -W2 {file_in}"
)
diff --git a/testsuite/python/tests/test_116_47.py b/testsuite/python/tests/test_116_47.py
index 13c100f..22a0a11 100644
--- a/testsuite/python/tests/test_116_47.py
+++ b/testsuite/python/tests/test_116_47.py
@@ -2,7 +2,6 @@
# Copyright (C) SchedMD LLC.
############################################################################
import atf
-import os
import pytest
import re
@@ -21,7 +20,7 @@
# Verify that a job executes as the appropriate user and group
def test_uid(srun_output):
"""Verify that a job executes as the invoking user"""
- test_uid = os.geteuid()
+ test_uid = atf.properties["test-user-uid"]
job_uid = (
int(match.group(1)) if (match := re.search(r"uid=(\d+)", srun_output)) else None
)
@@ -30,7 +29,7 @@
def test_gid(srun_output):
"""Verify that a job executes as the invoking group"""
- test_gid = os.getegid()
+ test_gid = atf.properties["test-user-gid"]
job_gid = (
int(match.group(1)) if (match := re.search(r"gid=(\d+)", srun_output)) else None
)
diff --git a/testsuite/python/tests/test_123_4.py b/testsuite/python/tests/test_123_4.py
index c1e7b45..75e9036 100644
--- a/testsuite/python/tests/test_123_4.py
+++ b/testsuite/python/tests/test_123_4.py
@@ -32,10 +32,8 @@
result = atf.run_command(create_res, user=atf.properties["slurm-user"])
assert result["exit_code"] == 0, "Couldn't create the reservation!"
- # Try to run a job as atf
- result = atf.run_command(
- f"srun -N1 --reservation={res_name} true", user=atf.properties["test-user"]
- )
+ # Try to run a job as test-user
+ result = atf.run_command(f"srun -N1 --reservation={res_name} true")
assert result["exit_code"] != 0, "The job should have been denied for user!"
assert (
"Access denied" in result["stderr"]
diff --git a/testsuite/python/tests/test_123_5.py b/testsuite/python/tests/test_123_5.py
index 9b20de8..adddcae 100644
--- a/testsuite/python/tests/test_123_5.py
+++ b/testsuite/python/tests/test_123_5.py
@@ -86,7 +86,6 @@
# Try to run a job as in the wrong QOS
result = atf.run_command(
f"srun -N1 --reservation={res_name} --account={acct1} --qos=normal true",
- user=atf.properties["test-user"],
)
assert (
result["exit_code"] != 0
@@ -98,7 +97,6 @@
# Try to run a job as in the correct QOS
result = atf.run_command(
f"srun -N1 --reservation={res_name} --account={acct1} --qos=normal,{qos1} true",
- user=atf.properties["test-user"],
)
assert (
result["exit_code"] == 0
diff --git a/testsuite/python/tests/test_123_6.py b/testsuite/python/tests/test_123_6.py
index 1f940c8..2f52140 100644
--- a/testsuite/python/tests/test_123_6.py
+++ b/testsuite/python/tests/test_123_6.py
@@ -80,7 +80,6 @@
# Try to run a job as in the wrong partition
result = atf.run_command(
f"srun -N1 --reservation={res_name} --partition={partitions[1]} true",
- user=atf.properties["test-user"],
)
assert (
result["exit_code"] != 0
@@ -92,7 +91,6 @@
# Try to run a job that can use either partition
result = atf.run_command(
f"srun -N1 --reservation={res_name} --partition={partitions[1]},{partitions[0]} true",
- user=atf.properties["test-user"],
)
assert (
result["exit_code"] == 0