Merge branch 't21444-Stop_inval_read_arb_dist' into 'master'
See merge request SchedMD/dev/slurm!266
diff --git a/src/common/job_record.c b/src/common/job_record.c
index 5e24654..dc28d44 100644
--- a/src/common/job_record.c
+++ b/src/common/job_record.c
@@ -1371,7 +1371,8 @@
return slurm_sort_int_list_asc(&node1->node_index, &node2->node_index);
}
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr)
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+ uint16_t protocol_version)
{
uint16_t *arbitrary_tasks_np = NULL;
int rc = SLURM_SUCCESS;
@@ -1380,6 +1381,7 @@
char *host, *prev_host = NULL;
node_inx_cnt_t *node_inx_cnts;
hostlist_t *hl = hostlist_create(job_ptr->details->req_nodes);
+ int num_names = hostlist_count(hl);
hostlist_sort(hl);
arbitrary_tasks_np = xcalloc(num_nodes, sizeof(uint16_t));
@@ -1417,6 +1419,33 @@
goto cleanup;
}
+ if (num_names != job_ptr->details->num_tasks) {
+ error("Task count (%u) for %pJ is not equal to the number of nodes in the requested arbitrary node list (%s)",
+ job_ptr->details->num_tasks, job_ptr,
+ job_ptr->details->req_nodes);
+
+ /* Reject arbitrary jobs with bad task counts in 25.11+. */
+ if (protocol_version >= SLURM_25_11_PROTOCOL_VERSION) {
+ free(prev_host);
+ rc = ESLURM_BAD_TASK_COUNT;
+ goto cleanup;
+ }
+
+ /*
+ * Allow jobs from older versions to be loaded from saved state
+ * so that they are not all lost during an upgrade. Reallocate
+ * the arbitrary_tasks_np array to prevent an invalid read when
+ * the number of tasks > num_names. This workaround, along with
+ * the protocol_version parameter to this function, can be
+ * removed once upgrading from SLURM_25_05_PROTOCOL_VERSION is
+ * no longer supported.
+ */
+ if (num_names < job_ptr->details->num_tasks)
+ xrecalloc(arbitrary_tasks_np,
+ job_ptr->details->num_tasks,
+ sizeof(arbitrary_tasks_np[0]));
+ }
+
node_inx_cnts[cur_node].node_index = node_name_get_inx(prev_host);
free(prev_host);
@@ -1846,7 +1875,8 @@
job_ptr->details->work_dir = work_dir;
if (((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
- SLURM_DIST_ARBITRARY) && job_record_calc_arbitrary_tpn(job_ptr))
+ SLURM_DIST_ARBITRARY) &&
+ job_record_calc_arbitrary_tpn(job_ptr, job_ptr->start_protocol_ver))
return SLURM_ERROR;
return SLURM_SUCCESS;
diff --git a/src/common/job_record.h b/src/common/job_record.h
index 52b2bea..3db8030 100644
--- a/src/common/job_record.h
+++ b/src/common/job_record.h
@@ -705,7 +705,8 @@
extern int load_step_state(job_record_t *job_ptr, buf_t *buffer,
uint16_t protocol_version);
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr);
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+ uint16_t protocol_version);
extern void job_record_pack_details_common(
job_details_t *detail_ptr, buf_t *buffer, uint16_t protocol_version);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e4dcffb..26866b7 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -8645,12 +8645,14 @@
detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
detail_ptr->x11_target = xstrdup(job_desc->x11_target);
detail_ptr->x11_target_port = job_desc->x11_target_port;
+ if (job_desc->num_tasks != NO_VAL)
+ detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->req_nodes) {
if ((job_desc->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
detail_ptr->req_nodes = xstrdup(job_desc->req_nodes);
- if ((error_code =
- job_record_calc_arbitrary_tpn(job_ptr)))
+ if ((error_code = job_record_calc_arbitrary_tpn(
+ job_ptr, SLURM_PROTOCOL_VERSION)))
return error_code;
} else {
detail_ptr->req_nodes =
@@ -8716,8 +8718,6 @@
detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
if (job_desc->overcommit != NO_VAL8)
detail_ptr->overcommit = job_desc->overcommit;
- if (job_desc->num_tasks != NO_VAL)
- detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->ntasks_per_node != NO_VAL16) {
detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
if ((detail_ptr->overcommit == 0) &&