Prevent invalid read if task count exceeds arbitrary node list size

If the job is using arbitrary distribution and the number of node names in
the arbitrary node list is less than the task count, there will be an
invalid memory read in _at_tpn_limit() when laying out the tasks. The task
count needs to equal the number of node names in the node list. If it does
not, return an ESLURM_BAD_TASK_COUNT error.
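
For illustration, a minimal self-contained sketch of the failure mode
(hypothetical names, not the actual _at_tpn_limit() code): the
tasks-per-node array has one slot per node name, but task layout can
index it once per task, so a task count larger than the name count
reads past the end of the allocation.

    #include <stdint.h>

    /* tpn has num_names entries, one per name in the node list */
    static uint32_t sum_tpn(const uint16_t *tpn, int num_names,
                            uint32_t num_tasks)
    {
            uint32_t placed = 0;

            /* Reads past tpn[num_names - 1] whenever
             * num_tasks > num_names: the invalid read fixed here. */
            for (uint32_t t = 0; t < num_tasks; t++)
                    placed += tpn[t];
            return placed;
    }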

This also has to avoid losing all jobs when upgrading to 25.11 if any job
would hit this error. To that end, jobs loaded from versions 25.05 and
lower are still allowed regardless of their task count. However,
arbitrary_tasks_np is reallocated to prevent the invalid read from
happening.
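
Roughly why the reallocation is enough: growing the array zero-fills
the added tail, so any per-task index below num_tasks now reads 0
instead of stray heap memory. A simplified stand-in for Slurm's
xrecalloc() (the real macro tracks the old size itself; old_cnt is a
parameter here only to keep the sketch self-contained):

    #include <stdlib.h>
    #include <string.h>

    static void *recalloc_sketch(void *ptr, size_t old_cnt,
                                 size_t new_cnt, size_t size)
    {
            char *p = realloc(ptr, new_cnt * size);

            /* zero the new slots so reads of them return 0 */
            if (p && (new_cnt > old_cnt))
                    memset(p + old_cnt * size, 0,
                           (new_cnt - old_cnt) * size);
            return p;
    }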

Changelog: slurmctld - Prevent an invalid read and a possible crash by
 rejecting any arbitrary distribution job that does not specify a task
 count equal to the number of node names in its node list. This does not
 affect srun, salloc, or sbatch when -n is not used, since they set a
 default task count.
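
As a hypothetical sketch of that client-side default (not the actual
srun/salloc/sbatch code): with -n absent, deriving one task per name
in the node list guarantees the task count matches the name count, so
such submissions pass the new check.

    /* Hypothetical: count names in a comma-separated node list so
     * the default task count is one task per listed name. */
    static int default_ntasks(const char *nodelist)
    {
            int names = 1;

            for (const char *p = nodelist; *p; p++)
                    if (*p == ',')
                            names++;
            return names;
    }

For example, default_ntasks("n1,n1,n2") returns 3, matching the three
listed names.
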
Ticket: 21444
diff --git a/src/common/job_record.c b/src/common/job_record.c
index 5e24654..dc28d44 100644
--- a/src/common/job_record.c
+++ b/src/common/job_record.c
@@ -1371,7 +1371,8 @@
 	return slurm_sort_int_list_asc(&node1->node_index, &node2->node_index);
 }
 
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr)
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+					 uint16_t protocol_version)
 {
 	uint16_t *arbitrary_tasks_np = NULL;
 	int rc = SLURM_SUCCESS;
@@ -1380,6 +1381,7 @@
 	char *host, *prev_host = NULL;
 	node_inx_cnt_t *node_inx_cnts;
 	hostlist_t *hl = hostlist_create(job_ptr->details->req_nodes);
+	int num_names = hostlist_count(hl);
 	hostlist_sort(hl);
 
 	arbitrary_tasks_np = xcalloc(num_nodes, sizeof(uint16_t));
@@ -1417,6 +1419,33 @@
 		goto cleanup;
 	}
 
+	if (num_names != job_ptr->details->num_tasks) {
+		error("Task count (%u) for %pJ is not equal to the number of nodes in the requested arbitrary node list (%s)",
+		      job_ptr->details->num_tasks, job_ptr,
+		      job_ptr->details->req_nodes);
+
+		/* Reject arbitrary jobs with bad task counts in 25.11+. */
+		if (protocol_version >= SLURM_25_11_PROTOCOL_VERSION) {
+			free(prev_host);
+			rc = ESLURM_BAD_TASK_COUNT;
+			goto cleanup;
+		}
+
+		/*
+		 * Allow existing older version jobs from save state to be
+		 * loaded. This prevents all jobs being lost during an
+		 * upgrade. Reallocate the arbitrary_tasks_np array to prevent
+		 * an invalid read if the number of tasks > num_names. This
+		 * can be removed once upgrades from
+		 * SLURM_25_05_PROTOCOL_VERSION are no longer supported, along
+		 * with the protocol_version parameter to this function.
+		 */
+		if (num_names < job_ptr->details->num_tasks)
+			xrecalloc(arbitrary_tasks_np,
+				  job_ptr->details->num_tasks,
+				  sizeof(arbitrary_tasks_np[0]));
+	}
+
 	node_inx_cnts[cur_node].node_index = node_name_get_inx(prev_host);
 	free(prev_host);
 
@@ -1846,7 +1875,8 @@
 	job_ptr->details->work_dir = work_dir;
 
 	if (((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
-	     SLURM_DIST_ARBITRARY) && job_record_calc_arbitrary_tpn(job_ptr))
+	     SLURM_DIST_ARBITRARY) &&
+	    job_record_calc_arbitrary_tpn(job_ptr, job_ptr->start_protocol_ver))
 		return SLURM_ERROR;
 
 	return SLURM_SUCCESS;
diff --git a/src/common/job_record.h b/src/common/job_record.h
index 52b2bea..3db8030 100644
--- a/src/common/job_record.h
+++ b/src/common/job_record.h
@@ -705,7 +705,8 @@
 extern int load_step_state(job_record_t *job_ptr, buf_t *buffer,
 			   uint16_t protocol_version);
 
-extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr);
+extern int job_record_calc_arbitrary_tpn(job_record_t *job_ptr,
+					 uint16_t protocol_version);
 
 extern void job_record_pack_details_common(
 	job_details_t *detail_ptr, buf_t *buffer, uint16_t protocol_version);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e4dcffb..26866b7 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -8645,12 +8645,14 @@
 	detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
 	detail_ptr->x11_target = xstrdup(job_desc->x11_target);
 	detail_ptr->x11_target_port = job_desc->x11_target_port;
+	if (job_desc->num_tasks != NO_VAL)
+		detail_ptr->num_tasks = job_desc->num_tasks;
 	if (job_desc->req_nodes) {
 		if ((job_desc->task_dist & SLURM_DIST_STATE_BASE) ==
 		    SLURM_DIST_ARBITRARY) {
 			detail_ptr->req_nodes = xstrdup(job_desc->req_nodes);
-			if ((error_code =
-			     job_record_calc_arbitrary_tpn(job_ptr)))
+			if ((error_code = job_record_calc_arbitrary_tpn(
+				job_ptr, SLURM_PROTOCOL_VERSION)))
 				return error_code;
 		} else {
 			detail_ptr->req_nodes =
@@ -8716,8 +8718,6 @@
 		detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
 	if (job_desc->overcommit != NO_VAL8)
 		detail_ptr->overcommit = job_desc->overcommit;
-	if (job_desc->num_tasks != NO_VAL)
-		detail_ptr->num_tasks = job_desc->num_tasks;
 	if (job_desc->ntasks_per_node != NO_VAL16) {
 		detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
 		if ((detail_ptr->overcommit == 0) &&