|  | /*****************************************************************************\ | 
|  | *  multi_prog.c - executing program according to task rank | 
|  | *                 set MPIR_PROCDESC accordingly | 
|  | * | 
|  | *  NOTE: The logic could be eliminated if slurmstepd kept track of the | 
|  | *  executable name for each task and returned that information in a new | 
|  | *  launch response message (with multiple executable names). | 
|  | ***************************************************************************** | 
|  | *  Produced at National University of Defense Technology (China) | 
|  | *  Written by Hongjia Cao <hjcao@nudt.edu.cn> | 
|  | *  and | 
|  | *  Lawrence Livermore National Laboratory (cf, DISCLAIMER). | 
|  | *  Written by Morris Jette <jette1@llnl.gov>. | 
|  | *  CODE-OCEC-09-009. All rights reserved. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include "config.h" | 
|  |  | 
|  | #include <ctype.h> | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  | #include <sys/stat.h> | 
|  | #include <sys/types.h> | 
|  | #include <unistd.h> | 
|  |  | 
|  | #include "src/common/bitstring.h" | 
|  | #include "src/common/log.h" | 
|  | #include "src/common/xassert.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  | #include "src/common/proc_args.h" | 
|  |  | 
|  | #include "debugger.h" | 
|  | #include "multi_prog.h" | 
|  | #include "opt.h" | 
|  |  | 
|  | static void | 
|  | _set_range(int low_num, int high_num, char *exec_name, bool ignore_duplicates) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | for (i = low_num; i <= high_num; i++) { | 
|  | MPIR_PROCDESC *tv; | 
|  | tv = &MPIR_proctable[i]; | 
|  | if (tv->executable_name == NULL) { | 
|  | tv->executable_name = xstrdup(exec_name); | 
|  | } else if (!ignore_duplicates) { | 
|  | error("duplicate configuration for task %d ignored", | 
|  | i); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void _set_exec_names(char *ranks, char *exec_name, int ntasks) | 
|  | { | 
|  | char *ptrptr = NULL; | 
|  | int low_num, high_num, num, i; | 
|  |  | 
|  | if ((ranks[0] == '*') && (ranks[1] == '\0')) { | 
|  | low_num = 0; | 
|  | high_num = ntasks - 1; | 
|  | _set_range(low_num, high_num, exec_name, true); | 
|  | return; | 
|  | } | 
|  |  | 
|  | ptrptr = ranks; | 
|  | for (i=0; i<ntasks; i++) { | 
|  | if (!isdigit(ptrptr[0])) | 
|  | goto invalid; | 
|  |  | 
|  | num = strtol(ptrptr, &ptrptr, 10); | 
|  |  | 
|  | if ((ptrptr[0] == ',') || (ptrptr[0] == '\0')) { | 
|  | low_num = MAX(0, num); | 
|  | high_num = MIN((ntasks-1), num); | 
|  | _set_range(low_num, high_num, exec_name, false); | 
|  | } else if (ptrptr[0] == '-') { | 
|  | low_num = MAX(0, num); | 
|  | num = strtol(ptrptr+1, &ptrptr, 10); | 
|  | if ((ptrptr[0] != ',') && (ptrptr[0] != '\0')) | 
|  | goto invalid; | 
|  | high_num = MIN((ntasks-1), num); | 
|  | _set_range(low_num, high_num, exec_name, false); | 
|  | } else | 
|  | goto invalid; | 
|  | if (ptrptr[0] == '\0') | 
|  | break; | 
|  | ptrptr++; | 
|  | } | 
|  | return; | 
|  |  | 
|  | invalid: | 
|  | error ("Invalid task range specification (%s) ignored.", ranks); | 
|  | return; | 
|  | } | 
|  |  | 
|  | extern int mpir_set_multi_name(int ntasks, const char *config_fname) | 
|  | { | 
|  | FILE *config_fd; | 
|  | char line[BUF_SIZE]; | 
|  | char *ranks, *exec_name, *p, *ptrptr; | 
|  | int line_num = 0; | 
|  | bool last_line_break = false, line_break = false; | 
|  | int line_len; | 
|  | int i; | 
|  |  | 
|  | for (i = 0; i < ntasks; i++) { | 
|  | MPIR_PROCDESC *tv; | 
|  | tv = &MPIR_proctable[i]; | 
|  | tv->executable_name = NULL; | 
|  | } | 
|  |  | 
|  | config_fd = fopen(config_fname, "r"); | 
|  | if (config_fd == NULL) { | 
|  | error("Unable to open configuration file %s", config_fname); | 
|  | return -1; | 
|  | } | 
|  | while (fgets(line, sizeof(line), config_fd)) { | 
|  | line_num ++; | 
|  | line_len = strlen(line); | 
|  | if (line_len >= (sizeof(line) - 1)) { | 
|  | error ("Line %d of configuration file %s too long", | 
|  | line_num, config_fname); | 
|  | fclose(config_fd); | 
|  | return -1; | 
|  | } | 
|  | if ((line_len > 0 && line[line_len - 1] == '\\') ||  /* EOF */ | 
|  | (line_len > 1 && line[line_len - 2] == '\\' && | 
|  | line[line_len - 1] == '\n')) | 
|  | line_break = true; | 
|  | else | 
|  | line_break = false; | 
|  |  | 
|  | if (last_line_break) { | 
|  | last_line_break = line_break; | 
|  | continue; | 
|  | } | 
|  | last_line_break = line_break; | 
|  | p = line; | 
|  | while (*p != '\0' && isspace (*p)) /* remove leading spaces */ | 
|  | p ++; | 
|  |  | 
|  | if (*p == '#') /* only whole-line comments handled */ | 
|  | continue; | 
|  |  | 
|  | if (*p == '\0') /* blank line ignored */ | 
|  | continue; | 
|  |  | 
|  | ranks = strtok_r(p, " \t\n", &ptrptr); | 
|  | exec_name = strtok_r(NULL, " \t\n", &ptrptr); | 
|  | if (!ranks || !exec_name) { | 
|  | error("Line %d of configuration file %s is invalid", | 
|  | line_num, config_fname); | 
|  | fclose(config_fd); | 
|  | return -1; | 
|  | } | 
|  | _set_exec_names(ranks, exec_name, ntasks); | 
|  | } | 
|  | fclose(config_fd); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | extern void | 
|  | mpir_init(int num_tasks) | 
|  | { | 
|  | MPIR_proctable_size = num_tasks; | 
|  | MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) * num_tasks); | 
|  | if (MPIR_proctable == NULL) { | 
|  | error("Unable to initialize MPIR_proctable: %m"); | 
|  | exit(error_exit); | 
|  | } | 
|  | } | 
|  |  | 
|  | extern void | 
|  | mpir_cleanup(void) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | for (i = 0; i < MPIR_proctable_size; i++) { | 
|  | xfree(MPIR_proctable[i].host_name); | 
|  | xfree(MPIR_proctable[i].executable_name); | 
|  | } | 
|  | xfree(MPIR_proctable); | 
|  | } | 
|  |  | 
|  | extern void mpir_set_executable_names(const char *executable_name, | 
|  | uint32_t task_offset, | 
|  | uint32_t task_count) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | if (task_offset == NO_VAL) | 
|  | task_offset = 0; | 
|  | xassert((task_offset + task_count) <= MPIR_proctable_size); | 
|  | for (i = task_offset; i < (task_offset + task_count); i++) { | 
|  | MPIR_proctable[i].executable_name = xstrdup(executable_name); | 
|  | // info("NAME[%d]:%s", i, executable_name); | 
|  | } | 
|  | } | 
|  |  | 
|  | extern void | 
|  | mpir_dump_proctable(void) | 
|  | { | 
|  | MPIR_PROCDESC *tv; | 
|  | int i; | 
|  |  | 
|  | for (i = 0; i < MPIR_proctable_size; i++) { | 
|  | tv = &MPIR_proctable[i]; | 
|  | info("task:%d, host:%s, pid:%d, executable:%s", | 
|  | i, tv->host_name, tv->pid, tv->executable_name); | 
|  | } | 
|  | } | 
|  |  | 
|  | static int | 
|  | _update_task_mask(int low_num, int high_num, slurm_opt_t *opt_local, | 
|  | bitstr_t **task_mask, bool ignore_duplicates) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | if (low_num > high_num) { | 
|  | error("Invalid task range, %d-%d", low_num, high_num); | 
|  | return -1; | 
|  | } | 
|  | if (low_num < 0) { | 
|  | error("Invalid task id, %d < 0", low_num); | 
|  | return -1; | 
|  | } | 
|  | if (high_num >= opt_local->ntasks) { | 
|  | static bool i_set_ntasks = false; | 
|  | if (opt_local->ntasks_set && !i_set_ntasks) { | 
|  | error("Invalid task id, %d >= ntasks", high_num); | 
|  | return -1; | 
|  | } else { | 
|  | opt_local->ntasks = high_num + 1; | 
|  | opt_local->ntasks_set = true; | 
|  | i_set_ntasks = true; | 
|  | bit_realloc((*task_mask), opt_local->ntasks); | 
|  | } | 
|  | } | 
|  | for (i=low_num; i<=high_num; i++) { | 
|  | if (bit_test((*task_mask), i)) { | 
|  | if (ignore_duplicates) | 
|  | continue; | 
|  | error("Duplicate record for task %d", i); | 
|  | return -1; | 
|  | } | 
|  | bit_set((*task_mask), i); | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int | 
|  | _validate_ranks(char *ranks, slurm_opt_t *opt_local, bitstr_t **task_mask) | 
|  | { | 
|  | static bool has_asterisk = false; | 
|  | char *range = NULL, *p = NULL; | 
|  | char *ptrptr = NULL, *upper = NULL; | 
|  | int low_num, high_num; | 
|  |  | 
|  | if (ranks[0] == '*' && ranks[1] == '\0') { | 
|  | low_num = 0; | 
|  | high_num = opt_local->ntasks - 1; | 
|  | opt_local->ntasks_set = true; /* do not allow to change later */ | 
|  | has_asterisk = true;	/* must be last MPMD spec line */ | 
|  | opt_local->srun_opt->multi_prog_cmds++; | 
|  | return _update_task_mask(low_num, high_num, opt_local, | 
|  | task_mask, true); | 
|  | } | 
|  |  | 
|  | for (range = strtok_r(ranks, ",", &ptrptr); range != NULL; | 
|  | range = strtok_r(NULL, ",", &ptrptr)) { | 
|  | /* | 
|  | * Non-contiguous tasks are split into multiple commands | 
|  | * in the mpmd_set so count each token separately | 
|  | */ | 
|  | opt_local->srun_opt->multi_prog_cmds++; | 
|  | p = range; | 
|  | while (*p != '\0' && isdigit (*p)) | 
|  | p ++; | 
|  |  | 
|  | if (has_asterisk) { | 
|  | error("Task range specification with asterisk must " | 
|  | "be last"); | 
|  | return -1; | 
|  | } else if (*p == '\0') { /* single rank */ | 
|  | low_num  = atoi(range); | 
|  | high_num = low_num; | 
|  | } else if (*p == '-') { /* lower-upper */ | 
|  | upper = ++ p; | 
|  | while (isdigit (*p)) | 
|  | p ++; | 
|  | if (*p != '\0') { | 
|  | error ("Invalid task range specification"); | 
|  | return -1; | 
|  | } | 
|  | low_num  = atoi(range); | 
|  | high_num = atoi(upper); | 
|  | } else { | 
|  | error ("Invalid task range specification (%s)", | 
|  | range); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | if (_update_task_mask(low_num, high_num, opt_local, | 
|  | task_mask, false)) | 
|  | return -1; | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Verify that we have a valid executable program specified for each task | 
|  | *	when the --multi-prog option is used. | 
|  | * IN config_name - MPMD configuration file name | 
|  | * IN/OUT opt_local - slurm options | 
|  | * RET 0 on success, -1 otherwise | 
|  | */ | 
|  | extern int | 
|  | verify_multi_name(char *config_fname, slurm_opt_t *opt_local) | 
|  | { | 
|  | FILE *config_fd; | 
|  | char line[BUF_SIZE]; | 
|  | char *ranks, *exec_name, *p, *ptrptr, *fullpath = NULL; | 
|  | int line_num = 0, i, rc = 0; | 
|  | bool last_line_break = false, line_break = false; | 
|  | int line_len; | 
|  | bitstr_t *task_mask; | 
|  |  | 
|  | if (opt_local->ntasks <= 0) { | 
|  | error("Invalid task count %d", opt_local->ntasks); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | opt_local->srun_opt->multi_prog_cmds = 0; | 
|  |  | 
|  | config_fd = fopen(config_fname, "r"); | 
|  | if (config_fd == NULL) { | 
|  | error("Unable to open configuration file %s", config_fname); | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | task_mask = bit_alloc(opt_local->ntasks); | 
|  | while (fgets(line, sizeof(line), config_fd)) { | 
|  | line_num++; | 
|  | line_len = strlen(line); | 
|  | if (line_len >= (sizeof(line) - 1)) { | 
|  | error ("Line %d of configuration file %s too long", | 
|  | line_num, config_fname); | 
|  | rc = -1; | 
|  | goto fini; | 
|  | } | 
|  | if ((line_len > 0 && line[line_len - 1] == '\\') ||  /* EOF */ | 
|  | (line_len > 1 && line[line_len - 2] == '\\' && | 
|  | line[line_len - 1] == '\n')) | 
|  | line_break = true; | 
|  | else | 
|  | line_break = false; | 
|  | if (last_line_break) { | 
|  | last_line_break = line_break; | 
|  | continue; | 
|  | } | 
|  | last_line_break = line_break; | 
|  | p = line; | 
|  | while (*p != '\0' && isspace (*p)) /* remove leading spaces */ | 
|  | p ++; | 
|  |  | 
|  | if (*p == '#') /* only whole-line comments handled */ | 
|  | continue; | 
|  |  | 
|  | if (*p == '\0') /* blank line ignored */ | 
|  | continue; | 
|  |  | 
|  | ranks = strtok_r(p, " \t\n", &ptrptr); | 
|  | exec_name = strtok_r(NULL, " \t\n", &ptrptr); | 
|  | if (!ranks || !exec_name) { | 
|  | error("Line %d of configuration file %s invalid", | 
|  | line_num, config_fname); | 
|  | rc = -1; | 
|  | goto fini; | 
|  | } | 
|  | if (_validate_ranks(ranks, opt_local, &task_mask)) { | 
|  | error("Line %d of configuration file %s invalid", | 
|  | line_num, config_fname); | 
|  | rc = -1; | 
|  | goto fini; | 
|  | } | 
|  | if (opt_local->srun_opt->test_exec && | 
|  | !(fullpath = search_path( | 
|  | opt_local->chdir, exec_name, true, X_OK, true))) { | 
|  | error("Line %d of configuration file %s, program %s not executable", | 
|  | line_num, config_fname, exec_name); | 
|  | rc = -1; | 
|  | goto fini; | 
|  | } | 
|  | xfree(fullpath); | 
|  | } | 
|  |  | 
|  | for (i = 0; i < opt_local->ntasks; i++) { | 
|  | if (!bit_test(task_mask, i)) { | 
|  | error("Configuration file %s invalid, " | 
|  | "no record for task id %d", | 
|  | config_fname, i); | 
|  | rc = -1; | 
|  | goto fini; | 
|  | } | 
|  | } | 
|  |  | 
|  | fini:	fclose(config_fd); | 
|  | FREE_NULL_BITMAP(task_mask); | 
|  | return rc; | 
|  | } |