blob: 6b73b70e8994530a0234ee9f59e3145c7a5e4160 [file] [log] [blame] [edit]
/*****************************************************************************\
* step_ctx.c - step_ctx task functions for use by AIX/POE
*****************************************************************************
* Copyright (C) 2004-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.schedmd.com/slurmdocs/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/types.h>
#include "slurm/slurm.h"
#include "src/common/bitstring.h"
#include "src/common/hostlist.h"
#include "src/common/net.h"
#include "src/common/slurm_cred.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/slurm_cred.h"
#include "src/api/step_ctx.h"
/*
 * _job_fake_cred - build a placeholder credential for a step context and
 *	store it in ctx->step_resp->cred.
 * IN/OUT ctx - step context; ctx->step_req, ctx->step_resp and
 *	ctx->step_resp->step_layout must already be populated
 *
 * The credential describes one socket with one core, repeated for every
 * node in the step layout, and marks all of those cores as set in both the
 * job and the step core bitmaps. Memory limits are left at 0.
 */
static void
_job_fake_cred(struct slurm_step_ctx_struct *ctx)
{
	slurm_cred_arg_t arg;
	uint32_t node_cnt = ctx->step_resp->step_layout->node_cnt;

	/* Zero the whole argument block first so that any member not
	 * assigned below reaches slurm_cred_faker() as 0/NULL rather than
	 * as indeterminate stack contents. */
	memset(&arg, 0, sizeof(arg));

	arg.jobid = ctx->job_id;
	arg.stepid = ctx->step_resp->job_step_id;
	arg.uid = ctx->user_id;

	arg.job_nhosts = node_cnt;
	arg.job_hostlist = ctx->step_resp->step_layout->node_list;
	arg.job_mem_limit = 0;

	arg.step_hostlist = ctx->step_req->node_list;
	arg.step_mem_limit = 0;

	/* One bit per node, all of them set, for both job and step.
	 * NOTE(review): node_cnt == 0 would make "node_cnt-1" wrap; this
	 * assumes the layout always holds at least one node - confirm. */
	arg.job_core_bitmap = bit_alloc(node_cnt);
	bit_nset(arg.job_core_bitmap, 0, node_cnt-1);
	arg.step_core_bitmap = bit_alloc(node_cnt);
	bit_nset(arg.step_core_bitmap, 0, node_cnt-1);

	/* A single (1 socket x 1 core) record repeated node_cnt times */
	arg.cores_per_socket = xmalloc(sizeof(uint16_t));
	arg.cores_per_socket[0] = 1;
	arg.sockets_per_node = xmalloc(sizeof(uint16_t));
	arg.sockets_per_node[0] = 1;
	arg.sock_core_rep_count = xmalloc(sizeof(uint32_t));
	arg.sock_core_rep_count[0] = node_cnt;

	/* NOTE(review): the bitmaps and arrays allocated above are not
	 * released here. Whether that is a leak depends on whether
	 * slurm_cred_faker() copies them or keeps the pointers - confirm
	 * before adding frees. */
	ctx->step_resp->cred = slurm_cred_faker(&arg);
}
static job_step_create_request_msg_t *_create_step_request(
const slurm_step_ctx_params_t *step_params)
{
job_step_create_request_msg_t *step_req =
xmalloc(sizeof(job_step_create_request_msg_t));
step_req->job_id = step_params->job_id;
step_req->user_id = (uint32_t)step_params->uid;
step_req->min_nodes = step_params->min_nodes;
step_req->max_nodes = step_params->max_nodes;
step_req->cpu_count = step_params->cpu_count;
step_req->num_tasks = step_params->task_count;
step_req->relative = step_params->relative;
step_req->resv_port_cnt = step_params->resv_port_cnt;
step_req->exclusive = step_params->exclusive;
step_req->immediate = step_params->immediate;
step_req->ckpt_interval = step_params->ckpt_interval;
step_req->ckpt_dir = xstrdup(step_params->ckpt_dir);
step_req->features = xstrdup(step_params->features);
step_req->gres = xstrdup(step_params->gres);
step_req->task_dist = step_params->task_dist;
step_req->plane_size = step_params->plane_size;
step_req->node_list = xstrdup(step_params->node_list);
step_req->network = xstrdup(step_params->network);
step_req->name = xstrdup(step_params->name);
step_req->no_kill = step_params->no_kill;
step_req->overcommit = step_params->overcommit ? 1 : 0;
step_req->mem_per_cpu = step_params->mem_per_cpu;
step_req->time_limit = step_params->time_limit;
return step_req;
}
/*
 * slurm_step_ctx_create - Create a job step and its context.
 * IN step_params - job step parameters
 * RET the step context or NULL on failure with slurm errno set
 * NOTE: Free allocated memory using slurm_step_ctx_destroy.
 * NOTE: On success errno is reset to 0 before returning.
 */
extern slurm_step_ctx_t *
slurm_step_ctx_create (const slurm_step_ctx_params_t *step_params)
{
	struct slurm_step_ctx_struct *new_ctx;
	job_step_create_request_msg_t *req;
	job_step_create_response_msg_t *resp = NULL;
	int listen_fd = -1;
	short listen_port = 0;
	int saved_errno;

	/* Start by turning the caller's parameters into a request message */
	req = _create_step_request(step_params);

	/* Messages are handled later by the step_launch.c message handler,
	 * but the listening socket has to exist now so that the controller
	 * can be told which port to use.
	 */
	if (net_stream_listen(&listen_fd, &listen_port) < 0) {
		saved_errno = errno;
		error("unable to initialize step context socket: %m");
		slurm_free_job_step_create_request_msg(req);
		errno = saved_errno;
		return NULL;
	}
	req->port = listen_port;
	req->host = xshort_hostname();

	if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) {
		/* capture errno before the cleanup calls can disturb it */
		saved_errno = errno;
		slurm_free_job_step_create_request_msg(req);
		close(listen_fd);
		errno = saved_errno;
		return NULL;
	}

	/* The context is fully populated before the launch state is built
	 * from it; request and response are now held by the context. */
	new_ctx = xmalloc(sizeof(struct slurm_step_ctx_struct));
	new_ctx->launch_state = NULL;
	new_ctx->magic = STEP_CTX_MAGIC;
	new_ctx->job_id = req->job_id;
	new_ctx->user_id = req->user_id;
	new_ctx->step_req = req;
	new_ctx->step_resp = resp;
	new_ctx->verbose_level = step_params->verbose_level;

	new_ctx->launch_state = step_launch_state_create(new_ctx);
	new_ctx->launch_state->slurmctld_socket_fd = listen_fd;

	errno = 0;
	return (slurm_step_ctx_t *)new_ctx;
}
/*
* slurm_step_ctx_create_no_alloc - Create a job step and its context without
* getting an allocation.
* IN step_params - job step parameters
* IN step_id - since we are faking it give me the id to use
* RET the step context or NULL on failure with slurm errno set
* NOTE: Free allocated memory using slurm_step_ctx_destroy.
*/
extern slurm_step_ctx_t *
slurm_step_ctx_create_no_alloc (const slurm_step_ctx_params_t *step_params,
uint32_t step_id)
{
struct slurm_step_ctx_struct *ctx = NULL;
job_step_create_request_msg_t *step_req = NULL;
job_step_create_response_msg_t *step_resp = NULL;
int sock = -1;
short port = 0;
int errnum = 0;
int cyclic = (step_params->task_dist == SLURM_DIST_CYCLIC);
/* First copy the user's step_params into a step request struct */
step_req = _create_step_request(step_params);
/* We will handle the messages in the step_launch.c mesage handler,
* but we need to open the socket right now so we can tell the
* controller which port to use.
*/
if (net_stream_listen(&sock, &port) < 0) {
errnum = errno;
error("unable to initialize step context socket: %m");
slurm_free_job_step_create_request_msg(step_req);
goto fail;
}
step_req->port = port;
step_req->host = xshort_hostname();
/* Then make up a reponse with only certain things filled in */
step_resp = (job_step_create_response_msg_t *)
xmalloc(sizeof(job_step_create_response_msg_t));
step_resp->step_layout = fake_slurm_step_layout_create(
step_req->node_list,
NULL, NULL,
step_req->min_nodes,
step_req->num_tasks);
if (switch_alloc_jobinfo(&step_resp->switch_job) < 0)
fatal("switch_alloc_jobinfo: %m");
if (switch_build_jobinfo(step_resp->switch_job,
step_resp->step_layout->node_list,
step_resp->step_layout->tasks,
cyclic, step_req->network) < 0)
fatal("switch_build_jobinfo: %m");
step_resp->job_step_id = step_id;
ctx = xmalloc(sizeof(struct slurm_step_ctx_struct));
ctx->launch_state = NULL;
ctx->magic = STEP_CTX_MAGIC;
ctx->job_id = step_req->job_id;
ctx->user_id = step_req->user_id;
ctx->step_req = step_req;
ctx->step_resp = step_resp;
ctx->verbose_level = step_params->verbose_level;
ctx->launch_state = step_launch_state_create(ctx);
ctx->launch_state->slurmctld_socket_fd = sock;
_job_fake_cred(ctx);
fail:
errno = errnum;
return (slurm_step_ctx_t *)ctx;
}
/*
 * slurm_step_ctx_get - get parameters from a job step context.
 * IN ctx - job step context generated by slurm_step_ctx_create
 * IN ctx_key - SLURM_STEP_CTX_* value selecting what to fetch
 * IN/OUT ... - arguments that depend on ctx_key:
 *	SLURM_STEP_CTX_JOBID      uint32_t *   OUT job id
 *	SLURM_STEP_CTX_STEPID     uint32_t *   OUT step id
 *	SLURM_STEP_CTX_TASKS      uint16_t **  OUT layout "tasks" array
 *	SLURM_STEP_CTX_TID        uint32_t     IN  node index (zero origin),
 *	                          uint32_t **  OUT layout "tids" array of
 *	                                           that node
 *	SLURM_STEP_CTX_RESP       job_step_create_response_msg_t **
 *	SLURM_STEP_CTX_CRED       slurm_cred_t **
 *	SLURM_STEP_CTX_SWITCH_JOB switch_jobinfo_t **
 *	SLURM_STEP_CTX_NUM_HOSTS  uint32_t *   OUT layout node count
 *	SLURM_STEP_CTX_HOST       uint32_t     IN  node index (zero origin),
 *	                          char **      OUT nodelist_nth_host() result
 *	SLURM_STEP_CTX_USER_MANAGED_SOCKETS
 *	                          int *        OUT number of tasks requested,
 *	                          int **       OUT user managed socket array
 * RET SLURM_SUCCESS or SLURM_ERROR (with slurm_errno set)
 * NOTE: Except for SLURM_STEP_CTX_HOST, pointers returned through the
 *	OUT arguments are the context's own pointers, not copies.
 * NOTE: SLURM_STEP_CTX_USER_MANAGED_SOCKETS returns SLURM_ERROR without
 *	setting errno when user managed IO is not available; the two OUT
 *	arguments are then set to 0 and NULL.
 */
extern int
slurm_step_ctx_get (slurm_step_ctx_t *ctx, int ctx_key, ...)
{
	va_list ap;
	int rc = SLURM_SUCCESS;
	uint32_t node_inx;
	uint16_t **uint16_array_pptr = (uint16_t **) NULL;
	uint32_t *uint32_ptr;
	uint32_t **uint32_array_pptr = (uint32_t **) NULL;
	char **char_array_pptr = (char **) NULL;
	job_step_create_response_msg_t ** step_resp_pptr;
	slurm_cred_t **cred; /* Slurm job credential */
	switch_jobinfo_t **switch_job;
	int *int_ptr;
	int **int_array_pptr = (int **) NULL;

	if ((ctx == NULL) || (ctx->magic != STEP_CTX_MAGIC)) {
		slurm_seterrno(EINVAL);
		return SLURM_ERROR;
	}

	va_start(ap, ctx_key);
	switch (ctx_key) {
	case SLURM_STEP_CTX_JOBID:
		uint32_ptr = (uint32_t *) va_arg(ap, void *);
		*uint32_ptr = ctx->job_id;
		break;
	case SLURM_STEP_CTX_STEPID:
		uint32_ptr = (uint32_t *) va_arg(ap, void *);
		*uint32_ptr = ctx->step_resp->job_step_id;
		break;
	case SLURM_STEP_CTX_TASKS:
		uint16_array_pptr = (uint16_t **) va_arg(ap, void *);
		*uint16_array_pptr = ctx->step_resp->step_layout->tasks;
		break;
	case SLURM_STEP_CTX_TID:
		node_inx = va_arg(ap, uint32_t);
		/* Valid indexes are 0 .. node_cnt-1; node_cnt itself would
		 * index one element past the end of tids[]. */
		if (node_inx >= ctx->step_resp->step_layout->node_cnt) {
			slurm_seterrno(EINVAL);
			rc = SLURM_ERROR;
			break;
		}
		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
		*uint32_array_pptr =
			ctx->step_resp->step_layout->tids[node_inx];
		break;
	case SLURM_STEP_CTX_RESP:
		step_resp_pptr = (job_step_create_response_msg_t **)
			va_arg(ap, void *);
		*step_resp_pptr = ctx->step_resp;
		break;
	case SLURM_STEP_CTX_CRED:
		cred = (slurm_cred_t **) va_arg(ap, void *);
		*cred = ctx->step_resp->cred;
		break;
	case SLURM_STEP_CTX_SWITCH_JOB:
		switch_job = (switch_jobinfo_t **) va_arg(ap, void *);
		*switch_job = ctx->step_resp->switch_job;
		break;
	case SLURM_STEP_CTX_NUM_HOSTS:
		uint32_ptr = (uint32_t *) va_arg(ap, void *);
		*uint32_ptr = ctx->step_resp->step_layout->node_cnt;
		break;
	case SLURM_STEP_CTX_HOST:
		node_inx = va_arg(ap, uint32_t);
		/* Same bound as SLURM_STEP_CTX_TID: node_cnt is out of range */
		if (node_inx >= ctx->step_resp->step_layout->node_cnt) {
			slurm_seterrno(EINVAL);
			rc = SLURM_ERROR;
			break;
		}
		char_array_pptr = (char **) va_arg(ap, void *);
		*char_array_pptr = nodelist_nth_host(
			ctx->step_resp->step_layout->node_list, node_inx);
		break;
	case SLURM_STEP_CTX_USER_MANAGED_SOCKETS:
		/* Both OUT pointers are fetched up front so that they can be
		 * cleared on the failure path as well. */
		int_ptr = va_arg(ap, int *);
		int_array_pptr = va_arg(ap, int **);
		if (ctx->launch_state == NULL
		    || ctx->launch_state->user_managed_io == false
		    || ctx->launch_state->io.user == NULL) {
			*int_ptr = 0;
			*int_array_pptr = (int *)NULL;
			rc = SLURM_ERROR;
			break;
		}
		*int_ptr = ctx->launch_state->tasks_requested;
		*int_array_pptr = ctx->launch_state->io.user->sockets;
		break;
	default:
		slurm_seterrno(EINVAL);
		rc = SLURM_ERROR;
	}
	va_end(ap);

	return rc;
}
/*
 * slurm_jobinfo_ctx_get - get parameters from jobinfo context.
 * IN jobinfo - job information from context, returned by slurm_step_ctx_get()
 * IN data_type - type of data required, specific to the switch type
 * OUT data - the requested data type
 * RET SLURM_SUCCESS or SLURM_ERROR (with slurm_errno set); for a non-NULL
 *	jobinfo this is whatever switch_g_get_jobinfo() returns
 */
extern int
slurm_jobinfo_ctx_get(switch_jobinfo_t *jobinfo, int data_type, void *data)
{
	/* Hand a usable jobinfo straight to the switch layer */
	if (jobinfo != NULL)
		return switch_g_get_jobinfo(jobinfo, data_type, data);

	slurm_seterrno(EINVAL);
	return SLURM_ERROR;
}
/*
 * slurm_step_ctx_destroy - free allocated memory for a job step context.
 * IN ctx - job step context generated by slurm_step_ctx_create
 * RET SLURM_SUCCESS, or SLURM_ERROR (with slurm_errno set to EINVAL) when
 *	ctx is NULL or does not carry the step context magic value
 */
extern int
slurm_step_ctx_destroy (slurm_step_ctx_t *ctx)
{
	int valid = (ctx != NULL) && (ctx->magic == STEP_CTX_MAGIC);

	if (!valid) {
		slurm_seterrno(EINVAL);
		return SLURM_ERROR;
	}

	/* Release the request, the response and the launch state that
	 * hang off the context, then the context itself. */
	slurm_free_job_step_create_request_msg(ctx->step_req);
	slurm_free_job_step_create_response_msg(ctx->step_resp);
	step_launch_state_destroy(ctx->launch_state);
	xfree(ctx);

	return SLURM_SUCCESS;
}
/*
 * slurm_step_ctx_daemon_per_node_hack - Hack the step context
 *	to run a single process per node, regardless of the settings
 *	selected at slurm_step_ctx_create time.
 *
 *	This is primarily used on AIX by the slurm_ll_api in support of
 *	poe. The slurm_ll_api will want to launch a single pmd daemon
 *	on each node regardless of the number of tasks running on each
 *	node.
 * IN ctx - job step context generated by slurm_step_ctx_create
 * RET SLURM_SUCCESS or SLURM_ERROR (with slurm_errno set)
 * NOTE: The context's step layout and launch state are destroyed and
 *	replaced, so pointers to either obtained before this call must not
 *	be used afterwards.
 */
extern int
slurm_step_ctx_daemon_per_node_hack(slurm_step_ctx_t *ctx)
{
	slurm_step_layout_t *new_layout, *old_layout;
	uint32_t i;	/* same type as node_cnt, so the loop test is unsigned */

	if ((ctx == NULL) || (ctx->magic != STEP_CTX_MAGIC)) {
		slurm_seterrno(EINVAL);
		return SLURM_ERROR;
	}

	/* hack the requested task count: one task per requested node.
	 * NOTE(review): this uses step_req->min_nodes while the layout
	 * below uses the layout's node_cnt; confirm the two always agree. */
	ctx->step_req->num_tasks = ctx->step_req->min_nodes;

	/* hack the context step layout */
	old_layout = ctx->step_resp->step_layout;
	new_layout = (slurm_step_layout_t *)
		xmalloc(sizeof(slurm_step_layout_t));
	new_layout->node_cnt = old_layout->node_cnt;
	new_layout->task_cnt = old_layout->node_cnt;
	/* node_list must be duplicated before the old layout goes away */
	new_layout->node_list = xstrdup(old_layout->node_list);
	slurm_step_layout_destroy(old_layout);

	new_layout->tasks = (uint16_t *) xmalloc(sizeof(uint16_t) *
						 new_layout->node_cnt);
	new_layout->tids = (uint32_t **) xmalloc(sizeof(uint32_t *) *
						 new_layout->node_cnt);
	/* exactly one task on node i, and its task id is i */
	for (i = 0; i < new_layout->node_cnt; i++) {
		new_layout->tasks[i] = 1;
		new_layout->tids[i] = (uint32_t *)xmalloc(sizeof(uint32_t));
		new_layout->tids[i][0] = i;
	}
	ctx->step_resp->step_layout = new_layout;

	/* recreate the launch state structure now that the settings
	   have changed */
	step_launch_state_destroy(ctx->launch_state);
	ctx->launch_state = step_launch_state_create(ctx);

	return SLURM_SUCCESS;
}
/*
 * slurm_step_ctx_params_t_init - This initializes parameters
 *	in the structure that you will pass to slurm_step_ctx_create().
 *	This function will NOT allocate any new memory.
 * IN ptr - pointer to a structure allocated by the user. The structure will
 *	be initialized.
 * NOTE: job_id is taken from the SLURM_JOB_ID environment variable, or
 *	from SLURM_JOBID if the former is not set, and is NO_VAL if neither
 *	is set.
 */
extern void slurm_step_ctx_params_t_init (slurm_step_ctx_params_t *ptr)
{
	char *env_job_id;

	/* everything defaults to 0 / NULL ... */
	memset(ptr, 0, sizeof(slurm_step_ctx_params_t));

	/* ... except for the members listed here */
	ptr->uid = getuid();
	ptr->task_dist = SLURM_DIST_CYCLIC;
	ptr->relative = (uint16_t)NO_VAL;
	ptr->plane_size = (uint16_t)NO_VAL;
	ptr->resv_port_cnt = (uint16_t)NO_VAL;

	/* prefer the current variable name, then fall back to the old
	 * style name kept for backwards compatibility */
	env_job_id = getenv("SLURM_JOB_ID");
	if (env_job_id == NULL)
		env_job_id = getenv("SLURM_JOBID");

	if (env_job_id != NULL)
		ptr->job_id = (uint32_t)atol(env_job_id);
	else
		ptr->job_id = (uint32_t)NO_VAL;
}