| /*****************************************************************************\ |
| * step_mgr.c - manage the job step information of slurm |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov>, et al. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <https://computing.llnl.gov/linux/slurm/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifdef HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #include <time.h> |
| #include <ctype.h> |
| #include <errno.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/types.h> |
| #include <string.h> |
| #include <strings.h> |
| #include <unistd.h> |
| |
| #include <slurm/slurm_errno.h> |
| |
| #include "src/common/bitstring.h" |
| #include "src/common/checkpoint.h" |
| #include "src/common/forward.h" |
| #include "src/common/slurm_accounting_storage.h" |
| #include "src/common/slurm_jobacct_gather.h" |
| #include "src/common/slurm_protocol_interface.h" |
| #include "src/common/switch.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/port_mgr.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/srun_comm.h" |
| |
| #define MAX_RETRIES 10 |
| |
| static int _count_cpus(bitstr_t *bitmap); |
| static struct step_record * _create_step_record (struct job_record *job_ptr); |
| static void _dump_step_layout(struct step_record *step_ptr); |
| static void _free_step_rec(struct step_record *step_ptr); |
| static bool _is_mem_resv(void); |
| static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer); |
| static bitstr_t * _pick_step_nodes (struct job_record *job_ptr, |
| job_step_create_request_msg_t *step_spec, |
| int cpus_per_task, bool batch_step, |
| int *return_code); |
| static hostlist_t _step_range_to_hostlist(struct step_record *step_ptr, |
| uint32_t range_first, uint32_t range_last); |
| static int _step_hostname_to_inx(struct step_record *step_ptr, |
| char *node_name); |
| static void _step_dealloc_lps(struct step_record *step_ptr); |
| |
| |
| /* |
| * _create_step_record - create an empty step_record for the specified job. |
| * IN job_ptr - pointer to job table entry to have step record added |
| * RET a pointer to the record or NULL if error |
| * NOTE: allocates memory that should be xfreed with delete_step_record |
| */ |
| static struct step_record * _create_step_record(struct job_record *job_ptr) |
| { |
| struct step_record *step_ptr; |
| |
| xassert(job_ptr); |
| /* NOTE: Reserve highest step ID values for NO_VAL and |
| * SLURM_BATCH_SCRIPT */ |
| if (job_ptr->next_step_id >= 0xfffffff0) { |
| /* avoid step records in the accounting database */ |
| info("job %u has reached step id limit", job_ptr->job_id); |
| return NULL; |
| } |
| |
| step_ptr = (struct step_record *) xmalloc(sizeof (struct step_record)); |
| |
| last_job_update = time(NULL); |
| step_ptr->job_ptr = job_ptr; |
| step_ptr->start_time = time(NULL) ; |
| step_ptr->time_limit = INFINITE ; |
| step_ptr->jobacct = jobacct_gather_g_create(NULL); |
| step_ptr->ckpt_dir = NULL; |
| step_ptr->requid = -1; |
| |
| if (list_append (job_ptr->step_list, step_ptr) == NULL) |
| fatal ("_create_step_record: unable to allocate memory"); |
| |
| return step_ptr; |
| } |
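| |
| /* Illustrative only: the typical caller sequence, mirroring what |
| * step_create() below actually does: |
| * step_ptr = _create_step_record(job_ptr); |
| * if (step_ptr == NULL) |
| * return ESLURMD_TOOMANYSTEPS; |
| * step_ptr->step_id = job_ptr->next_step_id++; |
| */ |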
| |
| |
| /* |
| * delete_step_records - delete the step records for a specified job |
| * IN job_ptr - pointer to job table entry to have step records removed |
| * IN filter - determines which job steps to delete |
| * 0: delete all job steps |
| * 1: delete only job steps without a switch allocation |
| */ |
| extern void |
| delete_step_records (struct job_record *job_ptr, int filter) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| xassert(job_ptr); |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| |
| last_job_update = time(NULL); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if ((filter == 1) && (step_ptr->switch_job)) |
| continue; |
| |
| list_remove (step_iterator); |
| if (step_ptr->switch_job) { |
| switch_g_job_step_complete( |
| step_ptr->switch_job, |
| step_ptr->step_layout->node_list); |
| switch_free_jobinfo(step_ptr->switch_job); |
| } |
| checkpoint_free_jobinfo(step_ptr->check_job); |
| _free_step_rec(step_ptr); |
| } |
| |
| list_iterator_destroy (step_iterator); |
| } |
| |
| /* _free_step_rec - delete a step record's data structures */ |
| static void _free_step_rec(struct step_record *step_ptr) |
| { |
| xfree(step_ptr->host); |
| xfree(step_ptr->name); |
| slurm_step_layout_destroy(step_ptr->step_layout); |
| jobacct_gather_g_destroy(step_ptr->jobacct); |
| FREE_NULL_BITMAP(step_ptr->core_bitmap_job); |
| FREE_NULL_BITMAP(step_ptr->exit_node_bitmap); |
| FREE_NULL_BITMAP(step_ptr->step_node_bitmap); |
| xfree(step_ptr->resv_port_array); |
| xfree(step_ptr->resv_ports); |
| xfree(step_ptr->network); |
| xfree(step_ptr->ckpt_dir); |
| xfree(step_ptr); |
| } |
| |
| /* |
| * delete_step_record - delete record for job step for specified job_ptr |
| * and step_id |
| * IN job_ptr - pointer to job table entry to have step record removed |
| * IN step_id - id of the desired job step |
| * RET 0 on success, errno otherwise |
| */ |
| int |
| delete_step_record (struct job_record *job_ptr, uint32_t step_id) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| int error_code; |
| |
| xassert(job_ptr); |
| error_code = ENOENT; |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| last_job_update = time(NULL); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->step_id == step_id) { |
| list_remove (step_iterator); |
| /* FIXME: If job step record is preserved after completion, |
| * the switch_g_job_step_complete() must be called upon completion |
| * and not upon record purging. Presently both events occur |
| * simultaneously. */ |
| if (step_ptr->switch_job) { |
| switch_g_job_step_complete( |
| step_ptr->switch_job, |
| step_ptr->step_layout->node_list); |
| switch_free_jobinfo (step_ptr->switch_job); |
| } |
| resv_port_free(step_ptr); |
| checkpoint_free_jobinfo (step_ptr->check_job); |
| _free_step_rec(step_ptr); |
| error_code = 0; |
| break; |
| } |
| } |
| |
| list_iterator_destroy (step_iterator); |
| return error_code; |
| } |
| |
| |
| /* |
| * dump_step_desc - dump the incoming step initiate request message |
| * IN step_spec - job step request specification from RPC |
| */ |
| void |
| dump_step_desc(job_step_create_request_msg_t *step_spec) |
| { |
| debug3("StepDesc: user_id=%u job_id=%u node_count=%u cpu_count=%u", |
| step_spec->user_id, step_spec->job_id, |
| step_spec->node_count, step_spec->cpu_count); |
| debug3(" num_tasks=%u relative=%u task_dist=%u node_list=%s", |
| step_spec->num_tasks, step_spec->relative, |
| step_spec->task_dist, step_spec->node_list); |
| debug3(" host=%s port=%u name=%s network=%s exclusive=%u", |
| step_spec->host, step_spec->port, step_spec->name, |
| step_spec->network, step_spec->exclusive); |
| debug3(" checkpoint-dir=%s checkpoint_int=%u", |
| step_spec->ckpt_dir, step_spec->ckpt_interval); |
| debug3(" mem_per_cpu=%u resv_port_cnt=%u immediate=%u no_kill=%u", |
| step_spec->mem_per_cpu, step_spec->resv_port_cnt, |
| step_spec->immediate, step_spec->no_kill); |
| debug3(" overcommit=%d time_limit=%u", |
| step_spec->overcommit, step_spec->time_limit); |
| } |
| |
| |
| /* |
| * find_step_record - return a pointer to the step record with the given |
| * job_id and step_id |
| * IN job_ptr - pointer to job table entry whose steps are to be searched |
| * IN step_id - id of the desired job step or NO_VAL for first one |
| * RET pointer to the job step's record, NULL on error |
| */ |
| struct step_record * |
| find_step_record(struct job_record *job_ptr, uint32_t step_id) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| if (job_ptr == NULL) |
| return NULL; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if ((step_ptr->step_id == step_id) || (step_id == NO_VAL)) |
| break; |
| } |
| list_iterator_destroy (step_iterator); |
| |
| return step_ptr; |
| } |
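| |
| /* Illustrative only: passing step_id == NO_VAL returns the first step |
| * on the list, e.g. step_ptr = find_step_record(job_ptr, NO_VAL); |
| * The returned pointer is owned by the job's step_list and must not |
| * be freed directly by the caller. */ |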
| |
| |
| /* |
| * job_step_signal - signal the specified job step |
| * IN job_id - id of the job whose step is to be signaled |
| * IN step_id - id of the job step to be signaled |
| * IN signal - signal number to be sent |
| * IN uid - user id of user issuing the RPC |
| * RET 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| int job_step_signal(uint32_t job_id, uint32_t step_id, |
| uint16_t signal, uid_t uid) |
| { |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| error("job_step_cancel: invalid job id %u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| if (!IS_JOB_RUNNING(job_ptr)) { |
| verbose("job_step_signal: step %u.%u can not be sent signal " |
| "%u from state=%s", job_id, step_id, signal, |
| job_state_string(job_ptr->job_state)); |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| |
| if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) { |
| error("Security violation, JOB_CANCEL RPC from uid %d", |
| uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| step_ptr = find_step_record(job_ptr, step_id); |
| if (step_ptr == NULL) { |
| info("job_step_cancel step %u.%u not found", |
| job_id, step_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| /* save user ID of the one who requested the step be terminated */ |
| if (signal == SIGKILL) { |
| step_ptr->requid = uid; |
| srun_step_complete(step_ptr); |
| } |
| |
| signal_step_tasks(step_ptr, signal, REQUEST_SIGNAL_TASKS); |
| return SLURM_SUCCESS; |
| } |
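| |
| /* Illustrative only: a typical call from an RPC handler, assuming the |
| * handler already holds the needed job locks: |
| * rc = job_step_signal(job_id, step_id, SIGTERM, uid); |
| * if (rc != SLURM_SUCCESS) |
| * error("signal step %u.%u: %s", job_id, step_id, |
| * slurm_strerror(rc)); |
| */ |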
| |
| /* |
| * signal_step_tasks - send specific signal to specific job step |
| * IN step_ptr - step record pointer |
| * IN signal - signal to send |
| * IN msg_type - message type to send |
| */ |
| void signal_step_tasks(struct step_record *step_ptr, uint16_t signal, |
| slurm_msg_type_t msg_type) |
| { |
| int i; |
| kill_tasks_msg_t *kill_tasks_msg; |
| agent_arg_t *agent_args = NULL; |
| |
| xassert(step_ptr); |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = msg_type; |
| agent_args->retry = 1; |
| agent_args->hostlist = hostlist_create(""); |
| kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t)); |
| kill_tasks_msg->job_id = step_ptr->job_ptr->job_id; |
| kill_tasks_msg->job_step_id = step_ptr->step_id; |
| kill_tasks_msg->signal = signal; |
| |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(step_ptr->step_node_bitmap, i) == 0) |
| continue; |
| hostlist_push(agent_args->hostlist, |
| node_record_table_ptr[i].name); |
| agent_args->node_count++; |
| #ifdef HAVE_FRONT_END /* Operate only on front-end */ |
| break; |
| #endif |
| } |
| |
| if (agent_args->node_count == 0) { |
| xfree(kill_tasks_msg); |
| hostlist_destroy(agent_args->hostlist); |
| xfree(agent_args); |
| return; |
| } |
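| |
| /* On success, agent_queue_request() takes ownership of agent_args and |
| * kill_tasks_msg, which is why they are freed above only on the |
| * no-target early return. */ |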
| |
| agent_args->msg_args = kill_tasks_msg; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
| |
| /* |
| * job_step_complete - note normal completion the specified job step |
| * IN job_id - id of the job to be completed |
| * IN step_id - id of the job step to be completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN job_return_code - job's return code, if set then set state to JOB_FAILED |
| * RET 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer to global job list |
| * last_job_update - time of last job table update |
| */ |
| int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, |
| bool requeue, uint32_t job_return_code) |
| { |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| int error_code; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| info("job_step_complete: invalid job id %u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) { |
| error("Security violation, JOB_COMPLETE RPC from uid %d", |
| uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| step_ptr = find_step_record(job_ptr, step_id); |
| if (step_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID; |
| |
| jobacct_storage_g_step_complete(acct_db_conn, step_ptr); |
| _step_dealloc_lps(step_ptr); |
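| /* If this was the last step of a job marked kill_on_step_done, |
| * complete the entire job instead; the job completion logic then |
| * purges the remaining step records. */ |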
| |
| if ((job_ptr->kill_on_step_done) |
| && (list_count(job_ptr->step_list) <= 1) |
| && (!IS_JOB_FINISHED(job_ptr))) |
| return job_complete(job_id, uid, requeue, job_return_code); |
| |
| last_job_update = time(NULL); |
| error_code = delete_step_record(job_ptr, step_id); |
| if (error_code == ENOENT) { |
| info("job_step_complete step %u.%u not found", job_id, |
| step_id); |
| return ESLURM_ALREADY_DONE; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _pick_step_nodes - select nodes for a job step so as to satisfy the |
| * superset of its constraints |
| * IN job_ptr - pointer to job to have new step started |
| * IN step_spec - job step specification |
| * IN cpus_per_task - NOTE could be zero |
| * IN batch_step - if set then step is a batch script |
| * OUT return_code - exit code or SLURM_SUCCESS |
| * global: node_record_table_ptr - pointer to global node table |
| * NOTE: returns all of a job's nodes if step_spec->node_count == INFINITE |
| * NOTE: returned bitmap must be freed by the caller using bit_free() |
| */ |
| static bitstr_t * |
| _pick_step_nodes (struct job_record *job_ptr, |
| job_step_create_request_msg_t *step_spec, |
| int cpus_per_task, |
| bool batch_step, int *return_code) |
| { |
| struct node_record *node_ptr; |
| bitstr_t *nodes_avail = NULL, *nodes_idle = NULL; |
| bitstr_t *nodes_picked = NULL, *node_tmp = NULL; |
| int error_code, nodes_picked_cnt=0, cpus_picked_cnt = 0, i, task_cnt; |
| int mem_blocked_nodes = 0, mem_blocked_cpus = 0; |
| ListIterator step_iterator; |
| struct step_record *step_p; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| |
| xassert(job_resrcs_ptr); |
| xassert(job_resrcs_ptr->cpus); |
| xassert(job_resrcs_ptr->cpus_used); |
| |
| *return_code = SLURM_SUCCESS; |
| if (job_ptr->node_bitmap == NULL) { |
| *return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| return NULL; |
| } |
| |
| nodes_avail = bit_copy (job_ptr->node_bitmap); |
| if (nodes_avail == NULL) |
| fatal("bit_copy malloc failure"); |
| bit_and (nodes_avail, up_node_bitmap); |
| |
| if (step_spec->mem_per_cpu && |
| ((job_resrcs_ptr->memory_allocated == NULL) || |
| (job_resrcs_ptr->memory_used == NULL))) { |
| error("_pick_step_nodes: lack memory allocation details " |
| "to enforce memory limits for job %u", job_ptr->job_id); |
| step_spec->mem_per_cpu = 0; |
| } |
| |
| if (job_ptr->next_step_id == 0) { |
| if (job_ptr->details && job_ptr->details->prolog_running) { |
| *return_code = ESLURM_PROLOG_RUNNING; |
| return NULL; |
| } |
| for (i=bit_ffs(job_ptr->node_bitmap); i<node_record_count; |
| i++) { |
| if (!bit_test(job_ptr->node_bitmap, i)) |
| continue; |
| node_ptr = node_record_table_ptr + i; |
| if (IS_NODE_POWER_SAVE(node_ptr) || |
| IS_NODE_NO_RESPOND(node_ptr)) { |
| /* Node is/was powered down. Need to wait |
| * for it to start responding again. */ |
| FREE_NULL_BITMAP(nodes_avail); |
| *return_code = ESLURM_NODES_BUSY; |
| /* Update job's end-time to allow for node |
| * boot time. */ |
| if (job_ptr->time_limit != INFINITE) { |
| job_ptr->end_time = time(NULL) + |
| (job_ptr->time_limit * 60); |
| } |
| return NULL; |
| } |
| } |
| job_ptr->job_state &= (~JOB_CONFIGURING); |
| debug("Configuration for job %u complete", job_ptr->job_id); |
| } |
| |
| /* In exclusive mode, just satisfy the processor count. |
| * Do not use nodes that have no unused CPUs or insufficient |
| * unused memory */ |
| if (step_spec->exclusive) { |
| int avail_cpus, avail_tasks, total_cpus, total_tasks, node_inx; |
| int i_first, i_last; |
| uint32_t avail_mem, total_mem; |
| uint32_t nodes_picked_cnt = 0; |
| uint32_t tasks_picked_cnt = 0, total_task_cnt = 0; |
| bitstr_t *selected_nodes = NULL; |
| |
| if (step_spec->node_list) { |
| error_code = node_name2bitmap(step_spec->node_list, |
| false, |
| &selected_nodes); |
| if (error_code) { |
| info("_pick_step_nodes: invalid node list (%s) " |
| "for job step %u", |
| step_spec->node_list, job_ptr->job_id); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| if (!bit_super_set(selected_nodes, |
| job_ptr->node_bitmap)) { |
| info("_pick_step_nodes: selected nodes (%s) " |
| "not in job %u", |
| step_spec->node_list, job_ptr->job_id); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| if (!bit_super_set(selected_nodes, up_node_bitmap)) { |
| info("_pick_step_nodes: selected node is DOWN", |
| step_spec->node_list); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| } |
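| |
| /* Scan the job's allocated nodes, counting how many tasks the |
| * still-unused cpus (and memory, if enforced) on each node could |
| * support; prune a node from nodes_avail once the request is |
| * already satisfied. */ |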
| |
| node_inx = -1; |
| i_first = bit_ffs(job_resrcs_ptr->node_bitmap); |
| i_last = bit_fls(job_resrcs_ptr->node_bitmap); |
| for (i=i_first; i<=i_last; i++) { |
| if (!bit_test(job_resrcs_ptr->node_bitmap, i)) |
| continue; |
| node_inx++; |
| if (!bit_test(nodes_avail, i)) |
| continue; /* node now DOWN */ |
| avail_cpus = job_resrcs_ptr->cpus[node_inx] - |
| job_resrcs_ptr->cpus_used[node_inx]; |
| total_cpus = job_resrcs_ptr->cpus[node_inx]; |
| if (cpus_per_task > 0) { |
| avail_tasks = avail_cpus / cpus_per_task; |
| total_tasks = total_cpus / cpus_per_task; |
| } else { |
| avail_tasks = step_spec->num_tasks; |
| total_tasks = step_spec->num_tasks; |
| } |
| if (step_spec->mem_per_cpu) { |
| avail_mem = job_resrcs_ptr-> |
| memory_allocated[node_inx] - |
| job_resrcs_ptr->memory_used[node_inx]; |
| task_cnt = avail_mem / step_spec->mem_per_cpu; |
| if (cpus_per_task > 0) |
| task_cnt /= cpus_per_task; |
| avail_tasks = MIN(avail_tasks, task_cnt); |
| |
| total_mem = job_resrcs_ptr-> |
| memory_allocated[node_inx]; |
| task_cnt = total_mem / step_spec->mem_per_cpu; |
| if (cpus_per_task > 0) |
| task_cnt /= cpus_per_task; |
| total_tasks = MIN(total_tasks, task_cnt); |
| } |
| if ((avail_tasks <= 0) || |
| ((selected_nodes == NULL) && |
| (nodes_picked_cnt >= step_spec->node_count) && |
| (tasks_picked_cnt > 0) && |
| (tasks_picked_cnt >= step_spec->num_tasks))) |
| bit_clear(nodes_avail, i); |
| else { |
| nodes_picked_cnt++; |
| tasks_picked_cnt += avail_tasks; |
| } |
| total_task_cnt += total_tasks; |
| } |
| |
| if (selected_nodes) { |
| if (!bit_equal(selected_nodes, nodes_avail)) { |
| /* some required nodes have no available |
| * processors, defer request */ |
| tasks_picked_cnt = 0; |
| } |
| bit_free(selected_nodes); |
| } |
| |
| if (tasks_picked_cnt >= step_spec->num_tasks) |
| return nodes_avail; |
| FREE_NULL_BITMAP(nodes_avail); |
| if (total_task_cnt >= step_spec->num_tasks) |
| *return_code = ESLURM_NODES_BUSY; |
| else |
| *return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| return NULL; |
| } |
| |
| if (step_spec->mem_per_cpu && _is_mem_resv()) { |
| int node_inx = 0, usable_mem; |
| for (i=bit_ffs(job_resrcs_ptr->node_bitmap); |
| i<node_record_count; i++) { |
| if (!bit_test(job_resrcs_ptr->node_bitmap, i)) |
| continue; |
| usable_mem = job_resrcs_ptr-> |
| memory_allocated[node_inx] - |
| job_resrcs_ptr->memory_used[node_inx]; |
| task_cnt = usable_mem / step_spec->mem_per_cpu; |
| if (cpus_per_task > 0) |
| task_cnt /= cpus_per_task; |
| if (task_cnt <= 0) { |
| if (step_spec->node_count == INFINITE) { |
| FREE_NULL_BITMAP(nodes_avail); |
| *return_code = |
| ESLURM_INVALID_TASK_MEMORY; |
| return NULL; |
| } |
| bit_clear(nodes_avail, i); |
| mem_blocked_nodes++; |
| task_cnt = job_resrcs_ptr-> |
| memory_allocated[node_inx] / |
| step_spec->mem_per_cpu; |
| mem_blocked_cpus += MIN(task_cnt, |
| job_resrcs_ptr-> |
| cpus[node_inx]); |
| } |
| if (++node_inx >= job_resrcs_ptr->nhosts) |
| break; |
| } |
| } |
| |
| if (step_spec->node_count == INFINITE) /* use all nodes */ |
| return nodes_avail; |
| |
| if (step_spec->node_list) { |
| bitstr_t *selected_nodes = NULL; |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) |
| info("selected nodelist is %s", step_spec->node_list); |
| error_code = node_name2bitmap(step_spec->node_list, false, |
| &selected_nodes); |
| |
| if (error_code) { |
| info("_pick_step_nodes: invalid node list %s", |
| step_spec->node_list); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| if (!bit_super_set(selected_nodes, job_ptr->node_bitmap)) { |
| info ("_pick_step_nodes: requested nodes %s not part " |
| "of job %u", |
| step_spec->node_list, job_ptr->job_id); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| if (!bit_super_set(selected_nodes, nodes_avail)) { |
| *return_code = ESLURM_INVALID_TASK_MEMORY; |
| info ("_pick_step_nodes: requested nodes %s " |
| "have inadequate memory", |
| step_spec->node_list); |
| bit_free(selected_nodes); |
| goto cleanup; |
| } |
| if (step_spec->task_dist == SLURM_DIST_ARBITRARY) { |
| /* If we are in arbitrary mode we need to make |
| * sure we aren't running on an Elan switch. |
| * If we aren't, set the number of available |
| * nodes to the number we were given, since |
| * that is what the user wants to run on. |
| */ |
| if (!strcmp(slurmctld_conf.switch_type, |
| "switch/elan")) { |
| error("Can't do an ARBITRARY task layout with " |
| "switch type elan. Switching DIST type " |
| "to BLOCK"); |
| xfree(step_spec->node_list); |
| step_spec->task_dist = SLURM_DIST_BLOCK; |
| FREE_NULL_BITMAP(selected_nodes); |
| step_spec->node_count = |
| bit_set_count(nodes_avail); |
| } else { |
| step_spec->node_count = |
| bit_set_count(selected_nodes); |
| } |
| } |
| if (selected_nodes) { |
| /* use selected nodes to run the job and |
| * make them unavailable for future use */ |
| |
| /* If we have selected more than we requested |
| * make the available nodes equal to the |
| * selected nodes and we will pick from that |
| * list later on in the function. |
| * Other than that copy the nodes selected as |
| * the nodes we want. |
| */ |
| if (step_spec->node_count && |
| (bit_set_count(selected_nodes) > |
| step_spec->node_count)) { |
| nodes_picked = |
| bit_alloc(bit_size(nodes_avail)); |
| if (nodes_picked == NULL) |
| fatal("bit_alloc malloc failure"); |
| bit_free(nodes_avail); |
| nodes_avail = selected_nodes; |
| selected_nodes = NULL; |
| } else { |
| nodes_picked = bit_copy(selected_nodes); |
| bit_not(selected_nodes); |
| bit_and(nodes_avail, selected_nodes); |
| bit_free(selected_nodes); |
| } |
| } |
| } else { |
| nodes_picked = bit_alloc(bit_size(nodes_avail)); |
| if (nodes_picked == NULL) |
| fatal("bit_alloc malloc failure"); |
| } |
| |
| if (step_spec->relative != (uint16_t)NO_VAL) { |
| /* Remove first (step_spec->relative) nodes from |
| * available list */ |
| bitstr_t *relative_nodes = NULL; |
| relative_nodes = |
| bit_pick_cnt(nodes_avail, step_spec->relative); |
| if (relative_nodes == NULL) { |
| info ("_pick_step_nodes: " |
| "Invalid relative value (%u) for job %u", |
| step_spec->relative, job_ptr->job_id); |
| goto cleanup; |
| } |
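| /* nodes_avail &= ~relative_nodes, i.e. remove the first |
| * "relative" nodes from the available set */ |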
| bit_not (relative_nodes); |
| bit_and (nodes_avail, relative_nodes); |
| bit_free (relative_nodes); |
| } else { |
| nodes_idle = bit_alloc (bit_size (nodes_avail) ); |
| if (nodes_idle == NULL) |
| fatal("bit_alloc malloc failure"); |
| step_iterator = |
| list_iterator_create(job_ptr->step_list); |
| while ((step_p = (struct step_record *) |
| list_next(step_iterator))) { |
| bit_or(nodes_idle, step_p->step_node_bitmap); |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| char *temp; |
| temp = bitmap2node_name(step_p-> |
| step_node_bitmap); |
| info("step %u.%u has nodes %s", |
| job_ptr->job_id, step_p->step_id, temp); |
| xfree(temp); |
| } |
| } |
| list_iterator_destroy (step_iterator); |
| bit_not(nodes_idle); |
| bit_and(nodes_idle, nodes_avail); |
| } |
| |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| char *temp1, *temp2; |
| temp1 = bitmap2node_name(nodes_avail); |
| temp2 = bitmap2node_name(nodes_idle); |
| info("step pick %u nodes, avail:%s idle:%s", |
| step_spec->node_count, temp1, temp2); |
| xfree(temp1); |
| xfree(temp2); |
| } |
| |
| /* if user specifies step needs a specific processor count and |
| * all nodes have the same processor count, just translate this to |
| * a node count */ |
| if (step_spec->cpu_count && job_ptr->job_resrcs && |
| (job_ptr->job_resrcs->cpu_array_cnt == 1) && |
| (job_ptr->job_resrcs->cpu_array_value)) { |
| i = (step_spec->cpu_count + |
| (job_ptr->job_resrcs->cpu_array_value[0] - 1)) / |
| job_ptr->job_resrcs->cpu_array_value[0]; |
| step_spec->node_count = (i > step_spec->node_count) ? |
| i : step_spec->node_count ; |
| //step_spec->cpu_count = 0; |
| } |
| |
| if (step_spec->node_count) { |
| nodes_picked_cnt = bit_set_count(nodes_picked); |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| verbose("got %u %d", step_spec->node_count, |
| nodes_picked_cnt); |
| } |
| if (nodes_idle && |
| (bit_set_count(nodes_idle) >= step_spec->node_count) && |
| (step_spec->node_count > nodes_picked_cnt)) { |
| node_tmp = bit_pick_cnt(nodes_idle, |
| (step_spec->node_count - |
| nodes_picked_cnt)); |
| if (node_tmp == NULL) |
| goto cleanup; |
| bit_or (nodes_picked, node_tmp); |
| bit_not (node_tmp); |
| bit_and (nodes_idle, node_tmp); |
| bit_and (nodes_avail, node_tmp); |
| bit_free (node_tmp); |
| node_tmp = NULL; |
| nodes_picked_cnt = step_spec->node_count; |
| } |
| if (step_spec->node_count > nodes_picked_cnt) { |
| node_tmp = bit_pick_cnt(nodes_avail, |
| (step_spec->node_count - |
| nodes_picked_cnt)); |
| if (node_tmp == NULL) { |
| if (step_spec->node_count <= |
| (bit_set_count(nodes_avail) + |
| nodes_picked_cnt + |
| mem_blocked_nodes)) { |
| *return_code = |
| ESLURM_INVALID_TASK_MEMORY; |
| } |
| goto cleanup; |
| } |
| bit_or (nodes_picked, node_tmp); |
| bit_not (node_tmp); |
| bit_and (nodes_avail, node_tmp); |
| bit_free (node_tmp); |
| node_tmp = NULL; |
| nodes_picked_cnt = step_spec->node_count; |
| } |
| } |
| |
| if (step_spec->cpu_count) { |
| /* make sure the selected nodes have enough cpus */ |
| cpus_picked_cnt = _count_cpus(nodes_picked); |
| /* if the user requests more cpus than the picked |
| * nodes provide, return an error */ |
| if (step_spec->cpu_count > cpus_picked_cnt) { |
| if (step_spec->cpu_count <= |
| (cpus_picked_cnt + mem_blocked_cpus)) { |
| *return_code = ESLURM_INVALID_TASK_MEMORY; |
| } |
| debug2("Have %d nodes with %d cpus which is less " |
| "than what the user is asking for (%d cpus) " |
| "aborting.", |
| nodes_picked_cnt, cpus_picked_cnt, |
| step_spec->cpu_count); |
| goto cleanup; |
| } |
| } |
| |
| FREE_NULL_BITMAP(nodes_avail); |
| FREE_NULL_BITMAP(nodes_idle); |
| return nodes_picked; |
| |
| cleanup: |
| FREE_NULL_BITMAP(nodes_avail); |
| FREE_NULL_BITMAP(nodes_idle); |
| FREE_NULL_BITMAP(nodes_picked); |
| if (*return_code == SLURM_SUCCESS) |
| *return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| return NULL; |
| } |
| |
| /* |
| * _count_cpus - report how many cpus are associated with the identified nodes |
| * IN bitmap - map of nodes to tally |
| * RET cpu count |
| * globals: node_record_count - number of nodes configured |
| * node_record_table_ptr - pointer to global node table |
| */ |
| static int _count_cpus(bitstr_t *bitmap) |
| { |
| int i, sum; |
| |
| sum = 0; |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(bitmap, i) != 1) |
| continue; |
| if (slurmctld_conf.fast_schedule) |
| sum += node_record_table_ptr[i].config_ptr->cpus; |
| else |
| sum += node_record_table_ptr[i].cpus; |
| } |
| return sum; |
| } |
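| |
| /* NOTE: With fast_schedule set, the cpu count comes from the node's |
| * configuration record rather than the count actually reported by |
| * the node, hence the config_ptr branch above. */ |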
| |
| /* Update the step's core bitmaps, create as needed. |
| * Add the specified task count for a specific node in the job's |
| * and step's allocation */ |
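| /* Example (illustrative values): on a node with 2 sockets x 4 cores |
| * where the step needs 3 cpus, the first pass below scans core 0 of |
| * each socket, then core 1, and so on, taking only cores owned by the |
| * job and unused by other steps; the second pass over-subscribes busy |
| * cores only if the idle ones do not suffice. */ |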
| static void _pick_step_cores(struct step_record *step_ptr, |
| job_resources_t *job_resrcs_ptr, |
| int job_node_inx, uint16_t task_cnt) |
| { |
| int bit_offset, core_inx, i, sock_inx; |
| uint16_t sockets, cores; |
| int cpu_cnt = (int) task_cnt; |
| bool use_all_cores; |
| static int last_core_inx; |
| |
| if (!step_ptr->core_bitmap_job) { |
| step_ptr->core_bitmap_job = bit_alloc(bit_size(job_resrcs_ptr-> |
| core_bitmap)); |
| } |
| if (get_job_resources_cnt(job_resrcs_ptr, job_node_inx, &sockets, &cores)) |
| fatal("get_job_resources_cnt"); |
| |
| if (task_cnt == (cores * sockets)) |
| use_all_cores = true; |
| else |
| use_all_cores = false; |
| if (step_ptr->cpus_per_task > 0) |
| cpu_cnt *= step_ptr->cpus_per_task; |
| |
| /* select idle cores first */ |
| for (core_inx=0; core_inx<cores; core_inx++) { |
| for (sock_inx=0; sock_inx<sockets; sock_inx++) { |
| bit_offset = get_job_resources_offset(job_resrcs_ptr, |
| job_node_inx, |
| sock_inx, |
| core_inx); |
| if (bit_offset < 0) |
| fatal("get_job_resources_offset"); |
| if (!bit_test(job_resrcs_ptr->core_bitmap, bit_offset)) |
| continue; |
| if ((use_all_cores == false) && |
| bit_test(job_resrcs_ptr->core_bitmap_used, bit_offset)) |
| continue; |
| bit_set(job_resrcs_ptr->core_bitmap_used, bit_offset); |
| bit_set(step_ptr->core_bitmap_job, bit_offset); |
| #if 0 |
| info("step alloc N:%d S:%dC :%d", |
| job_node_inx, sock_inx, core_inx); |
| #endif |
| if (--cpu_cnt == 0) |
| return; |
| } |
| } |
| if (use_all_cores) |
| return; |
| |
| /* We need to over-subscribe one or more cores. |
| * Use last_core_inx to avoid putting all of the extra |
| * work onto core zero */ |
| verbose("job step needs to over-subscribe cores"); |
| last_core_inx = (last_core_inx + 1) % cores; |
| for (i=0; i<cores; i++) { |
| core_inx = (last_core_inx + i) % cores; |
| for (sock_inx=0; sock_inx<sockets; sock_inx++) { |
| bit_offset = get_job_resources_offset(job_resrcs_ptr, |
| job_node_inx, |
| sock_inx, |
| core_inx); |
| if (bit_offset < 0) |
| fatal("get_job_resources_offset"); |
| if (!bit_test(job_resrcs_ptr->core_bitmap, bit_offset)) |
| continue; |
| if (bit_test(step_ptr->core_bitmap_job, bit_offset)) |
| continue; /* already taken by this step */ |
| bit_set(step_ptr->core_bitmap_job, bit_offset); |
| #if 0 |
| info("step alloc N:%d S:%dC :%d", |
| job_node_inx, sock_inx, core_inx); |
| #endif |
| if (--cpu_cnt == 0) |
| return; |
| } |
| } |
| } |
| |
| |
| /* Update a job's record of allocated CPUs when a job step gets scheduled */ |
| extern void step_alloc_lps(struct step_record *step_ptr) |
| { |
| struct job_record *job_ptr = step_ptr->job_ptr; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| int cpus_alloc; |
| int i_node, i_first, i_last; |
| int job_node_inx = -1, step_node_inx = -1; |
| bool pick_step_cores = true; |
| |
| xassert(job_resrcs_ptr); |
| xassert(job_resrcs_ptr->cpus); |
| xassert(job_resrcs_ptr->cpus_used); |
| |
| if (step_ptr->step_layout == NULL) /* batch step */ |
| return; |
| |
| i_first = bit_ffs(job_resrcs_ptr->node_bitmap); |
| i_last = bit_fls(job_resrcs_ptr->node_bitmap); |
| if (i_first == -1) /* empty bitmap */ |
| return; |
| |
| #ifdef HAVE_BG |
| pick_step_cores = false; |
| #else |
| xassert(job_resrcs_ptr->core_bitmap); |
| xassert(job_resrcs_ptr->core_bitmap_used); |
| if (step_ptr->core_bitmap_job) { |
| /* "scontrol reconfig" of live system */ |
| pick_step_cores = false; |
| } else if ((step_ptr->exclusive == 0) || |
| (step_ptr->cpu_count == job_ptr->total_procs)) { |
| /* Step uses all of job's cores |
| * Just copy the bitmap to save time */ |
| step_ptr->core_bitmap_job = bit_copy(job_resrcs_ptr->core_bitmap); |
| pick_step_cores = false; |
| } |
| #endif |
| |
| if (step_ptr->mem_per_cpu && _is_mem_resv() && |
| ((job_resrcs_ptr->memory_allocated == NULL) || |
| (job_resrcs_ptr->memory_used == NULL))) { |
| error("step_alloc_lps: lack memory allocation details " |
| "to enforce memory limits for job %u", job_ptr->job_id); |
| step_ptr->mem_per_cpu = 0; |
| } |
| |
| for (i_node = i_first; i_node <= i_last; i_node++) { |
| if (!bit_test(job_resrcs_ptr->node_bitmap, i_node)) |
| continue; |
| job_node_inx++; |
| if (!bit_test(step_ptr->step_node_bitmap, i_node)) |
| continue; |
| step_node_inx++; |
| if (job_node_inx >= job_resrcs_ptr->nhosts) |
| fatal("step_alloc_lps: node index bad"); |
| /* NOTE: The --overcommit option can result in |
| * cpus_used[] having a higher value than cpus[] */ |
| cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] * |
| step_ptr->cpus_per_task; |
| job_resrcs_ptr->cpus_used[job_node_inx] += cpus_alloc; |
| if (step_ptr->mem_per_cpu && _is_mem_resv()) { |
| job_resrcs_ptr->memory_used[job_node_inx] += |
| (step_ptr->mem_per_cpu * cpus_alloc); |
| } |
| if (pick_step_cores) { |
| _pick_step_cores(step_ptr, job_resrcs_ptr, |
| job_node_inx, |
| step_ptr->step_layout-> |
| tasks[step_node_inx]); |
| } |
| if (slurm_get_debug_flags() & DEBUG_FLAG_CPU_BIND) |
| _dump_step_layout(step_ptr); |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| info("step alloc of %s procs: %u of %u", |
| node_record_table_ptr[i_node].name, |
| job_resrcs_ptr->cpus_used[job_node_inx], |
| job_resrcs_ptr->cpus[job_node_inx]); |
| } |
| if (step_node_inx == (step_ptr->step_layout->node_cnt - 1)) |
| break; |
| } |
| |
| } |
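| |
| /* NOTE: Every resource charged to the job in step_alloc_lps() above |
| * must be released in the same units by _step_dealloc_lps() below |
| * (cpus as tasks * cpus_per_task, memory as mem_per_cpu * cpus_alloc). */ |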
| |
| /* Dump a job step's CPU binding information. |
| * NOTE: The core_bitmap_job and node index are based upon |
| * the _job_ allocation */ |
| static void _dump_step_layout(struct step_record *step_ptr) |
| { |
| struct job_record* job_ptr = step_ptr->job_ptr; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| int i, bit_inx, core_inx, node_inx, rep, sock_inx; |
| |
| if ((step_ptr->core_bitmap_job == NULL) || |
| (job_resrcs_ptr == NULL) || (job_resrcs_ptr->cores_per_socket == NULL)) |
| return; |
| |
| info("===================="); |
| info("step_id:%u.%u", job_ptr->job_id, step_ptr->step_id); |
| for (i=0, bit_inx=0, node_inx=0; node_inx<job_resrcs_ptr->nhosts; i++) { |
| for (rep=0; rep<job_resrcs_ptr->sock_core_rep_count[i]; rep++) { |
| for (sock_inx=0; |
| sock_inx<job_resrcs_ptr->sockets_per_node[i]; |
| sock_inx++) { |
| for (core_inx=0; |
| core_inx<job_resrcs_ptr->cores_per_socket[i]; |
| core_inx++) { |
| if (bit_test(step_ptr-> |
| core_bitmap_job, |
| bit_inx++)) { |
| info("JobNode[%d] Socket[%d] " |
| "Core[%d] is allocated", |
| node_inx, sock_inx, |
| core_inx); |
| } |
| } |
| } |
| node_inx++; |
| } |
| } |
| info("===================="); |
| } |
| |
| static void _step_dealloc_lps(struct step_record *step_ptr) |
| { |
| struct job_record *job_ptr = step_ptr->job_ptr; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| int cpus_alloc; |
| int i_node, i_first, i_last; |
| int job_node_inx = -1, step_node_inx = -1; |
| |
| xassert(job_resrcs_ptr); |
| xassert(job_resrcs_ptr->cpus); |
| xassert(job_resrcs_ptr->cpus_used); |
| |
| if (step_ptr->step_layout == NULL) /* batch step */ |
| return; |
| |
| i_first = bit_ffs(job_resrcs_ptr->node_bitmap); |
| i_last = bit_fls(job_resrcs_ptr->node_bitmap); |
| if (i_first == -1) /* empty bitmap */ |
| return; |
| |
| if (step_ptr->mem_per_cpu && _is_mem_resv() && |
| ((job_resrcs_ptr->memory_allocated == NULL) || |
| (job_resrcs_ptr->memory_used == NULL))) { |
| error("_step_dealloc_lps: lack memory allocation details " |
| "to enforce memory limits for job %u", job_ptr->job_id); |
| step_ptr->mem_per_cpu = 0; |
| } |
| |
| for (i_node = i_first; i_node <= i_last; i_node++) { |
| if (!bit_test(job_resrcs_ptr->node_bitmap, i_node)) |
| continue; |
| job_node_inx++; |
| if (!bit_test(step_ptr->step_node_bitmap, i_node)) |
| continue; |
| step_node_inx++; |
| if (job_node_inx >= job_resrcs_ptr->nhosts) |
| fatal("_step_dealloc_lps: node index bad"); |
| cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] * |
| step_ptr->cpus_per_task; |
| if (job_resrcs_ptr->cpus_used[job_node_inx] >= cpus_alloc) |
| job_resrcs_ptr->cpus_used[job_node_inx] -= cpus_alloc; |
| else { |
| error("_step_dealloc_lps: cpu underflow for %u.%u", |
| job_ptr->job_id, step_ptr->step_id); |
| job_resrcs_ptr->cpus_used[job_node_inx] = 0; |
| } |
| if (step_ptr->mem_per_cpu && _is_mem_resv()) { |
| uint32_t mem_use = step_ptr->mem_per_cpu * cpus_alloc; |
| if (job_resrcs_ptr->memory_used[job_node_inx] >= mem_use) { |
| job_resrcs_ptr->memory_used[job_node_inx] -= |
| mem_use; |
| } else { |
| error("_step_dealloc_lps: " |
| "mem underflow for %u.%u", |
| job_ptr->job_id, step_ptr->step_id); |
| job_resrcs_ptr->memory_used[job_node_inx] = 0; |
| } |
| } |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| info("step dealloc of %s procs: %u of %u", |
| node_record_table_ptr[i_node].name, |
| job_resrcs_ptr->cpus_used[job_node_inx], |
| job_resrcs_ptr->cpus[job_node_inx]); |
| } |
| if (step_node_inx == (step_ptr->step_layout->node_cnt - 1)) |
| break; |
| } |
| |
| #ifndef HAVE_BG |
| xassert(job_resrcs_ptr->core_bitmap); |
| xassert(job_resrcs_ptr->core_bitmap_used); |
| if (step_ptr->core_bitmap_job) { |
| /* Mark the job's cores as no longer in use */ |
| bit_not(step_ptr->core_bitmap_job); |
| bit_and(job_resrcs_ptr->core_bitmap_used, |
| step_ptr->core_bitmap_job); |
| /* no need for bit_not(step_ptr->core_bitmap_job); */ |
| FREE_NULL_BITMAP(step_ptr->core_bitmap_job); |
| } |
| #endif |
| } |
| |
| /* |
| * step_create - create a step_record for the job identified by |
| * step_specs->job_id and set it up according to step_specs |
| * IN step_specs - job step specifications |
| * OUT new_step_record - pointer to the new step_record (NULL on error) |
| * IN kill_job_when_step_done - if set kill the job on step completion |
| * IN batch_step - if set then step is a batch script |
| * RET - 0 or error code |
| * NOTE: don't free the returned step_record because that is managed through |
| * the job. |
| */ |
| extern int |
| step_create(job_step_create_request_msg_t *step_specs, |
| struct step_record** new_step_record, |
| bool kill_job_when_step_done, bool batch_step) |
| { |
| struct step_record *step_ptr; |
| struct job_record *job_ptr; |
| bitstr_t *nodeset; |
| int cpus_per_task, node_count, ret_code, i; |
| time_t now = time(NULL); |
| char *step_node_list = NULL; |
| uint32_t orig_cpu_count; |
| |
| *new_step_record = NULL; |
| job_ptr = find_job_record (step_specs->job_id); |
| if (job_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID ; |
| |
| if ((job_ptr->details == NULL) || IS_JOB_SUSPENDED(job_ptr)) |
| return ESLURM_DISABLED; |
| |
| if (IS_JOB_PENDING(job_ptr)) { |
| /* NOTE: LSF creates a job allocation for batch jobs. |
| * After the allocation has been made, LSF submits a |
| * job to run in that allocation (sbatch --jobid= ...). |
| * If that job is pending either LSF messed up or LSF is |
| * not being used. We have seen this problem with Moab. */ |
| return ESLURM_DUPLICATE_JOB_ID; |
| } |
| |
| /* NOTE: We have already confirmed the UID originating |
| * the request is identical with step_specs->user_id */ |
| if (step_specs->user_id != job_ptr->user_id) |
| return ESLURM_ACCESS_DENIED ; |
| |
| if (batch_step) { |
| info("user %u attempting to run batch script within " |
| "an existing job", step_specs->user_id); |
| /* This seems hazardous to allow, but LSF seems to |
| * work this way, so don't treat it as an error. */ |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr) || |
| (job_ptr->end_time <= time(NULL))) |
| return ESLURM_ALREADY_DONE; |
| |
| if ((step_specs->task_dist != SLURM_DIST_CYCLIC) && |
| (step_specs->task_dist != SLURM_DIST_BLOCK) && |
| (step_specs->task_dist != SLURM_DIST_CYCLIC_CYCLIC) && |
| (step_specs->task_dist != SLURM_DIST_BLOCK_CYCLIC) && |
| (step_specs->task_dist != SLURM_DIST_CYCLIC_BLOCK) && |
| (step_specs->task_dist != SLURM_DIST_BLOCK_BLOCK) && |
| (step_specs->task_dist != SLURM_DIST_PLANE) && |
| (step_specs->task_dist != SLURM_DIST_ARBITRARY)) |
| return ESLURM_BAD_DIST; |
| |
| if ((step_specs->task_dist == SLURM_DIST_ARBITRARY) && |
| (!strcmp(slurmctld_conf.switch_type, "switch/elan"))) { |
| return ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED; |
| } |
| |
| if ((step_specs->host && |
| (strlen(step_specs->host) > MAX_STR_LEN)) || |
| (step_specs->node_list && |
| (strlen(step_specs->node_list) > MAX_STR_LEN)) || |
| (step_specs->network && |
| (strlen(step_specs->network) > MAX_STR_LEN)) || |
| (step_specs->name && |
| (strlen(step_specs->name) > MAX_STR_LEN)) || |
| (step_specs->ckpt_dir && |
| (strlen(step_specs->ckpt_dir) > MAX_STR_LEN))) |
| return ESLURM_PATHNAME_TOO_LONG; |
| |
| /* If the overcommit flag is set, we set cpu_count=0, |
| * which makes it so we don't check the available cpus |
| */ |
| orig_cpu_count = step_specs->cpu_count; |
| |
| if (step_specs->overcommit) { |
| if (step_specs->exclusive) { |
| /* Not really a legitimate combination, try to |
| * exclusively allocate one CPU per task */ |
| step_specs->overcommit = 0; |
| step_specs->cpu_count = step_specs->num_tasks; |
| } else |
| step_specs->cpu_count = 0; |
| } |
| |
| /* determine cpus_per_task value by reversing what srun does */ |
| if (step_specs->num_tasks < 1) |
| return ESLURM_BAD_TASK_COUNT; |
| |
| /* we set cpus_per_task to 0 if we can't spread them evenly |
| over the nodes (heterogeneous systems) */ |
| if (!step_specs->cpu_count |
| || (step_specs->cpu_count % step_specs->num_tasks)) |
| cpus_per_task = 0; |
| else { |
| cpus_per_task = step_specs->cpu_count / step_specs->num_tasks; |
| if (cpus_per_task < 1) |
| cpus_per_task = 1; |
| } |
| |
| if (step_specs->no_kill > 1) |
| step_specs->no_kill = 1; |
| |
| if (job_ptr->kill_on_step_done) |
| /* Don't start more steps, job already being cancelled */ |
| return ESLURM_ALREADY_DONE; |
| job_ptr->kill_on_step_done = kill_job_when_step_done; |
| |
| job_ptr->time_last_active = now; |
| nodeset = _pick_step_nodes(job_ptr, step_specs, |
| cpus_per_task, batch_step, &ret_code); |
| if (nodeset == NULL) |
| return ret_code; |
| node_count = bit_set_count(nodeset); |
| |
| if (step_specs->num_tasks == NO_VAL) { |
| if (step_specs->cpu_count != NO_VAL) |
| step_specs->num_tasks = step_specs->cpu_count; |
| else |
| step_specs->num_tasks = node_count; |
| } |
| |
| if (step_specs->num_tasks > |
| (node_count*slurmctld_conf.max_tasks_per_node)) { |
| error("step has invalid task count: %u", |
| step_specs->num_tasks); |
| bit_free(nodeset); |
| return ESLURM_BAD_TASK_COUNT; |
| } |
| |
| step_ptr = _create_step_record (job_ptr); |
| if (step_ptr == NULL) { |
| bit_free(nodeset); |
| return ESLURMD_TOOMANYSTEPS; |
| } |
| step_ptr->step_id = job_ptr->next_step_id++; |
| |
| /* set the step_record values */ |
| |
| /* Here is where the node list is set for the step */ |
| if (step_specs->node_list && |
| (step_specs->task_dist == SLURM_DIST_ARBITRARY)) { |
| step_node_list = xstrdup(step_specs->node_list); |
| xfree(step_specs->node_list); |
| step_specs->node_list = bitmap2node_name(nodeset); |
| } else { |
| step_node_list = bitmap2node_name(nodeset); |
| xfree(step_specs->node_list); |
| step_specs->node_list = xstrdup(step_node_list); |
| } |
| if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) { |
| verbose("got %s and %s looking for %d nodes", step_node_list, |
| step_specs->node_list, step_specs->node_count); |
| } |
| step_ptr->step_node_bitmap = nodeset; |
| |
| switch(step_specs->task_dist) { |
| case SLURM_DIST_CYCLIC: |
| case SLURM_DIST_CYCLIC_CYCLIC: |
| case SLURM_DIST_CYCLIC_BLOCK: |
| step_ptr->cyclic_alloc = 1; |
| break; |
| default: |
| step_ptr->cyclic_alloc = 0; |
| break; |
| } |
| |
| step_ptr->port = step_specs->port; |
| step_ptr->host = xstrdup(step_specs->host); |
| step_ptr->batch_step = batch_step; |
| step_ptr->cpus_per_task = (uint16_t)cpus_per_task; |
| step_ptr->mem_per_cpu = step_specs->mem_per_cpu; |
| step_ptr->ckpt_interval = step_specs->ckpt_interval; |
| step_ptr->ckpt_time = now; |
| step_ptr->cpu_count = orig_cpu_count; |
| step_ptr->exit_code = NO_VAL; |
| step_ptr->exclusive = step_specs->exclusive; |
| step_ptr->ckpt_dir = xstrdup(step_specs->ckpt_dir); |
| step_ptr->no_kill = step_specs->no_kill; |
| |
| /* step's name and network default to job's values if not |
| * specified in the step specification */ |
| if (step_specs->name && step_specs->name[0]) |
| step_ptr->name = xstrdup(step_specs->name); |
| else |
| step_ptr->name = xstrdup(job_ptr->name); |
| if (step_specs->network && step_specs->network[0]) |
| step_ptr->network = xstrdup(step_specs->network); |
| else |
| step_ptr->network = xstrdup(job_ptr->network); |
| |
| /* the step time_limit is recorded as submitted (INFINITE |
| * or partition->max_time by default), but the allocation |
| * time limits may cut it short */ |
| if (step_specs->time_limit == NO_VAL || step_specs->time_limit == 0 || |
| step_specs->time_limit == INFINITE) { |
| step_ptr->time_limit = INFINITE; |
| } else { |
| /* enforce partition limits if necessary */ |
| if ((step_specs->time_limit > job_ptr->part_ptr->max_time) && |
| slurmctld_conf.enforce_part_limits) { |
| info("_step_create: step time greater than partition's " |
| "(%u > %u)", step_specs->time_limit, |
| job_ptr->part_ptr->max_time); |
| delete_step_record (job_ptr, step_ptr->step_id); |
| return ESLURM_INVALID_TIME_LIMIT; |
| } |
| step_ptr->time_limit = step_specs->time_limit; |
| } |
| |
| /* a batch script does not need switch info */ |
| if (!batch_step) { |
| step_ptr->step_layout = |
| step_layout_create(step_ptr, |
| step_node_list, |
| step_specs->node_count, |
| step_specs->num_tasks, |
| (uint16_t)cpus_per_task, |
| step_specs->task_dist, |
| step_specs->plane_size); |
| xfree(step_node_list); |
| if (!step_ptr->step_layout) { |
| delete_step_record (job_ptr, step_ptr->step_id); |
| if (step_specs->mem_per_cpu) |
| return ESLURM_INVALID_TASK_MEMORY; |
| return SLURM_ERROR; |
| } |
| |
| if ((step_specs->resv_port_cnt != (uint16_t) NO_VAL) && |
| (step_specs->resv_port_cnt == 0)) { |
| /* reserved port count set to maximum task count on |
| * any node plus one */ |
| for (i=0; i<step_ptr->step_layout->node_cnt; i++) { |
| step_specs->resv_port_cnt = |
| MAX(step_specs->resv_port_cnt, |
| step_ptr->step_layout->tasks[i]); |
| } |
| step_specs->resv_port_cnt++; |
| } |
| if (step_specs->resv_port_cnt != (uint16_t) NO_VAL) { |
| step_ptr->resv_port_cnt = step_specs->resv_port_cnt; |
| i = resv_port_alloc(step_ptr); |
| if (i != SLURM_SUCCESS) { |
| delete_step_record (job_ptr, |
| step_ptr->step_id); |
| return i; |
| } |
| } |
| |
| if (switch_alloc_jobinfo (&step_ptr->switch_job) < 0) |
| fatal ("step_create: switch_alloc_jobinfo error"); |
| |
| if (switch_build_jobinfo(step_ptr->switch_job, |
| step_ptr->step_layout->node_list, |
| step_ptr->step_layout->tasks, |
| step_ptr->cyclic_alloc, |
| step_ptr->network) < 0) { |
| error("switch_build_jobinfo: %m"); |
| delete_step_record (job_ptr, step_ptr->step_id); |
| return ESLURM_INTERCONNECT_FAILURE; |
| } |
| step_alloc_lps(step_ptr); |
| } else |
| xfree(step_node_list); |
| if (checkpoint_alloc_jobinfo (&step_ptr->check_job) < 0) |
| fatal ("step_create: checkpoint_alloc_jobinfo error"); |
| *new_step_record = step_ptr; |
| jobacct_storage_g_step_start(acct_db_conn, step_ptr); |
| return SLURM_SUCCESS; |
| } |
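| |
| /* Illustrative only: a sketch of the call from a step-create RPC |
| * handler, which is assumed to have validated the requesting uid: |
| * struct step_record *step_rec = NULL; |
| * int rc = step_create(step_specs, &step_rec, false, false); |
| * if (rc != SLURM_SUCCESS) |
| * ... reply to srun with rc ... |
| */ |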
| |
| extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr, |
| char *step_node_list, |
| uint32_t node_count, |
| uint32_t num_tasks, |
| uint16_t cpus_per_task, |
| uint16_t task_dist, |
| uint32_t plane_size) |
| { |
| uint16_t cpus_per_node[node_count]; |
| uint32_t cpu_count_reps[node_count]; |
| int cpu_inx = -1; |
| int i, usable_cpus, usable_mem; |
| int set_nodes = 0/* , set_tasks = 0 */; |
| int pos = -1; |
| int first_bit, last_bit; |
| struct job_record *job_ptr = step_ptr->job_ptr; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| |
| xassert(job_resrcs_ptr); |
| xassert(job_resrcs_ptr->cpus); |
| xassert(job_resrcs_ptr->cpus_used); |
| |
| if (step_ptr->mem_per_cpu && _is_mem_resv() && |
| ((job_resrcs_ptr->memory_allocated == NULL) || |
| (job_resrcs_ptr->memory_used == NULL))) { |
| error("step_layout_create: lack memory allocation details " |
| "to enforce memory limits for job %u", job_ptr->job_id); |
| step_ptr->mem_per_cpu = 0; |
| } |
| |
| /* build the cpus-per-node arrays for the subset of nodes |
| * used by this job step */ |
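| /* The two arrays form a run-length encoding: usable cpu counts of |
| * {4, 4, 2} across three nodes would become cpus_per_node = {4, 2} |
| * with cpu_count_reps = {2, 1} (illustrative values only). */ |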
| first_bit = bit_ffs(step_ptr->step_node_bitmap); |
| last_bit = bit_fls(step_ptr->step_node_bitmap); |
| for (i = first_bit; i <= last_bit; i++) { |
| if (bit_test(step_ptr->step_node_bitmap, i)) { |
| /* find out the position in the job */ |
| pos = bit_get_pos_num(job_resrcs_ptr->node_bitmap, i); |
| if (pos == -1) |
| return NULL; |
| if (pos >= job_resrcs_ptr->nhosts) |
| fatal("step_layout_create: node index bad"); |
| if (step_ptr->exclusive) { |
| usable_cpus = job_resrcs_ptr->cpus[pos] - |
| job_resrcs_ptr->cpus_used[pos]; |
| } else |
| usable_cpus = job_resrcs_ptr->cpus[pos]; |
| if (step_ptr->mem_per_cpu && _is_mem_resv()) { |
| usable_mem = |
| job_resrcs_ptr->memory_allocated[pos]- |
| job_resrcs_ptr->memory_used[pos]; |
| usable_mem /= step_ptr->mem_per_cpu; |
| usable_cpus = MIN(usable_cpus, usable_mem); |
| } |
| if (usable_cpus <= 0) { |
| error("step_layout_create no usable cpus"); |
| return NULL; |
| } |
| debug3("step_layout cpus = %d pos = %d", |
| usable_cpus, pos); |
| |
| if ((cpu_inx == -1) || |
| (cpus_per_node[cpu_inx] != usable_cpus)) { |
| cpu_inx++; |
| |
| cpus_per_node[cpu_inx] = usable_cpus; |
| cpu_count_reps[cpu_inx] = 1; |
| } else |
| cpu_count_reps[cpu_inx]++; |
| set_nodes++; |
| /* FIXME: on a heterogeneous system running |
| the linear select plugin we could get a node |
| that doesn't have as many cpus as we decided |
| we needed for each task. This would result |
| in not getting a task for that node, which |
| is usually an error. This only happens when |
| the user doesn't specify how many |
| cpus_per_task they want and we have to come |
| up with a number, which in this case is |
| wrong. |
| */ |
| /* if (cpus_per_task > 0) */ |
| /* set_tasks += */ |
| /* (uint16_t)usable_cpus / cpus_per_task; */ |
| /* else */ |
| /* /\* since cpus_per_task is 0 we just */ |
| /* add the number of cpus available */ |
| /* for this job *\/ */ |
| /* set_tasks += usable_cpus; */ |
| /* info("usable_cpus is %d and set_tasks %d %d", */ |
| /* usable_cpus, set_tasks, cpus_per_task); */ |
| if (set_nodes == node_count) |
| break; |
| } |
| } |
| |
| /* if (set_tasks < num_tasks) { */ |
| /* error("Resources only available for %u of %u tasks", */ |
| /* set_tasks, num_tasks); */ |
| /* return NULL; */ |
| /* } */ |
| |
| /* layout the tasks on the nodes */ |
| return slurm_step_layout_create(step_node_list, |
| cpus_per_node, cpu_count_reps, |
| node_count, num_tasks, |
| cpus_per_task, |
| task_dist, |
| plane_size); |
| } |
| |
| /* Pack the data for a specific job step record |
| * IN step - pointer to a job step record |
| * IN/OUT buffer - location to store data, pointers automatically advanced |
| */ |
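| /* NOTE: The pack order below must stay in sync with the corresponding |
| * job step info unpack logic on the client side; changing either one |
| * alone breaks the RPC. */ |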
| static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer) |
| { |
| int task_cnt; |
| char *node_list = NULL; |
| time_t begin_time, run_time; |
| bitstr_t *pack_bitstr; |
| #ifdef HAVE_FRONT_END |
| /* On front-end systems the steps are only |
| * given the first node to run on, so we |
| * make them appear to span the job's |
| * entire node space (which they |
| * effectively do). |
| */ |
| task_cnt = step_ptr->job_ptr->num_procs; |
| node_list = step_ptr->job_ptr->nodes; |
| pack_bitstr = step_ptr->job_ptr->node_bitmap; |
| #else |
| pack_bitstr = step_ptr->step_node_bitmap; |
| if (step_ptr->step_layout) { |
| task_cnt = step_ptr->step_layout->task_cnt; |
| node_list = step_ptr->step_layout->node_list; |
| } else { |
| task_cnt = step_ptr->job_ptr->num_procs; |
| node_list = step_ptr->job_ptr->nodes; |
| } |
| #endif |
| pack32(step_ptr->job_ptr->job_id, buffer); |
| pack32(step_ptr->step_id, buffer); |
| pack16(step_ptr->ckpt_interval, buffer); |
| pack32(step_ptr->job_ptr->user_id, buffer); |
| #ifdef HAVE_BG |
| if (step_ptr->job_ptr->total_procs) |
| pack32(step_ptr->job_ptr->total_procs, buffer); |
| else |
| pack32(step_ptr->job_ptr->num_procs, buffer); |
| #else |
| pack32(step_ptr->cpu_count, buffer); |
| #endif |
| pack32(task_cnt, buffer); |
| |
| pack32(step_ptr->time_limit, buffer); |
| pack_time(step_ptr->start_time, buffer); |
| if (IS_JOB_SUSPENDED(step_ptr->job_ptr)) { |
| run_time = step_ptr->pre_sus_time; |
| } else { |
| begin_time = MAX(step_ptr->start_time, |
| step_ptr->job_ptr->suspend_time); |
| run_time = step_ptr->pre_sus_time + |
| difftime(time(NULL), begin_time); |
| } |
| pack_time(run_time, buffer); |
| packstr(step_ptr->job_ptr->partition, buffer); |
| packstr(step_ptr->resv_ports, buffer); |
| packstr(node_list, buffer); |
| packstr(step_ptr->name, buffer); |
| packstr(step_ptr->network, buffer); |
| pack_bit_fmt(pack_bitstr, buffer); |
| packstr(step_ptr->ckpt_dir, buffer); |
| } |
| |
| /* |
| * pack_ctld_job_step_info_response_msg - packs job step info |
| * IN job_id - specific id or NO_VAL for all |
| * IN step_id - specific id or NO_VAL for all |
| * IN uid - user issuing request |
| * IN show_flags - job step filtering options |
| * OUT buffer - location to store data, pointers automatically advanced |
| * RET - 0 or error code |
| * NOTE: MUST free_buf buffer |
| */ |
| extern int pack_ctld_job_step_info_response_msg( |
| uint32_t job_id, uint32_t step_id, uid_t uid, |
| uint16_t show_flags, Buf buffer) |
| { |
| ListIterator job_iterator; |
| ListIterator step_iterator; |
| int error_code = 0; |
| uint32_t steps_packed = 0, tmp_offset; |
| struct step_record *step_ptr; |
| struct job_record *job_ptr; |
| time_t now = time(NULL); |
| int valid_job = 0; |
| |
| pack_time(now, buffer); |
| pack32(steps_packed, buffer); /* steps_packed placeholder */ |
| |
| part_filter_set(uid); |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = list_next(job_iterator))) { |
| if ((job_id != NO_VAL) && (job_ptr->job_id != job_id)) |
| continue; |
| |
| if (((show_flags & SHOW_ALL) == 0) |
| && (job_ptr->part_ptr) |
| && (job_ptr->part_ptr->hidden)) |
| continue; |
| |
| if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) |
| && (job_ptr->user_id != uid) |
| && !validate_super_user(uid)) |
| continue; |
| |
| valid_job = 1; |
| |
| step_iterator = list_iterator_create(job_ptr->step_list); |
| while ((step_ptr = list_next(step_iterator))) { |
| if ((step_id != NO_VAL) |
| && (step_ptr->step_id != step_id)) |
| continue; |
| _pack_ctld_job_step_info(step_ptr, buffer); |
| steps_packed++; |
| } |
| list_iterator_destroy(step_iterator); |
| } |
| list_iterator_destroy(job_iterator); |
| |
| if (list_count(job_list) && !valid_job && !steps_packed) |
| error_code = ESLURM_INVALID_JOB_ID; |
| |
| part_filter_clear(); |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, 0); |
| pack_time(now, buffer); |
| pack32(steps_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| |
| return error_code; |
| } |
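| |
| /* |
| * The function above packs a placeholder record count, then rewinds |
| * the buffer offset to overwrite it with the real count once all steps |
| * have been packed, avoiding a separate counting pass. A hypothetical |
| * caller sketch (not from this file): |
| * |
| *	Buf buffer = init_buf(BUF_SIZE); |
| *	int rc = pack_ctld_job_step_info_response_msg(job_id, NO_VAL, |
| *			uid, show_flags, buffer); |
| *	if (rc == SLURM_SUCCESS) { |
| *		... send the buffer contents to the requester ... |
| *	} |
| *	free_buf(buffer);	(the caller must free, per the NOTE above) |
| */ |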
| |
| /* |
| * kill_step_on_node - determine if the specified job has any job steps |
| * allocated to the specified node and kill them unless the no_kill |
| * flag is set on the step |
| * IN job_ptr - pointer to an active job record |
| * IN node_ptr - pointer to a node record |
| * RET count of killed job steps |
| */ |
| extern int kill_step_on_node(struct job_record *job_ptr, |
| struct node_record *node_ptr) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| int found = 0; |
| int bit_position; |
| |
| if ((job_ptr == NULL) || (node_ptr == NULL)) |
| return found; |
| |
| bit_position = node_ptr - node_record_table_ptr; |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if (step_ptr->no_kill || |
| (bit_test(step_ptr->step_node_bitmap, bit_position) == 0)) |
| continue; |
| info("killing step %u.%u on down node %s", |
| job_ptr->job_id, step_ptr->step_id, node_ptr->name); |
| srun_step_complete(step_ptr); |
| signal_step_tasks(step_ptr, SIGKILL, REQUEST_TERMINATE_TASKS); |
| found++; |
| } |
| |
| list_iterator_destroy (step_iterator); |
| return found; |
| } |
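| |
| /* |
| * A sketch of how a node-failure path might use kill_step_on_node() |
| * (this caller is illustrative, not from this file): when a node is |
| * marked DOWN, each job allocated to it can be scanned and any of its |
| * steps on that node signaled: |
| * |
| *	struct node_record *node_ptr = find_node_record(node_name); |
| *	if (node_ptr) { |
| *		int steps = kill_step_on_node(job_ptr, node_ptr); |
| *		if (steps) |
| *			debug("killed %d steps of job %u", |
| *			      steps, job_ptr->job_id); |
| *	} |
| */ |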
| |
| /* |
| * job_step_checkpoint - perform a checkpoint operation on a job step |
| * IN ckpt_ptr - checkpoint request message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint(checkpoint_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd conn_fd) |
| { |
| int rc = SLURM_SUCCESS; |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| checkpoint_resp_msg_t resp_data; |
| slurm_msg_t resp_msg; |
| |
| slurm_msg_t_init(&resp_msg); |
| |
| /* find the job */ |
| job_ptr = find_job_record (ckpt_ptr->job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| if ((uid != job_ptr->user_id) && (uid != 0)) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } else if (IS_JOB_SUSPENDED(job_ptr)) { |
| /* job can't get cycles for checkpoint |
| * if it is already suspended */ |
| rc = ESLURM_DISABLED; |
| goto reply; |
| } else if (!IS_JOB_RUNNING(job_ptr)) { |
| rc = ESLURM_ALREADY_DONE; |
| goto reply; |
| } |
| |
| memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t)); |
| step_ptr = find_step_record(job_ptr, ckpt_ptr->step_id); |
| if (step_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| } else { |
| if (ckpt_ptr->image_dir == NULL) { |
| ckpt_ptr->image_dir = xstrdup(step_ptr->ckpt_dir); |
| } |
| xstrfmtcat(ckpt_ptr->image_dir, "/%u.%u", job_ptr->job_id, |
| step_ptr->step_id); |
| |
| rc = checkpoint_op(ckpt_ptr->job_id, ckpt_ptr->step_id, |
| step_ptr, ckpt_ptr->op, ckpt_ptr->data, |
| ckpt_ptr->image_dir, &resp_data.event_time, |
| &resp_data.error_code, |
| &resp_data.error_msg); |
| last_job_update = time(NULL); |
| } |
| |
| reply: |
| if ((rc == SLURM_SUCCESS) && |
| ((ckpt_ptr->op == CHECK_ABLE) || (ckpt_ptr->op == CHECK_ERROR))) { |
| resp_msg.msg_type = RESPONSE_CHECKPOINT; |
| resp_msg.data = &resp_data; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| } else { |
| return_code_msg_t rc_msg; |
| rc_msg.return_code = rc; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| resp_msg.data = &rc_msg; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| } |
| return rc; |
| } |
| |
| /* |
| * job_step_checkpoint_comp - note job step checkpoint completion |
| * IN ckpt_ptr - checkpoint complete status message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd conn_fd) |
| { |
| int rc = SLURM_SUCCESS; |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| slurm_msg_t resp_msg; |
| return_code_msg_t rc_msg; |
| |
| slurm_msg_t_init(&resp_msg); |
| |
| /* find the job */ |
| job_ptr = find_job_record (ckpt_ptr->job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| if ((uid != job_ptr->user_id) && (uid != 0)) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } else if (!IS_JOB_RUNNING(job_ptr) && |
| !IS_JOB_SUSPENDED(job_ptr)) { |
| rc = ESLURM_ALREADY_DONE; |
| goto reply; |
| } |
| |
| step_ptr = find_step_record(job_ptr, ckpt_ptr->step_id); |
| if (step_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } else { |
| rc = checkpoint_comp((void *)step_ptr, ckpt_ptr->begin_time, |
| ckpt_ptr->error_code, ckpt_ptr->error_msg); |
| last_job_update = time(NULL); |
| } |
| |
| reply: |
| rc_msg.return_code = rc; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| resp_msg.data = &rc_msg; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| return rc; |
| } |
| |
| /* |
| * job_step_checkpoint_task_comp - note task checkpoint completion |
| * IN ckpt_ptr - checkpoint task complete status message |
| * IN uid - user id of the user issuing the RPC |
| * IN conn_fd - file descriptor on which to send reply |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_step_checkpoint_task_comp(checkpoint_task_comp_msg_t *ckpt_ptr, |
| uid_t uid, slurm_fd conn_fd) |
| { |
| int rc = SLURM_SUCCESS; |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| slurm_msg_t resp_msg; |
| return_code_msg_t rc_msg; |
| |
| slurm_msg_t_init(&resp_msg); |
| |
| /* find the job */ |
| job_ptr = find_job_record (ckpt_ptr->job_id); |
| if (job_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| if ((uid != job_ptr->user_id) && (uid != 0)) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| rc = ESLURM_JOB_PENDING; |
| goto reply; |
| } else if (!IS_JOB_RUNNING(job_ptr) && |
| !IS_JOB_SUSPENDED(job_ptr)) { |
| rc = ESLURM_ALREADY_DONE; |
| goto reply; |
| } |
| |
| step_ptr = find_step_record(job_ptr, ckpt_ptr->step_id); |
| if (step_ptr == NULL) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } else { |
| rc = checkpoint_task_comp((void *)step_ptr, |
| ckpt_ptr->task_id, ckpt_ptr->begin_time, |
| ckpt_ptr->error_code, ckpt_ptr->error_msg); |
| last_job_update = time(NULL); |
| } |
| |
| reply: |
| rc_msg.return_code = rc; |
| resp_msg.msg_type = RESPONSE_SLURM_RC; |
| resp_msg.data = &rc_msg; |
| (void) slurm_send_node_msg(conn_fd, &resp_msg); |
| return rc; |
| } |
| |
| /* |
| * step_partial_comp - Note the completion of a job step on at least |
| * some of its nodes |
| * IN req - step_completion_msg RPC from slurmstepd |
| * IN uid - UID issuing the request |
| * OUT rem - count of nodes for which responses are still pending |
| * OUT max_rc - highest return code for any step thus far |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, |
| int *rem, uint32_t *max_rc) |
| { |
| struct job_record *job_ptr; |
| struct step_record *step_ptr; |
| int nodes, rem_nodes; |
| |
| /* find the job, step, and validate input */ |
| job_ptr = find_job_record (req->job_id); |
| if (job_ptr == NULL) { |
| info("step_partial_comp: JobID=%u invalid", req->job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| if (IS_JOB_PENDING(job_ptr)) { |
| info("step_partial_comp: JobID=%u pending", req->job_id); |
| return ESLURM_JOB_PENDING; |
| } |
| |
| if ((!validate_super_user(uid)) && (uid != job_ptr->user_id)) { |
| /* Normally sent by slurmstepd; sent by srun on some failures */ |
| error("Security violation: " |
| "REQUEST_STEP_COMPLETE RPC for job %u from uid=%u", |
| job_ptr->job_id, (unsigned int) uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| step_ptr = find_step_record(job_ptr, req->job_step_id); |
| if (step_ptr == NULL) { |
| info("step_partial_comp: StepID=%u.%u invalid", |
| req->job_id, req->job_step_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| if (step_ptr->batch_step) { |
| if (rem) |
| *rem = 0; |
| step_ptr->exit_code = req->step_rc; |
| if (max_rc) |
| *max_rc = step_ptr->exit_code; |
| jobacct_gather_g_aggregate(step_ptr->jobacct, req->jobacct); |
| /* We do not want to delete the step record here; it |
| is deleted again shortly after this, and deleting |
| it now would make that second lookup fail. */ |
| //delete_step_record(job_ptr, req->job_step_id); |
| return SLURM_SUCCESS; |
| } |
| if (req->range_last < req->range_first) { |
| error("step_partial_comp: JobID=%u range=%u-%u", |
| req->job_id, req->range_first, req->range_last); |
| return EINVAL; |
| } |
| |
| jobacct_gather_g_aggregate(step_ptr->jobacct, req->jobacct); |
| |
| if (!step_ptr->exit_node_bitmap) { |
| /* initialize the node bitmap for exited nodes */ |
| nodes = bit_set_count(step_ptr->step_node_bitmap); |
| if (req->range_last >= nodes) { /* range is zero origin */ |
| error("step_partial_comp: JobID=%u last=%u, nodes=%d", |
| req->job_id, req->range_last, nodes); |
| return EINVAL; |
| } |
| step_ptr->exit_node_bitmap = bit_alloc(nodes); |
| if (step_ptr->exit_node_bitmap == NULL) |
| fatal("bit_alloc: %m"); |
| step_ptr->exit_code = req->step_rc; |
| } else { |
| nodes = bit_size(step_ptr->exit_node_bitmap); |
| if (req->range_last >= nodes) { /* range is zero origin */ |
| error("step_partial_comp: JobID=%u last=%u, nodes=%d", |
| req->job_id, req->range_last, nodes); |
| return EINVAL; |
| } |
| step_ptr->exit_code = MAX(step_ptr->exit_code, req->step_rc); |
| } |
| |
| bit_nset(step_ptr->exit_node_bitmap, |
| req->range_first, req->range_last); |
| rem_nodes = bit_clear_count(step_ptr->exit_node_bitmap); |
| if (rem) |
| *rem = rem_nodes; |
| if (rem_nodes == 0) { |
| /* release all switch windows */ |
| if (step_ptr->switch_job) { |
| debug2("full switch release for step %u.%u, " |
| "nodes %s", req->job_id, |
| req->job_step_id, |
| step_ptr->step_layout->node_list); |
| switch_g_job_step_complete( |
| step_ptr->switch_job, |
| step_ptr->step_layout->node_list); |
| switch_free_jobinfo (step_ptr->switch_job); |
| step_ptr->switch_job = NULL; |
| } |
| } else if (switch_g_part_comp() && step_ptr->switch_job) { |
| /* release switch windows on completed nodes, |
| * must translate range numbers to nodelist */ |
| hostlist_t hl; |
| char *node_list; |
| int new_size = 8096; |
| |
| hl = _step_range_to_hostlist(step_ptr, |
| req->range_first, req->range_last); |
| node_list = (char *) xmalloc(new_size); |
| while (hostlist_ranged_string(hl, new_size, |
| node_list) == -1) { |
| new_size *= 2; |
| xrealloc(node_list, new_size); |
| } |
| debug2("partitial switch release for step %u.%u, " |
| "nodes %s", req->job_id, |
| req->job_step_id, node_list); |
| switch_g_job_step_part_comp( |
| step_ptr->switch_job, node_list); |
| hostlist_destroy(hl); |
| xfree(node_list); |
| } |
| |
| if (max_rc) |
| *max_rc = step_ptr->exit_code; |
| |
| return SLURM_SUCCESS; |
| } |
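| |
| /* |
| * Worked example of the range bookkeeping above (values hypothetical): |
| * a step spanning 8 nodes gets an 8-bit, zero-origin exit_node_bitmap. |
| * A completion message with range_first=2 and range_last=4 sets bits |
| * 2-4, so bit_clear_count() returns 5 and *rem is set to 5; if the |
| * switch plugin supports partial completion, windows are released for |
| * just those nodes. Once later messages cover the remaining nodes, |
| * rem_nodes reaches 0 and the step's switch allocation is freed in |
| * full. |
| */ |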
| |
| /* Convert a range of nodes allocated to a step to a hostlist with |
| * the names of those nodes */ |
| static hostlist_t _step_range_to_hostlist(struct step_record *step_ptr, |
| uint32_t range_first, uint32_t range_last) |
| { |
| int i, node_inx = -1; |
| hostlist_t hl = hostlist_create(""); |
| |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(step_ptr->step_node_bitmap, i) == 0) |
| continue; |
| node_inx++; |
| if ((node_inx >= range_first) |
| && (node_inx <= range_last)) { |
| hostlist_push(hl, |
| node_record_table_ptr[i].name); |
| } |
| } |
| |
| return hl; |
| } |
| |
| /* Convert a single node name to its offset within a step's |
| * node allocation. Returns -1 on error */ |
| static int _step_hostname_to_inx(struct step_record *step_ptr, |
| char *node_name) |
| { |
| struct node_record *node_ptr; |
| int i, node_inx, node_offset = 0; |
| |
| node_ptr = find_node_record(node_name); |
| if (node_ptr == NULL) |
| return -1; |
| node_inx = node_ptr - node_record_table_ptr; |
| |
| for (i = 0; i < node_inx; i++) { |
| if (bit_test(step_ptr->step_node_bitmap, i)) |
| node_offset++; |
| } |
| return node_offset; |
| } |
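| |
| /* |
| * Worked example (allocation hypothetical): if the step's node bitmap |
| * has bits 1, 3 and 5 set (say nodes tux1, tux3 and tux5), then |
| * _step_hostname_to_inx(step_ptr, "tux3") counts the set bits below |
| * index 3 -- only bit 1 -- and returns 1, i.e. tux3 is the second |
| * node of the step's allocation. |
| */ |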
| |
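| /* |
| * step_epilog_complete - release switch windows of a job's steps on a |
| *	node once the epilog completes there, if the switch plugin |
| *	supports partial step completion |
| * IN job_ptr - pointer to an active job record |
| * IN node_name - name of the node whose epilog has completed |
| * RET count of steps for which a partial switch release was performed |
| */ |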
| extern int step_epilog_complete(struct job_record *job_ptr, |
| char *node_name) |
| { |
| int rc = 0, node_inx, step_offset; |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| struct node_record *node_ptr; |
| |
| if (!switch_g_part_comp()) { |
| /* don't bother with partial completions */ |
| return 0; |
| } |
| if ((node_ptr = find_node_record(node_name)) == NULL) |
| return 0; |
| node_inx = node_ptr - node_record_table_ptr; |
| |
| step_iterator = list_iterator_create(job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| if ((!step_ptr->switch_job) |
| || (bit_test(step_ptr->step_node_bitmap, node_inx) == 0)) |
| continue; |
| if (step_ptr->exit_node_bitmap) { |
| step_offset = _step_hostname_to_inx( |
| step_ptr, node_name); |
| if ((step_offset < 0) |
| || bit_test(step_ptr->exit_node_bitmap, |
| step_offset)) |
| continue; |
| bit_set(step_ptr->exit_node_bitmap, |
| step_offset); |
| } |
| rc++; |
| debug2("partitial switch release for step %u.%u, " |
| "epilog on %s", job_ptr->job_id, |
| step_ptr->step_id, node_name); |
| switch_g_job_step_part_comp( |
| step_ptr->switch_job, node_name); |
| } |
| list_iterator_destroy (step_iterator); |
| |
| return rc; |
| } |
| |
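| /* Update one step's pre-suspension run time when its job is suspended */ |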
| static void |
| _suspend_job_step(struct job_record *job_ptr, |
| struct step_record *step_ptr, time_t now) |
| { |
| if ((job_ptr->suspend_time) |
| && (job_ptr->suspend_time > step_ptr->start_time)) { |
| step_ptr->pre_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else { |
| step_ptr->pre_sus_time += |
| difftime(now, step_ptr->start_time); |
| } |
| } |
| |
| /* Update time stamps for job step suspend */ |
| extern void |
| suspend_job_step(struct job_record *job_ptr) |
| { |
| time_t now = time(NULL); |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| _suspend_job_step(job_ptr, step_ptr, now); |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| |
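| /* Update one step's total suspended time when its job is resumed */ |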
| static void |
| _resume_job_step(struct job_record *job_ptr, |
| struct step_record *step_ptr, time_t now) |
| { |
| if ((job_ptr->suspend_time) && |
| (job_ptr->suspend_time < step_ptr->start_time)) { |
| step_ptr->tot_sus_time += |
| difftime(now, step_ptr->start_time); |
| } else { |
| step_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } |
| } |
| |
| /* Update time stamps for job step resume */ |
| extern void |
| resume_job_step(struct job_record *job_ptr) |
| { |
| time_t now = time(NULL); |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| _resume_job_step(job_ptr, step_ptr, now); |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| |
| |
| /* |
| * dump_job_step_state - dump the state of a specific job step to a buffer, |
| * load with load_step_state |
| * IN step_ptr - pointer to job step for which information is to be dumped |
| * IN/OUT buffer - location to store data, pointers automatically advanced |
| */ |
| extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer) |
| { |
| pack32(step_ptr->step_id, buffer); |
| pack16(step_ptr->cyclic_alloc, buffer); |
| pack16(step_ptr->port, buffer); |
| pack16(step_ptr->ckpt_interval, buffer); |
| pack16(step_ptr->cpus_per_task, buffer); |
| pack16(step_ptr->resv_port_cnt, buffer); |
| |
| pack8(step_ptr->no_kill, buffer); |
| |
| pack32(step_ptr->cpu_count, buffer); |
| pack32(step_ptr->mem_per_cpu, buffer); |
| pack32(step_ptr->exit_code, buffer); |
| if (step_ptr->exit_code != NO_VAL) { |
| pack_bit_fmt(step_ptr->exit_node_bitmap, buffer); |
| pack16((uint16_t) bit_size(step_ptr->exit_node_bitmap), |
| buffer); |
| } |
| if (step_ptr->core_bitmap_job) { |
| uint32_t core_size = bit_size(step_ptr->core_bitmap_job); |
| pack32(core_size, buffer); |
| pack_bit_fmt(step_ptr->core_bitmap_job, buffer); |
| } else |
| pack32((uint32_t) 0, buffer); |
| |
| pack32(step_ptr->time_limit, buffer); |
| pack_time(step_ptr->start_time, buffer); |
| pack_time(step_ptr->pre_sus_time, buffer); |
| pack_time(step_ptr->tot_sus_time, buffer); |
| pack_time(step_ptr->ckpt_time, buffer); |
| |
| packstr(step_ptr->host, buffer); |
| packstr(step_ptr->resv_ports, buffer); |
| packstr(step_ptr->name, buffer); |
| packstr(step_ptr->network, buffer); |
| packstr(step_ptr->ckpt_dir, buffer); |
| pack16(step_ptr->batch_step, buffer); |
| if (!step_ptr->batch_step) { |
| pack_slurm_step_layout(step_ptr->step_layout, buffer); |
| switch_pack_jobinfo(step_ptr->switch_job, buffer); |
| } |
| checkpoint_pack_jobinfo(step_ptr->check_job, buffer); |
| } |
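| |
| /* |
| * NOTE: any change to the fields packed above must be mirrored, in the |
| * same order, by load_step_state() below; the state file has no |
| * per-field framing, so a mismatch will misalign every field that |
| * follows it. |
| */ |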
| |
| /* |
| * Create a new job step from data in a buffer (as created by |
| * dump_job_step_state) |
| * IN/OUT job_ptr - pointer to a job for which the step is to be loaded |
| * IN/OUT buffer - location to get data from, pointers advanced |
| */ |
| extern int load_step_state(struct job_record *job_ptr, Buf buffer) |
| { |
| struct step_record *step_ptr = NULL; |
| uint8_t no_kill; |
| uint16_t cyclic_alloc, port, batch_step, bit_cnt; |
| uint16_t ckpt_interval, cpus_per_task, resv_port_cnt; |
| uint32_t core_size, cpu_count, exit_code, mem_per_cpu, name_len; |
| uint32_t step_id, time_limit; |
| time_t start_time, pre_sus_time, tot_sus_time, ckpt_time; |
| char *host = NULL, *ckpt_dir = NULL, *core_job = NULL; |
| char *resv_ports = NULL, *name = NULL, *network = NULL, *bit_fmt = NULL; |
| switch_jobinfo_t *switch_tmp = NULL; |
| check_jobinfo_t check_tmp = NULL; |
| slurm_step_layout_t *step_layout = NULL; |
| |
| safe_unpack32(&step_id, buffer); |
| safe_unpack16(&cyclic_alloc, buffer); |
| safe_unpack16(&port, buffer); |
| safe_unpack16(&ckpt_interval, buffer); |
| safe_unpack16(&cpus_per_task, buffer); |
| safe_unpack16(&resv_port_cnt, buffer); |
| |
| safe_unpack8(&no_kill, buffer); |
| |
| safe_unpack32(&cpu_count, buffer); |
| safe_unpack32(&mem_per_cpu, buffer); |
| safe_unpack32(&exit_code, buffer); |
| if (exit_code != NO_VAL) { |
| safe_unpackstr_xmalloc(&bit_fmt, &name_len, buffer); |
| safe_unpack16(&bit_cnt, buffer); |
| } |
| safe_unpack32(&core_size, buffer); |
| if (core_size) |
| safe_unpackstr_xmalloc(&core_job, &name_len, buffer); |
| |
| safe_unpack32(&time_limit, buffer); |
| safe_unpack_time(&start_time, buffer); |
| safe_unpack_time(&pre_sus_time, buffer); |
| safe_unpack_time(&tot_sus_time, buffer); |
| safe_unpack_time(&ckpt_time, buffer); |
| |
| safe_unpackstr_xmalloc(&host, &name_len, buffer); |
| safe_unpackstr_xmalloc(&resv_ports, &name_len, buffer); |
| safe_unpackstr_xmalloc(&name, &name_len, buffer); |
| safe_unpackstr_xmalloc(&network, &name_len, buffer); |
| safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer); |
| safe_unpack16(&batch_step, buffer); |
| if (!batch_step) { |
| if (unpack_slurm_step_layout(&step_layout, buffer)) |
| goto unpack_error; |
| switch_alloc_jobinfo(&switch_tmp); |
| if (switch_unpack_jobinfo(switch_tmp, buffer)) |
| goto unpack_error; |
| } |
| checkpoint_alloc_jobinfo(&check_tmp); |
| if (checkpoint_unpack_jobinfo(check_tmp, buffer)) |
| goto unpack_error; |
| |
| /* perform validity tests where possible */ |
| if (cyclic_alloc > 1) { |
| error("Invalid data for job %u.%u: cyclic_alloc=%u", |
| job_ptr->job_id, step_id, cyclic_alloc); |
| goto unpack_error; |
| } |
| if (no_kill > 1) { |
| error("Invalid data for job %u.%u: no_kill=%u", |
| job_ptr->job_id, step_id, no_kill); |
| goto unpack_error; |
| } |
| |
| step_ptr = find_step_record(job_ptr, step_id); |
| if (step_ptr == NULL) |
| step_ptr = _create_step_record(job_ptr); |
| if (step_ptr == NULL) |
| goto unpack_error; |
| |
| /* set new values */ |
| step_ptr->step_id = step_id; |
| step_ptr->cpu_count = cpu_count; |
| step_ptr->cpus_per_task= cpus_per_task; |
| step_ptr->cyclic_alloc = cyclic_alloc; |
| step_ptr->resv_port_cnt= resv_port_cnt; |
| step_ptr->resv_ports = resv_ports; |
| step_ptr->name = name; |
| step_ptr->network = network; |
| step_ptr->no_kill = no_kill; |
| step_ptr->ckpt_dir = ckpt_dir; |
| step_ptr->port = port; |
| step_ptr->ckpt_interval= ckpt_interval; |
| step_ptr->mem_per_cpu = mem_per_cpu; |
| step_ptr->host = host; |
| host = NULL; /* re-used, nothing left to free */ |
| step_ptr->batch_step = batch_step; |
| step_ptr->start_time = start_time; |
| step_ptr->time_limit = time_limit; |
| step_ptr->pre_sus_time = pre_sus_time; |
| step_ptr->tot_sus_time = tot_sus_time; |
| step_ptr->ckpt_time = ckpt_time; |
| |
| slurm_step_layout_destroy(step_ptr->step_layout); |
| step_ptr->step_layout = step_layout; |
| |
| step_ptr->switch_job = switch_tmp; |
| step_ptr->check_job = check_tmp; |
| |
| step_ptr->exit_code = exit_code; |
| if (bit_fmt) { |
| /* NOTE: This is only recovered if a job step completion |
| * is actively in progress at step save time. Otherwise |
| * the bitmap is NULL. */ |
| step_ptr->exit_node_bitmap = bit_alloc(bit_cnt); |
| if (step_ptr->exit_node_bitmap == NULL) |
| fatal("bit_alloc: %m"); |
| if (bit_unfmt(step_ptr->exit_node_bitmap, bit_fmt)) { |
| error("error recovering exit_node_bitmap from %s", |
| bit_fmt); |
| } |
| xfree(bit_fmt); |
| } |
| if (core_size) { |
| step_ptr->core_bitmap_job = bit_alloc(core_size); |
| if (step_ptr->core_bitmap_job == NULL) |
| fatal("bit_alloc: %m"); |
| if (bit_unfmt(step_ptr->core_bitmap_job, core_job)) { |
| error("error recovering core_bitmap_job from %s", |
| core_job); |
| } |
| xfree(core_job); |
| } |
| |
| if (step_ptr->step_layout && step_ptr->step_layout->node_list) { |
| switch_g_job_step_allocated(switch_tmp, |
| step_ptr->step_layout->node_list); |
| } else { |
| switch_g_job_step_allocated(switch_tmp, NULL); |
| } |
| info("recovered job step %u.%u", job_ptr->job_id, step_id); |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| xfree(host); |
| xfree(resv_ports); |
| xfree(name); |
| xfree(network); |
| xfree(ckpt_dir); |
| xfree(bit_fmt); |
| xfree(core_job); |
| if (switch_tmp) |
| switch_free_jobinfo(switch_tmp); |
| slurm_step_layout_destroy(step_layout); |
| return SLURM_FAILURE; |
| } |
| |
| /* Perform periodic job step checkpoints (per user request) */ |
| extern void step_checkpoint(void) |
| { |
| static int ckpt_run = -1; |
| time_t now = time(NULL), ckpt_due; |
| ListIterator job_iterator; |
| struct job_record *job_ptr; |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| time_t event_time; |
| uint32_t error_code; |
| char *error_msg; |
| checkpoint_msg_t ckpt_req; |
| |
| /* Exit if "checkpoint/none" is configured */ |
| if (ckpt_run == -1) { |
| char *ckpt_type = slurm_get_checkpoint_type(); |
| if (strcasecmp(ckpt_type, "checkpoint/none")) |
| ckpt_run = 1; |
| else |
| ckpt_run = 0; |
| xfree(ckpt_type); |
| } |
| if (ckpt_run == 0) |
| return; |
| |
| job_iterator = list_iterator_create(job_list); |
| while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
| if (!IS_JOB_RUNNING(job_ptr)) |
| continue; |
| if (job_ptr->batch_flag && |
| (job_ptr->ckpt_interval != 0)) { /* periodic job ckpt */ |
| ckpt_due = job_ptr->ckpt_time + |
| (job_ptr->ckpt_interval * 60); |
| if (ckpt_due > now) |
| continue; |
| /* |
| * DO NOT initiate a checkpoint request if the job has |
| * only just started, in case it is restarting from a |
| * checkpoint. |
| */ |
| ckpt_due = job_ptr->start_time + |
| (job_ptr->ckpt_interval * 60); |
| if (ckpt_due > now) |
| continue; |
| |
| ckpt_req.op = CHECK_CREATE; |
| ckpt_req.data = 0; |
| ckpt_req.job_id = job_ptr->job_id; |
| ckpt_req.step_id = SLURM_BATCH_SCRIPT; |
| ckpt_req.image_dir = NULL; |
| job_checkpoint(&ckpt_req, getuid(), -1); |
| job_ptr->ckpt_time = now; |
| last_job_update = now; |
| continue; /* ignore periodic step ckpt */ |
| } |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) |
| list_next (step_iterator))) { |
| char *image_dir = NULL; |
| if (step_ptr->ckpt_interval == 0) |
| continue; |
| ckpt_due = step_ptr->ckpt_time + |
| (step_ptr->ckpt_interval * 60); |
| if (ckpt_due > now) |
| continue; |
| /* |
| * DO NOT initiate a checkpoint request if the step has |
| * only just started, in case it is restarting from a |
| * checkpoint. |
| */ |
| ckpt_due = step_ptr->start_time + |
| (step_ptr->ckpt_interval * 60); |
| if (ckpt_due > now) |
| continue; |
| |
| step_ptr->ckpt_time = now; |
| last_job_update = now; |
| image_dir = xstrdup(step_ptr->ckpt_dir); |
| xstrfmtcat(image_dir, "/%u.%u", job_ptr->job_id, |
| step_ptr->step_id); |
| (void) checkpoint_op(job_ptr->job_id, |
| step_ptr->step_id, |
| step_ptr, CHECK_CREATE, 0, |
| image_dir, &event_time, |
| &error_code, &error_msg); |
| xfree(image_dir); |
| } |
| list_iterator_destroy (step_iterator); |
| } |
| list_iterator_destroy(job_iterator); |
| } |
| |
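| /* Build and queue an agent request sending REQUEST_KILL_TIMELIMIT to |
| * every node in the step's allocation (only the first node on |
| * front-end systems) */ |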
| static void _signal_step_timelimit(struct job_record *job_ptr, |
| struct step_record *step_ptr, time_t now) |
| { |
| int i; |
| kill_job_msg_t *kill_step; |
| agent_arg_t *agent_args = NULL; |
| |
| xassert(step_ptr); |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_KILL_TIMELIMIT; |
| agent_args->retry = 1; |
| agent_args->hostlist = hostlist_create(""); |
| kill_step = xmalloc(sizeof(kill_job_msg_t)); |
| kill_step->job_id = job_ptr->job_id; |
| kill_step->step_id = step_ptr->step_id; |
| kill_step->job_state = job_ptr->job_state; |
| kill_step->job_uid = job_ptr->user_id; |
| kill_step->nodes = xstrdup(job_ptr->nodes); |
| kill_step->time = now; |
| kill_step->select_jobinfo = select_g_select_jobinfo_copy( |
| job_ptr->select_jobinfo); |
| |
| for (i = 0; i < node_record_count; i++) { |
| if (bit_test(step_ptr->step_node_bitmap, i) == 0) |
| continue; |
| hostlist_push(agent_args->hostlist, |
| node_record_table_ptr[i].name); |
| agent_args->node_count++; |
| #ifdef HAVE_FRONT_END /* Operate only on front-end */ |
| break; |
| #endif |
| } |
| |
| if (agent_args->node_count == 0) { |
| hostlist_destroy(agent_args->hostlist); |
| xfree(agent_args); |
| if (kill_step->select_jobinfo) { |
| select_g_select_jobinfo_free( |
| kill_step->select_jobinfo); |
| } |
| xfree(kill_step); |
| return; |
| } |
| |
| agent_args->msg_args = kill_step; |
| agent_queue_request(agent_args); |
| return; |
| } |
| |
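| /* |
| * check_job_step_time_limit - terminate job steps which have exceeded |
| *	their time limit |
| * IN job_ptr - pointer to an active job record |
| * IN now - current time |
| */ |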
| extern void |
| check_job_step_time_limit (struct job_record *job_ptr, time_t now) |
| { |
| ListIterator step_iterator; |
| struct step_record *step_ptr; |
| uint32_t job_run_mins = 0; |
| |
| xassert(job_ptr); |
| |
| if (job_ptr->job_state != JOB_RUNNING) |
| return; |
| |
| step_iterator = list_iterator_create (job_ptr->step_list); |
| while ((step_ptr = (struct step_record *) list_next (step_iterator))) { |
| |
| if (step_ptr->time_limit == INFINITE || |
| step_ptr->time_limit == NO_VAL) |
| continue; |
| job_run_mins = (uint32_t) (((now - step_ptr->start_time) - |
| step_ptr->tot_sus_time) / 60); |
| if (job_run_mins >= step_ptr->time_limit) { |
| /* this step has timed out */ |
| info("check_job_step_time_limit: job %u step %u " |
| "has timed out (%u)", |
| job_ptr->job_id, step_ptr->step_id, |
| step_ptr->time_limit); |
| _signal_step_timelimit(job_ptr, step_ptr, now); |
| } |
| } |
| |
| list_iterator_destroy (step_iterator); |
| } |
| |
| /* Return true if memory is a reserved resource, false otherwise. |
| * The configuration is read once and the result is cached. */ |
| static bool _is_mem_resv(void) |
| { |
| static bool mem_resv_value = false; |
| static bool mem_resv_tested = false; |
| |
| if (!mem_resv_tested) { |
| mem_resv_tested = true; |
| slurm_ctl_conf_t *conf = slurm_conf_lock(); |
| if ((conf->select_type_param == CR_MEMORY) || |
| (conf->select_type_param == CR_SOCKET_MEMORY) || |
| (conf->select_type_param == CR_CORE_MEMORY) || |
| (conf->select_type_param == CR_CPU_MEMORY)) |
| mem_resv_value = true; |
| slurm_conf_unlock(); |
| } |
| |
| return mem_resv_value; |
| } |