| /*****************************************************************************\ |
| * select_bluegene.c - node selection plugin for Blue Gene system. |
| ***************************************************************************** |
| * Copyright (C) 2004-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2011 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Dan Phung <phung4@llnl.gov> Danny Auble <da@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://www.schedmd.com/slurmdocs/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "src/common/slurm_xlator.h" |
| #include "bg_core.h" |
| #include "bg_read_config.h" |
| #include "bg_defined_block.h" |
| |
| #ifndef HAVE_BG_L_P |
| # include "ba_bgq/block_allocator.h" |
| #else |
| # include "ba/block_allocator.h" |
| #endif |
| |
| #include "src/slurmctld/trigger_mgr.h" |
| #include <fcntl.h> |
| |
| #define HUGE_BUF_SIZE (1024*16) |
| |
| /* These are defined here so when we link with something other than |
| * the slurmctld we will have these symbols defined. They will get |
| * overwritten when linking with the slurmctld. |
| */ |
| #if defined (__APPLE__) |
| slurmctld_config_t slurmctld_config __attribute__((weak_import)); |
| slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import)); |
| struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL; |
| int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER; |
| List part_list __attribute__((weak_import)) = NULL; |
| int node_record_count __attribute__((weak_import)); |
| time_t last_node_update __attribute__((weak_import)); |
| time_t last_job_update __attribute__((weak_import)); |
| char *alpha_num __attribute__((weak_import)) = |
| "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
| void *acct_db_conn __attribute__((weak_import)) = NULL; |
| char *slurmctld_cluster_name __attribute__((weak_import)) = NULL; |
| slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL; |
| #else |
| slurmctld_config_t slurmctld_config; |
| slurm_ctl_conf_t slurmctld_conf; |
| struct node_record *node_record_table_ptr = NULL; |
| int bg_recover = NOT_FROM_CONTROLLER; |
| List part_list = NULL; |
| int node_record_count; |
| time_t last_node_update; |
| time_t last_job_update; |
| char *alpha_num = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
| void *acct_db_conn = NULL; |
| char *slurmctld_cluster_name = NULL; |
| slurmdb_cluster_rec_t *working_cluster_rec = NULL; |
| #endif |
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - a string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - a string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. SLURM uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "select" for SLURM node selection) and <method> |
| * is a description of how this plugin satisfies that application. SLURM will |
| * only load select plugins if the plugin_type string has a |
| * prefix of "select/". |
| * |
| * plugin_version - an unsigned 32-bit integer giving the version number |
| * of the plugin. If major and minor revisions are desired, the major |
| * version number may be multiplied by a suitable magnitude constant such |
| * as 100 or 1000. Various SLURM versions will likely require a certain |
| * minimum version for their plugins as the node selection API matures. |
| */ |
| const char plugin_name[] = "BlueGene node selection plugin"; |
| const char plugin_type[] = "select/bluegene"; |
| const uint32_t plugin_id = 100; |
| const uint32_t plugin_version = 200; |
| |
| /* Global variables */ |
| bg_config_t *bg_conf = NULL; |
| bg_lists_t *bg_lists = NULL; |
| time_t last_bg_update; |
| pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER; |
| int blocks_are_created = 0; |
| int num_unused_cpus = 0; |
| int num_possible_unused_cpus = 0; |
| slurmctld_lock_t job_read_lock = { |
| NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; |
| |
| extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data); |
| |
| static void _destroy_bg_config(bg_config_t *bg_conf) |
| { |
| if (bg_conf) { |
| if (bg_conf->blrts_list) { |
| list_destroy(bg_conf->blrts_list); |
| bg_conf->blrts_list = NULL; |
| } |
| |
| xfree(bg_conf->bridge_api_file); |
| |
| xfree(bg_conf->default_blrtsimage); |
| xfree(bg_conf->default_linuximage); |
| xfree(bg_conf->default_mloaderimage); |
| xfree(bg_conf->default_ramdiskimage); |
| |
| if (bg_conf->linux_list) { |
| list_destroy(bg_conf->linux_list); |
| bg_conf->linux_list = NULL; |
| } |
| |
| if (bg_conf->mloader_list) { |
| list_destroy(bg_conf->mloader_list); |
| bg_conf->mloader_list = NULL; |
| } |
| |
| if (bg_conf->ramdisk_list) { |
| list_destroy(bg_conf->ramdisk_list); |
| bg_conf->ramdisk_list = NULL; |
| } |
| |
| xfree(bg_conf->slurm_user_name); |
| xfree(bg_conf->slurm_node_prefix); |
| xfree(bg_conf); |
| } |
| } |
| |
| static void _destroy_bg_lists(bg_lists_t *bg_lists) |
| { |
| if (bg_lists) { |
| if (bg_lists->booted) { |
| list_destroy(bg_lists->booted); |
| bg_lists->booted = NULL; |
| } |
| |
| if (bg_lists->job_running) { |
| list_destroy(bg_lists->job_running); |
| bg_lists->job_running = NULL; |
| num_unused_cpus = 0; |
| } |
| |
| if (bg_lists->main) { |
| list_destroy(bg_lists->main); |
| bg_lists->main = NULL; |
| } |
| |
| if (bg_lists->valid_small32) { |
| list_destroy(bg_lists->valid_small32); |
| bg_lists->valid_small32 = NULL; |
| } |
| if (bg_lists->valid_small64) { |
| list_destroy(bg_lists->valid_small64); |
| bg_lists->valid_small64 = NULL; |
| } |
| if (bg_lists->valid_small128) { |
| list_destroy(bg_lists->valid_small128); |
| bg_lists->valid_small128 = NULL; |
| } |
| if (bg_lists->valid_small256) { |
| list_destroy(bg_lists->valid_small256); |
| bg_lists->valid_small256 = NULL; |
| } |
| |
| xfree(bg_lists); |
| } |
| } |
| |
| #ifdef HAVE_BG |
| static int _delete_old_blocks(List curr_block_list, List found_block_list) |
| { |
| ListIterator itr_curr, itr_found; |
| bg_record_t *found_record = NULL, *init_record = NULL; |
| List destroy_list = list_create(NULL); |
| |
| xassert(curr_block_list); |
| xassert(found_block_list); |
| |
| slurm_mutex_lock(&block_state_mutex); |
| if (!bg_recover) { |
| info("removing all current blocks (clean start)"); |
| itr_curr = list_iterator_create(curr_block_list); |
| while ((init_record = list_next(itr_curr))) { |
| list_remove(itr_curr); |
| |
| init_record->modifying = 0; |
| |
| /* The block needs to exist in the main list |
| * just to make sure we query the state. */ |
| if (!(found_record = find_bg_record_in_list( |
| bg_lists->main, |
| init_record->bg_block_id))) |
| list_push(bg_lists->main, init_record); |
| else { |
| destroy_bg_record(init_record); |
| init_record = found_record; |
| } |
| /* Make sure this block isn't in an |
| error state since if it is it won't |
| disappear. */ |
| if (init_record->state & BG_BLOCK_ERROR_FLAG) |
| resume_block(init_record); |
| list_push(destroy_list, init_record); |
| } |
| list_iterator_destroy(itr_curr); |
| } else { |
| info("removing unspecified blocks"); |
| itr_curr = list_iterator_create(curr_block_list); |
| while ((init_record = list_next(itr_curr))) { |
| itr_found = list_iterator_create(found_block_list); |
| while ((found_record = list_next(itr_found))) { |
| if (!strcmp(init_record->bg_block_id, |
| found_record->bg_block_id)) { |
| /* don't delete this one */ |
| break; |
| } |
| } |
| list_iterator_destroy(itr_found); |
| |
| if (found_record == NULL) { |
| list_remove(itr_curr); |
| |
| init_record->modifying = 0; |
| |
| /* The block needs to exist in the main list |
| * just to make sure we query the state. */ |
| if (!(found_record = find_bg_record_in_list( |
| bg_lists->main, |
| init_record->bg_block_id))) |
| list_push(bg_lists->main, init_record); |
| else { |
| destroy_bg_record(init_record); |
| init_record = found_record; |
| } |
| /* Make sure this block isn't in an |
| error state since if it is it won't |
| disappear. */ |
| if (init_record->state & BG_BLOCK_ERROR_FLAG) |
| resume_block(init_record); |
| |
| /* Since we can't requeue a running |
| job in the free block function (not |
| thread safe here) we must do it |
| now. |
| */ |
| if ((init_record->job_running > NO_JOB_RUNNING) |
| || init_record->job_ptr) { |
| /* Don't worry about dealing |
| with this job here. Trying |
| to requeue/cancel now will |
| cause a race condition |
| locking up the slurmctld. |
| It will be handled when the |
| blocks are synced. This |
| should only happen if the |
| bluegene.conf gets changed |
| and jobs are running on |
| blocks that don't exist in |
| the new config (hopefully |
| rarely). |
| */ |
| init_record->job_running = |
| NO_JOB_RUNNING; |
| init_record->job_ptr = NULL; |
| } else if (init_record->job_list && |
| list_count(init_record->job_list)) |
| list_flush(init_record->job_list); |
| list_push(destroy_list, init_record); |
| } |
| } |
| list_iterator_destroy(itr_curr); |
| } |
| slurm_mutex_unlock(&block_state_mutex); |
| |
| free_block_list(NO_VAL, destroy_list, 1, 0); |
| list_destroy(destroy_list); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _set_bg_lists() |
| { |
| if (!bg_lists) |
| bg_lists = xmalloc(sizeof(bg_lists_t)); |
| |
| slurm_mutex_lock(&block_state_mutex); |
| |
| if (bg_lists->booted) |
| list_destroy(bg_lists->booted); |
| bg_lists->booted = list_create(NULL); |
| |
| if (bg_lists->job_running) |
| list_destroy(bg_lists->job_running); |
| bg_lists->job_running = list_create(NULL); |
| |
| if (bg_lists->main) |
| list_destroy(bg_lists->main); |
| bg_lists->main = list_create(destroy_bg_record); |
| |
| slurm_mutex_unlock(&block_state_mutex); |
| |
| } |
| |
/*
 * _translate_info_2_record - build a bg_record_t from an unpacked
 *	block_info_t.
 *
 * Ownership note: every string the record needs (bg_block_id, mp_str,
 * the four image names, reason) is *stolen* -- the pointer is moved
 * into the new record and the block_info field is NULLed so that
 * slurm_free_block_info_members() at the bottom doesn't free it.
 *
 * IN block_info - unpacked block description; its members are consumed
 *	(transferred or freed) before return.
 * RET newly allocated bg_record_t, owned by the caller.  As written
 *	this never returns NULL (callers still check defensively).
 */
static bg_record_t *_translate_info_2_record(block_info_t *block_info)
{
	bg_record_t *bg_record = NULL;
	bitstr_t *mp_bitmap = NULL, *ionode_bitmap = NULL;

	mp_bitmap = bit_alloc(node_record_count);
	ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);

	/* inx2bitstr() returns -1 when the saved index ranges do not
	 * fit the bitmap sized for the *current* system. */
	if (block_info->mp_inx
	    && inx2bitstr(mp_bitmap, block_info->mp_inx) == -1)
		error("Job state recovered incompatible with "
		      "bluegene.conf. mp=%u",
		      node_record_count);
	if (block_info->ionode_inx
	    && inx2bitstr(ionode_bitmap, block_info->ionode_inx) == -1)
		error("Job state recovered incompatible with "
		      "bluegene.conf. ionodes=%u",
		      bg_conf->ionodes_per_mp);

	bg_record = xmalloc(sizeof(bg_record_t));
	bg_record->magic = BLOCK_MAGIC;
	/* steal the strings/bitmaps instead of copying (see header) */
	bg_record->bg_block_id = block_info->bg_block_id;
	block_info->bg_block_id = NULL;
	bg_record->mp_str = block_info->mp_str;
	block_info->mp_str = NULL;
	bg_record->ionode_bitmap = ionode_bitmap;
	ionode_bitmap = NULL;

	if (block_info->ionode_str) {
		/* recompute the ionode string from the bitmap and
		 * cross-check it against what was saved */
		ba_set_ionode_str(bg_record);
		if (!bg_record->ionode_str
		    || strcmp(block_info->ionode_str, bg_record->ionode_str)) {
			error("block %s didn't compute with the correct "
			      "ionode_str. Stored as '%s' and "
			      "came back as '%s'",
			      bg_record->bg_block_id,
			      block_info->ionode_str, bg_record->ionode_str);
		}
	}

	bg_record->mp_bitmap = mp_bitmap;
	mp_bitmap = NULL;

	/* put_block_in_error_state should be
	   called after the bg_lists->main has been
	   made.  We can't call it here since
	   this record isn't the record kept
	   around in bg_lists->main.
	*/
	bg_record->state = block_info->state;

	bg_record->cnode_cnt = block_info->cnode_cnt;
	bg_record->mp_count = bit_set_count(bg_record->mp_bitmap);

	/* Don't copy the job_list from the block_info, we will fill
	   it in later in the job sync.
	*/
	bg_record->job_running = NO_JOB_RUNNING;
	if (bg_conf->sub_blocks && (bg_record->mp_count == 1))
		bg_record->job_list = list_create(NULL);

#ifdef HAVE_BGL
	bg_record->node_use = block_info->node_use;
#endif
	memcpy(bg_record->conn_type, block_info->conn_type,
	       sizeof(bg_record->conn_type));

	/* image names and reason are stolen the same way as above */
	bg_record->blrtsimage = block_info->blrtsimage;
	block_info->blrtsimage = NULL;
	bg_record->linuximage = block_info->linuximage;
	block_info->linuximage = NULL;
	bg_record->mloaderimage = block_info->mloaderimage;
	block_info->mloaderimage = NULL;
	bg_record->ramdiskimage = block_info->ramdiskimage;
	block_info->ramdiskimage = NULL;

	bg_record->reason = block_info->reason;
	block_info->reason = NULL;

	/* frees whatever is left inside block_info (not the struct
	 * itself, which lives on the caller's stack) */
	slurm_free_block_info_members(block_info);
	return bg_record;
}
| |
| static void _local_pack_block_job_info(struct job_record *job_ptr, Buf buffer, |
| uint16_t protocol_version) |
| { |
| block_job_info_t block_job; |
| select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data; |
| |
| memset(&block_job, 0, sizeof(block_job_info_t)); |
| block_job.job_id = job_ptr->job_id; |
| block_job.user_id = job_ptr->user_id; |
| if (jobinfo) { |
| block_job.user_name = jobinfo->user_name; |
| block_job.cnodes = jobinfo->ionode_str; |
| } else |
| error("NO JOBINFO for job %u magic %u!!!!!!!!!!!!!!", |
| job_ptr->job_id, job_ptr->magic); |
| |
| /* block_job.cnode_inx -- try not to set */ |
| slurm_pack_block_job_info(&block_job, buffer, protocol_version); |
| } |
| |
| /* Pack all relevent information about a block */ |
| /* NOTE: There is a matching pack function in |
| * common/slurm_protocol_pack.c dealing with the block_info_t |
| * structure there. If anything changes here please update that as well. |
| * The unpack for this is in common/slurm_protocol_pack.c |
| */ |
/* IN bg_record - block to serialize.
 * IN/OUT buffer - destination buffer.
 * IN protocol_version - selects the wire format; field ORDER within
 *	each branch is the wire format and must match the matching
 *	unpack in common/slurm_protocol_pack.c exactly.
 */
static void _pack_block(bg_record_t *bg_record, Buf buffer,
			uint16_t protocol_version)
{
#ifdef HAVE_BGQ
	int dim;
#endif
	uint32_t count = NO_VAL, running_job = 0;
	struct job_record *job_ptr;
	ListIterator itr;

	if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) {
		/* 2.4 format: job count is followed by exactly that
		 * many job-info entries. */
		packstr(bg_record->bg_block_id, buffer);
		packstr(bg_record->blrtsimage, buffer);
		pack_bit_fmt(bg_record->mp_bitmap, buffer);
#ifdef HAVE_BGQ
		pack32(SYSTEM_DIMENSIONS, buffer);
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			pack16(bg_record->conn_type[dim], buffer);
#else
		pack32(1, buffer); /* for dimensions of conn_type */
		pack16(bg_record->conn_type[0], buffer);
#endif
		packstr(bg_record->ionode_str, buffer);
		pack_bit_fmt(bg_record->ionode_bitmap, buffer);

		if (bg_record->job_list)
			count = list_count(bg_record->job_list);

		if (count && count != NO_VAL) {
			pack32(count, buffer);
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				if (job_ptr->magic != JOB_MAGIC) {
					/* stale entry: drop it from the
					 * list but still emit a NULL
					 * placeholder so the receiver's
					 * count stays consistent */
					error("_pack_block: "
					      "bad magic found when "
					      "packing block %s",
					      bg_record->bg_block_id);
					list_delete_item(itr);
					slurm_pack_block_job_info(
						NULL, buffer,
						protocol_version);
					continue;
				}
				_local_pack_block_job_info(
					job_ptr, buffer, protocol_version);
			}
			list_iterator_destroy(itr);
		} else if (bg_record->job_ptr
			   && (bg_record->job_ptr->magic == JOB_MAGIC)) {
			/* no job_list: pack the single running job */
			pack32(1, buffer);
			_local_pack_block_job_info(
				bg_record->job_ptr, buffer, protocol_version);
		} else
			pack32(count, buffer);

		count = NO_VAL;

		packstr(bg_record->linuximage, buffer);
		packstr(bg_record->mloaderimage, buffer);
		packstr(bg_record->mp_str, buffer);
		pack32(bg_record->cnode_cnt, buffer);
		pack32(bg_record->cnode_err_cnt, buffer);
		pack16((uint16_t)bg_record->node_use, buffer);
		packstr(bg_record->ramdiskimage, buffer);
		packstr(bg_record->reason, buffer);
		pack16((uint16_t)bg_record->state, buffer);
	} else if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		/* 2.3 format: count is packed unconditionally, and the
		 * placeholder trick above is not used (bad-magic jobs
		 * are dropped without a NULL entry). */
		packstr(bg_record->bg_block_id, buffer);
		packstr(bg_record->blrtsimage, buffer);
		pack_bit_fmt(bg_record->mp_bitmap, buffer);
#ifdef HAVE_BGQ
		pack32(SYSTEM_DIMENSIONS, buffer);
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			pack16(bg_record->conn_type[dim], buffer);
#else
		pack32(1, buffer); /* for dimensions of conn_type */
		pack16(bg_record->conn_type[0], buffer);
#endif
		packstr(bg_record->ionode_str, buffer);
		pack_bit_fmt(bg_record->ionode_bitmap, buffer);

		if (bg_record->job_list)
			count = list_count(bg_record->job_list);
		pack32(count, buffer);
		if (count && count != NO_VAL) {
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				if (job_ptr->magic != JOB_MAGIC) {
					error("_pack_block 2.3: "
					      "bad magic found when "
					      "packing block %s",
					      bg_record->bg_block_id);
					list_delete_item(itr);
					continue;
				}
				_local_pack_block_job_info(
					job_ptr, buffer, protocol_version);
			}
			list_iterator_destroy(itr);
		}
		/* NOTE(review): running_job is initialized to 0 and
		 * never assigned in this function, so this branch
		 * always packs bg_record->job_running.  Looks like a
		 * dropped assignment (presumably running_job =
		 * job_ptr->job_id inside the loop above) -- confirm
		 * against the 2.3 unpack before changing. */
		if ((count == 1) && running_job)
			pack32((uint32_t)running_job, buffer);
		else
			pack32((uint32_t)bg_record->job_running, buffer);
		count = NO_VAL;

		packstr(bg_record->linuximage, buffer);
		packstr(bg_record->mloaderimage, buffer);
		packstr(bg_record->mp_str, buffer);
		packnull(buffer); /* for mp_used_str */
		pack32((uint32_t)bg_record->cnode_cnt, buffer);
		pack16((uint16_t)bg_record->node_use, buffer);
		packnull(buffer); /* for user_name */
		packstr(bg_record->ramdiskimage, buffer);
		packstr(bg_record->reason, buffer);
		pack16((uint16_t)bg_record->state, buffer);
		packnull(buffer); /* for mp_used_inx */
	} else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
		/* 2.2 format: single conn_type, no job list */
		packstr(bg_record->bg_block_id, buffer);
#ifdef HAVE_BGL
		packstr(bg_record->blrtsimage, buffer);
#endif
		pack_bit_fmt(bg_record->mp_bitmap, buffer);
		pack16((uint16_t)bg_record->conn_type[0], buffer);
		packstr(bg_record->ionode_str, buffer);
		pack_bit_fmt(bg_record->ionode_bitmap, buffer);
		pack32((uint32_t)bg_record->job_running, buffer);
		packstr(bg_record->linuximage, buffer);
		packstr(bg_record->mloaderimage, buffer);
		packstr(bg_record->mp_str, buffer);
		pack32((uint32_t)bg_record->cnode_cnt, buffer);
#ifdef HAVE_BGL
		pack16((uint16_t)bg_record->node_use, buffer);
#endif
		packnull(buffer); /* for user_name */
		packstr(bg_record->ramdiskimage, buffer);
		packstr(bg_record->reason, buffer);
		pack16((uint16_t)bg_record->state, buffer);
	} else if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		/* 2.1 format: like 2.2 but without the reason string */
		packstr(bg_record->bg_block_id, buffer);
#ifdef HAVE_BGL
		packstr(bg_record->blrtsimage, buffer);
#endif
		pack_bit_fmt(bg_record->mp_bitmap, buffer);
		pack16((uint16_t)bg_record->conn_type[0], buffer);
		packstr(bg_record->ionode_str, buffer);
		pack_bit_fmt(bg_record->ionode_bitmap, buffer);
		pack32((uint32_t)bg_record->job_running, buffer);
		packstr(bg_record->linuximage, buffer);
		packstr(bg_record->mloaderimage, buffer);
		packstr(bg_record->mp_str, buffer);
		pack32((uint32_t)bg_record->cnode_cnt, buffer);
#ifdef HAVE_BGL
		pack16((uint16_t)bg_record->node_use, buffer);
#endif
		packnull(buffer); /* for user_name */
		packstr(bg_record->ramdiskimage, buffer);
		pack16((uint16_t)bg_record->state, buffer);
	}
}
| |
| /* Pack all extra information about a block (Only needed for saving state.) */ |
| static void _pack_block_ext(bg_record_t *bg_record, Buf buffer, |
| uint16_t protocol_version) |
| { |
| ListIterator itr; |
| ba_mp_t *ba_mp; |
| uint32_t count = NO_VAL; |
| int i; |
| |
| xassert(bg_record); |
| |
| if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) { |
| if (bg_record->ba_mp_list) |
| count = list_count(bg_record->ba_mp_list); |
| pack32(count, buffer); |
| if (count && count != NO_VAL) { |
| itr = list_iterator_create(bg_record->ba_mp_list); |
| while ((ba_mp = list_next(itr))) |
| pack_ba_mp(ba_mp, buffer, protocol_version); |
| list_iterator_destroy(itr); |
| |
| } |
| pack32(bg_record->cpu_cnt, buffer); |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) { |
| pack16(bg_record->geo[i], buffer); |
| pack16(bg_record->start[i], buffer); |
| } |
| |
| pack16(bg_record->full_block, buffer); |
| pack32(bg_record->switch_count, buffer); |
| } else { |
| /* didn't exist before 2.3 */ |
| } |
| } |
| |
| /* UNPack all extra information about a block */ |
| static int _unpack_block_ext(bg_record_t *bg_record, Buf buffer, |
| uint16_t protocol_version) |
| { |
| ba_mp_t *ba_mp; |
| uint32_t count = NO_VAL; |
| int i; |
| uint16_t temp16; |
| |
| xassert(bg_record); |
| |
| if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) { |
| safe_unpack32(&count, buffer); |
| if (count == NO_VAL) { |
| error("_unpack_block_ext: bg_record record has no " |
| "mp_list"); |
| goto unpack_error; |
| } |
| bg_record->ba_mp_list = list_create(destroy_ba_mp); |
| for (i=0; i<count; i++) { |
| if (unpack_ba_mp(&ba_mp, buffer, protocol_version) |
| == SLURM_ERROR) |
| goto unpack_error; |
| list_append(bg_record->ba_mp_list, ba_mp); |
| } |
| safe_unpack32(&bg_record->cpu_cnt, buffer); |
| for (i=0; i<SYSTEM_DIMENSIONS; i++) { |
| safe_unpack16(&bg_record->geo[i], buffer); |
| safe_unpack16(&bg_record->start[i], buffer); |
| } |
| safe_unpack16(&temp16, buffer); |
| bg_record->full_block = temp16; |
| safe_pack32(bg_record->switch_count, buffer); |
| } else { |
| /* packing didn't exist before 2.3, so set things up |
| * to go forward */ |
| if (bg_conf->mp_cnode_cnt > bg_record->cnode_cnt) { |
| bg_record->cpu_cnt = bg_conf->cpus_per_mp / |
| (bg_conf->mp_cnode_cnt / bg_record->cnode_cnt); |
| } else { |
| bg_record->cpu_cnt = bg_conf->cpus_per_mp |
| * bg_record->mp_count; |
| } |
| process_nodes(bg_record, true); |
| } |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("Problem unpacking extended block info for %s, " |
| "removing from list", |
| bg_record->bg_block_id); |
| return SLURM_ERROR; |
| } |
| |
| static int _load_state_file(List curr_block_list, char *dir_name) |
| { |
| int state_fd, i; |
| char *state_file = NULL; |
| Buf buffer = NULL; |
| char *data = NULL; |
| int data_size = 0; |
| block_info_msg_t *block_ptr = NULL; |
| bg_record_t *bg_record = NULL; |
| char temp[256]; |
| List results = NULL; |
| int data_allocated, data_read = 0; |
| char *ver_str = NULL; |
| uint32_t ver_str_len; |
| char *name = NULL; |
| struct part_record *part_ptr = NULL; |
| bitstr_t *usable_mp_bitmap = NULL; |
| ListIterator itr = NULL; |
| uint16_t protocol_version = (uint16_t)NO_VAL; |
| uint32_t record_count; |
| |
| xassert(curr_block_list); |
| xassert(dir_name); |
| |
| state_file = xstrdup(dir_name); |
| xstrcat(state_file, "/block_state"); |
| state_fd = open(state_file, O_RDONLY); |
| if (state_fd < 0) { |
| error("No block state file (%s) to recover", state_file); |
| xfree(state_file); |
| return SLURM_SUCCESS; |
| } else { |
| data_allocated = BUF_SIZE; |
| data = xmalloc(data_allocated); |
| while (1) { |
| data_read = read(state_fd, &data[data_size], |
| BUF_SIZE); |
| if (data_read < 0) { |
| if (errno == EINTR) |
| continue; |
| else { |
| error("Read error on %s: %m", |
| state_file); |
| break; |
| } |
| } else if (data_read == 0) /* eof */ |
| break; |
| data_size += data_read; |
| data_allocated += data_read; |
| xrealloc(data, data_allocated); |
| } |
| close(state_fd); |
| } |
| xfree(state_file); |
| |
| buffer = create_buf(data, data_size); |
| safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); |
| debug3("Version string in block_state header is %s", ver_str); |
| if (ver_str) { |
| if (!strcmp(ver_str, BLOCK_STATE_VERSION)) { |
| protocol_version = SLURM_PROTOCOL_VERSION; |
| } else if (!strcmp(ver_str, BLOCK_2_2_STATE_VERSION)) { |
| protocol_version = SLURM_2_2_PROTOCOL_VERSION; |
| } else if (!strcmp(ver_str, BLOCK_2_1_STATE_VERSION)) { |
| protocol_version = SLURM_2_1_PROTOCOL_VERSION; |
| } |
| } |
| |
| if (protocol_version == (uint16_t)NO_VAL) { |
| error("***********************************************"); |
| error("Can not recover block state, " |
| "data version incompatible"); |
| error("***********************************************"); |
| xfree(ver_str); |
| free_buf(buffer); |
| return EFAULT; |
| } |
| xfree(ver_str); |
| safe_unpack32(&record_count, buffer); |
| |
| /* In older versions of the code we stored things in a |
| block_info_msg_t. This isn't the case anymore so in the |
| newer code we don't store the timestamp since it isn't |
| really needed. |
| */ |
| if (protocol_version <= SLURM_2_2_PROTOCOL_VERSION) { |
| time_t last_save; |
| safe_unpack_time(&last_save, buffer); |
| } |
| |
| slurm_mutex_lock(&block_state_mutex); |
| reset_ba_system(true); |
| |
| /* Locks are already in place to protect part_list here */ |
| usable_mp_bitmap = bit_alloc(node_record_count); |
| itr = list_iterator_create(part_list); |
| while ((part_ptr = list_next(itr))) { |
| /* we only want to use mps that are in partitions */ |
| if (!part_ptr->node_bitmap) { |
| debug4("Partition %s doesn't have any nodes in it.", |
| part_ptr->name); |
| continue; |
| } |
| bit_or(usable_mp_bitmap, part_ptr->node_bitmap); |
| } |
| list_iterator_destroy(itr); |
| |
| if (bit_ffs(usable_mp_bitmap) == -1) { |
| fatal("We don't have any nodes in any partitions. " |
| "Can't create blocks. " |
| "Please check your slurm.conf."); |
| } |
| |
| for (i=0; i<record_count; i++) { |
| block_info_t block_info; |
| |
| if (slurm_unpack_block_info_members( |
| &block_info, buffer, protocol_version)) |
| goto unpack_error; |
| |
| if (!(bg_record = _translate_info_2_record(&block_info))) |
| continue; |
| |
| if (_unpack_block_ext(bg_record, buffer, protocol_version) |
| != SLURM_SUCCESS) { |
| destroy_bg_record(bg_record); |
| goto unpack_error; |
| } |
| |
| /* This means the block here wasn't able to be |
| processed correctly, so don't add. |
| */ |
| if (!bg_record->mp_count) { |
| error("block %s(%s) can't be made in the current " |
| "system, but was around in the previous one.", |
| bg_record->bg_block_id, bg_record->mp_str); |
| list_destroy(results); |
| destroy_bg_record(bg_record); |
| continue; |
| } |
| |
| if ((bg_conf->layout_mode == LAYOUT_OVERLAP) |
| || bg_record->full_block) |
| reset_ba_system(false); |
| |
| if (bg_record->ba_mp_list) { |
| /* only do this for blocks bigger than 1 |
| midplane */ |
| if (bg_record->cpu_cnt >= bg_conf->cpus_per_mp) |
| if (check_and_set_mp_list(bg_record->ba_mp_list) |
| == SLURM_ERROR) |
| error("something happened in the " |
| "load of %s, keeping it " |
| "around though", |
| bg_record->bg_block_id); |
| } else { |
| select_ba_request_t ba_request; |
| ba_set_removable_mps(usable_mp_bitmap, 1); |
| /* we want the mps that aren't |
| * in this record to mark them as used |
| */ |
| if (ba_set_removable_mps(bg_record->mp_bitmap, 1) |
| != SLURM_SUCCESS) |
| fatal("1 It doesn't seem we have a bitmap " |
| "for %s", |
| bg_record->bg_block_id); |
| #ifdef HAVE_BGQ |
| results = list_create(destroy_ba_mp); |
| #else |
| results = list_create(NULL); |
| #endif |
| /* info("adding back %s %s", bg_record->bg_block_id, */ |
| /* bg_record->mp_str); */ |
| memset(&ba_request, 0, sizeof(ba_request)); |
| memcpy(ba_request.start, bg_record->start, |
| sizeof(bg_record->start)); |
| memcpy(ba_request.geometry, bg_record->geo, |
| sizeof(bg_record->geo)); |
| memcpy(ba_request.conn_type, bg_record->conn_type, |
| sizeof(bg_record->conn_type)); |
| ba_request.start_req = 1; |
| name = set_bg_block(results, &ba_request); |
| |
| ba_reset_all_removed_mps(); |
| |
| if (!name) { |
| error("I was unable to make the " |
| "requested block."); |
| list_destroy(results); |
| destroy_bg_record(bg_record); |
| bg_record = NULL; |
| continue; |
| } |
| |
| |
| snprintf(temp, sizeof(temp), "%s%s", |
| bg_conf->slurm_node_prefix, |
| name); |
| |
| xfree(name); |
| if (strcmp(temp, bg_record->mp_str)) { |
| fatal("bad wiring in preserved state " |
| "(found %s, but allocated %s) " |
| "YOU MUST COLDSTART", |
| bg_record->mp_str, temp); |
| } |
| if (bg_record->ba_mp_list) |
| list_destroy(bg_record->ba_mp_list); |
| #ifdef HAVE_BGQ |
| bg_record->ba_mp_list = results; |
| results = NULL; |
| #else |
| bg_record->ba_mp_list = list_create(destroy_ba_mp); |
| copy_node_path(results, &bg_record->ba_mp_list); |
| list_destroy(results); |
| #endif |
| } |
| |
| // bridge_block_create(bg_record); |
| list_push(curr_block_list, bg_record); |
| } |
| |
| FREE_NULL_BITMAP(usable_mp_bitmap); |
| |
| sort_bg_record_inc_size(curr_block_list); |
| slurm_mutex_unlock(&block_state_mutex); |
| |
| info("Recovered %d blocks", list_count(curr_block_list)); |
| slurm_free_block_info_msg(block_ptr); |
| free_buf(buffer); |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| FREE_NULL_BITMAP(usable_mp_bitmap); |
| slurm_mutex_unlock(&block_state_mutex); |
| error("Incomplete block data checkpoint file"); |
| free_buf(buffer); |
| |
| return SLURM_FAILURE; |
| } |
| |
static void _handle_existing_block(bg_record_t *bg_record)
{
	/* Log a block recovered from saved state and patch up any
	 * missing runtime structures, then restore its bookkeeping
	 * (error state or booted list). */
	char *conn_type;
	char node_str[256];
	xassert(bg_record);

	format_node_name(bg_record, node_str, sizeof(node_str));
	conn_type = conn_type_string_full(bg_record->conn_type);
	info("Existing: BlockID:%s Nodes:%s Conn:%s",
	     bg_record->bg_block_id, node_str, conn_type);
	xfree(conn_type);	/* freed with xfree, so conn_type_string_full()
				 * evidently returns an allocated string */
	/* Sanity check to make sure we have the correct setup from
	   the save.
	*/
	if (bg_conf->sub_blocks && bg_record->mp_count == 1) {
		/* Sub-block allocation needs a cnode bitmap on the
		 * (single) midplane; rebuild it if the saved state
		 * lacked one. */
		ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
		xassert(ba_mp);
		if (!ba_mp->cnode_bitmap) {
			error("_handle_existing_block: No cnode_bitmap "
			      "for block %s, creating it",
			      bg_record->bg_block_id);
			if ((ba_mp->cnode_bitmap =
			     ba_create_ba_mp_cnode_bitmap(bg_record))) {
				/* Keep the err bitmap and the usable
				 * snapshot consistent with the newly
				 * created bitmap. */
				if (!ba_mp->cnode_err_bitmap)
					ba_mp->cnode_err_bitmap =
						bit_alloc(bg_conf->
							  mp_cnode_cnt);
				FREE_NULL_BITMAP(ba_mp->cnode_usable_bitmap);
				ba_mp->cnode_usable_bitmap =
					bit_copy(ba_mp->cnode_bitmap);
			}
		}
	}

	/* Re-establish runtime lists: blocks saved in an error state
	 * go back to error; booted/booting blocks are re-added to the
	 * booted list if not already present. */
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		put_block_in_error_state(bg_record, NULL);
	else if (((bg_record->state == BG_BLOCK_INITED)
		  || (bg_record->state == BG_BLOCK_BOOTING))
		 && !block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
}
| |
| /* |
| * _validate_config_blocks - Match slurm configuration information with |
| * current BG block configuration. |
| * IN/OUT curr_block_list - List of blocks already existing on the system. |
| * IN/OUT found_block_list - List of blocks found on the system |
| * that are listed in the bluegene.conf. |
| * NOTE: Both of the lists above should be created with list_create(NULL) |
| * since the bg_lists->main will contain the complete list of pointers |
| * and be destroyed with it. |
| * |
| * RET - SLURM_SUCCESS if no blocks need to be deleted, else an error |
| * code. Writes bg_block_id into bg_lists->main records. |
| */ |
| |
static int _validate_config_blocks(List curr_block_list,
				   List found_block_list, char *dir)
{
	int rc = SLURM_ERROR;
	bg_record_t* bg_record = NULL;
	bg_record_t* init_bg_record = NULL;
	int full_created = 0;	/* set once a full-system block is matched */
	ListIterator itr_conf;
	ListIterator itr_curr;
	char tmp_char[256];
	int dim;

	xassert(curr_block_list);
	xassert(found_block_list);

	/* read in state from last run. */
	if (bg_recover)
		rc = _load_state_file(curr_block_list, dir);

#ifndef HAVE_BG_FILES
	/* In emulation there is no database to fall back on, so a
	 * state-load failure is final. */
	if (rc != SLURM_SUCCESS)
		return rc;
#endif
	/* read current bg block info into curr_block_list This
	 * happens in the state load before this in emulation mode */
	if (bridge_blocks_load_curr(curr_block_list) == SLURM_ERROR)
		return SLURM_ERROR;

	if (!bg_recover)
		return SLURM_ERROR;

#ifdef HAVE_BG_FILES
	/* Since we just checked all the blocks from state against that
	   in the database we can now check to see if there were once
	   blocks that are now gone from the database and remove them
	   from the list.
	*/
	itr_curr = list_iterator_create(curr_block_list);
	while ((bg_record = list_next(itr_curr))) {
		/* 'modifying' presumably flags records confirmed by the
		 * database load above -- TODO confirm against
		 * bridge_blocks_load_curr(). */
		if (bg_record->modifying) {
			bg_record->modifying = 0;
			continue;
		}
		error("Found state for block %s, but that "
		      "block isn't in the system anymore, removing",
		      bg_record->bg_block_id);
		list_delete_item(itr_curr);
	}
	list_iterator_destroy(itr_curr);
#endif

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
		/* Since we don't read the blocks in a Dynamic system
		   we can just transfer the list here and return.
		*/
		list_transfer(bg_lists->main, curr_block_list);

		itr_conf = list_iterator_create(bg_lists->main);
		while ((bg_record = list_next(itr_conf)))
			_handle_existing_block(bg_record);
		list_iterator_destroy(itr_conf);
		return SLURM_SUCCESS;
	}

	/* Only when we are looking at a non-dynamic system do we need
	   to go through the following logic to make sure things are insync.
	*/
	itr_curr = list_iterator_create(curr_block_list);
	itr_conf = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr_conf))) {
		/* For each configured block, look for a recovered block
		 * with the same midplanes, the same ionodes, and (for
		 * non-small blocks) the same per-dimension conn_type. */
		list_iterator_reset(itr_curr);
		while ((init_bg_record = list_next(itr_curr))) {
			if (!bit_equal(bg_record->mp_bitmap,
				       init_bg_record->mp_bitmap))
				continue; /* wrong nodes */
			if (!bit_equal(bg_record->ionode_bitmap,
				       init_bg_record->ionode_bitmap))
				continue;
			if ((bg_record->conn_type[0] < SELECT_SMALL)
			    && (init_bg_record->conn_type[0] < SELECT_SMALL)) {
				for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++) {
					/* Only look at how far we
					   have set. The bg_record
					   should of been set up
					   correctly in the
					   parse_blockreq() function.
					*/
					if (bg_record->conn_type[dim] ==
					    (uint16_t)NO_VAL) {
						/* NO_VAL terminates the
						 * configured dimensions;
						 * setting dim forces the
						 * "matched" exit below. */
						dim = SYSTEM_DIMENSIONS;
						break;
					}

					if (bg_record->conn_type[dim] !=
					    init_bg_record->conn_type[dim])
						break; /* wrong conn_type */
				}
				if (dim < SYSTEM_DIMENSIONS)
					continue; /* conn_type mismatch */
			}
			copy_bg_record(init_bg_record, bg_record);
			/* remove from the curr list since we just
			   matched it no reason to keep it around
			   anymore */
			list_delete_item(itr_curr);
			break;
		}

		if (!bg_record->bg_block_id) {
			/* No recovered match: this configured block
			 * will have to be created from scratch. */
			format_node_name(bg_record, tmp_char,
					 sizeof(tmp_char));
			info("Block found in bluegene.conf to be "
			     "created: Nodes:%s",
			     tmp_char);
		} else {
			if (bg_record->full_block)
				full_created = 1;

			list_push(found_block_list, bg_record);
			_handle_existing_block(bg_record);
		}
	}

	if (!full_created) {
		/* Adopt a recovered full-system block even though it
		 * was not listed in bluegene.conf. */
		list_iterator_reset(itr_curr);
		while ((init_bg_record = list_next(itr_curr))) {
			if (init_bg_record->full_block) {
				/* list_remove() (not delete_item) so
				 * the record itself survives and can be
				 * handed to bg_lists->main. */
				list_remove(itr_curr);
				bg_record = init_bg_record;

				list_append(bg_lists->main, bg_record);
				list_push(found_block_list, bg_record);

				_handle_existing_block(bg_record);

				break;
			}
		}
	}

	list_iterator_destroy(itr_conf);
	list_iterator_destroy(itr_curr);
	/* Success only if every recovered block was matched or
	 * adopted; leftovers mean state and config disagree. */
	if (!list_count(curr_block_list))
		rc = SLURM_SUCCESS;
	else
		rc = SLURM_ERROR;
	return rc;
}
| |
| static List _get_config(void) |
| { |
| config_key_pair_t *key_pair; |
| List my_list = list_create(destroy_config_key_pair); |
| |
| if (!my_list) |
| fatal("malloc failure on list_create"); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("DefaultConnType"); |
| key_pair->value = conn_type_string_full(bg_conf->default_conn_type); |
| list_append(my_list, key_pair); |
| |
| #ifndef HAVE_BG_FILES |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("Emulated"); |
| key_pair->value = xstrdup("yes"); |
| list_append(my_list, key_pair); |
| #endif |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("MaxBlockInError"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->max_block_err); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("MidPlaneNodeCnt"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->mp_cnode_cnt); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("NodeCPUCnt"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->cpu_ratio); |
| list_append(my_list, key_pair); |
| |
| #ifdef HAVE_BGL |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("BlrtsImage"); |
| key_pair->value = xstrdup(bg_conf->default_blrtsimage); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("LinuxImage"); |
| key_pair->value = xstrdup(bg_conf->default_linuximage); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("RamDiskImage"); |
| key_pair->value = xstrdup(bg_conf->default_ramdiskimage); |
| list_append(my_list, key_pair); |
| #elif defined HAVE_BGP |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("CnloadImage"); |
| key_pair->value = xstrdup(bg_conf->default_linuximage); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("IoloadImage"); |
| key_pair->value = xstrdup(bg_conf->default_ramdiskimage); |
| list_append(my_list, key_pair); |
| #endif |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("BridgeAPILogFile"); |
| key_pair->value = xstrdup(bg_conf->bridge_api_file); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("BridgeAPIVerbose"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->bridge_api_verb); |
| list_append(my_list, key_pair); |
| |
| if (bg_conf->deny_pass) { |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("DenyPassThrough"); |
| if (bg_conf->deny_pass & PASS_DENY_A) |
| xstrcat(key_pair->value, "A,"); |
| if (bg_conf->deny_pass & PASS_DENY_X) |
| xstrcat(key_pair->value, "X,"); |
| if (bg_conf->deny_pass & PASS_DENY_Y) |
| xstrcat(key_pair->value, "Y,"); |
| if (bg_conf->deny_pass & PASS_DENY_Z) |
| xstrcat(key_pair->value, "Z,"); |
| if (key_pair->value) |
| key_pair->value[strlen(key_pair->value)-1] = '\0'; |
| list_append(my_list, key_pair); |
| } |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("IONodesPerMP"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->ionodes_per_mp); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("LayoutMode"); |
| switch(bg_conf->layout_mode) { |
| case LAYOUT_STATIC: |
| key_pair->value = xstrdup("Static"); |
| break; |
| case LAYOUT_OVERLAP: |
| key_pair->value = xstrdup("Overlap"); |
| break; |
| case LAYOUT_DYNAMIC: |
| key_pair->value = xstrdup("Dynamic"); |
| break; |
| default: |
| key_pair->value = xstrdup("Unknown"); |
| break; |
| } |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("MloaderImage"); |
| key_pair->value = xstrdup(bg_conf->default_mloaderimage); |
| list_append(my_list, key_pair); |
| |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("NodeCardNodeCnt"); |
| key_pair->value = xstrdup_printf("%u", bg_conf->nodecard_cnode_cnt); |
| list_append(my_list, key_pair); |
| |
| if (bg_conf->sub_blocks) { |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("AllowSubBlockAllocations"); |
| key_pair->value = xstrdup("Yes"); |
| list_append(my_list, key_pair); |
| } |
| |
| if (bg_conf->sub_mp_sys) { |
| key_pair = xmalloc(sizeof(config_key_pair_t)); |
| key_pair->name = xstrdup("SubMidplaneSystem"); |
| key_pair->value = xstrdup("Yes"); |
| list_append(my_list, key_pair); |
| } |
| |
| list_sort(my_list, (ListCmpF) sort_key_pairs); |
| |
| return my_list; |
| } |
| #endif |
| |
| /* |
| * init() is called when the plugin is loaded, before any other functions |
| * are called. Put global initialization here. |
| */ |
extern int init(void)
{
	/* Plugin entry point.  Allocates bg_conf defaults when absent
	 * and, when loaded by slurmctld (bg_recover !=
	 * NOT_FROM_CONTROLLER), validates the build, sets up global
	 * lists, copies needed slurm.conf values and initializes the
	 * block allocator. */

#ifdef HAVE_BG
	if (!bg_conf) {
		/* This is needed on all systems where srun wraps the
		   bluegene calling program (i.e. runjob).
		*/
		bg_conf = xmalloc(sizeof(bg_config_t));
		/* set some defaults for most systems */
		bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt = 512;
		bg_conf->quarter_cnode_cnt = 128;
		bg_conf->nodecard_cnode_cnt = 32;
		bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt
			/ bg_conf->nodecard_cnode_cnt;
	}
	if (bg_recover != NOT_FROM_CONTROLLER) {
		/* Build-time sanity: dimension count must match the
		 * hardware family this plugin was compiled for. */
#if defined HAVE_BG_L_P && (SYSTEM_DIMENSIONS != 3)
		fatal("SYSTEM_DIMENSIONS value (%d) invalid for BlueGene",
		      SYSTEM_DIMENSIONS);
#elif defined HAVE_BGQ && (SYSTEM_DIMENSIONS != 4)
		fatal("SYSTEM_DIMENSIONS value (%d) invalid for BGQ",
		      SYSTEM_DIMENSIONS);
#endif

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		/* BGL's bridge API requires a configured DB2
		 * environment. */
		if (!getenv("CLASSPATH") || !getenv("DB2INSTANCE")
		    || !getenv("VWSPATH"))
			fatal("db2profile has not been "
			      "run to setup DB2 environment");

		if ((SELECT_COPROCESSOR_MODE != RM_PARTITION_COPROCESSOR_MODE)
		    || (SELECT_VIRTUAL_NODE_MODE
			!= RM_PARTITION_VIRTUAL_NODE_MODE))
			fatal("enum node_use_type out of sync with rm_api.h");
#endif
		/* Our enum values must mirror the IBM rm_api.h ones
		 * since they are passed through unchanged. */
		if ((SELECT_MESH != RM_MESH)
		    || (SELECT_TORUS != RM_TORUS)
		    || (SELECT_NAV != RM_NAV))
			fatal("enum conn_type out of sync with rm_api.h");
#endif

		verbose("%s loading...", plugin_name);
		/* if this is coming from something other than the controller
		   we don't want to read the config or anything like that. */
		_set_bg_lists();

		/* Copy what we need out of slurmctld_conf while
		 * holding its lock. */
		xfree(bg_conf->slurm_user_name);
		xfree(bg_conf->slurm_node_prefix);
		slurm_conf_lock();
		xassert(slurmctld_conf.slurm_user_name);
		xassert(slurmctld_conf.node_prefix);
		bg_conf->slurm_user_name =
			xstrdup(slurmctld_conf.slurm_user_name);
		bg_conf->slurm_node_prefix =
			xstrdup(slurmctld_conf.node_prefix);
		bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags;
		bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug;
		slurm_conf_unlock();

		/* (Re)create the image lists; safe if init() runs more
		 * than once. */
		if (bg_conf->blrts_list)
			list_destroy(bg_conf->blrts_list);
		bg_conf->blrts_list = list_create(destroy_image);
		if (bg_conf->linux_list)
			list_destroy(bg_conf->linux_list);
		bg_conf->linux_list = list_create(destroy_image);
		if (bg_conf->mloader_list)
			list_destroy(bg_conf->mloader_list);
		bg_conf->mloader_list = list_create(destroy_image);
		if (bg_conf->ramdisk_list)
			list_destroy(bg_conf->ramdisk_list);
		bg_conf->ramdisk_list = list_create(destroy_image);

		ba_init(NULL, 1);

		verbose("BlueGene plugin loaded successfully");
	}
	verbose("%s loaded", plugin_name);
#else
	if (bg_recover != NOT_FROM_CONTROLLER)
		fatal("select/bluegene is incompatible with a "
		      "non BlueGene system");
#endif
	return SLURM_SUCCESS;
}
| |
| extern int fini ( void ) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| ba_fini(); |
| |
| _destroy_bg_config(bg_conf); |
| _destroy_bg_lists(bg_lists); |
| |
| return rc; |
| } |
| |
| /* |
| * The remainder of this file implements the standard SLURM |
| * node selection API. |
| */ |
| |
/* Save block state to a checkpoint file under StateSaveLocation
 * (historically DB2 held authoritative BlueGene state) */
| extern int select_p_state_save(char *dir_name) |
| { |
| #ifdef HAVE_BG |
| ListIterator itr; |
| bg_record_t *bg_record = NULL; |
| int error_code = 0, log_fd; |
| char *old_file, *new_file, *reg_file; |
| uint32_t blocks_packed = 0, tmp_offset, block_offset; |
| Buf buffer = init_buf(BUF_SIZE); |
| slurmctld_lock_t job_read_lock = |
| { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; |
| |
| DEF_TIMERS; |
| |
| debug("bluegene: select_p_state_save"); |
| START_TIMER; |
| /* write header: time */ |
| packstr(BLOCK_STATE_VERSION, buffer); |
| block_offset = get_buf_offset(buffer); |
| pack32(blocks_packed, buffer); |
| |
| /* Lock job read before block to avoid deadlock job lock is |
| * needed because we look at the job_ptr's to send job info. */ |
| lock_slurmctld(job_read_lock); |
| /* write block records to buffer */ |
| slurm_mutex_lock(&block_state_mutex); |
| itr = list_iterator_create(bg_lists->main); |
| while ((bg_record = list_next(itr))) { |
| if (bg_record->magic != BLOCK_MAGIC) |
| continue; |
| |
| xassert(bg_record->bg_block_id != NULL); |
| |
| _pack_block(bg_record, buffer, SLURM_PROTOCOL_VERSION); |
| _pack_block_ext(bg_record, buffer, SLURM_PROTOCOL_VERSION); |
| blocks_packed++; |
| } |
| list_iterator_destroy(itr); |
| slurm_mutex_unlock(&block_state_mutex); |
| unlock_slurmctld(job_read_lock); |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, block_offset); |
| pack32(blocks_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| /* Maintain config read lock until we copy state_save_location *\ |
| \* unlock_slurmctld(part_read_lock); - see below */ |
| |
| /* write the buffer to file */ |
| slurm_conf_lock(); |
| old_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(old_file, "/block_state.old"); |
| reg_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(reg_file, "/block_state"); |
| new_file = xstrdup(slurmctld_conf.state_save_location); |
| xstrcat(new_file, "/block_state.new"); |
| slurm_conf_unlock(); |
| |
| log_fd = creat(new_file, 0600); |
| if (log_fd < 0) { |
| error("Can't save state, error creating file %s, %m", |
| new_file); |
| error_code = errno; |
| } else { |
| int pos = 0, nwrite = get_buf_offset(buffer), amount; |
| char *data = (char *)get_buf_data(buffer); |
| |
| while (nwrite > 0) { |
| amount = write(log_fd, &data[pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", new_file); |
| error_code = errno; |
| break; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| fsync(log_fd); |
| close(log_fd); |
| } |
| if (error_code) |
| (void) unlink(new_file); |
| else { /* file shuffle */ |
| (void) unlink(old_file); |
| if (link(reg_file, old_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| reg_file, old_file); |
| (void) unlink(reg_file); |
| if (link(new_file, reg_file)) |
| debug4("unable to create link for %s -> %s: %m", |
| new_file, reg_file); |
| (void) unlink(new_file); |
| } |
| xfree(old_file); |
| xfree(reg_file); |
| xfree(new_file); |
| |
| free_buf(buffer); |
| END_TIMER2("select_p_state_save"); |
| return SLURM_SUCCESS; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
extern int select_p_state_restore(char *dir_name)
{
#ifdef HAVE_BG
	/* Restore block state from the last checkpoint (under
	 * dir_name) and reconcile it with bluegene.conf; static
	 * layouts then get their defined blocks created.  Only runs
	 * once, at startup. */
	debug("bluegene: select_p_state_restore");

	/* found bg blocks already on system */
	List curr_block_list = NULL;
	List found_block_list = NULL;
	static time_t last_config_update = (time_t) 0;

	/* only run on startup */
	if (last_config_update)
		return SLURM_SUCCESS;

	last_config_update = time(NULL);
	curr_block_list = list_create(destroy_bg_record);
	found_block_list = list_create(NULL);
//#if 0
	/* Check to see if the configs we have are correct */
	if (_validate_config_blocks(curr_block_list, found_block_list, dir_name)
	    == SLURM_ERROR) {
		/* State and config disagree: remove recovered blocks
		 * that no longer apply. */
		_delete_old_blocks(curr_block_list, found_block_list);
	}
//#endif
	/* looking for blocks only I created */
	if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
		info("No blocks created until jobs are submitted");
	} else {
		if (create_defined_blocks(bg_conf->layout_mode,
					  found_block_list)
		    == SLURM_ERROR) {
			/* error in creating the static blocks, so
			 * blocks referenced by submitted jobs won't
			 * correspond to actual slurm blocks.
			 */
			fatal("Error, could not create the static blocks");
			/* presumably unreachable: fatal() is expected
			 * to terminate the daemon -- confirm */
			return SLURM_ERROR;
		}
	}

	list_destroy(curr_block_list);
	curr_block_list = NULL;
	list_destroy(found_block_list);
	found_block_list = NULL;

	slurm_mutex_lock(&block_state_mutex);
	last_bg_update = time(NULL);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("Blocks have finished being created.");
	return SLURM_SUCCESS;
#else
	return SLURM_ERROR;
#endif
}
| |
| /* Sync BG blocks to currently active jobs */ |
| extern int select_p_job_init(List job_list) |
| { |
| #ifdef HAVE_BG |
| int rc = sync_jobs(job_list); |
| |
| /* after we have synced the blocks then we say they are |
| created. */ |
| blocks_are_created = 1; |
| |
| return rc; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
extern bool select_p_node_ranking(struct node_record *node_ptr, int node_cnt)
{
	/* This plugin never re-orders the node table. */
	return false;
}
| |
| /* All initialization is performed by init() */ |
extern int select_p_node_init(struct node_record *node_ptr_array, int node_cnt)
{
#ifdef HAVE_BG
	/* Fill in per-midplane node data (threads/cores/cpus/memory)
	 * and link each node record to its base-allocation midplane
	 * (ba_mp) entry. */
	int i = 0;
	uint32_t real_memory, threads, cores;

	if (!node_ptr_array)
		return SLURM_SUCCESS;

	xassert(bg_conf);

	/* we need the amount of memory for a midplane */
	real_memory = bg_conf->mp_cnode_cnt;

	/* Set up some knowns that perhaps aren't all the way
	   in the slurm.conf.
	*/
#ifdef HAVE_BGL
	threads = 1;
	cores = 2;
	/* per-cnode memory multiplier; units presumably MB -- TODO
	 * confirm against node_record.real_memory semantics */
	real_memory *= 512;
#elif defined HAVE_BGP
	threads = 1;
	cores = 4;
	real_memory *= 2048;
#else
	/* BGQ */
	threads = 4;
	cores = 16;
	real_memory *= 16384;
#endif

	bg_conf->cpus_per_mp = bg_conf->mp_cnode_cnt * cores;

	for (i = 0; i < node_cnt; i++) {
		struct node_record *node_ptr = &node_ptr_array[i];
		select_nodeinfo_t *nodeinfo = NULL;

		if (!node_ptr->name)
			continue;

		/* Each "node" here is a midplane, so sockets carries
		 * the cnode count. */
		node_ptr->threads = threads;
		node_ptr->cores = cores;
		node_ptr->sockets = bg_conf->mp_cnode_cnt;
		node_ptr->config_ptr->cpus = node_ptr->cpus =
			bg_conf->cpus_per_mp;
		node_ptr->real_memory = real_memory;

		xassert(node_ptr->select_nodeinfo);
		nodeinfo = node_ptr->select_nodeinfo->data;
		xassert(nodeinfo);

		/* Map the node name to its ba_mp and propagate the
		 * node's state; skip names the allocator can't map. */
		slurm_mutex_lock(&ba_system_mutex);
		if (!(nodeinfo->ba_mp = str2ba_mp(node_ptr->name))) {
			slurm_mutex_unlock(&ba_system_mutex);
			continue;
		}
		nodeinfo->ba_mp->index = i;
		if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
			ba_update_mp_state(
				nodeinfo->ba_mp, node_ptr->node_state);
		nodeinfo->ba_mp->state = node_ptr->node_state;
		slurm_mutex_unlock(&ba_system_mutex);
	}

	return SLURM_SUCCESS;
#else
	return SLURM_ERROR;
#endif
}
| |
| /* |
| * Called by slurmctld when a new configuration file is loaded |
| * or scontrol is used to change block configuration |
| */ |
| extern int select_p_block_init(List part_list) |
| { |
| #ifdef HAVE_BG |
| /* select_p_node_init needs to be called before this to set |
| this up correctly |
| */ |
| |
| if (read_bg_conf() == SLURM_ERROR) { |
| fatal("Error, could not read the file"); |
| return SLURM_ERROR; |
| } |
| |
| if (part_list) { |
| struct part_record *part_ptr = NULL; |
| ListIterator itr = list_iterator_create(part_list); |
| while ((part_ptr = list_next(itr))) { |
| char *this_node_name; |
| hostlist_t host_list; |
| part_ptr->total_cpus = 0; |
| if (!part_ptr->nodes) /* no nodes in partition */ |
| continue; |
| |
| if (!(host_list = hostlist_create(part_ptr->nodes))) { |
| error("hostlist_create error on %s, %m", |
| part_ptr->nodes); |
| continue; |
| } |
| |
| while ((this_node_name = hostlist_shift(host_list))) { |
| struct node_record *node_ptr = |
| find_node_record(this_node_name); |
| if (node_ptr == NULL) { |
| error("select_p_block_init: " |
| "invalid node name %s", |
| this_node_name); |
| free(this_node_name); |
| hostlist_destroy(host_list); |
| continue; |
| } |
| free(this_node_name); |
| part_ptr->total_cpus += node_ptr->cpus; |
| } |
| hostlist_destroy(host_list); |
| |
| part_ptr->max_nodes = part_ptr->max_nodes_orig; |
| part_ptr->min_nodes = part_ptr->min_nodes_orig; |
| select_p_alter_node_cnt(SELECT_SET_MP_CNT, |
| &part_ptr->max_nodes); |
| select_p_alter_node_cnt(SELECT_SET_MP_CNT, |
| &part_ptr->min_nodes); |
| } |
| list_iterator_destroy(itr); |
| } |
| |
| return SLURM_SUCCESS; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| |
| /* |
| * select_p_job_test - Given a specification of scheduling requirements, |
 *	identify the nodes which "best" satisfy the request. The specified
 *	nodes may be DOWN or BUSY at the time of this test, as this may be
 *	used to determine if a job could ever run.
| * IN/OUT job_ptr - pointer to job being scheduled start_time is set |
| * when we can possibly start job. |
| * IN/OUT bitmap - usable nodes are set on input, nodes not required to |
| * satisfy the request are cleared, other left set |
| * IN min_nodes - minimum count of nodes |
| * IN max_nodes - maximum count of nodes (0==don't care) |
| * IN req_nodes - requested (or desired) count of nodes |
| * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now |
| * SELECT_MODE_TEST_ONLY: test if job can ever run |
| * SELECT_MODE_WILL_RUN: determine when and where job can run |
| * IN preemptee_candidates - List of pointers to jobs which can be preempted. |
| * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the |
| * jobs to be preempted to initiate the pending job. Not set |
| * if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL. |
| * RET zero on success, EINVAL otherwise |
| * NOTE: bitmap must be a superset of req_nodes at the time that |
| * select_p_job_test is called |
| */ |
| extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes, uint16_t mode, |
| List preemptee_candidates, |
| List *preemptee_job_list) |
| { |
| #ifdef HAVE_BG |
| /* submit_job - is there a block where we have: |
| * 1) geometry requested |
| * 2) min/max nodes (MPs) requested |
| * 3) type: TORUS or MESH or NAV (torus else mesh) |
| * |
| * note: we don't have to worry about security at this level |
| * as the SLURM block logic will handle access rights. |
| */ |
| |
| return submit_job(job_ptr, bitmap, min_nodes, max_nodes, |
| req_nodes, mode, preemptee_candidates, |
| preemptee_job_list); |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern int select_p_job_begin(struct job_record *job_ptr) |
| { |
| #ifdef HAVE_BG |
| return start_job(job_ptr); |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
extern int select_p_job_ready(struct job_record *job_ptr)
{
#ifdef HAVE_BG
	/* Report whether the job's block is ready for it to run.
	 * RET: 1 = ready; 0 = block belongs to another job/user;
	 * READY_JOB_ERROR = transient, caller should retry;
	 * READY_JOB_FATAL = block no longer exists. */
	int rc = 1;
	char *block_id = NULL;
	bg_record_t *bg_record = NULL;

	rc = get_select_jobinfo(job_ptr->select_jobinfo->data,
				SELECT_JOBDATA_BLOCK_ID, &block_id);
	if (rc == SLURM_SUCCESS) {
		slurm_mutex_lock(&block_state_mutex);
		bg_record = find_bg_record_in_list(bg_lists->main, block_id);

		if (bg_record) {
			uint32_t job_id = NO_JOB_RUNNING, uid = NO_VAL;
			struct job_record *found_job_ptr = NULL;

			/* Locate this job on the block: blocks running
			 * multiple (sub-block) jobs keep a job_list,
			 * otherwise a single job_ptr is used. */
			if (bg_record->job_list
			    && list_count(bg_record->job_list)) {
				ListIterator itr = list_iterator_create(
					bg_record->job_list);
				xassert(itr);
				while ((found_job_ptr = list_next(itr))) {
					if (found_job_ptr->magic != JOB_MAGIC) {
						/* Purge stale entries
						 * while we are here. */
						error("select_p_job_ready: "
						      "bad magic found when "
						      "looking at job %u",
						      job_ptr->job_id);
						list_delete_item(itr);
						continue;
					}

					if (found_job_ptr->job_id
					    == job_ptr->job_id)
						break;
				}
				list_iterator_destroy(itr);
			} else if (bg_record->job_ptr)
				found_job_ptr = bg_record->job_ptr;

			if (found_job_ptr) {
				job_id = found_job_ptr->job_id;
				uid = found_job_ptr->user_id;
			}

			if (job_id != job_ptr->job_id) {
				rc = 0;
			} else if (!bg_record->free_cnt
				   && (uid == job_ptr->user_id)
				   && (bg_record->state == BG_BLOCK_INITED)) {
				/* Clear the state just incase we
				 * missed it somehow. */
				job_ptr->job_state &= (~JOB_CONFIGURING);
				last_job_update = time(NULL);
				rc = 1;
			} else if (uid != job_ptr->user_id)
				rc = 0;
			else
				rc = READY_JOB_ERROR;	/* try again */
		} else {
			/* This means the block has been removed and
			   is no longer valid.  This could happen
			   often during an epilog on a busy system.
			*/
			debug2("block_ready: block %s not in bg_lists->main.",
			       block_id);
			rc = READY_JOB_FATAL;	/* fatal error */
		}
		slurm_mutex_unlock(&block_state_mutex);
	} else
		rc = READY_JOB_ERROR;
	/* info("returning %d for job %u block %s %d %d", */
	/*      rc, job_ptr->job_id, block_id, */
	/*      READY_JOB_ERROR, READY_JOB_FATAL); */
	xfree(block_id);
	return rc;
#else
	return SLURM_ERROR;
#endif
}
| |
| extern int select_p_job_resized(struct job_record *job_ptr, |
| struct node_record *node_ptr) |
| { |
| return ESLURM_NOT_SUPPORTED; |
| } |
| |
extern bool select_p_job_expand_allow(void)
{
	/* Block allocations cannot grow in place. */
	return false;
}
| |
| extern int select_p_job_expand(struct job_record *from_job_ptr, |
| struct job_record *to_job_ptr) |
| { |
| return ESLURM_NOT_SUPPORTED; |
| } |
| |
| extern int select_p_job_signal(struct job_record *job_ptr, int signal) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| extern int select_p_job_fini(struct job_record *job_ptr) |
| { |
| #ifdef HAVE_BG |
| return term_job(job_ptr); |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern int select_p_job_suspend(struct job_record *job_ptr, bool indf_susp) |
| { |
| return ESLURM_NOT_SUPPORTED; |
| } |
| |
| extern int select_p_job_resume(struct job_record *job_ptr, bool indf_susp) |
| { |
| return ESLURM_NOT_SUPPORTED; |
| } |
| |
| extern bitstr_t *select_p_step_pick_nodes(struct job_record *job_ptr, |
| select_jobinfo_t *step_jobinfo, |
| uint32_t node_count) |
| { |
| bitstr_t *picked_mps = NULL; |
| bg_record_t *bg_record = NULL; |
| char *tmp_char = NULL; |
| ba_mp_t *ba_mp = NULL; |
| select_jobinfo_t *jobinfo = NULL; |
| int dim; |
| |
| xassert(job_ptr); |
| |
| slurm_mutex_lock(&block_state_mutex); |
| jobinfo = job_ptr->select_jobinfo->data; |
| bg_record = jobinfo->bg_record; |
| |
| if (!bg_record) |
| fatal("This job %u does not have a bg block " |
| "assigned to it, but for some reason we are " |
| "trying to start a step on it?", |
| job_ptr->job_id); |
| else if (bg_record->magic != BLOCK_MAGIC) { |
| bg_record = find_bg_record_in_list( |
| bg_lists->main, jobinfo->bg_block_id); |
| if (!bg_record || (bg_record->magic != BLOCK_MAGIC)) { |
| error("select_p_step_pick_nodes: " |
| "Whoa, some how we got a bad block for job %u, " |
| "it should be %s but we couldn't find " |
| "it on the system, no step for you, " |
| "and ending job.", |
| job_ptr->job_id, jobinfo->bg_block_id); |
| slurm_mutex_unlock(&block_state_mutex); |
| bg_requeue_job(job_ptr->job_id, 0, 1); |
| return NULL; |
| } |
| error("select_p_step_pick_nodes: Whoa, some how we got a " |
| "bad block for job %u, it should be %s " |
| "(we found it so no big deal, but strange)", |
| job_ptr->job_id, jobinfo->bg_block_id); |
| jobinfo->bg_record = bg_record; |
| } else if ((bg_record->action == BG_BLOCK_ACTION_FREE) |
| && (bg_record->state == BG_BLOCK_INITED)) { |
| /* If we are in the action state of |
| FREE of 'D' since the block won't be able to run any future |
| jobs on it. |
| */ |
| info("select_p_step_pick_nodes: " |
| "Already selected block %s can't be used, " |
| "it has an action item of 'D' on it, ending job %u.", |
| bg_record->bg_block_id, job_ptr->job_id); |
| slurm_mutex_unlock(&block_state_mutex); |
| bg_requeue_job(job_ptr->job_id, 0, 1); |
| return NULL; |
| } |
| |
| xassert(!step_jobinfo->units_used); |
| |
| xfree(step_jobinfo->bg_block_id); |
| step_jobinfo->bg_block_id = xstrdup(bg_record->bg_block_id); |
| step_jobinfo->block_cnode_cnt = bg_record->cnode_cnt; |
| |
| if (((cluster_flags & CLUSTER_FLAG_BGL) |
| || (cluster_flags & CLUSTER_FLAG_BGP)) |
| || ((node_count == bg_record->cnode_cnt) |
| || (node_count > bg_conf->mp_cnode_cnt))) { |
| /* If we are using the whole block (or more than 1 |
| midplane of it) we need to verify |
| if anything else is used. If anything else is used |
| return NULL, else return that we can use the entire |
| thing. |
| On BGL/P This is always the default, no matter how |
| big the step is since you can only run 1 step per block. |
| */ |
| step_jobinfo->dim_cnt = jobinfo->dim_cnt; |
| if (list_count(job_ptr->step_list)) { |
| if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) |
| info("select_p_step_pick_nodes: Looking " |
| "for more than one midplane of " |
| "block %s for job %u, " |
| "but some of it is used.", |
| bg_record->bg_block_id, job_ptr->job_id); |
| goto end_it; |
| } |
| if (!(picked_mps = bit_copy(job_ptr->node_bitmap))) |
| fatal("bit_copy malloc failure"); |
| |
| if (cluster_flags & CLUSTER_FLAG_BGQ) { |
| bitstr_t *used_bitmap; |
| if (node_count > bg_conf->mp_cnode_cnt) { |
| /* Here we have to make sure nothing |
| else is able to run on this block |
| since we are using more than 1 |
| midplane but potentially not the |
| entire allocation. |
| */ |
| FREE_NULL_BITMAP(jobinfo->units_avail); |
| FREE_NULL_BITMAP(jobinfo->units_used); |
| jobinfo->units_avail = |
| ba_create_ba_mp_cnode_bitmap(bg_record); |
| jobinfo->units_used = |
| bit_copy(jobinfo->units_avail); |
| } |
| |
| if (jobinfo->units_avail) |
| used_bitmap = jobinfo->units_used; |
| else { |
| ba_mp = list_peek(bg_record->ba_mp_list); |
| xassert(ba_mp); |
| if (!ba_mp->cnode_bitmap) |
| ba_mp->cnode_bitmap = |
| ba_create_ba_mp_cnode_bitmap( |
| bg_record); |
| used_bitmap = ba_mp->cnode_bitmap; |
| } |
| /* units_used and units_avail will be the |
| same, the exact opposite of used_bitmap. |
| */ |
| step_jobinfo->units_used = bit_copy(used_bitmap); |
| bit_not(step_jobinfo->units_used); |
| step_jobinfo->units_avail = |
| bit_copy(step_jobinfo->units_used); |
| bit_or(used_bitmap, step_jobinfo->units_used); |
| } |
| |
| step_jobinfo->ionode_str = xstrdup(jobinfo->ionode_str); |
| } else if (jobinfo->units_avail) { |
| bitstr_t *total_bitmap = jobinfo->units_used; |
| ba_mp = list_peek(bg_record->ba_mp_list); |
| xassert(ba_mp); |
| if (ba_mp->cnode_err_bitmap) { |
| total_bitmap = bit_copy(jobinfo->units_used); |
| bit_or(total_bitmap, ba_mp->cnode_err_bitmap); |
| } |
| /* handle a sub-block allocation where the allocation |
| itself if a small block. |
| */ |
| step_jobinfo->cnode_cnt = node_count; |
| if (!(ba_sub_block_in_bitmap(step_jobinfo, total_bitmap, 1))) { |
| if (total_bitmap != jobinfo->units_used) |
| FREE_NULL_BITMAP(total_bitmap); |
| goto end_it; |
| } |
| |
| if (total_bitmap != jobinfo->units_used) |
| FREE_NULL_BITMAP(total_bitmap); |
| |
| node_count = step_jobinfo->cnode_cnt; |
| if (!(picked_mps = bit_copy(job_ptr->node_bitmap))) |
| fatal("bit_copy malloc failure"); |
| bit_or(jobinfo->units_used, step_jobinfo->units_used); |
| for (dim = 0; dim < step_jobinfo->dim_cnt; dim++) { |
| /* The IBM software works off a relative |
| position in the block instead of the |
| absolute position used in SLURM. |
| Since conn_type doesn't mean anything for a |
| step we can just overload it since it is getting |
| sent aready and we don't need to bloat |
| anything if we don't have to. |
| |
| So setting it here we can have both |
| absolute and relative. |
| |
| We don't need to add here since we are |
| always only dealing with a block that is 1 |
| midplane or less. |
| */ |
| step_jobinfo->conn_type[dim] = |
| step_jobinfo->start_loc[dim] |
| - bg_record->start_small[dim]; |
| } |
| } else if ((ba_mp = ba_sub_block_in_record( |
| bg_record, &node_count, step_jobinfo))) { |
| if (!(picked_mps = bit_alloc(bit_size(job_ptr->node_bitmap)))) |
| fatal("bit_copy malloc failure"); |
| bit_set(picked_mps, ba_mp->index); |
| for (dim = 0; dim < step_jobinfo->dim_cnt; dim++) { |
| /* The IBM software works off a relative |
| position in the block instead of the |
| absolute position used in SLURM. |
| Since conn_type doesn't mean anything for a |
| step we can just overload it since it is getting |
| sent aready and we don't need to bloat |
| anything if we don't have to. |
| |
| So setting it here we can have both |
| absolute and relative. |
| |
| We add here since if not using the first |
| midplane we have already setup the |
| conn_type to point to the starting point of |
| the relative position in the block. |
| */ |
| step_jobinfo->conn_type[dim] += |
| step_jobinfo->start_loc[dim] |
| - bg_record->start_small[dim]; |
| } |
| } |
| |
| if (picked_mps) { |
| if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) { |
| tmp_char = bitmap2node_name(picked_mps); |
| info("select_p_step_pick_nodes: new step for job %u " |
| "will be running on %s(%s)", |
| job_ptr->job_id, bg_record->bg_block_id, tmp_char); |
| xfree(tmp_char); |
| } |
| step_jobinfo->cnode_cnt = node_count; |
| } |
| |
| end_it: |
| |
| slurm_mutex_unlock(&block_state_mutex); |
| |
| return picked_mps; |
| } |
| |
/*
 * select_p_step_finish - release the c-node/midplane resources a step
 *	held on its block when the step completes.
 *
 * IN step_ptr - the completed step; its select_jobinfo describes which
 *	portion of the block (sub-block bitmaps or whole midplanes) it used.
 * RET SLURM_SUCCESS, or SLURM_ERROR if the block backing the step can no
 *	longer be found on the system.
 */
extern int select_p_step_finish(struct step_record *step_ptr)
{
	bg_record_t *bg_record = NULL;
	select_jobinfo_t *jobinfo = NULL, *step_jobinfo = NULL;
	int rc = SLURM_SUCCESS;
	char *tmp_char = NULL;

	xassert(step_ptr);


	/* If the whole job allocation is already being torn down the
	 * job-level cleanup will reclaim everything; nothing to do here. */
	if (IS_JOB_COMPLETING(step_ptr->job_ptr)) {
		debug("step completion %u.%u was received after job "
		      "allocation is already completing, no cleanup needed",
		      step_ptr->job_ptr->job_id, step_ptr->step_id);
		return SLURM_SUCCESS;
	}

	jobinfo = step_ptr->job_ptr->select_jobinfo->data;
	step_jobinfo = step_ptr->select_jobinfo->data;

	if (step_jobinfo->cnode_cnt > bg_conf->mp_cnode_cnt) {
		/* This means we were using units_avail and units_used
		   as midplanes not cnodes for either the whole job
		   allocation or a portion of it.
		*/
		FREE_NULL_BITMAP(jobinfo->units_avail);
		FREE_NULL_BITMAP(jobinfo->units_used);
	} else if (jobinfo->units_avail)
		/* Sub-block step inside a small-block allocation: just
		 * clear the step's cnodes out of the job's used bitmap. */
		rc = ba_sub_block_in_bitmap_clear(
			step_jobinfo, jobinfo->units_used);
	else {
		/* Step occupied a sub-block tracked on the block record
		 * itself; block state must be locked while touching it. */
		slurm_mutex_lock(&block_state_mutex);
		bg_record = jobinfo->bg_record;

		if (!bg_record)
			fatal("This step %u.%u does not have a bg block "
			      "assigned to it, but for some reason we are "
			      "trying to end the step?",
			      step_ptr->job_ptr->job_id, step_ptr->step_id);
		else if (bg_record->magic != BLOCK_MAGIC) {
			/* Cached pointer is stale; try to re-find the block
			 * by id in the main list before giving up. */
			bg_record = find_bg_record_in_list(
				bg_lists->main, jobinfo->bg_block_id);
			if (!bg_record || (bg_record->magic != BLOCK_MAGIC)) {
				error("select_p_step_finish: "
				      "Whoa, some how we got a bad block "
				      "for job %u, it should be %s but "
				      "we couldn't find it on the system, "
				      "so no real need to clear it up.",
				      step_ptr->job_ptr->job_id,
				      jobinfo->bg_block_id);
				slurm_mutex_unlock(&block_state_mutex);
				return SLURM_ERROR;
			}
			error("select_p_step_finish: Whoa, some how we "
			      "got a bad block for job %u, it should be %s "
			      "(we found it so no big deal, but strange)",
			      step_ptr->job_ptr->job_id, jobinfo->bg_block_id);
			/* Repair the cached pointer for later steps. */
			jobinfo->bg_record = bg_record;
		}
		rc = ba_sub_block_in_record_clear(bg_record, step_ptr);
		slurm_mutex_unlock(&block_state_mutex);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
		tmp_char = bitmap2node_name(step_ptr->step_node_bitmap);
		info("select_p_step_finish: step %u.%u cleared from %s",
		     step_ptr->job_ptr->job_id, step_ptr->step_id, tmp_char);
		xfree(tmp_char);
	}

	return rc;
}
| |
| /* The unpack for this is in common/slurm_protocol_pack.c */ |
| extern int select_p_pack_select_info(time_t last_query_time, |
| uint16_t show_flags, Buf *buffer_ptr, |
| uint16_t protocol_version) |
| { |
| #ifdef HAVE_BG |
| ListIterator itr; |
| bg_record_t *bg_record = NULL; |
| uint32_t blocks_packed = 0, tmp_offset; |
| Buf buffer; |
| |
| /* check to see if data has changed */ |
| if (last_query_time >= last_bg_update) { |
| debug2("Node select info hasn't changed since %ld", |
| last_bg_update); |
| return SLURM_NO_CHANGE_IN_DATA; |
| } else if (blocks_are_created) { |
| *buffer_ptr = NULL; |
| buffer = init_buf(HUGE_BUF_SIZE); |
| pack32(blocks_packed, buffer); |
| pack_time(last_bg_update, buffer); |
| |
| if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { |
| if (bg_lists->main) { |
| slurmctld_lock_t job_read_lock = |
| { NO_LOCK, READ_LOCK, |
| NO_LOCK, NO_LOCK }; |
| /* Lock job read before block to avoid |
| * deadlock job lock is needed because |
| * we look at the job_ptr's to send |
| * job info. */ |
| lock_slurmctld(job_read_lock); |
| slurm_mutex_lock(&block_state_mutex); |
| itr = list_iterator_create(bg_lists->main); |
| while ((bg_record = list_next(itr))) { |
| if (bg_record->magic != BLOCK_MAGIC) |
| continue; |
| _pack_block(bg_record, buffer, |
| protocol_version); |
| blocks_packed++; |
| } |
| list_iterator_destroy(itr); |
| slurm_mutex_unlock(&block_state_mutex); |
| unlock_slurmctld(job_read_lock); |
| } else { |
| error("select_p_pack_select_info: " |
| "no bg_lists->main"); |
| return SLURM_ERROR; |
| } |
| } |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, 0); |
| pack32(blocks_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| |
| *buffer_ptr = buffer; |
| } else { |
| error("select_p_pack_select_info: bg_lists->main not created " |
| "yet"); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo, |
| Buf buffer, |
| uint16_t protocol_version) |
| { |
| return select_nodeinfo_pack(nodeinfo, buffer, protocol_version); |
| } |
| |
| extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo, |
| Buf buffer, |
| uint16_t protocol_version) |
| { |
| return select_nodeinfo_unpack(nodeinfo, buffer, protocol_version); |
| } |
| |
| extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(void) |
| { |
| return select_nodeinfo_alloc(0); |
| } |
| |
| extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo) |
| { |
| return select_nodeinfo_free(nodeinfo); |
| } |
| |
| extern int select_p_select_nodeinfo_set_all(time_t last_query_time) |
| { |
| if (bg_recover != NOT_FROM_CONTROLLER) |
| bridge_status_init(); |
| |
| return select_nodeinfo_set_all(last_query_time); |
| } |
| |
| extern int select_p_select_nodeinfo_set(struct job_record *job_ptr) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo, |
| enum select_nodedata_type dinfo, |
| enum node_states state, |
| void *data) |
| { |
| return select_nodeinfo_get(nodeinfo, dinfo, state, data); |
| } |
| |
| extern select_jobinfo_t *select_p_select_jobinfo_alloc(void) |
| { |
| return alloc_select_jobinfo(); |
| } |
| |
| extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo, |
| enum select_jobdata_type data_type, |
| void *data) |
| { |
| return set_select_jobinfo(jobinfo, data_type, data); |
| } |
| |
| extern int select_p_select_jobinfo_get(select_jobinfo_t *jobinfo, |
| enum select_jobdata_type data_type, |
| void *data) |
| { |
| return get_select_jobinfo(jobinfo, data_type, data); |
| } |
| |
| extern select_jobinfo_t *select_p_select_jobinfo_copy(select_jobinfo_t *jobinfo) |
| { |
| return copy_select_jobinfo(jobinfo); |
| } |
| |
| extern int select_p_select_jobinfo_free(select_jobinfo_t *jobinfo) |
| { |
| return free_select_jobinfo(jobinfo); |
| } |
| |
| extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer, |
| uint16_t protocol_version) |
| { |
| return pack_select_jobinfo(jobinfo, buffer, protocol_version); |
| } |
| |
| extern int select_p_select_jobinfo_unpack(select_jobinfo_t **jobinfo, |
| Buf buffer, |
| uint16_t protocol_version) |
| { |
| return unpack_select_jobinfo(jobinfo, buffer, protocol_version); |
| } |
| |
| extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo, |
| char *buf, size_t size, int mode) |
| { |
| return sprint_select_jobinfo(jobinfo, buf, size, mode); |
| } |
| |
| extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo, |
| int mode) |
| { |
| return xstrdup_select_jobinfo(jobinfo, mode); |
| } |
| |
| extern int select_p_update_block(update_block_msg_t *block_desc_ptr) |
| { |
| #ifdef HAVE_BG |
| int rc = SLURM_SUCCESS; |
| bg_record_t *bg_record = NULL; |
| char reason[200]; |
| List kill_job_list = NULL; |
| kill_job_struct_t *freeit; |
| ListIterator itr; |
| |
| if (!block_desc_ptr->bg_block_id) { |
| error("update_block: No name specified"); |
| return ESLURM_INVALID_BLOCK_NAME; |
| } |
| |
| slurm_mutex_lock(&block_state_mutex); |
| bg_record = find_bg_record_in_list(bg_lists->main, |
| block_desc_ptr->bg_block_id); |
| if (!bg_record) { |
| error("update_block: block %s not found", |
| block_desc_ptr->bg_block_id); |
| slurm_mutex_unlock(&block_state_mutex); |
| return ESLURM_INVALID_BLOCK_NAME; |
| } |
| |
| if (block_desc_ptr->reason) |
| snprintf(reason, sizeof(reason), "%s", block_desc_ptr->reason); |
| else if (block_desc_ptr->state == BG_BLOCK_BOOTING) |
| snprintf(reason, sizeof(reason), |
| "update_block: " |
| "Admin recreated %s.", bg_record->bg_block_id); |
| else if (block_desc_ptr->state == BG_BLOCK_NAV) { |
| if (bg_record->conn_type[0] < SELECT_SMALL) |
| snprintf(reason, sizeof(reason), |
| "update_block: " |
| "Admin removed block %s", |
| bg_record->bg_block_id); |
| else |
| snprintf(reason, sizeof(reason), |
| "update_block: " |
| "Removed all blocks on midplane %s", |
| bg_record->mp_str); |
| |
| } else { |
| uint16_t state = bg_record->state; |
| |
| if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) |
| state |= BG_BLOCK_ERROR_FLAG; |
| else if (state & BG_BLOCK_ERROR_FLAG) |
| state &= (~BG_BLOCK_ERROR_FLAG); |
| else |
| state = block_desc_ptr->state; |
| |
| snprintf(reason, sizeof(reason), |
| "update_block: " |
| "Admin set block %s state to %s", |
| bg_record->bg_block_id, |
| bg_block_state_string(state)); |
| } |
| |
| /* First fail any job running on this block (Not for resume though) */ |
| if (block_desc_ptr->state != BG_BLOCK_TERM) { |
| if (bg_record->job_running > NO_JOB_RUNNING) { |
| if (!kill_job_list) |
| kill_job_list = |
| bg_status_create_kill_job_list(); |
| freeit = xmalloc(sizeof(kill_job_struct_t)); |
| freeit->jobid = bg_record->job_running; |
| list_push(kill_job_list, freeit); |
| } else if (bg_record->job_list |
| && list_count(bg_record->job_list)) { |
| struct job_record *job_ptr; |
| if (!kill_job_list) |
| kill_job_list = |
| bg_status_create_kill_job_list(); |
| itr = list_iterator_create(bg_record->job_list); |
| while ((job_ptr = list_next(itr))) { |
| if (job_ptr->magic != JOB_MAGIC) |
| continue; |
| freeit = xmalloc(sizeof(kill_job_struct_t)); |
| freeit->jobid = job_ptr->job_id; |
| list_push(kill_job_list, freeit); |
| } |
| list_iterator_destroy(itr); |
| } |
| } |
| |
| if (kill_job_list) { |
| slurm_mutex_unlock(&block_state_mutex); |
| bg_status_process_kill_job_list(kill_job_list, 0); |
| list_destroy(kill_job_list); |
| kill_job_list = NULL; |
| slurm_mutex_lock(&block_state_mutex); |
| if (!block_ptr_exist_in_list(bg_lists->main, bg_record)) { |
| slurm_mutex_unlock(&block_state_mutex); |
| error("while trying to put block in " |
| "error state it disappeared"); |
| return SLURM_ERROR; |
| } |
| } |
| |
| if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) { |
| bg_record_t *found_record = NULL; |
| List delete_list = list_create(NULL); |
| bool delete_it = 0; |
| |
| /* This loop shouldn't do much in regular Dynamic mode |
| since there shouldn't be overlapped blocks. But if |
| there is a trouble block that isn't going away and |
| we need to mark it in an error state there could be |
| blocks overlapped where we need to requeue the jobs. |
| */ |
| itr = list_iterator_create(bg_lists->main); |
| while ((found_record = list_next(itr))) { |
| if (bg_record == found_record) |
| continue; |
| |
| if (!blocks_overlap(bg_record, found_record)) { |
| debug2("block %s isn't part of errored %s", |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| continue; |
| } |
| if (found_record->job_running > NO_JOB_RUNNING) { |
| if (found_record->job_ptr |
| && IS_JOB_CONFIGURING( |
| found_record->job_ptr)) |
| info("Pending job %u on block %s " |
| "will try to be requeued " |
| "because overlapping block %s " |
| "is in an error state.", |
| found_record->job_running, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| else |
| info("Failing job %u on block %s " |
| "because overlapping block %s " |
| "is in an error state.", |
| found_record->job_running, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| |
| /* This job will be requeued in the |
| free_block_list code below, just |
| make note of it here. |
| */ |
| } else { |
| debug2("block %s is part of errored %s " |
| "but no running job", |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| } |
| resume_block(found_record); |
| list_push(delete_list, found_record); |
| } |
| list_iterator_destroy(itr); |
| slurm_mutex_unlock(&block_state_mutex); |
| if (bg_conf->layout_mode == LAYOUT_DYNAMIC) |
| delete_it = 1; |
| free_block_list(NO_VAL, delete_list, delete_it, 0); |
| list_destroy(delete_list); |
| put_block_in_error_state(bg_record, reason); |
| } else if (block_desc_ptr->state == BG_BLOCK_FREE) { |
| /* Resume the block first and then free the block */ |
| resume_block(bg_record); |
| |
| /* Increment free_cnt to make sure we don't loose this |
| * block since bg_free_block will unlock block_state_mutex. |
| */ |
| bg_record->free_cnt++; |
| bg_free_block(bg_record, 0, 1); |
| bg_record->free_cnt--; |
| slurm_mutex_unlock(&block_state_mutex); |
| } else if (block_desc_ptr->state == BG_BLOCK_TERM) { |
| /* This can't be RM_PARTITION_READY since the enum |
| changed from BGL to BGP and if we are running cross |
| cluster it just doesn't work. |
| */ |
| resume_block(bg_record); |
| slurm_mutex_unlock(&block_state_mutex); |
| } else if (bg_conf->layout_mode == LAYOUT_DYNAMIC |
| && (block_desc_ptr->state == BG_BLOCK_NAV)) { |
| /* This means remove the block from the system. If |
| the block is a small block we need to remove all the |
| blocks on that midplane. |
| */ |
| bg_record_t *found_record = NULL; |
| ListIterator itr; |
| List delete_list = list_create(NULL); |
| |
| list_push(delete_list, bg_record); |
| /* only do the while loop if we are dealing with a |
| small block */ |
| if (bg_record->conn_type[0] < SELECT_SMALL) |
| goto large_block; |
| |
| itr = list_iterator_create(bg_lists->main); |
| while ((found_record = list_next(itr))) { |
| if (bg_record == found_record) |
| continue; |
| |
| if (!bit_equal(bg_record->mp_bitmap, |
| found_record->mp_bitmap)) { |
| debug2("block %s isn't part of to be freed %s", |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| continue; |
| } |
| if (found_record->job_running > NO_JOB_RUNNING) { |
| if (found_record->job_ptr |
| && IS_JOB_CONFIGURING( |
| found_record->job_ptr)) |
| info("Pending job %u on block %s " |
| "will try to be requeued " |
| "because overlapping block %s " |
| "is being removed.", |
| found_record->job_running, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| else |
| info("Running job %u on block %s " |
| "will try to be requeued " |
| "because overlapping block %s " |
| "is being removed.", |
| found_record->job_running, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| /* This job will be requeued in the |
| free_block_list code below, just |
| make note of it here. |
| */ |
| } else if (found_record->job_list && |
| list_count(found_record->job_list)) { |
| struct job_record *job_ptr = NULL; |
| ListIterator job_itr = list_iterator_create( |
| found_record->job_list); |
| while ((job_ptr = list_next(job_itr))) { |
| if (job_ptr->magic != JOB_MAGIC) { |
| error("select_p_update_block: " |
| "bad magic found when " |
| "looking at block %s", |
| found_record-> |
| bg_block_id); |
| list_delete_item(itr); |
| continue; |
| } |
| if (IS_JOB_CONFIGURING(job_ptr)) |
| info("Pending job %u on " |
| "block %s " |
| "will try to be requeued " |
| "because related block %s " |
| "is in an error state.", |
| job_ptr->job_id, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| else |
| info("Running job %u on " |
| "block %s " |
| "will try to be requeued " |
| "because related block %s " |
| "is being removed.", |
| job_ptr->job_id, |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| /* This job will be requeued in the |
| free_block_list code below, just |
| make note of it here. |
| */ |
| } |
| list_iterator_destroy(job_itr); |
| } else { |
| debug2("block %s is part of to be freed %s " |
| "but no running job", |
| found_record->bg_block_id, |
| bg_record->bg_block_id); |
| } |
| list_push(delete_list, found_record); |
| } |
| list_iterator_destroy(itr); |
| |
| large_block: |
| /* make sure if we are removing a block to put it back |
| to a normal state in accounting first */ |
| itr = list_iterator_create(delete_list); |
| while ((found_record = list_next(itr))) { |
| if (found_record->state & BG_BLOCK_ERROR_FLAG) |
| resume_block(found_record); |
| } |
| list_iterator_destroy(itr); |
| |
| slurm_mutex_unlock(&block_state_mutex); |
| free_block_list(NO_VAL, delete_list, 1, 0); |
| list_destroy(delete_list); |
| } else if (block_desc_ptr->state == BG_BLOCK_BOOTING) { |
| /* This means recreate the block, remove it and then |
| recreate it. |
| */ |
| |
| /* make sure if we are removing a block to put it back |
| to a normal state in accounting first */ |
| if (bg_record->state & BG_BLOCK_ERROR_FLAG) |
| resume_block(bg_record); |
| |
| term_jobs_on_block(bg_record->bg_block_id); |
| if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) |
| info("select_p_update_block: " |
| "freeing the block %s.", bg_record->bg_block_id); |
| /* Increment free_cnt to make sure we don't loose this |
| * block since bg_free_block will unlock block_state_mutex. |
| */ |
| bg_record->free_cnt++; |
| bg_free_block(bg_record, 1, 1); |
| bg_record->free_cnt--; |
| if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) |
| info("select_p_update_block: done"); |
| |
| /* Now remove it from the main list since we are |
| looking for a state change and it won't be caught |
| unless it is in the main list until now. |
| */ |
| remove_from_bg_list(bg_lists->main, bg_record); |
| |
| #if defined HAVE_BG_FILES |
| if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) |
| info("select_p_update_block: " |
| "removing %s from database", |
| bg_record->bg_block_id); |
| |
| rc = bridge_block_remove(bg_record); |
| if (rc != SLURM_SUCCESS) { |
| if (rc == BG_ERROR_BLOCK_NOT_FOUND) { |
| debug("select_p_update_block: " |
| "block %s is not found", |
| bg_record->bg_block_id); |
| } else { |
| error("select_p_update_block: " |
| "rm_remove_partition(%s): %s", |
| bg_record->bg_block_id, |
| bg_err_str(rc)); |
| } |
| } else |
| if (bg_conf->slurm_debug_flags |
| & DEBUG_FLAG_SELECT_TYPE) |
| info("select_p_update_block: done %s", |
| (char *)bg_record->bg_block_id); |
| #endif |
| xfree(bg_record->bg_block_id); |
| if (bridge_block_create(bg_record) == SLURM_ERROR) { |
| destroy_bg_record(bg_record); |
| error("select_p_update_block: " |
| "unable to configure block in api"); |
| } else { |
| print_bg_record(bg_record); |
| list_append(bg_lists->main, bg_record); |
| sort_bg_record_inc_size(bg_lists->main); |
| } |
| |
| slurm_mutex_unlock(&block_state_mutex); |
| } else { |
| slurm_mutex_unlock(&block_state_mutex); |
| error("state is ? %s", |
| bg_block_state_string(block_desc_ptr->state)); |
| return ESLURM_INVALID_NODE_STATE; |
| } |
| |
| /* info("%s", reason); */ |
| last_bg_update = time(NULL); |
| |
| return rc; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
/*
 * select_p_update_sub_node - admin request to set part of a midplane
 *	(a set of ionodes) into an error or free state.
 *
 * IN block_desc_ptr - mp_str holds "<coords>[<ionode range>]", e.g.
 *	"000[0-3]"; state is BG_BLOCK_ERROR_FLAG or BG_BLOCK_FREE.
 * RET SLURM_SUCCESS or a SLURM/ESLURM error code.
 *
 * Only valid on dynamically laid-out systems.
 */
extern int select_p_update_sub_node (update_block_msg_t *block_desc_ptr)
{
#ifdef HAVE_BG
	int rc = SLURM_SUCCESS;
	int i = 0, j = 0;
	char coord[SYSTEM_DIMENSIONS+1], *node_name = NULL;
	char ionodes[128];
	int set = 0;	/* 1 after coords parsed, 2 after ionode range too */
	double nc_pos = 0, last_pos = -1;
	bitstr_t *ionode_bitmap = NULL;
	char *name = NULL;

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		info("You can't use this call unless you are on a Dynamically "
		     "allocated system.  Please use update BlockName instead");
		rc = ESLURM_INVALID_BLOCK_LAYOUT;
		goto end_it;
	}

	memset(coord, 0, sizeof(coord));
	memset(ionodes, 0, 128);
	if (!block_desc_ptr->mp_str) {
		error("update_sub_node: No name specified");
		rc = ESLURM_INVALID_BLOCK_NAME;
		goto end_it;
	}
	name = block_desc_ptr->mp_str;

	/* Parse "<coords>[<ionodes>]": first the midplane coordinates,
	 * then the bracketed ionode range. */
	while (name[j] != '\0') {
		if (name[j] == '[') {
			/* '[' before any coordinates is malformed. */
			if (set<1) {
				rc = SLURM_ERROR;
				goto end_it;
			}
			i = j++;
			/* Require an alphanumeric (0-9, A-Z) right after
			 * the '[' — an empty sub-block spec is an error. */
			if ((name[j] < '0'
			     || name[j] > 'Z'
			     || (name[j] > '9'
				 && name[j] < 'A'))) {
				error("update_sub_node: sub block is empty");
				rc = SLURM_ERROR;
				goto end_it;
			}
			/* Scan forward for the matching ']'. */
			while (name[i] != '\0') {
				if (name[i] == ']')
					break;
				i++;
			}
			if (name[i] != ']') {
				error("update_sub_node: "
				      "No close (']') on sub block");
				rc = SLURM_ERROR;
				goto end_it;
			}

			/* NOTE(review): range text is assumed to fit in
			 * ionodes[128]; not bounds-checked here. */
			strncpy(ionodes, name+j, i-j);
			set++;
			break;
		} else if ((name[j] >= '0'
			    && name[j] <= '9')
			   || (name[j] >= 'A'
			       && name[j] <= 'Z')) {
			/* Coordinates may only appear once, before '['. */
			if (set) {
				rc = SLURM_ERROR;
				goto end_it;
			}
			/* make sure we are asking for a correct name */
			for(i = 0; i < SYSTEM_DIMENSIONS; i++) {
				if ((name[j+i] >= '0'
				     && name[j+i] <= '9')
				    || (name[j+i] >= 'A'
					&& name[j+i] <= 'Z'))
					continue;

				error("update_sub_node: "
				      "misformatted name given %s",
				      name);
				rc = SLURM_ERROR;
				goto end_it;
			}

			strncpy(coord, name+j,
				SYSTEM_DIMENSIONS);
			j += SYSTEM_DIMENSIONS-1;
			set++;
		}
		j++;
	}

	if (set != 2) {
		error("update_sub_node: "
		      "I didn't get the base partition and the sub part.");
		rc = SLURM_ERROR;
		goto end_it;
	}
	ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_unfmt(ionode_bitmap, ionodes);
	/* An empty bitmap means the range string didn't parse. */
	if (bit_ffs(ionode_bitmap) == -1) {
		error("update_sub_node: Invalid ionode '%s' given.", ionodes);
		rc = SLURM_ERROR;
		FREE_NULL_BITMAP(ionode_bitmap);
		goto end_it;
	}
	node_name = xstrdup_printf("%s%s", bg_conf->slurm_node_prefix, coord);
	/* find out how many nodecards to get for each ionode */
	if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) {
		info("Admin setting %s[%s] in an error state",
		     node_name, ionodes);
		/* Walk the ionodes; down each nodecard at most once by
		 * tracking the last nodecard position handled. */
		for(i = 0; i<bg_conf->ionodes_per_mp; i++) {
			if (bit_test(ionode_bitmap, i)) {
				if ((int)nc_pos != (int)last_pos) {
					/* find first bit in nc */
					int start_io =
						(int)nc_pos * bg_conf->io_ratio;
					down_nodecard(node_name, start_io,
						      0, NULL);
					last_pos = nc_pos;
				}
			}
			nc_pos += bg_conf->nc_ratio;
		}
	} else if (block_desc_ptr->state == BG_BLOCK_FREE) {
		info("Admin setting %s[%s] in an free state",
		     node_name, ionodes);
		up_nodecard(node_name, ionode_bitmap);
	} else {
		error("update_sub_node: Unknown state %s",
		      bg_block_state_string(block_desc_ptr->state));
		rc = ESLURM_INVALID_BLOCK_STATE;
	}

	FREE_NULL_BITMAP(ionode_bitmap);
	xfree(node_name);

	last_bg_update = time(NULL);
end_it:
	return rc;
#else
	return SLURM_ERROR;
#endif
}
| |
/* While the realtime server should get all the cnode state changes on
 * older versions of the IBM driver if a job has a timeout it doesn't
 * always happen.  So what happens is the runjob_mux will now send a
 * nice cancel to the slurmctld to make sure it gets marked.
 */
/*
 * select_p_fail_cnode - mark the c-nodes a step ran on as being in an
 *	error state, then recompute the error count/ratio of every block
 *	that overlaps the step's midplanes.
 *
 * IN step_ptr - step whose c-nodes failed.
 * RET SLURM_SUCCESS, or SLURM_ERROR if the step mapped to no midplane.
 */
extern int select_p_fail_cnode(struct step_record *step_ptr)
{
#if defined HAVE_BG && !defined HAVE_BG_L_P
	bg_record_t *bg_record;
	select_nodeinfo_t *nodeinfo;
	select_jobinfo_t *jobinfo;
	select_jobinfo_t *step_jobinfo;
	struct node_record *node_ptr = NULL;
	ListIterator itr, itr2;
	ba_mp_t *ba_mp = NULL, *found_ba_mp;
	int i;

	xassert(step_ptr);

	jobinfo = step_ptr->job_ptr->select_jobinfo->data;
	step_jobinfo = step_ptr->select_jobinfo->data;

	/* block_state must be locked before ba_system */
	slurm_mutex_lock(&block_state_mutex);
	slurm_mutex_lock(&ba_system_mutex);
	/* Mark the error bitmap on every midplane the step touched. */
	for (i=0; i<bit_size(step_ptr->step_node_bitmap); i++) {
		if (!bit_test(step_ptr->step_node_bitmap, i))
			continue;
		ba_mp = ba_inx2ba_mp(i);
		xassert(ba_mp);

		if (!ba_mp->cnode_err_bitmap)
			ba_mp->cnode_err_bitmap =
				bit_alloc(bg_conf->mp_cnode_cnt);

		if (jobinfo->units_avail) {
			/* Sub-block step: only its own c-nodes failed. */
			bit_or(ba_mp->cnode_err_bitmap,
			       step_jobinfo->units_used);
		} else {
			/* Whole-midplane step: fail every c-node. */
			bit_nset(ba_mp->cnode_err_bitmap, 0,
				 bit_size(ba_mp->cnode_err_bitmap)-1);
		}
		/* Refresh the node's human-readable failed-cnode list. */
		node_ptr = &(node_record_table_ptr[ba_mp->index]);
		xassert(node_ptr->select_nodeinfo);
		nodeinfo = (select_nodeinfo_t *)node_ptr->select_nodeinfo->data;
		xassert(nodeinfo);
		xfree(nodeinfo->failed_cnodes);
		nodeinfo->failed_cnodes = ba_node_map_ranged_hostlist(
			ba_mp->cnode_err_bitmap, ba_mp_geo_system);
	}

	if (!ba_mp) {
		error("select_p_fail_cnode: no ba_mp? "
		      "This should never happen");
		slurm_mutex_unlock(&ba_system_mutex);
		slurm_mutex_unlock(&block_state_mutex);
		return SLURM_ERROR;
	}

	/* Propagate the error bits into every block overlapping the
	 * step and recompute each block's error count and ratio. */
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = (bg_record_t *)list_next(itr))) {
		float err_ratio;
		if (!bit_overlap(step_ptr->step_node_bitmap,
				 bg_record->mp_bitmap))
			continue;
		bg_record->cnode_err_cnt = 0;
		itr2 = list_iterator_create(bg_record->ba_mp_list);
		while ((found_ba_mp = (ba_mp_t *)list_next(itr2))) {

			if (!found_ba_mp->used
			    || !bit_test(step_ptr->step_node_bitmap,
					 found_ba_mp->index))
				continue;

			/* perhaps this block isn't involved in this
			   error */
			if (jobinfo->units_avail
			    && found_ba_mp->cnode_usable_bitmap
			    && bit_overlap(found_ba_mp->cnode_usable_bitmap,
					   ba_mp->cnode_err_bitmap))
				continue;

			if (!found_ba_mp->cnode_err_bitmap)
				found_ba_mp->cnode_err_bitmap =
					bit_alloc(bg_conf->mp_cnode_cnt);

			bit_or(found_ba_mp->cnode_err_bitmap,
			       ba_mp->cnode_err_bitmap);
			bg_record->cnode_err_cnt +=
				bit_set_count(found_ba_mp->cnode_err_bitmap);
		}
		list_iterator_destroy(itr2);

		err_ratio = (float)bg_record->cnode_err_cnt
			/ (float)bg_record->cnode_cnt;
		bg_record->err_ratio = err_ratio * 100;

		/* handle really small ratios */
		if (!bg_record->err_ratio && bg_record->cnode_err_cnt)
			bg_record->err_ratio = 1;

		debug("select_p_fail_cnode: "
		      "count in error for %s is %u with ratio at %u",
		      bg_record->bg_block_id,
		      bg_record->cnode_err_cnt,
		      bg_record->err_ratio);

	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&ba_system_mutex);
	slurm_mutex_unlock(&block_state_mutex);
#endif
	return SLURM_SUCCESS;
}
| |
| extern int select_p_get_info_from_plugin (enum select_plugindata_info dinfo, |
| struct job_record *job_ptr, |
| void *data) |
| { |
| #ifdef HAVE_BG |
| uint16_t *tmp16 = (uint16_t *) data; |
| uint32_t *tmp32 = (uint32_t *) data; |
| List *tmp_list = (List *) data; |
| int rc = SLURM_SUCCESS; |
| |
| switch(dinfo) { |
| case SELECT_CR_PLUGIN: |
| *tmp32 = 0; |
| break; |
| case SELECT_STATIC_PART: |
| if (bg_conf->layout_mode == LAYOUT_DYNAMIC) |
| *tmp16 = 0; |
| else /* LAYOUT_STATIC || LAYOUT_OVERLAP */ |
| *tmp16 = 1; |
| break; |
| |
| case SELECT_CONFIG_INFO: |
| *tmp_list = _get_config(); |
| break; |
| default: |
| error("select_p_get_info_from_plugin info %d invalid", |
| dinfo); |
| rc = SLURM_ERROR; |
| break; |
| } |
| |
| return rc; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern int select_p_update_node_config (int index) |
| { |
| #ifdef HAVE_BG |
| return SLURM_SUCCESS; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern int select_p_update_node_state(struct node_record *node_ptr) |
| { |
| #ifdef HAVE_BG |
| ba_mp_t *curr_mp; |
| int rc = SLURM_SUCCESS; |
| |
| xassert(node_ptr); |
| |
| slurm_mutex_lock(&ba_system_mutex); |
| if ((curr_mp = str2ba_mp(node_ptr->name))) |
| ba_update_mp_state(curr_mp, node_ptr->node_state); |
| else |
| rc = SLURM_ERROR; |
| slurm_mutex_unlock(&ba_system_mutex); |
| return rc; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
/*
 * select_p_alter_node_cnt - convert between the c-node based counts users
 *	specify and the midplane based counts the scheduler works with.
 * IN type - which conversion to perform (enum select_node_cnt)
 * IN/OUT data - interpreted according to "type": a uint16_t *, a
 *	uint32_t *, or a job_desc_msg_t *, updated in place
 * RET SLURM_SUCCESS, or SLURM_ERROR for an impossible resource request
 */
extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data)
{
#ifdef HAVE_BG
	/* "data" aliases below; only the one matching "type" is valid. */
	job_desc_msg_t *job_desc = (job_desc_msg_t *)data;
	uint16_t *cpus = (uint16_t *)data;
	uint32_t *nodes = (uint32_t *)data, tmp = 0;
	int i;
	uint16_t req_geometry[SYSTEM_DIMENSIONS];

	if (!bg_conf->mp_cnode_cnt) {
		fatal("select_p_alter_node_cnt: This can't be called "
		      "before init");
	}

	switch (type) {
	case SELECT_GET_NODE_SCALING:
		/* Return the number of c-nodes in one midplane. */
		if ((*nodes) != INFINITE) {
			if (bg_conf->sub_mp_sys)
				(*nodes) = bg_conf->actual_cnodes_per_mp;
			else
				(*nodes) = bg_conf->mp_cnode_cnt;
		}
		break;
	case SELECT_GET_NODE_CPU_CNT:
		/* Return the CPU count of one c-node. */
		if ((*cpus) != (uint16_t)INFINITE)
			(*cpus) = bg_conf->cpu_ratio;
		break;
	case SELECT_GET_MP_CPU_CNT:
		/* Return the CPU count of one midplane. */
		if ((*nodes) != INFINITE)
			(*nodes) = bg_conf->cpus_per_mp;
		break;
	case SELECT_SET_MP_CNT:
		/* Convert a c-node count to a midplane count (minimum 1);
		 * INFINITE and NO_VAL pass through unchanged. */
		if (((*nodes) == INFINITE) || ((*nodes) == NO_VAL))
			tmp = (*nodes);
		else if ((*nodes) > bg_conf->mp_cnode_cnt) {
			tmp = (*nodes);
			tmp /= bg_conf->mp_cnode_cnt;
			if (tmp < 1)
				tmp = 1;
		} else
			tmp = 1;
		(*nodes) = tmp;
		break;
	case SELECT_APPLY_NODE_MIN_OFFSET:
		/* Scale a minimum midplane count back up to c-nodes. */
		if ((*nodes) == 1) {
			/* Job will actually get more than one c-node,
			 * but we can't be sure exactly how much so we
			 * don't scale up this value. */
			break;
		}
		if (bg_conf->sub_mp_sys)
			(*nodes) = bg_conf->actual_cnodes_per_mp;
		else
			(*nodes) *= bg_conf->mp_cnode_cnt;
		break;
	case SELECT_APPLY_NODE_MAX_OFFSET:
		/* Scale a maximum midplane count back up to c-nodes. */
		if ((*nodes) != INFINITE) {
			if (bg_conf->sub_mp_sys)
				(*nodes) = bg_conf->actual_cnodes_per_mp;
			else
				(*nodes) *= bg_conf->mp_cnode_cnt;
		}
		break;
	case SELECT_SET_NODE_CNT:
		/* Normalize a job request's node/cpu counts.  Done only
		 * once per request: the ALTERED flag guards re-entry. */
		get_select_jobinfo(job_desc->select_jobinfo->data,
				   SELECT_JOBDATA_ALTERED, &tmp);
		if (tmp == 1) {
			return SLURM_SUCCESS;
		}
		tmp = 1;
		set_select_jobinfo(job_desc->select_jobinfo->data,
				   SELECT_JOBDATA_ALTERED, &tmp);

		if (job_desc->min_nodes == (uint32_t) NO_VAL)
			return SLURM_SUCCESS;
		else if ((job_desc->min_nodes == 1)
			 && (job_desc->min_cpus != NO_VAL)) {
			/* A 1-node request with an explicit cpu count:
			 * derive the real node count from the cpus (and
			 * tasks-per-node when given). */
			job_desc->min_nodes = job_desc->min_cpus;
			if (job_desc->ntasks_per_node
			    && job_desc->ntasks_per_node != (uint16_t)NO_VAL)
				job_desc->min_nodes /=
					job_desc->ntasks_per_node;
		}

		/* An explicit geometry fixes both min and max node count:
		 * product of the dimensions, in midplanes, times the
		 * c-nodes per midplane. */
		get_select_jobinfo(job_desc->select_jobinfo->data,
				   SELECT_JOBDATA_GEOMETRY, &req_geometry);

		if (req_geometry[0] != 0
		    && req_geometry[0] != (uint16_t)NO_VAL) {
			job_desc->min_nodes = 1;
			for (i=0; i<SYSTEM_DIMENSIONS; i++)
				job_desc->min_nodes *=
					(uint16_t)req_geometry[i];
			job_desc->min_nodes *= bg_conf->mp_cnode_cnt;
			job_desc->max_nodes = job_desc->min_nodes;
		}

		/* make sure if the user only specified min_cpus to
		   set min_nodes correctly
		*/
		if ((job_desc->min_cpus != NO_VAL)
		    && (job_desc->min_cpus > job_desc->min_nodes)) {
			/* Round the cpu-derived node count up to the next
			 * whole node. */
			float tmp_float = (float)job_desc->min_cpus
				/ (float)bg_conf->cpu_ratio;

			tmp = (uint32_t)tmp_float;
			if (tmp_float != (float)tmp)
				tmp++;
			if (tmp > job_desc->min_nodes) {
				/* This means they actually asked for
				   nodes and tasks.
				*/
				if ((job_desc->max_nodes != NO_VAL)
				    && (tmp > job_desc->max_nodes)) {
#ifndef HAVE_BG_L_P
					float divisor = 0;
					/* ntasks_per_node should be
					 * validated beforehand. */
					if (job_desc->ntasks_per_node
					    && (job_desc->ntasks_per_node
						!= (uint16_t)NO_VAL))
						divisor = (float)job_desc->
							ntasks_per_node
							/ bg_conf->cpu_ratio;
					/* On Q systems you can have 2
					   processes per thread */
					if (!divisor || divisor > 2) {
						error("Asking for more "
						      "resources than "
						      "possible. Denied.");
						return SLURM_ERROR;
					} else
						tmp /= divisor;
#else
					error("Asking for more resources than "
					      "possible. Requested %u nodes "
					      "and %u "
					      "tasks, giving them %u nodes.",
					      job_desc->min_nodes,
					      job_desc->min_cpus, tmp);
#endif
				}
				job_desc->min_nodes = tmp;
			}
		}

		/* initialize min_cpus to the min_nodes */
		job_desc->min_cpus = job_desc->min_nodes * bg_conf->cpu_ratio;

		if ((job_desc->max_nodes == (uint32_t) NO_VAL)
		    || (job_desc->max_nodes < job_desc->min_nodes))
			job_desc->max_nodes = job_desc->min_nodes;

		/* See if min_nodes is greater than one base partition */
		if (job_desc->min_nodes > bg_conf->mp_cnode_cnt) {
			/*
			 * if it is make sure it is a factor of
			 * bg_conf->mp_cnode_cnt, if it isn't make it
			 * that way
			 */
			tmp = job_desc->min_nodes % bg_conf->mp_cnode_cnt;
			if (tmp > 0)
				job_desc->min_nodes +=
					(bg_conf->mp_cnode_cnt-tmp);
		}
		tmp = job_desc->min_nodes / bg_conf->mp_cnode_cnt;

		/* this means it is greater or equal to one mp */
		if (tmp > 0) {
			/* Record the full c-node count in the jobinfo, then
			 * express min_nodes/min_cpus in midplanes. */
			set_select_jobinfo(job_desc->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &job_desc->min_nodes);
			job_desc->min_nodes = tmp;
			job_desc->min_cpus = bg_conf->cpus_per_mp * tmp;
		} else {
#ifdef HAVE_BGL
			/* Sub-midplane job on BG/L: round the request up to
			 * the smallest supported block (nodecard, quarter,
			 * or full midplane). */
			if (job_desc->min_nodes <= bg_conf->nodecard_cnode_cnt
			    && bg_conf->nodecard_ionode_cnt)
				job_desc->min_nodes =
					bg_conf->nodecard_cnode_cnt;
			else if (job_desc->min_nodes
				 <= bg_conf->quarter_cnode_cnt)
				job_desc->min_nodes =
					bg_conf->quarter_cnode_cnt;
			else
				job_desc->min_nodes =
					bg_conf->mp_cnode_cnt;

			set_select_jobinfo(job_desc->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &job_desc->min_nodes);

			tmp = bg_conf->mp_cnode_cnt/job_desc->min_nodes;

			job_desc->min_cpus = bg_conf->cpus_per_mp/tmp;
			job_desc->min_nodes = 1;
#else
			/* If it is allowed to run sub block allocations then
			   an allocation can be any size. If it doesn't line
			   up with a geometry it will be massaged later.
			*/
			if (!bg_conf->sub_blocks) {
				/* Round up to the next power-of-two block
				 * size, starting from the smallest block. */
				i = bg_conf->smallest_block;
				while (i <= bg_conf->mp_cnode_cnt) {
					if (job_desc->min_nodes <= i) {
						job_desc->min_nodes = i;
						break;
					}
					i *= 2;
				}
			}

			set_select_jobinfo(job_desc->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &job_desc->min_nodes);
			job_desc->min_cpus = job_desc->min_nodes
				* bg_conf->cpu_ratio;
			job_desc->min_nodes = 1;
#endif
		}

		/* Repeat the same normalization for max_nodes. */
		if (job_desc->max_nodes > bg_conf->mp_cnode_cnt) {
			tmp = job_desc->max_nodes % bg_conf->mp_cnode_cnt;
			if (tmp > 0)
				job_desc->max_nodes +=
					(bg_conf->mp_cnode_cnt-tmp);
		}
		tmp = job_desc->max_nodes / bg_conf->mp_cnode_cnt;

		if (tmp > 0) {
			job_desc->max_nodes = tmp;
			job_desc->max_cpus =
				job_desc->max_nodes * bg_conf->cpus_per_mp;
			tmp = NO_VAL;
		} else {
#ifdef HAVE_BGL
			if (job_desc->max_nodes <= bg_conf->nodecard_cnode_cnt
			    && bg_conf->nodecard_ionode_cnt)
				job_desc->max_nodes =
					bg_conf->nodecard_cnode_cnt;
			else if (job_desc->max_nodes
				 <= bg_conf->quarter_cnode_cnt)
				job_desc->max_nodes =
					bg_conf->quarter_cnode_cnt;
			else
				job_desc->max_nodes =
					bg_conf->mp_cnode_cnt;

			tmp = bg_conf->mp_cnode_cnt/job_desc->max_nodes;
			job_desc->max_cpus = bg_conf->cpus_per_mp/tmp;
			job_desc->max_nodes = 1;
#else
			if (!bg_conf->sub_blocks) {
				i = bg_conf->smallest_block;
				while (i <= bg_conf->mp_cnode_cnt) {
					if (job_desc->max_nodes <= i) {
						job_desc->max_nodes = i;
						break;
					}
					i *= 2;
				}
			}
			job_desc->max_cpus =
				job_desc->max_nodes * bg_conf->cpu_ratio;

			job_desc->max_nodes = 1;
#endif
		}
		tmp = NO_VAL;

		break;
	default:
		error("unknown option %d for alter_node_cnt", type);
	}

	return SLURM_SUCCESS;
#else
	return SLURM_ERROR;
#endif
}
| |
| extern int select_p_reconfigure(void) |
| { |
| #ifdef HAVE_BG |
| slurm_conf_lock(); |
| if (!slurmctld_conf.slurm_user_name |
| || strcmp(bg_conf->slurm_user_name, slurmctld_conf.slurm_user_name)) |
| error("The slurm user has changed from '%s' to '%s'. " |
| "If this is really what you " |
| "want you will need to restart slurm for this " |
| "change to be enforced in the bluegene plugin.", |
| bg_conf->slurm_user_name, slurmctld_conf.slurm_user_name); |
| if (!slurmctld_conf.node_prefix |
| || strcmp(bg_conf->slurm_node_prefix, slurmctld_conf.node_prefix)) |
| error("Node Prefix has changed from '%s' to '%s'. " |
| "If this is really what you " |
| "want you will need to restart slurm for this " |
| "change to be enforced in the bluegene plugin.", |
| bg_conf->slurm_node_prefix, slurmctld_conf.node_prefix); |
| bg_conf->slurm_debug_flags = slurmctld_conf.debug_flags; |
| bg_conf->slurm_debug_level = slurmctld_conf.slurmctld_debug; |
| set_ba_debug_flags(bg_conf->slurm_debug_flags); |
| slurm_conf_unlock(); |
| |
| return SLURM_SUCCESS; |
| #else |
| return SLURM_ERROR; |
| #endif |
| } |
| |
| extern bitstr_t *select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) |
| { |
| #ifdef HAVE_BG |
| /* Reserve a block of appropriate geometry by issuing a fake job |
| * WILL_RUN call */ |
| int i, rc; |
| uint32_t tmp_u32; |
| uint16_t conn_type[SYSTEM_DIMENSIONS]; |
| uint16_t geo[SYSTEM_DIMENSIONS]; |
| uint16_t reboot = 0; |
| uint16_t rotate = 1; |
| List preemptee_candidates, preemptee_job_list; |
| struct job_record job_rec; |
| bitstr_t *tmp_bitmap; |
| |
| memset(&job_rec, 0, sizeof(struct job_record)); |
| job_rec.details = xmalloc(sizeof(struct job_details)); |
| job_rec.select_jobinfo = select_g_select_jobinfo_alloc(); |
| |
| tmp_u32 = 1; |
| set_select_jobinfo(job_rec.select_jobinfo->data, |
| SELECT_JOBDATA_ALTERED, &tmp_u32); |
| set_select_jobinfo(job_rec.select_jobinfo->data, |
| SELECT_JOBDATA_NODE_CNT, &node_cnt); |
| for (i = 0; i < SYSTEM_DIMENSIONS; i++) { |
| conn_type[i] = SELECT_NAV; |
| geo[i] = 0; |
| } |
| select_g_select_jobinfo_set(job_rec.select_jobinfo, |
| SELECT_JOBDATA_GEOMETRY, &geo); |
| select_g_select_jobinfo_set(job_rec.select_jobinfo, |
| SELECT_JOBDATA_CONN_TYPE, &conn_type); |
| select_g_select_jobinfo_set(job_rec.select_jobinfo, |
| SELECT_JOBDATA_REBOOT, &reboot); |
| select_g_select_jobinfo_set(job_rec.select_jobinfo, |
| SELECT_JOBDATA_ROTATE, &rotate); |
| |
| job_rec.details->min_cpus = node_cnt * bg_conf->cpus_per_mp; |
| job_rec.details->max_cpus = job_rec.details->min_cpus; |
| tmp_bitmap = bit_copy(avail_bitmap); |
| |
| preemptee_candidates = list_create(NULL); |
| if (preemptee_candidates == NULL) |
| fatal("list_create: malloc failure"); |
| |
| rc = submit_job(&job_rec, tmp_bitmap, node_cnt, node_cnt, node_cnt, |
| SELECT_MODE_WILL_RUN, preemptee_candidates, |
| &preemptee_job_list); |
| |
| list_destroy(preemptee_candidates); |
| xfree(job_rec.details); |
| select_g_select_jobinfo_free(job_rec.select_jobinfo); |
| |
| if (rc == SLURM_SUCCESS) { |
| char *resv_nodes = bitmap2node_name(tmp_bitmap); |
| info("Reservation request for %u nodes satisfied with %s", |
| node_cnt, resv_nodes); |
| xfree(resv_nodes); |
| return tmp_bitmap; |
| } else { |
| info("Reservation request for %u nodes failed", node_cnt); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| #endif |
| return NULL; |
| } |
| |
/*
 * select_p_ba_init - initialize the block allocator; thin pass-through
 *	to ba_init().
 * IN node_info_ptr - node information to build the system layout from
 * IN sanity_check - forwarded to ba_init() unchanged
 */
extern void select_p_ba_init(node_info_msg_t *node_info_ptr, bool sanity_check)
{
	ba_init(node_info_ptr, sanity_check);
}
| |
/*
 * select_p_ba_fini - tear down the block allocator state set up by
 *	select_p_ba_init(); thin pass-through to ba_fini().
 */
extern void select_p_ba_fini(void)
{
	ba_fini();
}
| |
/*
 * select_p_ba_get_dims - expose the machine's dimension sizes.
 * RET pointer to the DIM_SIZE array (owned by the block allocator; do
 *	not free), or NULL when built without Blue Gene support
 */
extern int *select_p_ba_get_dims(void)
{
#ifdef HAVE_BG
	return DIM_SIZE;
#else
	return NULL;
#endif
}