/*****************************************************************************\
* xcpuinfo.c - cpuinfo related primitives
*****************************************************************************
* Copyright (C) 2009 CEA/DAM/DIF
* Portions (hwloc) copyright (C) 2012 Bull, <rod.schultz@bull.com>
* Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include <ctype.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#ifdef HAVE_SYSCTLBYNAME
#include <sys/sysctl.h>
#endif
#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/log.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmd/slurmd/get_mach_stat.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/common/read_config.h"
#ifdef HAVE_HWLOC
#include <hwloc.h>
#endif
#include "xcpuinfo.h"
#define _DEBUG 0
#define MAX_CPUSET_STR 2048
#define _MAX_SOCKET_INX 1024
#if !defined(HAVE_HWLOC)
static char* _cpuinfo_path = "/proc/cpuinfo";
static int _compute_block_map(uint16_t numproc,
uint16_t **block_map, uint16_t **block_map_inv);
static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr);
static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val);
#endif
#if HWLOC_API_VERSION > 0x00020401
/* Contains a bitmap of all the cpus of the node, but only p-cores are set. */
static hwloc_bitmap_t cpuset_tot = NULL;
#endif
static char *restricted_cpus_as_mac = NULL;
extern slurmd_conf_t *conf;
/*
* get_procs - Return the count of procs on this system
* Input: procs - buffer for the CPU count
* Output: procs - filled in with CPU count, "1" if error
* return code - 0 if no error, otherwise errno
*/
extern int
get_procs(uint16_t *procs)
{
#ifdef _SC_NPROCESSORS_ONLN
int my_proc_tally;
*procs = 1;
my_proc_tally = (int)sysconf(_SC_NPROCESSORS_ONLN);
if (my_proc_tally < 1) {
error ("get_procs: error running sysconf(_SC_NPROCESSORS_ONLN)");
return EINVAL;
}
*procs = (uint16_t) my_proc_tally;
#elif defined (HAVE_SYSCTLBYNAME)
int ncpu;
size_t len = sizeof(ncpu);
*procs = 1;
if (sysctlbyname("hw.ncpu", &ncpu, &len, NULL, 0) == -1) {
error("get_procs: error running sysctl(HW_NCPU)");
return EINVAL;
}
*procs = (uint16_t) ncpu;
#else
*procs = 1;
#endif
return 0;
}
#ifdef HAVE_HWLOC
#if _DEBUG
static void _hwloc_children(hwloc_topology_t topology, hwloc_obj_t obj,
int depth)
{
char string[128];
unsigned i;
if (!obj)
return;
hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
debug("%*s%s", 2 * depth, "", string);
for (i = 0; i < obj->arity; i++) {
_hwloc_children(topology, obj->children[i], depth + 1);
}
}
#endif
/* Return the number of cores which are descendants of the given object */
static int _core_child_count(hwloc_topology_t topology, hwloc_obj_t obj)
{
int count = 0, i;
if (obj->type == HWLOC_OBJ_CORE)
return 1;
for (i = 0; i < obj->arity; i++)
count += _core_child_count(topology, obj->children[i]);
return count;
}
/*
* This needs to run before _remove_ecores() as the call to
* hwloc_topology_restrict() will change the view.
*
* There appears to be a bug in HWLOC 2.x where the IntelCore list
* is restricted by the cgroup cpuset.
*/
static void _check_full_access(hwloc_topology_t *topology)
{
hwloc_const_bitmap_t complete, allowed;
hwloc_bitmap_t restricted_cpus_mask;
complete = hwloc_topology_get_complete_cpuset(*topology);
allowed = hwloc_topology_get_allowed_cpuset(*topology);
if (!hwloc_bitmap_isequal(complete, allowed)) {
/*
* Get the cpus that are set in the complete bitmap but not in
* the allowed one, i.e. the cpus that slurm will not be able to
* run on, a.k.a. CpuSpecList (without SlurmdOffSpec).
*/
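/*
 * Sketch (hypothetical values): on a 16-PU machine with
 * complete = 0-15 and a cgroup allowing only 0-13, the andnot
 * below leaves restricted_cpus_mask = 14-15, reported as the
 * list string "14-15".
 */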
restricted_cpus_mask = hwloc_bitmap_alloc();
hwloc_bitmap_andnot(restricted_cpus_mask, complete, allowed);
restricted_cpus_as_mac = xmalloc(MAX_CPUSET_STR);
/* And convert them into a string */
hwloc_bitmap_list_snprintf(restricted_cpus_as_mac,
MAX_CPUSET_STR,
restricted_cpus_mask);
hwloc_bitmap_free(restricted_cpus_mask);
warning("%s: subset of restricted cpus (not available for jobs): %s",
__func__, restricted_cpus_as_mac);
/* We don't need this any further */
if (!(slurm_conf.task_plugin_param & SLURMD_SPEC_OVERRIDE))
xfree(restricted_cpus_as_mac);
} else {
debug2("%s: got full access to the cpuset topology", __func__);
}
}
static void _remove_ecores(hwloc_topology_t *topology)
{
#if HWLOC_API_VERSION > 0x00020401
int type_cnt;
hwloc_bitmap_t cpuset;
char *pcore_freq = NULL;
bool found = false;
if (slurm_conf.conf_flags & CONF_FLAG_ECORE)
return;
if (!(type_cnt = hwloc_cpukinds_get_nr(*topology, 0)))
return;
/*
* Handle the removal of Intel E-Cores here.
*
* At the time of writing, Intel Gen 12+ processors have introduced what
* are known as 'P' (performance) and 'E' (efficiency) cores. The
* former can have hyperthreads, whereas the latter are only single
* threaded, thus creating a situation where we could get a
* heterogeneous socket (which Slurm doesn't like). Here we restrict
* to only "IntelCore" (P-Cores) and disregard the "IntelAtom"
* (E-Cores).
*
* In the future, if desired, we should probably figure out a way to
* handle these E-Cores through a core spec instead.
*
* This logic should do nothing on any other existing processor.
*
* One notable issue found is that, for processes launching with a
* restricted cpuset, the CPU Kind entry for the P-Cores will only
* include the P-Cores in the active cpuset, and not all of those
* available on the node. However, those unavailable cores are
* listed in another entry with an identical FrequencyMaxMHz value.
* This should be distinct from the slower E-Cores.
*/
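/*
 * Illustrative sketch (hypothetical values) for a hybrid processor
 * with 8 P-cores (2 threads each, PUs 0-15) and 8 E-cores (PUs 16-23):
 *
 *   kind 0: cpuset 16-23, CoreType=IntelAtom, FrequencyMaxMHz=3900
 *   kind 1: cpuset 0-15,  CoreType=IntelCore, FrequencyMaxMHz=5100
 *
 * The first pass below finds kind 1 via CoreType=IntelCore and records
 * its FrequencyMaxMHz; the second pass ORs in any other kind with the
 * same frequency (P-cores hidden by a restricted cpuset), and the
 * topology is then restricted to cpuset_tot = 0-15.
 */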
cpuset = hwloc_bitmap_alloc();
cpuset_tot = hwloc_bitmap_alloc();
for (int i = 0; i < type_cnt; i++) {
unsigned nr_infos = 0;
struct hwloc_info_s *infos;
if (hwloc_cpukinds_get_info(
*topology, i, cpuset, NULL, &nr_infos, &infos, 0))
fatal("Error getting info from hwloc_cpukinds_get_info() %m");
/* Look for the CPU Kinds entry with CoreType=IntelCore. */
for (int j = 0; j < nr_infos; j++) {
if (!xstrcasecmp(infos[j].name, "CoreType") &&
!xstrcasecmp(infos[j].value, "IntelCore")) {
found = true;
break;
}
}
if (!found)
continue;
/*
* Copy the cpuset over now. This avoids problems with a
* hypothetical system with the FrequencyMaxMHz not being
* listed for the P-Cores.
*/
hwloc_bitmap_or(cpuset_tot, cpuset_tot, cpuset);
/* If found, note the FrequencyMaxMHz value for these cores. */
for (int j = 0; j < nr_infos; j++) {
if (!xstrcasecmp(infos[j].name, "FrequencyMaxMHz")) {
pcore_freq = infos[j].value;
break;
}
}
break;
}
if (!found) {
hwloc_bitmap_free(cpuset);
hwloc_bitmap_free(cpuset_tot);
cpuset_tot = NULL;
return;
}
for (int i = 0; i < type_cnt; i++) {
unsigned nr_infos = 0;
struct hwloc_info_s *infos;
if (hwloc_cpukinds_get_info(
*topology, i, cpuset, NULL, &nr_infos, &infos, 0))
fatal("Error getting info from hwloc_cpukinds_get_info() %m");
/*
* Look for all CPU Kinds with a matching FrequencyMaxMHz value.
* These should all be the P-cores, including those that aren't
* in the available cpuset we are running under.
*/
for (int j = 0; j < nr_infos; j++) {
if (!xstrcasecmp(infos[j].name, "FrequencyMaxMHz") &&
!xstrcasecmp(infos[j].value, pcore_freq)) {
hwloc_bitmap_or(cpuset_tot, cpuset_tot, cpuset);
}
}
}
hwloc_topology_restrict(*topology, cpuset_tot, 0);
hwloc_bitmap_free(cpuset);
#endif
}
/* Load and filter the hwloc topology
* init and destroy of the topology must be done outside this function */
static int xcpuinfo_hwloc_topo_load(hwloc_topology_t *topology)
{
xassert(topology);
/* parse all system */
hwloc_topology_set_flags(*topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
/* ignores cache, misc */
#if HWLOC_API_VERSION < 0x00020000
hwloc_topology_ignore_type(*topology, HWLOC_OBJ_CACHE);
hwloc_topology_ignore_type(*topology, HWLOC_OBJ_MISC);
#else
hwloc_topology_set_type_filter(*topology, HWLOC_OBJ_L1CACHE,
HWLOC_TYPE_FILTER_KEEP_NONE);
hwloc_topology_set_type_filter(*topology, HWLOC_OBJ_L2CACHE,
HWLOC_TYPE_FILTER_KEEP_NONE);
/* need to preserve HWLOC_OBJ_L3CACHE for l3cache_as_socket */
hwloc_topology_set_type_filter(*topology, HWLOC_OBJ_L4CACHE,
HWLOC_TYPE_FILTER_KEEP_NONE);
hwloc_topology_set_type_filter(*topology, HWLOC_OBJ_L5CACHE,
HWLOC_TYPE_FILTER_KEEP_NONE);
hwloc_topology_set_type_filter(*topology, HWLOC_OBJ_MISC,
HWLOC_TYPE_FILTER_KEEP_NONE);
#endif
/* load topology */
debug2("hwloc_topology_load");
if (hwloc_topology_load(*topology)) {
/* error loading hardware topology */
debug("hwloc_topology_load() failed.");
return SLURM_ERROR;
}
_check_full_access(topology);
_remove_ecores(topology);
return SLURM_SUCCESS;
}
/*
* xcpuinfo_hwloc_topo_get - Return detailed cpuinfo on the whole system
* Output: p_cpus - number of processors on the system
* p_boards - number of baseboards (containing sockets)
* p_sockets - number of physical processor sockets
* p_cores - total number of physical CPU cores
* p_threads - total number of hardware execution threads
* block_map - abstract->physical block distribution map
* block_map_inv - physical->abstract block distribution map (inverse)
* return code - 0 if no error, otherwise errno
* NOTE: User must xfree block_map and block_map_inv
*/
extern int xcpuinfo_hwloc_topo_get(
uint16_t *p_cpus, uint16_t *p_boards,
uint16_t *p_sockets, uint16_t *p_cores, uint16_t *p_threads,
uint16_t *p_block_map_size,
uint16_t **p_block_map, uint16_t **p_block_map_inv)
{
enum { SOCKET=0, CORE=1, PU=2, LAST_OBJ=3 };
hwloc_topology_t topology;
hwloc_obj_t obj;
hwloc_obj_type_t objtype[LAST_OBJ];
unsigned idx[LAST_OBJ];
int nobj[LAST_OBJ];
bitstr_t *used_socket = NULL;
int *cores_per_socket;
int actual_cpus;
int macid;
int absid;
int actual_boards = 1, depth, sock_cnt, tot_socks = 0;
int i, used_core_idx, used_sock_idx;
debug2("hwloc_topology_init");
if (hwloc_topology_init(&topology)) {
/* error initializing hwloc library */
debug("hwloc_topology_init() failed.");
return 1;
}
if (xcpuinfo_hwloc_topo_load(&topology) != SLURM_SUCCESS) {
hwloc_topology_destroy(topology);
return 2;
}
#if _DEBUG
_hwloc_children(topology, hwloc_get_root_obj(topology), 0);
#endif
/*
* Some processors (e.g. AMD Opteron 6000 series) contain multiple
* NUMA nodes per socket. This is a configuration which does not map
* into the hardware entities that Slurm optimizes resource allocation
* for (PU/thread, core, socket, baseboard, node and network switch).
* In order to optimize resource allocations on such hardware, Slurm
* will consider each NUMA node within the socket as a separate socket.
* You can disable this by configuring "SchedulerParameters=Ignore_NUMA",
* in which case Slurm will report the correct socket count on the node,
* but not be able to optimize resource allocations on the NUMA nodes.
*/
objtype[SOCKET] = HWLOC_OBJ_SOCKET;
objtype[CORE] = HWLOC_OBJ_CORE;
objtype[PU] = HWLOC_OBJ_PU;
#if HWLOC_API_VERSION >= 0x00020000
if (xstrcasestr(slurm_conf.sched_params, "Ignore_NUMA")) {
info("SchedulerParameters=Ignore_NUMA not supported by hwloc v2");
}
#else
if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
if (xstrcasestr(slurm_conf.sched_params, "Ignore_NUMA")) {
info("Ignoring NUMA nodes within a socket");
} else {
info("Considering each NUMA node as a socket");
objtype[SOCKET] = HWLOC_OBJ_NODE;
}
}
#endif
if (slurm_conf.conf_flags & CONF_FLAG_L3CSOCK) {
#if HWLOC_API_VERSION >= 0x00020000
objtype[SOCKET] = HWLOC_OBJ_L3CACHE;
#else
error("SlurmdParameters=l3cache_as_socket requires hwloc v2");
#endif
} else if (slurm_conf.conf_flags & CONF_FLAG_NNSOCK) {
#if HWLOC_API_VERSION >= 0x00020000
hwloc_obj_t numa_obj = hwloc_get_next_obj_by_type(
topology, HWLOC_OBJ_NODE, NULL);
if (numa_obj && numa_obj->parent) {
objtype[SOCKET] = numa_obj->parent->type;
if (get_log_level() >= LOG_LEVEL_DEBUG2) {
char tmp[128];
hwloc_obj_type_snprintf(tmp, sizeof(tmp),
numa_obj->parent, 0);
debug2("%s: numa_node_as_socket mapped to '%s'",
__func__, tmp);
}
}
#else
error("SlurmdParameters=numa_node_as_socket requires hwloc v2");
#endif
}
/* Groups below root obj are interpreted as boards */
obj = hwloc_get_root_obj(topology);
obj = hwloc_get_next_child(topology, obj, NULL);
if (!hwloc_compare_types(HWLOC_OBJ_GROUP, obj->type))
actual_boards =
MAX(hwloc_get_nbobjs_by_depth(topology, obj->depth), 1);
/*
* Count sockets/NUMA nodes containing any cores.
* KNL NUMA nodes with no cores are NOT counted.
*/
nobj[SOCKET] = 0;
depth = hwloc_get_type_depth(topology, objtype[SOCKET]);
used_socket = bit_alloc(_MAX_SOCKET_INX);
cores_per_socket = xcalloc(_MAX_SOCKET_INX, sizeof(int));
sock_cnt = hwloc_get_nbobjs_by_depth(topology, depth);
for (i = 0; i < sock_cnt; i++) {
obj = hwloc_get_obj_by_depth(topology, depth, i);
if (obj->type == objtype[SOCKET]) {
cores_per_socket[i] = _core_child_count(topology, obj);
if (cores_per_socket[i] > 0) {
nobj[SOCKET]++;
bit_set(used_socket, tot_socks);
}
if (++tot_socks >= _MAX_SOCKET_INX) { /* Bitmap size */
fatal("Socket count exceeds %d, expand data structure size",
_MAX_SOCKET_INX);
break;
}
}
}
nobj[CORE] = hwloc_get_nbobjs_by_type(topology, objtype[CORE]);
/*
* Workaround for an hwloc bug: in some cases the topology "children"
* array does not get populated, so _core_child_count() always returns 0.
*/
if (nobj[SOCKET] == 0) {
nobj[SOCKET] = hwloc_get_nbobjs_by_type(topology,
objtype[SOCKET]);
if (nobj[SOCKET] == 0) {
debug("%s: fudging nobj[SOCKET] from 0 to 1", __func__);
nobj[SOCKET] = 1;
}
if (nobj[SOCKET] >= _MAX_SOCKET_INX) { /* Bitmap size */
fatal("Socket count exceeds %d, expand data structure size",
_MAX_SOCKET_INX);
}
bit_nset(used_socket, 0, nobj[SOCKET] - 1);
}
/*
* Workaround for hwloc
* hwloc_get_nbobjs_by_type() returns 0 on some architectures.
*/
if ( nobj[CORE] == 0 ) {
debug("%s: fudging nobj[CORE] from 0 to 1", __func__);
nobj[CORE] = 1;
}
if ( nobj[SOCKET] == -1 )
fatal("%s: can not handle nobj[SOCKET] = -1", __func__);
if ( nobj[CORE] == -1 )
fatal("%s: can not handle nobj[CORE] = -1", __func__);
actual_cpus = hwloc_get_nbobjs_by_type(topology, objtype[PU]);
#if 0
/* Used to find workaround above */
info("CORE = %d SOCKET = %d actual_cpus = %d nobj[CORE] = %d",
CORE, SOCKET, actual_cpus, nobj[CORE]);
#endif
if ((actual_cpus % nobj[CORE]) != 0) {
error("Thread count (%d) not multiple of core count (%d)",
actual_cpus, nobj[CORE]);
}
nobj[PU] = actual_cpus / nobj[CORE]; /* threads per core */
if ((nobj[CORE] % nobj[SOCKET]) != 0) {
error("Core count (%d) not multiple of socket count (%d)",
nobj[CORE], nobj[SOCKET]);
}
nobj[CORE] /= nobj[SOCKET]; /* cores per socket */
debug("CPUs:%d Boards:%d Sockets:%d CoresPerSocket:%d ThreadsPerCore:%d",
actual_cpus, actual_boards, nobj[SOCKET], nobj[CORE], nobj[PU]);
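/*
 * Example (hypothetical node): 128 PUs spread over 2 sockets of 32
 * cores each give nobj[SOCKET] = 2 and a raw core total of
 * nobj[CORE] = 64, so the divisions above yield
 * nobj[PU] = 128 / 64 = 2 threads per core and
 * nobj[CORE] = 64 / 2 = 32 cores per socket.
 */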
/* allocate block_map */
if (p_block_map_size)
*p_block_map_size = (uint16_t)actual_cpus;
if (p_block_map && p_block_map_inv) {
*p_block_map = xcalloc(actual_cpus, sizeof(uint16_t));
*p_block_map_inv = xcalloc(actual_cpus, sizeof(uint16_t));
/* initialize default as linear mapping */
for (i = 0; i < actual_cpus; i++) {
(*p_block_map)[i] = i;
(*p_block_map_inv)[i] = i;
}
/* create map with hwloc */
used_sock_idx = -1;
used_core_idx = -1;
for (idx[SOCKET] = 0; (used_sock_idx + 1) < nobj[SOCKET];
idx[SOCKET]++) {
if (!bit_test(used_socket, idx[SOCKET]))
continue;
used_sock_idx++;
for (idx[CORE] = 0;
idx[CORE] < cores_per_socket[idx[SOCKET]];
idx[CORE]++) {
used_core_idx++;
for (idx[PU]=0; idx[PU]<nobj[PU]; ++idx[PU]) {
/* get hwloc_obj by indexes */
obj=hwloc_get_obj_below_array_by_type(
topology, 3, objtype, idx);
if (!obj)
continue;
macid = obj->os_index;
absid = used_core_idx * nobj[PU] + idx[PU];
if ((macid >= actual_cpus) ||
(absid >= actual_cpus)) {
/* physical or logical ID are
* out of range */
continue;
}
debug4("CPU map[%d]=>%d S:C:T %d:%d:%d", absid, macid,
used_sock_idx, idx[CORE], idx[PU]);
(*p_block_map)[absid] = macid;
(*p_block_map_inv)[macid] = absid;
}
}
}
}
FREE_NULL_BITMAP(used_socket);
xfree(cores_per_socket);
hwloc_topology_destroy(topology);
/* update output parameters */
*p_cpus = actual_cpus;
*p_boards = actual_boards;
*p_sockets = nobj[SOCKET];
*p_cores = nobj[CORE];
*p_threads = nobj[PU];
#if _DEBUG
/*** Display raw data ***/
debug("CPUs:%u Boards:%u Sockets:%u CoresPerSocket:%u ThreadsPerCore:%u",
*p_cpus, *p_boards, *p_sockets, *p_cores, *p_threads);
/* Display the mapping tables */
if (p_block_map && p_block_map_inv) {
debug("------");
debug("Abstract -> Machine logical CPU ID block mapping:");
debug("AbstractId PhysicalId Inverse");
for (i = 0; i < *p_cpus; i++) {
debug3(" %4d %4u %4u",
i, (*p_block_map)[i], (*p_block_map_inv)[i]);
}
debug("------");
}
#endif
return SLURM_SUCCESS;
}
#else
typedef struct cpuinfo {
uint16_t seen;
uint32_t cpuid;
uint32_t physid;
uint16_t physcnt;
uint32_t coreid;
uint16_t corecnt;
uint16_t siblings;
uint16_t cores;
} cpuinfo_t;
static cpuinfo_t *cpuinfo = NULL; /* array of CPU information for get_cpuinfo */
/* Note: file static for qsort/_compare_cpus */
extern int xcpuinfo_hwloc_topo_get(
uint16_t *p_cpus, uint16_t *p_boards,
uint16_t *p_sockets, uint16_t *p_cores, uint16_t *p_threads,
uint16_t *p_block_map_size,
uint16_t **p_block_map, uint16_t **p_block_map_inv)
{
int retval;
uint16_t numproc;
uint16_t numcpu = 0; /* number of cpus seen */
uint16_t numphys = 0; /* number of unique "physical id"s */
uint16_t numcores = 0; /* number of unique "cores id"s */
uint16_t maxsibs = 0; /* maximum value of "siblings" */
uint16_t maxcores = 0; /* maximum value of "cores" */
uint16_t minsibs = 0xffff; /* minimum value of "siblings" */
uint16_t mincores = 0xffff; /* minimum value of "cores" */
uint32_t maxcpuid = 0; /* maximum CPU ID ("processor") */
uint32_t maxphysid = 0; /* maximum "physical id" */
uint32_t maxcoreid = 0; /* maximum "core id" */
uint32_t mincpuid = 0xffffffff;/* minimum CPU ID ("processor") */
uint32_t minphysid = 0xffffffff;/* minimum "physical id" */
uint32_t mincoreid = 0xffffffff;/* minimum "core id" */
int i;
FILE *cpu_info_file;
char buffer[128];
uint16_t curcpu, sockets, cores, threads;
get_procs(&numproc);
*p_cpus = numproc;
*p_boards = 1; /* Boards not identified from /proc/cpuinfo */
*p_sockets = numproc; /* initially all single core/thread */
*p_cores = 1;
*p_threads = 1;
*p_block_map_size = 0;
*p_block_map = NULL;
*p_block_map_inv = NULL;
cpu_info_file = fopen(_cpuinfo_path, "r");
if (cpu_info_file == NULL) {
error ("%s: error %d opening %s",
__func__, errno, _cpuinfo_path);
return errno;
}
/* Note: assumes all processor IDs are within [0:numproc-1] */
/* treats physical/core IDs as tokens, not indices */
if (cpuinfo)
memset(cpuinfo, 0, numproc * sizeof(cpuinfo_t));
else
cpuinfo = xcalloc(numproc, sizeof(cpuinfo_t));
curcpu = 0;
while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) {
uint32_t val;
if (_chk_cpuinfo_uint32(buffer, "processor", &val)) {
curcpu = numcpu;
numcpu++;
if (curcpu >= numproc) {
info("processor limit reached (%u >= %d)",
curcpu, numproc);
continue;
}
cpuinfo[curcpu].seen = 1;
cpuinfo[curcpu].cpuid = val;
maxcpuid = MAX(maxcpuid, val);
mincpuid = MIN(mincpuid, val);
} else if (_chk_cpuinfo_uint32(buffer, "physical id", &val)) {
/* see if the ID has already been seen */
for (i=0; i<numproc; i++) {
if ((cpuinfo[i].physid == val)
&& (cpuinfo[i].physcnt))
break;
}
if (i == numproc) { /* new ID... */
numphys++; /* ...increment total */
} else { /* existing ID... */
cpuinfo[i].physcnt++; /* ...update ID cnt */
}
if (curcpu < numproc) {
cpuinfo[curcpu].physcnt++;
cpuinfo[curcpu].physid = val;
}
maxphysid = MAX(maxphysid, val);
minphysid = MIN(minphysid, val);
} else if (_chk_cpuinfo_uint32(buffer, "core id", &val)) {
/* see if the ID has already been seen */
for (i = 0; i < numproc; i++) {
if ((cpuinfo[i].coreid == val)
&& (cpuinfo[i].corecnt))
break;
}
if (i == numproc) { /* new ID... */
numcores++; /* ...increment total */
} else { /* existing ID... */
cpuinfo[i].corecnt++; /* ...update ID cnt */
}
if (curcpu < numproc) {
cpuinfo[curcpu].corecnt++;
cpuinfo[curcpu].coreid = val;
}
maxcoreid = MAX(maxcoreid, val);
mincoreid = MIN(mincoreid, val);
} else if (_chk_cpuinfo_uint32(buffer, "siblings", &val)) {
/* Note: this value is a count, not an index */
if (val > numproc) { /* out of bounds, ignore */
debug("siblings is %u (> %d), ignored",
val, numproc);
continue;
}
if (curcpu < numproc)
cpuinfo[curcpu].siblings = val;
maxsibs = MAX(maxsibs, val);
minsibs = MIN(minsibs, val);
} else if (_chk_cpuinfo_uint32(buffer, "cpu cores", &val)) {
/* Note: this value is a count, not an index */
if (val > numproc) { /* out of bounds, ignore */
debug("cores is %u (> %d), ignored",
val, numproc);
continue;
}
if (curcpu < numproc)
cpuinfo[curcpu].cores = val;
maxcores = MAX(maxcores, val);
mincores = MIN(mincores, val);
}
}
fclose(cpu_info_file);
/*** Sanity check ***/
if (minsibs == 0) minsibs = 1; /* guarantee non-zero */
if (maxsibs == 0) {
minsibs = 1;
maxsibs = 1;
}
if (maxcores == 0) { /* no core data */
mincores = 0;
maxcores = 0;
}
/*** Compute Sockets/Cores/Threads ***/
if ((minsibs == maxsibs) && /* homogeneous system */
(mincores == maxcores)) {
sockets = numphys; /* unique "physical id" */
if (sockets <= 1) { /* verify single socket */
sockets = numcpu / maxsibs; /* maximum "siblings" */
}
if (sockets == 0)
sockets = 1; /* guarantee non-zero */
cores = numcores / sockets; /* unique "core id" */
cores = MAX(maxcores, cores); /* maximum "cpu cores" */
if (cores == 0) {
cores = numcpu / sockets; /* assume multi-core */
if (cores > 1) {
debug3("cpuinfo missing 'core id' or 'cpu cores' but assuming multi-core");
}
}
if (cores == 0)
cores = 1; /* guarantee non-zero */
threads = numcpu / (sockets * cores); /* solve for threads */
if (threads == 0)
threads = 1; /* guarantee non-zero */
} else { /* heterogeneous system */
sockets = numcpu;
cores = 1; /* one core per socket */
threads = 1; /* one thread per core */
}
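/*
 * Worked example (hypothetical homogeneous node): 8 logical CPUs
 * reporting "physical id" values {0,1} (numphys = 2), "core id"
 * values {0,1} (numcores = 2), "siblings" = 4 and "cpu cores" = 2.
 * Then sockets = numphys = 2,
 * cores = MAX(maxcores, numcores / sockets) = MAX(2, 1) = 2, and
 * threads = numcpu / (sockets * cores) = 8 / 4 = 2.
 */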
*p_sockets = sockets; /* update output parameters */
*p_cores = cores;
*p_threads = threads;
#if _DEBUG
/*** Display raw data ***/
debug3("numcpu: %u", numcpu);
debug3("numphys: %u", numphys);
debug3("numcores: %u", numcores);
debug3("cores: %u->%u", mincores, maxcores);
debug3("sibs: %u->%u", minsibs, maxsibs);
debug3("cpuid: %u->%u", mincpuid, maxcpuid);
debug3("physid: %u->%u", minphysid, maxphysid);
debug3("coreid: %u->%u", mincoreid, maxcoreid);
for (i = 0; i < numproc; i++) {
debug3("CPU %d:", i);
debug3(" cpuid: %u", cpuinfo[i].cpuid);
debug3(" seen: %u", cpuinfo[i].seen);
debug3(" physid: %u", cpuinfo[i].physid);
debug3(" physcnt: %u", cpuinfo[i].physcnt);
debug3(" siblings: %u", cpuinfo[i].siblings);
debug3(" cores: %u", cpuinfo[i].cores);
debug3(" coreid: %u", cpuinfo[i].coreid);
debug3(" corecnt: %u\n", cpuinfo[i].corecnt);
}
debug3("Sockets: %u", sockets);
debug3("Cores per socket: %u", cores);
debug3("Threads per core: %u", threads);
#endif
*p_block_map_size = numcpu;
retval = _compute_block_map(*p_block_map_size, p_block_map,
p_block_map_inv);
xfree(cpuinfo); /* done with raw cpuinfo data */
return retval;
}
/* _chk_cpuinfo_str
* check a line of cpuinfo data (buffer) for a keyword. If it
* exists, return the string value for that keyword in *valptr.
* Input: buffer - single line of cpuinfo data
* keyword - keyword to check for
* Output: valptr - string value corresponding to keyword
* return code - true if keyword found, false if not found
*/
static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr)
{
char *ptr;
if (xstrncmp(buffer, keyword, strlen(keyword)))
return false;
ptr = strstr(buffer, ":");
if (ptr != NULL)
ptr++;
*valptr = ptr;
return true;
}
/* _chk_cpuinfo_uint32
* check a line of cpuinfo data (buffer) for a keyword. If it
* exists, return the uint32 value for that keyword in *val.
* Input: buffer - single line of cpuinfo data
* keyword - keyword to check for
* Output: val - uint32 value corresponding to keyword
* return code - true if keyword found, false if not found
*/
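/*
 * For example, the /proc/cpuinfo line "processor\t: 12" checked with
 * keyword "processor" stores 12 in *val and returns true.
 */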
static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val)
{
char *valptr;
if (_chk_cpuinfo_str(buffer, keyword, &valptr)) {
*val = strtoul(valptr, (char **)NULL, 10);
return true;
} else {
return false;
}
}
/*
* _compute_block_map - Compute abstract->machine block mapping (and inverse)
* allows computation of CPU ID masks for an abstract block distribution
* of logical processors which can then be mapped to the IDs used in the
* actual machine processor ID ordering (which can be BIOS/OS dependent)
* Input: numproc - number of processors on the system
* cpu - array of cpuinfo (file static for qsort/_compare_cpus)
* Output: block_map, block_map_inv - abstract->physical block distribution map
* return code - 0 if no error, otherwise errno
* NOTE: User must xfree block_map and block_map_inv
*
* For example, given a system with 8 logical processors arranged as:
*
* Sockets: 4
* Cores per socket: 2
* Threads per core: 1
*
* and a logical CPU ID assignment of:
*
* Machine logical CPU ID assignment:
* Logical CPU ID: 0 1 2 3 4 5 6 7
* Physical Socket ID: 0 1 3 2 0 1 3 2
*
* The block_map would be:
*
* Abstract -> Machine logical CPU ID block mapping:
* Input: (Abstract ID) 0 1 2 3 4 5 6 7
* Output: (Machine ID) 0 4 1 5 3 7 2 6 <--- block_map[]
* Physical Socket ID: 0 0 1 1 2 2 3 3
*
* and its inverse would be:
*
* Machine -> Abstract logical CPU ID block mapping: (inverse)
* Input: (Machine ID) 0 1 2 3 4 5 6 7
* Output: (Abstract ID) 0 2 6 4 1 3 7 5 <--- block_map_inv[]
* Physical Socket ID: 0 1 3 2 0 1 3 2
*/
/* physical cpu comparison with void * arguments to allow use with
* libc qsort()
*/
static int _icmp16(uint16_t a, uint16_t b)
{
if (a < b) {
return -1;
} else if (a == b) {
return 0;
} else {
return 1;
}
}
static int _icmp32(uint32_t a, uint32_t b)
{
if (a < b) {
return -1;
} else if (a == b) {
return 0;
} else {
return 1;
}
}
static int _compare_cpus(const void *a1, const void *b1) {
uint16_t *a = (uint16_t *) a1;
uint16_t *b = (uint16_t *) b1;
int cmp;
cmp = -1 * _icmp16(cpuinfo[*a].seen,cpuinfo[*b].seen); /* seen to front */
if (cmp != 0)
return cmp;
cmp = _icmp32(cpuinfo[*a].physid, cpuinfo[*b].physid); /* key 1: physid */
if (cmp != 0)
return cmp;
cmp = _icmp32(cpuinfo[*a].coreid, cpuinfo[*b].coreid); /* key 2: coreid */
if (cmp != 0)
return cmp;
cmp = _icmp32(cpuinfo[*a].cpuid, cpuinfo[*b].cpuid); /* key 3: cpu id */
return cmp;
}
static int _compute_block_map(uint16_t numproc,
uint16_t **block_map, uint16_t **block_map_inv)
{
uint16_t i;
/* Compute abstract->machine block mapping (and inverse) */
if (block_map) {
*block_map = xcalloc(numproc, sizeof(uint16_t));
for (i = 0; i < numproc; i++) {
(*block_map)[i] = i;
}
qsort(*block_map, numproc, sizeof(uint16_t), &_compare_cpus);
}
if (block_map && block_map_inv) {
*block_map_inv = xcalloc(numproc, sizeof(uint16_t));
for (i = 0; i < numproc; i++) {
uint16_t idx = (*block_map)[i];
(*block_map_inv)[idx] = i;
}
}
#if _DEBUG
/* Display the mapping tables */
debug3("\nMachine logical CPU ID assignment:");
debug3("Logical CPU ID: ");
for (i = 0; i < numproc; i++) {
debug3("%3d", i);
}
debug3("");
debug3("Physical Socket ID: ");
for (i = 0; i < numproc; i++) {
debug3("%3u", cpuinfo[i].physid);
}
debug3("");
if (block_map) {
debug3("\nAbstract -> Machine logical CPU ID block mapping:");
debug3("Input: (Abstract ID) ");
for (i = 0; i < numproc; i++) {
debug3("%3d", i);
}
debug3("");
debug3("Output: (Machine ID) ");
for (i = 0; i < numproc; i++) {
debug3("%3u", (*block_map)[i]);
}
debug3("");
debug3("Physical Socket ID: ");
for (i = 0; i < numproc; i++) {
uint16_t id = (*block_map)[i];
debug3("%3u", cpuinfo[id].physid);
}
debug3("");
}
if (block_map_inv) {
debug3("\nMachine -> Abstract logical CPU ID block mapping: "
"(inverse)");
debug3("Input: (Machine ID) ");
for (i = 0; i < numproc; i++) {
debug3("%3d", i);
}
debug3("");
debug3("Output: (Abstract ID)");
for (i = 0; i < numproc; i++) {
debug3("%3u", (*block_map_inv)[i]);
}
debug3("");
debug3("Physical Socket ID: ");
for (i = 0; i < numproc; i++) {
debug3("%3u", cpuinfo[i].physid);
}
debug3("");
}
#endif
return 0;
}
#endif
/*
* Convert an abstract core range string into a machine-specific CPU range
* string. Abstract id to machine id conversion is done using block_map.
* When a core is set in the input, all its threads will be set in the output.
*
* Inverse of xcpuinfo_mac_to_abs.
*
* Input: lrange - abstract/logical core spec string.
* Output: prange - machine/local/physical CPU spec string.
* return code - SLURM_SUCCESS if no error, otherwise SLURM_ERROR.
*/
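/*
 * Worked example, reusing the hypothetical 8-CPU layout from the
 * _compute_block_map comment above (4 sockets x 2 cores x 1 thread,
 * block_map[] = 0 4 1 5 3 7 2 6): lrange "0-1" selects abstract
 * cores 0 and 1, which map through block_map[0] = 0 and
 * block_map[1] = 4, so prange becomes "0,4".
 */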
int xcpuinfo_abs_to_mac(char *lrange, char **prange)
{
static int total_cores = -1, total_cpus = -1;
bitstr_t* absmap = NULL;
bitstr_t* macmap = NULL;
int icore, ithread;
int absid, macid;
int rc = SLURM_SUCCESS;
if (total_cores == -1) {
total_cores = conf->sockets * conf->cores;
total_cpus = conf->block_map_size;
}
/* allocate bitmap */
absmap = bit_alloc(total_cores);
macmap = bit_alloc(total_cpus);
if (!absmap || !macmap) {
rc = SLURM_ERROR;
goto end_it;
}
/* string to bitmap conversion */
if (bit_unfmt(absmap, lrange)) {
rc = SLURM_ERROR;
goto end_it;
}
/* mapping abstract id to machine id using conf->block_map */
for (icore = 0; icore < total_cores; icore++) {
if (bit_test(absmap, icore)) {
for (ithread = 0; ithread < conf->threads; ithread++) {
/*
* Use actual hardware thread count to get the
* correct offset for the CPU ID.
*/
absid = icore * conf->actual_threads + ithread;
absid %= total_cpus;
macid = conf->block_map[absid];
macid %= total_cpus;
bit_set(macmap, macid);
}
}
}
/* convert machine cpu bitmap to range string */
*prange = xmalloc(total_cpus * 6);
bit_fmt(*prange, total_cpus*6, macmap);
/* free unused bitmaps */
end_it:
FREE_NULL_BITMAP(absmap);
FREE_NULL_BITMAP(macmap);
if (rc != SLURM_SUCCESS)
error("%s: failed", __func__);
return rc;
}
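/*
 * Reduce a machine cpuset range string to only the p-cores recorded
 * in cpuset_tot by _remove_ecores(). Sketch (hypothetical values):
 * with cpuset_tot = 0-15 on a node whose PUs span 0-23, an
 * orig_range of "12-23" is reduced to "12-15". Returns NULL when
 * e-cores are kept or no e-core data was found; the caller must
 * xfree() the result.
 */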
static char *_remove_ecores_range(const char *orig_range)
{
char *pcores_range = NULL;
#if HWLOC_API_VERSION > 0x00020401
hwloc_bitmap_t r = NULL, rout = NULL;
if (slurm_conf.conf_flags & CONF_FLAG_ECORE)
return NULL;
/*
* This comes from _remove_ecores() and contains a bitmap of performance
* cores.
*/
if (!cpuset_tot)
return NULL;
r = hwloc_bitmap_alloc();
if (hwloc_bitmap_list_sscanf(r, orig_range)) {
error("Cannot convert cpuset range %s into a hwloc bitmap",
orig_range);
goto end_it;
}
rout = hwloc_bitmap_alloc();
hwloc_bitmap_and(rout, r, cpuset_tot);
pcores_range = xmalloc(MAX_CPUSET_STR);
hwloc_bitmap_list_snprintf(pcores_range, MAX_CPUSET_STR, rout);
debug2("Reduced original range from %s to %s to only include p-cores",
orig_range, pcores_range);
end_it:
hwloc_bitmap_free(r);
hwloc_bitmap_free(rout);
/* We do not need the cpuset_tot anymore */
hwloc_bitmap_free(cpuset_tot);
cpuset_tot = NULL;
#endif
return pcores_range;
}
/*
* Convert a machine-specific CPU range string into an abstract core range
* string. Machine id to abstract id conversion is done using block_map_inv.
* When a single thread in a core is set in the input, the corresponding core
* will be set in its output.
*
* Inverse of xcpuinfo_abs_to_mac.
*
* Input: in_range - machine/local/physical CPU range string.
* Output: out_range - abstract/logical core range string. Caller should xfree()
* return code - SLURM_SUCCESS if no error, otherwise SLURM_ERROR.
*/
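/*
 * Worked example, the inverse of the xcpuinfo_abs_to_mac one above:
 * with block_map_inv[] = 0 2 6 4 1 3 7 5 (see the _compute_block_map
 * comment), in_range "0,4" maps through block_map_inv[0] = 0 and
 * block_map_inv[4] = 1, so out_range becomes "0-1".
 */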
int xcpuinfo_mac_to_abs(char *in_range, char **out_range)
{
static int total_cores = -1, total_cpus = -1;
bitstr_t *macmap = NULL;
bitstr_t *absmap = NULL;
bitstr_t *absmap_core = NULL;
int rc = SLURM_SUCCESS;
if (total_cores == -1) {
total_cores = conf->sockets * conf->cores;
total_cpus = conf->block_map_size;
}
/* allocate bitmaps */
macmap = bit_alloc(total_cpus);
absmap = bit_alloc(total_cpus);
absmap_core = bit_alloc(total_cores);
if (!macmap || !absmap) {
rc = SLURM_ERROR;
goto end_it;
}
/* string to bitmap conversion */
if (bit_unfmt(macmap, in_range)) {
rc = SLURM_ERROR;
goto end_it;
}
/* mapping machine id to abstract id using conf->block_map_inv */
for (int icore = 0; icore < total_cores; icore++) {
for (int ithread = 0; ithread < conf->threads; ithread++) {
int absid, macid;
/*
* Use actual hardware thread count to get the
* correct offset for the CPU ID.
*/
macid = (icore * conf->actual_threads) + ithread;
macid %= total_cpus;
/* Skip this machine CPU id if not in in_range */
if (!bit_test(macmap, macid))
continue;
absid = conf->block_map_inv[macid];
absid %= total_cpus;
bit_set(absmap, absid);
}
}
/* condense abstract CPU bitmap into an abstract core bitmap */
for (int icore = 0; icore < total_cores; icore++) {
for (int ithread = 0; ithread < conf->threads; ithread++) {
/*
* Use actual hardware thread count to get the
* correct offset for the CPU ID.
*/
int icpu = (icore * conf->actual_threads) + ithread;
icpu %= total_cpus;
if (bit_test(absmap, icpu)) {
bit_set(absmap_core, icore);
break;
}
}
}
/* convert abstract core bitmap to range string */
*out_range = xmalloc(total_cores * 6);
bit_fmt(*out_range, total_cores * 6, absmap_core);
/* free unused bitmaps */
end_it:
FREE_NULL_BITMAP(macmap);
FREE_NULL_BITMAP(absmap);
FREE_NULL_BITMAP(absmap_core);
if (rc != SLURM_SUCCESS)
error("%s failed", __func__);
return rc;
}
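/*
 * Build the abstract CPU range string for CpuSpecList from the
 * machine-id restricted CPU string recorded by _check_full_access(),
 * expanding every restricted core to all of its threads. Sketch
 * (hypothetical values): with conf->cores = 4, conf->threads = 2 and
 * restricted abstract core "3", the returned string is "6-7".
 * Returns NULL if nothing was restricted; the caller must xfree()
 * the result.
 */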
extern char *xcpuinfo_get_cpuspec(void)
{
char *res_abs_cores = NULL;
bitstr_t *res_core_bitmap = NULL;
bitstr_t *res_cpu_bitmap = NULL;
char *restricted_cpus_as_abs = NULL;
char *pcores_range = NULL;
if (!restricted_cpus_as_mac)
return NULL;
/* We need to remove the e-cores to compute the cpuspec list */
pcores_range = _remove_ecores_range(restricted_cpus_as_mac);
if (pcores_range) {
xcpuinfo_mac_to_abs(pcores_range, &restricted_cpus_as_abs);
xfree(pcores_range);
} else {
xcpuinfo_mac_to_abs(restricted_cpus_as_mac,
&restricted_cpus_as_abs);
}
debug2("%s: restricted cpus as machine: %s",
__func__, restricted_cpus_as_mac);
debug2("%s: restricted cpus as abstract: %s",
__func__, restricted_cpus_as_abs);
if (!restricted_cpus_as_abs || !restricted_cpus_as_abs[0])
goto empty_end;
res_core_bitmap = bit_alloc(MAX_CPU_CNT);
res_cpu_bitmap = bit_alloc(MAX_CPU_CNT);
bit_unfmt(res_core_bitmap, restricted_cpus_as_abs);
for (int core_off = 0; core_off < conf->cores; core_off++) {
if (!bit_test(res_core_bitmap, core_off))
continue;
for (int thread_off = 0; thread_off < conf->threads;
thread_off++) {
int thread_inx =
(core_off * (int) conf->threads) + thread_off;
bit_set(res_cpu_bitmap, thread_inx);
}
}
res_abs_cores = xmalloc(MAX_CPU_CNT);
bit_fmt(res_abs_cores, MAX_CPU_CNT, res_cpu_bitmap);
FREE_NULL_BITMAP(res_core_bitmap);
FREE_NULL_BITMAP(res_cpu_bitmap);
empty_end:
xfree(restricted_cpus_as_abs);
return res_abs_cores;
}