| /* Simple NUMA library. |
| Copyright (C) 2003,2004,2005,2008 Andi Kleen, SuSE Labs and |
| Cliff Wickman, SGI. |
| |
| libnuma is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; version |
| 2.1. |
| |
| libnuma is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should find a copy of v2.1 of the GNU Lesser General Public License |
| somewhere on your Linux system; if not, write to the Free Software |
| Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| |
| All calls are undefined when numa_available returns an error. */ |
| #define _GNU_SOURCE 1 |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <sched.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <stdarg.h> |
| #include <ctype.h> |
| |
| #include <sys/mman.h> |
| #include <limits.h> |
| |
| #ifdef MEMORY_SANITIZER |
| #include <sanitizer/msan_interface.h> |
| #endif |
| |
| #include "config.h" |
| #include "numa.h" |
| #include "numaif.h" |
| #include "numaint.h" |
| #include "util.h" |
| #include "affinity.h" |
| |
| #define WEAK __attribute__((weak)) |
| |
| #define CPU_BUFFER_SIZE 4096 /* This limits you to 32768 CPUs */ |
| |
| /* these are the old (version 1) masks */ |
| nodemask_t numa_no_nodes; |
| nodemask_t numa_all_nodes; |
| /* these are now the default bitmask (pointers to) (version 2) */ |
| struct bitmask *numa_no_nodes_ptr = NULL; |
| struct bitmask *numa_all_nodes_ptr = NULL; |
| struct bitmask *numa_possible_nodes_ptr = NULL; |
| struct bitmask *numa_all_cpus_ptr = NULL; |
| struct bitmask *numa_possible_cpus_ptr = NULL; |
| /* I would prefer to use symbol versioning to create v1 and v2 versions |
| of numa_no_nodes and numa_all_nodes, but the loader does not correctly |
| handle versioning of BSS versus small data items */ |
| |
| struct bitmask *numa_nodes_ptr = NULL; |
| static struct bitmask *numa_memnode_ptr = NULL; |
| static unsigned long *node_cpu_mask_v1[NUMA_NUM_NODES]; |
| static struct bitmask **node_cpu_mask_v2; |
| |
| WEAK void numa_error(const char *where); |
| |
| #ifndef TLS |
| #warning "not threadsafe" |
| #define __thread |
| #endif |
| |
| static __thread int bind_policy = MPOL_BIND; |
| static __thread unsigned int mbind_flags = 0; |
| static int sizes_set=0; |
| static int maxconfigurednode = -1; |
| static int maxconfiguredcpu = -1; |
| static int numprocnode = -1; |
| static int numproccpu = -1; |
| static int nodemask_sz = 0; |
| static int cpumask_sz = 0; |
| |
| int numa_exit_on_error = 0; |
| int numa_exit_on_warn = 0; |
| static void set_sizes(void); |
| |
| /* |
| * numa_init() below is marked as a constructor, so the dynamic loader |
| * calls it automatically when the library is loaded (numa_fini() is |
| * the matching destructor, run when the library is unloaded). |
| * |
| * The v1 library depends upon nodemask_t's of all nodes and no nodes. |
| */ |
| void __attribute__((constructor)) |
| numa_init(void) |
| { |
| int max,i; |
| |
| if (sizes_set) |
| return; |
| |
| set_sizes(); |
| /* numa_all_nodes should represent existing nodes on this system */ |
| max = numa_num_configured_nodes(); |
| for (i = 0; i < max; i++) |
| nodemask_set_compat((nodemask_t *)&numa_all_nodes, i); |
| memset(&numa_no_nodes, 0, sizeof(numa_no_nodes)); |
| } |
| |
| #define FREE_AND_ZERO(x) do { \ |
| if (x) { \ |
| numa_bitmask_free(x); \ |
| x = NULL; \ |
| } \ |
| } while (0) |
| |
| void __attribute__((destructor)) |
| numa_fini(void) |
| { |
| FREE_AND_ZERO(numa_all_cpus_ptr); |
| FREE_AND_ZERO(numa_possible_cpus_ptr); |
| FREE_AND_ZERO(numa_all_nodes_ptr); |
| FREE_AND_ZERO(numa_possible_nodes_ptr); |
| FREE_AND_ZERO(numa_no_nodes_ptr); |
| FREE_AND_ZERO(numa_memnode_ptr); |
| FREE_AND_ZERO(numa_nodes_ptr); |
| } |
| |
| /* |
| * The following bitmask declarations, bitmask_*() routines, and associated |
| * _setbit() and _getbit() routines are: |
| * Copyright (c) 2004-2007 Silicon Graphics, Inc. (SGI) All rights reserved. |
| * SGI publishes it under the terms of the GNU General Public License, v2, |
| * as published by the Free Software Foundation. |
| */ |
| static unsigned int |
| _getbit(const struct bitmask *bmp, unsigned int n) |
| { |
| if (n < bmp->size) |
| return (bmp->maskp[n/bitsperlong] >> (n % bitsperlong)) & 1; |
| else |
| return 0; |
| } |
| |
| static void |
| _setbit(struct bitmask *bmp, unsigned int n, unsigned int v) |
| { |
| if (n < bmp->size) { |
| if (v) |
| bmp->maskp[n/bitsperlong] |= 1UL << (n % bitsperlong); |
| else |
| bmp->maskp[n/bitsperlong] &= ~(1UL << (n % bitsperlong)); |
| } |
| } |
| |
| int |
| numa_bitmask_isbitset(const struct bitmask *bmp, unsigned int i) |
| { |
| return _getbit(bmp, i); |
| } |
| |
| struct bitmask * |
| numa_bitmask_setall(struct bitmask *bmp) |
| { |
| unsigned int i; |
| for (i = 0; i < bmp->size; i++) |
| _setbit(bmp, i, 1); |
| return bmp; |
| } |
| |
| struct bitmask * |
| numa_bitmask_clearall(struct bitmask *bmp) |
| { |
| unsigned int i; |
| for (i = 0; i < bmp->size; i++) |
| _setbit(bmp, i, 0); |
| return bmp; |
| } |
| |
| struct bitmask * |
| numa_bitmask_setbit(struct bitmask *bmp, unsigned int i) |
| { |
| _setbit(bmp, i, 1); |
| return bmp; |
| } |
| |
| struct bitmask * |
| numa_bitmask_clearbit(struct bitmask *bmp, unsigned int i) |
| { |
| _setbit(bmp, i, 0); |
| return bmp; |
| } |
| |
| unsigned int |
| numa_bitmask_nbytes(struct bitmask *bmp) |
| { |
| return longsperbits(bmp->size) * sizeof(unsigned long); |
| } |
| |
| /* where n is the number of bits in the map */ |
| /* This function should not exit on failure, but right now we cannot really |
| recover from this. */ |
| struct bitmask * |
| numa_bitmask_alloc(unsigned int n) |
| { |
| struct bitmask *bmp; |
| |
| if (n < 1) { |
| errno = EINVAL; |
| numa_error("request to allocate mask for invalid number"); |
| exit(1); |
| } |
| bmp = malloc(sizeof(*bmp)); |
| if (bmp == 0) |
| goto oom; |
| bmp->size = n; |
| bmp->maskp = calloc(longsperbits(n), sizeof(unsigned long)); |
| if (bmp->maskp == 0) { |
| free(bmp); |
| goto oom; |
| } |
| return bmp; |
| |
| oom: |
| numa_error("Out of memory allocating bitmask"); |
| exit(1); |
| } |
| |
| void |
| numa_bitmask_free(struct bitmask *bmp) |
| { |
| if (bmp == 0) |
| return; |
| free(bmp->maskp); |
| bmp->maskp = (unsigned long *)0xdeadcdef; /* double free tripwire */ |
| free(bmp); |
| return; |
| } |
| |
| /* True if two bitmasks are equal */ |
| int |
| numa_bitmask_equal(const struct bitmask *bmp1, const struct bitmask *bmp2) |
| { |
| unsigned int i; |
| for (i = 0; i < bmp1->size || i < bmp2->size; i++) |
| if (_getbit(bmp1, i) != _getbit(bmp2, i)) |
| return 0; |
| return 1; |
| } |
| |
| /* Hamming Weight: number of set bits */ |
| unsigned int numa_bitmask_weight(const struct bitmask *bmp) |
| { |
| unsigned int i; |
| unsigned int w = 0; |
| for (i = 0; i < bmp->size; i++) |
| if (_getbit(bmp, i)) |
| w++; |
| return w; |
| } |
| |
| /* *****end of bitmask_ routines ************ */ |
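| |
| /* |
| * Usage sketch for the bitmask_ routines above (illustrative only, |
| * not part of the library): |
| * |
| *     struct bitmask *bmp = numa_bitmask_alloc(numa_num_possible_nodes()); |
| *     numa_bitmask_setbit(bmp, 0); |
| *     numa_bitmask_setbit(bmp, 2); |
| *     assert(numa_bitmask_weight(bmp) == 2); |
| *     numa_bitmask_free(bmp); |
| */ |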
| |
| /* The next two handlers can be overridden by the application for different error handling */ |
| WEAK void numa_error(const char *where) |
| { |
| int olde = errno; |
| perror(where); |
| if (numa_exit_on_error) |
| exit(1); |
| errno = olde; |
| } |
| |
| WEAK void numa_warn(int num, const char *fmt, ...) |
| { |
| static unsigned warned; |
| va_list ap; |
| int olde = errno; |
| |
| /* Give each warning only once */ |
| if ((1<<num) & warned) |
| return; |
| warned |= (1<<num); |
| |
| va_start(ap,fmt); |
| fprintf(stderr, "libnuma: Warning: "); |
| vfprintf(stderr, fmt, ap); |
| fputc('\n', stderr); |
| va_end(ap); |
| |
| errno = olde; |
| } |
| |
| static void setpol(int policy, struct bitmask *bmp) |
| { |
| if (set_mempolicy(policy, bmp->maskp, bmp->size + 1) < 0) |
| numa_error("set_mempolicy"); |
| } |
| |
| static void getpol(int *oldpolicy, struct bitmask *bmp) |
| { |
| if (get_mempolicy(oldpolicy, bmp->maskp, bmp->size + 1, 0, 0) < 0) |
| numa_error("get_mempolicy"); |
| } |
| |
| static void dombind(void *mem, size_t size, int pol, struct bitmask *bmp) |
| { |
| if (mbind(mem, size, pol, bmp ? bmp->maskp : NULL, bmp ? bmp->size + 1 : 0, |
| mbind_flags) < 0) |
| numa_error("mbind"); |
| } |
| |
| /* (undocumented) */ |
| /* gives the wrong answer for hugetlbfs mappings. */ |
| int numa_pagesize(void) |
| { |
| static int pagesize; |
| if (pagesize > 0) |
| return pagesize; |
| pagesize = getpagesize(); |
| return pagesize; |
| } |
| |
| make_internal_alias(numa_pagesize); |
| |
| /* |
| * Find nodes (numa_nodes_ptr), nodes with memory (numa_memnode_ptr) |
| * and the highest numbered existing node (maxconfigurednode). |
| */ |
| static void |
| set_configured_nodes(void) |
| { |
| DIR *d; |
| struct dirent *de; |
| long long freep; |
| |
| numa_memnode_ptr = numa_allocate_nodemask(); |
| numa_nodes_ptr = numa_allocate_nodemask(); |
| |
| d = opendir("/sys/devices/system/node"); |
| if (!d) { |
| maxconfigurednode = 0; |
| } else { |
| while ((de = readdir(d)) != NULL) { |
| int nd; |
| if (strncmp(de->d_name, "node", 4)) |
| continue; |
| nd = strtoul(de->d_name+4, NULL, 0); |
| numa_bitmask_setbit(numa_nodes_ptr, nd); |
| if (numa_node_size64(nd, &freep) > 0) |
| numa_bitmask_setbit(numa_memnode_ptr, nd); |
| if (maxconfigurednode < nd) |
| maxconfigurednode = nd; |
| } |
| closedir(d); |
| } |
| } |
| |
| /* |
| * Convert the string length of an ascii hex mask to the number |
| * of bits represented by that mask. |
| */ |
| static int s2nbits(const char *s) |
| { |
| return strlen(s) * 32 / 9; |
| } |
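| |
| /* |
| * For example, the mask field "00000000,00000001\n" (each 32 bit word |
| * is 8 hex digits plus a comma or newline, i.e. 9 characters per word) |
| * has strlen() 18, giving 18 * 32 / 9 = 64 bits. |
| */ |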
| |
| /* Is string 'pre' a prefix of string 's'? */ |
| static int strprefix(const char *s, const char *pre) |
| { |
| return strncmp(s, pre, strlen(pre)) == 0; |
| } |
| |
| static const char *mask_size_file = "/proc/self/status"; |
| static const char *nodemask_prefix = "Mems_allowed:\t"; |
| /* |
| * (do this the way Paul Jackson's libcpuset does it) |
| * The nodemask values in /proc/self/status are in an |
| * ascii format that uses 9 characters for each 32 bits of mask. |
| * (this could also be used to find the cpumask size) |
| */ |
| static void |
| set_nodemask_size(void) |
| { |
| FILE *fp; |
| char *buf = NULL; |
| size_t bufsize = 0; |
| |
| if ((fp = fopen(mask_size_file, "r")) == NULL) |
| goto done; |
| |
| while (getline(&buf, &bufsize, fp) > 0) { |
| #ifdef MEMORY_SANITIZER |
| __msan_unpoison_string(buf); |
| #endif |
| if (strprefix(buf, nodemask_prefix)) { |
| nodemask_sz = s2nbits(buf + strlen(nodemask_prefix)); |
| break; |
| } |
| } |
| free(buf); |
| fclose(fp); |
| done: |
| if (nodemask_sz == 0) {/* fall back on error */ |
| int pol; |
| unsigned long *mask = NULL, *tmp; |
| nodemask_sz = 16; |
| do { |
| nodemask_sz <<= 1; |
| /* use a temporary so the old buffer is not leaked on failure */ |
| tmp = realloc(mask, nodemask_sz / 8); |
| if (!tmp) { |
| free(mask); |
| return; |
| } |
| mask = tmp; |
| } while (get_mempolicy(&pol, mask, nodemask_sz + 1, 0, 0) < 0 && errno == EINVAL && |
| nodemask_sz < 4096*8); |
| free(mask); |
| } |
| } |
| |
| /* |
| * Read a mask consisting of a sequence of hexadecimal longs separated by |
| * commas. Order them correctly and return the number of bits set. |
| */ |
| static int |
| read_mask(char *s, struct bitmask *bmp) |
| { |
| char *end = s; |
| int tmplen = (bmp->size + bitsperint - 1) / bitsperint; |
| unsigned int tmp[tmplen]; |
| unsigned int *start = tmp; |
| unsigned int i, n = 0, m = 0; |
| |
| if (!s) |
| return 0; /* shouldn't happen */ |
| |
| i = strtoul(s, &end, 16); |
| |
| /* Skip leading zeros */ |
| while (!i && *end++ == ',') { |
| i = strtoul(end, &end, 16); |
| } |
| |
| if (!i) |
| /* End of string. No mask */ |
| return -1; |
| |
| start[n++] = i; |
| /* Read sequence of ints */ |
| while (*end++ == ',') { |
| i = strtoul(end, &end, 16); |
| |
| /* check before storing to avoid overflowing tmp[] */ |
| if (n >= tmplen) |
| return -1; |
| start[n++] = i; |
| } |
| |
| /* |
| * Reverse the sequence of ints: the first int read is the most |
| * significant, but it was stored first. Combine pairs of 32 bit |
| * ints into longs in an endian-safe way while doing so. |
| */ |
| while (n) { |
| int w; |
| unsigned long x = 0; |
| /* read into long values in an endian-safe way */ |
| for (w = 0; n && w < bitsperlong; w += bitsperint) |
| x |= ((unsigned long)start[n-- - 1] << w); |
| |
| bmp->maskp[m++] = x; |
| } |
| /* |
| * Return the number of bits set |
| */ |
| return numa_bitmask_weight(bmp); |
| } |
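| |
| /* |
| * Worked example (on a 64 bit system): for the string |
| * "0000000f,ffffffff" the words 0xf and 0xffffffff are read in that |
| * order and recombined into maskp[0] = 0x0000000fffffffff; the |
| * returned weight is 36. |
| */ |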
| |
| /* |
| * Read a process's constraints in terms of nodes and cpus from |
| * /proc/self/status. |
| */ |
| static void |
| set_task_constraints(void) |
| { |
| int hicpu = maxconfiguredcpu; |
| int i; |
| char *buffer = NULL; |
| size_t buflen = 0; |
| FILE *f; |
| |
| numa_all_cpus_ptr = numa_allocate_cpumask(); |
| numa_possible_cpus_ptr = numa_allocate_cpumask(); |
| numa_all_nodes_ptr = numa_allocate_nodemask(); |
| numa_possible_nodes_ptr = numa_allocate_nodemask(); |
| numa_no_nodes_ptr = numa_allocate_nodemask(); |
| |
| f = fopen(mask_size_file, "r"); |
| if (!f) { |
| //numa_warn(W_cpumap, "Cannot parse %s", mask_size_file); |
| return; |
| } |
| |
| while (getline(&buffer, &buflen, f) > 0) { |
| #ifdef MEMORY_SANITIZER |
| __msan_unpoison_string(buffer); |
| #endif |
| /* mask starts after [last] tab; skip malformed lines */ |
| char *mask = strrchr(buffer,'\t'); |
| if (!mask) |
| continue; |
| mask++; |
| |
| if (strncmp(buffer,"Cpus_allowed:",13) == 0) |
| numproccpu = read_mask(mask, numa_all_cpus_ptr); |
| |
| if (strncmp(buffer,"Mems_allowed:",13) == 0) { |
| numprocnode = read_mask(mask, numa_all_nodes_ptr); |
| } |
| } |
| fclose(f); |
| free(buffer); |
| |
| for (i = 0; i <= hicpu; i++) |
| numa_bitmask_setbit(numa_possible_cpus_ptr, i); |
| for (i = 0; i <= maxconfigurednode; i++) |
| numa_bitmask_setbit(numa_possible_nodes_ptr, i); |
| |
| /* |
| * Cpus_allowed in the kernel may be set to all f's, |
| * i.e. it may be a superset of the actually available processors. |
| * In that case reduce numproccpu to the number of actually |
| * available cpus. |
| */ |
| if (numproccpu <= 0) { |
| for (i = 0; i <= hicpu; i++) |
| numa_bitmask_setbit(numa_all_cpus_ptr, i); |
| numproccpu = hicpu+1; |
| } |
| |
| if (numproccpu > hicpu+1) { |
| numproccpu = hicpu+1; |
| for (i=hicpu+1; i<numa_all_cpus_ptr->size; i++) { |
| numa_bitmask_clearbit(numa_all_cpus_ptr, i); |
| } |
| } |
| |
| if (numprocnode <= 0) { |
| for (i = 0; i <= maxconfigurednode; i++) |
| numa_bitmask_setbit(numa_all_nodes_ptr, i); |
| numprocnode = maxconfigurednode + 1; |
| } |
| |
| return; |
| } |
| |
| /* |
| * Find the highest cpu number possible (in other words the size |
| * of a kernel cpumask_t (in bits) - 1) |
| */ |
| static void |
| set_numa_max_cpu(void) |
| { |
| int len = 4096; |
| int n; |
| int olde = errno; |
| struct bitmask *buffer; |
| |
| do { |
| buffer = numa_bitmask_alloc(len); |
| n = numa_sched_getaffinity_v2_int(0, buffer); |
| /* on success, returns size of kernel cpumask_t, in bytes */ |
| if (n < 0) { |
| if (errno == EINVAL) { |
| if (len >= 1024*1024) { |
| /* give up; fall back to the same guess as the error path */ |
| n = sizeof(cpu_set_t); |
| break; |
| } |
| len *= 2; |
| numa_bitmask_free(buffer); |
| continue; |
| } else { |
| numa_warn(W_numcpus, "Unable to determine max cpu" |
| " (sched_getaffinity: %s); guessing...", |
| strerror(errno)); |
| n = sizeof(cpu_set_t); |
| break; |
| } |
| } |
| } while (n < 0); |
| numa_bitmask_free(buffer); |
| errno = olde; |
| cpumask_sz = n*8; |
| } |
| |
| /* |
| * get the total (configured) number of cpus - both online and offline |
| */ |
| static void |
| set_configured_cpus(void) |
| { |
| long ncpus = sysconf(_SC_NPROCESSORS_CONF); |
| |
| /* test sysconf's return directly; -1 - 1 would evade a later check */ |
| if (ncpus == -1) |
| numa_error("sysconf(_SC_NPROCESSORS_CONF) failed"); |
| maxconfiguredcpu = ncpus - 1; |
| } |
| |
| /* |
| * Initialize all the sizes. |
| */ |
| static void |
| set_sizes(void) |
| { |
| sizes_set++; |
| set_nodemask_size(); /* size of kernel nodemask_t */ |
| set_configured_nodes(); /* configured nodes listed in /sys */ |
| set_numa_max_cpu(); /* size of kernel cpumask_t */ |
| set_configured_cpus(); /* cpus listed in /sys/devices/system/cpu */ |
| set_task_constraints(); /* cpus and nodes for current task */ |
| } |
| |
| int |
| numa_num_configured_nodes(void) |
| { |
| /* |
| * NOTE: this function's behavior matches the documentation (i.e. it |
| * returns a count of nodes with memory) despite the poor function |
| * naming. We also cannot use the similarly poorly named |
| * numa_all_nodes_ptr as it only tracks nodes with memory from which |
| * the calling process can allocate. Think sparse nodes, memory-less |
| * nodes, cpusets... |
| */ |
| int memnodecount=0, i; |
| |
| for (i=0; i <= maxconfigurednode; i++) { |
| if (numa_bitmask_isbitset(numa_memnode_ptr, i)) |
| memnodecount++; |
| } |
| return memnodecount; |
| } |
| |
| int |
| numa_num_configured_cpus(void) |
| { |
| return maxconfiguredcpu+1; |
| } |
| |
| int |
| numa_num_possible_nodes(void) |
| { |
| return nodemask_sz; |
| } |
| |
| int |
| numa_num_possible_cpus(void) |
| { |
| return cpumask_sz; |
| } |
| |
| int |
| numa_num_task_nodes(void) |
| { |
| return numprocnode; |
| } |
| |
| /* |
| * for backward compatibility |
| */ |
| int |
| numa_num_thread_nodes(void) |
| { |
| return numa_num_task_nodes(); |
| } |
| |
| int |
| numa_num_task_cpus(void) |
| { |
| return numproccpu; |
| } |
| |
| /* |
| * for backward compatibility |
| */ |
| int |
| numa_num_thread_cpus(void) |
| { |
| return numa_num_task_cpus(); |
| } |
| |
| /* |
| * Return the number of the highest node in this running system. |
| */ |
| int |
| numa_max_node(void) |
| { |
| return maxconfigurednode; |
| } |
| |
| make_internal_alias(numa_max_node); |
| |
| /* |
| * Return the number of the highest possible node in a system, |
| * which for v1 is the size of a numa.h nodemask_t (in bits) - 1, |
| * but for v2 is the size of a kernel nodemask_t (in bits) - 1. |
| */ |
| int |
| numa_max_possible_node_v1(void) |
| { |
| return ((sizeof(nodemask_t)*8)-1); |
| } |
| backward_symver(numa_max_possible_node_v1,numa_max_possible_node); |
| |
| int |
| numa_max_possible_node_v2(void) |
| { |
| return numa_num_possible_nodes()-1; |
| } |
| symver(numa_max_possible_node_v2,numa_max_possible_node); |
| |
| make_internal_alias(numa_max_possible_node_v1); |
| make_internal_alias(numa_max_possible_node_v2); |
| |
| /* |
| * Allocate a bitmask for cpus, of a size large enough to |
| * match the kernel's cpumask_t. |
| */ |
| struct bitmask * |
| numa_allocate_cpumask(void) |
| { |
| int ncpus = numa_num_possible_cpus(); |
| |
| return numa_bitmask_alloc(ncpus); |
| } |
| |
| /* |
| * Allocate a bitmask the size of a libnuma nodemask_t |
| */ |
| static struct bitmask * |
| allocate_nodemask_v1(void) |
| { |
| int nnodes = numa_max_possible_node_v1_int()+1; |
| |
| return numa_bitmask_alloc(nnodes); |
| } |
| |
| /* |
| * Allocate a bitmask for nodes, of a size large enough to |
| * match the kernel's nodemask_t. |
| */ |
| struct bitmask * |
| numa_allocate_nodemask(void) |
| { |
| struct bitmask *bmp; |
| int nnodes = numa_max_possible_node_v2_int() + 1; |
| |
| bmp = numa_bitmask_alloc(nnodes); |
| return bmp; |
| } |
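| |
| /* |
| * Usage sketch (illustrative only): masks returned by the allocators |
| * above must be released with numa_bitmask_free(): |
| * |
| *     struct bitmask *nodes = numa_allocate_nodemask(); |
| *     numa_bitmask_setbit(nodes, 0); |
| *     numa_set_membind(nodes);        (bind memory to node 0) |
| *     numa_bitmask_free(nodes); |
| */ |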
| |
| /* (cache the result?) */ |
| long long numa_node_size64(int node, long long *freep) |
| { |
| size_t len = 0; |
| char *line = NULL; |
| long long size = -1; |
| FILE *f; |
| char fn[64]; |
| int ok = 0; |
| int required = freep ? 2 : 1; |
| |
| if (freep) |
| *freep = -1; |
| sprintf(fn,"/sys/devices/system/node/node%d/meminfo", node); |
| f = fopen(fn, "r"); |
| if (!f) |
| return -1; |
| while (getdelim(&line, &len, '\n', f) > 0) { |
| char *end; |
| char *s = strcasestr(line, "kB"); |
| if (!s) |
| continue; |
| --s; |
| while (s > line && isspace(*s)) |
| --s; |
| while (s > line && isdigit(*s)) |
| --s; |
| if (strstr(line, "MemTotal")) { |
| size = strtoull(s,&end,0) << 10; |
| if (end == s) |
| size = -1; |
| else |
| ok++; |
| } |
| if (freep && strstr(line, "MemFree")) { |
| *freep = strtoull(s,&end,0) << 10; |
| if (end == s) |
| *freep = -1; |
| else |
| ok++; |
| } |
| } |
| fclose(f); |
| free(line); |
| if (ok != required) |
| numa_warn(W_badmeminfo, "Cannot parse sysfs meminfo (%d)", ok); |
| return size; |
| } |
| |
| make_internal_alias(numa_node_size64); |
| |
| long numa_node_size(int node, long *freep) |
| { |
| long long f2; |
| long sz = numa_node_size64_int(node, &f2); |
| if (freep) |
| *freep = f2; |
| return sz; |
| } |
| |
| int numa_available(void) |
| { |
| if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS) |
| return -1; |
| return 0; |
| } |
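| |
| /* |
| * Usage sketch (illustrative only): as the header comment notes, all |
| * other calls are undefined unless this returns >= 0: |
| * |
| *     if (numa_available() < 0) { |
| *             fprintf(stderr, "NUMA not supported on this system\n"); |
| *             exit(1); |
| *     } |
| */ |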
| |
| void |
| numa_interleave_memory_v1(void *mem, size_t size, const nodemask_t *mask) |
| { |
| struct bitmask bitmask; |
| |
| bitmask.size = sizeof(nodemask_t) * 8; |
| bitmask.maskp = (unsigned long *)mask; |
| dombind(mem, size, MPOL_INTERLEAVE, &bitmask); |
| } |
| backward_symver(numa_interleave_memory_v1,numa_interleave_memory); |
| |
| void |
| numa_interleave_memory_v2(void *mem, size_t size, struct bitmask *bmp) |
| { |
| dombind(mem, size, MPOL_INTERLEAVE, bmp); |
| } |
| symver(numa_interleave_memory_v2,numa_interleave_memory); |
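| |
| /* |
| * Usage sketch (illustrative only): interleave an anonymous mapping |
| * across all nodes the task may allocate on: |
| * |
| *     void *mem = mmap(NULL, len, PROT_READ|PROT_WRITE, |
| *                      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); |
| *     if (mem != MAP_FAILED) |
| *             numa_interleave_memory(mem, len, numa_all_nodes_ptr); |
| */ |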
| |
| void numa_tonode_memory(void *mem, size_t size, int node) |
| { |
| struct bitmask *nodes; |
| |
| nodes = numa_allocate_nodemask(); |
| numa_bitmask_setbit(nodes, node); |
| dombind(mem, size, bind_policy, nodes); |
| numa_bitmask_free(nodes); |
| } |
| |
| void |
| numa_tonodemask_memory_v1(void *mem, size_t size, const nodemask_t *mask) |
| { |
| struct bitmask bitmask; |
| |
| bitmask.maskp = (unsigned long *)mask; |
| bitmask.size = sizeof(nodemask_t) * 8; /* bits, not bytes */ |
| dombind(mem, size, bind_policy, &bitmask); |
| } |
| backward_symver(numa_tonodemask_memory_v1,numa_tonodemask_memory); |
| |
| void |
| numa_tonodemask_memory_v2(void *mem, size_t size, struct bitmask *bmp) |
| { |
| dombind(mem, size, bind_policy, bmp); |
| } |
| symver(numa_tonodemask_memory_v2,numa_tonodemask_memory); |
| |
| void numa_setlocal_memory(void *mem, size_t size) |
| { |
| dombind(mem, size, MPOL_PREFERRED, NULL); |
| } |
| |
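| /* |
| * Touch every page of the region once so the pages are actually |
| * allocated (and placed) under the currently effective memory policy. |
| * The volatile access keeps the compiler from optimizing the |
| * read-modify-write away. |
| */ |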
| void numa_police_memory(void *mem, size_t size) |
| { |
| int pagesize = numa_pagesize_int(); |
| unsigned long i; |
| for (i = 0; i < size; i += pagesize) |
| ((volatile char*)mem)[i] = ((volatile char*)mem)[i]; |
| } |
| |
| make_internal_alias(numa_police_memory); |
| |
| void *numa_alloc(size_t size) |
| { |
| char *mem; |
| mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, |
| 0, 0); |
| if (mem == (char *)-1) |
| return NULL; |
| numa_police_memory_int(mem, size); |
| return mem; |
| } |
| |
| void *numa_realloc(void *old_addr, size_t old_size, size_t new_size) |
| { |
| char *mem; |
| mem = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE); |
| if (mem == (char *)-1) |
| return NULL; |
| /* |
| * The memory policy of the allocated pages is preserved by mremap(), so |
| * there is no need to (re)set it here. If the policy of the original |
| * allocation is not set, the new pages will be allocated according to the |
| * process' mempolicy. Trying to allocate explicitly the new pages on the |
| * same node as the original ones would require changing the policy of the |
| * newly allocated pages, which violates the numa_realloc() semantics. |
| */ |
| return mem; |
| } |
| |
| void *numa_alloc_interleaved_subset_v1(size_t size, const nodemask_t *mask) |
| { |
| char *mem; |
| struct bitmask bitmask; |
| |
| mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, |
| 0, 0); |
| if (mem == (char *)-1) |
| return NULL; |
| bitmask.maskp = (unsigned long *)mask; |
| bitmask.size = sizeof(nodemask_t) * 8; |
| dombind(mem, size, MPOL_INTERLEAVE, &bitmask); |
| return mem; |
| } |
| backward_symver(numa_alloc_interleaved_subset_v1,numa_alloc_interleaved_subset); |
| |
| void *numa_alloc_interleaved_subset_v2(size_t size, struct bitmask *bmp) |
| { |
| char *mem; |
| |
| mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, |
| 0, 0); |
| if (mem == (char *)-1) |
| return NULL; |
| dombind(mem, size, MPOL_INTERLEAVE, bmp); |
| return mem; |
| } |
| symver(numa_alloc_interleaved_subset_v2,numa_alloc_interleaved_subset); |
| |
| make_internal_alias(numa_alloc_interleaved_subset_v1); |
| make_internal_alias(numa_alloc_interleaved_subset_v2); |
| |
| void * |
| numa_alloc_interleaved(size_t size) |
| { |
| return numa_alloc_interleaved_subset_v2_int(size, numa_all_nodes_ptr); |
| } |
| |
| /* |
| * given a user node mask, set memory policy to use those nodes |
| */ |
| void |
| numa_set_interleave_mask_v1(nodemask_t *mask) |
| { |
| struct bitmask *bmp; |
| int nnodes = numa_max_possible_node_v1_int()+1; |
| |
| bmp = numa_bitmask_alloc(nnodes); |
| copy_nodemask_to_bitmask(mask, bmp); |
| if (numa_bitmask_equal(bmp, numa_no_nodes_ptr)) |
| setpol(MPOL_DEFAULT, bmp); |
| else |
| setpol(MPOL_INTERLEAVE, bmp); |
| numa_bitmask_free(bmp); |
| } |
| |
| backward_symver(numa_set_interleave_mask_v1,numa_set_interleave_mask); |
| |
| void |
| numa_set_interleave_mask_v2(struct bitmask *bmp) |
| { |
| if (numa_bitmask_equal(bmp, numa_no_nodes_ptr)) |
| setpol(MPOL_DEFAULT, bmp); |
| else |
| setpol(MPOL_INTERLEAVE, bmp); |
| } |
| symver(numa_set_interleave_mask_v2,numa_set_interleave_mask); |
| |
| nodemask_t |
| numa_get_interleave_mask_v1(void) |
| { |
| int oldpolicy = -1; |
| struct bitmask *bmp; |
| nodemask_t mask; |
| |
| bmp = allocate_nodemask_v1(); |
| getpol(&oldpolicy, bmp); |
| if (oldpolicy == MPOL_INTERLEAVE) |
| copy_bitmask_to_nodemask(bmp, &mask); |
| else |
| copy_bitmask_to_nodemask(numa_no_nodes_ptr, &mask); |
| numa_bitmask_free(bmp); |
| return mask; |
| } |
| backward_symver(numa_get_interleave_mask_v1,numa_get_interleave_mask); |
| |
| struct bitmask * |
| numa_get_interleave_mask_v2(void) |
| { |
| int oldpolicy = -1; |
| struct bitmask *bmp; |
| |
| bmp = numa_allocate_nodemask(); |
| getpol(&oldpolicy, bmp); |
| if (oldpolicy != MPOL_INTERLEAVE) |
| copy_bitmask_to_bitmask(numa_no_nodes_ptr, bmp); |
| return bmp; |
| } |
| symver(numa_get_interleave_mask_v2,numa_get_interleave_mask); |
| |
| /* (undocumented) */ |
| int numa_get_interleave_node(void) |
| { |
| int nd; |
| if (get_mempolicy(&nd, NULL, 0, 0, MPOL_F_NODE) == 0) |
| return nd; |
| return 0; |
| } |
| |
| void *numa_alloc_onnode(size_t size, int node) |
| { |
| char *mem; |
| struct bitmask *bmp; |
| |
| bmp = numa_allocate_nodemask(); |
| numa_bitmask_setbit(bmp, node); |
| mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, |
| 0, 0); |
| if (mem == (char *)-1) |
| mem = NULL; |
| else |
| dombind(mem, size, bind_policy, bmp); |
| numa_bitmask_free(bmp); |
| return mem; |
| } |
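| |
| /* |
| * Usage sketch (illustrative only): memory from the numa_alloc_*() |
| * family must be released with numa_free(), not free(): |
| * |
| *     size_t len = 1024 * sizeof(double); |
| *     double *v = numa_alloc_onnode(len, 0); |
| *     if (v) { |
| *             ... use v ... |
| *             numa_free(v, len); |
| *     } |
| */ |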
| |
| void *numa_alloc_local(size_t size) |
| { |
| char *mem; |
| mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, |
| 0, 0); |
| if (mem == (char *)-1) |
| mem = NULL; |
| else |
| dombind(mem, size, MPOL_PREFERRED, NULL); |
| return mem; |
| } |
| |
| void numa_set_bind_policy(int strict) |
| { |
| if (strict) |
| bind_policy = MPOL_BIND; |
| else |
| bind_policy = MPOL_PREFERRED; |
| } |
| |
| void |
| numa_set_membind_v1(const nodemask_t *mask) |
| { |
| struct bitmask bitmask; |
| |
| bitmask.maskp = (unsigned long *)mask; |
| bitmask.size = sizeof(nodemask_t) * 8; |
| setpol(MPOL_BIND, &bitmask); |
| } |
| backward_symver(numa_set_membind_v1,numa_set_membind); |
| |
| void |
| numa_set_membind_v2(struct bitmask *bmp) |
| { |
| setpol(MPOL_BIND, bmp); |
| } |
| symver(numa_set_membind_v2,numa_set_membind); |
| |
| make_internal_alias(numa_set_membind_v2); |
| |
| /* |
| * copy a bitmask map body to a numa.h nodemask_t structure |
| */ |
| void |
| copy_bitmask_to_nodemask(struct bitmask *bmp, nodemask_t *nmp) |
| { |
| int max, i; |
| |
| memset(nmp, 0, sizeof(nodemask_t)); |
| max = (sizeof(nodemask_t)*8); |
| for (i=0; i<bmp->size; i++) { |
| if (i >= max) |
| break; |
| if (numa_bitmask_isbitset(bmp, i)) |
| nodemask_set_compat((nodemask_t *)nmp, i); |
| } |
| } |
| |
| /* |
| * copy a bitmask map body to another bitmask body |
| * fill a larger destination with zeroes |
| */ |
| void |
| copy_bitmask_to_bitmask(struct bitmask *bmpfrom, struct bitmask *bmpto) |
| { |
| int bytes; |
| |
| if (bmpfrom->size >= bmpto->size) { |
| memcpy(bmpto->maskp, bmpfrom->maskp, CPU_BYTES(bmpto->size)); |
| } else if (bmpfrom->size < bmpto->size) { |
| bytes = CPU_BYTES(bmpfrom->size); |
| memcpy(bmpto->maskp, bmpfrom->maskp, bytes); |
| memset(((char *)bmpto->maskp)+bytes, 0, |
| CPU_BYTES(bmpto->size)-bytes); |
| } |
| } |
| |
| /* |
| * copy a numa.h nodemask_t structure to a bitmask map body |
| */ |
| void |
| copy_nodemask_to_bitmask(nodemask_t *nmp, struct bitmask *bmp) |
| { |
| int max, i; |
| |
| numa_bitmask_clearall(bmp); |
| max = (sizeof(nodemask_t)*8); |
| if (max > bmp->size) |
| max = bmp->size; |
| for (i=0; i<max; i++) { |
| if (nodemask_isset_compat(nmp, i)) |
| numa_bitmask_setbit(bmp, i); |
| } |
| } |
| |
| nodemask_t |
| numa_get_membind_v1(void) |
| { |
| int oldpolicy = -1; |
| struct bitmask *bmp; |
| nodemask_t nmp; |
| |
| bmp = allocate_nodemask_v1(); |
| getpol(&oldpolicy, bmp); |
| if (oldpolicy == MPOL_BIND) { |
| copy_bitmask_to_nodemask(bmp, &nmp); |
| } else { |
| /* copy the body of the map to numa_all_nodes */ |
| copy_bitmask_to_nodemask(bmp, &numa_all_nodes); |
| nmp = numa_all_nodes; |
| } |
| numa_bitmask_free(bmp); |
| return nmp; |
| } |
| backward_symver(numa_get_membind_v1,numa_get_membind); |
| |
| struct bitmask * |
| numa_get_membind_v2(void) |
| { |
| int oldpolicy = -1; |
| struct bitmask *bmp; |
| |
| bmp = numa_allocate_nodemask(); |
| getpol(&oldpolicy, bmp); |
| if (oldpolicy != MPOL_BIND) |
| copy_bitmask_to_bitmask(numa_all_nodes_ptr, bmp); |
| return bmp; |
| } |
| symver(numa_get_membind_v2,numa_get_membind); |
| |
| //TODO: do we need a v1 nodemask_t version? |
| struct bitmask *numa_get_mems_allowed(void) |
| { |
| struct bitmask *bmp; |
| |
| /* |
| * can change, so query on each call. |
| */ |
| bmp = numa_allocate_nodemask(); |
| if (get_mempolicy(NULL, bmp->maskp, bmp->size + 1, 0, |
| MPOL_F_MEMS_ALLOWED) < 0) |
| numa_error("get_mempolicy"); |
| return bmp; |
| } |
| make_internal_alias(numa_get_mems_allowed); |
| |
| void numa_free(void *mem, size_t size) |
| { |
| munmap(mem, size); |
| } |
| |
| int |
| numa_parse_bitmap_v1(char *line, unsigned long *mask, int ncpus) |
| { |
| int i; |
| char *p = strchr(line, '\n'); |
| if (!p) |
| return -1; |
| |
| for (i = 0; p > line; i++) { |
| char *oldp, *endp; |
| oldp = p; |
| if (*p == ',') |
| --p; |
| while (p > line && *p != ',') |
| --p; |
| /* Eat two 32bit fields at a time to get longs */ |
| if (p > line && sizeof(unsigned long) == 8) { |
| oldp--; |
| memmove(p, p+1, oldp-p+1); |
| while (p > line && *p != ',') |
| --p; |
| } |
| if (*p == ',') |
| p++; |
| if (i >= CPU_LONGS(ncpus)) |
| return -1; |
| mask[i] = strtoul(p, &endp, 16); |
| if (endp != oldp) |
| return -1; |
| p--; |
| } |
| return 0; |
| } |
| backward_symver(numa_parse_bitmap_v1,numa_parse_bitmap); |
| |
| int |
| numa_parse_bitmap_v2(char *line, struct bitmask *mask) |
| { |
| int i, ncpus; |
| char *p = strchr(line, '\n'); |
| if (!p) |
| return -1; |
| ncpus = mask->size; |
| |
| for (i = 0; p > line; i++) { |
| char *oldp, *endp; |
| oldp = p; |
| if (*p == ',') |
| --p; |
| while (p > line && *p != ',') |
| --p; |
| /* Eat two 32bit fields at a time to get longs */ |
| if (p > line && sizeof(unsigned long) == 8) { |
| oldp--; |
| memmove(p, p+1, oldp-p+1); |
| while (p > line && *p != ',') |
| --p; |
| } |
| if (*p == ',') |
| p++; |
| if (i >= CPU_LONGS(ncpus)) |
| return -1; |
| mask->maskp[i] = strtoul(p, &endp, 16); |
| if (endp != oldp) |
| return -1; |
| p--; |
| } |
| return 0; |
| } |
| symver(numa_parse_bitmap_v2,numa_parse_bitmap); |
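| |
| /* |
| * Example: a sysfs cpumap line such as "00000003,00000000\n" lists |
| * 32 bit words with the highest word first; on a 64 bit system the |
| * parsers above merge each pair of words into one unsigned long, |
| * here maskp[0] = 0x0000000300000000 (cpus 32 and 33). |
| */ |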
| |
| static void |
| init_node_cpu_mask_v2(void) |
| { |
| int nnodes = numa_max_possible_node_v2_int() + 1; |
| node_cpu_mask_v2 = calloc(nnodes, sizeof(struct bitmask *)); |
| } |
| |
| /* This would be better with some locking, but I don't want to make libnuma |
| dependent on pthreads right now. The races are relatively harmless. */ |
| int |
| numa_node_to_cpus_v1(int node, unsigned long *buffer, int bufferlen) |
| { |
| int err = 0; |
| char fn[64]; |
| FILE *f; |
| char *line = NULL; |
| size_t len = 0; |
| struct bitmask bitmask; |
| int buflen_needed; |
| unsigned long *mask; |
| int ncpus = numa_num_possible_cpus(); |
| int maxnode = numa_max_node_int(); |
| |
| buflen_needed = CPU_BYTES(ncpus); |
| if ((unsigned)node > maxnode || bufferlen < buflen_needed) { |
| errno = ERANGE; |
| return -1; |
| } |
| if (bufferlen > buflen_needed) |
| memset(buffer, 0, bufferlen); |
| if (node_cpu_mask_v1[node]) { |
| memcpy(buffer, node_cpu_mask_v1[node], buflen_needed); |
| return 0; |
| } |
| |
| mask = malloc(buflen_needed); |
| if (!mask) |
| mask = (unsigned long *)buffer; |
| memset(mask, 0, buflen_needed); |
| |
| sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); |
| f = fopen(fn, "r"); |
| if (!f || getdelim(&line, &len, '\n', f) < 1) { |
| if (numa_bitmask_isbitset(numa_nodes_ptr, node)) { |
| numa_warn(W_nosysfs2, |
| "/sys not mounted or invalid. Assuming one node: %s", |
| strerror(errno)); |
| numa_warn(W_nosysfs2, |
| "(cannot open or correctly parse %s)", fn); |
| } |
| bitmask.maskp = (unsigned long *)mask; |
| bitmask.size = buflen_needed * 8; |
| numa_bitmask_setall(&bitmask); |
| err = -1; |
| } |
| if (f) |
| fclose(f); |
| |
| if (line && (numa_parse_bitmap_v1(line, mask, ncpus) < 0)) { |
| numa_warn(W_cpumap, "Cannot parse cpumap. Assuming one node"); |
| bitmask.maskp = (unsigned long *)mask; |
| bitmask.size = buflen_needed * 8; |
| numa_bitmask_setall(&bitmask); |
| err = -1; |
| } |
| |
| free(line); |
| memcpy(buffer, mask, buflen_needed); |
| |
| /* slightly racy, see above */ |
| if (node_cpu_mask_v1[node]) { |
| if (mask != buffer) |
| free(mask); |
| } else { |
| node_cpu_mask_v1[node] = mask; |
| } |
| return err; |
| } |
| backward_symver(numa_node_to_cpus_v1,numa_node_to_cpus); |
| |
| /* |
| * Deliver a bitmask of cpus representing the cpus on a given node |
| * (this can also be used to test whether a node has cpus). |
| * |
| * This would be better with some locking, but I don't want to make libnuma |
| * dependent on pthreads right now. The races are relatively harmless. |
| */ |
| int |
| numa_node_to_cpus_v2(int node, struct bitmask *buffer) |
| { |
| int err = 0; |
| int nnodes = numa_max_node(); |
| char fn[64], *line = NULL; |
| FILE *f; |
| size_t len = 0; |
| struct bitmask *mask; |
| |
| if (!node_cpu_mask_v2) |
| init_node_cpu_mask_v2(); |
| |
| if (node < 0 || node > nnodes) { |
| errno = ERANGE; |
| return -1; |
| } |
| numa_bitmask_clearall(buffer); |
| |
| if (node_cpu_mask_v2[node]) { |
| /* have already constructed a mask for this node */ |
| if (buffer->size < node_cpu_mask_v2[node]->size) { |
| errno = EINVAL; |
| numa_error("map size mismatch"); |
| return -1; |
| } |
| copy_bitmask_to_bitmask(node_cpu_mask_v2[node], buffer); |
| return 0; |
| } |
| |
| /* need a new mask for this node */ |
| mask = numa_allocate_cpumask(); |
| |
| /* this is a kernel cpumask_t (see node_read_cpumap()) */ |
| sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); |
| f = fopen(fn, "r"); |
| if (!f || getdelim(&line, &len, '\n', f) < 1) { |
| if (numa_bitmask_isbitset(numa_nodes_ptr, node)) { |
| numa_warn(W_nosysfs2, |
| "/sys not mounted or invalid. Assuming one node: %s", |
| strerror(errno)); |
| numa_warn(W_nosysfs2, |
| "(cannot open or correctly parse %s)", fn); |
| } |
| numa_bitmask_setall(mask); |
| err = -1; |
| } |
| if (f) |
| fclose(f); |
| |
| if (line && (numa_parse_bitmap_v2(line, mask) < 0)) { |
| numa_warn(W_cpumap, "Cannot parse cpumap. Assuming one node"); |
| numa_bitmask_setall(mask); |
| err = -1; |
| } |
| |
| free(line); |
| copy_bitmask_to_bitmask(mask, buffer); |
| |
| /* slightly racy, see above */ |
| /* save the mask we created */ |
| if (node_cpu_mask_v2[node]) { |
| /* how could this be? */ |
| if (mask != buffer) |
| numa_bitmask_free(mask); |
| } else { |
| /* we don't want to cache faulty result */ |
| if (!err) |
| node_cpu_mask_v2[node] = mask; |
| else |
| numa_bitmask_free(mask); |
| } |
| return err; |
| } |
| symver(numa_node_to_cpus_v2,numa_node_to_cpus); |
| |
| make_internal_alias(numa_node_to_cpus_v1); |
| make_internal_alias(numa_node_to_cpus_v2); |
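| |
| /* |
| * Usage sketch (illustrative only): list the cpus of node 0: |
| * |
| *     struct bitmask *cpus = numa_allocate_cpumask(); |
| *     unsigned int cpu; |
| *     if (numa_node_to_cpus(0, cpus) == 0) |
| *             for (cpu = 0; cpu < cpus->size; cpu++) |
| *                     if (numa_bitmask_isbitset(cpus, cpu)) |
| *                             printf("cpu %u\n", cpu); |
| *     numa_bitmask_free(cpus); |
| */ |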
| |
| /* report the node of the specified cpu */ |
| int numa_node_of_cpu(int cpu) |
| { |
| struct bitmask *bmp; |
| int ncpus, nnodes, node, ret; |
| |
| ncpus = numa_num_possible_cpus(); |
| if (cpu < 0 || cpu >= ncpus){ |
| errno = EINVAL; |
| return -1; |
| } |
| bmp = numa_bitmask_alloc(ncpus); |
| nnodes = numa_max_node(); |
| for (node = 0; node <= nnodes; node++){ |
| if (numa_node_to_cpus_v2_int(node, bmp) < 0) { |
| /* It's possible for the node to not exist */ |
| continue; |
| } |
| if (numa_bitmask_isbitset(bmp, cpu)){ |
| ret = node; |
| goto end; |
| } |
| } |
| ret = -1; |
| errno = EINVAL; |
| end: |
| numa_bitmask_free(bmp); |
| return ret; |
| } |
| |
| int |
| numa_run_on_node_mask_v1(const nodemask_t *mask) |
| { |
| int ncpus = numa_num_possible_cpus(); |
| int i, k, err; |
| unsigned long cpus[CPU_LONGS(ncpus)], nodecpus[CPU_LONGS(ncpus)]; |
| memset(cpus, 0, CPU_BYTES(ncpus)); |
| for (i = 0; i < NUMA_NUM_NODES; i++) { |
| if (mask->n[i / BITS_PER_LONG] == 0) |
| continue; |
| if (nodemask_isset_compat(mask, i)) { |
| if (numa_node_to_cpus_v1_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) { |
| numa_warn(W_noderunmask, |
| "Cannot read node cpumask from sysfs"); |
| continue; |
| } |
| for (k = 0; k < CPU_LONGS(ncpus); k++) |
| cpus[k] |= nodecpus[k]; |
| } |
| } |
| err = numa_sched_setaffinity_v1(0, CPU_BYTES(ncpus), cpus); |
| |
| /* The sched_setaffinity API is broken because it expects |
| the user to guess the kernel cpuset size. Do this in a |
| brute force way. */ |
| if (err < 0 && errno == EINVAL) { |
| int savederrno = errno; |
| char *bigbuf; |
| static int size = -1; |
| if (size == -1) |
| size = CPU_BYTES(ncpus) * 2; |
| bigbuf = malloc(CPU_BUFFER_SIZE); |
| if (!bigbuf) { |
| errno = ENOMEM; |
| return -1; |
| } |
| errno = savederrno; |
| while (size <= CPU_BUFFER_SIZE) { |
| memcpy(bigbuf, cpus, CPU_BYTES(ncpus)); |
| memset(bigbuf + CPU_BYTES(ncpus), 0, |
| CPU_BUFFER_SIZE - CPU_BYTES(ncpus)); |
| err = numa_sched_setaffinity_v1_int(0, size, (unsigned long *)bigbuf); |
| if (err == 0 || errno != EINVAL) |
| break; |
| size *= 2; |
| } |
| savederrno = errno; |
| free(bigbuf); |
| errno = savederrno; |
| } |
| return err; |
| } |
| backward_symver(numa_run_on_node_mask_v1,numa_run_on_node_mask); |
| |
| /* |
| * Given a node mask (size of a kernel nodemask_t) (probably populated by |
| * a user argument list) set up a map of cpus (map "cpus") on those nodes. |
| * Then set affinity to those cpus. |
| */ |
| int |
| numa_run_on_node_mask_v2(struct bitmask *bmp) |
| { |
| int ncpus, i, k, err; |
| struct bitmask *cpus, *nodecpus; |
| |
| cpus = numa_allocate_cpumask(); |
| ncpus = cpus->size; |
| nodecpus = numa_allocate_cpumask(); |
| |
| for (i = 0; i < bmp->size; i++) { |
| if (bmp->maskp[i / BITS_PER_LONG] == 0) |
| continue; |
| if (numa_bitmask_isbitset(bmp, i)) { |
| /* |
| * numa_all_nodes_ptr is cpuset aware; use only |
| * these nodes |
| */ |
| if (!numa_bitmask_isbitset(numa_all_nodes_ptr, i)) { |
| numa_warn(W_noderunmask, |
| "node %d not allowed", i); |
| continue; |
| } |
| if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { |
| numa_warn(W_noderunmask, |
| "Cannot read node cpumask from sysfs"); |
| continue; |
| } |
| for (k = 0; k < CPU_LONGS(ncpus); k++) |
| cpus->maskp[k] |= nodecpus->maskp[k]; |
| } |
| } |
| err = numa_sched_setaffinity_v2_int(0, cpus); |
| |
| numa_bitmask_free(cpus); |
| numa_bitmask_free(nodecpus); |
| |
| /* used to have to consider that this could fail - it shouldn't now */ |
| if (err < 0) { |
| numa_error("numa_sched_setaffinity_v2_int() failed; abort\n"); |
| } |
| |
| return err; |
| } |
| symver(numa_run_on_node_mask_v2,numa_run_on_node_mask); |
| |
| make_internal_alias(numa_run_on_node_mask_v2); |
| |
| /* |
| * Given a node mask (size of a kernel nodemask_t) (probably populated by |
| * a user argument list) set up a map of cpus (map "cpus") on those nodes |
| * without any cpuset awareness. Then set affinity to those cpus. |
| */ |
| int |
| numa_run_on_node_mask_all(struct bitmask *bmp) |
| { |
| int ncpus, i, k, err; |
| struct bitmask *cpus, *nodecpus; |
| |
| cpus = numa_allocate_cpumask(); |
| ncpus = cpus->size; |
| nodecpus = numa_allocate_cpumask(); |
| |
| for (i = 0; i < bmp->size; i++) { |
| if (bmp->maskp[i / BITS_PER_LONG] == 0) |
| continue; |
| if (numa_bitmask_isbitset(bmp, i)) { |
| if (!numa_bitmask_isbitset(numa_possible_nodes_ptr, i)) { |
| numa_warn(W_noderunmask, |
| "node %d not allowed", i); |
| continue; |
| } |
| if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { |
| numa_warn(W_noderunmask, |
| "Cannot read node cpumask from sysfs"); |
| continue; |
| } |
| for (k = 0; k < CPU_LONGS(ncpus); k++) |
| cpus->maskp[k] |= nodecpus->maskp[k]; |
| } |
| } |
| err = numa_sched_setaffinity_v2_int(0, cpus); |
| |
| numa_bitmask_free(cpus); |
| numa_bitmask_free(nodecpus); |
| |
| /* since all possible nodes are allowed here, this can easily fail */ |
| if (err < 0) { |
| numa_error("numa_sched_setaffinity_v2_int() failed"); |
| } |
| |
| return err; |
| } |
| |
| nodemask_t |
| numa_get_run_node_mask_v1(void) |
| { |
| int ncpus = numa_num_configured_cpus(); |
| int i, k; |
| int max = numa_max_node_int(); |
| struct bitmask *bmp, *cpus, *nodecpus; |
| nodemask_t nmp; |
| |
| cpus = numa_allocate_cpumask(); |
| if (numa_sched_getaffinity_v2_int(0, cpus) < 0){ |
| nmp = numa_no_nodes; |
| goto free_cpus; |
| } |
| |
| nodecpus = numa_allocate_cpumask(); |
| bmp = allocate_nodemask_v1(); /* the size of a nodemask_t */ |
| for (i = 0; i <= max; i++) { |
| if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { |
| /* It's possible for the node to not exist */ |
| continue; |
| } |
| for (k = 0; k < CPU_LONGS(ncpus); k++) { |
| if (nodecpus->maskp[k] & cpus->maskp[k]) |
| numa_bitmask_setbit(bmp, i); |
| } |
| } |
| copy_bitmask_to_nodemask(bmp, &nmp); |
| numa_bitmask_free(bmp); |
| numa_bitmask_free(nodecpus); |
| free_cpus: |
| numa_bitmask_free(cpus); |
| return nmp; |
| } |
| backward_symver(numa_get_run_node_mask_v1,numa_get_run_node_mask); |
| |
| struct bitmask * |
| numa_get_run_node_mask_v2(void) |
| { |
| int i, k; |
| int ncpus = numa_num_configured_cpus(); |
| int max = numa_max_node_int(); |
| struct bitmask *bmp, *cpus, *nodecpus; |
| |
| bmp = numa_allocate_nodemask(); /* this is a node mask */ |
| cpus = numa_allocate_cpumask(); |
| if (numa_sched_getaffinity_v2_int(0, cpus) < 0){ |
| copy_bitmask_to_bitmask(numa_no_nodes_ptr, bmp); |
| goto free_cpus; |
| } |
| |
| nodecpus = numa_allocate_cpumask(); |
| for (i = 0; i <= max; i++) { |
| /* |
| * numa_all_nodes_ptr is cpuset aware; show only |
| * these nodes |
| */ |
| if (!numa_bitmask_isbitset(numa_all_nodes_ptr, i)) { |
| continue; |
| } |
| if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { |
| /* It's possible for the node to not exist */ |
| continue; |
| } |
| for (k = 0; k < CPU_LONGS(ncpus); k++) { |
| if (nodecpus->maskp[k] & cpus->maskp[k]) |
| numa_bitmask_setbit(bmp, i); |
| } |
| } |
| numa_bitmask_free(nodecpus); |
| free_cpus: |
| numa_bitmask_free(cpus); |
| return bmp; |
| } |
| symver(numa_get_run_node_mask_v2,numa_get_run_node_mask); |
| |
| int |
| numa_migrate_pages(int pid, struct bitmask *fromnodes, struct bitmask *tonodes) |
| { |
| int numa_num_nodes = numa_num_possible_nodes(); |
| |
| return migrate_pages(pid, numa_num_nodes + 1, fromnodes->maskp, |
| tonodes->maskp); |
| } |
| |
| int numa_move_pages(int pid, unsigned long count, |
| void **pages, const int *nodes, int *status, int flags) |
| { |
| return move_pages(pid, count, pages, nodes, status, flags); |
| } |
| |
| int numa_run_on_node(int node) |
| { |
| int numa_num_nodes = numa_num_possible_nodes(); |
| int ret = -1; |
| struct bitmask *cpus; |
| |
| if (node < -1 || node >= numa_num_nodes){ |
| errno = EINVAL; |
| goto out; |
| } |
| |
| cpus = numa_allocate_cpumask(); |
| |
| if (node == -1) |
| numa_bitmask_setall(cpus); |
| else if (numa_node_to_cpus_v2_int(node, cpus) < 0){ |
| numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs"); |
| goto free; |
| } |
| |
| ret = numa_sched_setaffinity_v2_int(0, cpus); |
| free: |
| numa_bitmask_free(cpus); |
| out: |
| return ret; |
| } |
| |
| int numa_preferred(void) |
| { |
| int policy; |
| int ret; |
| struct bitmask *bmp; |
| |
| bmp = numa_allocate_nodemask(); |
| getpol(&policy, bmp); |
| if (policy == MPOL_PREFERRED || policy == MPOL_BIND) { |
| int i; |
| int max = numa_num_possible_nodes(); |
| for (i = 0; i < max ; i++) |
| if (numa_bitmask_isbitset(bmp, i)){ |
| ret = i; |
| goto end; |
| } |
| } |
| /* could read the current CPU from /proc/self/status. Probably |
| not worth it. */ |
| ret = 0; /* or random one? */ |
| end: |
| numa_bitmask_free(bmp); |
| return ret; |
| } |
| |
| void numa_set_preferred(int node) |
| { |
| struct bitmask *bmp; |
| |
| bmp = numa_allocate_nodemask(); |
| if (node >= 0) { |
| numa_bitmask_setbit(bmp, node); |
| setpol(MPOL_PREFERRED, bmp); |
| } else |
| setpol(MPOL_DEFAULT, bmp); |
| numa_bitmask_free(bmp); |
| } |
| |
| void numa_set_localalloc(void) |
| { |
| setpol(MPOL_DEFAULT, numa_no_nodes_ptr); |
| } |
| |
| void numa_bind_v1(const nodemask_t *nodemask) |
| { |
| struct bitmask bitmask; |
| |
| bitmask.maskp = (unsigned long *)nodemask; |
| bitmask.size = sizeof(nodemask_t) * 8; |
| numa_run_on_node_mask_v2_int(&bitmask); |
| numa_set_membind_v2_int(&bitmask); |
| } |
| backward_symver(numa_bind_v1,numa_bind); |
| |
| void numa_bind_v2(struct bitmask *bmp) |
| { |
| numa_run_on_node_mask_v2_int(bmp); |
| numa_set_membind_v2_int(bmp); |
| } |
| symver(numa_bind_v2,numa_bind); |
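| |
| /* |
| * Usage sketch (illustrative only): restrict both execution and memory |
| * allocation of the current task to nodes 0 and 1: |
| * |
| *     struct bitmask *bmp = numa_parse_nodestring("0,1"); |
| *     if (bmp) { |
| *             numa_bind(bmp); |
| *             numa_bitmask_free(bmp); |
| *     } |
| */ |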
| |
| void numa_set_strict(int flag) |
| { |
| if (flag) |
| mbind_flags |= MPOL_MF_STRICT; |
| else |
| mbind_flags &= ~MPOL_MF_STRICT; |
| } |
| |
| /* |
| * Extract a node or processor number from the given string. |
| * Allow a relative node / processor specification within the allowed |
| * set if "relative" is nonzero |
| */ |
| static unsigned long get_nr(const char *s, char **end, struct bitmask *bmp, int relative) |
| { |
| long i, nr; |
| |
| if (!relative) |
| return strtoul(s, end, 0); |
| |
| nr = strtoul(s, end, 0); |
| if (s == *end) |
| return nr; |
| /* Find the nth set bit */ |
| for (i = 0; nr >= 0 && i <= bmp->size; i++) |
| if (numa_bitmask_isbitset(bmp, i)) |
| nr--; |
| return i-1; |
| } |
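| |
| /* |
| * Example: with allowed nodes {0,2,4}, the relative specification "+1" |
| * resolves to the second set bit (0-based index 1), i.e. node 2. |
| */ |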
| |
| /* |
| * __numa_parse_nodestring() is called to create a node mask, given |
| * an ascii string such as 25 or 12-15 or 1,3,5-7 or +6-10. |
| * (the + indicates that the numbers are nodeset-relative) |
| * |
| * The nodes may be specified as absolute, or relative to the current nodeset. |
| * The list of available nodes is in a map pointed to by "allowed_nodes_ptr", |
| * which may represent all nodes or the nodes in the current nodeset. |
| * |
| * The caller must free the returned bitmask. |
| */ |
| static struct bitmask * |
| __numa_parse_nodestring(const char *s, struct bitmask *allowed_nodes_ptr) |
| { |
| int invert = 0, relative = 0; |
| int conf_nodes = numa_num_configured_nodes(); |
| char *end; |
| struct bitmask *mask; |
| |
| mask = numa_allocate_nodemask(); |
| |
| if (s[0] == 0){ |
| copy_bitmask_to_bitmask(numa_no_nodes_ptr, mask); |
| return mask; /* return freeable mask */ |
| } |
| if (*s == '!') { |
| invert = 1; |
| s++; |
| } |
| if (*s == '+') { |
| relative++; |
| s++; |
| } |
| do { |
| unsigned long arg; |
| int i; |
| if (isalpha(*s)) { |
| int n; |
| if (!strcmp(s,"all")) { |
| copy_bitmask_to_bitmask(allowed_nodes_ptr, |
| mask); |
| s+=4; |
| break; |
| } |
| n = resolve_affinity(s, mask); |
| if (n != NO_IO_AFFINITY) { |
| if (n < 0) |
| goto err; |
| s += strlen(s) + 1; |
| break; |
| } |
| } |
| arg = get_nr(s, &end, allowed_nodes_ptr, relative); |
| if (end == s) { |
| numa_warn(W_nodeparse, "unparseable node description `%s'\n", s); |
| goto err; |
| } |
| if (!numa_bitmask_isbitset(allowed_nodes_ptr, arg)) { |
| numa_warn(W_nodeparse, "node argument %d is out of range\n", arg); |
| goto err; |
| } |
| i = arg; |
| numa_bitmask_setbit(mask, i); |
| s = end; |
| if (*s == '-') { |
| char *end2; |
| unsigned long arg2; |
| arg2 = get_nr(++s, &end2, allowed_nodes_ptr, relative); |
| if (end2 == s) { |
| numa_warn(W_nodeparse, "missing node argument %s\n", s); |
| goto err; |
| } |
| if (!numa_bitmask_isbitset(allowed_nodes_ptr, arg2)) { |
| numa_warn(W_nodeparse, "node argument %d out of range\n", arg2); |
| goto err; |
| } |
| while (arg <= arg2) { |
| i = arg; |
| if (numa_bitmask_isbitset(allowed_nodes_ptr,i)) |
| numa_bitmask_setbit(mask, i); |
| arg++; |
| } |
| s = end2; |
| } |
| } while (*s++ == ','); |
| if (s[-1] != '\0') |
| goto err; |
| if (invert) { |
| int i; |
| for (i = 0; i < conf_nodes; i++) { |
| if (numa_bitmask_isbitset(mask, i)) |
| numa_bitmask_clearbit(mask, i); |
| else |
| numa_bitmask_setbit(mask, i); |
| } |
| } |
| return mask; |
| |
| err: |
| numa_bitmask_free(mask); |
| return NULL; |
| } |
| |
| /* |
| * numa_parse_nodestring() is called to create a bitmask from nodes available |
| * for this task. |
| */ |
| |
| struct bitmask * numa_parse_nodestring(const char *s) |
| { |
| return __numa_parse_nodestring(s, numa_all_nodes_ptr); |
| } |
| |
| /* |
| * numa_parse_nodestring_all() is called to create a bitmask from all nodes |
| * available. |
| */ |
| |
| struct bitmask * numa_parse_nodestring_all(const char *s) |
| { |
| return __numa_parse_nodestring(s, numa_possible_nodes_ptr); |
| } |
| |
| /* |
| * __numa_parse_cpustring() is called to create a bitmask, given |
| * an ascii string such as 25 or 12-15 or 1,3,5-7 or +6-10. |
| * (the + indicates that the numbers are cpuset-relative) |
| * |
| * The cpus may be specified as absolute, or relative to the current cpuset. |
| * The list of available cpus for this task is in the map pointed to by |
| * "allowed_cpus_ptr", which may represent all cpus or the cpus in the |
| * current cpuset. |
| * |
| * The caller must free the returned bitmask. |
| */ |
| static struct bitmask * |
| __numa_parse_cpustring(const char *s, struct bitmask *allowed_cpus_ptr) |
| { |
| int invert = 0, relative=0; |
| int conf_cpus = numa_num_configured_cpus(); |
| char *end; |
| struct bitmask *mask; |
| |
| mask = numa_allocate_cpumask(); |
| |
| if (s[0] == 0) |
| return mask; |
| if (*s == '!') { |
| invert = 1; |
| s++; |
| } |
| if (*s == '+') { |
| relative++; |
| s++; |
| } |
| do { |
| unsigned long arg; |
| int i; |
| |
| if (!strcmp(s,"all")) { |
| copy_bitmask_to_bitmask(allowed_cpus_ptr, mask); |
| s+=4; |
| break; |
| } |
| arg = get_nr(s, &end, allowed_cpus_ptr, relative); |
| if (end == s) { |
| numa_warn(W_cpuparse, "unparseable cpu description `%s'\n", s); |
| goto err; |
| } |
| if (!numa_bitmask_isbitset(allowed_cpus_ptr, arg)) { |
| numa_warn(W_cpuparse, "cpu argument %s is out of range\n", s); |
| goto err; |
| } |
| i = arg; |
| numa_bitmask_setbit(mask, i); |
| s = end; |
| if (*s == '-') { |
| char *end2; |
| unsigned long arg2; |
| int i; |
| arg2 = get_nr(++s, &end2, allowed_cpus_ptr, relative); |
| if (end2 == s) { |
| numa_warn(W_cpuparse, "missing cpu argument %s\n", s); |
| goto err; |
| } |
| if (!numa_bitmask_isbitset(allowed_cpus_ptr, arg2)) { |
| numa_warn(W_cpuparse, "cpu argument %s out of range\n", s); |
| goto err; |
| } |
| while (arg <= arg2) { |
| i = arg; |
| if (numa_bitmask_isbitset(allowed_cpus_ptr, i)) |
| numa_bitmask_setbit(mask, i); |
| arg++; |
| } |
| s = end2; |
| } |
| } while (*s++ == ','); |
| if (s[-1] != '\0') |
| goto err; |
| if (invert) { |
| int i; |
| for (i = 0; i < conf_cpus; i++) { |
| if (numa_bitmask_isbitset(mask, i)) |
| numa_bitmask_clearbit(mask, i); |
| else |
| numa_bitmask_setbit(mask, i); |
| } |
| } |
| return mask; |
| |
| err: |
| numa_bitmask_free(mask); |
| return NULL; |
| } |
| |
| /* |
| * numa_parse_cpustring() is called to create a bitmask from cpus available |
| * for this task. |
| */ |
| |
| struct bitmask * numa_parse_cpustring(const char *s) |
| { |
| return __numa_parse_cpustring(s, numa_all_cpus_ptr); |
| } |
| |
| /* |
| * numa_parse_cpustring_all() is called to create a bitmask from all cpus |
| * available. |
| */ |
| |
| struct bitmask * numa_parse_cpustring_all(const char *s) |
| { |
| return __numa_parse_cpustring(s, numa_possible_cpus_ptr); |
| } |