| .\" Copyright 2003,2004 Andi Kleen, SuSE Labs. |
| .\" |
| .\" Permission is granted to make and distribute verbatim copies of this |
| .\" manual provided the copyright notice and this permission notice are |
| .\" preserved on all copies. |
| .\" |
| .\" Permission is granted to copy and distribute modified versions of this |
| .\" manual under the conditions for verbatim copying, provided that the |
| .\" entire resulting derived work is distributed under the terms of a |
| .\" permission notice identical to this one. |
| .\" |
| .\" Since the Linux kernel and libraries are constantly changing, this |
| .\" manual page may be incorrect or out-of-date. The author(s) assume no |
| .\" responsibility for errors or omissions, or for damages resulting from |
| .\" the use of the information contained herein. |
| .\" |
| .\" Formatted or processed versions of this manual, if unaccompanied by |
| .\" the source, must acknowledge the copyright and authors of this work. |
| .TH NUMA 3 "December 2007" "SuSE Labs" "Linux Programmer's Manual" |
| .SH NAME |
| numa \- NUMA policy library |
| .SH SYNOPSIS |
| .B #include <numa.h> |
| .sp |
| .B cc ... \-lnuma |
| .sp |
| .B int numa_available(void); |
| .sp |
| .BI "int numa_max_possible_node(void);" |
| .br |
| .BI "int numa_num_possible_nodes();" |
| .sp |
| .B int numa_max_node(void); |
| .br |
| .BI "int numa_num_configured_nodes();" |
| .br |
| .B struct bitmask *numa_get_mems_allowed(void); |
| .sp |
| .BI "int numa_num_configured_cpus(void);" |
| .br |
| .BI "struct bitmask *numa_all_nodes_ptr;" |
| .br |
| .BI "struct bitmask *numa_no_nodes_ptr;" |
| .br |
| .BI "struct bitmask *numa_all_cpus_ptr;" |
| .sp |
| .BI "int numa_num_task_cpus();" |
| .br |
| .BI "int numa_num_task_nodes();" |
| .sp |
| .BI "int numa_parse_bitmap(char *" line ", struct bitmask *" mask ); |
| .br |
| .BI "struct bitmask *numa_parse_nodestring(const char *" string ); |
| .br |
| .BI "struct bitmask *numa_parse_nodestring_all(const char *" string ); |
| .br |
| .BI "struct bitmask *numa_parse_cpustring(const char *" string ); |
| .br |
| .BI "struct bitmask *numa_parse_cpustring_all(const char *" string ); |
| .sp |
| .BI "long numa_node_size(int " node ", long *" freep ); |
| .br |
| .BI "long long numa_node_size64(int " node ", long long *" freep ); |
| .sp |
| .B int numa_preferred(void); |
| .br |
| .BI "void numa_set_preferred(int " node ); |
| .br |
| .B int numa_get_interleave_node(void); |
| .br |
| .B struct bitmask *numa_get_interleave_mask(void); |
| .br |
| .BI "void numa_set_interleave_mask(struct bitmask *" nodemask ); |
| .br |
| .BI "void numa_interleave_memory(void *" start ", size_t " size ", struct bitmask *" nodemask ); |
| .br |
| .BI "void numa_bind(struct bitmask *" nodemask ); |
| .br |
| .B void numa_set_localalloc(void); |
| .br |
| .BI "void numa_set_membind(struct bitmask *" nodemask ); |
| .br |
| .B struct bitmask *numa_get_membind(void); |
| .sp |
| .BI "void *numa_alloc_onnode(size_t " size ", int " node ); |
| .br |
| .BI "void *numa_alloc_local(size_t " size ); |
| .br |
| .BI "void *numa_alloc_interleaved(size_t " size ); |
| .br |
| .BI "void *numa_alloc_interleaved_subset(size_t " size ", struct bitmask *" nodemask ); |
| .br |
| .BI "void *numa_alloc(size_t " size ); |
| .br |
| .BI "void *numa_realloc(void *" old_addr ", size_t " old_size ", size_t " new_size ); |
| .br |
| .BI "void numa_free(void *" start ", size_t " size ); |
| .sp |
| .BI "int numa_run_on_node(int " node ); |
| .br |
| .BI "int numa_run_on_node_mask(struct bitmask *" nodemask ); |
| .br |
| .BI "int numa_run_on_node_mask_all(struct bitmask *" nodemask ); |
| .br |
| .B struct bitmask *numa_get_run_node_mask(void); |
| .sp |
| .BI "void numa_tonode_memory(void *" start ", size_t " size ", int " node ); |
| .br |
| .BI "void numa_tonodemask_memory(void *" start ", size_t " size ", struct bitmask *" nodemask ); |
| .br |
| .BI "void numa_setlocal_memory(void *" start ", size_t " size ); |
| .br |
| .BI "void numa_police_memory(void *" start ", size_t " size ); |
| .br |
| .BI "void numa_set_bind_policy(int " strict ); |
| .br |
| .BI "void numa_set_strict(int " strict ); |
| .sp |
| .\" should be undocumented ?? |
| .BI "int numa_distance(int " node1 ", int " node2 ); |
| .sp |
| .BI "int numa_sched_getaffinity(pid_t " pid ", struct bitmask *" mask ); |
| .br |
| .BI "int numa_sched_setaffinity(pid_t " pid ", struct bitmask *" mask ); |
| .br |
| .BI "int numa_node_to_cpus(int " node ", struct bitmask *" mask ); |
| .br |
| .BI "int numa_node_of_cpu(int " cpu ); |
| .sp |
| .BI "struct bitmask *numa_allocate_cpumask();" |
| .sp |
| .BI "void numa_free_cpumask();" |
| .br |
| .BI "struct bitmask *numa_allocate_nodemask();" |
| .sp |
| .BI "void numa_free_nodemask();" |
| .br |
| .BI "struct bitmask *numa_bitmask_alloc(unsigned int " n ); |
| .br |
| .BI "struct bitmask *numa_bitmask_clearall(struct bitmask *" bmp ); |
| .br |
| .BI "struct bitmask *numa_bitmask_clearbit(struct bitmask *" bmp ", unsigned int " n ); |
| .br |
| .BI "int numa_bitmask_equal(const struct bitmask *" bmp1 ", const struct bitmask *" bmp2 ); |
| .br |
| .BI "void numa_bitmask_free(struct bitmask *" bmp ); |
| .br |
| .BI "int numa_bitmask_isbitset(const struct bitmask *" bmp ", unsigned int " n ");" |
| .br |
| .BI "unsigned int numa_bitmask_nbytes(struct bitmask *" bmp ); |
| .br |
| .BI "struct bitmask *numa_bitmask_setall(struct bitmask *" bmp ); |
| .br |
| .BI "struct bitmask *numa_bitmask_setbit(struct bitmask *" bmp ", unsigned int " n ); |
| .br |
| .BI "void copy_bitmask_to_nodemask(struct bitmask *" bmp ", nodemask_t *" nodemask ); |
| .br |
| .BI "void copy_nodemask_to_bitmask(nodemask_t *" nodemask ", struct bitmask *" bmp ); |
| .br |
| .BI "void copy_bitmask_to_bitmask(struct bitmask *" bmpfrom ", struct bitmask *" bmpto ); |
| .br |
| .BI "unsigned int numa_bitmask_weight(const struct bitmask *" bmp ); |
| .sp |
| .BI "int numa_move_pages(int " pid ", unsigned long " count ", void **" pages ", const int *" nodes ", int *" status ", int " flags ); |
| .br |
| .BI "int numa_migrate_pages(int " pid ", struct bitmask *" fromnodes ", struct bitmask *" tonodes ); |
| .sp |
| .BI "void numa_error(char *" where ); |
| .sp |
| .BI "extern int " numa_exit_on_error ; |
| .br |
| .BI "extern int " numa_exit_on_warn ; |
| .br |
| .BI "void numa_warn(int " number ", char *" where ", ...);" |
| .br |
| |
| .SH DESCRIPTION |
| The |
| .I libnuma |
| library offers a simple programming interface to the |
| NUMA (Non Uniform Memory Access) |
| policy supported by the |
| Linux kernel. On a NUMA architecture some |
| memory areas have different latency or bandwidth than others. |
| |
| Available policies are |
| page interleaving (i.e., allocate in a round-robin fashion from all, |
| or a subset, of the nodes on the system), |
| preferred node allocation (i.e., preferably allocate on a particular node), |
| local allocation (i.e., allocate on the node on which |
| the task is currently executing), |
| or allocation only on specific nodes (i.e., allocate on |
| some subset of the available nodes). |
| It is also possible to bind tasks to specific nodes. |
| |
| Numa memory allocation policy may be specified as a per-task attribute, |
| that is inherited by children tasks and processes, or as an attribute |
| of a range of process virtual address space. |
| Numa memory policies specified for a range of virtual address space are |
| shared by all tasks in the process. |
| Furthermore, memory policies specified for a range of a shared memory |
| attached using |
| .I shmat(2) |
| or |
| .I mmap(2) |
| from shmfs/hugetlbfs are shared by all processes that attach to that region. |
| Memory policies for shared disk backed file mappings are currently ignored. |
| |
| The default memory allocation policy for tasks and all memory ranges |
| is local allocation. |
| This assumes that no ancestor has installed a non-default policy. |
| |
| For setting a specific policy globally for all memory allocations |
| in a process and its children it is easiest |
| to start it with the |
| .BR numactl (8) |
| utility. For more fine-grained policy inside an application this library |
| can be used. |
| |
| All numa memory allocation policies only take effect when a page is actually |
| faulted into the address space of a process by accessing it. The |
| .B numa_alloc_* |
| functions take care of this automatically. |
| |
| A |
| .I node |
| is defined as an area where all memory has the same speed as seen from |
| a particular CPU. |
| A node can contain multiple CPUs. |
| Caches are ignored for this definition. |
| |
| Most functions in this library are only concerned about numa nodes and |
| their memory. |
| The exceptions to this are: |
| .IR numa_node_to_cpus (), |
| .IR numa_node_of_cpu (), |
| .IR numa_bind (), |
| .IR numa_run_on_node (), |
| .IR numa_run_on_node_mask (), |
| .IR numa_run_on_node_mask_all (), |
| and |
| .IR numa_get_run_node_mask (). |
| These functions deal with the CPUs associated with numa nodes. |
| See the descriptions below for more information. |
| |
| Some of these functions accept or return a pointer to struct bitmask. |
| A struct bitmask controls a bit map of arbitrary length containing a bit |
| representation of nodes. The predefined variable |
| .I numa_all_nodes_ptr |
| points to a bit mask that has all available nodes set; |
| .I numa_no_nodes_ptr |
| points to the empty set. |
| |
| Before any other calls in this library can be used |
| .BR numa_available () |
| must be called. If it returns \-1, all other functions in this |
| library are undefined. |
| |
| .BR numa_max_possible_node() |
| returns the number of the highest possible node in a system. |
| In other words, the size of a kernel type nodemask_t (in bits) minus 1. |
| This number can be gotten by calling |
| .BR numa_num_possible_nodes() |
| and subtracting 1. |
| |
| .BR numa_num_possible_nodes() |
| returns the size of kernel's node mask (kernel type nodemask_t). |
| In other words, large enough to represent the maximum number of nodes that |
| the kernel can handle. This will match the kernel's MAX_NUMNODES value. |
| This count is derived from /proc/self/status, field Mems_allowed. |
| |
| .BR numa_max_node () |
| returns the highest node number available on the current system. |
| (See the node numbers in /sys/devices/system/node/ ). Also see |
| .BR numa_num_configured_nodes(). |
| |
| .BR numa_num_configured_nodes() |
| returns the number of memory nodes in the system. This count |
| includes any nodes that are currently disabled. This count is derived from |
| the node numbers in /sys/devices/system/node. (Depends on the kernel being |
| configured with /sys (CONFIG_SYSFS)). |
| |
| .BR numa_get_mems_allowed() |
| returns the mask of nodes from which the process is allowed to allocate |
| memory in its current cpuset context. |
| Any nodes that are not included in the returned bitmask will be ignored |
| in any of the following libnuma memory policy calls. |
| |
| .BR numa_num_configured_cpus() |
| returns the number of cpus in the system. This count includes |
| any cpus that are currently disabled. This count is derived from the cpu |
| numbers in /sys/devices/system/cpu. If the kernel is configured without |
| /sys (CONFIG_SYSFS=n) then it falls back to using the number of online cpus. |
| |
| .BR numa_all_nodes_ptr |
| points to a bitmask that is allocated by the library with bits |
| representing all nodes on which the calling task may allocate memory. |
| This set may be up to all nodes on the system, or up to the nodes in |
| the current cpuset. |
| The bitmask is allocated by a call to |
| .BR numa_allocate_nodemask() |
| using size |
| .BR numa_max_possible_node(). |
| The set of nodes to record is derived from /proc/self/status, field |
| "Mems_allowed". The user should not alter this bitmask. |
| |
| .BR numa_no_nodes_ptr |
| points to a bitmask that is allocated by the library and left all |
| zeroes. The bitmask is allocated by a call to |
| .BR numa_allocate_nodemask() |
| using size |
| .BR numa_max_possible_node(). |
| The user should not alter this bitmask. |
| |
| .BR numa_all_cpus_ptr |
| points to a bitmask that is allocated by the library with bits |
| representing all cpus on which the calling task may execute. |
| This set may be up to all cpus on the system, or up to the cpus in |
| the current cpuset. |
| The bitmask is allocated by a call to |
| .BR numa_allocate_cpumask() |
| using size |
| .BR numa_num_possible_cpus(). |
| The set of cpus to record is derived from /proc/self/status, field |
| "Cpus_allowed". The user should not alter this bitmask. |
| |
| .BR numa_num_task_cpus() |
| returns the number of cpus that the calling task is allowed |
| to use. This count is derived from the map /proc/self/status, field |
| "Cpus_allowed". Also see the bitmask |
| .BR numa_all_cpus_ptr. |
| |
| .BR numa_num_task_nodes() |
| returns the number of nodes on which the calling task is |
| allowed to allocate memory. This count is derived from the map |
| /proc/self/status, field "Mems_allowed". |
| Also see the bitmask |
| .BR numa_all_nodes_ptr. |
| |
| .BR numa_parse_bitmap() |
| parses |
| .IR line , |
| which is a character string such as found in |
| /sys/devices/system/node/nodeN/cpumap into a bitmask structure. |
| The string contains the hexadecimal representation of a bit map. |
| The bitmask may be allocated with |
| .BR numa_allocate_cpumask(). |
| Returns 0 on success. Returns -1 on failure. |
| This function is probably of little use to a user application, but |
| it is used by |
| .I libnuma |
| internally. |
| |
| .BR numa_parse_nodestring() |
| parses a character string list of nodes into a bit mask. |
| The bit mask is allocated by |
| .BR numa_allocate_nodemask(). |
| The string is a comma-separated list of node numbers or node ranges. |
| A leading ! can be used to indicate "not" this list (in other words, all |
| nodes except this list), and a leading + can be used to indicate that the |
| node numbers in the list are relative to the task's cpuset. The string can |
| be "all" to specify all ( |
| .BR numa_num_task_nodes() |
| ) nodes. Node numbers are limited by the number in the system. See |
| .BR numa_max_node() |
| and |
| .BR numa_num_configured_nodes(). |
| .br |
| Examples: 1-5,7,10 !4-5 +0-3 |
| .br |
| If the string is of 0 length, bitmask |
| .BR numa_no_nodes_ptr |
| is returned. Returns 0 if the string is invalid. |
| |
| .BR numa_parse_nodestring_all() |
| is similar to |
| .BR numa_parse_nodestring (), |
| but can parse all possible nodes, not only the current nodeset. |
| |
| .BR numa_parse_cpustring() |
| parses a character string list of cpus into a bit mask. |
| The bit mask is allocated by |
| .BR numa_allocate_cpumask(). |
| The string is a comma-separated list of cpu numbers or cpu ranges. |
| A leading ! can be used to indicate "not" this list (in other words, all |
| cpus except this list), and a leading + can be used to indicate that the cpu |
| numbers in the list are relative to the task's cpuset. The string can be |
| "all" to specify all ( |
| .BR numa_num_task_cpus() |
| ) cpus. |
| Cpu numbers are limited by the number in the system. See |
| .BR numa_num_task_cpus() |
| and |
| .BR numa_num_configured_cpus(). |
| .br |
| Examples: 1-5,7,10 !4-5 +0-3 |
| .br |
| Returns 0 if the string is invalid. |
| |
| .BR numa_parse_cpustring_all() |
| is similar to |
| .BR numa_parse_cpustring (), |
| but can parse all possible cpus, not only the current cpuset. |
| |
| .BR numa_node_size () |
| returns the memory size of a node. If the argument |
| .I freep |
| is not NULL, it is used to return the amount of free memory on the node. |
| On error it returns \-1. |
| |
| .BR numa_node_size64 () |
| works the same as |
| .BR numa_node_size () |
| except that it returns values as |
| .I long long |
| instead of |
| .IR long . |
| This is useful on 32-bit architectures with large nodes. |
| |
| .BR numa_preferred () |
| returns the preferred node of the current task. |
| This is the node on which the kernel preferably |
| allocates memory, unless some other policy overrides this. |
| .\" TODO: results are misleading for MPOL_PREFERRED and may |
| .\" be incorrect for MPOL_BIND when Mel Gorman's twozonelist |
| .\" patches go in. In the latter case, we'd need to know the |
| .\" order of the current node's zonelist to return the correct |
| .\" node. Need to tighten this up with the syscall results. |
| |
| .BR numa_set_preferred () |
| sets the preferred node for the current task to |
| .IR node . |
| The system will attempt to allocate memory from the preferred node, |
| but will fall back to other nodes if no memory is available on |
| the preferred node. |
| Passing a |
| .I node |
| of \-1 specifies local allocation and is equivalent to |
| calling |
| .BR numa_set_localalloc (). |
| |
| .BR numa_get_interleave_mask () |
| returns the current interleave mask if the task's memory allocation policy |
| is page interleaved. |
| Otherwise, this function returns an empty mask. |
| |
| .BR numa_set_interleave_mask () |
| sets the memory interleave mask for the current task to |
| .IR nodemask . |
| All new memory allocations |
| are page interleaved over all nodes in the interleave mask. Interleaving |
| can be turned off again by passing an empty mask |
| .RI ( numa_no_nodes_ptr ). |
| The page interleaving only occurs on the actual page fault that puts a new |
| page into the current address space. It is also only a hint: the kernel |
| will fall back to other nodes if no memory is available on the interleave |
| target. |
| .\" NOTE: the following is not really the case. this function sets the |
| .\" task policy for all future allocations, including stack, bss, ... |
| .\" The functions specified in this sentence actually allocate a new memory |
| .\" range [via mmap()]. This is quite a different thing. Suggest we drop |
| .\" this. |
| .\" This is a low level |
| .\" function, it may be more convenient to use the higher level functions like |
| .\" .BR numa_alloc_interleaved () |
| .\" or |
| .\" .BR numa_alloc_interleaved_subset (). |
| |
| .BR numa_interleave_memory () |
| interleaves |
| .I size |
| bytes of memory page by page from |
| .I start |
| on nodes specified in |
| .IR nodemask . |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| If |
| .I nodemask |
| contains nodes that are externally denied to this process, |
| this call will fail. |
| This is a lower level function to interleave allocated but not yet faulted in |
| memory. Not yet faulted in means the memory is allocated using |
| .BR mmap (2) |
| or |
| .BR shmat (2), |
| but has not been accessed by the current process yet. The memory is page |
| interleaved to all nodes specified in |
| .IR nodemask . |
| Normally |
| .BR numa_alloc_interleaved () |
| should be used for private memory instead, but this function is useful to |
| handle shared memory areas. To be useful the memory area should be |
| several megabytes at least (or tens of megabytes of hugetlbfs mappings). |
| If the |
| .BR numa_set_strict () |
| flag is true then the operation will cause a numa_error if there were already |
| pages in the mapping that do not follow the policy. |
| |
| .BR numa_bind () |
| binds the current task and its children to the nodes |
| specified in |
| .IR nodemask . |
| They will only run on the CPUs of the specified nodes and only be able to allocate |
| memory from them. |
| This function is equivalent to calling |
| .\" FIXME checkme |
| .\" This is the case. --lts |
| .I numa_run_on_node_mask(nodemask) |
| followed by |
| .IR numa_set_membind(nodemask) . |
| If tasks should be bound to individual CPUs inside nodes |
| consider using |
| .I numa_node_to_cpus |
| and the |
| .I sched_setaffinity(2) |
| syscall. |
| |
| .BR numa_set_localalloc () |
| sets the memory allocation policy for the calling task to |
| local allocation. |
| In this mode, the preferred node for memory allocation is |
| effectively the node where the task is executing at the |
| time of a page allocation. |
| |
| .BR numa_set_membind () |
| sets the memory allocation mask. |
| The task will only allocate memory from the nodes set in |
| .IR nodemask . |
| Passing an empty |
| .I nodemask |
| or a |
| .I nodemask |
| that contains nodes other than those in the mask returned by |
| .IR numa_get_mems_allowed () |
| will result in an error. |
| |
| .BR numa_get_membind () |
| returns the mask of nodes from which memory can currently be allocated. |
| If the returned mask is equal to |
| .IR numa_all_nodes_ptr , |
| then memory allocation is allowed from all nodes. |
| |
| .BR numa_alloc_onnode () |
| allocates memory on a specific node. |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| If the specified |
| .I node |
| is externally denied to this process, this call will fail. |
| This function is relatively slow compared to the |
| .IR malloc (3) |
| family of functions. |
| The memory must be freed |
| with |
| .BR numa_free (). |
| On errors NULL is returned. |
| |
| .BR numa_alloc_local () |
| allocates |
| .I size |
| bytes of memory on the local node. |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| This function is relatively slow compared to the |
| .IR malloc (3) |
| family of functions. |
| The memory must be freed |
| with |
| .BR numa_free (). |
| On errors NULL is returned. |
| |
| .BR numa_alloc_interleaved () |
| allocates |
| .I size |
| bytes of memory page interleaved on all nodes. This function is relatively slow |
| and should only be used for large areas consisting of multiple pages. The |
| interleaving works at page level and will only show an effect when the |
| area is large. |
| The allocated memory must be freed with |
| .BR numa_free (). |
| On error, NULL is returned. |
| |
| .BR numa_alloc_interleaved_subset () |
| attempts to allocate |
| .I size |
| bytes of memory page interleaved on the nodes specified in |
| .IR nodemask . |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| The nodes on which a process is allowed to allocate memory may |
| be constrained externally. |
| If this is the case, this function may fail. |
| This function is relatively slow compared to the |
| .IR malloc (3) |
| family of functions and should only be used for large areas consisting |
| of multiple pages. |
| The interleaving works at page level and will only show an effect when the |
| area is large. |
| The allocated memory must be freed with |
| .BR numa_free (). |
| On error, NULL is returned. |
| |
| .BR numa_alloc () |
| allocates |
| .I size |
| bytes of memory with the current NUMA policy. |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| This function is relatively slow compared to the |
| .IR malloc (3) |
| family of functions. |
| The memory must be freed |
| with |
| .BR numa_free (). |
| On errors NULL is returned. |
| |
| .BR numa_realloc () |
| changes the size of the memory area pointed to by |
| .I old_addr |
| from |
| .I old_size |
| to |
| .IR new_size . |
| The memory area pointed to by |
| .I old_addr |
| must have been allocated with one of the |
| .BR numa_alloc* |
| functions. |
| The |
| .I new_size |
| will be rounded up to a multiple of the system page size. The contents of the |
| memory area will be unchanged to the minimum of the old and new sizes; newly |
| allocated memory will be uninitialized. The memory policy (and node bindings) |
| associated with the original memory area will be preserved in the resized |
| area. For example, if the initial area was allocated with a call to |
| .BR numa_alloc_onnode(), |
| then the new pages (if the area is enlarged) will be allocated on the same node. |
| However, if no memory policy was set for the original area, then |
| .BR numa_realloc () |
| cannot guarantee that the new pages will be allocated on the same node. On |
| success, the address of the resized area is returned (which might be different |
| from that of the initial area), otherwise NULL is returned and |
| .I errno |
| is set to indicate the error. The pointer returned by |
| .BR numa_realloc () |
| is suitable for passing to |
| .BR numa_free (). |
| |
| |
| .BR numa_free () |
| frees |
| .I size |
| bytes of memory starting at |
| .IR start , |
| allocated by the |
| .B numa_alloc_* |
| functions above. |
| The |
| .I size |
| argument will be rounded up to a multiple of the system page size. |
| |
| .BR numa_run_on_node () |
| runs the current task and its children |
| on a specific node. They will not migrate to CPUs of |
| other nodes until the node affinity is reset with a new call to |
| .BR numa_run_on_node_mask (). |
| Passing \-1 |
| permits the kernel to schedule on all nodes again. |
| On success, 0 is returned; on error \-1 is returned, and |
| .I errno |
| is set to indicate the error. |
| |
| .BR numa_run_on_node_mask () |
| runs the current task and its children only on nodes specified in |
| .IR nodemask . |
| They will not migrate to CPUs of |
| other nodes until the node affinity is reset with a new call to |
| .BR numa_run_on_node_mask () |
| or |
| .BR numa_run_on_node (). |
| Passing |
| .I numa_all_nodes_ptr |
| permits the kernel to schedule on all nodes again. |
| On success, 0 is returned; on error \-1 is returned, and |
| .I errno |
| is set to indicate the error. |
| |
| .BR numa_run_on_node_mask_all () |
| runs the current task and its children only on nodes specified in |
| .IR nodemask |
| like |
| .I numa_run_on_node_mask |
| but without any cpuset awareness. |
| |
| .BR numa_get_run_node_mask () |
| returns a mask of nodes on which the current task is allowed to run. |
| |
| .BR numa_tonode_memory () |
| puts memory on a specific node. The constraints described for |
| .BR numa_interleave_memory () |
| apply here too. |
| |
| .BR numa_tonodemask_memory () |
| puts memory on a specific set of nodes. The constraints described for |
| .BR numa_interleave_memory () |
| apply here too. |
| |
| .BR numa_setlocal_memory () |
| locates memory on the current node. The constraints described for |
| .BR numa_interleave_memory () |
| apply here too. |
| |
| .BR numa_police_memory () |
| locates memory with the current NUMA policy. The constraints described for |
| .BR numa_interleave_memory () |
| apply here too. |
| |
| .BR numa_distance () |
| reports the distance in the machine topology between two nodes. |
| The factors are multiples of 10. It returns 0 when the distance |
| cannot be determined. A node has distance 10 to itself. |
| Reporting the distance requires a Linux |
| kernel version of |
| .I 2.6.10 |
| or newer. |
| |
| .BR numa_set_bind_policy () |
| specifies whether calls that bind memory to a specific node should |
| use the preferred policy or a strict policy. |
| The preferred policy allows the kernel |
| to allocate memory on other nodes when there isn't enough free |
| on the target node. strict will fail the allocation in that case. |
| Setting the argument to 1 specifies strict, 0 preferred. |
| Note that specifying more than one node non strict may only use |
| the first node in some kernel versions. |
| |
| .BR numa_set_strict () |
| sets a flag that says whether the functions allocating on specific |
| nodes should use a strict policy. Strict means the allocation |
| will fail if the memory cannot be allocated on the target node. |
| Default operation is to fall back to other nodes. |
| This doesn't apply to interleave and default. |
| |
| .BR numa_get_interleave_node() |
| is used by |
| .I libnuma |
| internally. It is probably not useful for user applications. |
| It uses the MPOL_F_NODE flag of the get_mempolicy system call, which is |
| not intended for application use (its operation may change or be removed |
| altogether in future kernel versions). See get_mempolicy(2). |
| |
| .BR numa_pagesize() |
| returns the number of bytes in a page. This function is simply a fast |
| alternative to repeated calls to the getpagesize system call. |
| See getpagesize(2). |
| |
| .BR numa_sched_getaffinity() |
| retrieves a bitmask of the cpus on which a task may run. The task is |
| specified by |
| .IR pid . |
| Returns the return value of the sched_getaffinity |
| system call. See sched_getaffinity(2). |
| The bitmask must be at least the size of the kernel's cpu mask structure. Use |
| .BR numa_allocate_cpumask() |
| to allocate it. |
| Test the bits in the mask by calling |
| .BR numa_bitmask_isbitset(). |
| |
| .BR numa_sched_setaffinity() |
| sets a task's allowed cpus to those cpus specified in |
| .IR mask . |
| The task is specified by |
| .IR pid . |
| Returns the return value of the sched_setaffinity system call. |
| See sched_setaffinity(2). You may allocate the bitmask with |
| .BR numa_allocate_cpumask(). |
| Or the bitmask may be smaller than the kernel's cpu mask structure. For |
| example, call |
| .BR numa_bitmask_alloc() |
| using a maximum number of cpus from |
| .BR numa_num_configured_cpus(). |
| Set the bits in the mask by calling |
| .BR numa_bitmask_setbit(). |
| |
| .BR numa_node_to_cpus () |
| converts a node number to a bitmask of CPUs. The user must pass a bitmask |
| structure with a mask buffer long enough to represent all possible cpus. |
| Use numa_allocate_cpumask() to create it. If the bitmask is not long enough |
| .I errno |
| will be set to |
| .I ERANGE |
| and \-1 returned. On success 0 is returned. |
| |
| .BR numa_node_of_cpu () |
| returns the node that a cpu belongs to. If the user supplies an invalid cpu |
| .I errno |
| will be set to |
| .I EINVAL |
| and \-1 will be returned. |
| |
| .BR numa_allocate_cpumask () |
| returns a bitmask of a size equal to the kernel's cpu |
| mask (kernel type cpumask_t). In other words, large enough to represent |
| NR_CPUS cpus. This number of cpus can be gotten by calling |
| .BR numa_num_possible_cpus(). |
| The bitmask is zero-filled. |
| |
| .BR numa_free_cpumask () |
| frees a cpumask previously allocated by |
| .IR numa_allocate_cpumask (). |
| |
| .BR numa_allocate_nodemask() |
| returns a bitmask of a size equal to the kernel's node |
| mask (kernel type nodemask_t). In other words, large enough to represent |
| MAX_NUMNODES nodes. This number of nodes can be gotten by calling |
| .BR numa_num_possible_nodes(). |
| The bitmask is zero-filled. |
| |
| .BR numa_free_nodemask() |
| frees a nodemask previously allocated by |
| .IR numa_allocate_nodemask (). |
| |
| .BR numa_bitmask_alloc() |
| allocates a bitmask structure and its associated bit mask. |
| The memory allocated for the bit mask contains enough words (type unsigned |
| long) to contain |
| .I n |
| bits. The bit mask is zero-filled. The bitmask |
| structure points to the bit mask and contains the |
| .I n |
| value. |
| |
| .BR numa_bitmask_clearall() |
| sets all bits in the bit mask to 0. The bitmask structure |
| points to the bit mask and contains its size ( |
| .I bmp |
| ->size). The value of |
| .I bmp |
| is always returned. Note that |
| .BR numa_bitmask_alloc() |
| creates a zero-filled bit mask. |
| |
| .BR numa_bitmask_clearbit() |
| sets a specified bit in a bit mask to 0. Nothing is done if |
| the |
| .I n |
| value is greater than the size of the bitmask (and no error is |
| returned). The value of |
| .I bmp |
| is always returned. |
| |
| .BR numa_bitmask_equal() |
| returns 1 if two bitmasks are equal. It returns 0 if they |
| are not equal. If the bitmask structures control bit masks of different |
| sizes, the "missing" trailing bits of the smaller bit mask are considered |
| to be 0. |
| |
| .BR numa_bitmask_free() |
| deallocates the memory of both the bitmask structure pointed |
| to by |
| .I bmp |
| and the bit mask. It is an error to attempt to free this bitmask twice. |
| |
| .BR numa_bitmask_isbitset() |
| returns the value of a specified bit in a bit mask. |
| If the |
| .I n |
| value is greater than the size of the bit map, 0 is returned. |
| |
| .BR numa_bitmask_nbytes() |
| returns the size (in bytes) of the bit mask controlled by |
| .IR bmp . |
| The bit masks are always full words (type unsigned long), and the returned |
| size is the actual size of all those words. |
| |
| .BR numa_bitmask_setall() |
| sets all bits in the bit mask to 1. The bitmask structure |
| points to the bit mask and contains its size |
| .RI ( bmp ->size). |
| The value of |
| .I bmp |
| is always returned. |
| |
| .BR numa_bitmask_setbit() |
| sets a specified bit in a bit mask to 1. Nothing is done if |
| .I n |
| is greater than the size of the bitmask (and no error is |
| returned). The value of |
| .I bmp |
| is always returned. |
| |
| .BR copy_bitmask_to_nodemask() |
| copies the body (the bit map itself) of the bitmask structure pointed |
| to by |
| .I bmp |
| to the nodemask_t structure pointed to by the |
| .I nodemask |
| pointer. If the two areas differ in size, the copy is truncated to the size |
| of the receiving field or zero-filled. |
| |
| .BR copy_nodemask_to_bitmask() |
| copies the nodemask_t structure pointed to by the |
| .I nodemask |
| pointer to the body (the bit map itself) of the bitmask structure pointed |
| to by the |
| .I bmp |
| pointer. If the two areas differ in size, the copy is truncated to the size |
| of the receiving field or zero-filled. |
| |
| .BR copy_bitmask_to_bitmask() |
| copies the body (the bit map itself) of the bitmask structure pointed |
| to by the |
| .I bmpfrom |
| pointer to the body of the bitmask structure pointed to by the |
| .I bmpto |
| pointer. If the two areas differ in size, the copy is truncated to the size |
| of the receiving field or zero-filled. |
| |
| .BR numa_bitmask_weight() |
| returns a count of the bits that are set in the body of the bitmask pointed |
| to by the |
| .I bmp |
| argument. |
| |
| .br |
| .BR numa_move_pages() |
| moves a list of pages in the address space of the specified task |
| or the current task. |
| It simply uses the move_pages system call. |
| .br |
| .I pid |
| - ID of task. If not valid, use the current task. |
| .br |
| .I count |
| - Number of pages. |
| .br |
| .I pages |
| - List of pages to move. |
| .br |
| .I nodes |
| - List of nodes to which pages can be moved. |
| .br |
| .I status |
| - Field to which status is to be returned. |
| .br |
| .I flags |
| - MPOL_MF_MOVE or MPOL_MF_MOVE_ALL |
| .br |
| See move_pages(2). |
| |
| .BR numa_migrate_pages() |
| simply uses the migrate_pages system call to cause the pages of the calling |
| task, or a specified task, to be migrated from one set of nodes to another. |
| See migrate_pages(2). |
| The bit masks representing the nodes should be allocated with |
| .BR numa_allocate_nodemask (), |
| or with |
| .BR numa_bitmask_alloc() |
| using an |
| .I n |
| value returned from |
| .BR numa_num_possible_nodes(). |
| A task's current node set can be gotten by calling |
| .BR numa_get_membind(). |
| Bits in the |
| .I tonodes |
| mask can be set by calls to |
| .BR numa_bitmask_setbit(). |
| |
| .BR numa_error () |
| is a |
| .I libnuma |
| internal function that can be overridden by the |
| user program. |
| This function is called with a |
| .I char * |
| argument when a |
| .I libnuma |
| function fails. |
| Overriding the library internal definition |
| makes it possible to specify a different error handling strategy |
| when a |
| .I libnuma |
| function fails. It does not affect |
| .BR numa_available (). |
| The |
| .BR numa_error () |
| function defined in |
| .I libnuma |
| prints an error on |
| .I stderr |
| and terminates |
| the program if |
| .I numa_exit_on_error |
| is set to a non-zero value. |
| The default value of |
| .I numa_exit_on_error |
| is zero. |
| |
| .BR numa_warn () |
| is a |
| .I libnuma |
| internal function that can also be overridden |
| by the user program. |
| It is called to warn the user when a |
| .I libnuma |
| function encounters a non-fatal error. |
| The default implementation |
| prints a warning to |
| .IR stderr . |
| The first argument is a unique |
| number identifying each warning. After that there is a |
| .BR printf (3)-style |
| format string and a variable number of arguments. |
| .I numa_warn |
| exits the program when |
| .I numa_exit_on_warn |
| is set to a non-zero value. |
| The default value of |
| .I numa_exit_on_warn |
| is zero. |
| |
| .SH Compatibility with libnuma version 1 |
| Binaries that were compiled for libnuma version 1 need not be re-compiled |
| to run with libnuma version 2. |
| .br |
| Source code written for libnuma version 1 may be re-compiled without |
| change with version 2 installed. To do so, in the code's Makefile add |
| this option to CFLAGS: -DNUMA_VERSION1_COMPATIBILITY |
| |
| .SH THREAD SAFETY |
| .I numa_set_bind_policy |
| and |
| .I numa_exit_on_error |
| are process global. The other calls are thread safe. |
| |
| .SH COPYRIGHT |
| Copyright 2002, 2004, 2007, 2008 Andi Kleen, SuSE Labs. |
| .I libnuma |
| is under the GNU Lesser General Public License, v2.1. |
| |
| .SH SEE ALSO |
| .BR get_mempolicy (2), |
| .BR set_mempolicy (2), |
| .BR getpagesize (2), |
| .BR mbind (2), |
| .BR mmap (2), |
| .BR shmat (2), |
| .BR numactl (8), |
| .BR sched_getaffinity (2), |
| .BR sched_setaffinity (2), |
| .BR move_pages (2), |
| .BR migrate_pages (2) |