% blob: 834e87155e5de5a06bfa5936dfdc0d3cd75dbd96 [file] [log] [blame]
% File src/library/parallel/man/clusterApply.Rd
% Part of the R package, https://www.R-project.org
% Copyright 2003-2018 R Core Team
% Distributed under GPL 2 or later
\name{clusterApply}
\alias{clusterApply}
\alias{clusterApplyLB}
\alias{clusterCall}
\alias{clusterEvalQ}
\alias{clusterExport}
\alias{clusterMap}
\alias{clusterSplit}
\alias{parApply}
\alias{parCapply}
\alias{parLapply}
\alias{parRapply}
\alias{parSapply}
\alias{parLapplyLB}
\alias{parSapplyLB}
\title{Apply Operations using Clusters}
\description{
These functions provide several ways to parallelize computations using
a cluster.
}
\usage{
clusterCall(cl = NULL, fun, ...)
clusterApply(cl = NULL, x, fun, ...)
clusterApplyLB(cl = NULL, x, fun, ...)
clusterEvalQ(cl = NULL, expr)
clusterExport(cl = NULL, varlist, envir = .GlobalEnv)
clusterMap(cl = NULL, fun, ..., MoreArgs = NULL, RECYCLE = TRUE,
SIMPLIFY = FALSE, USE.NAMES = TRUE,
.scheduling = c("static", "dynamic"))
clusterSplit(cl = NULL, seq)
parLapply(cl = NULL, X, fun, ..., chunk.size = NULL)
parSapply(cl = NULL, X, FUN, ..., simplify = TRUE,
USE.NAMES = TRUE, chunk.size = NULL)
parApply(cl = NULL, X, MARGIN, FUN, ..., chunk.size = NULL)
parRapply(cl = NULL, x, FUN, ..., chunk.size = NULL)
parCapply(cl = NULL, x, FUN, ..., chunk.size = NULL)
parLapplyLB(cl = NULL, X, fun, ..., chunk.size = NULL)
parSapplyLB(cl = NULL, X, FUN, ..., simplify = TRUE,
USE.NAMES = TRUE, chunk.size = NULL)
}
\arguments{
\item{cl}{a cluster object, created by this package or by package
\CRANpkg{snow}. If \code{NULL}, use the registered default cluster.}
\item{fun, FUN}{function or character string naming a function.}
\item{expr}{expression to evaluate.}
\item{seq}{vector to split.}
\item{varlist}{character vector of names of objects to export.}
\item{envir}{environment from which to export variables.}
\item{x}{a vector for \code{clusterApply} and \code{clusterApplyLB}, a
matrix for \code{parRapply} and \code{parCapply}.}
\item{...}{additional arguments to pass to \code{fun} or \code{FUN}:
beware of partial matching to earlier arguments.}
\item{MoreArgs}{additional arguments for \code{fun}.}
\item{RECYCLE}{logical; if true shorter arguments are recycled.}
\item{X}{A vector (atomic or list) for \code{parLapply} and
\code{parSapply}, an array for \code{parApply}.}
\item{chunk.size}{scalar number; number of invocations of \code{fun} or
\code{FUN} in one chunk; a chunk is a unit for scheduling.}
\item{MARGIN}{vector specifying the dimensions to use.}
\item{simplify, USE.NAMES}{logical; see \code{\link{sapply}}.}
\item{SIMPLIFY}{logical; see \code{\link{mapply}}.}
\item{.scheduling}{should tasks be statically allocated to nodes or
dynamic load-balancing used?}
}
\details{
\code{clusterCall} calls a function \code{fun} with identical
arguments \code{...} on each node.
\code{clusterEvalQ} evaluates a literal expression on each cluster
node. It is a parallel version of \code{\link{evalq}}, and is a
convenience function invoking \code{clusterCall}.
\code{clusterApply} calls \code{fun} on the first node with
arguments \code{x[[1]]} and \code{...}, on the second node with
\code{x[[2]]} and \code{...}, and so on, recycling nodes as needed.
\code{clusterApplyLB} is a load balancing version of
\code{clusterApply}. If the length \code{n} of \code{x} is not
greater than the number of nodes \code{p}, then a job is sent to
\code{n} nodes. Otherwise the first \code{p} jobs are placed in order
on the \code{p} nodes. When the first job completes, the next job is
placed on the node that has become free; this continues until all jobs
are complete. Using \code{clusterApplyLB} can result in better
cluster utilization than using \code{clusterApply}, but increased
communication can reduce performance. Furthermore, the node that
executes a particular job is non-deterministic. This means that
simulations that assign RNG streams to nodes will not be reproducible.
\code{clusterMap} is a multi-argument version of \code{clusterApply},
analogous to \code{\link{mapply}} and \code{\link{Map}}. If
\code{RECYCLE} is true shorter arguments are recycled (and either none
or all must be of length zero); otherwise, the result length is the
length of the shortest argument. Nodes are recycled if the length of
the result is greater than the number of nodes. (\code{mapply} always
uses \code{RECYCLE = TRUE}, and has argument \code{SIMPLIFY = TRUE}.
\code{Map} always uses \code{RECYCLE = TRUE}.)
\code{clusterExport} assigns the values on the master \R process of
the variables named in \code{varlist} to variables of the same names
in the global environment (aka \sQuote{workspace}) of each node. The
environment on the master from which variables are exported defaults
to the global environment.
\code{clusterSplit} splits \code{seq} into a consecutive piece for
each cluster and returns the result as a list with length equal to the
number of nodes. Currently the pieces are chosen to be close
to equal in length: the computation is done on the master.
\code{parLapply}, \code{parSapply}, and \code{parApply} are parallel
versions of \code{lapply}, \code{sapply} and \code{apply}. Chunks of
computation are statically allocated to nodes using \code{clusterApply}.
By default, the number of chunks is the same as the number of nodes.
\code{parLapplyLB} and \code{parSapplyLB} are load-balancing versions,
intended for use when applying \code{FUN} to different elements of
\code{X} takes quite variable amounts of time, and either the function is
deterministic or reproducible results are not required. Chunks of
computation are allocated dynamically to nodes using
\code{clusterApplyLB}. From \R 3.5.0, the default number of chunks is
twice the number of nodes. Before \R 3.5.0, the (fixed) number of chunks
was the same as the number of nodes. As for \code{clusterApplyLB},
with load balancing the node that executes a particular job is
non-deterministic and simulations that assign RNG streams to nodes
will not be reproducible.
\code{parRapply} and \code{parCapply} are parallel row and column
\code{apply} functions for a matrix \code{x}; they may be slightly
more efficient than \code{parApply} but do less post-processing of the
result.
A chunk size of \code{0} with static scheduling uses the default (one
chunk per node). With dynamic scheduling, a chunk size of \code{0} has the
same effect as \code{1} (one invocation of \code{FUN}/\code{fun} per
chunk).
}
\value{
For \code{clusterCall}, \code{clusterEvalQ} and \code{clusterSplit}, a
list with one element per node.
For \code{clusterApply} and \code{clusterApplyLB}, a list the same
length as \code{x}.
\code{clusterMap} follows \code{\link{mapply}}.
\code{clusterExport} returns nothing.
\code{parLapply} returns a list the length of \code{X}.
\code{parSapply} and \code{parApply} follow \code{\link{sapply}} and
\code{\link{apply}} respectively.
\code{parRapply} and \code{parCapply} always return a vector. If
\code{FUN} always returns a scalar result this will be of length the
number of rows or columns: otherwise it will be the concatenation of
the returned values.
An error is signalled on the master if any of the workers produces an
error.
}
\note{
These functions are almost identical to those in package \CRANpkg{snow}.
Two exceptions: \code{parLapply} has argument \code{X}
not \code{x} for consistency with \code{\link{lapply}}, and
\code{parSapply} has been updated to match \code{\link{sapply}}.
}
\author{
Luke Tierney and R Core.
Derived from the \CRANpkg{snow} package.
}
% donttest, as access to ports might be denied. Tested in the 'tests' directory
\examples{\donttest{
## Use option cl.cores to choose an appropriate cluster size.
## (falls back to 2 workers when the option is not set)
cl <- makeCluster(getOption("cl.cores", 2))
## Call `+` once per element of 1:2, with 3 as the extra argument
clusterApply(cl, 1:2, get("+"), 3)
## Copy 'xx' from the master to every node, then use it there
xx <- 1
clusterExport(cl, "xx")
## Every node evaluates the same call: xx + 2
clusterCall(cl, function(y) xx + y, 2)
## Use clusterMap like an mapply example
clusterMap(cl, function(x, y) seq_len(x) + y,
c(a = 1, b = 2, c = 3), c(A = 10, B = 0, C = -10))
## Parallel sapply: add 3 to each of 1:20
parSapply(cl, 1:20, get("+"), 3)
## A bootstrapping example, which can be done in many ways:
clusterEvalQ(cl, {
## set up each worker. Could also use clusterExport()
library(boot)
cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
## finish with NULL so that is the value of the block on each node,
## rather than the value of the last assignment
NULL
})
## Each node runs its own R = 100 parametric bootstrap, using the
## objects created on it above; 'res' has one element per node
res <- clusterEvalQ(cl, boot(cd4, corr, R = 100,
sim = "parametric", ran.gen = cd4.rg, mle = cd4.mle))
## boot is attached on the master as well, for boot.ci() below
library(boot)
## combine the per-node results into a single object
cd4.boot <- do.call(c, res)
boot.ci(cd4.boot, type = c("norm", "basic", "perc"),
conf = 0.9, h = atanh, hinv = tanh)
## shut the workers down before starting the second variant
stopCluster(cl)
## or
library(boot)
## Self-contained task: it does its own setup on the worker, so
## nothing has to be evaluated or exported there beforehand.
## Its arguments (...) are not used in the body.
run1 <- function(...) {
library(boot)
cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
boot(cd4, corr, R = 500, sim = "parametric",
ran.gen = cd4.rg, mle = cd4.mle)
}
## 'mc' keeps the number of workers for use in parLapply() below
cl <- makeCluster(mc <- getOption("cl.cores", 2))
## to make this reproducible
clusterSetRNGStream(cl, 123)
## one call of run1 per element of seq_len(mc), results combined with c()
cd4.boot <- do.call(c, parLapply(cl, seq_len(mc), run1))
boot.ci(cd4.boot, type = c("norm", "basic", "perc"),
conf = 0.9, h = atanh, hinv = tanh)
stopCluster(cl)
}}