src/library/parallel/man/clusterApply.Rd - R - Git at Google

 % File src/library/parallel/man/clusterApply.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 2003-2018 R Core Team
 % Distributed under GPL 2 or later

 \name{clusterApply}

 \alias{clusterApply}
 \alias{clusterApplyLB}
 \alias{clusterCall}
 \alias{clusterEvalQ}
 \alias{clusterExport}
 \alias{clusterMap}
 \alias{clusterSplit}
 \alias{parApply}
 \alias{parCapply}
 \alias{parLapply}
 \alias{parRapply}
 \alias{parSapply}
 \alias{parLapplyLB}
 \alias{parSapplyLB}

 \title{Apply Operations using Clusters}
 \description{
   These functions provide several ways to parallelize computations using
   a cluster.
 }
 \usage{
 clusterCall(cl = NULL, fun, ...)
 clusterApply(cl = NULL, x, fun, ...)
 clusterApplyLB(cl = NULL, x, fun, ...)
 clusterEvalQ(cl = NULL, expr)
 clusterExport(cl = NULL, varlist, envir = .GlobalEnv)
 clusterMap(cl = NULL, fun, ..., MoreArgs = NULL, RECYCLE = TRUE,
            SIMPLIFY = FALSE, USE.NAMES = TRUE,
            .scheduling = c("static", "dynamic"))
 clusterSplit(cl = NULL, seq)

 parLapply(cl = NULL, X, fun, ..., chunk.size = NULL)
 parSapply(cl = NULL, X, FUN, ..., simplify = TRUE,
           USE.NAMES = TRUE, chunk.size = NULL)
 parApply(cl = NULL, X, MARGIN, FUN, ..., chunk.size = NULL)
 parRapply(cl = NULL, x, FUN, ..., chunk.size = NULL)
 parCapply(cl = NULL, x, FUN, ..., chunk.size = NULL)

 parLapplyLB(cl = NULL, X, fun, ..., chunk.size = NULL)
 parSapplyLB(cl = NULL, X, FUN, ..., simplify = TRUE,
             USE.NAMES = TRUE, chunk.size = NULL)
 }
 \arguments{
   \item{cl}{a cluster object, created by this package or by package
     \CRANpkg{snow}.  If \code{NULL}, use the registered default cluster.}
   \item{fun, FUN}{function or character string naming a function.}
   \item{expr}{expression to evaluate.}
   \item{seq}{vector to split.}
   \item{varlist}{character vector of names of objects to export.}
   \item{envir}{environment from which to export variables.}
   \item{x}{a vector for \code{clusterApply} and \code{clusterApplyLB}, a
     matrix for \code{parRapply} and \code{parCapply}.}
   \item{...}{additional arguments to pass to \code{fun} or \code{FUN}:
     beware of partial matching to earlier arguments.}
   \item{MoreArgs}{additional arguments for \code{fun}.}
   \item{RECYCLE}{logical; if true shorter arguments are recycled.}
   \item{X}{A vector (atomic or list) for \code{parLapply} and
     \code{parSapply}, an array for \code{parApply}.}
   \item{chunk.size}{scalar number; number of invocations of \code{fun} or
     \code{FUN} in one chunk; a chunk is a unit for scheduling.}
   \item{MARGIN}{vector specifying the dimensions to use.}
   \item{simplify, USE.NAMES}{logical; see \code{\link{sapply}}.}
   \item{SIMPLIFY}{logical; see \code{\link{mapply}}.}
   \item{.scheduling}{should tasks be statically allocated to nodes or
     dynamic load-balancing used?}
 }
 \details{
   \code{clusterCall} calls a function \code{fun} with identical
   arguments \code{...} on each node.

   \code{clusterEvalQ} evaluates a literal expression on each cluster
   node.  It is a parallel version of \code{\link{evalq}}, and is a
   convenience function invoking \code{clusterCall}.

   \code{clusterApply} calls \code{fun} on the first node with
   arguments \code{x[[1]]} and \code{...}, on the second node with
   \code{x[[2]]} and \code{...}, and so on, recycling nodes as needed.

   \code{clusterApplyLB} is a load balancing version of
   \code{clusterApply}.  If the length \code{n} of \code{x} is not
   greater than the number of nodes \code{p}, then a job is sent to
   \code{n} nodes.  Otherwise the first \code{p} jobs are placed in order
   on the \code{p} nodes.  When the first job completes, the next job is
   placed on the node that has become free; this continues until all jobs
   are complete.  Using \code{clusterApplyLB} can result in better
   cluster utilization than using \code{clusterApply}, but increased
   communication can reduce performance.  Furthermore, the node that
   executes a particular job is non-deterministic. This means that
   simulations that assign RNG streams to nodes will not be reproducible.

   \code{clusterMap} is a multi-argument version of \code{clusterApply},
   analogous to \code{\link{mapply}} and \code{\link{Map}}.  If
   \code{RECYCLE} is true shorter arguments are recycled (and either none
   or all must be of length zero); otherwise, the result length is the
   length of the shortest argument.  Nodes are recycled if the length of
   the result is greater than the number of nodes.  (\code{mapply} always
   uses \code{RECYCLE = TRUE}, and has argument \code{SIMPLIFY = TRUE}.
   \code{Map} always uses \code{RECYCLE = TRUE}.)

   \code{clusterExport} assigns the values on the master \R process of
   the variables named in \code{varlist} to variables of the same names
   in the global environment (aka \sQuote{workspace}) of each node.  The
   environment on the master from which variables are exported defaults
   to the global environment.

   \code{clusterSplit} splits \code{seq} into a consecutive piece for
   each cluster and returns the result as a list with length equal to the
   number of nodes.  Currently the pieces are chosen to be close
   to equal in length: the computation is done on the master.

   \code{parLapply}, \code{parSapply}, and \code{parApply} are parallel
   versions of \code{lapply}, \code{sapply} and \code{apply}.  Chunks of
   computation are statically allocated to nodes using \code{clusterApply}.
   By default, the number of chunks is the same as the number of nodes.
   \code{parLapplyLB} and \code{parSapplyLB} are load-balancing versions,
   intended for use when applying \code{FUN} to different elements of
   \code{X} takes quite variable amounts of time, and either the function is
   deterministic or reproducible results are not required.  Chunks of
   computation are allocated dynamically to nodes using
   \code{clusterApplyLB}.  From \R 3.5.0, the default number of chunks is
   twice the number of nodes. Before \R 3.5.0, the (fixed) number of chunks
   was the same as the number of nodes.  As for \code{clusterApplyLB},
   with load balancing the node that executes a particular job is
   non-deterministic and simulations that assign RNG streams to nodes
   will not be reproducible.

   \code{parRapply} and \code{parCapply} are parallel row and column
   \code{apply} functions for a matrix \code{x}; they may be slightly
   more efficient than \code{parApply} but do less post-processing of the
   result.

   A chunk size of \code{0} with static scheduling uses the default (one
   chunk per node).  With dynamic scheduling, a chunk size of \code{0} has the
   same effect as \code{1} (one invocation of \code{FUN}/\code{fun} per
   chunk).
 }
 \value{
   For \code{clusterCall}, \code{clusterEvalQ} and \code{clusterSplit}, a
   list with one element per node.

   For \code{clusterApply} and \code{clusterApplyLB}, a list the same
   length as \code{x}.

   \code{clusterMap} follows \code{\link{mapply}}.

   \code{clusterExport} returns nothing.

   \code{parLapply} returns a list the length of \code{X}.

   \code{parSapply} and \code{parApply} follow \code{\link{sapply}} and
   \code{\link{apply}} respectively.

   \code{parRapply} and \code{parCapply} always return a vector.  If
   \code{FUN} always returns a scalar result this will be of length the
   number of rows or columns: otherwise it will be the concatenation of
   the returned values.

   An error is signalled on the master if any of the workers produces an
   error.
 }
 \note{
   These functions are almost identical to those in package \CRANpkg{snow}.

   Two exceptions: \code{parLapply} has argument \code{X}
   not \code{x} for consistency with \code{\link{lapply}}, and
   \code{parSapply} has been updated to match \code{\link{sapply}}.
 }
 \author{
   Luke Tierney and R Core.

   Derived from the \CRANpkg{snow} package.
 }
 % donttest, as access to ports might be denied.  Tested in the 'tests' directory
 \examples{\donttest{
 ## Use option cl.cores to choose an appropriate cluster size.
 cl <- makeCluster(getOption("cl.cores", 2))

 clusterApply(cl, 1:2, get("+"), 3)
 xx <- 1
 clusterExport(cl, "xx")
 clusterCall(cl, function(y) xx + y, 2)

 ## Use clusterMap like an mapply example
 clusterMap(cl, function(x, y) seq_len(x) + y,
           c(a =  1, b = 2, c = 3), c(A = 10, B = 0, C = -10))


 parSapply(cl, 1:20, get("+"), 3)

 ## A bootstrapping example, which can be done in many ways:
 clusterEvalQ(cl, {
   ## set up each worker.  Could also use clusterExport()
   library(boot)
   cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
   cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
   NULL
 })
 res <- clusterEvalQ(cl, boot(cd4, corr, R = 100,
                     sim = "parametric", ran.gen = cd4.rg, mle = cd4.mle))
 library(boot)
 cd4.boot <- do.call(c, res)
 boot.ci(cd4.boot,  type = c("norm", "basic", "perc"),
         conf = 0.9, h = atanh, hinv = tanh)
 stopCluster(cl)

 ## or
 library(boot)
 run1 <- function(...) {
    library(boot)
    cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
    cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
    boot(cd4, corr, R = 500, sim = "parametric",
         ran.gen = cd4.rg, mle = cd4.mle)
 }
 cl <- makeCluster(mc <- getOption("cl.cores", 2))
 ## to make this reproducible
 clusterSetRNGStream(cl, 123)
 cd4.boot <- do.call(c, parLapply(cl, seq_len(mc), run1))
 boot.ci(cd4.boot,  type = c("norm", "basic", "perc"),
         conf = 0.9, h = atanh, hinv = tanh)
 stopCluster(cl)
 }}
	% File src/library/parallel/man/clusterApply.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 2003-2018 R Core Team
	% Distributed under GPL 2 or later

	\name{clusterApply}

	\alias{clusterApply}
	\alias{clusterApplyLB}
	\alias{clusterCall}
	\alias{clusterEvalQ}
	\alias{clusterExport}
	\alias{clusterMap}
	\alias{clusterSplit}
	\alias{parApply}
	\alias{parCapply}
	\alias{parLapply}
	\alias{parRapply}
	\alias{parSapply}
	\alias{parLapplyLB}
	\alias{parSapplyLB}

	\title{Apply Operations using Clusters}
	\description{
	These functions provide several ways to parallelize computations using
	a cluster.
	}
	\usage{
	clusterCall(cl = NULL, fun, ...)
	clusterApply(cl = NULL, x, fun, ...)
	clusterApplyLB(cl = NULL, x, fun, ...)
	clusterEvalQ(cl = NULL, expr)
	clusterExport(cl = NULL, varlist, envir = .GlobalEnv)
	clusterMap(cl = NULL, fun, ..., MoreArgs = NULL, RECYCLE = TRUE,
	SIMPLIFY = FALSE, USE.NAMES = TRUE,
	.scheduling = c("static", "dynamic"))
	clusterSplit(cl = NULL, seq)

	parLapply(cl = NULL, X, fun, ..., chunk.size = NULL)
	parSapply(cl = NULL, X, FUN, ..., simplify = TRUE,
	USE.NAMES = TRUE, chunk.size = NULL)
	parApply(cl = NULL, X, MARGIN, FUN, ..., chunk.size = NULL)
	parRapply(cl = NULL, x, FUN, ..., chunk.size = NULL)
	parCapply(cl = NULL, x, FUN, ..., chunk.size = NULL)

	parLapplyLB(cl = NULL, X, fun, ..., chunk.size = NULL)
	parSapplyLB(cl = NULL, X, FUN, ..., simplify = TRUE,
	USE.NAMES = TRUE, chunk.size = NULL)
	}
	\arguments{
	\item{cl}{a cluster object, created by this package or by package
	\CRANpkg{snow}. If \code{NULL}, use the registered default cluster.}
	\item{fun, FUN}{function or character string naming a function.}
	\item{expr}{expression to evaluate.}
	\item{seq}{vector to split.}
	\item{varlist}{character vector of names of objects to export.}
	\item{envir}{environment from which to export variables.}
	\item{x}{a vector for \code{clusterApply} and \code{clusterApplyLB}, a
	matrix for \code{parRapply} and \code{parCapply}.}
	\item{...}{additional arguments to pass to \code{fun} or \code{FUN}:
	beware of partial matching to earlier arguments.}
	\item{MoreArgs}{additional arguments for \code{fun}.}
	\item{RECYCLE}{logical; if true shorter arguments are recycled.}
	\item{X}{A vector (atomic or list) for \code{parLapply} and
	\code{parSapply}, an array for \code{parApply}.}
	\item{chunk.size}{scalar number; number of invocations of \code{fun} or
	\code{FUN} in one chunk; a chunk is a unit for scheduling.}
	\item{MARGIN}{vector specifying the dimensions to use.}
	\item{simplify, USE.NAMES}{logical; see \code{\link{sapply}}.}
	\item{SIMPLIFY}{logical; see \code{\link{mapply}}.}
	\item{.scheduling}{should tasks be statically allocated to nodes or
	dynamic load-balancing used?}
	}
	\details{
	\code{clusterCall} calls a function \code{fun} with identical
	arguments \code{...} on each node.

	\code{clusterEvalQ} evaluates a literal expression on each cluster
	node. It is a parallel version of \code{\link{evalq}}, and is a
	convenience function invoking \code{clusterCall}.

	\code{clusterApply} calls \code{fun} on the first node with
	arguments \code{x[[1]]} and \code{...}, on the second node with
	\code{x[[2]]} and \code{...}, and so on, recycling nodes as needed.

	\code{clusterApplyLB} is a load balancing version of
	\code{clusterApply}. If the length \code{n} of \code{x} is not
	greater than the number of nodes \code{p}, then a job is sent to
	\code{n} nodes. Otherwise the first \code{p} jobs are placed in order
	on the \code{p} nodes. When the first job completes, the next job is
	placed on the node that has become free; this continues until all jobs
	are complete. Using \code{clusterApplyLB} can result in better
	cluster utilization than using \code{clusterApply}, but increased
	communication can reduce performance. Furthermore, the node that
	executes a particular job is non-deterministic. This means that
	simulations that assign RNG streams to nodes will not be reproducible.

	\code{clusterMap} is a multi-argument version of \code{clusterApply},
	analogous to \code{\link{mapply}} and \code{\link{Map}}. If
	\code{RECYCLE} is true shorter arguments are recycled (and either none
	or all must be of length zero); otherwise, the result length is the
	length of the shortest argument. Nodes are recycled if the length of
	the result is greater than the number of nodes. (\code{mapply} always
	uses \code{RECYCLE = TRUE}, and has argument \code{SIMPLIFY = TRUE}.
	\code{Map} always uses \code{RECYCLE = TRUE}.)

	\code{clusterExport} assigns the values on the master \R process of
	the variables named in \code{varlist} to variables of the same names
	in the global environment (aka \sQuote{workspace}) of each node. The
	environment on the master from which variables are exported defaults
	to the global environment.

	\code{clusterSplit} splits \code{seq} into a consecutive piece for
	each cluster and returns the result as a list with length equal to the
	number of nodes. Currently the pieces are chosen to be close
	to equal in length: the computation is done on the master.

	\code{parLapply}, \code{parSapply}, and \code{parApply} are parallel
	versions of \code{lapply}, \code{sapply} and \code{apply}. Chunks of
	computation are statically allocated to nodes using \code{clusterApply}.
	By default, the number of chunks is the same as the number of nodes.
	\code{parLapplyLB} and \code{parSapplyLB} are load-balancing versions,
	intended for use when applying \code{FUN} to different elements of
	\code{X} takes quite variable amounts of time, and either the function is
	deterministic or reproducible results are not required. Chunks of
	computation are allocated dynamically to nodes using
	\code{clusterApplyLB}. From \R 3.5.0, the default number of chunks is
	twice the number of nodes. Before \R 3.5.0, the (fixed) number of chunks
	was the same as the number of nodes. As for \code{clusterApplyLB},
	with load balancing the node that executes a particular job is
	non-deterministic and simulations that assign RNG streams to nodes
	will not be reproducible.

	\code{parRapply} and \code{parCapply} are parallel row and column
	\code{apply} functions for a matrix \code{x}; they may be slightly
	more efficient than \code{parApply} but do less post-processing of the
	result.

	A chunk size of \code{0} with static scheduling uses the default (one
	chunk per node). With dynamic scheduling, a chunk size of \code{0} has the
	same effect as \code{1} (one invocation of \code{FUN}/\code{fun} per
	chunk).
	}
	\value{
	For \code{clusterCall}, \code{clusterEvalQ} and \code{clusterSplit}, a
	list with one element per node.

	For \code{clusterApply} and \code{clusterApplyLB}, a list the same
	length as \code{x}.

	\code{clusterMap} follows \code{\link{mapply}}.

	\code{clusterExport} returns nothing.

	\code{parLapply} returns a list the length of \code{X}.

	\code{parSapply} and \code{parApply} follow \code{\link{sapply}} and
	\code{\link{apply}} respectively.

	\code{parRapply} and \code{parCapply} always return a vector. If
	\code{FUN} always returns a scalar result this will be of length the
	number of rows or columns: otherwise it will be the concatenation of
	the returned values.

	An error is signalled on the master if any of the workers produces an
	error.
	}
	\note{
	These functions are almost identical to those in package \CRANpkg{snow}.

	Two exceptions: \code{parLapply} has argument \code{X}
	not \code{x} for consistency with \code{\link{lapply}}, and
	\code{parSapply} has been updated to match \code{\link{sapply}}.
	}
	\author{
	Luke Tierney and R Core.

	Derived from the \CRANpkg{snow} package.
	}
	% donttest, as access to ports might be denied. Tested in the 'tests' directory
	\examples{\donttest{
	## Use option cl.cores to choose an appropriate cluster size.
	cl <- makeCluster(getOption("cl.cores", 2))

	clusterApply(cl, 1:2, get("+"), 3)
	xx <- 1
	clusterExport(cl, "xx")
	clusterCall(cl, function(y) xx + y, 2)

	## Use clusterMap like an mapply example
	clusterMap(cl, function(x, y) seq_len(x) + y,
	c(a = 1, b = 2, c = 3), c(A = 10, B = 0, C = -10))


	parSapply(cl, 1:20, get("+"), 3)

	## A bootstrapping example, which can be done in many ways:
	clusterEvalQ(cl, {
	## set up each worker. Could also use clusterExport()
	library(boot)
	cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
	cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
	NULL
	})
	res <- clusterEvalQ(cl, boot(cd4, corr, R = 100,
	sim = "parametric", ran.gen = cd4.rg, mle = cd4.mle))
	library(boot)
	cd4.boot <- do.call(c, res)
	boot.ci(cd4.boot, type = c("norm", "basic", "perc"),
	conf = 0.9, h = atanh, hinv = tanh)
	stopCluster(cl)

	## or
	library(boot)
	run1 <- function(...) {
	library(boot)
	cd4.rg <- function(data, mle) MASS::mvrnorm(nrow(data), mle$m, mle$v)
	cd4.mle <- list(m = colMeans(cd4), v = var(cd4))
	boot(cd4, corr, R = 500, sim = "parametric",
	ran.gen = cd4.rg, mle = cd4.mle)
	}
	cl <- makeCluster(mc <- getOption("cl.cores", 2))
	## to make this reproducible
	clusterSetRNGStream(cl, 123)
	cd4.boot <- do.call(c, parLapply(cl, seq_len(mc), run1))
	boot.ci(cd4.boot, type = c("norm", "basic", "perc"),
	conf = 0.9, h = atanh, hinv = tanh)
	stopCluster(cl)
	}}