| % File src/library/stats/man/prcomp.Rd |
| % Part of the R package, https://www.R-project.org |
| % Copyright 1995-2016 R Core Team |
| % Distributed under GPL 2 or later |
| |
| \name{prcomp} |
| \title{Principal Components Analysis} |
| \alias{prcomp} |
| \alias{prcomp.formula} |
| \alias{prcomp.default} |
| \alias{plot.prcomp} |
| \alias{predict.prcomp} |
| \alias{print.prcomp} |
| \alias{summary.prcomp} |
| \alias{print.summary.prcomp} |
| \concept{PCA} |
| \description{ |
| Performs a principal components analysis on the given data matrix |
| and returns the results as an object of class \code{prcomp}.} |
| \usage{ |
| prcomp(x, \dots) |
| |
| \method{prcomp}{formula}(formula, data = NULL, subset, na.action, \dots) |
| |
| \method{prcomp}{default}(x, retx = TRUE, center = TRUE, scale. = FALSE, |
| tol = NULL, rank. = NULL, \dots) |
| |
| \method{predict}{prcomp}(object, newdata, \dots) |
| } |
| \arguments{ |
| \item{formula}{a formula with no response variable, referring only to |
| numeric variables.} |
| \item{data}{an optional data frame (or similar: see |
| \code{\link{model.frame}}) containing the variables in the |
| formula \code{formula}. By default the variables are taken from |
| \code{environment(formula)}.} |
| \item{subset}{an optional vector used to select rows (observations) of the |
| data matrix \code{x}.} |
| \item{na.action}{a function which indicates what should happen |
| when the data contain \code{NA}s. The default is set by |
| the \code{na.action} setting of \code{\link{options}}, and is |
| \code{\link{na.fail}} if that is unset. The \sQuote{factory-fresh} |
| default is \code{\link{na.omit}}.} |
| \item{\dots}{arguments passed to or from other methods. If \code{x} is |
| a formula one might specify \code{scale.} or \code{tol}.} |
| \item{x}{a numeric or complex matrix (or data frame) which provides |
| the data for the principal components analysis.} |
| \item{retx}{a logical value indicating whether the rotated variables |
| should be returned.} |
| \item{center}{a logical value indicating whether the variables |
| should be shifted to be zero centered. Alternatively, a vector of |
| length equal to the number of columns of \code{x} can be supplied. |
| The value is passed to \code{scale}.} |
| \item{scale.}{a logical value indicating whether the variables should |
| be scaled to have unit variance before the analysis takes |
| place. The default is \code{FALSE} for consistency with S, but |
| in general scaling is advisable. Alternatively, a vector of length |
| equal to the number of columns of \code{x} can be supplied. The |
| value is passed to \code{\link{scale}}.} |
| \item{tol}{a value indicating the magnitude below which components |
| should be omitted. (Components are omitted if their |
| standard deviations are less than or equal to \code{tol} times the |
| standard deviation of the first component.) With the default null |
| setting, no components are omitted (unless \code{rank.} is specified |
| less than \code{min(dim(x))}). Other settings for \code{tol} could be |
| \code{tol = 0} or \code{tol = sqrt(.Machine$double.eps)}, which |
| would omit essentially constant components.} |
| \item{rank.}{optionally, a number specifying the maximal rank, i.e., |
| maximal number of principal components to be used. Can be set as |
| alternative or in addition to \code{tol}, useful notably when the |
| desired rank is considerably smaller than the dimensions of the matrix.} |
| |
| \item{object}{object of class inheriting from \code{"prcomp"}} |
| \item{newdata}{An optional data frame or matrix in which to look for |
| variables with which to predict. If omitted, the scores are used. |
| If the original fit used a formula or a data frame or a matrix with |
| column names, \code{newdata} must contain columns with the same |
| names. Otherwise it must contain the same number of columns, to be |
| used in the same order. |
| } |
| } |
| \value{ |
| \code{prcomp} returns a list with class \code{"prcomp"} |
| containing the following components: |
| \item{sdev}{the standard deviations of the principal components |
| (i.e., the square roots of the eigenvalues of the |
| covariance/correlation matrix, though the calculation |
| is actually done with the singular values of the data matrix).} |
| \item{rotation}{the matrix of variable loadings (i.e., a matrix |
| whose columns contain the eigenvectors). The function |
| \code{princomp} returns this in the element \code{loadings}.} |
| \item{x}{if \code{retx} is true the value of the rotated data (the |
| centred (and scaled if requested) data multiplied by the |
| \code{rotation} matrix) is returned. Hence, \code{cov(x)} is the |
| diagonal matrix \code{diag(sdev^2)}. For the formula method, |
| \code{\link{napredict}()} is applied to handle the treatment of values |
| omitted by the \code{na.action}.} |
| \item{center, scale}{the centering and scaling used, or \code{FALSE}.} |
| } |
| \details{ |
| The calculation is done by a singular value decomposition of the |
| (centered and possibly scaled) data matrix, not by using |
| \code{eigen} on the covariance matrix. This |
| is generally the preferred method for numerical accuracy. The |
| \code{print} method for these objects prints the results in a nice |
| format and the \code{plot} method produces a scree plot. |
| |
| Unlike \code{\link{princomp}}, variances are computed with the usual |
| divisor \eqn{N - 1}. |
| |
| Note that \code{scale. = TRUE} cannot be used if there are zero or |
| constant (for \code{center = TRUE}) variables. |
| } |
| \note{ |
| The signs of the columns of the rotation matrix are arbitrary, and |
| so may differ between different programs for PCA, and even between |
| different builds of \R. |
| } |
| \references{ |
| Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) |
| \emph{The New S Language}. |
| Wadsworth & Brooks/Cole. |
| |
| Mardia, K. V., J. T. Kent, and J. M. Bibby (1979) |
| \emph{Multivariate Analysis}, London: Academic Press. |
| |
| Venables, W. N. and B. D. Ripley (2002) |
| \emph{Modern Applied Statistics with S}, Springer-Verlag. |
| } |
| \seealso{ |
| \code{\link{biplot.prcomp}}, \code{\link{screeplot}}, |
| \code{\link{princomp}}, \code{\link{cor}}, \code{\link{cov}}, |
| \code{\link{svd}}, \code{\link{eigen}}. |
| } |
| \examples{ |
| C <- chol(S <- toeplitz(.9 ^ (0:31))) # Cov.matrix and its root |
| all.equal(S, crossprod(C)) |
| set.seed(17) |
| X <- matrix(rnorm(32000), 1000, 32) |
| Z <- X \%*\% C ## ==> cov(Z) ~= C'C = S |
| all.equal(cov(Z), S, tolerance = 0.08) |
| pZ <- prcomp(Z, tol = 0.1) |
| summary(pZ) # only ~14 PCs (out of 32) |
| ## or choose only 3 PCs more directly: |
| pz3 <- prcomp(Z, rank. = 3) |
| summary(pz3) # same numbers as the first 3 above |
| stopifnot(ncol(pZ$rotation) == 14, ncol(pz3$rotation) == 3, |
| all.equal(pz3$sdev, pZ$sdev, tolerance = 1e-15)) # exactly equal typically |
| |
| \donttest{## signs are random |
| require(graphics) |
| ## the variances of the variables in the |
| ## USArrests data vary by orders of magnitude, so scaling is appropriate |
| prcomp(USArrests) # inappropriate |
| prcomp(USArrests, scale. = TRUE) |
| prcomp(~ Murder + Assault + Rape, data = USArrests, scale. = TRUE) |
| plot(prcomp(USArrests)) |
| summary(prcomp(USArrests, scale. = TRUE)) |
| biplot(prcomp(USArrests, scale. = TRUE)) |
| } |
| } |
| \keyword{multivariate} |