% src/library/stats/man/influence.measures.Rd - R - Git at Google

 % File src/library/stats/man/influence.measures.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 1995-2022 R Core Team
 % Distributed under GPL 2 or later

 \name{influence.measures}
 \title{Regression Deletion Diagnostics}

 \concept{studentized residuals}
 \concept{standardized residuals}
 \concept{Cook's distances}
 \concept{Covariance ratios}
 \concept{DFBETAs}
 \concept{DFFITs}
 \concept{PRESS}

 \alias{influence.measures}
 %\alias{print.infl}
 %\alias{summary.infl} <- To document: has 'digits' & return()s
 \alias{hat}
 \alias{hatvalues}
 \alias{hatvalues.lm}
 \alias{rstandard}
 \alias{rstandard.lm}
 \alias{rstandard.glm}
 \alias{rstudent}
 \alias{rstudent.lm}
 \alias{rstudent.glm}
 \alias{dfbeta}
 \alias{dfbeta.lm}
 \alias{dfbetas}
 \alias{dfbetas.lm}
 \alias{dffits}
 \alias{covratio}
 \alias{cooks.distance}
 \alias{cooks.distance.lm}
 \alias{cooks.distance.glm}
 \usage{
 influence.measures(model, infl = influence(model))

 rstandard(model, \dots)
 \method{rstandard}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
           sd = sqrt(deviance(model)/df.residual(model)),
           type = c("sd.1", "predictive"), \dots)
 \method{rstandard}{glm}(model, infl = influence(model, do.coef = FALSE),
           type = c("deviance", "pearson"), \dots)

 rstudent(model, \dots)
 \method{rstudent}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
          res = infl$wt.res, \dots)
 \method{rstudent}{glm}(model, infl = influence(model, do.coef = FALSE), \dots)

 dffits(model, infl = , res = )

 dfbeta(model, \dots)
 \method{dfbeta}{lm}(model, infl = lm.influence(model, do.coef = TRUE), \dots)

 dfbetas(model, \dots)
 \method{dfbetas}{lm}(model, infl = lm.influence(model, do.coef = TRUE), \dots)

 covratio(model, infl = lm.influence(model, do.coef = FALSE),
          res = weighted.residuals(model))

 cooks.distance(model, \dots)
 \method{cooks.distance}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
                res = weighted.residuals(model),
                sd = sqrt(deviance(model)/df.residual(model)),
                hat = infl$hat, \dots)
 \method{cooks.distance}{glm}(model, infl = influence(model, do.coef = FALSE),
                res = infl$pear.res,
                dispersion = summary(model)$dispersion,
                hat = infl$hat, \dots)

 hatvalues(model, \dots)
 \method{hatvalues}{lm}(model, infl = lm.influence(model, do.coef = FALSE), \dots)

 hat(x, intercept = TRUE)
 }
 \arguments{
   \item{model}{an \R object, typically returned by \code{\link{lm}} or
     \code{\link{glm}}.}
   \item{infl}{influence structure as returned by
     \code{\link{lm.influence}} or \code{\link{influence}} (the latter
     only for \code{influence.measures} and the \code{glm} methods of
     \code{rstandard}, \code{rstudent} and \code{cooks.distance}).}
   \item{res}{(possibly weighted) residuals, with proper default.}
   \item{sd}{standard deviation to use, see default.}
   \item{dispersion}{dispersion (for \code{\link{glm}} objects) to use,
     see default.}
   \item{hat}{hat values \eqn{H_{ii}}{H[i,i]}, see default.}
   \item{type}{type of residuals for \code{rstandard}, with different
     options and meanings for \code{lm} and \code{glm}.  Can be
     abbreviated.}

   \item{x}{the \eqn{X} or design matrix.}
   \item{intercept}{should an intercept column be prepended to \code{x}?}
   \item{\dots}{further arguments passed to or from other methods.}
 }
 \description{
   This suite of functions can be used to compute some of the regression
   (leave-one-out deletion) diagnostics for linear and generalized linear
   models discussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982),
   etc.
 }
 \details{
   The primary high-level function is \code{influence.measures}, which produces
   an object of class \code{"infl"} whose tabular display shows the DFBETAS for
   each model variable, DFFITS, covariance ratios, Cook's distances and
   the diagonal elements of the hat matrix.  Cases which are influential
   with respect to any of these measures are marked with an asterisk.

   The functions \code{dfbetas}, \code{dffits},
   \code{covratio} and \code{cooks.distance} provide direct access to the
   corresponding diagnostic quantities.  Functions \code{rstandard} and
   \code{rstudent} give the standardized and Studentized residuals
   respectively. (These re-normalize the residuals to have unit variance,
   using an overall and leave-one-out measure of the error variance
   respectively.)

   Note that for \emph{multivariate} \code{lm()} models (of class
   \code{"mlm"}), these functions return 3d arrays instead of matrices,
   or matrices instead of vectors.

   Values for generalized linear models are approximations, as described
   in Williams (1987) (except that Cook's distances are scaled as
   \eqn{F} rather than as chi-square values).  The approximations can be
   poor when some cases have large influence.

   The optional \code{infl}, \code{res} and \code{sd} arguments are there
   to encourage the use of these direct access functions, in situations
   where, e.g., the underlying basic influence measures (from
   \code{\link{lm.influence}} or the generic \code{\link{influence}}) are
   already available.

   Note that cases with \code{weights == 0} are \emph{dropped} from all
   these functions, but that if a linear model has been fitted with
   \code{na.action = na.exclude}, suitable values are filled in for the
   cases excluded during fitting.

   For linear models, \code{rstandard(*, type = "predictive")} provides
   leave-one-out cross validation residuals, and the \dQuote{PRESS}
   statistic (\bold{PRE}dictive \bold{S}um of \bold{S}quares, the same as
   the CV score) of model \code{model} is \preformatted{   PRESS <- sum(rstandard(model, type="pred")^2)}

   The function \code{hat()} exists mainly for S (version 2)
   compatibility; we recommend using \code{hatvalues()} instead.
 }
 \note{
   For \code{hatvalues}, \code{dfbeta}, and \code{dfbetas}, the method
   for linear models also works for generalized linear models.
 }
 \author{
   Several R core team members and John Fox, originally in his \file{car}
   package.
 }
 \references{
   Belsley, D. A., Kuh, E. and Welsch, R. E. (1980).
   \emph{Regression Diagnostics}.
   New York: Wiley.

   Cook, R. D. and Weisberg, S. (1982).
   \emph{Residuals and Influence in Regression}.
   London: Chapman and Hall.

   Williams, D. A. (1987).
   Generalized linear model diagnostics using the deviance and single
   case deletions.
   \emph{Applied Statistics}, \bold{36}, 181--191.
   \doi{10.2307/2347550}.

   Fox, J. (1997).
   \emph{Applied Regression, Linear Models, and Related Methods}.
   Sage.

   Fox, J. (2002).
   \emph{An R and S-Plus Companion to Applied Regression}.
   Sage Publ.

   Fox, J. and Weisberg, S. (2011).
   \emph{An R Companion to Applied Regression}, second edition.
   Sage Publ;
   \url{https://socialsciences.mcmaster.ca/jfox/Books/Companion/}.
 }
 \seealso{
   \code{\link{influence}} (containing \code{\link{lm.influence}}).

   \sQuote{\link{plotmath}} for the use of \code{hat} in plot annotation.
 }
 \examples{
 require(graphics)

 ## Analysis of the life-cycle savings data
 ## given in Belsley, Kuh and Welsch.
 lm.SR <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data = LifeCycleSavings)

 inflm.SR <- influence.measures(lm.SR)
 which(apply(inflm.SR$is.inf, 1, any))
 # which observations 'are' influential
 summary(inflm.SR) # only these
 \donttest{inflm.SR          # all}
 plot(rstudent(lm.SR) ~ hatvalues(lm.SR)) # recommended by some
 plot(lm.SR, which = 5) # an enhanced version of that via plot(<lm>)

 ## The 'infl' argument is not needed, but avoids recomputation:
 rs <- rstandard(lm.SR)
 iflSR <- influence(lm.SR)
 all.equal(rs, rstandard(lm.SR, infl = iflSR), tolerance = 1e-10)
 ## to "see" the larger values:
 1000 * round(dfbetas(lm.SR, infl = iflSR), 3)
 cat("PRESS :"); (PRESS <- sum( rstandard(lm.SR, type = "predictive")^2 ))
 stopifnot(all.equal(PRESS, sum( (residuals(lm.SR) / (1 - iflSR$hat))^2)))

 ## Show that "PRE-residuals"  ==  L.O.O. Crossvalidation (CV) errors:
 X <- model.matrix(lm.SR)
 y <- model.response(model.frame(lm.SR))
 ## Leave-one-out CV least-squares prediction errors (relatively fast)
 rCV <- vapply(seq_len(nrow(X)), function(i)
               y[i] - X[i,] \%*\% .lm.fit(X[-i,], y[-i])$coefficients,
               numeric(1))
 ## are the same as the *faster* rstandard(*, "pred") :
 stopifnot(all.equal(rCV, unname(rstandard(lm.SR, type = "predictive"))))


 ## Huber's data [Atkinson 1985]
 xh <- c(-4:0, 10)
 yh <- c(2.48, .73, -.04, -1.44, -1.32, 0)
 lmH <- lm(yh ~ xh)
 \donttest{summary(lmH)}
 im <- influence.measures(lmH)
 \donttest{ im }
 is.inf <- apply(im$is.inf, 1, any)
 plot(xh,yh, main = "Huber's data: L.S. line and influential obs.")
 abline(lmH); points(xh[is.inf], yh[is.inf], pch = 20, col = 2)

 ## Irwin's data [Williams 1987]
 xi <- 1:5
 yi <- c(0,2,14,19,30)    # number of mice responding to dose xi
 mi <- rep(40, 5)         # number of mice exposed
 glmI <- glm(cbind(yi, mi -yi) ~ xi, family = binomial)
 \donttest{summary(glmI)}
 signif(cooks.distance(glmI), 3)   # ~= Ci in Table 3, p.184
 imI <- influence.measures(glmI)
 \donttest{ imI }
 stopifnot(all.equal(imI$infmat[,"cook.d"],
           cooks.distance(glmI)))
 }
 \keyword{regression}
	% File src/library/stats/man/influence.measures.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 1995-2022 R Core Team
	% Distributed under GPL 2 or later

	\name{influence.measures}
	\title{Regression Deletion Diagnostics}

	\concept{studentized residuals}
	\concept{standardized residuals}
	\concept{Cook's distances}
	\concept{Covariance ratios}
	\concept{DFBETAs}
	\concept{DFFITs}
	\concept{PRESS}

	\alias{influence.measures}
	%\alias{print.infl}
	%\alias{summary.infl} <- To document: has 'digits' & return()s
	\alias{hat}
	\alias{hatvalues}
	\alias{hatvalues.lm}
	\alias{rstandard}
	\alias{rstandard.lm}
	\alias{rstandard.glm}
	\alias{rstudent}
	\alias{rstudent.lm}
	\alias{rstudent.glm}
	\alias{dfbeta}
	\alias{dfbeta.lm}
	\alias{dfbetas}
	\alias{dfbetas.lm}
	\alias{dffits}
	\alias{covratio}
	\alias{cooks.distance}
	\alias{cooks.distance.lm}
	\alias{cooks.distance.glm}
	\usage{
	influence.measures(model, infl = influence(model))

	rstandard(model, \dots)
	\method{rstandard}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
	sd = sqrt(deviance(model)/df.residual(model)),
	type = c("sd.1", "predictive"), \dots)
	\method{rstandard}{glm}(model, infl = influence(model, do.coef = FALSE),
	type = c("deviance", "pearson"), \dots)

	rstudent(model, \dots)
	\method{rstudent}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
	res = infl$wt.res, \dots)
	\method{rstudent}{glm}(model, infl = influence(model, do.coef = FALSE), \dots)

	dffits(model, infl = , res = )

	dfbeta(model, \dots)
	\method{dfbeta}{lm}(model, infl = lm.influence(model, do.coef = TRUE), \dots)

	dfbetas(model, \dots)
	\method{dfbetas}{lm}(model, infl = lm.influence(model, do.coef = TRUE), \dots)

	covratio(model, infl = lm.influence(model, do.coef = FALSE),
	res = weighted.residuals(model))

	cooks.distance(model, \dots)
	\method{cooks.distance}{lm}(model, infl = lm.influence(model, do.coef = FALSE),
	res = weighted.residuals(model),
	sd = sqrt(deviance(model)/df.residual(model)),
	hat = infl$hat, \dots)
	\method{cooks.distance}{glm}(model, infl = influence(model, do.coef = FALSE),
	res = infl$pear.res,
	dispersion = summary(model)$dispersion,
	hat = infl$hat, \dots)

	hatvalues(model, \dots)
	\method{hatvalues}{lm}(model, infl = lm.influence(model, do.coef = FALSE), \dots)

	hat(x, intercept = TRUE)
	}
	\arguments{
	\item{model}{an \R object, typically returned by \code{\link{lm}} or
	\code{\link{glm}}.}
	\item{infl}{influence structure as returned by
	\code{\link{lm.influence}} or \code{\link{influence}} (the latter
	only for \code{influence.measures} and the \code{glm} methods of
	\code{rstandard}, \code{rstudent} and \code{cooks.distance}).}
	\item{res}{(possibly weighted) residuals, with proper default.}
	\item{sd}{standard deviation to use, see default.}
	\item{dispersion}{dispersion (for \code{\link{glm}} objects) to use,
	see default.}
	\item{hat}{hat values \eqn{H_{ii}}{H[i,i]}, see default.}
	\item{type}{type of residuals for \code{rstandard}, with different
	options and meanings for \code{lm} and \code{glm}. Can be
	abbreviated.}

	\item{x}{the \eqn{X} or design matrix.}
	\item{intercept}{should an intercept column be prepended to \code{x}?}
	\item{\dots}{further arguments passed to or from other methods.}
	}
	\description{
	This suite of functions can be used to compute some of the regression
	(leave-one-out deletion) diagnostics for linear and generalized linear
	models discussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982),
	etc.
	}
	\details{
	The primary high-level function is \code{influence.measures}, which produces
	an object of class \code{"infl"} whose tabular display shows the DFBETAS for
	each model variable, DFFITS, covariance ratios, Cook's distances and
	the diagonal elements of the hat matrix. Cases which are influential
	with respect to any of these measures are marked with an asterisk.

	The functions \code{dfbetas}, \code{dffits},
	\code{covratio} and \code{cooks.distance} provide direct access to the
	corresponding diagnostic quantities. Functions \code{rstandard} and
	\code{rstudent} give the standardized and Studentized residuals
	respectively. (These re-normalize the residuals to have unit variance,
	using an overall and leave-one-out measure of the error variance
	respectively.)

	Note that for \emph{multivariate} \code{lm()} models (of class
	\code{"mlm"}), these functions return 3d arrays instead of matrices,
	or matrices instead of vectors.

	Values for generalized linear models are approximations, as described
	in Williams (1987) (except that Cook's distances are scaled as
	\eqn{F} rather than as chi-square values). The approximations can be
	poor when some cases have large influence.

	The optional \code{infl}, \code{res} and \code{sd} arguments are there
	to encourage the use of these direct access functions, in situations
	where, e.g., the underlying basic influence measures (from
	\code{\link{lm.influence}} or the generic \code{\link{influence}}) are
	already available.

	Note that cases with \code{weights == 0} are \emph{dropped} from all
	these functions, but that if a linear model has been fitted with
	\code{na.action = na.exclude}, suitable values are filled in for the
	cases excluded during fitting.

	For linear models, \code{rstandard(*, type = "predictive")} provides
	leave-one-out cross validation residuals, and the \dQuote{PRESS}
	statistic (\bold{PRE}dictive \bold{S}um of \bold{S}quares, the same as
	the CV score) of model \code{model} is \preformatted{ PRESS <- sum(rstandard(model, type="pred")^2)}

	The function \code{hat()} exists mainly for S (version 2)
	compatibility; we recommend using \code{hatvalues()} instead.
	}
	\note{
	For \code{hatvalues}, \code{dfbeta}, and \code{dfbetas}, the method
	for linear models also works for generalized linear models.
	}
	\author{
	Several R core team members and John Fox, originally in his \file{car}
	package.
	}
	\references{
	Belsley, D. A., Kuh, E. and Welsch, R. E. (1980).
	\emph{Regression Diagnostics}.
	New York: Wiley.

	Cook, R. D. and Weisberg, S. (1982).
	\emph{Residuals and Influence in Regression}.
	London: Chapman and Hall.

	Williams, D. A. (1987).
	Generalized linear model diagnostics using the deviance and single
	case deletions.
	\emph{Applied Statistics}, \bold{36}, 181--191.
	\doi{10.2307/2347550}.

	Fox, J. (1997).
	\emph{Applied Regression, Linear Models, and Related Methods}.
	Sage.

	Fox, J. (2002).
	\emph{An R and S-Plus Companion to Applied Regression}.
	Sage Publ.

	Fox, J. and Weisberg, S. (2011).
	\emph{An R Companion to Applied Regression}, second edition.
	Sage Publ;
	\url{https://socialsciences.mcmaster.ca/jfox/Books/Companion/}.
	}
	\seealso{
	\code{\link{influence}} (containing \code{\link{lm.influence}}).

	\sQuote{\link{plotmath}} for the use of \code{hat} in plot annotation.
	}
	\examples{
	require(graphics)

	## Analysis of the life-cycle savings data
	## given in Belsley, Kuh and Welsch.
	lm.SR <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data = LifeCycleSavings)

	inflm.SR <- influence.measures(lm.SR)
	which(apply(inflm.SR$is.inf, 1, any))
	# which observations 'are' influential
	summary(inflm.SR) # only these
	\donttest{inflm.SR # all}
	plot(rstudent(lm.SR) ~ hatvalues(lm.SR)) # recommended by some
	plot(lm.SR, which = 5) # an enhanced version of that via plot(<lm>)

	## The 'infl' argument is not needed, but avoids recomputation:
	rs <- rstandard(lm.SR)
	iflSR <- influence(lm.SR)
	all.equal(rs, rstandard(lm.SR, infl = iflSR), tolerance = 1e-10)
	## to "see" the larger values:
	1000 * round(dfbetas(lm.SR, infl = iflSR), 3)
	cat("PRESS :"); (PRESS <- sum( rstandard(lm.SR, type = "predictive")^2 ))
	stopifnot(all.equal(PRESS, sum( (residuals(lm.SR) / (1 - iflSR$hat))^2)))

	## Show that "PRE-residuals" == L.O.O. Crossvalidation (CV) errors:
	X <- model.matrix(lm.SR)
	y <- model.response(model.frame(lm.SR))
	## Leave-one-out CV least-squares prediction errors (relatively fast)
	rCV <- vapply(seq_len(nrow(X)), function(i)
	y[i] - X[i,] \%*\% .lm.fit(X[-i,], y[-i])$coefficients,
	numeric(1))
	## are the same as the faster rstandard(*, "pred") :
	stopifnot(all.equal(rCV, unname(rstandard(lm.SR, type = "predictive"))))


	## Huber's data [Atkinson 1985]
	xh <- c(-4:0, 10)
	yh <- c(2.48, .73, -.04, -1.44, -1.32, 0)
	lmH <- lm(yh ~ xh)
	\donttest{summary(lmH)}
	im <- influence.measures(lmH)
	\donttest{ im }
	is.inf <- apply(im$is.inf, 1, any)
	plot(xh,yh, main = "Huber's data: L.S. line and influential obs.")
	abline(lmH); points(xh[is.inf], yh[is.inf], pch = 20, col = 2)

	## Irwin's data [Williams 1987]
	xi <- 1:5
	yi <- c(0,2,14,19,30) # number of mice responding to dose xi
	mi <- rep(40, 5) # number of mice exposed
	glmI <- glm(cbind(yi, mi -yi) ~ xi, family = binomial)
	\donttest{summary(glmI)}
	signif(cooks.distance(glmI), 3) # ~= Ci in Table 3, p.184
	imI <- influence.measures(glmI)
	\donttest{ imI }
	stopifnot(all.equal(imI$infmat[,"cook.d"],
	cooks.distance(glmI)))
	}
	\keyword{regression}