blob: 360f6b2b7c9b468c0433d5579c84712fc2067bf0 [file] [log] [blame]
# File src/library/stats/R/lm.influence.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2018 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
### "lm" *and* "glm" leave-one-out influence measures
## this is mainly for back-compatibility (from "lsfit" time) -- use hatvalues()!
hat <- function(x, intercept = TRUE)
{
if(is.qr(x)) n <- nrow(x$qr)
else {
if(intercept) x <- cbind(1, x)
n <- nrow(x)
x <- qr(x)
}
rowSums(qr.qy(x, diag(1, nrow = n, ncol = x$rank))^2)
}
## see PR#7961, https://stat.ethz.ch/pipermail/r-devel/2011-January/059642.html
weighted.residuals <- function(obj, drop0 = TRUE)
{
w <- weights(obj)
r <- residuals(obj, type="deviance")
if(drop0 && !is.null(w)) {
if(is.matrix(r)) r[w != 0, , drop = FALSE] # e.g. mlm fit
else r[w != 0]
} else r
}
lm.influence <- function (model, do.coef = TRUE)
{
wt.res <- weighted.residuals(model)
e <- na.omit(wt.res)
is.mlm <- is.matrix(e) # n x q matrix in the multivariate lm case
if (model$rank == 0) {
n <- length(wt.res) # drops 0 wt, may drop NAs
sigma <- sqrt(deviance(model)/df.residual(model))
res <- list(hat = rep(0, n), coefficients = matrix(0, n, 0),
sigma = rep(sigma, n))
} else {
## if we have a point with hat = 1, the corresponding e should be
## exactly zero. Protect against returning Inf by forcing this
e[abs(e) < 100 * .Machine$double.eps * median(abs(e))] <- 0
mqr <- qr.lm(model)
n <- as.integer(nrow(mqr$qr))
if (is.na(n)) stop("invalid model QR matrix")
## in na.exclude case, omit NAs; also drop 0-weight cases
if(NROW(e) != n)
stop("non-NA residual length does not match cases used in fitting")
do.coef <- as.logical(do.coef)
tol <- 10 * .Machine$double.eps
res <- .Call(C_influence, mqr, do.coef, e, tol)
drop1d <- function(a) { # more cautious variant of drop(.)
d <- dim(a)
if(length(d) == 3L && d[[3L]] == 1L)
dim(a) <- d[-3L]
a
}
if(is.null(model$na.action)) {
if(!is.mlm) { ## drop the 'q=1' array extent (from C)
res$sigma <- drop(res$sigma)
if(do.coef)
res$coefficients <- drop1d(res$coefficients)
}
} else {
hat <- naresid(model$na.action, res$hat)
hat[is.na(hat)] <- 0 # omitted cases have 0 leverage
res$hat <- hat
if(do.coef) {
coefficients <- naresid(model$na.action, res$coefficients)
coefficients[is.na(coefficients)] <- 0 # omitted cases have 0 change
res$coefficients <- if(is.mlm) coefficients else drop1d(coefficients)
}
sigma <- naresid(model$na.action, res$sigma)
sigma[is.na(sigma)] <- sqrt(deviance(model)/df.residual(model))
res$sigma <- if(is.mlm) sigma else drop(sigma)
}
}
res$wt.res <- naresid(model$na.action, e)
res$hat[res$hat > 1 - 10*.Machine$double.eps] <- 1 # force 1
names(res$hat) <- names(res$sigma) <- names(res$wt.res)
if(do.coef) {
cf <- coef(model)
if(is.mlm) { # coef is 3d array
dnr <- dimnames(res$wt.res)
dimnames(res$coefficients) <- list(
dnr[[1L]],
rownames(cf)[!apply(cf, 1L, anyNA)],
dnr[[2L]])
} else {
dimnames(res$coefficients) <- list(names(res$wt.res),
names(cf)[!is.na(cf)])
}
}
res
}
## The following is adapted from John Fox's "car" :
influence <- function(model, ...) UseMethod("influence")
influence.lm <- function(model, do.coef = TRUE, ...)
lm.influence(model, do.coef = do.coef, ...)
influence.glm <- function(model, do.coef = TRUE, ...) {
res <- lm.influence(model, do.coef = do.coef, ...)
pRes <- na.omit(residuals(model, type = "pearson"))[model$prior.weights != 0]
pRes <- naresid(model$na.action, pRes)
names(res)[names(res) == "wt.res"] <- "dev.res"
c(res, list(pear.res = pRes))
}
hatvalues <- function(model, ...) UseMethod("hatvalues")
hatvalues.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), ...) infl$hat
rstandard <- function(model, ...) UseMethod("rstandard")
rstandard.lm <- function(model, infl = lm.influence(model, do.coef=FALSE),
sd = sqrt(deviance(model)/df.residual(model)),
type = c("sd.1", "predictive"), ...)
{
type <- match.arg(type)
res <- infl$wt.res / switch(type,
"sd.1" = c(outer(sqrt(1 - infl$hat), sd)),
"predictive" = 1 - infl$hat)
res[is.infinite(res)] <- NaN
res
}
### New version from Brett Presnell, March 2011
### Slightly modified (dispersion bit) by pd
rstandard.glm <-
function(model,
infl=influence(model, do.coef=FALSE),
type=c("deviance","pearson"), ...)
{
type <- match.arg(type)
res <- switch(type, pearson = infl$pear.res, infl$dev.res)
res <- res/sqrt(summary(model)$dispersion * (1 - infl$hat))
res[is.infinite(res)] <- NaN
res
}
rstudent <- function(model, ...) UseMethod("rstudent")
rstudent.lm <- function(model, infl = lm.influence(model, do.coef=FALSE),
res = infl$wt.res, ...)
{
res <- res / (infl$sigma * sqrt(1 - infl$hat))
res[is.infinite(res)] <- NaN
res
}
rstudent.glm <- function(model, infl = influence(model, do.coef=FALSE), ...)
{
r <- infl$dev.res
r <- sign(r) * sqrt(r^2 + (infl$hat * infl$pear.res^2)/(1 - infl$hat))
r[is.infinite(r)] <- NaN
if (any(family(model)$family == c("binomial", "poisson")))
r else r/infl$sigma
}
### FIXME for glm (see above) ?!?
dffits <- function(model, infl = lm.influence(model, do.coef=FALSE),
res = weighted.residuals(model))
{
res <- res * sqrt(infl$hat)/(infl$sigma*(1-infl$hat))
res[is.infinite(res)] <- NaN
res
}
dfbeta <- function(model, ...) UseMethod("dfbeta")
dfbeta.lm <- function(model, infl = lm.influence(model, do.coef=TRUE), ...)
{
## for lm & glm
b <- infl$coefficients
mlm <- is.matrix(wr <- infl$wt.res)
## is this needed -- check!?
if(!mlm) dimnames(b) <- list(names(wr), variable.names(model))
b
}
dfbetas <- function(model, ...) UseMethod("dfbetas")
dfbetas.lm <- function (model, infl = lm.influence(model, do.coef=TRUE), ...)
{
## for lm & glm
qrm <- qr(model)
xxi <- chol2inv(qrm$qr, qrm$rank)
db <- dfbeta(model, infl)
if(length(dim(db)) == 3L) db <- aperm(db, c(1L,3:2))
db / outer(infl$sigma, sqrt(diag(xxi)))
}
covratio <- function(model, infl = lm.influence(model, do.coef=FALSE),
res = weighted.residuals(model))
{
n <- nrow(qr.lm(model)$qr)
p <- model$rank
omh <- 1-infl$hat
e.star <- res/(infl$sigma*sqrt(omh))
e.star[is.infinite(e.star)] <- NaN
1/(omh*(((n - p - 1)+e.star^2)/(n - p))^p)
}
cooks.distance <- function(model, ...) UseMethod("cooks.distance")
## Used in plot.lm(); allow passing of known parts; `infl' used only via `hat'
cooks.distance.lm <-
function(model, infl = lm.influence(model, do.coef=FALSE),
res = weighted.residuals(model),
sd = sqrt(deviance(model)/df.residual(model)),
hat = infl$hat, ...)
{
p <- model$rank
res <- ((res/c(outer(1 - hat, sd)))^2 * hat)/p
res[is.infinite(res)] <- NaN
res
}
cooks.distance.glm <-
function(model, infl = influence(model, do.coef=FALSE),
res = infl$pear.res,
dispersion = summary(model)$dispersion, hat = infl$hat, ...)
{
p <- model$rank
res <- (res/(1-hat))^2 * hat/(dispersion* p)
res[is.infinite(res)] <- NaN
res
}
influence.measures <- function(model, infl = influence(model))
{
is.influential <- function(infmat, n)# n == sum(h > 0) [!]
{
## Argument is result of using influence.measures
d <- dim(infmat)
k <- d[[length(d)]] - 4L
if(n <= k)
stop("too few cases i with h_ii > 0), n < k")
absmat <- abs(infmat)
r <-
if(is.matrix(infmat)) { # usual non-mlm case
## a matrix of logicals structured like the argument
cbind(absmat[, 1L:k] > 1, # |dfbetas| > 1
absmat[, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > ..
abs(1 - infmat[, k + 2]) > (3*k)/(n - k),# |1-cov.r| >..
pf(infmat[, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5
infmat[, k + 4] > (3 * k)/n) # hat > 3k/n
} else { # is.mlm: a 3d-array, not a matrix
## a 3d-array of logicals structured like the argument
c(absmat[,, 1L:k] > 1, # |dfbetas| > 1
absmat[,, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > ..
abs(1 - infmat[,, k + 2]) > (3*k)/(n - k),# |1-cov.r| >..
pf(infmat[,, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5
infmat[,, k + 4] > (3 * k)/n) # hat > 3k/n
}
attributes(r) <- attributes(infmat) # dim, dimnames, ..
r
}
p <- model$rank
e <- weighted.residuals(model)
s <- sqrt(sum(e^2, na.rm=TRUE)/df.residual(model))
mqr <- qr.lm(model)
xxi <- chol2inv(mqr$qr, mqr$rank)
si <- infl$sigma
h <- infl$hat
is.mlm <- is.matrix(e)
cf <- if(is.mlm) aperm(infl$coefficients, c(1L,3:2)) else infl$coefficients
dfbetas <- cf / outer(infl$sigma, sqrt(diag(xxi)))
vn <- variable.names(model); vn[vn == "(Intercept)"] <- "1_"
dimnames(dfbetas)[[length(dim(dfbetas))]] <- paste0("dfb.", abbreviate(vn))
## Compatible to dffits():
dffits <- e*sqrt(h)/(si*(1-h))
if(any(ii <- is.infinite(dffits))) dffits[ii] <- NaN
cov.ratio <- (si/s)^(2 * p)/(1 - h)
cooks.d <-
if(inherits(model, "glm"))
(infl$pear.res/(1-h))^2 * h/(summary(model)$dispersion * p)
else # lm (incl "mlm")
((e/(s * (1 - h)))^2 * h)/p
infmat <-
if(is.mlm) { # a 3d-array, not a matrix
dns <- dimnames(dfbetas)
dns[[3]] <- c(dns[[3]], "dffit", "cov.r", "cook.d", "hat")
a <- array(dfbetas, dim = dim(dfbetas) + c(0,0, 3+1),
dimnames = dns)
a[,, "dffit"] <- dffits
a[,, "cov.r"] <- cov.ratio
a[,,"cook.d"] <- cooks.d
a[,, "hat" ] <- h
a
} else {
cbind(dfbetas, dffit = dffits, cov.r = cov.ratio,
cook.d = cooks.d, hat = h)
}
infmat[is.infinite(infmat)] <- NaN
is.inf <- is.influential(infmat, sum(h > 0))
ans <- list(infmat = infmat, is.inf = is.inf, call = model$call)
class(ans) <- "infl"
ans
}
print.infl <- function(x, digits = max(3L, getOption("digits") - 4L), ...)
{
## `x' : as the result of influence.measures(.)
cat("Influence measures of\n\t", deparse(x$call),":\n\n")
is.star <- apply(x$is.inf, 1L, any, na.rm = TRUE)
print(data.frame(x$infmat,
inf = ifelse(is.star, "*", " ")),
digits = digits, ...)
invisible(x)
}
summary.infl <-
function(object, digits = max(2L, getOption("digits") - 5L), ...)
{
## object must be as the result of influence.measures(.)
is.inf <- object$is.inf
isMLM <- length(dim(is.inf)) == 3L
## will have NaN values from any hat=1 rows.
is.inf[is.na(is.inf)] <- FALSE
is.star <- apply(is.inf, 1L, any)
cat("Potentially influential observations of\n\t",
deparse(object$call),":\n")
if(any(is.star)) {
if(isMLM) {
is.inf <- is.inf [is.star,,]
imat <- object $ infmat[is.star,, , drop = FALSE]
} else { # regular "lm"
is.inf <- is.inf [is.star, ]
imat <- object $ infmat[is.star, , drop = FALSE]
}
if(is.null(rownam <- dimnames(object $ infmat)[[1L]]))
rownam <- format(seq(is.star))
dimnames(imat)[[1L]] <- rownam[is.star]
chmat <- format(round(imat, digits = digits))
cat("\n")
print(array(paste0(chmat, c("", "_*")[1L + is.inf]),
dimnames = dimnames(imat), dim = dim(imat)),
quote = FALSE)
invisible(imat)
} else {
cat("NONE\n")
numeric()
}
}