blob: 2d639a1d83e77d3707ea194d0690e3e6fd8dccea [file] [log] [blame]
# File src/library/stats/R/loess.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1998-2015 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
loess <-
function(formula, data, weights, subset, na.action, model = FALSE,
span = 0.75, enp.target, degree = 2L, parametric = FALSE,
drop.square = FALSE, normalize = TRUE,
family = c("gaussian", "symmetric"),
method = c("loess", "model.frame"),
control = loess.control(...), ...)
{
family <- match.arg(family)
method <- match.arg(method)
mf <- match.call(expand.dots=FALSE)
mf$model <- mf$span <- mf$enp.target <- mf$degree <-
mf$parametric <- mf$drop.square <- mf$normalize <- mf$family <-
mf$method <- mf$control <- mf$... <- NULL
## need stats:: for non-standard evaluation
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
if (match.arg(method) == "model.frame") return(mf)
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
w <- model.weights(mf)
if(is.null(w)) w <- rep_len(1, length(y))
nmx <- as.character(attr(mt, "variables"))[-(1L:2)]
x <- mf[, nmx, drop=FALSE]
if(any(sapply(x, is.factor))) stop("predictors must all be numeric")
x <- as.matrix(x)
D <- ncol(x)
nmx <- setNames(nm = colnames(x))
drop.square <- match(nmx, nmx[drop.square], 0L) > 0L
parametric <- match(nmx, nmx[parametric], 0L) > 0L
if(!match(degree, 0L:2L, 0L)) stop("'degree' must be 0, 1 or 2")
iterations <- if(family == "gaussian") 1L else control$iterations
if(!missing(enp.target))
if(!missing(span))
warning("both 'span' and 'enp.target' specified: 'span' will be used")
else { # White book p.321
tau <- switch(degree+1L, 1, D+1, (D+1)*(D+2)/2) - sum(drop.square)
span <- 1.2 * tau/enp.target
}
## Let's add sanity checks on control
if(!is.list(control) || !is.character(control$surface) ||
!is.character(control$statistics) || !is.character(control$trace.hat) ||
!is.numeric(control$cell) || !is.numeric(iterations))
stop("invalid 'control' argument")
fit <- simpleLoess(y, x, w, span, degree=degree, parametric=parametric,
drop.square=drop.square, normalize=normalize,
statistics=control$statistics, surface=control$surface,
cell=control$cell, iterations=iterations,
iterTrace=control$iterTrace, trace.hat=control$trace.hat)
fit$call <- match.call()
fit$terms <- mt
fit$xnames <- nmx
fit$x <- x
fit$y <- y
fit$weights <- w
if(model) fit$model <- mf
fit$na.action <- attr(mf, "na.action")
fit
}
loess.control <-
function(surface = c("interpolate", "direct"),
statistics = c("approximate", "exact", "none"),
trace.hat = c("exact", "approximate"),
cell = 0.2, iterations = 4L, iterTrace = FALSE, ...)
{
stopifnot(length(iterations) == 1L, !is.na(iterations), as.integer(iterations) > 0L,
length(iterTrace) == 1L, !is.na(iterTrace), as.integer(iterTrace) >= 0L)
list(surface = match.arg(surface),
statistics = match.arg(statistics),
trace.hat = match.arg(trace.hat),
cell=cell, iterations=iterations, iterTrace=iterTrace)
}
simpleLoess <- function(y, x, weights, span = 0.75, degree = 2L,
parametric = FALSE, drop.square = FALSE, normalize = TRUE,
statistics = "approximate", surface = "interpolate",
cell, iterations, ## iter. == 1 <==> "gaussian"
## tol = 1e-4, ## <- TODO stop iteration if converged := {rel.change <= tol}
iterTrace, trace.hat)
{
## loess_ translated to R.
D <- as.integer(NCOL(x))
if (is.na(D)) stop("invalid NCOL(X)")
if(D > 4) stop("only 1-4 predictors are allowed")
N <- as.integer(NROW(x))
if (is.na(N)) stop("invalid NROW(X)")
if(!N || !D) stop("invalid 'x'")
if(length(y) != N) stop("invalid 'y'")
x <- as.matrix(x)
storage.mode(x) <- "double"
storage.mode(y) <- "double"
storage.mode(weights) <- "double"
max.kd <- max(N, 200L)
robust <- rep_len(1, N)
if(normalize && D > 1L) {
trim <- ceiling(0.1 * N)
divisor <-
sqrt(apply(apply(x, 2L, sort)[seq(trim+1, N-trim), , drop = FALSE],
2L, var))
x <- x/rep(divisor, rep_len(N, D))
} else
divisor <- 1
sum.drop.sqr <- sum(drop.square)
sum.parametric <- sum(parametric)
nonparametric <- sum(!parametric)
order.parametric <- order(parametric)
x <- x[, order.parametric]
order.drop.sqr <- (2L - drop.square)[order.parametric]
if(degree == 1L && sum.drop.sqr)
stop("specified the square of a factor predictor to be dropped when degree = 1")
if(D == 1L && sum.drop.sqr)
stop("specified the square of a predictor to be dropped with only one numeric predictor")
if(sum.parametric == D) stop("specified parametric for all predictors")
if (length(span) != 1L) stop("invalid argument 'span'")
if (length(cell) != 1L) stop("invalid argument 'cell'")
if (length(degree) != 1L) stop("invalid argument 'degree'")
if(surface == "interpolate" && statistics == "approximate") # default
statistics <- if(trace.hat == "exact") "1.approx"
else "2.approx" # trace.hat == "approximate"
surf.stat <- paste(surface, statistics, sep = "/")
do.rob <- (iterations > 1L) # will do robustness iter.
if(!do.rob && iterTrace) {
warning(sprintf(gettext("iterTrace = %d is not obeyed since iterations = %d"),
iterTrace, iterations))
iterTrace <- FALSE
}
no.st <- (statistics == "none")
if(iterTrace) wRSS <- NA
for(j in seq_len(iterations)) {
no.st <- (statistics == "none")
z <- .C(C_loess_raw, # ../src/loessc.c
y, x,
if(no.st) 1 else weights,
if(no.st) weights * robust else 1,
D, N,
as.double(span),
as.integer(degree),
as.integer(nonparametric),
as.integer(order.drop.sqr),
as.integer(sum.drop.sqr),
as.double(span*cell),
as.character(surf.stat),
fitted.values = double(N),
parameter = integer(7L),
a = integer(max.kd),
xi = double(max.kd),
vert = double(2L*D),
vval = double((D+1L)*max.kd),
diagonal = double(N),
trL = double(1L),
delta1 = double(1L),
delta2 = double(1L),
as.integer(surf.stat == "interpolate/exact"))
fitted.residuals <- y - z$fitted.values
if(j < iterations) { ## update robustness weights,
## not for *last* iteration, so they remain consistent with 'fitted.values'
if(iterTrace) old.rob <- robust
robust <- .Fortran(C_lowesw, fitted.residuals, N,
robust = double(N), integer(N))$robust
}
if(j == 1) {
trace.hat.out <- z$trL
one.delta <- z$delta1
two.delta <- z$delta2
if(do.rob) {
statistics <- "none"
surf.stat <- paste(surface, statistics, sep = "/")
no.st <- TRUE
}
}
if(iterTrace) {
oSS <- wRSS
wRSS <- sum(weights * fitted.residuals^2)
del.SS <- abs(oSS-wRSS)/(if(wRSS == 0) 1 else wRSS)
d.rob.w <- if(j < iterations) ## have updated 'robust', see above
sum(abs(old.rob - robust)) / sum(robust) else NA
cat(sprintf(
"iter.%2d: wRSS=%#14.9g, rel. changes: (SS=%#9.4g, rob.wgts=%#9.4g)\n",
j, wRSS, del.SS, d.rob.w))
if(iterTrace >= 2 && j < iterations) {
cat("robustness weights:\n")
print(quantile(robust, probs=(0:8)/8), digits=3)
}
}
} ## end { iterations }
if(surface == "interpolate") {
pars <- setNames(z$parameter,
c("d", "n", "vc", "nc", "nv", "liv", "lv"))
enough <- (D + 1L) * pars[["nv"]]
fit.kd <- list(parameter=pars, a=z$a[1L:pars[4L]], xi=z$xi[1L:pars[4L]],
vert=z$vert, vval=z$vval[1L:enough])
}
if(do.rob) {
pseudovalues <- .Fortran(C_lowesp, # lowesp() in ../src/loessf.f
N,
as.double(y),
as.double(z$fitted.values),
as.double(weights), # 'pwgts'
as.double(robust), # 'rwgts'
integer(N),
pseudovalues = double(N))$pseudovalues
zz <- .C(C_loess_raw, pseudovalues,
x, weights, weights, D, N,
as.double(span),
as.integer(degree),
as.integer(nonparametric),
as.integer(order.drop.sqr),
as.integer(sum.drop.sqr),
as.double(span*cell),
as.character(surf.stat), ## == <surface>/none
fitted = double(N),
parameter = integer(7L),
a = integer(max.kd),
xi = double(max.kd),
vert = double(2L*D),
vval = double((D+1L)*max.kd),
diagonal = double(N),
trL = double(1L),
delta1 = double(1L),
delta2 = double(1L),
0L)[["fitted"]]
pseudo.resid <- pseudovalues - zz
}
sum.squares <- if(do.rob)
sum (weights * pseudo.resid^2)
else sum(weights * fitted.residuals^2)
enp <- one.delta + 2*trace.hat.out - N
s <- sqrt(sum.squares/one.delta)
## return
structure(
class = "loess",
list(n = N, fitted = z$fitted.values, residuals = fitted.residuals,
enp = enp, s = s, one.delta = one.delta, two.delta = two.delta,
trace.hat = trace.hat.out, divisor = divisor, robust = robust,
pars = list(span = span, degree = degree,
normalize = normalize,
parametric = parametric, drop.square = drop.square,
surface = surface, cell = cell,
family = if(iterations <= 1L) "gaussian" else "symmetric",
trace.hat = trace.hat, iterations = iterations),
kd = if(surface == "interpolate") fit.kd))
}
predict.loess <-
function(object, newdata = NULL, se = FALSE, na.action = na.pass, ...)
{
if(!inherits(object, "loess"))
stop("first argument must be a \"loess\" object")
if(is.null(newdata) && !se)
return(fitted(object))
op <- object$pars
res <- predLoess(object$y, object$x,
newx = if(is.null(newdata)) object$x
else if(is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), newdata,
na.action = na.action))
else as.matrix(newdata), # this case is undocumented
object$ s, object$ weights, object$ robust,
op$span, op$degree, op$normalize,
op$parametric, op$drop.square, op$surface,
op$cell, op$family,
object$ kd, object$ divisor, se = se)
if(!is.null(out.attrs <- attr(newdata, "out.attrs"))) { # expand.grid used
if(se) {
res$fit <- array(res$fit, out.attrs$dim, out.attrs$dimnames)
res$se.fit <- array(res$se.fit, out.attrs$dim, out.attrs$dimnames)
} else res <- array(res, out.attrs$dim, out.attrs$dimnames)
}
if(se)
res$df <- object$one.delta^2/object$two.delta
res
}
predLoess <-
function(y, x, newx, s, weights, robust, span, degree,
normalize, parametric, drop.square, surface, cell, family,
kd, divisor, se = FALSE)
{
## translation of pred_
D <- NCOL(x); N <- NROW(x); M <- NROW(newx)
x <- as.matrix(x); newx <- as.matrix(newx)
if(any(divisor != 1)) {
newx <- newx/rep(divisor, rep_len(M, D))
x <- x /rep(divisor, rep_len(N, D))
}
sum.drop.sqr <- sum(drop.square)
nonparametric <- sum(!parametric)
order.parametric <- order(parametric)
x <- x[, order.parametric, drop=FALSE]
x.evaluate <- newx[, order.parametric, drop=FALSE]
order.drop.sqr <- (2L - drop.square)[order.parametric]
storage.mode(x) <- "double"
storage.mode(y) <- "double"
if(surface == "direct") {
nas <- rowSums(is.na(newx)) > 0
fit <- rep_len(NA_real_, length(nas))
x.evaluate <- x.evaluate[!nas,, drop = FALSE]
M <- nrow(x.evaluate)
if(se) {
se.fit <- fit
z <- .C(C_loess_dfitse,
y,
x,
as.double(x.evaluate),
as.double(weights*robust),
as.double(robust),
as.integer(family =="gaussian"),
as.double(span),
as.integer(degree),
as.integer(nonparametric),
as.integer(order.drop.sqr),
as.integer(sum.drop.sqr),
as.integer(D),
as.integer(N),
as.integer(M),
fit = double(M),
L = double(N*M))[c("fit", "L")]
fit[!nas] <- z$fit
ses <- rowSums(matrix(z$L^2, M, N) / rep(weights, rep_len(M,N)))
se.fit[!nas] <- s * sqrt(ses)
} else {
fit[!nas] <- .C(C_loess_dfit,
y,
x,
as.double(x.evaluate),
as.double(weights*robust),
as.double(span),
as.integer(degree),
as.integer(nonparametric),
as.integer(order.drop.sqr),
as.integer(sum.drop.sqr),
as.integer(D),
as.integer(N),
as.integer(M),
fit = double(M))$fit
}
}
else { ## interpolate
## need to eliminate points outside original range - not in pred_
ranges <- apply(x, 2L, range)
inside <-
rowSums((x.evaluate <= rep(ranges[2L,], rep_len(M, D))) &
(x.evaluate >= rep(ranges[1L,], rep_len(M, D)))) == D
inside[is.na(inside)] <- FALSE
M1 <- sum(inside)
fit <- rep_len(NA_real_, M)
if(any(inside))
fit[inside] <- .C(C_loess_ifit,
as.integer(kd$parameter),
as.integer(kd$a), as.double(kd$xi),
as.double(kd$vert), as.double(kd$vval),
as.integer(M1),
as.double(x.evaluate[inside, ]),
fit = double(M1))$fit
if(se) {
se.fit <- rep_len(NA_real_, M)
if(any(inside)) {
L <- .C(C_loess_ise,
y,
x,
as.double(x.evaluate[inside, ]),
as.double(weights),
as.double(span),
as.integer(degree),
as.integer(nonparametric),
as.integer(order.drop.sqr),
as.integer(sum.drop.sqr),
as.double(span*cell),
as.integer(D),
as.integer(N),
as.integer(M1),
double(M1),
L = double(N*M1)
)$L
tmp <- rowSums(matrix(L^2, M1, N) / rep(weights, rep_len(M1,N)))
se.fit[inside] <- s * sqrt(tmp)
}
}
}
rn <- rownames(newx)
if(se) {
if(!is.null(rn)) names(fit) <- names(se.fit) <- rn
list(fit = fit, se.fit = drop(se.fit), residual.scale = s)
} else {
if(!is.null(rn)) names(fit) <- rn
fit
}
}
pointwise <- function(results, coverage)
{
fit <- results$fit
lim <- qt((1 - coverage)/2, results$df, lower.tail = FALSE) * results$se.fit
list(fit = fit, lower = fit - lim, upper = fit + lim)
}
print.loess <- function(x, digits = max(3L, getOption("digits") - 3L), ...)
{
if(!is.null(cl <- x$call)) {
cat("Call:\n")
dput(cl, control=NULL)
}
cat("\nNumber of Observations:", x$n, "\n")
cat("Equivalent Number of Parameters:", format(round(x$enp, 2L)), "\n")
cat("Residual",
if(x$pars$family == "gaussian")"Standard Error:" else "Scale Estimate:",
format(signif(x$s, digits)), "\n")
invisible(x)
}
summary.loess <- function(object, ...)
{
class(object) <- "summary.loess"
object
}
print.summary.loess <-
function(x, digits = max(3L, getOption("digits") - 3L), ...)
{
print.loess(x, digits=digits, ...)
cat("Trace of smoother matrix: ", format(round(x$trace.hat, 2L)),
" (",x$pars$trace.hat, ")\n", sep="")
cat("\nControl settings:\n")
cat(" span : ", format(x$pars$span), "\n")
cat(" degree : ", x$pars$degree, "\n")
cat(" family : ", x$pars$family)
if(x$pars$family != "gaussian")
cat(" iterations =", x$pars$iterations)
cat("\n surface : ", x$pars$surface)
if(x$pars$surface == "interpolate")
cat(" cell =", format(x$pars$cell))
cat("\n normalize: ", x$pars$normalize)
cat("\n parametric: ", x$pars$parametric)
cat("\ndrop.square: ", x$pars$drop.square, "\n")
invisible(x)
}
scatter.smooth <-
function(x, y = NULL, span = 2/3, degree = 1,
family = c("symmetric", "gaussian"),
xlab = NULL, ylab = NULL,
ylim = range(y, pred$y, na.rm = TRUE),
evaluation = 50, ..., lpars = list())
{
xlabel <- if (!missing(x)) deparse(substitute(x))
ylabel <- if (!missing(y)) deparse(substitute(y))
xy <- xy.coords(x, y, xlabel, ylabel)
x <- xy$x
y <- xy$y
xlab <- if (is.null(xlab)) xy$xlab else xlab
ylab <- if (is.null(ylab)) xy$ylab else ylab
pred <- loess.smooth(x, y, span, degree, family, evaluation)
plot(x, y, ylim = ylim, xlab = xlab, ylab = ylab, ...)
do.call(lines, c(list(pred), lpars))
invisible()
}
loess.smooth <-
function(x, y, span = 2/3, degree = 1, family = c("symmetric", "gaussian"),
evaluation = 50, ...)
{
notna <- !(is.na(x) | is.na(y))
x <- x[notna]; y <- y[notna]
new.x <- seq.int(min(x), max(x), length.out = evaluation)
control <- loess.control(...)
w <- rep_len(1, length(y))
family <- match.arg(family)
iterations <- if(family == "gaussian") 1L else control$iterations
kd <- simpleLoess(y, x, w, span, degree=degree, parametric=FALSE, drop.square=FALSE,
normalize=FALSE, statistics="none", surface="interpolate",
cell=control$cell, iterations=iterations,
iterTrace=control$iterTrace, trace.hat=control$trace.hat)$kd
z <- .C(C_loess_ifit,
as.integer(kd$parameter),
as.integer(kd$a), as.double(kd$xi),
as.double(kd$vert), as.double(kd$vval),
as.integer(evaluation),
as.double(new.x),
fit = double(evaluation))$fit
list(x = new.x, y = z)
}
anova.loess <- function(object, ...)
{
objects <- list(object, ...)
responses <- as.character(lapply(objects,
function(x) as.character(x$terms[[2L]])))
sameresp <- responses == responses[1L]
## calculate the number of models
if (!all(sameresp)) {
objects <- objects[sameresp]
warning(gettextf("models with response %s removed because response differs from model 1",
sQuote(deparse(responses[!sameresp]))),
domain = NA)
}
nmodels <- length(objects)
if(nmodels <= 1L) stop("no models to compare")
models <- as.character(lapply(objects, function(x) x$call))
descr <- paste0("Model ", format(1L:nmodels), ": ", models,
collapse = "\n")
## extract statistics
delta1 <- sapply(objects, function(x) x$one.delta)
delta2 <- sapply(objects, function(x) x$two.delta)
s <- sapply(objects, function(x) x$s)
enp <- sapply(objects, function(x) x$enp)
rss <- s^2*delta1
max.enp <- order(enp)[nmodels]
d1diff <- abs(diff(delta1))
dfnum <- c(d1diff^2/abs(diff(delta2)))
dfden <- (delta1^2/delta2)[max.enp]
Fvalue <- c(NA, (abs(diff(rss))/d1diff)/s[max.enp]^2)
pr <- pf(Fvalue, dfnum, dfden, lower.tail = FALSE)
ans <- data.frame(ENP = round(enp,2L), RSS = rss, "F-value" = Fvalue,
"Pr(>F)" = pr, check.names = FALSE)
attr(ans, "heading") <-
paste0(descr, "\n\n", "Analysis of Variance: denominator df ",
format(round(dfden, 2L)), "\n")
class(ans) <- c("anova", "data.frame")
ans
}