src/library/stats/man/runmed.Rd - R - Git at Google

 % File src/library/stats/man/runmed.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 1995-2019 R Core Team
 % Distributed under GPL 2 or later

 \name{runmed}
 \title{Running Medians -- Robust Scatter Plot Smoothing}
 \alias{runmed}
 \encoding{UTF-8}
 \description{
   Compute running medians of odd span.  This is the \sQuote{most robust}
   scatter plot smoothing possible.  For efficiency (and historical
   reason), you can use one of two different algorithms giving identical
   results.
 }
 \usage{
 runmed(x, k, endrule = c("median", "keep", "constant"),
        algorithm = NULL,
        na.action = c("+Big_alternate", "-Big_alternate", "na.omit", "fail"),
        print.level = 0)
 }
 \arguments{
   \item{x}{numeric vector, the \sQuote{dependent} variable to be
     smoothed.}
   \item{k}{integer width of median window; must be odd.  Turlach had a
     default of \code{k <- 1 + 2 * min((n-1)\%/\% 2, ceiling(0.1*n))}.
     Use \code{k = 3} for \sQuote{minimal} robust smoothing eliminating
     isolated outliers.}
   \item{endrule}{character string indicating how the values at the
     beginning and the end (of the data) should be treated.
     Can be abbreviated.  Possible values are:
     \describe{
       \item{\code{"keep"}}{keeps the first and last \eqn{k_2}{k2} values
         at both ends, where \eqn{k_2}{k2} is the half-bandwidth
         \code{k2 = k \%/\% 2},
         i.e., \code{y[j] = x[j]} for \eqn{j \in \{1,\ldots,k_2;
           n-k_2+1,\ldots,n\}}{j = 1, \dots, k2 and (n-k2+1), \dots, n};}
       \item{\code{"constant"}}{copies \code{median(y[1:k2])} to the first
         values and analogously for the last ones making the smoothed ends
         \emph{constant};}
       \item{\code{"median"}}{the default, smooths the ends by using
         symmetrical medians of subsequently smaller bandwidth, but for
         the very first and last value where Tukey's robust end-point
         rule is applied, see \code{\link{smoothEnds}}.}
     }
   }
   \item{algorithm}{character string (partially matching \code{"Turlach"} or
     \code{"Stuetzle"}) or the default \code{NULL}, specifying which algorithm
     should be applied.  The default choice depends on \code{n = length(x)}
     and \code{k} where \code{"Turlach"} will be used for larger problems.}
   \item{na.action}{character string determining the behavior in the case of
     \code{\link{NA}} or \code{\link{NaN}} in \code{x}, (partially matching)
     one of
     \describe{
       \item{\code{"+Big_alternate"}}{Here, all the NAs in \code{x} are
 	first replaced by alternating \eqn{\pm B}{+/- B} where \eqn{B} is a
 	\dQuote{Big} number (with \eqn{2B < M*}, where
 	\eqn{M*=}\code{\link{.Machine} $ double.xmax}).  The replacement
 	values are \dQuote{from left} \eqn{(+B, -B, +B, \ldots)},
 	i.e. start with \code{"+"}.}
       \item{\code{"-Big_alternate"}}{almost the same as
 	\code{"+Big_alternate"}, just starting with \eqn{-B} (\code{"-Big..."}).}
       \item{\code{"na.omit"}}{the result is the same as
 	\code{runmed(x[!is.na(x)], k, ..)}.}
       \item{\code{"fail"}}{the presence of NAs in \code{x} will raise an error.}
     }
   }
   \item{print.level}{integer, indicating verboseness of algorithm;
     should rarely be changed by average users.}
 }
 \value{vector of smoothed values of the same length as \code{x} with an
   \code{\link{attr}}ibute \code{k} containing (the \sQuote{oddified})
   \code{k}.
 }
 \details{
   Apart from the end values, the result \code{y = runmed(x, k)} simply has
   \code{y[j] = median(x[(j-k2):(j+k2)])} (\code{k = 2*k2+1}), computed very
   efficiently.

   The two algorithms are internally entirely different:
   \describe{
     \item{\code{"Turlach"}}{is the \enc{Härdle}{Haerdle}--Steiger
       algorithm (see Ref.) as implemented by Berwin Turlach.
       A tree algorithm is used, ensuring performance \eqn{O(n \log
         k)}{O(n * log(k))} where \code{n = length(x)} which is
       asymptotically optimal.}
     \item{\code{"Stuetzle"}}{is the (older) Stuetzle--Friedman implementation
       which makes use of median \emph{updating} when one observation
       enters and one leaves the smoothing window.  While this performs as
       \eqn{O(n \times k)}{O(n * k)} which is slower asymptotically, it is
       considerably faster for small \eqn{k} or \eqn{n}.}
   }

   Note that, both algorithms (and the \code{\link{smoothEnds}()} utility)
   now \dQuote{work} also when \code{x} contains non-finite entries
   (\eqn{\pm}{+/-}\code{\link{Inf}}, \code{\link{NaN}}, and
   \code{\link{NA}}):
   \describe{
     \item{\code{"Turlach"}}{.......
     }
     \item{\code{"Stuetzle"}}{currently simply works by applying the
       underlying math library (\file{libm}) arithmetic for the non-finite
       numbers; this may optionally change in the future.
     }
   }


   Currently long vectors are only supported for \code{algorithm = "Stuetzle"}.
 }
 \references{
   \enc{Härdle}{Haerdle}, W. and Steiger, W. (1995)
   Algorithm AS 296: Optimal median smoothing,
   \emph{Applied Statistics} \bold{44}, 258--264.
   \doi{10.2307/2986349}.

   Jerome H. Friedman and Werner Stuetzle (1982)
   \emph{Smoothing of Scatterplots};
   Report, Dep. Statistics, Stanford U., Project Orion 003.
 %%
 %% Martin Maechler (2003)
 %% Fast Running Medians: Finite Sample and Asymptotic Optimality;
 %% "lost" working paper
 }
 \author{Martin Maechler \email{maechler@stat.math.ethz.ch},
   based on Fortran code from Werner Stuetzle and S-PLUS and C code from
   Berwin Turlach.
 }
 \seealso{
   \code{\link{smoothEnds}} which implements Tukey's end point rule and
   is called by default from \code{runmed(*, endrule = "median")}.
   \code{\link{smooth}} uses running
   medians of 3 for its compound smoothers.
 }
 \examples{
 require(graphics)

 utils::example(nhtemp)
 myNHT <- as.vector(nhtemp)
 myNHT[20] <- 2 * nhtemp[20]
 plot(myNHT, type = "b", ylim = c(48, 60), main = "Running Medians Example")
 lines(runmed(myNHT, 7), col = "red")

 ## special: multiple y values for one x
 plot(cars, main = "'cars' data and runmed(dist, 3)")
 lines(cars, col = "light gray", type = "c")
 with(cars, lines(speed, runmed(dist, k = 3), col = 2))
 %% FIXME: Show how to do it properly ! tapply(*, unique(.), median)

 ## nice quadratic with a few outliers
 y <- ys <- (-20:20)^2
 y [c(1,10,21,41)] <- c(150, 30, 400, 450)
 all(y == runmed(y, 1)) # 1-neighbourhood <==> interpolation
 plot(y) ## lines(y, lwd = .1, col = "light gray")
 lines(lowess(seq(y), y, f = 0.3), col = "brown")
 lines(runmed(y, 7), lwd = 2, col = "blue")
 lines(runmed(y, 11), lwd = 2, col = "red")

 ## Lowess is not robust
 y <- ys ; y[21] <- 6666 ; x <- seq(y)
 col <- c("black", "brown","blue")
 plot(y, col = col[1])
 lines(lowess(x, y, f = 0.3), col = col[2])
 %% predict(loess(y ~ x, span = 0.3, degree=1, family = "symmetric"))
 %% gives 6-line warning but does NOT break down
 lines(runmed(y, 7),      lwd = 2, col = col[3])
 legend(length(y),max(y), c("data", "lowess(y, f = 0.3)", "runmed(y, 7)"),
        xjust = 1, col = col, lty = c(0, 1, 1), pch = c(1,NA,NA))

 ## An example with initial NA's - used to fail badly (notably for "Turlach"):
 x15 <- c(rep(NA, 4), c(9, 9, 4, 22, 6, 1, 7, 5, 2, 8, 3))
 rS15 <- cbind(Sk.3 = runmed(x15, k = 3, algorithm="S"),
               Sk.7 = runmed(x15, k = 7, algorithm="S"),
               Sk.11= runmed(x15, k =11, algorithm="S"))
 rT15 <- cbind(Tk.3 = runmed(x15, k = 3, algorithm="T", print.level=1),
               Tk.7 = runmed(x15, k = 7, algorithm="T", print.level=1),
               Tk.9 = runmed(x15, k = 9, algorithm="T", print.level=1),
               Tk.11= runmed(x15, k =11, algorithm="T", print.level=1))
 cbind(x15, rS15, rT15) # result for k=11  maybe a bit surprising ..
 Tv <- rT15[-(1:3),]
 stopifnot(3 <= Tv, Tv <= 9, 5 <= Tv[1:10,])
 matplot(y = cbind(x15, rT15), type = "b", ylim = c(1,9), pch=1:5, xlab = NA,
         main = "runmed(x15, k, algo = \"Turlach\")")
 mtext(paste("x15 <-", deparse(x15)))
 points(x15, cex=2)
 legend("bottomleft", legend=c("data", paste("k = ", c(3,7,9,11))),
        bty="n", col=1:5, lty=1:5, pch=1:5)
 }
 \keyword{smooth}
 \keyword{robust}
	% File src/library/stats/man/runmed.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 1995-2019 R Core Team
	% Distributed under GPL 2 or later

	\name{runmed}
	\title{Running Medians -- Robust Scatter Plot Smoothing}
	\alias{runmed}
	\encoding{UTF-8}
	\description{
	Compute running medians of odd span. This is the \sQuote{most robust}
	scatter plot smoothing possible. For efficiency (and historical
	reason), you can use one of two different algorithms giving identical
	results.
	}
	\usage{
	runmed(x, k, endrule = c("median", "keep", "constant"),
	algorithm = NULL,
	na.action = c("+Big_alternate", "-Big_alternate", "na.omit", "fail"),
	print.level = 0)
	}
	\arguments{
	\item{x}{numeric vector, the \sQuote{dependent} variable to be
	smoothed.}
	\item{k}{integer width of median window; must be odd. Turlach had a
	default of \code{k <- 1 + 2 * min((n-1)\%/\% 2, ceiling(0.1*n))}.
	Use \code{k = 3} for \sQuote{minimal} robust smoothing eliminating
	isolated outliers.}
	\item{endrule}{character string indicating how the values at the
	beginning and the end (of the data) should be treated.
	Can be abbreviated. Possible values are:
	\describe{
	\item{\code{"keep"}}{keeps the first and last \eqn{k_2}{k2} values
	at both ends, where \eqn{k_2}{k2} is the half-bandwidth
	\code{k2 = k \%/\% 2},
	i.e., \code{y[j] = x[j]} for \eqn{j \in \{1,\ldots,k_2;
	n-k_2+1,\ldots,n\}}{j = 1, \dots, k2 and (n-k2+1), \dots, n};}
	\item{\code{"constant"}}{copies \code{median(y[1:k2])} to the first
	values and analogously for the last ones making the smoothed ends
	\emph{constant};}
	\item{\code{"median"}}{the default, smooths the ends by using
	symmetrical medians of subsequently smaller bandwidth, but for
	the very first and last value where Tukey's robust end-point
	rule is applied, see \code{\link{smoothEnds}}.}
	}
	}
	\item{algorithm}{character string (partially matching \code{"Turlach"} or
	\code{"Stuetzle"}) or the default \code{NULL}, specifying which algorithm
	should be applied. The default choice depends on \code{n = length(x)}
	and \code{k} where \code{"Turlach"} will be used for larger problems.}
	\item{na.action}{character string determining the behavior in the case of
	\code{\link{NA}} or \code{\link{NaN}} in \code{x}, (partially matching)
	one of
	\describe{
	\item{\code{"+Big_alternate"}}{Here, all the NAs in \code{x} are
	first replaced by alternating \eqn{\pm B}{+/- B} where \eqn{B} is a
	\dQuote{Big} number (with \eqn{2B < M*}, where
	\eqn{M*=}\code{\link{.Machine} $ double.xmax}). The replacement
	values are \dQuote{from left} \eqn{(+B, -B, +B, \ldots)},
	i.e. start with \code{"+"}.}
	\item{\code{"-Big_alternate"}}{almost the same as
	\code{"+Big_alternate"}, just starting with \eqn{-B} (\code{"-Big..."}).}
	\item{\code{"na.omit"}}{the result is the same as
	\code{runmed(x[!is.na(x)], k, ..)}.}
	\item{\code{"fail"}}{the presence of NAs in \code{x} will raise an error.}
	}
	}
	\item{print.level}{integer, indicating verboseness of algorithm;
	should rarely be changed by average users.}
	}
	\value{vector of smoothed values of the same length as \code{x} with an
	\code{\link{attr}}ibute \code{k} containing (the \sQuote{oddified})
	\code{k}.
	}
	\details{
	Apart from the end values, the result \code{y = runmed(x, k)} simply has
	\code{y[j] = median(x[(j-k2):(j+k2)])} (\code{k = 2*k2+1}), computed very
	efficiently.

	The two algorithms are internally entirely different:
	\describe{
	\item{\code{"Turlach"}}{is the \enc{Härdle}{Haerdle}--Steiger
	algorithm (see Ref.) as implemented by Berwin Turlach.
	A tree algorithm is used, ensuring performance \eqn{O(n \log
	k)}{O(n * log(k))} where \code{n = length(x)} which is
	asymptotically optimal.}
	\item{\code{"Stuetzle"}}{is the (older) Stuetzle--Friedman implementation
	which makes use of median \emph{updating} when one observation
	enters and one leaves the smoothing window. While this performs as
	\eqn{O(n \times k)}{O(n * k)} which is slower asymptotically, it is
	considerably faster for small \eqn{k} or \eqn{n}.}
	}

	Note that, both algorithms (and the \code{\link{smoothEnds}()} utility)
	now \dQuote{work} also when \code{x} contains non-finite entries
	(\eqn{\pm}{+/-}\code{\link{Inf}}, \code{\link{NaN}}, and
	\code{\link{NA}}):
	\describe{
	\item{\code{"Turlach"}}{.......
	}
	\item{\code{"Stuetzle"}}{currently simply works by applying the
	underlying math library (\file{libm}) arithmetic for the non-finite
	numbers; this may optionally change in the future.
	}
	}


	Currently long vectors are only supported for \code{algorithm = "Stuetzle"}.
	}
	\references{
	\enc{Härdle}{Haerdle}, W. and Steiger, W. (1995)
	Algorithm AS 296: Optimal median smoothing,
	\emph{Applied Statistics} \bold{44}, 258--264.
	\doi{10.2307/2986349}.

	Jerome H. Friedman and Werner Stuetzle (1982)
	\emph{Smoothing of Scatterplots};
	Report, Dep. Statistics, Stanford U., Project Orion 003.
	%%
	%% Martin Maechler (2003)
	%% Fast Running Medians: Finite Sample and Asymptotic Optimality;
	%% "lost" working paper
	}
	\author{Martin Maechler \email{maechler@stat.math.ethz.ch},
	based on Fortran code from Werner Stuetzle and S-PLUS and C code from
	Berwin Turlach.
	}
	\seealso{
	\code{\link{smoothEnds}} which implements Tukey's end point rule and
	is called by default from \code{runmed(*, endrule = "median")}.
	\code{\link{smooth}} uses running
	medians of 3 for its compound smoothers.
	}
	\examples{
	require(graphics)

	utils::example(nhtemp)
	myNHT <- as.vector(nhtemp)
	myNHT[20] <- 2 * nhtemp[20]
	plot(myNHT, type = "b", ylim = c(48, 60), main = "Running Medians Example")
	lines(runmed(myNHT, 7), col = "red")

	## special: multiple y values for one x
	plot(cars, main = "'cars' data and runmed(dist, 3)")
	lines(cars, col = "light gray", type = "c")
	with(cars, lines(speed, runmed(dist, k = 3), col = 2))
	%% FIXME: Show how to do it properly ! tapply(*, unique(.), median)

	## nice quadratic with a few outliers
	y <- ys <- (-20:20)^2
	y [c(1,10,21,41)] <- c(150, 30, 400, 450)
	all(y == runmed(y, 1)) # 1-neighbourhood <==> interpolation
	plot(y) ## lines(y, lwd = .1, col = "light gray")
	lines(lowess(seq(y), y, f = 0.3), col = "brown")
	lines(runmed(y, 7), lwd = 2, col = "blue")
	lines(runmed(y, 11), lwd = 2, col = "red")

	## Lowess is not robust
	y <- ys ; y[21] <- 6666 ; x <- seq(y)
	col <- c("black", "brown","blue")
	plot(y, col = col[1])
	lines(lowess(x, y, f = 0.3), col = col[2])
	%% predict(loess(y ~ x, span = 0.3, degree=1, family = "symmetric"))
	%% gives 6-line warning but does NOT break down
	lines(runmed(y, 7), lwd = 2, col = col[3])
	legend(length(y),max(y), c("data", "lowess(y, f = 0.3)", "runmed(y, 7)"),
	xjust = 1, col = col, lty = c(0, 1, 1), pch = c(1,NA,NA))

	## An example with initial NA's - used to fail badly (notably for "Turlach"):
	x15 <- c(rep(NA, 4), c(9, 9, 4, 22, 6, 1, 7, 5, 2, 8, 3))
	rS15 <- cbind(Sk.3 = runmed(x15, k = 3, algorithm="S"),
	Sk.7 = runmed(x15, k = 7, algorithm="S"),
	Sk.11= runmed(x15, k =11, algorithm="S"))
	rT15 <- cbind(Tk.3 = runmed(x15, k = 3, algorithm="T", print.level=1),
	Tk.7 = runmed(x15, k = 7, algorithm="T", print.level=1),
	Tk.9 = runmed(x15, k = 9, algorithm="T", print.level=1),
	Tk.11= runmed(x15, k =11, algorithm="T", print.level=1))
	cbind(x15, rS15, rT15) # result for k=11 maybe a bit surprising ..
	Tv <- rT15[-(1:3),]
	stopifnot(3 <= Tv, Tv <= 9, 5 <= Tv[1:10,])
	matplot(y = cbind(x15, rT15), type = "b", ylim = c(1,9), pch=1:5, xlab = NA,
	main = "runmed(x15, k, algo = \"Turlach\")")
	mtext(paste("x15 <-", deparse(x15)))
	points(x15, cex=2)
	legend("bottomleft", legend=c("data", paste("k = ", c(3,7,9,11))),
	bty="n", col=1:5, lty=1:5, pch=1:5)
	}
	\keyword{smooth}
	\keyword{robust}