blob: 6ec5d34b69b8f7492ccc1e116c0aad34c97d1013 [file] [log] [blame]
# File src/library/stats/R/aggregate.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2018 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
aggregate <-
function(x, ...)
UseMethod("aggregate")
aggregate.default <-
function(x, ...)
{
if(is.ts(x))
aggregate.ts(as.ts(x), ...)
else
aggregate.data.frame(as.data.frame(x), ...)
}
aggregate.data.frame <-
function(x, by, FUN, ..., simplify = TRUE, drop = TRUE)
{
if(!is.data.frame(x)) x <- as.data.frame(x)
## Do this here to avoid masking by non-function (could happen)
FUN <- match.fun(FUN)
if(NROW(x) == 0L) stop("no rows to aggregate")
if(NCOL(x) == 0L) {
## fake it
x <- data.frame(x = rep(1, NROW(x)))
return(aggregate.data.frame(x, by, function(x) 0L)[seq_along(by)])
}
if(!is.list(by))
stop("'by' must be a list")
if(is.null(names(by)) && length(by))
names(by) <- paste0("Group.", seq_along(by))
else {
nam <- names(by)
ind <- which(!nzchar(nam))
names(by)[ind] <- paste0("Group.", ind)
}
if(any(lengths(by) != NROW(x)))
stop("arguments must have same length")
y <- as.data.frame(by, stringsAsFactors = FALSE)
keep <- complete.cases(by)
y <- y[keep, , drop = FALSE]
x <- x[keep, , drop = FALSE]
nrx <- NROW(x)
## Generate a group identifier vector with integers and dots.
ident <- function(x) {
y <- as.factor(x)
l <- length(levels(y))
s <- as.character(seq_len(l))
n <- nchar(s)
levels(y) <- paste0(strrep("0", n[l] - n), s)
y # levels used for drop = FALSE
}
grp <- lapply(y, ident)
multi.y <- !drop && ncol(y)
if(multi.y) {
lev <- lapply(grp, levels)
y <- as.list(y)
for (i in seq_along(y)) {
z <- y[[i]][match(lev[[i]], grp[[i]])]
if(is.factor(z) && any(keep <- is.na(z)))
z[keep] <- levels(z)[keep]
y[[i]] <- z
}
eGrid <- function(L)
expand.grid(L, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)
y <- eGrid(y)
}
grp <- if(ncol(y)) {
names(grp) <- NULL
do.call(paste, c(rev(grp), list(sep = ".")))
} else
integer(nrx)
if(multi.y) {
lev <- as.list(eGrid(lev))
names(lev) <- NULL
lev <- do.call(paste, c(rev(lev), list(sep = ".")))
} else
y <- y[match(sort(unique(grp)), grp, 0L), , drop = FALSE]
z <- lapply(x,
function(e) {
## In case of a common length > 1, sapply() gives
## the transpose of what we need ...
ans <- lapply(X = unname(split(e, grp)), FUN = FUN, ...)
if(simplify &&
length(len <- unique(lengths(ans))) == 1L) {
## this used to lose classes
if(len == 1L) {
cl <- lapply(ans, oldClass)
cl1 <- cl[[1L]]
ans <- unlist(ans, recursive = FALSE, use.names = FALSE)
if (!is.null(cl1) &&
all(vapply(cl, identical, NA, y = cl1)))
class(ans) <- cl1
} else if(len > 1L)
ans <- matrix(unlist(ans, recursive = FALSE, use.names = FALSE),
ncol = len,
byrow = TRUE,
dimnames =
if(!is.null(nms <- names(ans[[1L]])))
list(NULL, nms) ## else NULL
)
}
ans
})
len <- length(y)
if(multi.y) {
keep <- match(lev, sort(unique(grp)))
for(i in seq_along(z))
y[[len + i]] <- if(is.matrix(z[[i]]))
z[[i]][keep, , drop = FALSE]
else z[[i]][keep]
} else
for(i in seq_along(z))
y[[len + i]] <- z[[i]]
names(y) <- c(names(by), names(x))
row.names(y) <- NULL
y
}
aggregate.formula <-
function(formula, data, FUN, ..., subset, na.action = na.omit)
{
if(missing(formula) || !inherits(formula, "formula"))
stop("'formula' missing or incorrect")
if(length(formula) != 3L)
stop("'formula' must have both left and right hand sides")
m <- match.call(expand.dots = FALSE)
if(is.matrix(eval(m$data, parent.frame())))
m$data <- as.data.frame(data)
m$... <- m$FUN <- NULL
## need stats:: for non-standard evaluation
m[[1L]] <- quote(stats::model.frame)
if (formula[[2L]] == ".") {
## LHS is a dot, expand it ...
##rhs <- unlist(strsplit(deparse(formula[[3L]]), " *[:+] *"))
## <NOTE>
## Note that this will not do quite the right thing in case the
## RHS contains transformed variables, such that
## setdiff(rhs, names(data))
## is non-empty ...
##lhs <- sprintf("cbind(%s)",
## paste(setdiff(names(data), rhs), collapse = ","))
## formula[[2L]] <- parse(text = lhs)[[1L]]
## </NOTE>
## New logic May 2012 --pd
## Dot expansion:
## lhs ends up as quote(cbind(v1, v2, ....)) using all variables in
## data, except those that are used on the RHS.
## This version uses terms() to get the rhs variables, which means
## that it will NOT remove a variable from the expansion if a
## transformation of it is on the RHS of the formula.
rhs <- as.list(attr(terms(formula[-2L]),"variables")[-1])
lhs <- as.call(c(quote(cbind),
setdiff(lapply(names(data), as.name),
rhs)
)
)
formula[[2L]] <- lhs
m[[2L]] <- formula
}
mf <- eval(m, parent.frame())
if(is.matrix(mf[[1L]])) {
## LHS is a cbind() combo, convert to data frame and fix names.
## Commented out May 2012 (seems to work without it) -- pd
##lhs <- setNames(as.data.frame(mf[[1L]]),
## as.character(m[[2L]][[2L]])[-1L])
lhs <- as.data.frame(mf[[1L]])
aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...)
}
else
aggregate.data.frame(mf[1L], mf[-1L], FUN = FUN, ...)
}
aggregate.ts <-
function(x, nfrequency = 1, FUN = sum, ndeltat = 1,
ts.eps = getOption("ts.eps"), ...)
{
x <- as.ts(x)
ofrequency <- tsp(x)[3L]
## do this here to avoid masking by non-function (could happen)
FUN <- match.fun(FUN)
## Set up the new frequency, and make sure it is an integer.
if(missing(nfrequency))
nfrequency <- 1 / ndeltat
if((nfrequency > 1) &&
(abs(nfrequency - round(nfrequency)) < ts.eps))
nfrequency <- round(nfrequency)
if(nfrequency == ofrequency)
return(x)
ratio <- ofrequency /nfrequency
if(abs(ratio - round(ratio)) > ts.eps)
stop(gettextf("cannot change frequency from %g to %g",
ofrequency, nfrequency), domain = NA)
## The desired result is obtained by applying FUN to blocks of
## length ofrequency/nfrequency, for each of the variables in x.
## We first get the new start and end right, and then break x into
## such blocks by reshaping it into an array and setting dim.
## avoid e.g. 1.0 %/% 0.2
## https://stat.ethz.ch/pipermail/r-devel/2010-April/057225.html
len <- trunc((ofrequency / nfrequency) + ts.eps)
mat <- is.matrix(x)
if(mat) cn <- colnames(x)
## nstart <- ceiling(tsp(x)[1L] * nfrequency) / nfrequency
## x <- as.matrix(window(x, start = nstart))
nstart <- tsp(x)[1L]
## Can't use nstart <- start(x) as this causes problems if
## you get a vector of length 2.
x <- as.matrix(x)
nend <- floor(nrow(x) / len) * len
x <- apply(array(c(x[1 : nend, ]),
dim = c(len, nend / len, ncol(x))),
MARGIN = c(2L, 3L), FUN = FUN, ...)
if(!mat) x <- as.vector(x)
else colnames(x) <- cn
ts(x, start = nstart, frequency = nfrequency)
}