 #  src/library/stats/R/aggregate.R - R - Git at Google

 #  File src/library/stats/R/aggregate.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2018 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 ## S3 generic: dispatches on the class of 'x' to an aggregate.<class>
 ## method; all further arguments are handed to the selected method.
 aggregate <- function(x, ...) {
     UseMethod("aggregate")
 }

 ## Default method: time series (as judged by is.ts()) are handed to
 ## aggregate.ts(); everything else is coerced with as.data.frame() and
 ## handed to aggregate.data.frame().  '...' is forwarded unchanged.
 aggregate.default <- function(x, ...) {
     if (!is.ts(x)) {
         return(aggregate.data.frame(as.data.frame(x), ...))
     }
     aggregate.ts(as.ts(x), ...)
 }

 ## Data frame method for aggregate().
 ##
 ## Splits every column of 'x' into the groups defined by the grouping
 ## variables in 'by', applies FUN to each piece, and returns a data frame
 ## whose first columns are the grouping values and whose remaining
 ## columns hold the per-group results, one column per column of 'x'.
 ##
 ## Arguments:
 ##   x         data frame (anything else is coerced by as.data.frame()).
 ##   by        list of grouping vectors, each with NROW(x) elements.
 ##             Unnamed elements are named "Group.<position>".
 ##   FUN       function, or something match.fun() can resolve to one.
 ##   ...       further arguments passed on to FUN.
 ##   simplify  if TRUE and all per-group results of a column share one
 ##             length, they are combined into a vector (length 1) or a
 ##             matrix with one row per group (length > 1).
 ##   drop      if TRUE only the group combinations occurring in the data
 ##             are returned; if FALSE (and 'by' is non-empty) one row per
 ##             combination of all grouping levels is returned.
 ##
 ## Errors: "no rows to aggregate", "'by' must be a list",
 ##         "arguments must have same length".
 aggregate.data.frame <-
 function(x, by, FUN, ..., simplify = TRUE, drop = TRUE)
 {
     if(!is.data.frame(x)) x <- as.data.frame(x)
     ## Do this here to avoid masking by non-function (could happen)
     FUN <- match.fun(FUN)
     if(NROW(x) == 0L) stop("no rows to aggregate")
     if(NCOL(x) == 0L) {
         ## fake it
         ## There is nothing to summarise, so aggregate a dummy column with
         ## a constant function and keep only the leading grouping columns.
         ## NOTE: 'simplify' and 'drop' are not forwarded to this recursive
         ## call, so their defaults apply here.
         x <- data.frame(x = rep(1, NROW(x)))
         return(aggregate.data.frame(x, by, function(x) 0L)[seq_along(by)])
     }
     if(!is.list(by))
         stop("'by' must be a list")
     ## Give every grouping variable a name: "Group.<position>" for all of
     ## them when 'by' has no names, otherwise only for empty names.
     if(is.null(names(by)) && length(by))
         names(by) <- paste0("Group.", seq_along(by))
     else {
         nam <- names(by)
         ind <- which(!nzchar(nam))
         names(by)[ind] <- paste0("Group.", ind)
     }

     if(any(lengths(by) != NROW(x)))
         stop("arguments must have same length")

     ## y holds the grouping values.  Rows with an NA in any grouping
     ## variable are removed from both y and x.
     y <- as.data.frame(by, stringsAsFactors = FALSE)
     keep <- complete.cases(by)
     y <- y[keep, , drop = FALSE]
     x <- x[keep, , drop = FALSE]
     nrx <- NROW(x)

     ## Generate a group identifier vector with integers and dots.
     ## ident() turns one grouping variable into a factor whose levels are
     ## relabelled "1", "2", ..., left-padded with zeros to the width of
     ## the largest index (presumably so that sorting the identifiers as
     ## strings follows the level order).
     ident <- function(x) {
         y <- as.factor(x)
         l <- length(levels(y))
         s <- as.character(seq_len(l))
         n <- nchar(s)
         levels(y) <- paste0(strrep("0", n[l] - n), s)
         y # levels used for drop = FALSE
     }
     grp <- lapply(y, ident)
     ## multi.y: drop = FALSE was requested and there is at least one
     ## grouping variable, so the full grid of level combinations is built.
     multi.y <- !drop && ncol(y)
     if(multi.y) {
         lev <- lapply(grp, levels)
 	y <- as.list(y)
         for (i in seq_along(y)) {
             ## One original grouping value per recoded level, taken from
             ## the first row in which that level occurs (NA if it never
             ## occurs).
             z <- y[[i]][match(lev[[i]], grp[[i]])]
             ## For factor grouping variables, fill the values of levels
             ## that do not occur from levels(z).  Note that 'keep' is
             ## reused (overwritten) here.
             if(is.factor(z) && any(keep <- is.na(z)))
                 z[keep] <- levels(z)[keep]
             y[[i]] <- z
         }
         eGrid <- function(L)
             expand.grid(L, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)
 	y <- eGrid(y)
     }
     ## Collapse the per-variable identifiers into one string per row,
     ## pasting the variables in reverse order with "." as separator.  The
     ## names are dropped so that the elements reach paste() positionally.
     ## Without grouping variables every row gets identifier 0 (one group).
     grp <- if(ncol(y)) {
         names(grp) <- NULL
 	do.call(paste, c(rev(grp), list(sep = ".")))
     } else
 	integer(nrx)
     ## With multi.y, 'lev' becomes the identifier of every combination in
     ## the grid (built the same way as 'grp').  Otherwise y is reduced to
     ## the first row of each observed group, ordered by sorted identifier.
     if(multi.y) {
         lev <- as.list(eGrid(lev))
         names(lev) <- NULL
         lev <- do.call(paste, c(rev(lev), list(sep = ".")))
     } else
         y <- y[match(sort(unique(grp)), grp, 0L), , drop = FALSE]
     ## Apply FUN to every group of every column of x.  The result z is a
     ## list with one element per column of x.
     z <- lapply(x,
                 function(e) {
                     ## In case of a common length > 1, sapply() gives
                     ## the transpose of what we need ...
 		    ans <- lapply(X = unname(split(e, grp)), FUN = FUN, ...)
                     ## 'len' below is local to this anonymous function.
                     if(simplify &&
                        length(len <- unique(lengths(ans))) == 1L) {
                         ## this used to lose classes
                         if(len == 1L) {
                             ## Scalar results: flatten to a vector and
                             ## restore the class when all per-group
                             ## results share the same non-NULL class.
                             cl <- lapply(ans, oldClass)
                             cl1 <- cl[[1L]]
 			    ans <- unlist(ans, recursive = FALSE, use.names = FALSE)
                             if (!is.null(cl1) &&
                                 all(vapply(cl, identical, NA, y = cl1)))
                                 class(ans) <- cl1
                         } else if(len > 1L)
                             ## Equal-length vector results: one matrix row
                             ## per group, column names taken from the
                             ## first group's result when it has names.
 			    ans <- matrix(unlist(ans, recursive = FALSE, use.names = FALSE),
                                           ncol = len,
                                           byrow = TRUE,
 					  dimnames =
 					      if(!is.null(nms <- names(ans[[1L]])))
 						  list(NULL, nms) ## else NULL
 					  )
                     }
                     ans
                 })
     ## Append the results to the grouping columns already in y.
     len <- length(y)
     if(multi.y) {
         ## Position of every grid combination among the observed groups;
         ## match() gives NA for combinations that do not occur, so those
         ## rows receive NA results when subsetting below.
 	keep <- match(lev, sort(unique(grp)))
 	for(i in seq_along(z))
 	    y[[len + i]] <- if(is.matrix(z[[i]]))
 				 z[[i]][keep, , drop = FALSE]
 			    else z[[i]][keep]
     } else
 	for(i in seq_along(z))
 	    y[[len + i]] <- z[[i]]
     ## Grouping names first, then the column names of x.
     names(y) <- c(names(by), names(x))
     row.names(y) <- NULL
     y
 }

 ## Formula method for aggregate().
 ##
 ## Builds a model frame from 'formula' and 'data' via stats::model.frame()
 ## (evaluated in the caller's frame, with 'subset' and 'na.action' passed
 ## through), then aggregates the left-hand side variable(s) grouped by
 ## the right-hand side variables using aggregate.data.frame().
 ##
 ## Arguments:
 ##   formula    two-sided formula, e.g. y ~ g, cbind(y1, y2) ~ g1 + g2,
 ##              or . ~ g (all variables of 'data' not used on the RHS).
 ##   data       data frame, or a matrix (converted with as.data.frame()).
 ##   FUN, ...   summary function and further arguments for it.
 ##   subset, na.action   handled by model.frame().
 ##
 ## Errors: "'formula' missing or incorrect",
 ##         "'formula' must have both left and right hand sides".
 aggregate.formula <-
 function(formula, data, FUN, ..., subset, na.action = na.omit)
 {
     if(missing(formula) || !inherits(formula, "formula"))
         stop("'formula' missing or incorrect")
     if(length(formula) != 3L)
         stop("'formula' must have both left and right hand sides")

     ## Turn our own call into a call to model.frame(): matrix data is
     ## replaced by its data frame version, FUN and '...' are removed.
     m <- match.call(expand.dots = FALSE)
     if(is.matrix(eval(m$data, parent.frame())))
         m$data <- as.data.frame(data)
     m$... <- m$FUN <- NULL
     ## need stats:: for non-standard evaluation
     m[[1L]] <- quote(stats::model.frame)

     if (formula[[2L]] == ".") {
         ## LHS is a dot, expand it ...
         ##rhs <- unlist(strsplit(deparse(formula[[3L]]), " *[:+] *"))
         ## <NOTE>
         ## Note that this will not do quite the right thing in case the
         ## RHS contains transformed variables, such that
         ##   setdiff(rhs, names(data))
         ## is non-empty ...
         ##lhs <- sprintf("cbind(%s)",
         ##              paste(setdiff(names(data), rhs), collapse = ","))
         ## formula[[2L]] <- parse(text = lhs)[[1L]]
         ## </NOTE>

         ## New logic May 2012 --pd

         ## Dot expansion:
         ## lhs ends up as quote(cbind(v1, v2, ....)) using all variables in
         ## data, except those that are used on the RHS.

         ## This version uses terms() to get the rhs variables, which means
         ## that it will NOT remove a variable from the expansion if a
         ## transformation of it is on the RHS of the formula.

         rhs <- as.list(attr(terms(formula[-2L]), "variables")[-1])
         ## names() of a matrix is NULL, which used to expand the dot to an
         ## empty cbind() for matrix 'data'; use the column names instead.
         vars <- if(is.matrix(data)) colnames(data) else names(data)
         lhs <- as.call(c(quote(cbind),
                          setdiff(lapply(vars, as.name),
                                  rhs)
                          )
                        )
         formula[[2L]] <- lhs
         ## match.call() puts 'formula' first, so it is element 2 of m.
         m[[2L]] <- formula
     }
     mf <- eval(m, parent.frame())

     if(is.matrix(mf[[1L]])) {
         ## LHS is a cbind() combo, convert to data frame and fix names.
         ## Commented out May 2012 (seems to work without it) -- pd
 	##lhs <- setNames(as.data.frame(mf[[1L]]),
 	##		as.character(m[[2L]][[2L]])[-1L])
         lhs <- as.data.frame(mf[[1L]])
         aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...)
     }
     else
         ## Single response: keep it as a one-column data frame.
         aggregate.data.frame(mf[1L], mf[-1L], FUN = FUN, ...)
 }

 ## Time series method for aggregate().
 ##
 ## Lowers the frequency of 'x' to 'nfrequency' (default 1 / ndeltat when
 ## 'nfrequency' is not supplied) by applying FUN to consecutive blocks of
 ## old-frequency / new-frequency observations, separately for every
 ## series in 'x'.  Trailing observations that do not fill a complete
 ## block are ignored.  'ts.eps' is the tolerance used when deciding
 ## whether frequencies and their ratio are whole numbers; '...' goes to
 ## FUN.  Returns 'x' (as a ts) unchanged when the frequency is not
 ## changed, and stops when the old frequency is not a multiple of the
 ## new one.
 aggregate.ts <- function(x, nfrequency = 1, FUN = sum, ndeltat = 1,
                          ts.eps = getOption("ts.eps"), ...)
 {
     x <- as.ts(x)
     old_freq <- tsp(x)[3L]
     ## Resolve FUN early so a non-function of the same name cannot mask it.
     FUN <- match.fun(FUN)

     ## Work out the target frequency; snap it to an integer when it is
     ## above 1 and within ts.eps of one.
     if (missing(nfrequency)) {
         nfrequency <- 1 / ndeltat
     }
     snap <- (nfrequency > 1) &&
         (abs(nfrequency - round(nfrequency)) < ts.eps)
     if (snap) {
         nfrequency <- round(nfrequency)
     }

     if (nfrequency == old_freq) {
         return(x)
     }

     ratio <- old_freq / nfrequency
     if (abs(ratio - round(ratio)) > ts.eps) {
         stop(gettextf("cannot change frequency from %g to %g",
                       old_freq, nfrequency), domain = NA)
     }

     ## Block length; adding ts.eps before truncating guards against
     ## ratios such as 1.0 / 0.2 falling just below the integer, see
     ## https://stat.ethz.ch/pipermail/r-devel/2010-April/057225.html
     block_len <- trunc(ratio + ts.eps)

     was_matrix <- is.matrix(x)
     col_names <- if (was_matrix) colnames(x)   # only used for matrices
     ## tsp() rather than start(): start() may return a vector of length 2.
     new_start <- tsp(x)[1L]

     ## Reshape the complete blocks into a (block, block index, series)
     ## array and summarise over the first dimension.
     values <- as.matrix(x)
     n_used <- floor(nrow(values) / block_len) * block_len
     blocks <- array(c(values[1 : n_used, ]),
                     dim = c(block_len, n_used / block_len, ncol(values)))
     agg <- apply(blocks, MARGIN = c(2L, 3L), FUN = FUN, ...)

     if (was_matrix) {
         colnames(agg) <- col_names
     } else {
         agg <- as.vector(agg)
     }
     ts(agg, start = new_start, frequency = nfrequency)
 }
	# File src/library/stats/R/aggregate.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2018 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	## S3 generic: dispatches on the class of 'x' to an aggregate.<class>
	## method; all further arguments are handed to the selected method.
	aggregate <- function(x, ...) {
	    UseMethod("aggregate")
	}

	## Default method: time series (as judged by is.ts()) are handed to
	## aggregate.ts(); everything else is coerced with as.data.frame() and
	## handed to aggregate.data.frame().  '...' is forwarded unchanged.
	aggregate.default <- function(x, ...) {
	    if (!is.ts(x)) {
	        return(aggregate.data.frame(as.data.frame(x), ...))
	    }
	    aggregate.ts(as.ts(x), ...)
	}

	## Data frame method for aggregate().
	##
	## Splits every column of 'x' into the groups defined by the grouping
	## variables in 'by', applies FUN to each piece, and returns a data frame
	## whose first columns are the grouping values and whose remaining
	## columns hold the per-group results, one column per column of 'x'.
	##
	## Arguments:
	##   x         data frame (anything else is coerced by as.data.frame()).
	##   by        list of grouping vectors, each with NROW(x) elements.
	##             Unnamed elements are named "Group.<position>".
	##   FUN       function, or something match.fun() can resolve to one.
	##   ...       further arguments passed on to FUN.
	##   simplify  if TRUE and all per-group results of a column share one
	##             length, they are combined into a vector (length 1) or a
	##             matrix with one row per group (length > 1).
	##   drop      if TRUE only the group combinations occurring in the data
	##             are returned; if FALSE (and 'by' is non-empty) one row per
	##             combination of all grouping levels is returned.
	##
	## Errors: "no rows to aggregate", "'by' must be a list",
	##         "arguments must have same length".
	aggregate.data.frame <-
	function(x, by, FUN, ..., simplify = TRUE, drop = TRUE)
	{
	if(!is.data.frame(x)) x <- as.data.frame(x)
	## Do this here to avoid masking by non-function (could happen)
	FUN <- match.fun(FUN)
	if(NROW(x) == 0L) stop("no rows to aggregate")
	if(NCOL(x) == 0L) {
	## fake it
	## There is nothing to summarise, so aggregate a dummy column with
	## a constant function and keep only the leading grouping columns.
	## NOTE: 'simplify' and 'drop' are not forwarded to this recursive
	## call, so their defaults apply here.
	x <- data.frame(x = rep(1, NROW(x)))
	return(aggregate.data.frame(x, by, function(x) 0L)[seq_along(by)])
	}
	if(!is.list(by))
	stop("'by' must be a list")
	## Give every grouping variable a name: "Group.<position>" for all of
	## them when 'by' has no names, otherwise only for empty names.
	if(is.null(names(by)) && length(by))
	names(by) <- paste0("Group.", seq_along(by))
	else {
	nam <- names(by)
	ind <- which(!nzchar(nam))
	names(by)[ind] <- paste0("Group.", ind)
	}

	if(any(lengths(by) != NROW(x)))
	stop("arguments must have same length")

	## y holds the grouping values.  Rows with an NA in any grouping
	## variable are removed from both y and x.
	y <- as.data.frame(by, stringsAsFactors = FALSE)
	keep <- complete.cases(by)
	y <- y[keep, , drop = FALSE]
	x <- x[keep, , drop = FALSE]
	nrx <- NROW(x)

	## Generate a group identifier vector with integers and dots.
	## ident() turns one grouping variable into a factor whose levels are
	## relabelled "1", "2", ..., left-padded with zeros to the width of
	## the largest index (presumably so that sorting the identifiers as
	## strings follows the level order).
	ident <- function(x) {
	y <- as.factor(x)
	l <- length(levels(y))
	s <- as.character(seq_len(l))
	n <- nchar(s)
	levels(y) <- paste0(strrep("0", n[l] - n), s)
	y # levels used for drop = FALSE
	}
	grp <- lapply(y, ident)
	## multi.y: drop = FALSE was requested and there is at least one
	## grouping variable, so the full grid of level combinations is built.
	multi.y <- !drop && ncol(y)
	if(multi.y) {
	lev <- lapply(grp, levels)
	y <- as.list(y)
	for (i in seq_along(y)) {
	## One original grouping value per recoded level, taken from the
	## first row in which that level occurs (NA if it never occurs).
	z <- y[[i]][match(lev[[i]], grp[[i]])]
	## For factor grouping variables, fill the values of levels that do
	## not occur from levels(z).  Note that 'keep' is reused here.
	if(is.factor(z) && any(keep <- is.na(z)))
	z[keep] <- levels(z)[keep]
	y[[i]] <- z
	}
	eGrid <- function(L)
	expand.grid(L, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)
	y <- eGrid(y)
	}
	## Collapse the per-variable identifiers into one string per row,
	## pasting the variables in reverse order with "." as separator.  The
	## names are dropped so that the elements reach paste() positionally.
	## Without grouping variables every row gets identifier 0 (one group).
	grp <- if(ncol(y)) {
	names(grp) <- NULL
	do.call(paste, c(rev(grp), list(sep = ".")))
	} else
	integer(nrx)
	## With multi.y, 'lev' becomes the identifier of every combination in
	## the grid (built the same way as 'grp').  Otherwise y is reduced to
	## the first row of each observed group, ordered by sorted identifier.
	if(multi.y) {
	lev <- as.list(eGrid(lev))
	names(lev) <- NULL
	lev <- do.call(paste, c(rev(lev), list(sep = ".")))
	} else
	y <- y[match(sort(unique(grp)), grp, 0L), , drop = FALSE]
	## Apply FUN to every group of every column of x.  The result z is a
	## list with one element per column of x.
	z <- lapply(x,
	function(e) {
	## In case of a common length > 1, sapply() gives
	## the transpose of what we need ...
	ans <- lapply(X = unname(split(e, grp)), FUN = FUN, ...)
	## 'len' below is local to this anonymous function.
	if(simplify &&
	length(len <- unique(lengths(ans))) == 1L) {
	## this used to lose classes
	if(len == 1L) {
	## Scalar results: flatten to a vector and restore the class when
	## all per-group results share the same non-NULL class.
	cl <- lapply(ans, oldClass)
	cl1 <- cl[[1L]]
	ans <- unlist(ans, recursive = FALSE, use.names = FALSE)
	if (!is.null(cl1) &&
	all(vapply(cl, identical, NA, y = cl1)))
	class(ans) <- cl1
	} else if(len > 1L)
	## Equal-length vector results: one matrix row per group, column
	## names taken from the first group's result when it has names.
	ans <- matrix(unlist(ans, recursive = FALSE, use.names = FALSE),
	ncol = len,
	byrow = TRUE,
	dimnames =
	if(!is.null(nms <- names(ans[[1L]])))
	list(NULL, nms) ## else NULL
	)
	}
	ans
	})
	## Append the results to the grouping columns already in y.
	len <- length(y)
	if(multi.y) {
	## Position of every grid combination among the observed groups;
	## match() gives NA for combinations that do not occur, so those
	## rows receive NA results when subsetting below.
	keep <- match(lev, sort(unique(grp)))
	for(i in seq_along(z))
	y[[len + i]] <- if(is.matrix(z[[i]]))
	z[[i]][keep, , drop = FALSE]
	else z[[i]][keep]
	} else
	for(i in seq_along(z))
	y[[len + i]] <- z[[i]]
	## Grouping names first, then the column names of x.
	names(y) <- c(names(by), names(x))
	row.names(y) <- NULL
	y
	}

	## Formula method for aggregate().
	##
	## Builds a model frame from 'formula' and 'data' via stats::model.frame()
	## (evaluated in the caller's frame, with 'subset' and 'na.action' passed
	## through), then aggregates the left-hand side variable(s) grouped by
	## the right-hand side variables using aggregate.data.frame().
	##
	## Arguments:
	##   formula    two-sided formula, e.g. y ~ g, cbind(y1, y2) ~ g1 + g2,
	##              or . ~ g (all variables of 'data' not used on the RHS).
	##   data       data frame, or a matrix (converted with as.data.frame()).
	##   FUN, ...   summary function and further arguments for it.
	##   subset, na.action   handled by model.frame().
	##
	## Errors: "'formula' missing or incorrect",
	##         "'formula' must have both left and right hand sides".
	aggregate.formula <-
	function(formula, data, FUN, ..., subset, na.action = na.omit)
	{
	    ## (the '||' below had been mangled into an escaped, unparsable form)
	    if(missing(formula) || !inherits(formula, "formula"))
	        stop("'formula' missing or incorrect")
	    if(length(formula) != 3L)
	        stop("'formula' must have both left and right hand sides")

	    ## Turn our own call into a call to model.frame(): matrix data is
	    ## replaced by its data frame version, FUN and '...' are removed.
	    m <- match.call(expand.dots = FALSE)
	    if(is.matrix(eval(m$data, parent.frame())))
	        m$data <- as.data.frame(data)
	    m$... <- m$FUN <- NULL
	    ## need stats:: for non-standard evaluation
	    m[[1L]] <- quote(stats::model.frame)

	    if (formula[[2L]] == ".") {
	        ## LHS is a dot, expand it ...
	        ##rhs <- unlist(strsplit(deparse(formula[[3L]]), " *[:+] *"))
	        ## <NOTE>
	        ## Note that this will not do quite the right thing in case the
	        ## RHS contains transformed variables, such that
	        ##   setdiff(rhs, names(data))
	        ## is non-empty ...
	        ##lhs <- sprintf("cbind(%s)",
	        ##              paste(setdiff(names(data), rhs), collapse = ","))
	        ## formula[[2L]] <- parse(text = lhs)[[1L]]
	        ## </NOTE>

	        ## New logic May 2012 --pd

	        ## Dot expansion:
	        ## lhs ends up as quote(cbind(v1, v2, ....)) using all variables in
	        ## data, except those that are used on the RHS.

	        ## This version uses terms() to get the rhs variables, which means
	        ## that it will NOT remove a variable from the expansion if a
	        ## transformation of it is on the RHS of the formula.

	        rhs <- as.list(attr(terms(formula[-2L]), "variables")[-1])
	        ## names() of a matrix is NULL, which used to expand the dot to an
	        ## empty cbind() for matrix 'data'; use the column names instead.
	        vars <- if(is.matrix(data)) colnames(data) else names(data)
	        lhs <- as.call(c(quote(cbind),
	                         setdiff(lapply(vars, as.name),
	                                 rhs)
	                         )
	                       )
	        formula[[2L]] <- lhs
	        ## match.call() puts 'formula' first, so it is element 2 of m.
	        m[[2L]] <- formula
	    }
	    mf <- eval(m, parent.frame())

	    if(is.matrix(mf[[1L]])) {
	        ## LHS is a cbind() combo, convert to data frame and fix names.
	        ## Commented out May 2012 (seems to work without it) -- pd
	        ##lhs <- setNames(as.data.frame(mf[[1L]]),
	        ##                as.character(m[[2L]][[2L]])[-1L])
	        lhs <- as.data.frame(mf[[1L]])
	        aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...)
	    }
	    else
	        ## Single response: keep it as a one-column data frame.
	        aggregate.data.frame(mf[1L], mf[-1L], FUN = FUN, ...)
	}

	## Time series method for aggregate().
	##
	## Lowers the frequency of 'x' to 'nfrequency' (default 1 / ndeltat when
	## 'nfrequency' is not supplied) by applying FUN to consecutive blocks of
	## old-frequency / new-frequency observations, separately for every
	## series in 'x'.  Trailing observations that do not fill a complete
	## block are ignored.  'ts.eps' is the tolerance used when deciding
	## whether frequencies and their ratio are whole numbers; '...' goes to
	## FUN.  Returns 'x' (as a ts) unchanged when the frequency is not
	## changed, and stops when the old frequency is not a multiple of the
	## new one.
	aggregate.ts <- function(x, nfrequency = 1, FUN = sum, ndeltat = 1,
	                         ts.eps = getOption("ts.eps"), ...)
	{
	    x <- as.ts(x)
	    old_freq <- tsp(x)[3L]
	    ## Resolve FUN early so a non-function of the same name cannot mask it.
	    FUN <- match.fun(FUN)

	    ## Work out the target frequency; snap it to an integer when it is
	    ## above 1 and within ts.eps of one.
	    if (missing(nfrequency)) {
	        nfrequency <- 1 / ndeltat
	    }
	    snap <- (nfrequency > 1) &&
	        (abs(nfrequency - round(nfrequency)) < ts.eps)
	    if (snap) {
	        nfrequency <- round(nfrequency)
	    }

	    if (nfrequency == old_freq) {
	        return(x)
	    }

	    ratio <- old_freq / nfrequency
	    if (abs(ratio - round(ratio)) > ts.eps) {
	        stop(gettextf("cannot change frequency from %g to %g",
	                      old_freq, nfrequency), domain = NA)
	    }

	    ## Block length; adding ts.eps before truncating guards against
	    ## ratios such as 1.0 / 0.2 falling just below the integer, see
	    ## https://stat.ethz.ch/pipermail/r-devel/2010-April/057225.html
	    block_len <- trunc(ratio + ts.eps)

	    was_matrix <- is.matrix(x)
	    col_names <- if (was_matrix) colnames(x)   # only used for matrices
	    ## tsp() rather than start(): start() may return a vector of length 2.
	    new_start <- tsp(x)[1L]

	    ## Reshape the complete blocks into a (block, block index, series)
	    ## array and summarise over the first dimension.
	    values <- as.matrix(x)
	    n_used <- floor(nrow(values) / block_len) * block_len
	    blocks <- array(c(values[1 : n_used, ]),
	                    dim = c(block_len, n_used / block_len, ncol(values)))
	    agg <- apply(blocks, MARGIN = c(2L, 3L), FUN = FUN, ...)

	    if (was_matrix) {
	        colnames(agg) <- col_names
	    } else {
	        agg <- as.vector(agg)
	    }
	    ts(agg, start = new_start, frequency = nfrequency)
	}