| % File src/library/stats/man/reshape.Rd |
| % Part of the R package, https://www.R-project.org |
| % Copyright 1995-2012 R Core Team |
| % Distributed under GPL 2 or later |
| |
| \name{reshape} |
| \alias{reshape} |
| \title{Reshape Grouped Data} |
| \description{ |
| This function reshapes a data frame between \sQuote{wide} format with |
| repeated measurements in separate columns of the same record and |
| \sQuote{long} format with the repeated measurements in separate |
| records. |
| } |
| \usage{ |
| reshape(data, varying = NULL, v.names = NULL, timevar = "time", |
| idvar = "id", ids = 1:NROW(data), |
| times = seq_along(varying[[1]]), |
| drop = NULL, direction, new.row.names = NULL, |
| sep = ".", |
| split = if (sep == "") { |
| list(regexp = "[A-Za-z][0-9]", include = TRUE) |
| } else { |
| list(regexp = sep, include = FALSE, fixed = TRUE)} |
| ) |
| |
| } |
| \arguments{ |
| \item{data}{a data frame} |
| \item{varying}{names of sets of variables in the wide format that |
| correspond to single variables in long format |
| (\sQuote{time-varying}). This is canonically a list of vectors of |
| variable names, but it can optionally be a matrix of names, or a |
| single vector of names. In each case, the names can be replaced by |
| indices which are interpreted as referring to \code{names(data)}. |
| See \sQuote{Details} for more details and options.} |
| \item{v.names}{names of variables in the long format that correspond |
| to multiple variables in the wide format. See \sQuote{Details}.} |
| \item{timevar}{the variable in long format that differentiates multiple |
| records from the same group or individual. When reshaping to wide |
| format, if more than one record matches the same combination of |
| \code{idvar} and \code{timevar} values, the first will be taken |
| (with a warning).} |
| \item{idvar}{Names of one or more variables in long format that |
| identify multiple records from the same group/individual. These |
| variables may also be present in wide format.} |
| \item{ids}{the values to use for a newly created \code{idvar} |
| variable in long format.} |
| \item{times}{the values to use for a newly created \code{timevar} |
| variable in long format. See \sQuote{Details}.} |
| \item{drop}{a vector of names of variables to drop before reshaping.} |
| \item{direction}{character string, partially matched to either |
| \code{"wide"} to reshape to wide format, or \code{"long"} to reshape |
| to long format.} |
| \item{new.row.names}{character or \code{NULL}: a non-null value will be |
| used for the row names of the result.} |
| \item{sep}{A character vector of length 1, indicating a separating |
| character in the variable names in the wide format. This is used for |
| guessing \code{v.names} and \code{times} arguments based on the |
| names in \code{varying}. If \code{sep == ""}, the split is just before |
| the first numeral that follows an alphabetic character. This is |
| also used to create variable names when reshaping to wide format.} |
| \item{split}{A list with three components, \code{regexp}, |
| \code{include}, and (optionally) \code{fixed}. This allows an |
| extended interface to variable name splitting. See \sQuote{Details}.} |
| } |
| \details{ |
| The arguments to this function are described in terms of longitudinal |
| data, as that is the application motivating the function. A |
| \sQuote{wide} longitudinal dataset will have one record for each |
| individual with some time-constant variables that occupy single |
| columns and some time-varying variables that occupy a column for each |
| time point. In \sQuote{long} format there will be multiple records |
| for each individual, with some variables being constant across these |
| records and others varying across the records. A \sQuote{long} format |
| dataset also needs a \sQuote{time} variable identifying which time |
| point each record comes from and an \sQuote{id} variable showing which |
| records refer to the same person. |
| |
| If the data frame \code{a} resulted from a previous \code{reshape} then |
| the operation can be reversed simply by \code{reshape(a)}. The |
| \code{direction} argument is optional and the other arguments are |
| stored as attributes on the data frame. |
| |
| If \code{direction = "wide"} and no \code{varying} or \code{v.names} |
| arguments are supplied it is assumed that all variables except |
| \code{idvar} and \code{timevar} are time-varying. They are all |
| expanded into multiple variables in wide format. |
| |
| If \code{direction = "long"} the \code{varying} argument can be a vector |
| of column names (or a corresponding index). The function will attempt |
| to guess the \code{v.names} and \code{times} from these names. The |
| default is variable names like \code{x.1}, \code{x.2}, where |
| \code{sep = "."} specifies to split at the dot and drop it from the |
| name. To have alphabetic followed by numeric times use \code{sep = ""}. |
| |
| Variable name splitting as described above is only attempted in the |
| case where \code{varying} is an atomic vector; if it is a list or a |
| matrix, \code{v.names} and \code{times} will generally need to be |
| specified, although they will default to, respectively, the first |
| variable name in each set, and sequential times. |
| |
| Also, guessing is not attempted if \code{v.names} is given |
| explicitly. Notice that the order of variables in \code{varying} is |
| like \code{x.1}, \code{y.1}, \code{x.2}, \code{y.2}. |
| |
| The \code{split} argument should not usually be necessary. The |
| \code{split$regexp} component is passed to either |
| \code{\link{strsplit}} or \code{\link{regexpr}}, where the latter is |
| used if \code{split$include} is \code{TRUE}, in which case the |
| splitting occurs after the first character of the matched string. In |
| the \code{\link{strsplit}} case, the separator is not included in the |
| result, and it is possible to specify fixed-string matching using |
| \code{split$fixed}. |
| } |
| \value{ |
| The reshaped data frame with added attributes to simplify reshaping |
| back to the original form. |
| } |
| \seealso{\code{\link{stack}}, \code{\link{aperm}}; |
| \code{\link{relist}} for reshaping the result of \code{\link{unlist}}. |
| } |
| \examples{ |
| summary(Indometh) |
| wide <- reshape(Indometh, v.names = "conc", idvar = "Subject", |
| timevar = "time", direction = "wide") |
| wide |
| |
| reshape(wide, direction = "long") |
| reshape(wide, idvar = "Subject", varying = list(2:12), |
| v.names = "conc", direction = "long") |
| |
| ## times need not be numeric |
| df <- data.frame(id = rep(1:4, rep(2,4)), |
| visit = I(rep(c("Before","After"), 4)), |
| x = rnorm(4), y = runif(4)) |
| df |
| reshape(df, timevar = "visit", idvar = "id", direction = "wide") |
| ## warns that y is really varying |
| reshape(df, timevar = "visit", idvar = "id", direction = "wide", v.names = "x") |
| |
| |
| ## unbalanced 'long' data leads to NA fill in 'wide' form |
| df2 <- df[1:7, ] |
| df2 |
| reshape(df2, timevar = "visit", idvar = "id", direction = "wide") |
| |
| ## Alternative regular expressions for guessing names |
| df3 <- data.frame(id = 1:4, age = c(40,50,60,50), dose1 = c(1,2,1,2), |
| dose2 = c(2,1,2,1), dose4 = c(3,3,3,3)) |
| reshape(df3, direction = "long", varying = 3:5, sep = "") |
| |
| |
| ## an example that isn't longitudinal data |
| state.x77 <- as.data.frame(state.x77) |
| long <- reshape(state.x77, idvar = "state", ids = row.names(state.x77), |
| times = names(state.x77), timevar = "Characteristic", |
| varying = list(names(state.x77)), direction = "long") |
| |
| reshape(long, direction = "wide") |
| |
| reshape(long, direction = "wide", new.row.names = unique(long$state)) |
| |
| ## multiple id variables |
| df3 <- data.frame(school = rep(1:3, each = 4), class = rep(9:10, 6), |
| time = rep(c(1,1,2,2), 3), score = rnorm(12)) |
| wide <- reshape(df3, idvar = c("school","class"), direction = "wide") |
| wide |
| ## transform back |
| reshape(wide) |
| |
| } |
| \keyword{manip} |