% src/library/stats/man/reshape.Rd - R - Git at Google

 % File src/library/stats/man/reshape.Rd
 % Part of the R package, https://www.R-project.org
 % Copyright 1995-2012 R Core Team
 % Distributed under GPL 2 or later

 \name{reshape}
 \alias{reshape}
 \title{Reshape Grouped Data}
 \description{
   This function reshapes a data frame between \sQuote{wide} format (with
   repeated measurements in separate columns of the same row) and
   \sQuote{long} format (with the repeated measurements in separate
   rows).
 }
 \usage{
 reshape(data, varying = NULL, v.names = NULL, timevar = "time",
         idvar = "id", ids = 1:NROW(data),
         times = seq_along(varying[[1]]),
         drop = NULL, direction, new.row.names = NULL,
         sep = ".",
         split = if (sep == "") {
             list(regexp = "[A-Za-z][0-9]", include = TRUE)
         } else {
             list(regexp = sep, include = FALSE, fixed = TRUE)}
         )

 ### Typical usage for converting from long to wide format:

 # reshape(data, direction = "wide",
 #         idvar = "___", timevar = "___", # mandatory
 #         v.names = c(___),    # time-varying variables
 #         varying = list(___)) # auto-generated if missing

 ### Typical usage for converting from wide to long format:

 ### If names of wide-format variables are in a 'nice' format

 # reshape(data, direction = "long",
 #         varying = c(___), # vector
 #         sep)              # to help guess 'v.names' and 'times'

 ### To specify long-format variable names explicitly

 # reshape(data, direction = "long",
 #         varying = ___,  # list / matrix / vector (use with care)
 #         v.names = ___,  # vector of variable names in long format
 #         timevar, times, # name / values of constructed time variable
 #         idvar, ids)     # name / values of constructed id variable

 }
 \arguments{
   \item{data}{a data frame}
   \item{varying}{names of sets of variables in the wide format that
     correspond to single variables in long format
     (\sQuote{time-varying}).  This is canonically a list of vectors of
     variable names, but it can optionally be a matrix of names, or a
     single vector of names.  In each case, when \code{direction =
     "long"}, the names can be replaced by indices which are interpreted
     as referring to \code{names(data)}.  See \sQuote{Details} for more
     details and options.}
   \item{v.names}{names of variables in the long format that correspond
     to multiple variables in the wide format.  See \sQuote{Details}.}
   \item{timevar}{the variable in long format that differentiates multiple
     records from the same group or individual.  If more than one record
     from the same group or individual has the same value of this
     variable, the first will be taken (with a warning).}
   \item{idvar}{Names of one or more variables in long format that
     identify multiple records from the same group/individual.  These
     variables may also be present in wide format.}
   \item{ids}{the values to use for a newly created \code{idvar}
     variable in long format.}
   \item{times}{the values to use for a newly created \code{timevar}
     variable in long format.  See \sQuote{Details}.}
   \item{drop}{a vector of names of variables to drop before reshaping.}
   \item{direction}{character string, partially matched to either
     \code{"wide"} to reshape to wide format, or \code{"long"} to reshape
     to long format.}
   \item{new.row.names}{character or \code{NULL}: a non-null value will be
     used for the row names of the result.}
   \item{sep}{A character vector of length 1, indicating a separating
     character in the variable names in the wide format.  This is used for
     guessing \code{v.names} and \code{times} arguments based on the
     names in \code{varying}.  If \code{sep == ""}, the split is just before
     the first numeral that follows an alphabetic character.  This is
     also used to create variable names when reshaping to wide format.}
   \item{split}{A list with three components, \code{regexp},
     \code{include}, and (optionally) \code{fixed}.  This allows an
     extended interface to variable name splitting.  See \sQuote{Details}.}
 }
 \details{
   Although \code{reshape()} can be used in a variety of contexts, the
   motivating application is data from longitudinal studies, and the
   arguments of this function are named and described in those terms. A
   longitudinal study is characterized by repeated measurements of the
   same variable(s), e.g., height and weight, on each unit being studied
   (e.g., individual persons) at different time points (which are assumed
   to be the same for all units). The repeatedly measured variables are
   called time-varying variables. The study may include other variables
   that are measured
   only once for each unit and do not vary with time (e.g., gender and
   race); these are called time-constant variables.

   A \sQuote{wide} format representation of a longitudinal dataset will
   have one record (row) for each unit, typically with some time-constant
   variables that occupy single columns, and some time-varying variables
   that occupy multiple columns (one column for each time point).  A
   \sQuote{long} format representation of the same dataset will have
   multiple records (rows) for each individual, with the time-constant
   variables being constant across these records and the time-varying
   variables varying across the records.  The \sQuote{long} format
   dataset will have two additional variables: a \sQuote{time} variable
   identifying which time point each record comes from, and an
   \sQuote{id} variable showing which records refer to the same unit.

   The type of conversion (long to wide or wide to long) is determined by
   the \code{direction} argument, which is mandatory unless the
   \code{data} argument is the result of a previous call to
   \code{reshape}.  In that case, the operation can be reversed simply
   using \code{reshape(data)} (the other arguments are stored as
   attributes on the data frame).

   Conversion from long to wide format with \code{direction = "wide"} is
   the simpler operation, and is mainly useful in the context of
   multivariate analysis where data is often expected as a wide-format
   matrix. In this case, the time variable \code{timevar} and id variable
   \code{idvar} must be specified. All other variables are assumed to be
   time-varying, unless the time-varying variables are explicitly
   specified via the \code{v.names} argument.  A warning is issued if
   time-constant variables are not actually constant.

   Each time-varying variable is expanded into multiple variables in the
   wide format.  The names of these expanded variables are generated
   automatically, unless they are specified as the \code{varying}
   argument in the form of a list (or matrix) with one component (or row)
   for each time-varying variable. If \code{varying} is a vector of
   names, it is implicitly converted into a matrix, with one row for each
   time-varying variable. Use this option with care if there are multiple
   time-varying variables, as the ordering (by column, the default in the
   \code{\link{matrix}} constructor) may be unintuitive, whereas the
   explicit list or matrix form is unambiguous.

   %% (FIXME: delete?) If 'direction = "wide"' and no \code{varying} or
   %% \code{v.names} arguments are supplied it is assumed that all variables
   %% except \code{idvar} and \code{timevar} are time-varying.

   Conversion from wide to long with \code{direction = "long"} is the
   more common operation as most (univariate) statistical modeling
   functions expect data in the long format. In the simpler case where
   there is only one time-varying variable, the corresponding columns in
   the wide format input can be specified as the \code{varying} argument,
   which can be either a vector of column names or the corresponding
   column indices. The name of the corresponding variable in the long
   format output combining these columns can be optionally specified as
   the \code{v.names} argument, and the name of the time variable as the
   \code{timevar} argument. The values to use as the time values
   corresponding to the different columns in the wide format can be
   specified as the \code{times} argument.  If \code{v.names} is
   unspecified, the function will attempt to guess \code{v.names} and
   \code{times} from \code{varying} (an explicitly specified \code{times}
   argument is ignored in that case).  The default expects variable names
   like \code{x.1}, \code{x.2}, where \code{sep = "."}  specifies to
   split at the dot and drop it from the name.  For names consisting of
   an alphabetic part followed directly by numeric times (e.g.,
   \code{x1}, \code{x2}), use \code{sep = ""}.

   Multiple time-varying variables can be specified in two ways, either
   with \code{varying} as an atomic vector as above, or as a list (or a
   matrix). The first form is useful (and mandatory) if the automatic
   variable name splitting as described above is used; this requires the
   names of all time-varying variables to be suitably formatted in the
   same manner, and \code{v.names} to be unspecified. If \code{varying}
   is a list (with one component for each time-varying variable) or a
   matrix (one row for each time-varying variable), variable name
   splitting is not attempted, and \code{v.names} and \code{times} will
   generally need to be specified, although they will default to,
   respectively, the first variable name in each set, and sequential
   times.

   Also, guessing is not attempted if \code{v.names} is given explicitly,
   even if \code{varying} is an atomic vector. In that case, the number
   of time-varying variables is taken to be the length of \code{v.names},
   and \code{varying} is implicitly converted into a matrix, with one row
   for each time-varying variable. As in the case of long to wide
   conversion, the matrix is filled up by column, so careful attention needs
   to be paid to the order of variable names (or indices) in
   \code{varying}, which is taken to be like \code{x.1}, \code{y.1},
   \code{x.2}, \code{y.2} (i.e., variables corresponding to the same time
   point need to be grouped together).

   The \code{split} argument should not usually be necessary.  The
   \code{split$regexp} component is passed to either
   \code{\link{strsplit}} or \code{\link{regexpr}}, where the latter is
   used if \code{split$include} is \code{TRUE}, in which case the
   splitting occurs after the first character of the matched string.  In
   the \code{\link{strsplit}} case, the separator is not included in the
   result, and it is possible to specify fixed-string matching using
   \code{split$fixed}.
 }
 \value{
   The reshaped data frame with added attributes to simplify reshaping
   back to the original form.
 }
 \seealso{\code{\link{stack}}, \code{\link{aperm}};
   \code{\link{relist}} for reshaping the result of
   \code{\link{unlist}}. \code{\link{xtabs}} and
   \code{\link{as.data.frame.table}} for creating contingency tables and
   converting them back to data frames.
 }
 \examples{
 summary(Indometh) # data in long format

 ## long to wide (direction = "wide") requires idvar and timevar at a minimum
 reshape(Indometh, direction = "wide", idvar = "Subject", timevar = "time")

 ## can also explicitly specify name of combined variable
 wide <- reshape(Indometh, direction = "wide", idvar = "Subject",
                 timevar = "time", v.names = "conc", sep= "_")
 wide

 ## reverse transformation
 reshape(wide, direction = "long")
 reshape(wide, idvar = "Subject", varying = list(2:12),
         v.names = "conc", direction = "long")

 ## times need not be numeric
 df <- data.frame(id = rep(1:4, rep(2,4)),
                  visit = I(rep(c("Before","After"), 4)),
                  x = rnorm(4), y = runif(4))
 df
 reshape(df, timevar = "visit", idvar = "id", direction = "wide")
 ## warns that y is really varying
 reshape(df, timevar = "visit", idvar = "id", direction = "wide", v.names = "x")


 ##  unbalanced 'long' data leads to NA fill in 'wide' form
 df2 <- df[1:7, ]
 df2
 reshape(df2, timevar = "visit", idvar = "id", direction = "wide")

 ## Alternative regular expressions for guessing names
 df3 <- data.frame(id = 1:4, age = c(40,50,60,50), dose1 = c(1,2,1,2),
                   dose2 = c(2,1,2,1), dose4 = c(3,3,3,3))
 reshape(df3, direction = "long", varying = 3:5, sep = "")


 ## an example that isn't longitudinal data
 state.x77 <- as.data.frame(state.x77)
 long <- reshape(state.x77, idvar = "state", ids = row.names(state.x77),
                 times = names(state.x77), timevar = "Characteristic",
                 varying = list(names(state.x77)), direction = "long")

 reshape(long, direction = "wide")

 reshape(long, direction = "wide", new.row.names = unique(long$state))

 ## multiple id variables
 df3 <- data.frame(school = rep(1:3, each = 4), class = rep(9:10, 6),
                   time = rep(c(1,1,2,2), 3), score = rnorm(12))
 wide <- reshape(df3, idvar = c("school", "class"), direction = "wide")
 wide
 ## transform back
 reshape(wide)

 }
 \keyword{manip}
	% File src/library/stats/man/reshape.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 1995-2012 R Core Team
	% Distributed under GPL 2 or later

	\name{reshape}
	\alias{reshape}
	\title{Reshape Grouped Data}
	\description{
	This function reshapes a data frame between \sQuote{wide} format (with
	repeated measurements in separate columns of the same row) and
	\sQuote{long} format (with the repeated measurements in separate
	rows).
	}
	\usage{
	reshape(data, varying = NULL, v.names = NULL, timevar = "time",
	idvar = "id", ids = 1:NROW(data),
	times = seq_along(varying[[1]]),
	drop = NULL, direction, new.row.names = NULL,
	sep = ".",
	split = if (sep == "") {
	list(regexp = "[A-Za-z][0-9]", include = TRUE)
	} else {
	list(regexp = sep, include = FALSE, fixed = TRUE)}
	)

	### Typical usage for converting from long to wide format:

	# reshape(data, direction = "wide",
	# idvar = "___", timevar = "___", # mandatory
	# v.names = c(___), # time-varying variables
	# varying = list(___)) # auto-generated if missing

	### Typical usage for converting from wide to long format:

	### If names of wide-format variables are in a 'nice' format

	# reshape(data, direction = "long",
	# varying = c(___), # vector
	# sep) # to help guess 'v.names' and 'times'

	### To specify long-format variable names explicitly

	# reshape(data, direction = "long",
	# varying = ___, # list / matrix / vector (use with care)
	# v.names = ___, # vector of variable names in long format
	# timevar, times, # name / values of constructed time variable
	# idvar, ids) # name / values of constructed id variable

	}
	\arguments{
	\item{data}{a data frame}
	\item{varying}{names of sets of variables in the wide format that
	correspond to single variables in long format
	(\sQuote{time-varying}). This is canonically a list of vectors of
	variable names, but it can optionally be a matrix of names, or a
	single vector of names. In each case, when \code{direction =
	"long"}, the names can be replaced by indices which are interpreted
	as referring to \code{names(data)}. See \sQuote{Details} for more
	details and options.}
	\item{v.names}{names of variables in the long format that correspond
	to multiple variables in the wide format. See \sQuote{Details}.}
	\item{timevar}{the variable in long format that differentiates multiple
	records from the same group or individual. If more than one record
	from the same group or individual has the same value of this
	variable, the first will be taken (with a warning).}
	\item{idvar}{Names of one or more variables in long format that
	identify multiple records from the same group/individual. These
	variables may also be present in wide format.}
	\item{ids}{the values to use for a newly created \code{idvar}
	variable in long format.}
	\item{times}{the values to use for a newly created \code{timevar}
	variable in long format. See \sQuote{Details}.}
	\item{drop}{a vector of names of variables to drop before reshaping.}
	\item{direction}{character string, partially matched to either
	\code{"wide"} to reshape to wide format, or \code{"long"} to reshape
	to long format.}
	\item{new.row.names}{character or \code{NULL}: a non-null value will be
	used for the row names of the result.}
	\item{sep}{A character vector of length 1, indicating a separating
	character in the variable names in the wide format. This is used for
	guessing \code{v.names} and \code{times} arguments based on the
	names in \code{varying}. If \code{sep == ""}, the split is just before
	the first numeral that follows an alphabetic character. This is
	also used to create variable names when reshaping to wide format.}
	\item{split}{A list with three components, \code{regexp},
	\code{include}, and (optionally) \code{fixed}. This allows an
	extended interface to variable name splitting. See \sQuote{Details}.}
	}
	\details{
	Although \code{reshape()} can be used in a variety of contexts, the
	motivating application is data from longitudinal studies, and the
	arguments of this function are named and described in those terms. A
	longitudinal study is characterized by repeated measurements of the
	same variable(s), e.g., height and weight, on each unit being studied
	(e.g., individual persons) at different time points (which are assumed
	to be the same for all units). The repeatedly measured variables are
	called time-varying variables. The study may include other variables
	that are measured
	only once for each unit and do not vary with time (e.g., gender and
	race); these are called time-constant variables.

	A \sQuote{wide} format representation of a longitudinal dataset will
	have one record (row) for each unit, typically with some time-constant
	variables that occupy single columns, and some time-varying variables
	that occupy multiple columns (one column for each time point). A
	\sQuote{long} format representation of the same dataset will have
	multiple records (rows) for each individual, with the time-constant
	variables being constant across these records and the time-varying
	variables varying across the records. The \sQuote{long} format
	dataset will have two additional variables: a \sQuote{time} variable
	identifying which time point each record comes from, and an
	\sQuote{id} variable showing which records refer to the same unit.

	The type of conversion (long to wide or wide to long) is determined by
	the \code{direction} argument, which is mandatory unless the
	\code{data} argument is the result of a previous call to
	\code{reshape}. In that case, the operation can be reversed simply
	using \code{reshape(data)} (the other arguments are stored as
	attributes on the data frame).

	Conversion from long to wide format with \code{direction = "wide"} is
	the simpler operation, and is mainly useful in the context of
	multivariate analysis where data is often expected as a wide-format
	matrix. In this case, the time variable \code{timevar} and id variable
	\code{idvar} must be specified. All other variables are assumed to be
	time-varying, unless the time-varying variables are explicitly
	specified via the \code{v.names} argument. A warning is issued if
	time-constant variables are not actually constant.

	Each time-varying variable is expanded into multiple variables in the
	wide format. The names of these expanded variables are generated
	automatically, unless they are specified as the \code{varying}
	argument in the form of a list (or matrix) with one component (or row)
	for each time-varying variable. If \code{varying} is a vector of
	names, it is implicitly converted into a matrix, with one row for each
	time-varying variable. Use this option with care if there are multiple
	time-varying variables, as the ordering (by column, the default in the
	\code{\link{matrix}} constructor) may be unintuitive, whereas the
	explicit list or matrix form is unambiguous.

	%% (FIXME: delete?) If 'direction = "wide"' and no \code{varying} or
	%% \code{v.names} arguments are supplied it is assumed that all variables
	%% except \code{idvar} and \code{timevar} are time-varying.

	Conversion from wide to long with \code{direction = "long"} is the
	more common operation as most (univariate) statistical modeling
	functions expect data in the long format. In the simpler case where
	there is only one time-varying variable, the corresponding columns in
	the wide format input can be specified as the \code{varying} argument,
	which can be either a vector of column names or the corresponding
	column indices. The name of the corresponding variable in the long
	format output combining these columns can be optionally specified as
	the \code{v.names} argument, and the name of the time variable as the
	\code{timevar} argument. The values to use as the time values
	corresponding to the different columns in the wide format can be
	specified as the \code{times} argument. If \code{v.names} is
	unspecified, the function will attempt to guess \code{v.names} and
	\code{times} from \code{varying} (an explicitly specified \code{times}
	argument is ignored in that case). The default expects variable names
	like \code{x.1}, \code{x.2}, where \code{sep = "."} specifies to
	split at the dot and drop it from the name. For names consisting of
	an alphabetic part followed directly by numeric times (e.g.,
	\code{x1}, \code{x2}), use \code{sep = ""}.

	Multiple time-varying variables can be specified in two ways, either
	with \code{varying} as an atomic vector as above, or as a list (or a
	matrix). The first form is useful (and mandatory) if the automatic
	variable name splitting as described above is used; this requires the
	names of all time-varying variables to be suitably formatted in the
	same manner, and \code{v.names} to be unspecified. If \code{varying}
	is a list (with one component for each time-varying variable) or a
	matrix (one row for each time-varying variable), variable name
	splitting is not attempted, and \code{v.names} and \code{times} will
	generally need to be specified, although they will default to,
	respectively, the first variable name in each set, and sequential
	times.

	Also, guessing is not attempted if \code{v.names} is given explicitly,
	even if \code{varying} is an atomic vector. In that case, the number
	of time-varying variables is taken to be the length of \code{v.names},
	and \code{varying} is implicitly converted into a matrix, with one row
	for each time-varying variable. As in the case of long to wide
	conversion, the matrix is filled up by column, so careful attention needs
	to be paid to the order of variable names (or indices) in
	\code{varying}, which is taken to be like \code{x.1}, \code{y.1},
	\code{x.2}, \code{y.2} (i.e., variables corresponding to the same time
	point need to be grouped together).

	The \code{split} argument should not usually be necessary. The
	\code{split$regexp} component is passed to either
	\code{\link{strsplit}} or \code{\link{regexpr}}, where the latter is
	used if \code{split$include} is \code{TRUE}, in which case the
	splitting occurs after the first character of the matched string. In
	the \code{\link{strsplit}} case, the separator is not included in the
	result, and it is possible to specify fixed-string matching using
	\code{split$fixed}.
	}
	\value{
	The reshaped data frame with added attributes to simplify reshaping
	back to the original form.
	}
	\seealso{\code{\link{stack}}, \code{\link{aperm}};
	\code{\link{relist}} for reshaping the result of
	\code{\link{unlist}}. \code{\link{xtabs}} and
	\code{\link{as.data.frame.table}} for creating contingency tables and
	converting them back to data frames.
	}
	\examples{
	summary(Indometh) # data in long format

	## long to wide (direction = "wide") requires idvar and timevar at a minimum
	reshape(Indometh, direction = "wide", idvar = "Subject", timevar = "time")

	## can also explicitly specify name of combined variable
	wide <- reshape(Indometh, direction = "wide", idvar = "Subject",
	timevar = "time", v.names = "conc", sep= "_")
	wide

	## reverse transformation
	reshape(wide, direction = "long")
	reshape(wide, idvar = "Subject", varying = list(2:12),
	v.names = "conc", direction = "long")

	## times need not be numeric
	df <- data.frame(id = rep(1:4, rep(2,4)),
	visit = I(rep(c("Before","After"), 4)),
	x = rnorm(4), y = runif(4))
	df
	reshape(df, timevar = "visit", idvar = "id", direction = "wide")
	## warns that y is really varying
	reshape(df, timevar = "visit", idvar = "id", direction = "wide", v.names = "x")


	## unbalanced 'long' data leads to NA fill in 'wide' form
	df2 <- df[1:7, ]
	df2
	reshape(df2, timevar = "visit", idvar = "id", direction = "wide")

	## Alternative regular expressions for guessing names
	df3 <- data.frame(id = 1:4, age = c(40,50,60,50), dose1 = c(1,2,1,2),
	dose2 = c(2,1,2,1), dose4 = c(3,3,3,3))
	reshape(df3, direction = "long", varying = 3:5, sep = "")


	## an example that isn't longitudinal data
	state.x77 <- as.data.frame(state.x77)
	long <- reshape(state.x77, idvar = "state", ids = row.names(state.x77),
	times = names(state.x77), timevar = "Characteristic",
	varying = list(names(state.x77)), direction = "long")

	reshape(long, direction = "wide")

	reshape(long, direction = "wide", new.row.names = unique(long$state))

	## multiple id variables
	df3 <- data.frame(school = rep(1:3, each = 4), class = rep(9:10, 6),
	time = rep(c(1,1,2,2), 3), score = rnorm(12))
	wide <- reshape(df3, idvar = c("school", "class"), direction = "wide")
	wide
	## transform back
	reshape(wide)

	}
	\keyword{manip}