blob: 8acb37bf40ef122cf9846f7763c9a64d0d46af4a [file] [log] [blame]
% File src/library/base/man/cut.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2014 R Core Team
% Distributed under GPL 2 or later
\name{cut}
\title{Convert Numeric to Factor}
\usage{
cut(x, \dots)
\method{cut}{default}(x, breaks, labels = NULL,
include.lowest = FALSE, right = TRUE, dig.lab = 3,
ordered_result = FALSE, \dots)
}
\alias{cut}
\alias{cut.default}
\arguments{
\item{x}{a numeric vector which is to be converted to a factor by cutting.}
\item{breaks}{either a numeric vector of two or more unique cut points or a
single number (greater than or equal to 2) giving the number of
intervals into which \code{x} is to be cut.}
\item{labels}{labels for the levels of the resulting category. By default,
labels are constructed using \code{"(a,b]"} interval notation. If
\code{labels = FALSE}, simple integer codes are returned instead of
a factor.}
\item{include.lowest}{logical, indicating if an \sQuote{x[i]} equal to
the lowest (or highest, for \code{right = FALSE}) \sQuote{breaks}
value should be included.}
\item{right}{logical, indicating if the intervals should be closed on
the right (and open on the left) or vice versa.}
\item{dig.lab}{integer which is used when labels are not given. It
determines the number of digits used in formatting the break numbers.}
\item{ordered_result}{logical: should the result be an ordered factor?}
\item{\dots}{further arguments passed to or from other methods.}
}
\description{
\code{cut} divides the range of \code{x} into intervals
and codes the values in \code{x} according to the
interval into which they fall. The leftmost interval corresponds to level one,
the next leftmost to level two and so on.
}
\details{
When \code{breaks} is specified as a single number, the range of the
data is divided into \code{breaks} pieces of equal length, and then
the outer limits are moved away by 0.1\% of the range to ensure that
the extreme values both fall within the break intervals. (If \code{x}
is a constant vector, equal-length intervals are created, one of
which includes the single value.)
If a \code{labels} parameter is specified, its values are used to name
the factor levels. If none is specified, the factor level labels are
constructed as \code{"(b1, b2]"}, \code{"(b2, b3]"} etc. for
\code{right = TRUE} and as \code{"[b1, b2)"}, \ldots if \code{right =
FALSE}.
In this case, \code{dig.lab} indicates the minimum number of digits
that should be used in formatting the numbers \code{b1}, \code{b2}, \ldots.
A larger value (up to 12) will be used if needed to distinguish
between any pair of endpoints: if this fails, labels such as
\code{"Range3"} will be used. Formatting is done by
\code{\link{formatC}}.
The default method will sort a numeric vector of \code{breaks} (other
methods are not required to), and \code{labels} will correspond to
the intervals after sorting.
As from \R 3.2.0, \code{getOption("OutDec")} is consulted when labels
are constructed for \code{labels = NULL}.
}
\value{
A \code{\link{factor}} is returned, unless \code{labels = FALSE}, which
results in an integer vector of level codes.
Values which fall outside the range of \code{breaks} are coded as
\code{NA}, as are \code{NaN} and \code{NA} values.
}
\note{
Instead of \code{table(cut(x, br))}, \code{hist(x, br, plot = FALSE)} is
more efficient and less memory hungry. Instead of \code{cut(*,
labels = FALSE)}, \code{\link{findInterval}()} is more efficient.
}
\references{
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
\emph{The New S Language}.
Wadsworth & Brooks/Cole.
}
\seealso{
\code{\link{split}} for splitting a variable according to a group factor;
\code{\link{factor}}, \code{\link{tabulate}}, \code{\link{table}},
\code{\link{findInterval}}.
\code{\link{quantile}} for ways of choosing breaks of roughly equal
content (rather than length).
\code{\link{.bincode}} for a bare-bones version.
}
\examples{
## Bin 10000 normal draws at the integer cut points -6, ..., 6.
Z <- stats::rnorm(10000)
table(cut(Z, breaks = -6:6))
## The same binning as integer codes (labels = FALSE), and via hist();
## both lines sum the per-interval counts.
sum(table(cut(Z, breaks = -6:6, labels = FALSE)))
sum(graphics::hist(Z, breaks = -6:6, plot = FALSE)$counts)
## A constant vector still gets equal-length intervals, one holding the value.
cut(rep(1,5), 4) #-- dummy
## x holds the values 0:8, with the i-th value repeated tx0[i] times.
tx0 <- c(9, 4, 6, 5, 3, 10, 5, 3, 5)
x <- rep(0:8, tx0)
stopifnot(table(x) == tx0)
## Here `b` partially matches the `breaks` argument: 8 equal-length intervals.
table( cut(x, b = 8))
## Explicit cut points -6, -3, ..., 15: right-closed, then left-closed intervals.
table( cut(x, breaks = 3*(-2:5)))
table( cut(x, breaks = 3*(-2:5), right = FALSE))
##--- some values OUTSIDE the breaks :
table(cx <- cut(x, breaks = 2*(0:4)))
table(cxl <- cut(x, breaks = 2*(0:4), right = FALSE))
which(is.na(cx)); x[is.na(cx)] #-- the first 9 values 0
which(is.na(cxl)); x[is.na(cxl)] #-- the last 5 values 8
## Label construction:
y <- stats::rnorm(100)
## Non-integer cut points: compare the default dig.lab = 3 with dig.lab = 4.
table(cut(y, breaks = pi/3*(-3:3)))
table(cut(y, breaks = pi/3*(-3:3), dig.lab = 4))
table(cut(y, breaks = 1*(-3:3), dig.lab = 4))
# extra digits don't "harm" here
table(cut(y, breaks = 1*(-3:3), right = FALSE))
#- the same, since no exact INT!
## sometimes the default dig.lab is not enough to avoid confusion:
aaa <- c(1,2,3,4,5,2,3,4,5,6,7)
cut(aaa, 3)
## `ordered` partially matches the `ordered_result` argument.
cut(aaa, 3, dig.lab = 4, ordered = TRUE)
## one way to extract the breakpoints
## (parse the lower and upper endpoints back out of the level labels)
labs <- levels(cut(aaa, 3))
cbind(lower = as.numeric( sub("\\\\((.+),.*", "\\\\1", labs) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\\\]", "\\\\1", labs) ))
}
\keyword{category}