| % File src/library/base/man/cut.Rd |
| % Part of the R package, https://www.R-project.org |
| % Copyright 1995-2014 R Core Team |
| % Distributed under GPL 2 or later |
| |
| \name{cut} |
| \title{Convert Numeric to Factor} |
| \usage{ |
| cut(x, \dots) |
| |
| \method{cut}{default}(x, breaks, labels = NULL, |
| include.lowest = FALSE, right = TRUE, dig.lab = 3, |
| ordered_result = FALSE, \dots) |
| } |
| \alias{cut} |
| \alias{cut.default} |
| \arguments{ |
| \item{x}{a numeric vector which is to be converted to a factor by cutting.} |
| \item{breaks}{either a numeric vector of two or more unique cut points or a |
| single number (greater than or equal to 2) giving the number of |
| intervals into which \code{x} is to be cut.} |
| \item{labels}{labels for the levels of the resulting category. By default, |
| labels are constructed using \code{"(a,b]"} interval notation. If |
| \code{labels = FALSE}, simple integer codes are returned instead of |
| a factor.} |
| \item{include.lowest}{logical, indicating if an \sQuote{x[i]} equal to |
| the lowest (or highest, for \code{right = FALSE}) \sQuote{breaks} |
| value should be included.} |
| \item{right}{logical, indicating if the intervals should be closed on |
| the right (and open on the left) or vice versa.} |
| \item{dig.lab}{integer which is used when labels are not given. It |
| determines the number of digits used in formatting the break numbers.} |
| \item{ordered_result}{logical: should the result be an ordered factor?} |
| \item{\dots}{further arguments passed to or from other methods.} |
| } |
| \description{ |
| \code{cut} divides the range of \code{x} into intervals |
| and codes the values in \code{x} according to the |
| interval into which they fall. The leftmost interval corresponds to level one, |
| the next leftmost to level two and so on. |
| } |
| \details{ |
| When \code{breaks} is specified as a single number, the range of the |
| data is divided into \code{breaks} pieces of equal length, and then |
| the outer limits are moved away by 0.1\% of the range to ensure that |
| the extreme values both fall within the break intervals. (If \code{x} |
| is a constant vector, equal-length intervals are created, one of |
| which includes the single value.) |
| |
| If a \code{labels} parameter is specified, its values are used to name |
| the factor levels. If none is specified, the factor level labels are |
| constructed as \code{"(b1, b2]"}, \code{"(b2, b3]"} etc. for |
| \code{right = TRUE} and as \code{"[b1, b2)"}, \ldots if \code{right = |
| FALSE}. |
| In this case, \code{dig.lab} indicates the minimum number of digits that |
| should be used in formatting the numbers \code{b1}, \code{b2}, \ldots. |
| A larger value (up to 12) will be used if needed to distinguish |
| between any pair of endpoints: if this fails labels such as |
| \code{"Range3"} will be used. Formatting is done by |
| \code{\link{formatC}}. |
| |
| The default method will sort a numeric vector of \code{breaks}, but |
| other methods are not required to, and \code{labels} will correspond to |
| the intervals after sorting. |
| |
| As from \R 3.2.0, \code{getOption("OutDec")} is consulted when labels |
| are constructed for \code{labels = NULL}. |
| } |
| \value{ |
| A \code{\link{factor}} is returned, unless \code{labels = FALSE} which |
| results in an integer vector of level codes. |
| |
| Values which fall outside the range of \code{breaks} are coded as |
| \code{NA}, as are \code{NaN} and \code{NA} values. |
| } |
| \note{ |
| Instead of \code{table(cut(x, br))}, \code{hist(x, br, plot = FALSE)} is |
| more efficient and less memory hungry. Instead of \code{cut(*, |
| labels = FALSE)}, \code{\link{findInterval}()} is more efficient. |
| } |
| \references{ |
| Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) |
| \emph{The New S Language}. |
| Wadsworth & Brooks/Cole. |
| } |
| \seealso{ |
| \code{\link{split}} for splitting a variable according to a group factor; |
| \code{\link{factor}}, \code{\link{tabulate}}, \code{\link{table}}, |
| \code{\link{findInterval}}. |
| |
| \code{\link{quantile}} for ways of choosing breaks of roughly equal |
| content (rather than length). |
| |
| \code{\link{.bincode}} for a bare-bones version. |
| } |
| \examples{ |
| Z <- stats::rnorm(10000) |
| table(cut(Z, breaks = -6:6)) |
| sum(table(cut(Z, breaks = -6:6, labels = FALSE))) |
| sum(graphics::hist(Z, breaks = -6:6, plot = FALSE)$counts) |
| |
| cut(rep(1,5), 4) #-- dummy |
| tx0 <- c(9, 4, 6, 5, 3, 10, 5, 3, 5) |
| x <- rep(0:8, tx0) |
| stopifnot(table(x) == tx0) |
| |
| table( cut(x, b = 8)) |
| table( cut(x, breaks = 3*(-2:5))) |
| table( cut(x, breaks = 3*(-2:5), right = FALSE)) |
| |
| ##--- some values OUTSIDE the breaks : |
| table(cx <- cut(x, breaks = 2*(0:4))) |
| table(cxl <- cut(x, breaks = 2*(0:4), right = FALSE)) |
| which(is.na(cx)); x[is.na(cx)] #-- the first 9 values 0 |
| which(is.na(cxl)); x[is.na(cxl)] #-- the last 5 values 8 |
| |
| |
| ## Label construction: |
| y <- stats::rnorm(100) |
| table(cut(y, breaks = pi/3*(-3:3))) |
| table(cut(y, breaks = pi/3*(-3:3), dig.lab = 4)) |
| |
| table(cut(y, breaks = 1*(-3:3), dig.lab = 4)) |
| # extra digits don't "harm" here |
| table(cut(y, breaks = 1*(-3:3), right = FALSE)) |
| #- the same counts, since no value of y is exactly an integer |
| |
| ## sometimes the default dig.lab is not enough to avoid confusion: |
| aaa <- c(1,2,3,4,5,2,3,4,5,6,7) |
| cut(aaa, 3) |
| cut(aaa, 3, dig.lab = 4, ordered = TRUE) |
| |
| ## one way to extract the breakpoints |
| labs <- levels(cut(aaa, 3)) |
| cbind(lower = as.numeric( sub("\\\\((.+),.*", "\\\\1", labs) ), |
| upper = as.numeric( sub("[^,]*,([^]]*)\\\\]", "\\\\1", labs) )) |
| } |
| \keyword{category} |