| % File src/library/stats/man/dist.Rd |
| % Part of the R package, https://www.R-project.org |
| % Copyright 1995-2017 R Core Team |
| % Distributed under GPL 2 or later |
| |
| \name{dist} |
| \alias{dist} |
| \alias{print.dist} |
| \alias{format.dist} |
| \alias{labels.dist} |
| \alias{as.matrix.dist} |
| \alias{as.dist} |
| \alias{as.dist.default} |
| \concept{dissimilarity} |
| \title{Distance Matrix Computation} |
| \description{ |
| This function computes and returns the distance matrix obtained by |
| applying the specified distance measure to all pairs of rows of a |
| data matrix. |
| } |
| \usage{ |
| dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2) |
| |
| as.dist(m, diag = FALSE, upper = FALSE) |
| \method{as.dist}{default}(m, diag = FALSE, upper = FALSE) |
| |
| \method{print}{dist}(x, diag = NULL, upper = NULL, |
| digits = getOption("digits"), justify = "none", |
| right = TRUE, \dots) |
| |
| \method{as.matrix}{dist}(x, \dots) |
| } |
| \arguments{ |
| \item{x}{a numeric matrix, data frame or \code{"dist"} object.} |
| \item{method}{the distance measure to be used. This must be one of |
| \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"}, |
| \code{"canberra"}, \code{"binary"} or \code{"minkowski"}. |
| Any unambiguous substring can be given.} |
| \item{diag}{logical value indicating whether the diagonal of the |
| distance matrix should be printed by \code{print.dist}.} |
| \item{upper}{logical value indicating whether the upper triangle of the |
| distance matrix should be printed by \code{print.dist}.} |
| \item{p}{The power of the Minkowski distance.} |
| \item{m}{An object with distance information to be converted to a |
| \code{"dist"} object. For the default method, a \code{"dist"} |
| object, or a matrix (of distances) or an object which can be coerced |
| to such a matrix using \code{\link{as.matrix}()}. (Only the lower |
| triangle of the matrix is used; the rest is ignored.)} |
| \item{digits, justify}{passed to \code{\link{format}} inside of |
| \code{print()}.} |
| \item{right, \dots}{further arguments, passed to other methods.} |
| } |
| \details{ |
| Available distance measures are (written for two vectors \eqn{x} and |
| \eqn{y}): |
| \describe{ |
| \item{\code{euclidean}:}{Usual distance between the two vectors (2 |
| norm aka \eqn{L_2}), \eqn{\sqrt{\sum_i (x_i - y_i)^2}}{sqrt(sum((x_i - y_i)^2))}.} |
| |
| \item{\code{maximum}:}{Maximum distance between two components of \eqn{x} |
| and \eqn{y} (supremum norm), \eqn{\max_i |x_i - y_i|}{max(|x_i - y_i|)}.} |
| |
| \item{\code{manhattan}:}{Absolute distance between the two vectors (1 norm aka \eqn{L_1}), |
| \eqn{\sum_i |x_i - y_i|}{sum(|x_i - y_i|)}.} |
| |
| \item{\code{canberra}:}{ |
| %% till 2017-07-15: \eqn{\sum_i |x_i - y_i| / |x_i + y_i|}{sum(|x_i - y_i| / |x_i + y_i|)}. |
| \eqn{\sum_i |x_i - y_i| / (|x_i| + |y_i|)}{sum(|x_i - y_i| / (|x_i| + |y_i|))}. |
| Terms with zero numerator and denominator are omitted from the sum |
| and treated as if the values were missing. |
| |
| This is intended for non-negative values (e.g., counts), in which |
| case the denominator can be written in various equivalent ways. |
| Originally, \R used \eqn{x_i + y_i}; then, from 1998 to 2017, |
| \eqn{|x_i + y_i|}; and since then the correct \eqn{|x_i| + |y_i|}. |
| } |
| |
| \item{\code{binary}:}{(aka \emph{asymmetric binary}): The vectors |
| are regarded as binary bits, so non-zero elements are \sQuote{on} |
| and zero elements are \sQuote{off}. The distance is the |
| \emph{proportion} of bits in which only one is on amongst those in |
| which at least one is on.} |
| |
| \item{\code{minkowski}:}{The \eqn{p} norm, the \eqn{p}th root of the |
| sum of the \eqn{p}th powers of the absolute differences of the components, |
| \eqn{(\sum_i |x_i - y_i|^p)^{1/p}}{(sum(|x_i - y_i|^p))^(1/p)}.} |
| } |
| |
| Missing values are allowed, and are excluded from all computations |
| involving the rows within which they occur. |
| Further, when \code{Inf} values are involved, all pairs of values are |
| excluded when their contribution to the distance gives \code{NaN} or |
| \code{NA}. |
| If some columns are excluded in calculating a Euclidean, Manhattan, |
| Canberra or Minkowski distance, the sum is scaled up proportionally to |
| the number of columns used. If all pairs are excluded when |
| calculating a particular distance, the value is \code{NA}. |
| |
| \code{as.dist()} and the \code{"dist"} method of \code{as.matrix()} |
| can be used for conversion between objects of class \code{"dist"} |
| and conventional distance matrices. |
| |
| \code{as.dist()} is a generic function. Its default method handles |
| objects inheriting from class \code{"dist"}, or coercible to matrices |
| using \code{\link{as.matrix}()}. Support for classes representing |
| distances (also known as dissimilarities) can be added by providing an |
| \code{\link{as.matrix}()} or, more directly, an \code{as.dist} method |
| for such a class. |
| } |
| \value{ |
| \code{dist} returns an object of class \code{"dist"}. |
| |
| The object holds the lower triangle of the distance matrix, stored by |
| columns in a vector, say \code{do}. If \code{n} is the number of |
| observations, i.e., \code{n <- attr(do, "Size")}, then |
| for \eqn{i < j \le n}, the dissimilarity between (row) \eqn{i} and |
| \eqn{j} is \code{do[n*(i-1) - i*(i-1)/2 + j-i]}. |
| The length of the vector is \eqn{n*(n-1)/2}, i.e., of order \eqn{n^2}. |
| |
| The object has the following attributes (besides \code{"class"} equal |
| to \code{"dist"}): |
| \item{Size}{integer, the number of observations in the dataset.} |
| \item{Labels}{optionally, contains the labels, if any, of the |
| observations of the dataset.} |
| \item{Diag, Upper}{logicals corresponding to the arguments \code{diag} |
| and \code{upper} above, specifying how the object should be printed.} |
| \item{call}{optionally, the \code{\link{call}} used to create the |
| object.} |
| \item{method}{optionally, the distance method used; resulting from |
| \code{\link{dist}()}, the (\code{\link{match.arg}()}ed) \code{method} |
| argument.} |
| } |
| \references{ |
| Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) |
| \emph{The New S Language}. |
| Wadsworth & Brooks/Cole. |
| |
| Mardia, K. V., Kent, J. T. and Bibby, J. M. (1979) |
| \emph{Multivariate Analysis.} Academic Press. |
| |
| Borg, I. and Groenen, P. (1997) |
| \emph{Modern Multidimensional Scaling. Theory and Applications.} |
| Springer. |
| } |
| \seealso{ |
| \code{\link[cluster]{daisy}} in the \CRANpkg{cluster} package with more |
| possibilities in the case of \emph{mixed} (continuous / categorical) |
| variables. |
| \code{\link{hclust}}. |
| } |
| \examples{ |
| require(graphics) |
| |
| x <- matrix(rnorm(100), nrow = 5) |
| dist(x) |
| dist(x, diag = TRUE) |
| dist(x, upper = TRUE) |
| m <- as.matrix(dist(x)) |
| d <- as.dist(m) |
| stopifnot(d == dist(x)) |
| |
| ## Use correlations between variables "as distance" |
| dd <- as.dist((1 - cor(USJudgeRatings))/2) |
| round(1000 * dd) # (prints more nicely) |
| plot(hclust(dd)) # to see a dendrogram of clustered variables |
| |
| ## example of binary and canberra distances. |
| x <- c(0, 0, 1, 1, 1, 1) |
| y <- c(1, 0, 1, 1, 0, 1) |
| dist(rbind(x, y), method = "binary") |
| ## answer 0.4 = 2/5 |
| dist(rbind(x, y), method = "canberra") |
| ## answer 2 * (6/5) |
| |
| ## To find the names |
| labels(eurodist) |
| |
| ## Examples involving "Inf" : |
| ## 1) |
| x[6] <- Inf |
| (m2 <- rbind(x, y)) |
| dist(m2, method = "binary") # warning, answer 0.5 = 2/4 |
| ## These all give "Inf": |
| stopifnot(Inf == dist(m2, method = "euclidean"), |
| Inf == dist(m2, method = "maximum"), |
| Inf == dist(m2, method = "manhattan")) |
| ## "Inf" is same as very large number: |
| x1 <- x; x1[6] <- 1e100 |
| stopifnot(dist(cbind(x, y), method = "canberra") == |
| print(dist(cbind(x1, y), method = "canberra"))) |
| |
| ## 2) |
| y[6] <- Inf #-> 6-th pair is excluded |
| dist(rbind(x, y), method = "binary" ) # warning; 0.5 |
| dist(rbind(x, y), method = "canberra" ) # 3 |
| dist(rbind(x, y), method = "maximum") # 1 |
| dist(rbind(x, y), method = "manhattan") # 2.4 |
| } |
| \keyword{multivariate} |
| \keyword{cluster} |