blob: 390dac132bbd8d76f9fbf3209ef5c7226944ea40 [file] [log] [blame]
% File src/library/stats/man/dist.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2017 R Core Team
% Distributed under GPL 2 or later
\name{dist}
\alias{dist}
\alias{print.dist}
\alias{format.dist}
\alias{labels.dist}
\alias{as.matrix.dist}
\alias{as.dist}
\alias{as.dist.default}
\concept{dissimilarity}
\title{Distance Matrix Computation}
\description{
This function computes and returns the distance matrix obtained by
applying the specified distance measure to all pairs of rows of a
data matrix.
}
\usage{
dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
as.dist(m, diag = FALSE, upper = FALSE)
\method{as.dist}{default}(m, diag = FALSE, upper = FALSE)
\method{print}{dist}(x, diag = NULL, upper = NULL,
digits = getOption("digits"), justify = "none",
right = TRUE, \dots)
\method{as.matrix}{dist}(x, \dots)
}
\arguments{
\item{x}{a numeric matrix, data frame or \code{"dist"} object.}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"} or \code{"minkowski"}.
Any unambiguous substring can be given.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
\item{upper}{logical value indicating whether the upper triangle of the
distance matrix should be printed by \code{print.dist}.}
\item{p}{The power of the Minkowski distance.}
\item{m}{An object with distance information to be converted to a
\code{"dist"} object. For the default method, a \code{"dist"}
object, or a matrix (of distances) or an object which can be coerced
to such a matrix using \code{\link{as.matrix}()}. (Only the lower
triangle of the matrix is used; the rest is ignored.)}
\item{digits, justify}{passed to \code{\link{format}} inside of
\code{print()}.}
\item{right, \dots}{further arguments, passed to other methods.}
}
\details{
Available distance measures are (written for two vectors \eqn{x} and
\eqn{y}):
\describe{
\item{\code{euclidean}:}{Usual distance between the two vectors (2
norm aka \eqn{L_2}), \eqn{\sqrt{\sum_i (x_i - y_i)^2}}{sqrt(sum((x_i - y_i)^2))}.}
\item{\code{maximum}:}{Maximum distance between two components of \eqn{x}
and \eqn{y} (supremum norm).}
\item{\code{manhattan}:}{Absolute distance between the two vectors (1 norm aka \eqn{L_1}).}
\item{\code{canberra}:}{
%% till 2017-07-15: \eqn{\sum_i |x_i - y_i| / |x_i + y_i|}{sum(|x_i - y_i| / |x_i + y_i|)}.
\eqn{\sum_i |x_i - y_i| / (|x_i| + |y_i|)}{sum(|x_i - y_i| / (|x_i| + |y_i|))}.
Terms with zero numerator and denominator are omitted from the sum
and treated as if the values were missing.
This is intended for non-negative values (e.g., counts), in which
case the denominator can be written in various equivalent ways.
Originally, \R used \eqn{x_i + y_i}, then from 1998 to 2017,
\eqn{|x_i + y_i|}, and then the correct \eqn{|x_i| + |y_i|}.
}
\item{\code{binary}:}{(aka \emph{asymmetric binary}): The vectors
are regarded as binary bits, so non-zero elements are \sQuote{on}
and zero elements are \sQuote{off}. The distance is the
\emph{proportion} of bits in which only one is on amongst those in
which at least one is on.}
\item{\code{minkowski}:}{The \eqn{p} norm, the \eqn{p}th root of the
sum of the \eqn{p}th powers of the absolute differences of the components.}
}
Missing values are allowed, and are excluded from all computations
involving the rows within which they occur.
Further, when \code{Inf} values are involved, all pairs of values are
excluded when their contribution to the distance gave \code{NaN} or
\code{NA}.
If some columns are excluded in calculating a Euclidean, Manhattan,
Canberra or Minkowski distance, the sum is scaled up proportionally to
the number of columns used. If all pairs are excluded when
calculating a particular distance, the value is \code{NA}.
The \code{"dist"} method of \code{as.matrix()} and \code{as.dist()}
can be used for conversion between objects of class \code{"dist"}
and conventional distance matrices.
\code{as.dist()} is a generic function. Its default method handles
objects inheriting from class \code{"dist"}, or coercible to matrices
using \code{\link{as.matrix}()}. Support for classes representing
distances (also known as dissimilarities) can be added by providing an
\code{\link{as.matrix}()} or, more directly, an \code{as.dist} method
for such a class.
}
\value{
\code{dist} returns an object of class \code{"dist"}.
Such an object holds the lower triangle of the distance matrix, stored
by columns in a vector, say \code{do}. If \code{n} is the number of
observations, i.e., \code{n <- attr(do, "Size")}, then
for \eqn{i < j \le n}, the dissimilarity between (row) i and j is
\code{do[n*(i-1) - i*(i-1)/2 + j-i]}.
The length of the vector is \eqn{n*(n-1)/2}, i.e., of order \eqn{n^2}.
The object has the following attributes (besides \code{"class"} equal
to \code{"dist"}):
\item{Size}{integer, the number of observations in the dataset.}
\item{Labels}{optionally, contains the labels, if any, of the
observations of the dataset.}
\item{Diag, Upper}{logicals corresponding to the arguments \code{diag}
and \code{upper} above, specifying how the object should be printed.}
\item{call}{optionally, the \code{\link{call}} used to create the
object.}
\item{method}{optionally, the distance method used; resulting from
\code{\link{dist}()}, the (\code{\link{match.arg}()}ed) \code{method}
argument.}
}
\references{
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
\emph{The New S Language}.
Wadsworth & Brooks/Cole.
Mardia, K. V., Kent, J. T. and Bibby, J. M. (1979)
\emph{Multivariate Analysis.} Academic Press.
Borg, I. and Groenen, P. (1997)
\emph{Modern Multidimensional Scaling. Theory and Applications.}
Springer.
}
\seealso{
\code{\link[cluster]{daisy}} in the \CRANpkg{cluster} package with more
possibilities in the case of \emph{mixed} (continuous / categorical)
variables.
\code{\link{hclust}}.
}
\examples{
require(graphics)
x <- matrix(rnorm(100), nrow = 5)
dist(x)
dist(x, diag = TRUE)
dist(x, upper = TRUE)
m <- as.matrix(dist(x))
d <- as.dist(m)
stopifnot(d == dist(x))
## Use correlations between variables "as distance"
dd <- as.dist((1 - cor(USJudgeRatings))/2)
round(1000 * dd) # (prints more nicely)
plot(hclust(dd)) # to see a dendrogram of clustered variables
## example of binary and canberra distances.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
dist(rbind(x, y), method = "binary")
## answer 0.4 = 2/5
dist(rbind(x, y), method = "canberra")
## answer 2 * (6/5)
## To find the names
labels(eurodist)
## Examples involving "Inf" :
## 1)
x[6] <- Inf
(m2 <- rbind(x, y))
dist(m2, method = "binary") # warning, answer 0.5 = 2/4
## These all give "Inf":
stopifnot(Inf == dist(m2, method = "euclidean"),
Inf == dist(m2, method = "maximum"),
Inf == dist(m2, method = "manhattan"))
## "Inf" is same as very large number:
x1 <- x; x1[6] <- 1e100
stopifnot(dist(cbind(x, y), method = "canberra") ==
print(dist(cbind(x1, y), method = "canberra")))
## 2)
y[6] <- Inf #-> 6-th pair is excluded
dist(rbind(x, y), method = "binary" ) # warning; 0.5
dist(rbind(x, y), method = "canberra" ) # 3
dist(rbind(x, y), method = "maximum") # 1
dist(rbind(x, y), method = "manhattan") # 2.4
}
\keyword{multivariate}
\keyword{cluster}