src/library/stats/R/hclust.R - R - Git at Google

 #  File src/library/stats/R/hclust.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2019 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 ## Hierarchical clustering, on raw input data; we will use Euclidean
 ## distance.  A range of criteria are supported; also there is a
 ## storage-economic option.
 ##
 ## We use the general routine, `hc', which caters for 7 criteria,
 ## using a half dissimilarity matrix; (BTW, this uses the very efficient
 ## nearest neighbor chain algorithm, which makes this algorithm of
 ## O(n^2) computational time, and differentiates it from the less
 ## efficient -- i.e. O(n^3) -- implementations in all commercial
 ## statistical packages -- as far as I am aware -- except Clustan.)
 ##
 ## Clustering Methods:
 ##
 ## 1. Ward's minimum variance or error sum of squares method (using raw d) -> "ward.D"
 ## 2. single linkage or nearest neighbor method.
 ## 3. complete linkage or diameter.
 ## 4. average linkage, group average, or UPGMA method.
 ## 5. McQuitty's or WPGMA method.
 ## 6. median, Gower's or WPGMC method.
 ## 7. centroid or UPGMC method (7).
 ## 8. Ward's ... "correct" method using d^2 (in Fortran) -> "ward.D2"
 ##
 ## Original author: F. Murtagh, May 1992
 ## R Modifications: Ross Ihaka, Dec 1996
 ##		    Friedrich Leisch, Apr 1998, Jun 2000
 ## "ward.D" and "ward.D2" from suggestions by Pierre Legendre,
 ## by Martin Maechler, mostly in the Fortran part.

 hclust <- function(d, method="complete", members=NULL)
 {
     ## order of METHODS --> i.meth -> Fortran's  iOpt  codes
     METHODS <- c("ward.D", "single", # 1, 2,
                  "complete", "average", "mcquitty", # 3, 4, 5,
                  "median", "centroid", "ward.D2") # 6, 7, 8
     if(method == "ward") { # do not deprecate earlier than 2015!
 	message("The \"ward\" method has been renamed to \"ward.D\"; note new \"ward.D2\"")
 	method <- "ward.D"
     }
     i.meth <-  pmatch(method, METHODS)
     if(is.na(i.meth))
         ## TODO: use gettextf() [-> translation string change]
 	stop("invalid clustering method", paste("", method))
     if(i.meth == -1)
 	stop("ambiguous clustering method", paste("", method))

     n <- as.integer(attr(d, "Size"))
     if(is.null(n))
 	stop("invalid dissimilarities")
     if(is.na(n) || n > 65536L) stop("size cannot be NA nor exceed 65536")
     if(n < 2)
         stop("must have n >= 2 objects to cluster")
     len <- as.integer(n*(n-1)/2)
     if(length(d) != len)
         (if (length(d) < len) stop else warning
          )("dissimilarities of improper length")

     if(is.null(members))
         members <- rep(1, n)
     else if(length(members) != n)
         stop("invalid length of members")

     storage.mode(d) <- "double"
     hcl <- .Fortran(C_hclust,
 		    n = n,
 		    len = len,
 		    method = as.integer(i.meth),
 		    ia = integer(n),
 		    ib = integer(n),
 		    crit = double(n),
 		    members = as.double(members),
 		    nn = integer(n),
 		    disnn = double(n),
 #		    flag = logical(n), # unused
 		    diss = d)

     ## 2nd step: interpret the information that we now have
     ## as merge, height, and order lists.

     hcass <- .Fortran(C_hcass2,
 		      n = n, # checked above.
 		      ia = hcl$ia,
 		      ib = hcl$ib,
 		      order = integer(n),
 		      iia = integer(n),
 		      iib = integer(n))

     structure(list(merge = cbind(hcass$iia[1L:(n-1)], hcass$iib[1L:(n-1)]),
 		   height = hcl$crit[1L:(n-1)],
 		   order = hcass$order,
 		   labels = attr(d, "Labels"),
 		   method = METHODS[i.meth],
 		   call = match.call(),
 		   dist.method = attr(d, "method")),
 	      class = "hclust")
 }

 ##' @title Check hclust() object for validity
 ##' @param x "hclust" object
 ##' @param merge (= x$merge, passing it may save memory)
 ##' @param order logical indicating if 'x$order' should be checked, too
 ##' @return character vector with message or TRUE
 ##' @author Martin Maechler
 .validity.hclust <- function(x, merge = x$merge, order = TRUE) {
     if (!is.matrix(merge) || ncol(merge) != 2)
 	return("invalid dendrogram")
     ## merge should be integer but might not be after dump/restore.
     if (any(as.integer(merge) != merge))
 	return("'merge' component in dendrogram must be integer")
     n1 <- nrow(merge) # == #{obs} - 1
     n <- n1+1L
     if(length(x$height) != n1) return("'height' is of wrong length")
     if(order && length(x$order ) != n ) return("'order' is of wrong length")
     if(identical(sort(as.integer(merge)), c(-(n:1L), +seq_len(n-2L))))
 	TRUE
     else
 	"'merge' matrix has invalid contents"
 }

 plot.hclust <-
     function (x, labels = NULL, hang = 0.1, check = TRUE,
               axes = TRUE, frame.plot = FALSE, ann = TRUE,
               main = "Cluster Dendrogram",
               sub = NULL, xlab = NULL, ylab = "Height", ...)
 {
     merge <- x$merge
     if(check && !isTRUE(msg <- .validity.hclust(x,merge)))
 	stop(msg)
     storage.mode(merge) <- "integer"
     n1 <- nrow(merge) # == #{obs} - 1
     n <- n1+1L
     height <- as.double(x$height)
     labels <-
 	if(missing(labels) || is.null(labels)) {
 	    as.character(if(is.null(x$labels)) seq_len(n) else x$labels)
 	} else {
 	    if(is.logical(labels) && !labels)# FALSE
 		character(n)
 	    else
 		as.character(labels)
 	}

     dev.hold(); on.exit(dev.flush())
     plot.new()
     graphics:::plotHclust(n1, merge, height, order(x$order), hang, labels, ...)
     if(axes)
         axis(2, at=pretty(range(height)), ...)
     if (frame.plot)
         box(...)
     if (ann) {
         if(!is.null(cl <- x$call) && is.null(sub))
             sub <- paste0(deparse(cl[[1L]])," (*, \"", x$method,"\")")
         if(is.null(xlab) && !is.null(cl))
             xlab <- deparse(cl[[2L]])
         title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...)
     }
     invisible()
 }


 as.hclust <- function(x, ...) UseMethod("as.hclust")
 ## need *.default for idempotency:
 as.hclust.default <- function(x, ...) {
     if(inherits(x, "hclust")) x
     else
 	stop(gettextf("argument 'x' cannot be coerced to class %s",
                       dQuote("hclust")),
              if(!is.null(oldClass(x)))
              gettextf("\n Consider providing an as.hclust.%s() method",
                       oldClass(x)[1L]),
              domain = NA)
 }

 as.hclust.twins <- function(x, ...)
 {
     r <- list(merge = x$merge,
 	      height = sort(x$height),
 	      order = x$order,
 	      labels = if(!is.null(lb <- x$order.lab)) {
                   lb[sort.list(x$order)] } else rownames(x$data),# may be NULL
 	      call = if(!is.null(cl <- x$call)) cl else match.call(),
 	      method = if(!is.null(mt <- x$method)) mt else NA,
 	      dist.method = attr(x$diss, "Metric"))
     class(r) <- "hclust"
     r
 }

 print.hclust <- function(x, ...)
 {
     if(!is.null(x$call))
         cat("\nCall:\n", deparse(x$call), "\n\n", sep = "")
     if(!is.null(x$method))
         cat("Cluster method   :", x$method, "\n")
     if(!is.null(x$dist.method))
         cat("Distance         :", x$dist.method, "\n")
     cat("Number of objects:", length(x$height)+1, "\n")
     cat("\n")
     invisible(x)
 }

 cophenetic <-
 function(x)
     UseMethod("cophenetic")
 cophenetic.default <-
 function(x)
 {
     x <- as.hclust(x)
     nobs <- length(x$order)
     ilist <- vector("list", length = nobs)
     out <- matrix(0, nrow = nobs, ncol = nobs)
     for(i in 1 : (nobs - 1)) {
         inds <- x$merge[i,]
         ids1 <- if(inds[1L] < 0L) -inds[1L] else ilist[[inds[1L]]]
         ids2 <- if(inds[2L] < 0L) -inds[2L] else ilist[[inds[2L]]]
         ilist[[i]] <- c(ids1, ids2)
         out[cbind(rep.int(ids1, rep.int(length(ids2), length(ids1))),
                   rep.int(ids2, length(ids1)))] <- x$height[i]
     }
     rownames(out) <- x$labels
     as.dist(out + t(out))
 }
 cophenetic.dendrogram <-
 function(x)
 {
     ## Obtain cophenetic distances from a dendrogram by recursively
     ## doing the following:
     ## * if not a leaf, then for all children call ourselves, create
     ##   a block diagonal matrix from this, and fill the rest with the
     ##   current height (as everything in different children is joined
     ##   at the current split) ...
     ## * if a leaf, height and result are 0.
     ## Actually, we need to return something of class "dist", so things
     ## are a bit more complicated, and we might be able to make this
     ## more efficient by avoiding matrices ...
     if(is.leaf(x)) {
         ## If there is no label, we cannot recover the (names of the)
         ## objects the distances are for, and hence abort.
         if(is.null(label <- attr(x, "label")))
             stop("need dendrograms where all leaves have labels")
         return(as.dist(matrix(0, dimnames = list(label, label))))
     }
     children <- vector("list", length(x))
     for(i in seq_along(x))
         children[[i]] <- Recall(x[[i]])
     lens <- sapply(children, attr, "Size")
     m <- matrix(attr(x, "height"), sum(lens), sum(lens))
     ## This seems a bit slower:
     ##    inds <- split(seq(length.out = sum(lens)),
     ##                  rep.int(seq_along(lens), lens))
     ##    for(i in seq_along(inds))
     ##         m[inds[[i]], inds[[i]]] <- as.matrix(children[[i]])
     hi <- cumsum(lens)
     lo <- c(0L, hi[-length(hi)]) + 1L
     for(i in seq_along(x))
         m[lo[i] : hi[i], lo[i] : hi[i]] <- as.matrix(children[[i]])
     rownames(m) <- colnames(m) <- unlist(lapply(children, attr, "Labels"))
     as.dist(m)
 }
	# File src/library/stats/R/hclust.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2019 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	## Hierarchical clustering, on raw input data; we will use Euclidean
	## distance. A range of criteria are supported; also there is a
	## storage-economic option.
	##
	## We use the general routine, `hc', which caters for 7 criteria,
	## using a half dissimilarity matrix; (BTW, this uses the very efficient
	## nearest neighbor chain algorithm, which makes this algorithm of
	## O(n^2) computational time, and differentiates it from the less
	## efficient -- i.e. O(n^3) -- implementations in all commercial
	## statistical packages -- as far as I am aware -- except Clustan.)
	##
	## Clustering Methods:
	##
	## 1. Ward's minimum variance or error sum of squares method (using raw d) -> "ward.D"
	## 2. single linkage or nearest neighbor method.
	## 3. complete linkage or diameter.
	## 4. average linkage, group average, or UPGMA method.
	## 5. McQuitty's or WPGMA method.
	## 6. median, Gower's or WPGMC method.
	## 7. centroid or UPGMC method (7).
	## 8. Ward's ... "correct" method using d^2 (in Fortran) -> "ward.D2"
	##
	## Original author: F. Murtagh, May 1992
	## R Modifications: Ross Ihaka, Dec 1996
	## Friedrich Leisch, Apr 1998, Jun 2000
	## "ward.D" and "ward.D2" from suggestions by Pierre Legendre,
	## by Martin Maechler, mostly in the Fortran part.

	hclust <- function(d, method="complete", members=NULL)
	{
	## order of METHODS --> i.meth -> Fortran's iOpt codes
	METHODS <- c("ward.D", "single", # 1, 2,
	"complete", "average", "mcquitty", # 3, 4, 5,
	"median", "centroid", "ward.D2") # 6, 7, 8
	if(method == "ward") { # do not deprecate earlier than 2015!
	message("The \"ward\" method has been renamed to \"ward.D\"; note new \"ward.D2\"")
	method <- "ward.D"
	}
	i.meth <- pmatch(method, METHODS)
	if(is.na(i.meth))
	## TODO: use gettextf() [-> translation string change]
	stop("invalid clustering method", paste("", method))
	if(i.meth == -1)
	stop("ambiguous clustering method", paste("", method))

	n <- as.integer(attr(d, "Size"))
	if(is.null(n))
	stop("invalid dissimilarities")
	if(is.na(n) \|\| n > 65536L) stop("size cannot be NA nor exceed 65536")
	if(n < 2)
	stop("must have n >= 2 objects to cluster")
	len <- as.integer(n*(n-1)/2)
	if(length(d) != len)
	(if (length(d) < len) stop else warning
	)("dissimilarities of improper length")

	if(is.null(members))
	members <- rep(1, n)
	else if(length(members) != n)
	stop("invalid length of members")

	storage.mode(d) <- "double"
	hcl <- .Fortran(C_hclust,
	n = n,
	len = len,
	method = as.integer(i.meth),
	ia = integer(n),
	ib = integer(n),
	crit = double(n),
	members = as.double(members),
	nn = integer(n),
	disnn = double(n),
	# flag = logical(n), # unused
	diss = d)

	## 2nd step: interpret the information that we now have
	## as merge, height, and order lists.

	hcass <- .Fortran(C_hcass2,
	n = n, # checked above.
	ia = hcl$ia,
	ib = hcl$ib,
	order = integer(n),
	iia = integer(n),
	iib = integer(n))

	structure(list(merge = cbind(hcass$iia[1L:(n-1)], hcass$iib[1L:(n-1)]),
	height = hcl$crit[1L:(n-1)],
	order = hcass$order,
	labels = attr(d, "Labels"),
	method = METHODS[i.meth],
	call = match.call(),
	dist.method = attr(d, "method")),
	class = "hclust")
	}

	##' @title Check hclust() object for validity
	##' @param x "hclust" object
	##' @param merge (= x$merge, passing it may save memory)
	##' @param order logical indicating if 'x$order' should be checked, too
	##' @return character vector with message or TRUE
	##' @author Martin Maechler
	.validity.hclust <- function(x, merge = x$merge, order = TRUE) {
	if (!is.matrix(merge) \|\| ncol(merge) != 2)
	return("invalid dendrogram")
	## merge should be integer but might not be after dump/restore.
	if (any(as.integer(merge) != merge))
	return("'merge' component in dendrogram must be integer")
	n1 <- nrow(merge) # == #{obs} - 1
	n <- n1+1L
	if(length(x$height) != n1) return("'height' is of wrong length")
	if(order && length(x$order ) != n ) return("'order' is of wrong length")
	if(identical(sort(as.integer(merge)), c(-(n:1L), +seq_len(n-2L))))
	TRUE
	else
	"'merge' matrix has invalid contents"
	}

	plot.hclust <-
	function (x, labels = NULL, hang = 0.1, check = TRUE,
	axes = TRUE, frame.plot = FALSE, ann = TRUE,
	main = "Cluster Dendrogram",
	sub = NULL, xlab = NULL, ylab = "Height", ...)
	{
	merge <- x$merge
	if(check && !isTRUE(msg <- .validity.hclust(x,merge)))
	stop(msg)
	storage.mode(merge) <- "integer"
	n1 <- nrow(merge) # == #{obs} - 1
	n <- n1+1L
	height <- as.double(x$height)
	labels <-
	if(missing(labels) \|\| is.null(labels)) {
	as.character(if(is.null(x$labels)) seq_len(n) else x$labels)
	} else {
	if(is.logical(labels) && !labels)# FALSE
	character(n)
	else
	as.character(labels)
	}

	dev.hold(); on.exit(dev.flush())
	plot.new()
	graphics:::plotHclust(n1, merge, height, order(x$order), hang, labels, ...)
	if(axes)
	axis(2, at=pretty(range(height)), ...)
	if (frame.plot)
	box(...)
	if (ann) {
	if(!is.null(cl <- x$call) && is.null(sub))
	sub <- paste0(deparse(cl[[1L]])," (*, \"", x$method,"\")")
	if(is.null(xlab) && !is.null(cl))
	xlab <- deparse(cl[[2L]])
	title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...)
	}
	invisible()
	}



	as.hclust <- function(x, ...) UseMethod("as.hclust")
	## need *.default for idempotency:
	as.hclust.default <- function(x, ...) {
	if(inherits(x, "hclust")) x
	else
	stop(gettextf("argument 'x' cannot be coerced to class %s",
	dQuote("hclust")),
	if(!is.null(oldClass(x)))
	gettextf("\n Consider providing an as.hclust.%s() method",
	oldClass(x)[1L]),
	domain = NA)
	}

	as.hclust.twins <- function(x, ...)
	{
	r <- list(merge = x$merge,
	height = sort(x$height),
	order = x$order,
	labels = if(!is.null(lb <- x$order.lab)) {
	lb[sort.list(x$order)] } else rownames(x$data),# may be NULL
	call = if(!is.null(cl <- x$call)) cl else match.call(),
	method = if(!is.null(mt <- x$method)) mt else NA,
	dist.method = attr(x$diss, "Metric"))
	class(r) <- "hclust"
	r
	}

	print.hclust <- function(x, ...)
	{
	if(!is.null(x$call))
	cat("\nCall:\n", deparse(x$call), "\n\n", sep = "")
	if(!is.null(x$method))
	cat("Cluster method :", x$method, "\n")
	if(!is.null(x$dist.method))
	cat("Distance :", x$dist.method, "\n")
	cat("Number of objects:", length(x$height)+1, "\n")
	cat("\n")
	invisible(x)
	}

	cophenetic <-
	function(x)
	UseMethod("cophenetic")
	cophenetic.default <-
	function(x)
	{
	x <- as.hclust(x)
	nobs <- length(x$order)
	ilist <- vector("list", length = nobs)
	out <- matrix(0, nrow = nobs, ncol = nobs)
	for(i in 1 : (nobs - 1)) {
	inds <- x$merge[i,]
	ids1 <- if(inds[1L] < 0L) -inds[1L] else ilist[[inds[1L]]]
	ids2 <- if(inds[2L] < 0L) -inds[2L] else ilist[[inds[2L]]]
	ilist[[i]] <- c(ids1, ids2)
	out[cbind(rep.int(ids1, rep.int(length(ids2), length(ids1))),
	rep.int(ids2, length(ids1)))] <- x$height[i]
	}
	rownames(out) <- x$labels
	as.dist(out + t(out))
	}
	cophenetic.dendrogram <-
	function(x)
	{
	## Obtain cophenetic distances from a dendrogram by recursively
	## doing the following:
	## * if not a leaf, then for all children call ourselves, create
	## a block diagonal matrix from this, and fill the rest with the
	## current height (as everything in different children is joined
	## at the current split) ...
	## * if a leaf, height and result are 0.
	## Actually, we need to return something of class "dist", so things
	## are a bit more complicated, and we might be able to make this
	## more efficient by avoiding matrices ...
	if(is.leaf(x)) {
	## If there is no label, we cannot recover the (names of the)
	## objects the distances are for, and hence abort.
	if(is.null(label <- attr(x, "label")))
	stop("need dendrograms where all leaves have labels")
	return(as.dist(matrix(0, dimnames = list(label, label))))
	}
	children <- vector("list", length(x))
	for(i in seq_along(x))
	children[[i]] <- Recall(x[[i]])
	lens <- sapply(children, attr, "Size")
	m <- matrix(attr(x, "height"), sum(lens), sum(lens))
	## This seems a bit slower:
	## inds <- split(seq(length.out = sum(lens)),
	## rep.int(seq_along(lens), lens))
	## for(i in seq_along(inds))
	## m[inds[[i]], inds[[i]]] <- as.matrix(children[[i]])
	hi <- cumsum(lens)
	lo <- c(0L, hi[-length(hi)]) + 1L
	for(i in seq_along(x))
	m[lo[i] : hi[i], lo[i] : hi[i]] <- as.matrix(children[[i]])
	rownames(m) <- colnames(m) <- unlist(lapply(children, attr, "Labels"))
	as.dist(m)
	}