| # File src/library/stats/R/hclust.R |
| # Part of the R package, https://www.R-project.org |
| # |
| # Copyright (C) 1995-2019 The R Core Team |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 2 of the License, or |
| # (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # A copy of the GNU General Public License is available at |
| # https://www.R-project.org/Licenses/ |
| |
| ## Hierarchical clustering, on raw input data; we will use Euclidean |
| ## distance. A range of criteria are supported; also there is a |
| ## storage-economic option. |
| ## |
| ## We use the general routine, `hc', which caters for 7 criteria, |
| ## using a half dissimilarity matrix; (BTW, this uses the very efficient |
| ## nearest neighbor chain algorithm, which makes this algorithm of |
| ## O(n^2) computational time, and differentiates it from the less |
| ## efficient -- i.e. O(n^3) -- implementations in all commercial |
| ## statistical packages -- as far as I am aware -- except Clustan.) |
| ## |
| ## Clustering Methods: |
| ## |
| ## 1. Ward's minimum variance or error sum of squares method (using raw d) -> "ward.D" |
| ## 2. single linkage or nearest neighbor method. |
| ## 3. complete linkage or diameter. |
| ## 4. average linkage, group average, or UPGMA method. |
| ## 5. McQuitty's or WPGMA method. |
| ## 6. median, Gower's or WPGMC method. |
| ## 7. centroid or UPGMC method (7). |
| ## 8. Ward's ... "correct" method using d^2 (in Fortran) -> "ward.D2" |
| ## |
| ## Original author: F. Murtagh, May 1992 |
| ## R Modifications: Ross Ihaka, Dec 1996 |
| ## Friedrich Leisch, Apr 1998, Jun 2000 |
| ## "ward.D" and "ward.D2" from suggestions by Pierre Legendre, |
| ## by Martin Maechler, mostly in the Fortran part. |
| |
| hclust <- function(d, method="complete", members=NULL) |
| { |
| ## order of METHODS --> i.meth -> Fortran's iOpt codes |
| METHODS <- c("ward.D", "single", # 1, 2, |
| "complete", "average", "mcquitty", # 3, 4, 5, |
| "median", "centroid", "ward.D2") # 6, 7, 8 |
| if(method == "ward") { # do not deprecate earlier than 2015! |
| message("The \"ward\" method has been renamed to \"ward.D\"; note new \"ward.D2\"") |
| method <- "ward.D" |
| } |
| i.meth <- pmatch(method, METHODS) |
| if(is.na(i.meth)) |
| ## TODO: use gettextf() [-> translation string change] |
| stop("invalid clustering method", paste("", method)) |
| if(i.meth == -1) |
| stop("ambiguous clustering method", paste("", method)) |
| |
| n <- as.integer(attr(d, "Size")) |
| if(is.null(n)) |
| stop("invalid dissimilarities") |
| if(is.na(n) || n > 65536L) stop("size cannot be NA nor exceed 65536") |
| if(n < 2) |
| stop("must have n >= 2 objects to cluster") |
| len <- as.integer(n*(n-1)/2) |
| if(length(d) != len) |
| (if (length(d) < len) stop else warning |
| )("dissimilarities of improper length") |
| |
| if(is.null(members)) |
| members <- rep(1, n) |
| else if(length(members) != n) |
| stop("invalid length of members") |
| |
| storage.mode(d) <- "double" |
| hcl <- .Fortran(C_hclust, |
| n = n, |
| len = len, |
| method = as.integer(i.meth), |
| ia = integer(n), |
| ib = integer(n), |
| crit = double(n), |
| members = as.double(members), |
| nn = integer(n), |
| disnn = double(n), |
| # flag = logical(n), # unused |
| diss = d) |
| |
| ## 2nd step: interpret the information that we now have |
| ## as merge, height, and order lists. |
| |
| hcass <- .Fortran(C_hcass2, |
| n = n, # checked above. |
| ia = hcl$ia, |
| ib = hcl$ib, |
| order = integer(n), |
| iia = integer(n), |
| iib = integer(n)) |
| |
| structure(list(merge = cbind(hcass$iia[1L:(n-1)], hcass$iib[1L:(n-1)]), |
| height = hcl$crit[1L:(n-1)], |
| order = hcass$order, |
| labels = attr(d, "Labels"), |
| method = METHODS[i.meth], |
| call = match.call(), |
| dist.method = attr(d, "method")), |
| class = "hclust") |
| } |
| |
| ##' @title Check hclust() object for validity |
| ##' @param x "hclust" object |
| ##' @param merge (= x$merge, passing it may save memory) |
| ##' @param order logical indicating if 'x$order' should be checked, too |
| ##' @return character vector with message or TRUE |
| ##' @author Martin Maechler |
| .validity.hclust <- function(x, merge = x$merge, order = TRUE) { |
| if (!is.matrix(merge) || ncol(merge) != 2) |
| return("invalid dendrogram") |
| ## merge should be integer but might not be after dump/restore. |
| if (any(as.integer(merge) != merge)) |
| return("'merge' component in dendrogram must be integer") |
| n1 <- nrow(merge) # == #{obs} - 1 |
| n <- n1+1L |
| if(length(x$height) != n1) return("'height' is of wrong length") |
| if(order && length(x$order ) != n ) return("'order' is of wrong length") |
| if(identical(sort(as.integer(merge)), c(-(n:1L), +seq_len(n-2L)))) |
| TRUE |
| else |
| "'merge' matrix has invalid contents" |
| } |
| |
| plot.hclust <- |
| function (x, labels = NULL, hang = 0.1, check = TRUE, |
| axes = TRUE, frame.plot = FALSE, ann = TRUE, |
| main = "Cluster Dendrogram", |
| sub = NULL, xlab = NULL, ylab = "Height", ...) |
| { |
| merge <- x$merge |
| if(check && !isTRUE(msg <- .validity.hclust(x,merge))) |
| stop(msg) |
| storage.mode(merge) <- "integer" |
| n1 <- nrow(merge) # == #{obs} - 1 |
| n <- n1+1L |
| height <- as.double(x$height) |
| labels <- |
| if(missing(labels) || is.null(labels)) { |
| as.character(if(is.null(x$labels)) seq_len(n) else x$labels) |
| } else { |
| if(is.logical(labels) && !labels)# FALSE |
| character(n) |
| else |
| as.character(labels) |
| } |
| |
| dev.hold(); on.exit(dev.flush()) |
| plot.new() |
| graphics:::plotHclust(n1, merge, height, order(x$order), hang, labels, ...) |
| if(axes) |
| axis(2, at=pretty(range(height)), ...) |
| if (frame.plot) |
| box(...) |
| if (ann) { |
| if(!is.null(cl <- x$call) && is.null(sub)) |
| sub <- paste0(deparse(cl[[1L]])," (*, \"", x$method,"\")") |
| if(is.null(xlab) && !is.null(cl)) |
| xlab <- deparse(cl[[2L]]) |
| title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...) |
| } |
| invisible() |
| } |
| |
| |
| |
| as.hclust <- function(x, ...) UseMethod("as.hclust") |
| ## need *.default for idempotency: |
| as.hclust.default <- function(x, ...) { |
| if(inherits(x, "hclust")) x |
| else |
| stop(gettextf("argument 'x' cannot be coerced to class %s", |
| dQuote("hclust")), |
| if(!is.null(oldClass(x))) |
| gettextf("\n Consider providing an as.hclust.%s() method", |
| oldClass(x)[1L]), |
| domain = NA) |
| } |
| |
| as.hclust.twins <- function(x, ...) |
| { |
| r <- list(merge = x$merge, |
| height = sort(x$height), |
| order = x$order, |
| labels = if(!is.null(lb <- x$order.lab)) { |
| lb[sort.list(x$order)] } else rownames(x$data),# may be NULL |
| call = if(!is.null(cl <- x$call)) cl else match.call(), |
| method = if(!is.null(mt <- x$method)) mt else NA, |
| dist.method = attr(x$diss, "Metric")) |
| class(r) <- "hclust" |
| r |
| } |
| |
| print.hclust <- function(x, ...) |
| { |
| if(!is.null(x$call)) |
| cat("\nCall:\n", deparse(x$call), "\n\n", sep = "") |
| if(!is.null(x$method)) |
| cat("Cluster method :", x$method, "\n") |
| if(!is.null(x$dist.method)) |
| cat("Distance :", x$dist.method, "\n") |
| cat("Number of objects:", length(x$height)+1, "\n") |
| cat("\n") |
| invisible(x) |
| } |
| |
| cophenetic <- |
| function(x) |
| UseMethod("cophenetic") |
| cophenetic.default <- |
| function(x) |
| { |
| x <- as.hclust(x) |
| nobs <- length(x$order) |
| ilist <- vector("list", length = nobs) |
| out <- matrix(0, nrow = nobs, ncol = nobs) |
| for(i in 1 : (nobs - 1)) { |
| inds <- x$merge[i,] |
| ids1 <- if(inds[1L] < 0L) -inds[1L] else ilist[[inds[1L]]] |
| ids2 <- if(inds[2L] < 0L) -inds[2L] else ilist[[inds[2L]]] |
| ilist[[i]] <- c(ids1, ids2) |
| out[cbind(rep.int(ids1, rep.int(length(ids2), length(ids1))), |
| rep.int(ids2, length(ids1)))] <- x$height[i] |
| } |
| rownames(out) <- x$labels |
| as.dist(out + t(out)) |
| } |
| cophenetic.dendrogram <- |
| function(x) |
| { |
| ## Obtain cophenetic distances from a dendrogram by recursively |
| ## doing the following: |
| ## * if not a leaf, then for all children call ourselves, create |
| ## a block diagonal matrix from this, and fill the rest with the |
| ## current height (as everything in different children is joined |
| ## at the current split) ... |
| ## * if a leaf, height and result are 0. |
| ## Actually, we need to return something of class "dist", so things |
| ## are a bit more complicated, and we might be able to make this |
| ## more efficient by avoiding matrices ... |
| if(is.leaf(x)) { |
| ## If there is no label, we cannot recover the (names of the) |
| ## objects the distances are for, and hence abort. |
| if(is.null(label <- attr(x, "label"))) |
| stop("need dendrograms where all leaves have labels") |
| return(as.dist(matrix(0, dimnames = list(label, label)))) |
| } |
| children <- vector("list", length(x)) |
| for(i in seq_along(x)) |
| children[[i]] <- Recall(x[[i]]) |
| lens <- sapply(children, attr, "Size") |
| m <- matrix(attr(x, "height"), sum(lens), sum(lens)) |
| ## This seems a bit slower: |
| ## inds <- split(seq(length.out = sum(lens)), |
| ## rep.int(seq_along(lens), lens)) |
| ## for(i in seq_along(inds)) |
| ## m[inds[[i]], inds[[i]]] <- as.matrix(children[[i]]) |
| hi <- cumsum(lens) |
| lo <- c(0L, hi[-length(hi)]) + 1L |
| for(i in seq_along(x)) |
| m[lo[i] : hi[i], lo[i] : hi[i]] <- as.matrix(children[[i]]) |
| rownames(m) <- colnames(m) <- unlist(lapply(children, attr, "Labels")) |
| as.dist(m) |
| } |