src/library/stats/R/kmeans.R - R - Git at Google

 #  File src/library/stats/R/kmeans.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2017 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 kmeans <-
 function(x, centers, iter.max = 10L, nstart = 1L,
 	 algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),
          trace = FALSE)
 {
     .Mimax <- .Machine$integer.max
     do_one <- function(nmeth) {
         switch(nmeth,
            {                            # 1 : Hartigan-Wong
 	       isteps.Qtran <- as.integer(min(.Mimax, 50 * m))
 	       iTran <- c(isteps.Qtran, integer(k)) # correct for k=0
                Z <- .Fortran(C_kmns, x, m, p,
                              centers = centers,
                              as.integer(k), c1 = integer(m), c2 = integer(m),
                              nc = integer(k), double(k), double(k), ncp=integer(k),
                              D = double(m), iTran = iTran, live = integer(k),
                              iter = iter.max, wss = double(k),
                              ifault = as.integer(trace))
                switch(Z$ifault,
                       ## 1:
                       stop("empty cluster: try a better set of initial centers",
                            call. = FALSE),
                       ## 2:
                           Z$iter <- max(Z$iter, iter.max+1L), # -> and warn below
                       ## 3:
                       stop("number of cluster centres must lie between 1 and nrow(x)",
                            call.=FALSE),
                       ## 4: {new @ 2013-06-30; maybe better fix (in Fortran) ?}
                       warning(gettextf("Quick-TRANSfer stage steps exceeded maximum (= %d)",
                                        isteps.Qtran),
                               call.=FALSE)
                       )
            },
            {                            # 2 : Lloyd-Forgy
                Z <- .C(C_kmeans_Lloyd, x, m, p,
                        centers = centers, k,
                        c1 = integer(m), iter = iter.max,
                        nc = integer(k), wss = double(k))
            },
            {                            # 3 : MacQueen
                Z <- .C(C_kmeans_MacQueen, x, m, p,
                        centers = as.double(centers), k,
                        c1 = integer(m), iter = iter.max,
                        nc = integer(k), wss = double(k))
            })

 	if(m23 <- any(nmeth == c(2L, 3L))) {
 	    if(any(Z$nc == 0))
 		warning("empty cluster: try a better set of initial centers",
 			call. = FALSE)
 	}
 	if(Z$iter > iter.max) {
 	    warning(sprintf(ngettext(iter.max,
 				     "did not converge in %d iteration",
 				     "did not converge in %d iterations"),
 			    iter.max), call. = FALSE, domain = NA)
 	    if(m23) Z$ifault <- 2L
 	}
         if(nmeth %in% c(2L, 3L)) {
             if(any(Z$nc == 0))
                 warning("empty cluster: try a better set of initial centers",
                         call. = FALSE)
         }
 	Z
     }
     x <- as.matrix(x)
     ## as.integer(<too large>) gives NA ==> not allowing too large nrow() / ncol():
     m <- as.integer(nrow(x)); if(is.na(m)) stop("invalid nrow(x)")
     p <- as.integer(ncol(x)); if(is.na(p)) stop("invalid ncol(x)")
     if(missing(centers))
 	stop("'centers' must be a number or a matrix")
     nmeth <- switch(match.arg(algorithm),
                     "Hartigan-Wong" = 1L,
                     "Lloyd" = 2L, "Forgy" = 2L,
                     "MacQueen" = 3L)
     storage.mode(x) <- "double"
     if(length(centers) == 1L) {
 	k <- centers
         ## we need to avoid duplicates here
         if(nstart == 1L)
             centers <- x[sample.int(m, k), , drop = FALSE]
         if(nstart >= 2L || any(duplicated(centers))) {
             cn <- unique(x)
             mm <- nrow(cn)
             if(mm < k)
                 stop("more cluster centers than distinct data points.")
             centers <- cn[sample.int(mm, k), , drop=FALSE]
         }
     } else {
 	centers <- as.matrix(centers)
         if(any(duplicated(centers)))
             stop("initial centers are not distinct")
         cn <- NULL
 	k <- nrow(centers)
         if(m < k)
             stop("more cluster centers than data points")
     }
     k <- as.integer(k)
     if(is.na(k)) stop(gettextf("invalid value of %s", "'k'"), domain = NA)
     if (k == 1L) nmeth <- 3L # Hartigan-Wong, (Fortran) needs k > 1
     iter.max <- as.integer(iter.max)
     if(is.na(iter.max) || iter.max < 1L) stop("'iter.max' must be positive")
     if(ncol(x) != ncol(centers))
 	stop("must have same number of columns in 'x' and 'centers'")
     storage.mode(centers) <- "double"
     Z <- do_one(nmeth)
     best <- sum(Z$wss)
     if(nstart >= 2L && !is.null(cn))
 	for(i in 2:nstart) {
 	    centers <- cn[sample.int(mm, k), , drop=FALSE]
 	    ZZ <- do_one(nmeth)
 	    if((z <- sum(ZZ$wss)) < best) {
 		Z <- ZZ
 		best <- z
 	    }
 	}
     centers <- matrix(Z$centers, k)
     dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
     cluster <- Z$c1
     if(!is.null(rn <- rownames(x)))
         names(cluster) <- rn
     totss <- sum(scale(x, scale = FALSE)^2)
     structure(list(cluster = cluster, centers = centers, totss = totss,
 		   withinss = Z$wss, tot.withinss = best,
 		   betweenss = totss - best, size = Z$nc,
 		   iter = Z$iter, ifault = Z$ifault),
 	      class = "kmeans")
 }

 ## modelled on print methods in the cluster package
 print.kmeans <- function(x, ...)
 {
     cat("K-means clustering with ", length(x$size), " clusters of sizes ",
         paste(x$size, collapse = ", "), "\n", sep = "")
     cat("\nCluster means:\n")
     print(x$centers, ...)
     cat("\nClustering vector:\n")
     print(x$cluster, ...)
     cat("\nWithin cluster sum of squares by cluster:\n")
     print(x$withinss, ...)
     ratio <- sprintf(" (between_SS / total_SS = %5.1f %%)\n",
                      100 * x$betweenss/x$totss)
     cat(sub(".", getOption("OutDec"), ratio, fixed = TRUE),
 	"Available components:\n", sep = "\n")
     print(names(x))
     if(!is.null(x$ifault) && x$ifault == 2L)
 	cat("Warning: did *not* converge in specified number of iterations\n")
     invisible(x)
 }

 fitted.kmeans <- function(object, method = c("centers", "classes"), ...)
 {
 	method <- match.arg(method)
 	if (method == "centers") object$centers[object$cl, , drop = FALSE]
 	else object$cl
 }
	# File src/library/stats/R/kmeans.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2017 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	kmeans <-
	function(x, centers, iter.max = 10L, nstart = 1L,
	algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),
	trace = FALSE)
	{
	.Mimax <- .Machine$integer.max
	do_one <- function(nmeth) {
	switch(nmeth,
	{ # 1 : Hartigan-Wong
	isteps.Qtran <- as.integer(min(.Mimax, 50 * m))
	iTran <- c(isteps.Qtran, integer(k)) # correct for k=0
	Z <- .Fortran(C_kmns, x, m, p,
	centers = centers,
	as.integer(k), c1 = integer(m), c2 = integer(m),
	nc = integer(k), double(k), double(k), ncp=integer(k),
	D = double(m), iTran = iTran, live = integer(k),
	iter = iter.max, wss = double(k),
	ifault = as.integer(trace))
	switch(Z$ifault,
	## 1:
	stop("empty cluster: try a better set of initial centers",
	call. = FALSE),
	## 2:
	Z$iter <- max(Z$iter, iter.max+1L), # -> and warn below
	## 3:
	stop("number of cluster centres must lie between 1 and nrow(x)",
	call.=FALSE),
	## 4: {new @ 2013-06-30; maybe better fix (in Fortran) ?}
	warning(gettextf("Quick-TRANSfer stage steps exceeded maximum (= %d)",
	isteps.Qtran),
	call.=FALSE)
	)
	},
	{ # 2 : Lloyd-Forgy
	Z <- .C(C_kmeans_Lloyd, x, m, p,
	centers = centers, k,
	c1 = integer(m), iter = iter.max,
	nc = integer(k), wss = double(k))
	},
	{ # 3 : MacQueen
	Z <- .C(C_kmeans_MacQueen, x, m, p,
	centers = as.double(centers), k,
	c1 = integer(m), iter = iter.max,
	nc = integer(k), wss = double(k))
	})

	if(m23 <- any(nmeth == c(2L, 3L))) {
	if(any(Z$nc == 0))
	warning("empty cluster: try a better set of initial centers",
	call. = FALSE)
	}
	if(Z$iter > iter.max) {
	warning(sprintf(ngettext(iter.max,
	"did not converge in %d iteration",
	"did not converge in %d iterations"),
	iter.max), call. = FALSE, domain = NA)
	if(m23) Z$ifault <- 2L
	}
	if(nmeth %in% c(2L, 3L)) {
	if(any(Z$nc == 0))
	warning("empty cluster: try a better set of initial centers",
	call. = FALSE)
	}
	Z
	}
	x <- as.matrix(x)
	## as.integer(<too large>) gives NA ==> not allowing too large nrow() / ncol():
	m <- as.integer(nrow(x)); if(is.na(m)) stop("invalid nrow(x)")
	p <- as.integer(ncol(x)); if(is.na(p)) stop("invalid ncol(x)")
	if(missing(centers))
	stop("'centers' must be a number or a matrix")
	nmeth <- switch(match.arg(algorithm),
	"Hartigan-Wong" = 1L,
	"Lloyd" = 2L, "Forgy" = 2L,
	"MacQueen" = 3L)
	storage.mode(x) <- "double"
	if(length(centers) == 1L) {
	k <- centers
	## we need to avoid duplicates here
	if(nstart == 1L)
	centers <- x[sample.int(m, k), , drop = FALSE]
	if(nstart >= 2L \|\| any(duplicated(centers))) {
	cn <- unique(x)
	mm <- nrow(cn)
	if(mm < k)
	stop("more cluster centers than distinct data points.")
	centers <- cn[sample.int(mm, k), , drop=FALSE]
	}
	} else {
	centers <- as.matrix(centers)
	if(any(duplicated(centers)))
	stop("initial centers are not distinct")
	cn <- NULL
	k <- nrow(centers)
	if(m < k)
	stop("more cluster centers than data points")
	}
	k <- as.integer(k)
	if(is.na(k)) stop(gettextf("invalid value of %s", "'k'"), domain = NA)
	if (k == 1L) nmeth <- 3L # Hartigan-Wong, (Fortran) needs k > 1
	iter.max <- as.integer(iter.max)
	if(is.na(iter.max) \|\| iter.max < 1L) stop("'iter.max' must be positive")
	if(ncol(x) != ncol(centers))
	stop("must have same number of columns in 'x' and 'centers'")
	storage.mode(centers) <- "double"
	Z <- do_one(nmeth)
	best <- sum(Z$wss)
	if(nstart >= 2L && !is.null(cn))
	for(i in 2:nstart) {
	centers <- cn[sample.int(mm, k), , drop=FALSE]
	ZZ <- do_one(nmeth)
	if((z <- sum(ZZ$wss)) < best) {
	Z <- ZZ
	best <- z
	}
	}
	centers <- matrix(Z$centers, k)
	dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
	cluster <- Z$c1
	if(!is.null(rn <- rownames(x)))
	names(cluster) <- rn
	totss <- sum(scale(x, scale = FALSE)^2)
	structure(list(cluster = cluster, centers = centers, totss = totss,
	withinss = Z$wss, tot.withinss = best,
	betweenss = totss - best, size = Z$nc,
	iter = Z$iter, ifault = Z$ifault),
	class = "kmeans")
	}

	## modelled on print methods in the cluster package
	print.kmeans <- function(x, ...)
	{
	cat("K-means clustering with ", length(x$size), " clusters of sizes ",
	paste(x$size, collapse = ", "), "\n", sep = "")
	cat("\nCluster means:\n")
	print(x$centers, ...)
	cat("\nClustering vector:\n")
	print(x$cluster, ...)
	cat("\nWithin cluster sum of squares by cluster:\n")
	print(x$withinss, ...)
	ratio <- sprintf(" (between_SS / total_SS = %5.1f %%)\n",
	100 * x$betweenss/x$totss)
	cat(sub(".", getOption("OutDec"), ratio, fixed = TRUE),
	"Available components:\n", sep = "\n")
	print(names(x))
	if(!is.null(x$ifault) && x$ifault == 2L)
	cat("Warning: did not converge in specified number of iterations\n")
	invisible(x)
	}

	fitted.kmeans <- function(object, method = c("centers", "classes"), ...)
	{
	method <- match.arg(method)
	if (method == "centers") object$centers[object$cl, , drop = FALSE]
	else object$cl
	}