PageRenderTime 50ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/R-2.15.1/src/library/stats/R/kmeans.R

#
R | 153 lines | 132 code | 4 blank | 17 comment | 34 complexity | e8f152b8071555dc149823f738564b8b MD5 | raw file
Possible License(s): LGPL-2.1, LGPL-3.0, CC-BY-SA-4.0, BSD-3-Clause, AGPL-3.0, GPL-2.0, GPL-3.0, LGPL-2.0
  1. # File src/library/stats/R/kmeans.R
  2. # Part of the R package, http://www.R-project.org
  3. #
  4. # This program is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation; either version 2 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # A copy of the GNU General Public License is available at
  15. # http://www.r-project.org/Licenses/
  16. kmeans <-
  17. function(x, centers, iter.max = 10, nstart = 1,
  18. algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))
  19. {
  20. do_one <- function(nmeth) {
  21. Z <-
  22. switch(nmeth,
  23. { # 1
  24. Z <- .Fortran(C_kmns, x, m, p,
  25. centers = centers,
  26. as.integer(k), c1 = integer(m), integer(m),
  27. nc = integer(k), double(k), double(k), integer(k),
  28. double(m), integer(k), integer(k),
  29. as.integer(iter.max), wss = double(k),
  30. ifault = 0L)
  31. switch(Z$ifault,
  32. stop("empty cluster: try a better set of initial centers",
  33. call.=FALSE),
  34. warning(gettextf("did not converge in %d iterations",
  35. iter.max), call.=FALSE, domain =NA),
  36. stop("number of cluster centres must lie between 1 and nrow(x)",
  37. call.=FALSE)
  38. )
  39. Z
  40. },
  41. { # 2
  42. Z <- .C(C_kmeans_Lloyd, x, m, p,
  43. centers = centers, as.integer(k),
  44. c1 = integer(m), iter = as.integer(iter.max),
  45. nc = integer(k), wss = double(k))
  46. if(Z$iter > iter.max)
  47. warning("did not converge in ",
  48. iter.max, " iterations", call.=FALSE)
  49. if(any(Z$nc == 0))
  50. warning("empty cluster: try a better set of initial centers", call.=FALSE)
  51. Z
  52. },
  53. { # 3
  54. Z <- .C(C_kmeans_MacQueen, x, m, p,
  55. centers = as.double(centers), as.integer(k),
  56. c1 = integer(m), iter = as.integer(iter.max),
  57. nc = integer(k), wss = double(k))
  58. if(Z$iter > iter.max)
  59. warning("did not converge in ",
  60. iter.max, " iterations", call.=FALSE)
  61. if(any(Z$nc == 0))
  62. warning("empty cluster: try a better set of initial centers", call.=FALSE)
  63. Z
  64. })
  65. Z
  66. }
  67. x <- as.matrix(x)
  68. m <- as.integer(nrow(x))
  69. if(is.na(m)) stop("invalid nrow(x)")
  70. p <- as.integer(ncol(x))
  71. if(is.na(p)) stop("invalid ncol(x)")
  72. if(missing(centers))
  73. stop("'centers' must be a number or a matrix")
  74. nmeth <- switch(match.arg(algorithm),
  75. "Hartigan-Wong" = 1,
  76. "Lloyd" = 2, "Forgy" = 2,
  77. "MacQueen" = 3)
  78. if(length(centers) == 1L) {
  79. if (centers == 1) nmeth <- 3
  80. k <- centers
  81. ## we need to avoid duplicates here
  82. if(nstart == 1)
  83. centers <- x[sample.int(m, k), , drop = FALSE]
  84. if(nstart >= 2 || any(duplicated(centers))) {
  85. cn <- unique(x)
  86. mm <- nrow(cn)
  87. if(mm < k)
  88. stop("more cluster centers than distinct data points.")
  89. centers <- cn[sample.int(mm, k), , drop=FALSE]
  90. }
  91. } else {
  92. centers <- as.matrix(centers)
  93. if(any(duplicated(centers)))
  94. stop("initial centers are not distinct")
  95. cn <- NULL
  96. k <- nrow(centers)
  97. if(m < k)
  98. stop("more cluster centers than data points")
  99. }
  100. if(iter.max < 1) stop("'iter.max' must be positive")
  101. if(ncol(x) != ncol(centers))
  102. stop("must have same number of columns in 'x' and 'centers'")
  103. if(!is.double(x)) storage.mode(x) <- "double"
  104. if(!is.double(centers)) storage.mode(centers) <- "double"
  105. Z <- do_one(nmeth)
  106. best <- sum(Z$wss)
  107. if(nstart >= 2 && !is.null(cn))
  108. for(i in 2:nstart) {
  109. centers <- cn[sample.int(mm, k), , drop=FALSE]
  110. ZZ <- do_one(nmeth)
  111. if((z <- sum(ZZ$wss)) < best) {
  112. Z <- ZZ
  113. best <- z
  114. }
  115. }
  116. centers <- matrix(Z$centers, k)
  117. dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
  118. cluster <- Z$c1
  119. if(!is.null(rn <- rownames(x)))
  120. names(cluster) <- rn
  121. totss <- sum(scale(x, scale = FALSE)^2)
  122. structure(list(cluster = cluster, centers = centers, totss = totss,
  123. withinss = Z$wss, tot.withinss = best,
  124. betweenss = totss - best, size = Z$nc),
  125. class = "kmeans")
  126. }
  127. ## modelled on print methods in the cluster package
  128. print.kmeans <- function(x, ...)
  129. {
  130. cat("K-means clustering with ", length(x$size), " clusters of sizes ",
  131. paste(x$size, collapse=", "), "\n", sep="")
  132. cat("\nCluster means:\n")
  133. print(x$centers, ...)
  134. cat("\nClustering vector:\n")
  135. print(x$cluster, ...)
  136. cat("\nWithin cluster sum of squares by cluster:\n")
  137. print(x$withinss, ...)
  138. cat(sprintf(" (between_SS / total_SS = %5.1f %%)\n",
  139. 100 * x$betweenss/x$totss),
  140. "Available components:\n", sep="\n")
  141. print(names(x))
  142. invisible(x)
  143. }
  144. fitted.kmeans <- function(object, method = c("centers", "classes"), ...)
  145. {
  146. method <- match.arg(method)
  147. if (method == "centers") object$centers[object$cl, , drop=FALSE]
  148. else object$cl
  149. }