R/cluster.R

Defines functions assignCluster

Documented in assignCluster

#' Append a Cluster Membership Variable to a Dataframe
#'
#' @name assignCluster
#'
#' @details
#' Correctly creates a cluster membership variable that can be attached to a dataframe when only a subset of the observations in that dataframe were used to create the clustering solution.
#' NAs are assigned to the observations of the original dataframe not used in creating the clustering solution.
#'
#' This code originally by Dan Putler, used with permission.
#' 
#' @author Dan Putler
#' 
#' @param clusterData The data matrix used in the clustering solution. The data matrix may have have only a subset of the observations contained in the original dataframe.
#' @param origData The original dataframe from which the data used in the clustering solution were taken.
#' @param clusterVec An integer variable containing the cluster membership assignments for the observations used in creating the clustering solution.
#' This vector can be created using \code{cutree} for clustering solutions generated by \code{hclust} or the \code{cluster} component of a list object created by \code{kmeans} or \code{KMeans}.
#'
#' @return A factor (with integer labels) that indicate the cluster assignment for each observation, with an NA value given to observations not used in the clustering solution.
#' 
#' @seealso \code{\link[stats]{hclust}}, \code{\link[stats]{cutree}}, \code{\link[stats]{kmeans}}, \code{\link{KMeans}}.
#' 
#' @examples
#' ## Load USArrests data set
#' data(USArrests)
#' 
#' ## Create three cluster
#' USArrkm3 <- KMeans(USArrests[USArrests$UrbanPop<66, ], centers=3)
#' 
#' ## Create a variable with cluster assignment
#' assignCluster(USArrests[USArrests$UrbanPop<66, ], USArrests, USArrkm3$cluster)
#' 
#' @export
assignCluster <- function(clusterData, origData, clusterVec){
    rowsDX <- row.names(clusterData)
    rowsX <- row.names(origData)
    clustAssign <- rep(NA, length(rowsX))
    validData <- rowsX %in% rowsDX
    clustAssign[validData] <- clusterVec
    return(as.factor(clustAssign))
}

#' K-Means Clustering Using Multiple Random Seeds
#'
#' @name KMeans
#'
#' @keywords misc
#' 
#' @details
#' Finds a number of k-means clusting solutions using R's \code{kmeans} function, and selects as the final solution the one that has the minimum total within-cluster sum of squared distances.
#' @param x A numeric matrix of data, or an object that can be coerced to such a matrix (such as a numeric vector or a dataframe with all numeric columns).
#' @param centers The number of clusters in the solution.
#' @param iter.max The maximum number of iterations allowed.
#' @param num.seeds The number of different starting random seeds to use. Each random seed results in a different k-means solution.
#'
#' @return
#' A list with components:
#' \describe{
#'    \item{cluster}{A vector of integers indicating the cluster to which each point is allocated.}
#'    \item{centers}{A matrix of cluster centres (centroids).}
#'    \item{withinss}{The within-cluster sum of squares for each cluster.}
#'    \item{tot.withinss}{The within-cluster sum of squares summed across clusters.}
#'    \item{betweenss}{The between-cluster sum of squared distances.}
#'    \item{size}{The number of points in each cluster.}
#' }
#'
#' @author Dan Putler
#'
#' @seealso \code{\link[stats]{kmeans}}
#'
#' @examples
#' data(USArrests)
#' KMeans(USArrests, centers=3, iter.max=5, num.seeds=5)
#'
#' @export
KMeans <- function (x, centers, iter.max=10, num.seeds=10) {
    ## fixed 15 Mar 05 by J. Fox
    if(mode(x)=="numeric") x<-data.frame(new.x=x)
    KM <- kmeans(x=x, centers=centers, iter.max=iter.max)
    for(i in 2:num.seeds) {
        newKM <- kmeans(x=x, centers=centers, iter.max=iter.max)
        if(sum(newKM$withinss) < sum(KM$withinss)) {
            KM <- newKM
        }
    }
    KM$tot.withinss <- sum(KM$withinss)
    xmean <- apply(x, 2, mean)
    centers <- rbind(KM$centers, xmean)
    bss1 <- as.matrix(dist(centers)^2)
    KM$betweenss <- sum(as.vector(bss1[nrow(bss1),])*c(KM$size,0))
    return(KM)
}

Try the RcmdrMisc package in your browser

Any scripts or data that you put into this service are public.

RcmdrMisc documentation built on Jan. 8, 2026, 9:06 a.m.