Nothing
#' K-means clustering with build-in missing data imputation
#'
#' Clustering algorithm that produces a missing value imputation using on the go. The (local) imputation distribution is defined by the currently assigned cluster. The first draw is by random imputation.
#'
#' @param X Data frame with only numeric values or NAs
#' @param nr_cluster Number of clusters
#' @param nr_iter Iterations of procedure
#' @param c_steps Number of clustering steps per iteration
#' @param wf Weight function. Linear up to n_end by default. Used to shrink X towards zero or the global mean (default). See shrink_towards_global_mean
#' @param n_end Steps until convergence of weight function to 1
#' @param seed_nr Number for set.seed()
#' @param assign_with_wf Default is TRUE. If set to False, then the weight function is only applied in the centroid computation, but ignored in the cluster assignment.
#' @param shrink_towards_global_mean By default TRUE. The weight matrix w is applied on the difference of X from the global mean m, i.e, (x-m)*w+m
#'
#' @return
#' \describe{
#' \item{complete_data}{Completed data without NAs}
#' \item{clusters}{For each row of complete_data, the associated cluster}
#' \item{centroids}{For each cluster, the coordinates of the centroids in tidy format}
#' \item{centroids_matrix}{For each cluster, the coordinates of the centroids in matrix format}
#' \item{imp_values_mean}{Mean of the imputed variables per draw}
#' \item{imp_values_sd}{Standard deviation of the imputed variables per draw}
#' }
#'
#' @examples
#'# Random Dataset
#'set.seed(739)
#'n <- 750 # numer of points
#'nr_other_vars <- 2
#'mat <- matrix(rnorm(nr_other_vars*n),n,nr_other_vars)
#'me<-4 # mean
#'x <- c(rnorm(n/3,me/2,1),rnorm(2*n/3,-me/2,1))
#'y <- c(rnorm(n/3,0,1),rnorm(n/3,me,1),rnorm(n/3,-me,1))
#'dat <- cbind(mat,x,y)
#'dat<- as.data.frame(scale(dat)) # scaling
#'
#'# Create NAs
#'dat_with_miss <- miss_sim(dat,p=.1,seed_nr=120)
#'
#'# Run ClustImpute
#'res <- ClustImpute(dat_with_miss,nr_cluster=3)
#'
#'# Plot complete data set and cluster assignment
#'ggplot2::ggplot(res$complete_data,ggplot2::aes(x,y,color=factor(res$clusters))) +
#'ggplot2::geom_point()
#'
#'# View centroids
#'res$centroids
#'
#' @export
ClustImpute <- function(X,nr_cluster, nr_iter=10, c_steps=1, wf=default_wf, n_end=10, seed_nr=150519, assign_with_wf = TRUE, shrink_towards_global_mean = TRUE) {
mis_ind <- is.na(X) # missing indicator
if (all(is.na.data.frame(X))) stop("All values of X are NA")
# random imputation
for (j in 1:dim(X)[2]) { # variable j
idx_sample_from <- which(mis_ind[,j]==FALSE) # sample from not missing
sample_len <- dim(X[mis_ind[,j]==TRUE, ])[1] # nr missings = nr of required draws
set.seed(seed_nr+j)
sampled_idx <- sample(idx_sample_from,size=sample_len,replace=TRUE)
X[mis_ind[,j]==TRUE,j] <- X[sampled_idx,j] # overwrite old with new draws
}
# save statistics for imputed values
org_is_false <- mis_ind
org_is_false[mis_ind==FALSE] <- NA # imputed is TRUE, original is NA
mean_imp <- sapply(X*org_is_false,mean,na.rm=TRUE)
sd_imp <- sapply(X*org_is_false,stats::sd,na.rm=TRUE)
for (n in 1:nr_iter) {
# mean_by_feature <- X %>% tidyr::pivot_longer(cols=tidyselect::everything(), names_to = "Feature", values_to = "Value") %>%
# dplyr::group_by(.data$Feature) %>% dplyr::summarise(Value=mean(.data$Value)) # compute overall mean by feature
mean_by_feature <- colMeans(X)
# Use weights to adjust the scale of a variable specifically for each observation and to regularize towards the overall variable mean
args_wf <- list(n,n_end)
weight_matrix <- 1 - mis_ind * do.call(wf, args_wf)
if (shrink_towards_global_mean) {
global_mean <- matrix(rep(mean_by_feature,times=dim(X)[1]),ncol=length(mean_by_feature),byrow=TRUE)
# use weight w to shrink towards global mean m of the feature: (x-m)*w+m
X_down_weighted <- (X-global_mean) * weight_matrix + global_mean
} else {
if(n==1) {
# probe if data is centered by computing the max
is_centered <- max(abs(mean_by_feature))<1e-1
if (!is_centered) {
warning("For non-centered data we recommend the option shrink_towards_global_mean=TRUE")
message("The mean by feature after random imputation is given by:")
message(knitr::kable(mean_by_feature))
}
}
X_down_weighted <- X * weight_matrix # shrink towards zero
}
# perform clustering
if (n==1) {
cl_new <- ClusterR::KMeans_arma(data=X_down_weighted,clusters=nr_cluster,n_iter=c_steps,seed=seed_nr+n)
} else {
# starting with existing clusters
cl_old <- cl_new
class(cl_old) <- "matrix"
cl_new <- ClusterR::KMeans_arma(data=X_down_weighted,clusters=nr_cluster,n_iter=c_steps,seed=seed_nr+n,
CENTROIDS=cl_old, seed_mode="keep_existing")
}
# check new centroids for duplicate rows and replace with random draws if necessary
cl_new <- check_replace_dups(centroids = cl_new, X = X_down_weighted, seed = seed_nr)
# predict cluster
if (assign_with_wf==TRUE) {
pred <- ClusterR::predict_KMeans(X_down_weighted,cl_new) # use X without weight for assignment
class(pred) <- "integer"
} else { # option to assign clusters ignoring the weight
pred <- ClusterR::predict_KMeans(X,cl_new) # use X without weight for assignment
class(pred) <- "integer"
}
# draw missing values from current cluster: random imputation
for (i in 1:max(pred)) { # cluster i
for (j in 1:dim(X)[2]) { # variable j
idx_sample_from <- which(pred==i & mis_ind[,j]==FALSE) # sample from not missing
if (length(idx_sample_from)==0) { # take donors from all clusters if there are none in the current cluster
idx_sample_from <- which(mis_ind[,j]==FALSE) # sample from not missing
}
sample_len <- dim(X[pred==i & mis_ind[,j]==TRUE, ])[1] # nr missings = nr of required draws
if (sample_len>0) {
set.seed(seed_nr+n+i+j)
sampled_idx <- sample(idx_sample_from,size=sample_len,replace=TRUE)
X[pred==i & mis_ind[,j]==TRUE,j] <- X[sampled_idx,j] # overwrite old with new draws
}
}
}
# compute mean and variance of imputed values for trace plot
# store in matrix
mean_imp <- rbind(mean_imp,sapply(X*org_is_false,mean,na.rm=TRUE))
sd_imp <- rbind(sd_imp,sapply(X*org_is_false,stats::sd,na.rm=TRUE))
} # loop end: nr_iter
# add index for imputation iteration
mean_imp <- cbind(mean_imp,iter=0:nr_iter)
sd_imp <- cbind(sd_imp,iter=0:nr_iter)
# perform clustering one last time
cl_old <- cl_new
class(cl_old) <- "matrix"
cl_new <- ClusterR::KMeans_arma(data=X,clusters=nr_cluster,n_iter=c_steps,seed=seed_nr+n,
CENTROIDS=cl_old, seed_mode="keep_existing")
# check new centroids for duplicate rows and replace with random draws if necessary
cl_new <- check_replace_dups(centroids = cl_new, X = X_down_weighted, seed = seed_nr)
# predict cluster one last time on final draws
pred <- ClusterR::predict_KMeans(X,cl_new)
class(pred) <- "integer"
# convert centroids into nice format
class(cl_new) <- "matrix"
centroids <- as.data.frame(cl_new)
names(centroids) <- names(X)
centroids$Cluster <- 1:dim(centroids)[1]
# return all relevant results
# parameters are provided as attributes
res <- structure(list(complete_data=X,clusters=pred,centroids=centroids,centroids_matrix=cl_new,imp_values_mean=mean_imp,imp_values_sd=sd_imp),
class="kmeans_ClustImpute",nr_iter=nr_iter, c_steps=c_steps, wf=wf, n_end=n_end, seed_nr=seed_nr)
return (res)
}
#' Check and replace duplicate (centroid) rows
#'
#' Internal function of ClustImpute: check new centroids for duplicate rows and replace with random draws in this case.
#'
#' @param centroids Matrix of centroids
#' @param X Underlying data matrix (without missings)
#' @param seed Seed used for random sampling
#'
#' @return Returns centroids where duplicate rows are replaced by random draws
#'
check_replace_dups <- function(centroids, X, seed) {
# check if new centroids contain duplicate rows (min checks that indeed all values in a row are duplicates)
dupcliate_rows <- apply(matrix(duplicated(centroids),nrow=dim(centroids)[1],),1,min)
# If so, replace them by randomly drawn centroids
nr_dups <- sum(dupcliate_rows)
while (nr_dups > 0)
{
# keep unique centroids
centroids <- centroids[!dupcliate_rows,]
# get support for each column and draw uniformly from there
sup_min <- apply(X,1,min)
sup_max <- apply(X,1,max)
set.seed(seed+1)
new_centroids <- matrix(stats::runif(nr_dups,sup_min,sup_max),nr_dups,1)
for (col in 2:dim(X)[2]) {
sup_min <- apply(X,col,min)
sup_max <- apply(X,col,max)
set.seed(seed+col)
new_centroids <- cbind(new_centroids, matrix(stats::runif(nr_dups,sup_min,sup_max),nr_dups,1))
}
centroids <- rbind(centroids,new_centroids) # add new to existing centroids
dupcliate_rows <- apply(matrix(duplicated(centroids),nrow=dim(centroids)[1],),1,min)
nr_dups <- sum(dupcliate_rows)
}
return (centroids)
}
#' K-means clustering with build-in missing data imputation
#'
#' Default weight function. One minus the return value is multiplied with missing(=imputed) values.
#' It starts with 1 and goes to 0 at n_end.
#'
#' @param n current step
#' @param n_end steps until convergence of weight function to 0
#' @return value between 0 and 1
#' @examples
#' x <- 0:20
#' plot(x,1-default_wf(x))
#'
#' @export
default_wf <- function(n,n_end=10) {
y <- 1-pmin(n/n_end,1)
return(y)
}
#' Prediction method
#'
#' @param object Object of class kmeans_ClustImpute
#' @param newdata Data frame
#' @param ... additional arguments affecting the predictions produced - not currently used
#' @return integer value (cluster assignment)
#' @examples
#'# Random Dataset
#'set.seed(739)
#'n <- 750 # numer of points
#'nr_other_vars <- 2
#'mat <- matrix(rnorm(nr_other_vars*n),n,nr_other_vars)
#'me<-4 # mean
#'x <- c(rnorm(n/3,me/2,1),rnorm(2*n/3,-me/2,1))
#'y <- c(rnorm(n/3,0,1),rnorm(n/3,me,1),rnorm(n/3,-me,1))
#'dat <- cbind(mat,x,y)
#'dat<- as.data.frame(scale(dat)) # scaling
#'
#'# Create NAs
#'dat_with_miss <- miss_sim(dat,p=.1,seed_nr=120)
#'
#' res <- ClustImpute(dat_with_miss,nr_cluster=3)
#' predict(res,newdata=dat[1,])
#'
#'
#' @rdname predict
#' @export
predict.kmeans_ClustImpute <- function(object,newdata,...) {
pred <- ClusterR::predict_KMeans(newdata,object$centroids_matrix)
class(pred) <- "integer"
return(pred)
}
#' Plot showing marginal distribution by cluster assignment
#'
#' Returns a plot with the marginal distributions by cluster and feature. The plot shows histograms or boxplots and , as a ggplot object, can be modified further.
#'
#' @param x an object returned from ClustImpute
#' @param type either "hist" to plot a histogram or "box" for a boxplot
#' @param vline for "hist" a vertical line is plotted showing either the centroid value or the mean of all data points grouped by cluster and feature
#' @param hist_bins number of bins for histogram
#' @param color_bins color for the histogram bins
#' @param color_vline color for the vertical line
#' @param size_vline size of the vertical line
#' @param ... currently unused
#' @return Returns a ggplot object
#' @rdname plot
#' @export
plot.kmeans_ClustImpute <- function(x,type="hist",vline="centroids",hist_bins=30,color_bins="#56B4E9",color_vline="#E69F00",size_vline=2,...) {
complete_data <- x$complete_data
complete_data$Cluster <- x$cluster
# reshape data for ggplot
data4plot <- complete_data %>% tidyr::pivot_longer(!.data$Cluster, names_to = "Feature", values_to = "value")
if (type=="hist") {
# get value for vertical line
if (vline=="centroids") {
dataLine <- x$centroids %>% tidyr::pivot_longer(!.data$Cluster, names_to = "Feature", values_to = "value")
} else if (vline=="mean") {
dataLine <- data4plot %>%
dplyr::group_by(.data$Cluster,.data$Feature) %>%
dplyr::summarize(value = mean(.data$value))
} else stop("vline must be either 'centroids' or 'mean'")
p <- ggplot2::ggplot(data4plot) + ggplot2::geom_histogram(ggplot2::aes(x = .data$value), bins=hist_bins, fill=color_bins) +
ggplot2::facet_grid(.data$Feature~.data$Cluster,labeller = ggplot2::label_both) +
ggplot2::geom_vline(data=dataLine,ggplot2::aes(xintercept=.data$value),size=size_vline,color=color_vline) + ggplot2::theme_light()
} else if (type=="box") {
p <- ggplot2::ggplot(data4plot, ggplot2::aes(x = .data$value, y = .data$Feature)) + ggplot2::geom_boxplot() +
ggplot2::facet_grid(~.data$Cluster,labeller = ggplot2::label_both) + ggplot2::theme_classic()
} else stop("type must bei either 'hist' or 'box'")
return(p)
}
#' Print method for ClustImpute
#'
#' Returns a plot with the marginal distributions by cluster and feature. The plot shows histograms or boxplots and , as a ggplot object, can be modified further.
#'
#' @param x an object returned from ClustImpute
#' @param ... currently unused
#'
#' @return No return value (print function)
#'
#' @rdname print
#' @export
print.kmeans_ClustImpute = function(x,...) {
cat("Cluster centroids:")
print(knitr::kable(x$centroids))
cat("\n\n")
cat("Number of observations per cluster:")
print(knitr::kable(table(x$clusters)))
cat("Total number of observations:",dim(x$complete_data)[1])
}
#' Reduction of variance
#'
#' Computes one minus the ratio of the sum of all within cluster variances by the overall variance
#'
#' @param clusterObj Object of class kmeans_ClustImpute
#' @return integer value typically between 0 and 1
#' @examples
#'
#'# Random Dataset
#'set.seed(739)
#'n <- 750 # numer of points
#'nr_other_vars <- 2
#'mat <- matrix(rnorm(nr_other_vars*n),n,nr_other_vars)
#'me<-4 # mean
#'x <- c(rnorm(n/3,me/2,1),rnorm(2*n/3,-me/2,1))
#'y <- c(rnorm(n/3,0,1),rnorm(n/3,me,1),rnorm(n/3,-me,1))
#'dat <- cbind(mat,x,y)
#'dat<- as.data.frame(scale(dat)) # scaling
#'
#'# Create NAs
#'dat_with_miss <- miss_sim(dat,p=.1,seed_nr=120)
#'
#'res <- ClustImpute(dat_with_miss,nr_cluster=3)
#'var_reduction(res)
#'
#'@importFrom rlang .data
#'
#' @export
var_reduction <- function(clusterObj) {
# within cluster variances
within_var <- clusterObj$complete_data %>% dplyr::mutate(pred=clusterObj$clusters) # add prediciton column
cluster_sizes <- within_var %>% dplyr::group_by(.data$pred) %>% dplyr::count() %>%
dplyr::ungroup(.data$pred) %>% dplyr::select(.data$n) %>% dplyr::pull()
within_var <- within_var %>% dplyr::group_by(.data$pred) %>% dplyr::summarise_all(stats::var) %>%
dplyr::select(-.data$pred) %>% dplyr::summarise_all(stats::weighted.mean,w=cluster_sizes,na.rm=TRUE)
within_var_sum <- sum(within_var) # sum of all within cluster variances
# overall variance
overall_var <- sum(clusterObj$complete_data %>% dplyr::summarise_all(stats::var))
return(list(Variance_reduction=1-within_var_sum/overall_var,Variance_by_cluster=within_var))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.