#' A k-means clustering function
#'
#' This function allows you to apply k-means clustering algorithm, return a graphical heatmap output, group change trends and features belonging to each k-group.
#' @param DataDir Defaults to a randomly generated matrix from the distribution_test_mat function.
#' @param returnClustMeanPlots Defaults to True, allows to return mean plots for each k group.
#' @param returnClustFeatures Defaults to True, allows to return the names from features inside the k-groups.
#' @param k Defaults to 5, is the number of k-partitions.
#' @param nboots Defaults to 100, is the number of iterations in the k-means clustering to get a consensus.
#' @param AU_p_value Defaults to 0.95 or in other words the classic P value 0.05 treshold to define a significant cluster.
#' @param ClustMethod Defaults to average, inherits from the ComplexHeatmap package.
#' @param ClustDist Defaults to pearson, inherits from the ComplexHeatmap package.
#' @param consensusYAxis allows the user to default the ggplot y-axes to the max and min observation value
#' @keywords class discovery, clustering, ComplexHeatmap
#' @export
#' @examples
#' test <- KmeansPlus()
KmeansPlus <- function(DataDir = distribution_test_mat(),
returnClustMeanPlots = T,
returnClustFeatures = T,
k = 5,
n_boot = 100,
ClustMethod = "average",
ClustDist = "pearson",
consensusYAxis = F) {
## necessary functions
list2df <- function(x)
{
MAX.LEN <- max(sapply(x, length), na.rm = TRUE)
DF <- data.frame(lapply(x, function(x) c(x, rep(NA, MAX.LEN - length(x)))))
colnames(DF) <- paste("V", seq(ncol(DF)), sep = "")
DF
}
## data
if (class(DataDir) == "matrix") {
data <- DataDir
if (is.null(colnames(DataDir))) {
## two artificial factors in case a trial matrix without colnames is inputted
colnames(data) <- c(paste0(rep("X", ((dim(data)[2])/2)), ".", c(1:((dim(data)[2])/2))),
paste0(rep("Y", ((dim(data)[2])/2)), ".", c(1:((dim(data)[2])/2))))
}
if (is.null(rownames(DataDir))) {
## two artificial factors in case a trial matrix without colnames is inputted
rownames(data) <- c(paste0(rep("A", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("B", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("C", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("D", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("E", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("F", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("G", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("H", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("E", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))),
paste0(rep("F", ((dim(data)[1])/10)), ".", c(1:((dim(data)[1])/10))))
}
} else if (class(DataDir) == "character") {
data <- read.table(file = DataDir, header = T, sep = "\t", row.names = 1)
} else {
cat("You can only feed me either a matrix or a directory path to one tab delimited matrix file")
}
data_mat <- as.matrix(data)
## main
## test maximum values in your data
cat("\n")
print(summary(melt(as.matrix(data))[,"value"]))
cat("\n")
return_decision <- readline(prompt="Do you want me to scale matrix ? (Y/N): ")
cat("\n")
if (return_decision == "Y") {
scaled_mat <- (data_mat - rowMeans(data_mat)) / rowSds(data_mat)
col = c(-2.5, -1, 0, 1, 2.5)
## test AGAIN maximum values in your data
"\n"
cat("Scaled data summary: ", "\n")
print(summary(melt(scaled_mat)[,"value"]))
## Data should be now between with a lower interval.
} else {
scaled_mat = data_mat
col = as.numeric(summary(melt(as.matrix(data))[,"value"])[-3])
}
cat("\n")
cat("Iterating Kmean clustering to get consensus...", "\n")
cat("\n")
### treatments must be well defined in the first row as column identifiers
Treatment_vector <- list2df(strsplit(colnames(data), split = "\\."))[1,]
type = gsub("s\\d+_", "", lapply(Treatment_vector, FUN = as.character))
ha = HeatmapAnnotation(df = data.frame(type = type))
rowOrder <- row_order(Heatmap(matrix = scaled_mat,
name = "Intensities",
km = k, ## five K-means
row_km_repeats = n_boot, ## repeats to get consensus
col = colorRamp2(col,
c("white",
"yellow",
"darkgoldenrod1",
"violet",
"purple")),
top_annotation = ha ,
show_column_names = F,
show_row_names = T,
row_dend_reorder = T,
cluster_columns = F,
cluster_rows = T,
clustering_method_rows = ClustMethod,
clustering_distance_rows = ClustDist))
## output 1 = metabolites per treatment
out_list <- list()
clust_ID <- c()
for (i in 1:length(rowOrder)) {
clust_ID[i] <- paste0("Cluster ", i)
out_list[[i]] <- rownames(scaled_mat)[rowOrder[[i]]]
}
out_mat <- list2df(out_list)
colnames(out_mat) <- clust_ID
## Generating consensus y-axis
if (class(consensusYAxis) == "logical") {
if (consensusYAxis == T) {
Axis_max <- c()
Axis_min <- c()
for (i in 1:length(rowOrder)) {
cluster_K_mat <- scaled_mat[rowOrder[[i]],]
if (is.null(dim(cluster_K_mat))) {
} else {
Cluster_Melted2 <- melt(cluster_K_mat)
Axis_max[i] <- max(Cluster_Melted2$value)
Axis_min[i] <- min(Cluster_Melted2$value)
}
}
y_Axis <- c(min(Axis_min), max(Axis_max))
}
} else if (class(consensusYAxis) == "numeric") {
y_Axis <- consensusYAxis
} else {
cat("consensusYAxis: Provide either logical F or T, or numeric axes coords, e.g., c(x0,xn)")
}
## returning mean plots per cluster
if (returnClustMeanPlots == T) {
pdf("ClustMeanSEplots.pdf")
par(mar = c(10,5,1,1))
for (i in 1:length(rowOrder)) {
cat("Producing Plot # ", i, "\n")
cluster_K_mat <- scaled_mat[rowOrder[[i]],]
if (is.null(dim(cluster_K_mat))) {
boxplot(cluster_K_mat, las = 2, cex.axis = 0.5, main = paste0("Cluster No.", i))
} else {
boxplot(cluster_K_mat, las = 2, cex.axis = 0.5, main = paste0("Cluster No.", i))
Cluster_Melted2 <- melt(cluster_K_mat)
Cluster_Melted2[,"Var2"] <- as.numeric(Cluster_Melted2$Var2)
Treatments <- as.factor(apply(list2df(strsplit(as.character(melt(cluster_K_mat)[,2]), "\\."))[1,], 2, as.character))
c <- ggplot(Cluster_Melted2, aes(x = Var2 , y = value))
if (consensusYAxis != F) {
print(c +
geom_smooth() +
ylim(y_Axis) +
geom_point(aes(colour = Treatments)) +
ggtitle(paste0("Cluster No.", i)))
} else {
print(c +
geom_smooth() +
geom_point(aes(colour = Treatments)) +
ggtitle(paste0("Cluster No.", i)))
}
}
}
dev.off()
}
if (returnClustFeatures == T) {
return(out_mat)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.