# Install Package: Ctrl + Shift + B
# devtools::document()
#' Cluster number for k-means clustering
#'
#' Imports:
#' ggplot2
#'
#' @param data table, where rows are to be clustered
#' @param maxclusters integer, maximum number of clusters to show in the plot
#' @param resample integer, number of times the the algorithm repeats its procedure with random starting points
#' @param maxrecenter integer, maximum number of times the algorithm re-centeres the cluster by mean and re-assigns the data points. This is done until the clusters do not change anymore but maximum as many times as specified here.
#' @return plot object
#' @examples
#' seekKmeansClusterNumber
seekKmeansClusterNumber <- function(data0, maxclusters=15, resample=20, maxrecenter=10, useFactoextra=T){
data <- as.data.frame(scale(data0))
maxclusters <- min(maxclusters, nrow(data)-1) # ensure that cluster number doesnt exceed sample number
k <- seq(maxclusters)
variance <- sapply(k, function(x) sum(kmeans(data, centers=x, nstart=resample, iter.max=maxrecenter)$withinss)) # calculate sum of variances for each k
df <- data.frame(Variance=variance, K=k)
#elbow
delta <- (c(0,variance)-c(variance,0))[k]
spec <- (c(delta,0)/c(delta[-1],0,0))[k]
spec[spec==Inf] <- 0
hits <- sort(spec, decreasing = T)[1:3]
elbow <- k[spec %in% hits]
ggplot2::ggplot(df, ggplot2::aes(x=K, y=Variance)) +
ggplot2::geom_vline(xintercept=elbow, linetype="dotted") +
ggplot2::geom_line() +
ggplot2::geom_point(size=5) +
ggplot2::scale_x_continuous(breaks=k, name="number of clusters k") +
#ggplot2::scale_y_continuous(expand=c(0,0), name="within groups sum of squares", limits=c(0,max(variance)*1.1)) +
ggplot2::geom_label(x=2.2, y=mean(c(df$Variance[1],df$Variance[2])), hjust=0, size=6,
ggplot2::aes(label=paste0("re-sampling (nstart): ", resample, "\nre-centering max (iter.max): ", maxrecenter, "\nalgorithm: Hartigan and Wong (1979)"))) +
ggplot2::theme_classic() +
ggplot2::theme(text=ggplot2::element_text(size=18), panel.border=ggplot2::element_rect(fill=NA))
#plot(k, variance, type="b", xlab="Number of Clusters", ylab="within groups sum of squares", xaxt="n")
#axis(1, at=k)
}
#' k-means clustering for rows and hierarchical clustering for columns
#'
#' Imports:
#' ggplot2,
#' reshape2
#' tibble
#'
#' @param data0 data.frame, where rows are genes and columns are samples (possibility of having gene names as row names)
#' @param k integer, setting the number of clusters to be used (determine with seekKmeansClusterNumber())
#' @param samplecluster boolean, should clustering of samples be performed
#' @param kmeans.resample integer, number of times the the algorithm repeats its procedure with random starting points
#' @param maxrecenter integer, maximum number of times the algorithm re-centeres the cluster by mean and re-assigns the data points. This is done until the clusters do not change anymore but maximum as many times as specified here.
#' @param dendro.distmethod string, one of: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
#' @param dendro.linkmethod string, one of: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid"
#' @param tileColors vector of 3 strings depicting colors for low, medium & high values
#' @param showRows boolean, should row names be displayed in heatmap?
#' @param helpLinesV vector of numbers, indicating at what position vertical lines should be drawn
#' @param helpLinesH boolean, indicating if horizontal lines should be drawn to indicate clusters
#' @param ownCluster a vector of same length AND ORDER as data. This vector will just be cbind to the data. If provided, no kmeans clustering be calculated. For the graph, the clusters will be sorted alphabetically from bottom to top of the graph.
#' @param showClusters should clusters be indicatd by colorful lines?
#' @param tileSepColor color to seperate tiles by (not recommended for heatmaps with many tiles)
#' @return list with 1) a data.frame, same as "data", but with an additional column, defining the cluster for each row, 2) the data.frame used as input for the plot, 3) a ggplot object
#' @examples
#' seekKmeansCluster(mx, 5)
seekCluster <- function(data0, k=2, scale=T, samplecluster=T, kmeans.resample=50, maxrecenter=10,
dendro.distmethod="euclidean", dendro.linkmethod="complete",
tileColors=c("royalblue4","white","red"), showRows=F,
helpLinesV=NA, helpLinesH=FALSE, ownCluster=NA, showClusters=F, tileSepColor=NA){
colnames(data0) <- make.names(colnames(data0), unique=T)
# perform kmeans clustering (if not already given)
if(is.na(ownCluster[1])){
data <- na.omit(data0)
data <- data[apply(data,1,var)>0,]
if(scale) data <- as.data.frame(t(scale(t(data))))
fit <- kmeans(data, centers = k, nstart = kmeans.resample, iter.max = maxrecenter)
res <- data.frame(data, fit$cluster)
}else{
data <- data0
if(scale) data <- as.data.frame(t(scale(t(data))))
res <- data.frame(data, fit.cluster=ownCluster)
}
# melt data.frame, order by cluster and give levels to rownames for plot ordering purposes
res2 <- reshape2::melt(tibble::rownames_to_column(res), id=c("rowname", "fit.cluster"))
res2 <- res2[order(res2$fit.cluster),]
res2$rowname <- factor(res2$rowname, levels=unique(res2$rowname))
res2$variable <- gsub("^X","",res2$variable)
#perform hierarchical clustering
if(samplecluster){
data2 <- t(data)
distmat <- dist(data2, method=dendro.distmethod)
clust <- hclust(distmat, method=dendro.linkmethod)
res2$variable <- factor(res2$variable, levels=clust$labels[clust$order])
}else{
res2$variable <- factor(res2$variable, levels=colnames(data0))
}
#data.frame to color clusters in the plot
yend=0.5
clusterlines <- data.frame(x=0, y=0, cluster=0)
for(i in 1:k){
xendstart <- ncol(data)+0.5
ystart <- yend
yend <- ystart + nrow(subset(res, fit.cluster %in% sort(as.vector(unique(res$fit.cluster)))[i]))
clusterlines <- rbind(clusterlines, c(xendstart, ystart, i), c(xendstart, yend, i))
}
clusterlines <- clusterlines[-1,]
clusterlines$cluster <- as.character(clusterlines$cluster)
#lines to indicate clusters in the plot
if(helpLinesH){helpLinesH <- clusterlines[seq(2, nrow(clusterlines), by=2), "y"]}else{helpLines <- NA}
#the plot
plot1 <- ggplot2::ggplot(res2, ggplot2::aes(y=rowname, x=variable)) +
ggplot2::geom_tile(ggplot2::aes(fill = value), color=tileSepColor) +
ggplot2::scale_fill_gradient2(low=tileColors[1], mid=tileColors[2], high=tileColors[3], midpoint=0) +
ggplot2::theme_minimal()+ #removes axes lines
ggplot2::theme(plot.title=ggplot2::element_blank(), axis.title=ggplot2::element_blank()) +
ggplot2::scale_x_discrete(expand = c(0, 0)) + #removes gap between label and graph
ggplot2::scale_y_discrete(expand = c(0, 0)) +
ggplot2::geom_vline(xintercept=helpLinesV) +
ggplot2::geom_hline(yintercept=helpLinesH, size=0.5)
if(showClusters){
plot1 <- plot1 +
ggplot2::geom_line(data=clusterlines, ggplot2::aes(x=x, y=y, color=cluster), size=5) +
ggplot2::scale_color_manual(values = rainbow(k))
}
if(!showRows){
plot1 <- plot1 + ggplot2::theme(axis.text.y=ggplot2::element_blank())
}
if(samplecluster){
plot2 <- ggdendro::ggdendrogram(clust) + ggplot2::theme(axis.text.y=ggplot2::element_blank(), axis.text.x=ggplot2::element_blank())
plot <- ggpubr::ggarrange(plot2, plot1, ncol=1, nrow=2, align="h", heights=c(3,7), common.legend = T, legend="right")
}else{
plot <- plot1
}
cat("\n#=======================================================#",
"\nk means clustering performed with:\n\tre-sampling (nstart): ", kmeans.resample,
"\n\tre-centering max (iter.max): ", maxrecenter,
"\n\talgorithm: Hartigan and Wong (1979)",
"\n#=======================================================#",
"\nhierarchical clusterinig performed with:\n\tditance matrix method: ", dendro.distmethod,
"\n\tlinking method: ", dendro.linkmethod,
"\n#=======================================================#\n"
)
if(samplecluster){
list(ggplot=plot, table=res, ggplotTable=res2, hclust_distanceMatrix=distmat, hclust_clusters=clust$labels[clust$order])
}else{
list(ggplot=plot, table=res, ggplotTable=res2)
}
}
#' PCA
#'
#' Imports:
#' limma
#'
#' @param Counts matrix oder data.frame with raw counts, Entrez IDs as rownames, and sample names as colnames
#' @param Groups a vector of integers, indicating which column belongs to which group. Must be in the same order as the columns.
#' @param Batches a vector of integers, indicating which column belongs to which batch. Must be in the same order as the columns. Is used to remove batch effects (limma). Note that two batches must have at least one shared group.
#' @return A matrix where each row is a sample, and the three columns are principal components
#' @examples
#' myCounts
#' myGroup <- c(1,1,2,1,2,2) #(e.g. you have WT,WT,KO,WT,KO,KO)
#' myBatch <- c(1,1,1,2,2,2)
#' myPCA <- seekPCA(myCounts, myGroup, myBatch)
seekPCA <- function(Counts, Groups, Batches=NA){
#==input==#
design <- model.matrix(~Groups+0)
#==code==#
# batch correction if batches are defined
if(is.na(Batches[1])){
cluster <- log2(na.omit(Counts)+1)}else{
cluster <- limma::removeBatchEffect(log2(na.omit(Counts)+1), batch=Batches, design=design)
}
# PCA using prcomp
rMax <- apply(cluster, 1, max)
rMin <- apply(cluster, 1, min)
pca <- prcomp(t(cluster[rMax>rMin,]), scale = T)
pca <- cbind(as.data.frame(pca$x)[,1:3], groups=as.character(Groups))
#==output==#
pca
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.