R/Cluster.R

Defines functions seekPCA seekCluster seekKmeansClusterNumber

Documented in seekCluster seekKmeansClusterNumber seekPCA

# Install Package: Ctrl + Shift + B

# devtools::document()

#' Cluster number for k-means clustering
#'
#' Imports:
#' ggplot2
#'
#' @param data table, where rows are to be clustered
#' @param maxclusters integer, maximum number of clusters to show in the plot
#' @param resample integer, number of times the the algorithm repeats its procedure with random starting points
#' @param maxrecenter integer, maximum number of times the algorithm re-centeres the cluster by mean and re-assigns the data points. This is done until the clusters do not change anymore but maximum as many times as specified here.
#' @return plot object
#' @examples
#' seekKmeansClusterNumber
seekKmeansClusterNumber <- function(data0, maxclusters=15, resample=20, maxrecenter=10, useFactoextra=T){
  data <- as.data.frame(scale(data0))
  maxclusters <- min(maxclusters, nrow(data)-1) # ensure that cluster number doesnt exceed sample number
  k <- seq(maxclusters)
  variance <- sapply(k, function(x) sum(kmeans(data, centers=x, nstart=resample, iter.max=maxrecenter)$withinss)) # calculate sum of variances for each k
  df <- data.frame(Variance=variance, K=k)

  #elbow
  delta <- (c(0,variance)-c(variance,0))[k]
  spec <- (c(delta,0)/c(delta[-1],0,0))[k]
  spec[spec==Inf] <- 0
  hits <- sort(spec, decreasing = T)[1:3]
  elbow <- k[spec %in% hits]

  ggplot2::ggplot(df, ggplot2::aes(x=K, y=Variance)) +
    ggplot2::geom_vline(xintercept=elbow, linetype="dotted") +
    ggplot2::geom_line() +
    ggplot2::geom_point(size=5) +
    ggplot2::scale_x_continuous(breaks=k, name="number of clusters k") +
    #ggplot2::scale_y_continuous(expand=c(0,0), name="within groups sum of squares", limits=c(0,max(variance)*1.1)) +
    ggplot2::geom_label(x=2.2, y=mean(c(df$Variance[1],df$Variance[2])), hjust=0, size=6,
                       ggplot2::aes(label=paste0("re-sampling (nstart): ", resample, "\nre-centering max (iter.max): ", maxrecenter, "\nalgorithm: Hartigan and Wong (1979)"))) +
    ggplot2::theme_classic() +
    ggplot2::theme(text=ggplot2::element_text(size=18), panel.border=ggplot2::element_rect(fill=NA))
  #plot(k, variance, type="b", xlab="Number of Clusters", ylab="within groups sum of squares", xaxt="n")
  #axis(1, at=k)
}


#' k-means clustering for rows and hierarchical clustering for columns
#'
#' Imports:
#' ggplot2,
#' reshape2
#' tibble
#'
#' @param data0 data.frame, where rows are genes and columns are samples (possibility of having gene names as row names)
#' @param k integer, setting the number of clusters to be used (determine with seekKmeansClusterNumber())
#' @param samplecluster boolean, should clustering of samples be performed
#' @param kmeans.resample integer, number of times the the algorithm repeats its procedure with random starting points
#' @param maxrecenter integer, maximum number of times the algorithm re-centeres the cluster by mean and re-assigns the data points. This is done until the clusters do not change anymore but maximum as many times as specified here.
#' @param dendro.distmethod string, one of: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
#' @param dendro.linkmethod string, one of: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid"
#' @param tileColors vector of 3 strings depicting colors for low, medium & high values
#' @param showRows boolean, should row names be displayed in heatmap?
#' @param helpLinesV vector of numbers, indicating at what position vertical lines should be drawn
#' @param helpLinesH boolean, indicating if horizontal lines should be drawn to indicate clusters
#' @param ownCluster a vector of same length AND ORDER as data. This vector will just be cbind to the data. If provided, no kmeans clustering be calculated. For the graph, the clusters will be sorted alphabetically from bottom to top of the graph.
#' @param showClusters should clusters be indicatd by colorful lines?
#' @param tileSepColor color to seperate tiles by (not recommended for heatmaps with many tiles)
#' @return list with 1) a data.frame, same as "data", but with an additional column, defining the cluster for each row, 2) the data.frame used as input for the plot, 3) a ggplot object
#' @examples
#' seekKmeansCluster(mx, 5)
seekCluster <- function(data0, k=2, scale=T, samplecluster=T, kmeans.resample=50, maxrecenter=10,
                        dendro.distmethod="euclidean", dendro.linkmethod="complete",
                        tileColors=c("royalblue4","white","red"), showRows=F,
                        helpLinesV=NA, helpLinesH=FALSE, ownCluster=NA, showClusters=F, tileSepColor=NA){
  colnames(data0) <- make.names(colnames(data0), unique=T)
  # perform kmeans clustering (if not already given)
  if(is.na(ownCluster[1])){
    data <- na.omit(data0)
    data <- data[apply(data,1,var)>0,]
    if(scale) data <- as.data.frame(t(scale(t(data))))
    fit <- kmeans(data, centers = k, nstart = kmeans.resample, iter.max = maxrecenter)
    res <- data.frame(data, fit$cluster)
  }else{
    data <- data0
    if(scale) data <- as.data.frame(t(scale(t(data))))
    res <- data.frame(data, fit.cluster=ownCluster)
  }

  # melt data.frame, order by cluster and give levels to rownames for plot ordering purposes
  res2 <- reshape2::melt(tibble::rownames_to_column(res), id=c("rowname", "fit.cluster"))
  res2 <- res2[order(res2$fit.cluster),]
  res2$rowname <- factor(res2$rowname, levels=unique(res2$rowname))
  res2$variable <- gsub("^X","",res2$variable)

  #perform hierarchical clustering
  if(samplecluster){
    data2 <- t(data)
    distmat <- dist(data2, method=dendro.distmethod)
    clust <- hclust(distmat, method=dendro.linkmethod)
    res2$variable <- factor(res2$variable, levels=clust$labels[clust$order])
  }else{
    res2$variable <- factor(res2$variable, levels=colnames(data0))
  }

  #data.frame to color clusters in the plot
  yend=0.5
  clusterlines <- data.frame(x=0, y=0, cluster=0)
  for(i in 1:k){
    xendstart <- ncol(data)+0.5
    ystart <- yend
    yend <- ystart + nrow(subset(res, fit.cluster %in% sort(as.vector(unique(res$fit.cluster)))[i]))
    clusterlines <- rbind(clusterlines, c(xendstart, ystart, i), c(xendstart, yend, i))
  }
  clusterlines <- clusterlines[-1,]
  clusterlines$cluster <- as.character(clusterlines$cluster)

  #lines to indicate clusters in the plot
  if(helpLinesH){helpLinesH <- clusterlines[seq(2, nrow(clusterlines), by=2), "y"]}else{helpLines <- NA}

  #the plot
  plot1 <- ggplot2::ggplot(res2, ggplot2::aes(y=rowname, x=variable)) +
    ggplot2::geom_tile(ggplot2::aes(fill = value), color=tileSepColor) +
    ggplot2::scale_fill_gradient2(low=tileColors[1], mid=tileColors[2], high=tileColors[3], midpoint=0) +
    ggplot2::theme_minimal()+ #removes axes lines
    ggplot2::theme(plot.title=ggplot2::element_blank(), axis.title=ggplot2::element_blank()) +
    ggplot2::scale_x_discrete(expand = c(0, 0)) + #removes gap between label and graph
    ggplot2::scale_y_discrete(expand = c(0, 0)) +
    ggplot2::geom_vline(xintercept=helpLinesV) +
    ggplot2::geom_hline(yintercept=helpLinesH, size=0.5)
  if(showClusters){
    plot1 <- plot1 +
      ggplot2::geom_line(data=clusterlines, ggplot2::aes(x=x, y=y, color=cluster), size=5) +
      ggplot2::scale_color_manual(values = rainbow(k))
  }
  if(!showRows){
    plot1 <- plot1 + ggplot2::theme(axis.text.y=ggplot2::element_blank())
  }

  if(samplecluster){
    plot2 <- ggdendro::ggdendrogram(clust) + ggplot2::theme(axis.text.y=ggplot2::element_blank(), axis.text.x=ggplot2::element_blank())
    plot <- ggpubr::ggarrange(plot2, plot1, ncol=1, nrow=2, align="h", heights=c(3,7), common.legend = T, legend="right")
  }else{
    plot <- plot1
  }

  cat("\n#=======================================================#",
            "\nk means clustering performed with:\n\tre-sampling (nstart): ", kmeans.resample,
            "\n\tre-centering max (iter.max): ", maxrecenter,
            "\n\talgorithm: Hartigan and Wong (1979)",
            "\n#=======================================================#",
            "\nhierarchical clusterinig performed with:\n\tditance matrix method: ", dendro.distmethod,
            "\n\tlinking method: ", dendro.linkmethod,
            "\n#=======================================================#\n"
  )

  if(samplecluster){
    list(ggplot=plot, table=res, ggplotTable=res2, hclust_distanceMatrix=distmat, hclust_clusters=clust$labels[clust$order])
  }else{
    list(ggplot=plot, table=res, ggplotTable=res2)
  }

}


#' PCA
#'
#' Imports:
#' limma
#'
#' @param Counts matrix oder data.frame with raw counts, Entrez IDs as rownames, and sample names as colnames
#' @param Groups a vector of integers, indicating which column belongs to which group. Must be in the same order as the columns.
#' @param Batches a vector of integers, indicating which column belongs to which batch. Must be in the same order as the columns. Is used to remove batch effects (limma). Note that two batches must have at least one shared group.
#' @return A matrix where each row is a sample, and the three columns are principal components
#' @examples
#' myCounts
#' myGroup <- c(1,1,2,1,2,2) #(e.g. you have WT,WT,KO,WT,KO,KO)
#' myBatch <- c(1,1,1,2,2,2)
#' myPCA <- seekPCA(myCounts, myGroup, myBatch)
seekPCA <- function(Counts, Groups, Batches=NA){
  #==input==#
  design <- model.matrix(~Groups+0)

  #==code==#
  # batch correction if batches are defined
  if(is.na(Batches[1])){
    cluster <- log2(na.omit(Counts)+1)}else{
      cluster <- limma::removeBatchEffect(log2(na.omit(Counts)+1), batch=Batches, design=design)
    }

  # PCA using prcomp
  rMax <- apply(cluster, 1, max)
  rMin <- apply(cluster, 1, min)
  pca <- prcomp(t(cluster[rMax>rMin,]), scale = T)
  pca <- cbind(as.data.frame(pca$x)[,1:3], groups=as.character(Groups))

  #==output==#
  pca
}
Solatar/seeqR documentation built on Feb. 19, 2021, 8:07 p.m.