R/samplePCA.R

Defines functions samplePCA

Documented in samplePCA

#' Sample PCA plot for RNASeq
#' @description PCA plots of samples for DE analysis
#'
#' @param expressionMatrix Data frame with variance-stabilized expression data (in DESeq would be obtained by counts(dds, normalized = TRUE))
#' @param numberOfPCs Integer value for number of principal components
#' @param clusters User has the option of choosing how many clusters they want (k Means)
#' @param autoClustering Boolean. If TRUE the function will determine the optimal number of k means
#' @param colorBy User has "sample" and "cluster" options to determine which colors the plot
#'
#' @return List with 4 plots, `variancePlot` has cumulative and individual importance of each PC, `pcaGRid` is a grid of PCA plots, `pca3dPlot` has 3D PCA, and `clusteringPlot` is is the plot generated by eclust on the first 2 PCs
#' @author Felipe Flores
#' @import tidyverse GGally plotly
#'
#' @export


samplePCA <- function(
  expressionMatrix,
  numberOfPCs = 2,
  clusters = 1,
  autoClustering = FALSE,
  colorBy = "sample")
{

  returnList <- list()
  expressionMatrix <- expressionMatrix[which(apply(expressionMatrix, 1, var)!=0), ]
  pca <- stats::prcomp(t(expressionMatrix),center=TRUE,scale=TRUE)

  pcaImportanceDF <- pca %>%
    broom::tidy(matrix="pcs") %>%
    dplyr::rename(.,Individual=percent,Cumulative=cumulative) %>%
    tidyr::gather(key,value,Individual,Cumulative)

  importancePlot <- pcaImportanceDF %>%
    ggplot2::ggplot(aes(x=PC,y=value,group=key,color=key))+
    theme_bw()+
    geom_line()+
    geom_point()+
    labs(title="Importance of PCs",y="Percent of Variance explained")+
    theme(plot.title = element_text(hjust=0.5), legend.title = element_blank())

  pcaDF <- as_data_frame(pca$x,rownames = "Sample")

  pca3dPlot <- pcaDF %>%
    plotly::plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, text = ~Sample, mode="marker+text")

  # Code has not changed from the first couple plots up to here

  # Case 1: If either the user specifies a number of clusters or automatic clustering, then
  # this part of code is triggered
  if (clusters != 1 || autoClustering) {
    # Using factoextra's eclust to find clusters
    pdf(NULL)
    # This is just to suppress the graphical output for the moment, but we wanna save this plot later
    eclustering <- factoextra::eclust(
      t(expressionMatrix),
      FUNcluster = "kmeans",
      k=clusters,
      k.max=min(10,nrow(t(expressionMatrix))-1)
    )
    dev.off()

    # We customize the clustering plot from eclust a little bit:
    clusteringPlot <- eclustering$clust_plot +
      theme_bw()+
      theme(plot.title = element_text(hjust = 0.5))

    returnList <- list(clusteringPlot = clusteringPlot)

    # Join the dataframe with the clustering information as a tibble
    pcaDF <- pcaDF %>%
      dplyr::right_join(dplyr::data_frame(
        Sample=names(eclustering$cluster),
        Cluster=LETTERS[eclustering$cluster]
      ))

    pca3dPlot <- pca3dPlot %>%
      plotly::add_trace(data = pcaDF, color = ~Cluster, colors = "Paired", showlegend = FALSE)

    # Notice I converted the clusters to letters instead of numbers.
    # ggpairs can deal with this categorical data much better.
    # Factors could also have been used but meh /shrug

    # Generating the grid plot with clusters for colors and samples for shapes

    # This part is added because ggpairs allows up to 6 different shapes
    # Maybe in the future they'll implement a feature to increment them
    # Any more than that and the data points will disappear!
    # In such a case, we flip the shape and cluster parameters

    if (ncol(expressionMatrix)>6){
      graphingParameters <- list(mapping=aes(shape=Cluster,col=Sample))
    } else{
      # graphingParameters <- list(mapping=aes(col=Cluster,shape=Sample))
      switch (colorBy,
        "cluster" = {graphingParameters <- list(mapping=aes(col = Cluster, shape = Sample))},
        "sample" = {graphingParameters <- list(mapping=aes(col = Sample, shape = Cluster))}
      )
    }
  }


  else { # Case 2: User wants no clustering
    graphingParameters <- list(mapping=aes(col=Sample))
  }

  graphingParameters <- c(
    graphingParameters, # aes() parameters from the nested if's
    list(
      data=pcaDF,
      columns=2:(numberOfPCs+1),
      lower=list(continuous=GGally::wrap("points",size=3)),
      upper=NULL,
      legend = c(2,1)
    )
  )

  pcaPlot <- do.call(GGally::ggpairs,graphingParameters) # Generate the plot with these parameters

  # Add a few details:
  pcaPlot <- pcaPlot + theme_bw() +
    labs(title="Principal Components")+
    theme(plot.title=element_text(hjust=0.5))



  # And return them to the user!
  returnList <-
    c(returnList,
      list(
        importancePlot = importancePlot,
        pcaPlot = pcaPlot,
        pca3dPlot = pca3dPlot
      )
    )

  return(returnList)

} # end of function
polyak-lab/felipeCommonFunctions documentation built on May 26, 2019, 12:32 a.m.