R/PAC_pca.R

Defines functions PAC_pca

Documented in PAC_pca

#'Principle component analysis with scatterplots from PAC
#'
#'\code{PAC_pca} PAC principle component analysis.
#'
#'Given a PAC object the function will perform a principle component analysis by
#'calling the PCA function in the FactoMineR package, and then plot scatter
#'plots with the fviz_pca functions of the factoextra package.
#'
#'@family PAC analysis
#'
#'@seealso \url{https://github.com/Danis102} for updates on the current package.
#'
#'@param PAC PAC-list object.
#'
#'@param norm Character indicating what type of data to be used. If
#'  type="counts" the PCA will be conducted on the raw Counts. If type="cpm" the
#'  analysis will be done on cpm values returned from \code{PAC_norm} function
#'  and stored in the norm folder of the PAC-list object. The name of any other
#'  table in the norm(PAC) folder can also be used.
#'  
#'@param type Character indicating what type of pca and plot to be drawn. When
#'  type="pheno" then results will be drawned from a sample perspective, while
#'  if type="anno" then results will be presented from a sequence perspective.
#'  If type="both" then a biplot will be drawn, representing both pheno and anno
#'  features. Note, the 1st object (target column name) in the pheno_target or
#'  anno_taget can be used to specifically highlight sample and sequence groups
#'  (see below).
#'  
#'@param graphs Logical whether or not scatter plots should be plotted.
#'  (default=TRUE)
#'  
#'@param anno_target List with: 1st object being a character vector of target
#'  column(s) in Anno, 2nd object being a character vector of the target
#'  features(s) in the target column (1st object). The 1st object also control
#'  sequence feature colors when type="anno". (default=NULL)
#'
#'@param pheno_target List with: 1st object being a character vector of target
#'  column(s) in Pheno, 2nd object being a character vector of the target
#'  group(s) in the target column (1st object). The 1st object also control
#'  group colors when type="pheno" or "both". (default=NULL)
#'  
#'@param labels  If labels="sample", then points will be labeled with the names
#'  rownames in pheno(PAC) or anno(PAC) depending on type="pheno" or
#'  type="anno", respectively. Point labels can also be manually provided as a
#'  character vector in the same length as the intended target. As default,
#'  labels=NULL where only point are plotted.

#'@param ... parsing to the fviz_pca functions of the factoextra package. 
  
#'@return A PCA list object generated by the PCA function in the FactoMineR
#'  package
#'
#' @examples
#'
#' # Load a PAC-object 
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                   package = "seqpac", mustWork = TRUE))
#' 
#' # Simple sample counts pca and scatterplots with no groupings: 
#' pca_cnt <- PAC_pca(pac, norm="counts")
#' 
#' # Sample cpm pca and scatterplots with color groupings from 
#' # pheno(PAC)$type column:    
#' pca_cpm <- PAC_pca(pac, norm="cpm", type="pheno", 
#'                    pheno_target=list("stage"))
#' 
#' # Same but with or without text labels: 
#' pca_cpm_lab <- PAC_pca(pac, norm="cpm", type="pheno", 
#'                        pheno_target=list("stage"), labels="samples")
#' pca_cpm_lab2 <- PAC_pca(pac, norm="cpm", type="pheno", 
#'                         pheno_target=list("stage"), 
#'                         labels=pheno(pac)$batch)
#' 
#' # Cpm pca with anno(PAC) sequence features instead of Pheno samples and 
#' # restricted to read size 20-22:
#' pca_cpm_anno <- PAC_pca(pac, norm="cpm", type="anno", 
#'                         anno_target=list("Size", 20:22))
#' 
#' # Cpm pca as biplot:
#' pca_cpm_bi <- PAC_pca(pac, norm="cpm", type="both", 
#'                       pheno_target=list("stage"))
#' 
#' # Plot individual graphs
#' pca_cpm_bi$graphs[[1]]
#' pca_cpm_lab$graphs[[1]]
#' pca_cpm_anno$graphs[[3]]
#' 
#' # Extract pca output
#' pca_cpm_anno$pca
#' 
#' @importFrom ggplot2 geom_hline geom_vline geom_point aes 
#' theme scale_colour_gradient theme_minimal xlab ylab
#' @export

PAC_pca <- function(PAC, norm="counts", type="pheno", graphs=TRUE, 
                    pheno_target=NULL, anno_target=NULL, labels=NULL, ...){
  Dim.1 <- Dim.2 <- Dim.3 <- NULL
  ## Check S4
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  if(length(pheno_target)==2){
    PAC <- PAC_filter(PAC, subset_only=TRUE, 
                                       pheno_target=pheno_target)
    
    }
  if(length(anno_target)==2){
    PAC <- PAC_filter(PAC, subset_only=TRUE, 
                                       anno_target=anno_target)
    }
  stopifnot(PAC_check(PAC))
  
  if(norm=="counts"){
      data <- PAC$Counts 
    }else{
      data <- PAC$norm[[norm]]
    }   
  
  if(!is.null(labels)){
      geom <- c("point", "text")
      if(length(labels) > 1){
            if(any(duplicated(labels))){
              colnames(data) <- paste(labels, seq.int(ncol(data)), sep="_")
            }else{
              colnames(data) <- as.character(labels)
              }
      }
  }else{ 
    geom <- "point" 
    }
  
  if(type=="pheno"|type=="both"){
    data <- t(as.matrix(data))
    }
  pca_res <- FactoMineR::PCA(data, graph=FALSE)
  
  if(graphs==FALSE){
    return(pca_res)
  }else{

    if(!is.null(pheno_target)|!is.null(anno_target)){
      if(type=="pheno"){
        # Check likely type of data
        n_sampl <- nrow(PAC$Pheno)
        col <- PAC$Pheno[,pheno_target[[1]]]
        if(is.matrix(col)){
          col <- col[,1]
        }
        rtio <- length(unique(col))/n_sampl
      if(is.factor(col)|rtio<0.4){
        col <- as.factor(as.character(col))
      }
      if(is.numeric(col)|is.integer(col)|rtio>0.4){
        col <- as.numeric(col)
      }
    }
      
      if(type=="anno"){
        col <- as.factor(as.character(PAC$Anno[,anno_target[[1]]]))
        }
      if(type=="both"){
        if(!is.null(pheno_target)){
        col <- as.factor(as.character(PAC$Pheno[,pheno_target[[1]]]))
      }else{
        col <- "none"}}
    }else{
      col <- "none"}
  
  grphs <- list(PC1_PC2=NULL, PC1_PC3=NULL, PC2_PC3=NULL)
# Build graphs depending on type
    if(type=="pheno"){
    # For gradient/numeric use ggplot2
      if(is.numeric(col)){
        coord<- as.data.frame(pca_res$ind$coord)
        con<- as.data.frame(pca_res$eig[,"percentage of variance"])
        grphs$PC1_PC2 <- ggplot2::ggplot() +
          geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_point(data=coord, aes(x=Dim.1, y=Dim.2, colour=col)) + 
          theme(legend.position="none") +
          scale_colour_gradient(low="#00FFE6", high="#FF0000") +
          theme_minimal() +
          xlab(paste0("PC1 (", round(con["comp 1",], digits=2), "%)")) + 
          ylab(paste0("PC2 (", round(con["comp 2",], digits=2), "%)"))

        grphs$PC1_PC3 <- ggplot2::ggplot() +
          geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_point(data=coord, aes(x=Dim.1, y=Dim.3, colour=col)) + 
          theme(legend.position="none") +
          scale_colour_gradient(low="#00FFE6", high="#FF0000") +
          theme_minimal() +
          xlab(paste0("PC1 (", round(con["comp 1",], digits=2), "%)")) + 
          ylab(paste0("PC3 (", round(con["comp 3",], digits=2), "%)"))

        grphs$PC2_PC3 <- ggplot2::ggplot() +
          geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
          geom_point(data=coord, aes(x=Dim.2, y=Dim.3, colour=col)) + 
          theme(legend.position="none") +
          scale_colour_gradient(low="#00FFE6", high="#FF0000") +
          theme_minimal() +
          xlab(paste0("PC2 (", round(con["comp 2",], digits=2), "%)")) + 
          ylab(paste0("PC3 (", round(con["comp 3",], digits=2), "%)"))

    }else{
      # For factor use factoextra
      grphs$PC1_PC2 <- factoextra::fviz_pca_ind(
        pca_res, habillage = col, geom=geom, repel=TRUE, addEllipses = FALSE, 
        axes=c(1,2), invisible="quali", pointsize=2, labelsize=3, 
        title="PC1_PC2 - Pheno")
      grphs$PC1_PC3  <- factoextra::fviz_pca_ind(
        pca_res, habillage = col, geom=geom, repel=TRUE,  addEllipses = FALSE, 
        axes=c(1,3), invisible="quali", pointsize=2, labelsize=3, 
        title="PC1_PC3 - Pheno",)
      grphs$PC2_PC3  <- factoextra::fviz_pca_ind(
        pca_res, habillage = col, geom=geom, repel=TRUE, addEllipses = FALSE, 
        axes=c(2,3), invisible="quali", pointsize=2, labelsize=3, 
        title="PC2_PC3 - Pheno")
      grphs <- lapply(grphs, function(x){
        x <- gginnards::move_layers(x, match_type="GeomPoint",
                                    position = "top")
        x <- gginnards::move_layers(x, match_type="GeomTextRepel", 
                                    position = "top")
        return(x)
        })
      }
    }
  
  if(type=="anno"){
    # For gradient/numeric use ggplot2
    if(is.numeric(col)){
      coord<- as.data.frame(pca_res$var$coord)
      con<- as.data.frame(pca_res$eig[,"percentage of variance"])
      grphs$PC1_PC2 <- ggplot2::ggplot() +
        geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_point(data=coord, aes(x=Dim.1, y=Dim.2, colour=col)) + 
        theme(legend.position="none") +
        scale_colour_gradient(low="#00FFE6", high="#FF0000") +
        theme_minimal() +
        xlab(paste0("PC1 (", round(con["comp 1",], digits=2), "%)")) + 
        ylab(paste0("PC2 (", round(con["comp 2",], digits=2), "%)")) 

      grphs$PC1_PC3 <- ggplot2::ggplot() +
        geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_point(data=coord, aes(x=Dim.1, y=Dim.3, colour=col)) + 
        theme(legend.position="none") +
        scale_colour_gradient(low="#00FFE6", high="#FF0000") +
        theme_minimal() +
        xlab(paste0("PC1 (", round(con["comp 1",], digits=2), "%)")) + 
        ylab(paste0("PC3 (", round(con["comp 3",], digits=2), "%)")) 

      grphs$PC2_PC3 <- ggplot2::ggplot() +
        geom_hline(yintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_vline(xintercept=0, linetype="dashed", color="black", size=0.5)+
        geom_point(data=coord, aes(x=Dim.2, y=Dim.3, colour=col)) + 
        theme(legend.position="none") +
        scale_colour_gradient(low="#00FFE6", high="#FF0000") +
        theme_minimal() +
        xlab(paste0("PCA2 (", round(con["comp 2",], digits=2), "%)")) + 
        ylab(paste0("PCA3 (", round(con["comp 3",], digits=2), "%)")) 

    }else{
        grphs$PC1_PC2 <- factoextra::fviz_pca_ind(
          pca_res, geom="point", habillage = col, repel=TRUE, 
          addEllipses = FALSE, axes=c(1,2), invisible="quali", 
          pointsize=1,  title="PC1_PC2 - Anno")
        grphs$PC1_PC3  <- factoextra::fviz_pca_ind(
          pca_res, geom="point", habillage = col, repel=TRUE,  
          addEllipses = FALSE, axes=c(1,3), invisible="quali", 
          pointsize=1,  title="PC1_PC3 - Anno")
        grphs$PC2_PC3  <- factoextra::fviz_pca_ind(
          pca_res, geom="point", habillage = col, repel=TRUE, 
          addEllipses = FALSE, axes=c(2,3), invisible="quali", 
          pointsize=1,  title="PC2_PC3 - Anno")
    }
  } 
  if(type=="both"){
    grphs$PC1_PC2 <- factoextra::fviz_pca_biplot(
      pca_res, habillage = col, geom=c("arrow", "text"), geom.var = "point", 
      col.var = "black", repel=TRUE, addEllipses = FALSE, axes=c(1,2), 
      invisible="quali", pointsize=0.5, labelsize=3, 
      title="PC1_PC2 - Biplot", ...)
    grphs$PC1_PC3 <- factoextra::fviz_pca_biplot(
      pca_res, habillage = col, geom=c("arrow", "text"), geom.var = "point", 
      col.var = "black", repel=TRUE, addEllipses = FALSE, axes=c(1,3), 
      invisible="quali", pointsize=0.5, labelsize=3, 
      title="PC1_PC3 - Biplot", ...)
    grphs$PC2_PC3 <- factoextra::fviz_pca_biplot(
      pca_res, habillage = col, geom=c("arrow", "text"), geom.var = "point", 
      col.var = "black", repel=TRUE, addEllipses = FALSE, axes=c(2,3), 
      invisible="quali", pointsize=0.5, labelsize=3, 
      title="PC2_PC3 - Biplot", ...)
    grphs <- lapply(grphs, function(x){
      x <- gginnards::move_layers(x, match_type="GeomPoint", 
                                  position = "bottom") 
      x <- gginnards::move_layers(x, match_type="GeomArrow", 
                                  position = "top")
      x <- gginnards::move_layers(x, match_type="GeomTextRepel", 
                                  position = "top")
      return(x)
      })
  }
  print(cowplot::plot_grid(plotlist=grphs, ncol=2, nrow=2))
  return(list(graphs=grphs, pca=pca_res))
  } #end graph=true
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.