R/plots.R

Defines functions artmsVolcanoPlot .artms_getPCAplots .artms_summarySE .artms_significantHits .artms_samplePeptideBarplot .artms_prettyPrintHeatmapLabels .artms_plotRatioLog2fc .artms_plotCorrelationConditions .artms_plotReproducibilityAbundance .artms_plotNumberProteinsImputedLog2fc .artms_plotNumberProteinsAbundance .artms_plotAbundanceBoxplots .artms_plotReproducibilityEvidence artmsPlotHeatmapQuant .artms_plotHeat artmsDataPlots

Documented in artmsDataPlots artmsPlotHeatmapQuant artmsVolcanoPlot

# artMS PLOT FUNCTIONS

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Individual Normalized abundance dot plots for every protein
#'
#' @description Protein abundance dot plots for each unique uniprot id. It can
#' take a long time
#' @param input_file (char) File path and name to the `-normalized.txt` output
#' file from MSstats
#' @param output_file (char) Output file (path) name (add the `.pdf` extension)
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (pdf) file with each individual protein abundance plot for each
#' conditions
#' @keywords abundance, dotplots, plot
#' @examples \dontrun{
#' artmsDataPlots(input_file = "results/ab-results-mss-normalized.txt",
#'                output_file = "results/ab-results-mss-normalized.pdf")
#' }
#' @export
artmsDataPlots <- function(input_file, 
                           output_file,
                           verbose = TRUE) {
                            
  if(any(missing(input_file) | 
         missing(output_file)))
    stop("Missed (one or many) required argument(s)
        Please, check the help of this function to find out more")
  
  
  data_mss = fread(input_file, integer64 = 'double')
  unique_subjects <- unique(data_mss$PROTEIN)
  condition_length <- length(unique(data_mss$GROUP))
  min_abu <- min(data_mss$ABUNDANCE, na.rm = TRUE)
  max_abu <- max(data_mss$ABUNDANCE, na.rm = TRUE)
  
  pdf(output_file, width = condition_length * 1.5, height = 3)
  if(verbose) message('>> PRINTING CONDITION PLOTS for every protein ')
  for (subject in unique_subjects) {
    subject_data <- data_mss[PROTEIN == subject, ]
    if(verbose) message(sprintf('%s ', subject))
    p <-
      ggplot(data = subject_data,
             aes(x = SUBJECT, y = ABUNDANCE, colour = FEATURE))
    p <- p + geom_point(size = 2, na.rm = TRUE) +
      facet_wrap(
        facets = ~ GROUP,
        drop = TRUE,
        scales = 'free_x',
        ncol = condition_length
      ) +
      ylim(min_abu, max_abu) +
      theme(axis.text.x = element_text(angle = -90, hjust = 1)) +
      guides(colour = FALSE) +
      xlab(NULL) +
      ggtitle(subject)
    print(p)
  }
  if(verbose) message("--- Done! ")
  garbarge <- dev.off()
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Heatmap of significant values
#
# @description heatmap plot to represent proteins with significant changes
# @param mss_F (data.frame) with the significant values (log2fc, pvalues)
# @param out_file (char) Name for the output
# @param labelOrder (vector) Vector with the particular order for the IDs
# (default, `NULL` no order)
# @param names (char) Type of ID used. Default is `Protein` (uniprot entry id).
# Soon will be possible to use 'Gene' name ids.
# @param cluster_cols (logical) Select whether to cluster the columns.
# Options: `T` or `F`. Default `T`.
# @param display (char) Value used to genarate the heatmaps. Options:
# - `log2FC` (default)
# - `adj.pvalue`
# - `pvalue`
# @param verbose (logical) `TRUE` (default) shows function messages
# @return A heatmap of significant values
# @keywords significant, heatmap
.artms_plotHeat <- function(mss_F,
                            out_file,
                            labelOrder = NULL,
                            names = 'Protein',
                            cluster_cols = FALSE,
                            display = c('log2FC', 'adj.pvalue', 'pvalue'),
                            verbose = TRUE) {

  heat_data <- data.frame(mss_F, names = names)
    
  ## create matrix from log2FC or p-value as user defined
  display <- match.arg(display)
  if (display == 'log2FC') {
    # Issues with extreme_val later if we have Inf/-Inf values.
    if (sum(is.infinite(heat_data$log2FC)) > 0) {
      idx <- is.infinite(heat_data$log2FC)
      heat_data$log2FC[idx] <- NA
    }
    ##LEGACY
    # heat_data_w = data.table::dcast(names ~ Label, data = heat_data, value.var = 'log2FC')
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = names, 
                         names_from = Label, 
                         values_from = log2FC,
                         values_fn = list(log2FC = length))
  } else if (display == 'adj.pvalue') {
    heat_data$adj.pvalue = -log10(heat_data$adj.pvalue + 10 ^ -16)
    ##LEGACY
    # heat_data_w = data.table::dcast(names ~ Label,
    #                     data = heat_data, value.var = 'adj.pvalue')
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = names, 
                         names_from = Label, 
                         values_from = adj.pvalue,
                         values_fn = list(adj.pvalue = length))
  } else if (display == 'pvalue') {
    heat_data$pvalue = -log10(heat_data$pvalue + 10 ^ -16)
    ##LEGACY
    # heat_data_w = data.table::dcast(names ~ Label, data = heat_data, value.var = 'pvalue')
    
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = names, 
                         names_from = Label, 
                         values_from = pvalue,
                         values_fn = list(pvalue = length))
  }
  
  heat_data_w <- as.data.frame(heat_data_w)
  
  ## try
  #gene_names = uniprot_to_gene_replace(uniprot_ac=heat_data_w$Protein)
  rownames(heat_data_w) = heat_data_w$names
  heat_data_w = heat_data_w[, -1]
  heat_data_w[is.na(heat_data_w)] = 0
  max_val = ceiling(max(heat_data_w))
  min_val = floor(min(heat_data_w))
  extreme_val = max(max_val, abs(min_val))
  if (extreme_val %% 2 != 0)
    extreme_val = extreme_val + 1
  bin_size = 2
  signed_bins = (extreme_val / bin_size)
  colors_neg = rev(colorRampPalette(brewer.pal("Blues", n = extreme_val /
                                                 bin_size))(signed_bins))
  colors_pos = colorRampPalette(
    brewer.pal("Reds", n = extreme_val / bin_size))(signed_bins)
  colors_tot = c(colors_neg, colors_pos)
  
  if (is.null(labelOrder)) {
    pheatmap(
      heat_data_w,
      scale = "none",
      cellheight = 10,
      cellwidth = 10,
      filename = out_file,
      color = colors_tot,
      breaks = seq(
        from = -extreme_val,
        to = extreme_val,
        by = bin_size
      ),
      cluster_cols = cluster_cols,
      fontfamily = "mono"
    )
  } else{
    heat_data_w <- heat_data_w[, labelOrder]
    pheatmap(
      heat_data_w,
      scale = "none",
      cellheight = 10,
      cellwidth = 10,
      filename = out_file,
      color = colors_tot,
      breaks = seq(
        from = -extreme_val,
        to = extreme_val,
        by = bin_size
      ),
      cluster_cols = cluster_cols,
      fontfamily = "mono"
    )
    if(verbose) message("--- Heatmap is out ")
  }
  
  return(heat_data_w)
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Outputs a heatmap of the MSStats results created using the log2fold
#' changes
#'
#' @description Heatmap of the Relative Quantifications (MSStats results)
#' @param input_file (char) MSstats `results.txt` file and location (or
#' data.frame of resuts)
#' @param output_file (char) Output file name (pdf format) and location.
#' Default:"quantifications_heatmap.pdf"
#' @param species (char). Specie name to be able to add the Gene name. To find
#' out more about the supported species check `?artmsMapUniprot2Entrez`
#' @param labels (vector) of uniprot ids if only specific labes would like to
#' be plotted. Default: all labels
#' @param cluster_cols (boolean) `True` or `False` to cluster columns.
#' Default: FALSE
#' @param lfc_lower (int) Lower limit for the log2fc. Default: -2
#' @param lfc_upper (int) Upper limit for the log2fc. Default: +2
#' @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)
#' @param FDR (int) Upper limit false discovery rate (or pvalue). Default: 0.05
#' @param display Metric to be displayed. Options:
#' - `log2fc` (default)
#' - `adj.pvalue`
#' - `pvalue`
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (pdf or ggplot2 object) heatmap of the MSStats results using the
#' selected metric
#' @keywords heatmap, log2fc
#' @examples
#' # Unfortunately, the example does not contain any significant hits
#' # Use for illustration purposes
#' artmsPlotHeatmapQuant(input_file = artms_data_ph_msstats_results,
#'                        species = "human",
#'                        output_file = NULL,
#'                        whatPvalue = "pvalue",
#'                        lfc_lower = -1,
#'                        lfc_upper = 1)
#' @export
artmsPlotHeatmapQuant <- function(input_file,
                                   output_file = "quantifications_heatmap.pdf",
                                   species,
                                   labels = '*',
                                   cluster_cols = FALSE,
                                   display = 'log2FC',
                                   lfc_lower = -2,
                                   lfc_upper = 2,
                                   whatPvalue = "adj.pvalue",
                                   FDR = 0.05,
                                   verbose = TRUE) {
  
  if(any(missing(input_file) | 
         missing(species)))
    stop("Missed (one or many) required argument(s)
        Please, check the help of this function to find out more")
  
  input <- .artms_checkIfFile(input_file)
  
  ## select data points  by LFC & FDR criterium in single condition and
  ## adding corresponding data points from the other conditions
  sign_hits <- .artms_significantHits(
    input,
    labels = labels,
    LFC = c(lfc_lower, lfc_upper),
    whatPvalue = whatPvalue,
    FDR = FDR
  )
  sign_hits <- sign_hits[complete.cases(sign_hits$log2FC), ]
  sign_hits <- sign_hits[is.finite(sign_hits$log2FC), ]
  if (dim(sign_hits)[1] == 0) {
    return(message("--- not enough significant hits!"))
  }
  
  sign_labels <- unique(sign_hits$Label)
  if(verbose) message(
    sprintf(
      ">> TOTAL NUMBER OF SELECTED HITS FOR PLOTS WITH LFC BETWEEN %s AND %s AT %s FDR:%s\n",
      lfc_lower,
      lfc_upper,
      FDR,
      nrow(sign_hits) / length(sign_labels)
    )
  )
  
  suppressMessages(
    sign_hits <- artmsAnnotationUniprot(
      x = sign_hits,
      columnid = "Protein",
      species = species
    )
  )
  
  ## REPRESENTING RESULTS AS HEATMAP
  ## plot heat map for all contrasts
  if (any(grepl('Gene', colnames(sign_hits)))) {
    heat_labels <- paste(sign_hits$Protein, sign_hits$Gene, sep = '_')
  } else{
    heat_labels <- sign_hits$Protein
  }
  
  heat_labels <- gsub('\\sNA$', '', heat_labels)
  
  # Old PlotHeat function:
  heat_data <- data.frame(sign_hits, heat_labels = heat_labels)
  
  ## create matrix from log2FC or p-value as user defined
  if (display == 'log2FC') {
    # Issues with extreme_val later if we have Inf/-Inf values.
    if (sum(is.infinite(heat_data$log2FC)) > 0) {
      idx <- is.infinite(heat_data$log2FC)
      heat_data$log2FC[idx] <- NA
    }
    
    ##LEGACY
    # heat_data_w <- data.table::dcast(heat_labels ~ Label, 
    #                                  data = heat_data, 
    #                                  value.var = 'log2FC')
    
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = heat_labels, 
                         names_from = Label, 
                         values_from = log2FC)
    
      
  } else if (display == 'adj.pvalue') {
    heat_data$adj.pvalue <- -log10(heat_data$adj.pvalue + 10 ^ -16)
    
    ##LEGACY
    # heat_data_w <- data.table::dcast(heat_labels ~ Label, 
    #                                  data = heat_data, 
    #                                  value.var = 'adj.pvalue')
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = heat_labels, 
                         names_from = Label, 
                         values_from = adj.pvalue)
      
  } else if (display == 'pvalue') {
    heat_data$pvalue <- -log10(heat_data$pvalue + 10 ^ -16)
    ##LEGACY
    # heat_data_w <- data.table::dcast(heat_labels ~ Label, 
    #                                  data = heat_data, 
    #                                  value.var = 'pvalue')
    heat_data_w <- heat_data %>% 
      tidyr::pivot_wider(id_cols = heat_labels, 
                         names_from = Label, 
                         values_from = pvalue)
  }
  
  #gene_names <- uniprot_to_gene_replace(uniprot_ac=heat_data_w$Protein)
  heat_data_w <- as.data.frame(heat_data_w)
  rownames(heat_data_w) <- heat_data_w$heat_labels
  heat_data_w <- heat_data_w[, -1]
  heat_data_w[is.na(heat_data_w)] = 0
  max_val <- ceiling(max(heat_data_w))
  min_val <- floor(min(heat_data_w))
  extreme_val <- max(max_val, abs(min_val))
  if (extreme_val %% 2 != 0)
    extreme_val = extreme_val + 1
  bin_size = 2
  signed_bins <- (extreme_val / bin_size)
  colors_neg <- rev(colorRampPalette(RColorBrewer::brewer.pal("Blues", n = extreme_val /
                                                    bin_size))(signed_bins))
  colors_pos <- colorRampPalette(RColorBrewer::brewer.pal("Reds", 
                                    n = extreme_val / bin_size))(signed_bins)
  colors_tot <- c(colors_neg, colors_pos)
  
  if (!is.null(output_file)) {
    pheatmap(
      heat_data_w,
      scale = "none",
      cellheight = 10,
      cellwidth = 10,
      filename = output_file,
      color = colors_tot,
      breaks = seq(
        from = -extreme_val,
        to = extreme_val,
        by = bin_size
      ),
      cluster_cols = cluster_cols,
      fontfamily = "mono"
    )
    if(verbose) message("--- Heatmap done ")
  } else{
    pheatmap(
      heat_data_w,
      scale = "none",
      cellheight = 10,
      cellwidth = 10,
      color = colors_tot,
      breaks = seq(
        from = -extreme_val,
        to = extreme_val,
        by = bin_size
      ),
      cluster_cols = cluster_cols,
      fontfamily = "mono"
    )
  }
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate reproducibility plots based on raw intentities
# (from the evidence file)
#
# @description Generate reproducibility plots based on raw intentities
# (log tranformed) from the evidence file
# @param data (data.frame) clean processed evidence
# @param verbose (logical) `TRUE` (default) shows function messages
# @return (pdf) A reproducibility plot based on evidence values
# @keywords internal, plot, qc, quality, control
.artms_plotReproducibilityEvidence <- function(data,
                                               verbose = TRUE) {
  
  Feature = NULL
  
  data <- data[c('Feature',
                 'Proteins',
                 'Intensity',
                 'Condition',
                 'BioReplicate',
                 'Run')]
    
  condi <- unique(data$Condition)
  
  # Progress bar
  if(verbose) pb <- txtProgressBar(min = 0,
                       max = length(condi),
                       style = 3)
  
  for ( i in seq_len(length(condi)) ) {
    eCondition <- condi[i]
    # Progress bar
    if(verbose) setTxtProgressBar(pb, i)
    
    conditionOne <- data[which(data$Condition == eCondition), ]
    
    # FIRST CHECK FOR TECHNICAL REPLICAS
    bioreplicasAll <- unique(conditionOne$BioReplicate)
    
    for (eBioreplica in bioreplicasAll) {
      # message('\tChecking for technical replicas in ',eBioreplica, " ")
      biorepli <- conditionOne[conditionOne$BioReplicate == eBioreplica, ]
      here <- unique(biorepli$Run)
      
      if (length(here) > 1) {
        #Limit to 2 technical replicas: (length(here) > 1 & length(here) == 2)
        # We are expecting no more than 2 technical replicas. 
        # If there is more... it is worthy to double check
        # message('\t\t>>Reproducibility for technical replicas of',eBioreplica,":")
        #Need to change the RUN number to letters (TR: TECHNICAL REPLICA)
        biorepli$TR <- biorepli$Run
        biorepli$TR[biorepli$TR == here[1]] <- 'tr1'
        biorepli$TR[biorepli$TR == here[2]] <- 'tr2'
        
        # Let's select unique features per TECHNICAL REPLICAS, 
        # and sum them up (the same feature might have been many differnt times)
        biorepliaggregated <- aggregate(Intensity ~ Feature+Proteins+Condition+BioReplicate+Run+TR,
                                        data = biorepli,
                                        FUN = sum)
          
          
        biorepliaggregated$Intensity <- log2(biorepliaggregated$Intensity)
        ##LEGACY
        # bdc <- data.table::dcast(data = biorepliaggregated,
        #                          Feature + Proteins ~ TR,
        #                          value.var = 'Intensity')
        
        bdc <- biorepliaggregated %>%
          tidyr::pivot_wider(id_cols = c(Feature, Proteins), 
                             names_from = TR, 
                             values_from = Intensity)
        bdc <- as.data.frame(bdc)
          
        # Get the number of proteins
        np <- dim(bdc)[1]
        corr_coef <- round(cor(bdc$tr1, bdc$tr2, use = "pairwise.complete.obs"), digits = 2)
        # message("r:\t",corr_coef," ")
        p1 <- ggplot(bdc, aes(x = tr1, y = tr2))
        p1 <- p1 + geom_point(na.rm = TRUE) + geom_rug() + 
          geom_density_2d(colour = 'lightgreen')
        p1 <- p1 + geom_smooth(colour = "green",
                               fill = "lightblue",
                               method = 'lm',
                               formula = y ~ x,
                               na.rm = TRUE)
        p1 <- p1 + theme_light()
        p1 <- p1 + labs(title = paste(
          "Reproducibility between Technical Replicas\nBioReplica:\n",
          eBioreplica,
          "\n(n =  ",
          np,
          ", r = ",
          corr_coef,
          ")"
        )
        )
        
        print(p1)
        
      } 
      # else if (length(here) == 1) {
      #   # message("\t\tOnly one technical replica ")
      # } else{
      #   stop("More than 2 technical replicates found. This is very unusual.
      #   Please, revise it and if correct contact artms developers")
      # }
    } # Checking the reproducibility between Technical Replicas
    
    
    # NOW BETWEEN BIOREPLICAS
    # Before comparing the different biological replicas, 
    # aggregate the technical replicas
    # First choose the maximum for the technical replicas as before, 
    # but first check whether there are more than one
    if (length(here) > 1) {
      conditionOne <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate + Run,
                                data = conditionOne,
                                FUN = sum)
      b <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate,
                     data = conditionOne,
                     FUN = mean)
    } else{
      b <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate,
                     data = conditionOne,
                     FUN = sum)
    }
    
    b$Intensity <- log2(b$Intensity)
    blist <- unique(b$BioReplicate)
    if (length(blist) > 1) {
      
      # We need at least TWO BIOLOGICAL REPLICAS
      to <- length(blist) - 1
      
      for (i in seq_len(to)) {
        j <- i + 1
        for ( k in j:length(blist) ) {
          br1 <- blist[i]
          br2 <- blist[k]
          
          # message("\tChecking reproducibility between ",br1, "and ",br2 ,"\t")
          ##LEGACY
          # bcfinal <- data.table::dcast(data = b,
          #                              Feature + Proteins ~ BioReplicate,
          #                              value.var = 'Intensity')

          bcfinal <- b %>% tidyr::pivot_wider(id_cols = c(Feature, Proteins), 
                                              names_from = BioReplicate, 
                                              values_from = Intensity)
            
            
          bcfinal <- as.data.frame(bcfinal)
          bcfinal <- bcfinal[complete.cases(bcfinal),]
          
          # Let's check the total number of peptides here...
          checkTotalNumber <- subset(bcfinal, select = c(br1, br2))
          npt <- dim(checkTotalNumber)[1]
          
          corr_coef <- round(cor(bcfinal[[br1]], bcfinal[[br2]], use = "pairwise.complete.obs"), digits = 2)
          # message("r:\t",corr_coef," ")
          
          colnames(bcfinal) <- c("Feature", "Proteins", "br1", "br2")
          
          p2 <- ggplot(bcfinal, aes(x = br1, y = br2 ))
          p2 <- p2 + geom_point(na.rm = TRUE) + geom_rug() + geom_density_2d(colour = 'red')
          p2 <- p2 + geom_smooth(colour = "red",
                                 fill = "lightgreen",
                                 method = 'lm',
                                 formula = y ~ x,
                                 na.rm = TRUE)
          p2 <- p2 + theme_light()
          p2 <- p2 + labs(title = paste("Peptide Reproducibility between Bioreplicas\n (condition:",
                                        eCondition,
                                        ")",
                                        br1,
                                        "vs",
                                        br2,
                                        "\n(n =",
                                        npt,
                                        " r = ",
                                        corr_coef,
                                        ")"))
          p2 <- p2 + labs(x = "br1")
          p2 <- p2 + labs(y = "br2")
          print(p2)
        } #end for
      } #end for
    } else{
      message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
    }
  } # all the conditions
  # Close Progress bar
  if(verbose) close(pb)
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot abundance boxplots
#
# @description Plot abundance boxplots
# @param data (data.frame) processed modelqc
# @return Abundacen boxplot
# @keywords internal, plot, abundance
.artms_plotAbundanceBoxplots <- function(df) {
  p1 <- ggplot2::ggplot(df, 
                        aes(x = SUBJECT, 
                            y = ABUNDANCE, 
                            fill = ABUNDANCE))
  p1 <- p1 + geom_boxplot(aes(fill = SUBJECT),
                          na.rm = TRUE)
  p1 <- p1 + theme_linedraw()
  p1 <-
    p1 + theme(
      axis.text.x = element_text(
        angle = 90,
        hjust = 1,
        vjust = 0.5
      ),
      legend.position = "none"
    )
  p1 <- p1 + labs(x = "BIOREPLICATES")
  p1 <- p1 + ggtitle("Relative Abundance BioReplicates")
  print(p1)
  
  p2 <-
    ggplot2::ggplot(df, aes(
      x = as.factor(GROUP),
      y = ABUNDANCE,
      fill = ABUNDANCE
    ))
  p2 <- p2 + geom_boxplot(aes(fill = GROUP),
                          na.rm = TRUE)
  p2 <- p2 + theme_linedraw()
  p2 <-
    p2 + theme(
      axis.text.x = element_text(
        angle = 90,
        hjust = 1,
        vjust = 0.5
      ),
      legend.position = "none"
    )
  p2 <- p2 + labs(x = "CONDITIONS")
  p2 <- p2 + ggtitle("Relative Abundance Conditions")
  print(p2)
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Total Number of unique proteins based on abundance data
#
# @description Total Number of unique proteins per biological replicate and
# conditions
# @param df (data.frame) modelqc
# @return (pdf) Barplots with the number of proteins per br / condition
# @keywords internal, plots, abundance, counts
.artms_plotNumberProteinsAbundance <- function(df) {
  x <- df[,c('PROTEIN', 'SUBJECT')]
  y <- unique(x)
  names(y)[grep('SUBJECT', names(y))] <- 'BioReplicate'
  z <-
    ggplot2::ggplot(y, aes(x = BioReplicate, fill = BioReplicate))
  z <- z + geom_bar(stat = "count", na.rm = TRUE)
  z <- z + theme_linedraw()
  z <- z + theme(axis.text.x = element_text(angle = 90,
                                            hjust = 1,
                                            vjust = 0.5))
  z <- z + geom_text(stat = 'count',
                     aes(label = ..count..),
                     vjust = -0.5,
                     size = 2.7)
  z <- z + ggtitle("Unique Proteins in BioReplicates")
  print(z)
  
  a <- df[,c('PROTEIN', 'GROUP')]
  b <- unique(a)
  names(b)[grep('GROUP', names(b))] <- 'Condition'
  c <- ggplot2::ggplot(b, aes(x = Condition, fill = Condition))
  c <- c + geom_bar(stat = "count", na.rm = TRUE)
  c <- c + theme_linedraw()
  c <- c + theme(axis.text.x = element_text(angle = 90,
                                            hjust = 1,
                                            vjust = 0.5))
  c <- c + geom_text(stat = 'count',
                     aes(label = ..count..),
                     vjust = -0.5,
                     size = 2.7)

  c <- c + ggtitle("Unique Proteins in Conditions")
  print(c)
}

# 
# @title Plot the total number of quantified proteins in each condition
#
# @description
# @keys internal, plot, counts
# @param x (data.frame) Data frame of imputed log2fc
.artms_plotNumberProteinsImputedLog2fc <- function(x) {
  x <- x[c('Protein', 'Comparison')]
  y <- unique(x)
  z <- ggplot(y, aes(x = Comparison, fill = Comparison))
  z <- z + geom_bar(stat = "count",
                    na.rm = TRUE)
  z <- z + theme_linedraw()
  z <- z + theme(axis.text.x = element_text(angle = 90,
                                            hjust = 1,
                                            vjust = 0.5))
  
  
  z <- z + geom_text(stat = 'count',
                     aes(label = ..count..),
                     vjust = -0.5,
                     size = 2.7)
  
  z <- z + ggtitle("Unique Proteins in Comparisons")
  print(z)
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate reproducibility plots based on abundance data
# (normalized intensities from MSstats modelqc)
#
# @description Generate reproducibility plots based on abundance data
# (normalized intensities from MSstats modelqc)
# @param data (data.frame) Protein abundance (modelqc)
# @return Reproducibility plots based on abundance data 
# (normalized intensities)
# @keywords plot, reproducibility, abundance
.artms_plotReproducibilityAbundance <- function(x,
                                                verbose = verbose) {
  condi <- unique(x$GROUP)
  
  # Progress bar
  if(verbose) pb <- txtProgressBar(min = 0,
                       max = length(condi),
                       style = 3)
  
  for ( i in seq_len(length(condi)) ) {
    eCondition <- condi[i]
    
    # Progress bar
    if(verbose) setTxtProgressBar(pb, i)
    
    # message(" ################## CONDITION: ",eCondition," ############### ")
    # message("TECHNICAL REPLICAS --------------------------- ")
    # message("- ", eCondition," ")
    
    conditionOne <- x[which(x$GROUP == eCondition), ]
    
    # FIRST CHECK FOR TECHNICAL REPLICAS
    bioreplicasAll <- unique(conditionOne$SUBJECT)
    # plot_tr = list()
    for (eBioreplica in bioreplicasAll) {
      # message('\tChecking for technical replicas in ',eBioreplica, " ")
      biorepli <- conditionOne[conditionOne$SUBJECT == eBioreplica, ]
      here <- unique(biorepli$RUN)
      
      if (length(here) > 1) {
        # Check whether we have more than 2 technical replicas and let 
        # the user know:
        if (length(here) > 2) {
          message(
            "(-)----- WARNING: More than 2 technical replicas! 
            make sure that this is right  "
          )
        }
        # We are expecting no more than 2 technical replicas. 
        # If there is more... it is worthy to double check
        # message('\t\t>>Plotting Reproducibility between technical replicas ',
        # eBioreplica," ")
        #Need to change the RUN number to letters (TR: TECHNICAL REPLICA)
        biorepli$TR <- biorepli$RUN
        biorepli$TR[biorepli$TR == here[1]] <- 'tr1'
        biorepli$TR[biorepli$TR == here[2]] <- 'tr2'
        
        ##LEGACY
        # bdc <- data.table::dcast(data = biorepli, PROTEIN ~ TR, 
        #                          value.var = 'ABUNDANCE')
        bdc <- biorepli %>% 
          tidyr::pivot_wider(id_cols = PROTEIN, 
                             names_from = TR, 
                             values_from = ABUNDANCE)
        bdc <- as.data.frame(bdc)
        
        bdc <- bdc[complete.cases(bdc), ]
        # Get the number of proteins
        np <- length(unique(bdc$PROTEIN))
        corr_coef <- round(cor(bdc$tr1, bdc$tr2), digits = 2)
        p1 <- ggplot2::ggplot(bdc, aes(x = tr1, y = tr2))
        p1 <- p1 + geom_point(na.rm = TRUE)
        p1 <- p1 + geom_smooth(colour = "green",
                               fill = "lightblue",
                               method = 'lm',
                               formula = y ~ x,
                               na.rm = TRUE)
        p1 <- p1 + theme_light()
        p1 <- p1 + labs(
          title = paste(
            "Reproducibility between Technical Replicas\nBioReplica:\n",
            eBioreplica,
            "\n(n = ",
            np,
            ", r = ",
            corr_coef,
            ")"
          )
        )
        print(p1)
        # plot_tr[[eBioreplica]] <- p1
      } else if (length(here) < 1) {
        stop("More than 2 technical replicates found. This is very unusual.
             Please, revise it and contact artMS developers if right")
      }
    } # Checking the reproducibility between Technical Replicas
    
    
    # NOW BETWEEN BIOREPLICAS
    # Before comparing the different biological replicas, 
    # aggregate on the technical replicas
    # message(" BIOLOGICAL REPLICAS --------------------------- ")
    b <- aggregate(
      ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
      data = conditionOne,
      FUN = mean
    ) 
    
    # Remove the dash
    b$SUBJECT <- gsub("-", "_", b$SUBJECT)
      
    blist <- unique(b$SUBJECT)
    if ( length(blist) > 1 ) {
      # We need at least TWO BIOLOGICAL REPLICAS
      to <- length(blist) - 1
      for (i in seq_len(to) ) {
        j <- i + 1
        for ( k in j:length(blist) ) {
          br1 <- blist[i]
          br2 <- blist[k]
          # message("\tChecking reproducibility between ",br1, "and ",br2 ," ")
          
          ##LEGACY
          # bc <- data.table::dcast(data = b,
          #                        PROTEIN ~ SUBJECT,
          #                        value.var = 'ABUNDANCE')
          
          bc <- b %>% 
            tidyr::pivot_wider(id_cols = PROTEIN, 
                               names_from = SUBJECT, 
                               values_from = ABUNDANCE)
            
          bc <- bc[complete.cases(bc), ]
          
          npt <- length(unique(bc$PROTEIN))
          
          corr_coef <- round(cor(bc[[br1]], bc[[br2]]), digits = 2)
          
          p2 <- ggplot2::ggplot(bc, aes_string(x = paste(br1) , y = paste(br2) ))
          p2 <- p2 + geom_point(na.rm = TRUE)
          p2 <- p2 + geom_smooth(colour = "red",
                                 fill = "lightgreen",
                                 method = 'lm',
                                 formula = y ~ x,
                                 na.rm = TRUE)
          p2 <- p2 + theme_light()
          p2 <- p2 + labs(
            title = paste(
              "Reproducibility between Bioreplicas\n(condition:",
              eCondition, ") ", br1, "vs", br2, "\n(n =", npt, " r = ", corr_coef, ")"
            )
          )
          p2 <- p2 + labs(x = br1)
          p2 <- p2 + labs(y = br2)
          print(p2)
        }
      }
      if(verbose) message(" ")
    } else{
      if(verbose) message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
    }
  } # all the conditions
  
  if(verbose) close(pb)
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot correlation between conditions
#
# @description Plot correlation between conditions
# @param x (data.frame) of Protein Abundance (MSstats modelqc)
# @param numberBiologicalReplicas (int) Number of biological replicates
# @return (ggplot.object) A correlation plot between conditions
# @keywords internal, plot, correlation
.artms_plotCorrelationConditions <- function(x, numberBiologicalReplicas) {
  # Before jumping to merging biological replicas:
  # Technical replicas: aggregate on the technical replicas
  b <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
                 data = x,
                 FUN = mean)
    
  # Aggregate now the CONDITIONS on the biological replicas:
  
  # One way to do this would be to be very stringent, 
  # requiring to find data in all biological replicas:
  # allBiologicalReplicas <- function(x){
  # ifelse(sum(!is.na(x)) == numberBiologicalReplicas, 
  # mean(x, na.rm = TRUE), NA)}
  # datadc <- data.table::dcast(data=b, 
  # PROTEIN~GROUP, value.var = 'ABUNDANCE', 
  # fun.aggregate = allBiologicalReplicas, fill = 0)
  
  # Or a most relaxed way:
  
  ##LEGACY
  # datadc <- data.table::dcast(data = b,
  #                             PROTEIN ~ GROUP,
  #                             value.var = 'ABUNDANCE',
  #                             fun.aggregate = mean) 
  
  datadc <- b %>% 
    tidyr::pivot_wider(id_cols = PROTEIN,
                       names_from = GROUP, 
                       values_from = ABUNDANCE, 
                       values_fn = list(ABUNDANCE = mean))
    
  
  # before <- dim(datadc)[1]
  # l <- dim(datadc)[2]
  # datadc <- datadc[apply(datadc[c(2:l)],1,function(z) !any(z==0)),]
  # evenafter <- dim(datadc)[1]
  # datadc <- datadc[complete.cases(datadc),]
  # after <- dim(datadc)[1]
  # message("Total proteins before: ", 
  # before, " Removing the 0s: ",evenafter, 
  # " Total proteins (only complete cases): ", after, "  ")
  
  blist <- unique(b$GROUP)
  if (length(blist) > 1) {
    # We need at least TWO CONDITIONS
    to <- length(blist) - 1
    for (i in seq_len(to) ) {
      j <- i + 1
      for (k in j:length(blist)) {
        br1 <- blist[i]
        br2 <- blist[k]
        # message("\t",br1,"-",br2,":")
        
        npt <- length(unique(datadc$PROTEIN))
        
        corr_coef <- round(cor(datadc[[br1]], 
                               datadc[[br2]], 
                               use = "complete.obs"), digits = 2)
        # cat ("r:",corr_coef,"\n")
        
        p2 <- ggplot2::ggplot(datadc, aes_string(x = paste(br1), y = paste(br2)))
        p2 <- p2 + geom_point(na.rm = TRUE)
        p2 <- p2 + geom_smooth(colour = "blue",
                               fill = "lightblue",
                               method = 'lm',
                               formula = y ~ x, 
                               na.rm = TRUE)
        p2 <- p2 + theme_light()
        p2 <- p2 + labs(title = paste("CORRELATION between CONDITIONS:\n",
                                      br1,
                                      "and",
                                      br2,
                                      "\n(n = ",
                                      npt,
                                      " r = ",
                                      corr_coef,
                                      ")"))
        p2 <- p2 + labs(x = br1)
        p2 <- p2 + labs(y = br2)
        print(p2)
      } # FOR loop
    } # For loop
    # message(" ")
  } else{
    message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
  }
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot correlation between quantifications different quantified
# comparisons
#
# @description Plot correlation between all quantifications, i.e., different
# quantified comparisons
# @param datai (data.frame) Processed MSstats results
# @param verbose (logical) `TRUE` (default) shows function messages
# @return (ggplot.object) Plot correlation between quantifications and r values
# @keywords internal, plot, correlation, log2fc
.artms_plotRatioLog2fc <- function(datai,
                                   verbose = TRUE) {
  ##LEGACY
  # datadc <- data.table::dcast(data = datai, Protein ~ Label, value.var = 'log2FC')
  
  datadc <- datai %>% 
    tidyr::pivot_wider(id_cols = Protein,
                       names_from = Label, 
                       values_from = log2FC)
  datadc <- as.data.frame(datadc)
  
  before <- dim(datadc)[1]
  l <- dim(datadc)[2]
  datadc <- do.call(data.frame, lapply(datadc, function(x) replace(x, is.infinite(x), NA)))
  datadc <- datadc[complete.cases(datadc), ]
  after <- dim(datadc)[1]
  if(verbose) message("---Total unique identifiers before: ",
    before,
    "\n---Total unique identifiers (only complete cases): ",
    after)
  
  blist <- unique(datai$Label)
  blist <- gsub("-", ".", blist)
  
  if (length(blist) > 1) {
    # We need at least TWO CONDITIONS
    to <- length(blist) - 1
    # Progress bar
    if(verbose) pb <- txtProgressBar(min = 0,
                         max = length(blist) - 1,
                         style = 3)
    for (i in seq_len(to)) {
      if(verbose) setTxtProgressBar(pb, i)
      j <- i + 1
      for (k in j:length(blist)) {
        br1 <- blist[i]
        br2 <- blist[k]
        # message("\tChecking relation between conditions ",br1, "and ",br2 ,":")
        
        npt <- length(unique(datadc$Protein))
        
        corr_coef <- round(cor(datadc[[br1]], datadc[[br2]]), digits = 2)
        # cat ("r: ",corr_coef," ")
        
        p3 <- ggplot2::ggplot(datadc, aes_string(x = paste(br1), y = paste(br2)))
        p3 <- p3 + geom_point(na.rm = TRUE) + geom_rug() + geom_density_2d()
        p3 <- p3 + geom_smooth(colour = "red",
                               fill = "lightblue",
                               method = 'lm',
                               formula = y ~ x,
                               na.rm = TRUE)
        p3 <- p3 + theme_light()
        p3 <- p3 + labs(title = paste0("log2fc(",
                                       br1,
                                       ") vs log2fc(",
                                       br2,
                                       ") \n(n = ",
                                       npt,
                                       " r = ",
                                       corr_coef,
                                       ")"))
        p3 <- p3 + labs(x = br1)
        p3 <- p3 + labs(y = br2)
        print(p3)
      } # FOR loop
    } # For loop
    if(verbose) close(pb)
  } else{
    message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
  }
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Pretty Labels for Heatmaps
#
# @description Generates pretty labels for the heatmaps.
# @param uniprot_acs (char) Uniprot accession id
# @param uniprot_ids (char) Uniprot entry id
# @param gene_names (car) Gene symbol
# @return Pretty labels for a heatmap
# @keywords internal, plots, pretty
.artms_prettyPrintHeatmapLabels <- function(uniprot_acs, uniprot_ids, gene_names) {
  result = paste(uniprot_acs, uniprot_ids, gene_names, sep = ' ')
  return(result)
}


# @title Correlation heatmaps of all the individual features
# @description Correlation heatmap using intensity values across all the
# conditions
# @param data_w (data.frame) resulting from the `.artms_castMaxQToWidePTM`
# function
# @param keys (data.frame) of the keys
# @param config (yaml.object) Configuration object (yaml loaded)
# @return (pdf) A correlation heatmap (suffix `-heatmap.pdf`)
# @keywords internal, heatmap, intensity, comparisons
.artms_sampleCorrelationHeatmap <- function (data_w, keys, config) {
  # data_w <- data.table::as.data.table(data_w2)
  # mat = log2(data_w[, 4:ncol(data_w), with = TRUE])
  # mat[is.na(mat)] = 0
  # mat_cor = cor(mat, method = 'pearson', use = 'everything')
  
  data_w <- log2(data_w[,4:ncol(data_w)])
  data_w[is.na(data_w)] <- 0
  mat_cor <- cor(data_w, method = 'pearson', use = 'everything')
  
  ## we want to make informarive row names so order by 
  ## RawFile because that's how data_w is ordered
  keys <- as.data.frame(keys)
  ordered_keys <- keys[with(keys, order(RawFile)), ] 
  mat_names <- paste(ordered_keys$Condition,
                    ordered_keys$BioReplicate,
                    ordered_keys$Run)
  colnames(mat_cor) <- mat_names
  rownames(mat_cor) <- mat_names
  colors_pos <- colorRampPalette(RColorBrewer::brewer.pal("Blues", n = 5))(10)
  colors_neg <- rev(colorRampPalette(RColorBrewer::brewer.pal("Reds", n = 5))(10))
  colors_tot <- c(colors_neg, colors_pos)
  pheatmap(
    mat = mat_cor,
    cellwidth = 10,
    cellheight = 10,
    scale = 'none',
    filename = gsub('.txt', '-heatmap.pdf', config$files$output),
    color = colors_tot,
    breaks = seq(from = -1, to = 1, by = .1),
    fontfamily = "mono"
  )
}



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Barplot of peptide counts per biological replicate
#
# @description Total number of unique peptide identified per biological
# replicate
# @param data_f (char) Evidence file (same structure as the original)
# @param config (yaml.object) Configuration object
# @return (pdf) Barplot of peptide counts
# @keywords barplot, counts, peptides
.artms_samplePeptideBarplot <- function(data_f, config) {
  # set up data into ggplot compatible format
  data_f <- data.table(data_f,
                       labels = paste(data_f$RawFile, 
                                      data_f$Condition, 
                                      data_f$BioReplicate))
  
  data_f <- data_f[with(data_f, order(labels, decreasing = TRUE)), ]
  
  # plot the peptide counts for all the samples TOGETHER
  p <- ggplot(data = data_f, aes(x = labels))
  p <- p + geom_bar(na.rm = TRUE) + theme(axis.text.x = element_text(angle = 90,
                                                         hjust = 1,
                                                         family = 'mono'
  )) + ggtitle('Unique peptides per run\n after filtering') + coord_flip()
  
  ggsave(filename = gsub('.txt', '-peptidecounts.pdf', config$files$output),
         plot = p,
         width = 8,
         height = 10)
  
  
  w <- 10
  h <- ceiling((7 / 5 + 2) * ceiling(length(unique(data_f$Condition)) / 5))
  # plot the peptide counts for all the samples PER BAIT
  p <- ggplot(data = data_f, aes(x = as.factor(BioReplicate)))
  p <- p + geom_bar(na.rm = TRUE) + theme(axis.text.x = element_text(angle = 90,
                                                         hjust = 1,
                                                         family = 'mono' )) + 
    ggtitle('Unique peptides per run\n after filtering') + 
    facet_wrap( ~ Condition, scales = 'free', ncol = 5)  + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  ggsave(filename = gsub('.txt', '-peptidecounts-perBait.pdf', config$files$output),
         plot = p,
         width = w,
         height = h)
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Select significant hits
#
# @description Filtered data.frame with significant values (log2fc > 2 |
# log2fc < -2; adj.pvalue < 0.05) from the MSstats results
# @param mss_results (data.frame) of MSstats results
# @param labels (vector) of selected labels. Default: all (`*`)
# @param LFC (vector, int) with the negative and positive threshold. Default:
# c(-2, 2)
# @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)?
# @param FDR (int) false discovery rate (adj.pvalue) threshold. Default: 0.05
# @return (data.frame) only with significant hits
# @keywords internal, significant, selections
.artms_significantHits <- function(mss_results,
                                   labels = '*',
                                   LFC = c(-2, 2),
                                   whatPvalue = 'adj.pvalue',
                                   FDR = 0.05) {
  selected_results <- mss_results[grep(labels, mss_results$Label),]
  
  if (whatPvalue == "adj.pvalue") {
    significant_proteins <-
      selected_results[(
        !is.na(selected_results$log2FC) &
          selected_results$adj.pvalue <= FDR &
          (
            selected_results$log2FC >= LFC[2] |
              selected_results$log2FC <= LFC[1]
          )
      ), 'Protein']
  } else if (whatPvalue == "pvalue") {
    significant_proteins <-
      selected_results[(
        !is.na(selected_results$log2FC) &
          selected_results$pvalue <= FDR &
          (
            selected_results$log2FC >= LFC[2] |
              selected_results$log2FC <= LFC[1]
          )
      ), 'Protein']
  } else{
    stop("The whatPvalue argument is wrong. Valid options: pvalue or adj.pvalue")
  }
  
  significant_results <-
    selected_results[selected_results$Protein %in% significant_proteins,]
  return(significant_results)
}



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Summary stats for plots
#
# @description Gives count, mean, standard deviation, standard error of the mean, 
# and confidence interval (default 95%). 
# Borrowed from Cookbook for R 
# (http://www.cookbook-r.com/Manipulating_data/Summarizing_data/)
#
# @param data: a data frame.
# @param measurevar: the name of a column that contains the variable to be summariezed
# @param groupvars: a vector containing names of columns that contain grouping variables
# @param na.rm: a boolean that indicates whether to ignore NA's
# @param conf.interval: the percent range of the confidence interval (default is 95%)
# @return data.frame with summary statistics
# @keywords internal, plot, stats
.artms_summarySE <- function(data=NULL, 
                             measurevar, 
                             groupvars=NULL, 
                             na.rm=FALSE,
                             conf.interval=.95, 
                             .drop=TRUE) {
  
  # New version of length which can handle NA's: if na.rm==T, don't count them
  length2 <- function (x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }
  
  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- plyr::ddply(data, groupvars, .drop=.drop,
                       .fun = function(xx, col) {
                         c(N    = length2(xx[[col]], na.rm=na.rm),
                           mean = mean   (xx[[col]], na.rm=na.rm),
                           sd   = sd     (xx[[col]], na.rm=na.rm)
                         )
                       },
                       measurevar
  )
  
  # Rename the "mean" column    
  datac <- rename(datac, c("mean" = measurevar))
  
  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean
  
  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval: 
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult
  
  return(datac)
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate PCA plots based on abundance data
#
# @description Generate PCA plots based on abundance data
# @param x Data.frame output from `artms_loadModelQCstrict`
# @param filename Prefix to generate output names (WITH NO EXTENSION)
# @param allConditions Conditions selected to generate the plots
# @return PCA plots based on abundance data (pdf format)
# @keywords internal, plot, pca
.artms_getPCAplots <- function(x, filename, allConditions) {
  # PRINCIPAL COMPONENT ANALYSIS
  # Using the following packages:
  # FactoMineR, factoextra, corrplot, PerformanceAnalytics
  
  # Remove NA
  x <- as.data.frame(x)
  nogenename2 <- x[duplicated(x$Gene), ]
  if (dim(nogenename2)[1] > 0) {
    # message("\t---Removing proteins without a gene name: ")
    x <- x[complete.cases(x), ]
  }
  
  x <- x[!duplicated(x[, c("Gene")]),]
  rownames(x) <- x$Gene
  df <- x[, allConditions]
  
  # Correlation matrix
  df.cor.matrix <- round(cor(df, use = "complete.obs"), 2)
  
  # Correlation plots:
  out.correlation <- paste0(filename, "-correlations.pdf")
  pdf(out.correlation)
  corrplot::corrplot(
    df.cor.matrix,
    type = "upper",
    tl.col = "black",
    tl.srt = 45,
    diag = FALSE,
    addCoef.col = TRUE
  )
  PerformanceAnalytics::chart.Correlation(df,
                                          histogram = TRUE,
                                          pch = 19,
                                          main = "Correlation between Conditions")
  garbage <- dev.off()
  
  # PCA 1----
  
  res.pca <- FactoMineR::PCA(df,
                             scale.unit = TRUE,
                             ncp = 4,
                             graph = FALSE)
    
  eigenvalues <- res.pca$eig
  
  out.pca01 <- paste0(filename, "-pca01.pdf")
  pdf(out.pca01)
  # par(mfrow = c(1, 1))
  here1 <- plot(res.pca, choix = "var", new.plot = FALSE)
  print(here1)
  # This is equivalent to
  # PCA(df, scale.unit = TRUE, ncp = 4, graph = TRUE)
  garbage <- dev.off()
  
  # PCA 2----
  out.pca02 <- paste0(filename, "-pca02.pdf")
  pdf(out.pca02)
  barplot(
    eigenvalues[, 2],
    names.arg = seq_len(nrow(eigenvalues)),
    main = "Variances",
    xlab = "Principal Components",
    ylab = "Percentage of variances",
    col = "steelblue"
  )
  lines(
    x = seq_len(nrow(eigenvalues)),
    eigenvalues[, 2],
    type = "b",
    pch = 19,
    col = "red"
  )
  garbage <- dev.off()
  
  # PCA 3----
  h <- factoextra::fviz_pca_var(res.pca, 
                                col.var = "contrib") + theme_minimal()
  i <- factoextra::fviz_pca_biplot(res.pca,  
                                   labelsize = 3, 
                                   pointsize = 0.8) + theme_minimal()
  j <- factoextra::fviz_contrib(res.pca, choice = "var", axes = 1)
  l <- factoextra::fviz_contrib(res.pca, choice = "var", axes = 2)
  
  out.pca03 <- paste0(filename, "-pca03.pdf")
  pdf(out.pca03)
  print(h)
  print(i)
  print(j)
  print(l)
  garbage <- dev.off()
  
  # PCA 4
  df <- df[complete.cases(df),]
  prot.pca <- prcomp(t(df))
  prot.pcax <- as.data.frame(prot.pca$x)
  
  prot.pcax$Condition <- row.names(prot.pcax)
  
  p.pca.group <- ggplot(prot.pcax, 
                        aes(x = PC1, 
                            y = PC2, 
                            color = Condition)) + #, color=condition, shape=condition
    geom_point(alpha = .8, size = 3) +
    theme_light() + 
    labs(title = "PCA") 
  
  out.pca04 <- paste0(filename, "-pca04.pdf")
  pdf(out.pca04, width = 9, height = 7)
    plot(p.pca.group)
  garbage <- dev.off()
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Volcano plot (log2fc / pvalues)
#'
#' @description It generates a scatter-plot used to quickly identify changes
#' @param mss_results (data.frame or file) Selected MSstats results
#' @param lfc_upper (numeric) log2fc upper threshold (positive value)
#' @param lfc_lower (numeric) log2fc lower threshold (negative value)
#' @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)
#' @param FDR (numeric) False Discovery Rate threshold
#' @param output_name (char) Name for the output file (don't forget the `.pdf`
#' extension)
#' @param PDF (logical) Option to generate pdf format. Default: `T`
#' @param decimal_threshold (numeric) Decimal threshold for the pvalue.
#' Default: 16 (10^-16)
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @keywords plot, volcano
#' @return (pdf) of a volcano plot
#' @examples
#' artmsVolcanoPlot(mss_results = artms_data_ph_msstats_results,
#'                   whatPvalue = "pvalue",
#'                   PDF = FALSE)
#' @export
artmsVolcanoPlot <- function(mss_results,
                             output_name = "volcano_plot.pdf",
                             lfc_upper = 1,
                             lfc_lower = -1,
                             whatPvalue = "adj.pvalue",
                             FDR = 0.05,
                             PDF = TRUE,
                             decimal_threshold = 16,
                             verbose = TRUE) {
  
  if(any(missing(mss_results)))
    stop("One (or many) of the required arguments missed. 
         Please, check the help for this function to find out more")
  
  if (PDF) {
    if (!grepl("\\.pdf", output_name)) {
      stop("File extension '.pdf' is missed for < output_name >")
    }
  }
  
  mss_results <- .artms_checkIfFile(mss_results)
  
  # handle cases where log2FC is Inf. There are no pvalues or other 
  # information for these cases :(
  # Issues with extreme_val later if we have Inf/-Inf values.
  if (sum(is.infinite(mss_results$log2FC)) > 0) {
    idx <- is.infinite(mss_results$log2FC)
    mss_results$log2FC[idx] <- NA
  }
  
  min_x <- -ceiling(max(abs(mss_results$log2FC), na.rm = TRUE))
  max_x <- ceiling(max(abs(mss_results$log2FC), na.rm = TRUE))
  
  # Deal with special cases in the data where we have pvalues = Inf,NA,0
  if (whatPvalue == "adj.pvalue") {
    if (sum(is.na(mss_results$adj.pvalue)) > 0)
      mss_results <- mss_results[!is.na(mss_results$adj.pvalue), ]
    if (nrow(mss_results[mss_results$adj.pvalue == 0 |
                         mss_results$adj.pvalue == -Inf, ]) > 0)
      mss_results[!is.na(mss_results$adj.pvalue) &
                    (mss_results$adj.pvalue == 0 |
                       mss_results$adj.pvalue == -Inf), ]$adj.pvalue = 10 ^ -decimal_threshold
    max_y = ceiling(-log10(min(mss_results[mss_results$adj.pvalue > 0, ]$adj.pvalue, na.rm = TRUE))) + 1
  } else if (whatPvalue == "pvalue") {
    if (sum(is.na(mss_results$pvalue)) > 0)
      mss_results <- mss_results[!is.na(mss_results$pvalue), ]
    if (nrow(mss_results[mss_results$pvalue == 0 |
                         mss_results$pvalue == -Inf, ]) > 0)
      mss_results[!is.na(mss_results$pvalue) &
                    (mss_results$pvalue == 0 |
                       mss_results$pvalue == -Inf), ]$pvalue = 10 ^ -decimal_threshold
    max_y = ceiling(-log10(min(mss_results[mss_results$pvalue > 0, ]$pvalue, na.rm = TRUE))) + 1
  } else{
    stop("The whatPvalue argument is wrong. Valid options: < pvalue > or < adj.pvalue >")
  }
  l <- length(unique(mss_results$Label))
  w_base <- 7
  h_base <- 7
  
  if (l <= 2) {
    w <- w_base * l
  } else{
    w <- w_base * 2
  }
  h <- h_base * ceiling(l / 2)
  
  if (whatPvalue == "adj.pvalue") {
    p <- ggplot(mss_results, aes(x = log2FC, y = -log10(adj.pvalue)))
    p <- p + geom_point(colour = 'grey', na.rm = TRUE) +
      geom_point(
        data = mss_results[mss_results$adj.pvalue <= FDR &
                             mss_results$log2FC >= lfc_upper, ],
        aes(x = log2FC, y = -log10(adj.pvalue)),
        colour = 'red',
        size = 1,
        na.rm = TRUE
      ) +
      geom_point(
        data = mss_results[mss_results$adj.pvalue <= FDR &
                             mss_results$log2FC <= lfc_lower, ],
        aes(x = log2FC, y = -log10(adj.pvalue)),
        colour = 'blue',
        size = 1,
        na.rm = TRUE
      ) +
      geom_vline(xintercept = c(lfc_lower, lfc_upper),
                 lty = 'dashed') +
      geom_hline(yintercept = -log10(FDR), lty = 'dashed') +
      xlim(min_x, max_x) +
      ylim(0, max_y) +
      facet_wrap(facets = ~ Label,
                 ncol = 2,
                 scales = 'fixed')
  } else{
    p <- ggplot(mss_results, aes(x = log2FC, y = -log10(pvalue)))
    p <- p + geom_point(colour = 'grey', na.rm = TRUE) +
      geom_point(
        data = mss_results[mss_results$pvalue <= FDR &
                             mss_results$log2FC >= lfc_upper, ],
        aes(x = log2FC, y = -log10(pvalue)),
        colour = 'red',
        size = 1,
        na.rm = TRUE
      ) +
      geom_point(
        data = mss_results[mss_results$pvalue <= FDR &
                             mss_results$log2FC <= lfc_lower, ],
        aes(x = log2FC, y = -log10(pvalue)),
        colour = 'blue',
        size = 1,
        na.rm = TRUE
      ) +
      geom_vline(xintercept = c(lfc_lower, lfc_upper),
                 lty = 'dashed') +
      geom_hline(yintercept = -log10(FDR), lty = 'dashed') +
      xlim(min_x, max_x) +
      ylim(0, max_y) +
      facet_wrap(facets = ~ Label,
                 ncol = 2,
                 scales = 'fixed')
  }
  
  if (PDF) {
    pdf(output_name, width = w, height = h)
    print(p)
    garbage <- dev.off()
    if(verbose) message(output_name, " is ready")
  } else{
    print(p)
  }
}
kroganlab/artMS documentation built on July 7, 2023, 5:35 a.m.