R/analysisQuantifications.R

Defines functions .artmsExtractUniprotId .artms_selectTheOneLog2fc .artms_RemoveProtBelowThres .artms_loadModelqcBasic .artms_imputeMissingValues artmsGeneratePhSiteExtended artmsAnnotateSpecie artmsAnalysisQuantifications

Documented in artmsAnalysisQuantifications artmsAnnotateSpecie artmsGeneratePhSiteExtended

#' @title Analysis of the Relative Quantifications
#'
#' @description Analysis of relative quantifications, including:
#'
#' - Annotations
#' - Summary files in different format (xls, txt) and shapes (long, wide)
#' - Numerous summary plots
#' - Enrichment analysis using Gprofiler
#' - PCA of quantifications
#' - Clustering analysis
#' - Basic imputation of missing values
#' 
#' To run this function, the following packages must be installed on your system:
#' - From bioconductor:
#' `BiocManager::install(c("ComplexHeatmap", "org.Mm.eg.db"))`
#' - From CRAN:
#' `install.packages(c("factoextra", "FactoMineR", "gProfileR", "PerformanceAnalytics"))`
#' 
#' @param log2fc_file (char) MSstats results file location
#' @param modelqc_file (char) MSstats modelqc file location
#' @param species (char) Select one species. Species currently supported for
#' a full analysis (including enrichment analysis):
#' - HUMAN
#' - MOUSE
#' To find out species supported only for annotation check 
#' `?artmsIsSpeciesSupported()`
#' 
#' @param output_dir (char) Name for the folder to output the results from the 
#' function. Default is current directory (recommended to provide a new folder
#' name).
#' @param outliers (char) It allows to keep or remove outliers. Options:
#' - `keep` (default): it keeps outliers 'keep', 'iqr', 'std'
#' - `iqr` (recommended): remove outliers +/- 6 x Interquartile Range (IQR)
#' - `std` : 6 x standard deviation
#' @param enrich (logical) Performed enrichment analysis using GprofileR?
#' Only available for species HUMAN and MOUSE. 
#' `TRUE` (default if "human" or "mouse" are the species) or `FALSE`
#' @param l2fc_thres (int) log2fc cutoff for enrichment analysis (default,
#' `l2fc_thres = 1.5`)
#' @param choosePvalue (char) specify whether `pvalue` or `adjpvalue` should 
#' use for the analysis. The default option is `adjpvalue`
#' (multiple testing correction).
#' But if the number of biological replicates for a given experiment is
#' too low (for example n = 2), then `choosePvalue = pvalue` is recommended.
#' @param isBackground (char) background of gene names for enrichment analysis.
#' `nobackground` (default) will use the total number of genes detected.
#' Alternatively provided the file path name to the background gene list.
#' @param isPtm (char) Is a ptm-site quantification? 
#' - `global` (default), 
#' - `ptmsites` (for site specific analysis), 
#' - `ptmph` (Jeff Johnson script output evidence file)
#' @param mnbr (int) **PARAMETER FOR NAIVE IMPUTATION**: 
#' "minimal number of biological replicates" for "naive 
#' imputation" and filtering. Default: `mnbr = 2`. Details:
#' Intensity values for proteins/PTMs that are completely missed in one 
#' of the two conditions compared ("condition A"), but are found in 
#' at least 2 biological replicates (`mnbr = 2`) of the other "condition B", 
#' are imputed (values artificially assigned) and the log2FC values calculated. 
#' The goal is to keep those proteins/PTMs that are consistently found in 
#' one of the two conditions (in this case "condition B") and facilitate the 
#' inclusion in downstream analysis (if wished). The imputed intensity values 
#' are sampled from the lowest intensity values detected in the experiment, 
#' and (**WARNING**) the p-values are just randomly assigned between 
#' 0.05 and 0.01 for illustration purposes (when generating a volcano 
#' plot with the output of `artmsAnalysisQuantifications`) or to include 
#' them when making a cutoff of p-value < 0.05 for enrichment analysis
#' **CAUTION**: `mnbr` would also add the constraint that any protein 
#' must be identified in at least `nmbr` biological replicates of the 
#' **same** condition or it will be filtered out. That is,
#' if `mnbr = 2`, a protein found in two conditions but only in one
#' biological replicate in each of them, it would be removed.
#' @param pathogen (char) Is there a pathogen in the dataset as well?
#' if it does not, then use `pathogen = nopathogen` (default).
#' Pathogens available: `tb` (Tuberculosis), `lpn` (Legionella)
#' @param plotPvaluesLog2fcDist (logical) If `TRUE` (default) plots pvalues
#' and log2fc distributions
#' @param plotAbundanceStats (logical) If `TRUE` (default) plots stats graphs
#' about abundance values
#' @param plotReproAbundance (logical) If `TRUE` plots reproducibility
#' based on normalized abundance values
#' @param plotCorrConditions (logical) If `TRUE` plots correlation  
#' between the different conditions
#' @param plotCorrQuant (logical) if `TRUE` plots correlation between the
#' available quantifications (comparisons)
#' @param plotPCAabundance (logical) if `TRUE` performs PCA analysis of
#' conditions using normalized abundance values
#' @param plotFinalDistributions (logical) if `TRUE` plots distribution of both
#' log2fc and pvalues
#' @param plotPropImputation (logical) if `TRUE` plots proportion of overall
#' imputation
#' @param plotHeatmapsChanges (logical) if `TRUE` plots heatmaps of quantified
#' changes (both all and significant only). Only if `printPDF` is also `TRUE`
#' @param plotTotalQuant (logical) if `TRUE` plots barplot of total number of
#' quantifications per comparison
#' @param plotClusteringAnalysis (logical) if `TRUE` performs clustering
#' analysis between quantified comparisons (more than 1 comparison required)
#' @param data_object (logical) flag to indicate whether the required files 
#' are data objects. Default is FALSE
#' @param printPDF If `TRUE` (default) prints out the pdfs. Warning: plot
#' objects are not returned due to the large number of them. 
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (data.frame) summary of quantifications, including annotations, 
#' enrichments, etc
#' @keywords analysis, quantifications
#' @examples
#' # Testing that the files cannot be empty
#' artmsAnalysisQuantifications(log2fc_file = NULL,
#'                               modelqc_file = NULL,
#'                               species = NULL,
#'                               output_dir = NULL)
#' @export
artmsAnalysisQuantifications <- function(log2fc_file,
                                         modelqc_file,
                                         species,
                                         output_dir = "analysis_quant",
                                         outliers = c("keep", "iqr", "std"),
                                         enrich = TRUE,
                                         l2fc_thres = 1,
                                         choosePvalue = c("adjpvalue","pvalue"),
                                         isBackground = "nobackground",
                                         isPtm = "global",
                                         mnbr = 2,
                                         pathogen = "nopathogen",
                                         plotPvaluesLog2fcDist = TRUE,
                                         plotAbundanceStats = TRUE,
                                         plotReproAbundance = TRUE,
                                         plotCorrConditions = TRUE,
                                         plotCorrQuant = TRUE,
                                         plotPCAabundance = TRUE,
                                         plotFinalDistributions = TRUE,
                                         plotPropImputation = TRUE,
                                         plotHeatmapsChanges = TRUE,
                                         plotTotalQuant = TRUE,
                                         plotClusteringAnalysis = TRUE,
                                         data_object = FALSE,
                                         printPDF = TRUE,
                                         verbose = TRUE
                                         ) {

  Uniprot_PTM = imputed = Gene_Protein = NULL
  
  if(verbose){
    message("---------------------------------------------")
    message("artMS: ANALYSIS OF QUANTIFICATIONS")
    message("---------------------------------------------")
  }
  
  # Checking arguments-----
  
  if(any(missing(log2fc_file) | 
         missing(modelqc_file) |
         missing(species) ))
   stop("One (or many) of the required arguments missed. 
        Please, check the help for this function to find out more")
  
  if(is.null(log2fc_file) & 
     is.null(modelqc_file) & 
     is.null(species) & 
     is.null(output_dir)){
    return("The evidence_file, modelqc_file, species and output_dir arguments cannot be NULL")
  }

  # CHECK POINT: DO THE FILES EXIST?
  if (!is.logical(data_object)) {
    stop(" Argument <data_object> must be logical (TRUE or FALSE) ")
  }
  
  if(!data_object){
    if(!file.exists(log2fc_file)){
      stop("THE FILE ", log2fc_file, " DOES NOT EXIST ")
    }
    
    if(!file.exists(modelqc_file)){
      stop("THE FILE ", modelqc_file, " DOES NOT EXIST ")
    }
  }
  
  if (!is.logical(enrich)) {
    stop(" Argument <enrich> must be logical (TRUE or FALSE) ")
  }

  if (!is.numeric(l2fc_thres)) {
    stop(" Argument <l2fc_thres> must be numeric ")
  }
  
  if (!is.numeric(mnbr)) {
    stop(" Argument <mnbr> must be numeric ")
  }

  if(!(isPtm %in% c('global', 'ptmph', 'ptmsites'))){
    stop("The < isPtm > argument is wrong. 
         The valid options are: <global> or <ptmsites> ")
  }
  
  outliers <- tolower(outliers)
  outliers <- match.arg(outliers)
  
  choosePvalue <- tolower(choosePvalue)
  choosePvalue <- match.arg(choosePvalue)
  
  # Check supported species------
  if(isFALSE(artmsIsSpeciesSupported(species))){
    if(verbose) message("----(-) ", species, " is not supported. \n\tGene Symbol, Protein Name, and EntrezID won't be provided")
  }
  
  species <- tolower(species)
  if(!(species %in% c('human', 'mouse'))){
    if(enrich){
      message("--- Enrichment analysis turned off (only available for HUMAN and MOUSE)")
      enrich <- FALSE
    }
  }
  
  # Checking required packages-----

  ## Bioconductor-----
  cp <- 0
  if(species == "human"){
    if (!"org.Hs.eg.db" %in% installed.packages()){
      message(paste0('- Package < org.Hs.eg.db > not installed in your system. 
                     Please, install running < BiocManager::install(\"org.Hs.eg.db\") >'))
      cp = cp + 1
    }
  }else if(species == "mouse"){
    if (!"org.Mm.eg.db" %in% installed.packages()){
      message(paste0('- Package < org.Mm.eg.db > not installed in your system. 
                     Please, install running < BiocManager::install(\"org.Mm.eg.db\") >'))
      cp = cp + 1
    }
  }
  
  ## CRAN -----
  required_bioc_packages <- c("ComplexHeatmap")
  for(p in required_bioc_packages){
    if(!(p %in% installed.packages())){
      message(paste0('- Package < ', p, ' > not installed in your system. 
                     Please, install running < BiocManager::install(\"',p,'\") >'))
      cp <- cp + 1
    }
  }
  
  required_cran_packages <- c("factoextra", "FactoMineR", "gProfileR", "PerformanceAnalytics")
  
  for(p in required_cran_packages){
    if(!(p %in% installed.packages())){
      message(paste0('- Package < ', p, ' > not installed in your system. 
                     Please, install running < install.packages(\"',p,'\") >'))
      cp <- cp + 1
    }
  }

  if(cp > 0){
    stop("\nRequired packages to run artmsAnalysisQuantifications() not available. Please, install and run again")
  }
  
  
  # Check pathogens-----
  pathogen <- tolower(pathogen)
  if (pathogen == "nopathogen") {
    if(verbose) message("--- No Pathogen extra in these samples")
  } else if (pathogen == "tb") {
    # This should not work
    if(verbose) message("\nPATHOGEN IN SAMPLES: Tuberculosis (TB)\n")
    pathogen.ids <- artms_data_pathogen_TB
    names(pathogen.ids) <- c('Entry')
  } else if (pathogen == "lpn") {
    if(verbose) message("\nPATHOGEN IN SAMPLES: LEGIONELLA PNEUMOPHILA\n")
    pathogen.ids <- artms_data_pathogen_LPN
  } else{
    stop("This pathogen is not supported yet")
  }
  
  # Session info-----
  session <- sessionInfo()
  sink("artms_sessionInfo_analysisQuant.txt")
  print(session)
  sink()
  
  # Create dir-----
  log2dirname <- normalizePath(dirname(log2fc_file))
  output_dir <- file.path(log2dirname, paste0(output_dir, "_", choosePvalue))
  
  if(!dir.exists(file.path(output_dir))){
    dir.create(file.path(output_dir), recursive = TRUE)
  }
  
  # Create the ouput_directory in the results folder
  # dirname(log2fc_file)
  # getwd()

  # Open log2fc-----
  if(verbose) message(">> LOADING QUANTIFICATIONS (-results.txt from MSstats) ")
  if(data_object){
    dflog2fcraw <- log2fc_file
    log2fc_file <- "analysis-results.txt"
  }else{
    dflog2fcraw <- read.delim(log2fc_file,
                              header = TRUE,
                              sep = "\t",
                              stringsAsFactors = FALSE)
  }
  
  # use this as a template to generate the output file names
  log2fc_file <- basename(log2fc_file)
  
  # Open modelqc ----
  if(verbose) message(">> LOADING modelqc FILE (ABUNDANCE) ")
  if(data_object){
    dfmq <- modelqc_file
    remove(modelqc_file)
  }else{
    dfmq <- read.delim(modelqc_file,
                       header = TRUE,
                       sep = "\t",
                       stringsAsFactors = FALSE)
    colnames(dfmq) <- toupper(colnames(dfmq))
  }
  
  # ModelQC: fix old files-----
  if("GROUP_ORIGINAL" %in% colnames(dfmq)){
    if(verbose) message("\n------------------------------------------------------------")
    if(verbose) message("(!) WARNING!")
    if(verbose) message("Be aware that you are analyzing a dataset that was generated with an old version of MSstats (< 4.0)")
    if(verbose) message("Some changes will be applied")
    if(verbose) message(" ------------------------------------------------------------\n")
    dfmq$GROUP <- NULL
    dfmq$GROUP <- dfmq$GROUP_ORIGINAL
    dfmq$SUBJECT <- NULL
    dfmq$SUBJECT <- dfmq$SUBJECT_ORIGINAL
  }
  
  # Removing the empty protein names-----
  if (any(dfmq$PROTEIN == "")) {
    dfmq <- dfmq[-which(dfmq$PROTEIN == ""), ]
    if(verbose) message("--- Empty ID proteins removed from abundance data")
  }
  dfmq$PROTEIN <- gsub("(sp\\|)(.*)(\\|.*)", "\\2", dfmq$PROTEIN)
  dfmq$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", dfmq$PROTEIN)
  
  # Check protein ID----
  if( length(dfmq$PROTEIN[grep("\\w{2}_\\d{1,}\\.\\d{1,}", dfmq$PROTEIN)]) > 100 ){
    if(verbose) message("----(-) Many RefSeq IDs detected in this dataset, which is not supported yet.
                        Gene Symbol, Protein Name, and EntrezID won't be provided")
  }
  
  # Remove outliers------
  if (outliers == "iqr" | outliers == "std"){

    if(outliers == "iqr"){
      if(verbose) message("--- Removing outliers (based on 6xIQR cutoff)")
      iqr <- IQR(dfmq$ABUNDANCE)
      m <- mean(dfmq$ABUNDANCE)
      uplim <- m+(6*iqr)
      lowlim <- m-(6*iqr)
    }else if(outliers == "std"){
      if(verbose) message("--- Removing outliers (based on 6xSTD cutoff)")
      std <- sd(dfmq$ABUNDANCE)
      m <- mean(dfmq$ABUNDANCE)
      uplim <- m+3*std
      lowlim <- m-3*std
    }

    # Record outliers for plotting
    dfmq$outliers <- ifelse(dfmq$ABUNDANCE > lowlim & dfmq$ABUNDANCE < uplim, "no", "yes")
    
    outliers_number <- length(dfmq$outliers[which(dfmq$outliers == "yes")])
    
    # Report outliers removed
    if (outliers_number == 0){
      if(verbose) message("------> No outliers have been removed")
    }else{
      if(verbose) message("------> Total outliers removed: ", outliers_number)
      out.outliers <- gsub(".txt", paste0("-outliers-removed-",outliers,".txt"), log2fc_file)
      out.outliers <- paste0(output_dir, "/", out.outliers)
      dfmq.outliers <- dfmq[which(dfmq$outliers == "yes"),]
      write.table(
        dfmq.outliers,
        out.outliers,
        quote = FALSE,
        sep = "\t",
        row.names = FALSE,
        col.names = TRUE
      )
      
      # PLOT Outliers
      k <- ggplot(dfmq, aes(x = SUBJECT, y = ABUNDANCE, color = outliers))
      k <- k + geom_jitter(width = 0.3, size = 0.5, na.rm = TRUE)
      k <- k + theme_minimal()
      k <- k + ggtitle("Distribution of ABUNDANCE with outliers")
      k <-k + theme(axis.text.x = element_text(
        angle = 90,
        hjust = 1,
        vjust = 0.5
      ))
      
      # Distribution without ouliers
      l <- ggplot(dfmq[which(dfmq$outliers == "no"),], aes(x = SUBJECT, y = ABUNDANCE))
      l <- l + geom_jitter(width = 0.3, size = 0.5, na.rm = TRUE)
      l <- l + theme_minimal()
      l <- l + ggtitle("Distribution of ABUNDANCE (outliers removed)")
      l <- l + theme(axis.text.x = element_text(
        angle = 90,
        hjust = 1,
        vjust = 0.5
      ))
      
      out.outliers <- gsub(".txt", paste0("-outliers-removed-",outliers,".pdf"), log2fc_file)
      out.outliers <- paste0(output_dir, "/", out.outliers)
      if(printPDF) pdf(out.outliers)
      plot(k)
      plot(l)
      if(printPDF) garbage <- dev.off()
      
      if(verbose) message("------> Check table and plot to find out more about outliers removed")

      # Remove outliers
      dfmq <- dfmq[which(dfmq$ABUNDANCE > lowlim & dfmq$ABUNDANCE < uplim), ]
    }
  }else{
    if(verbose) message("--- Outliers kept (user selection)")
    dfmq <- dfmq
  }

  # Conditions----
  conditions <- unique(dfmq$GROUP)
  numberConditions <- length(conditions)
  
  # KEY STEP: getting background gene list-----
  if (isBackground == "nobackground") {
    # If not list of background genes is provided,
    # then extract them from the modelqc file
    if (isPtm == "global") {
      suppressMessages(
        dfmq2Genes <- artmsAnnotationUniprot(dfmq, 'PROTEIN', species)
        )
      numberTotalGenes <- length(unique(dfmq2Genes$Gene))
      if(verbose) message("--- Total number of genes/proteins: ", numberTotalGenes, " ")
      listOfGenes <- unique(dfmq2Genes$Gene)
    } else if (grepl("ptm", isPtm)) {
      dfmq2Genes <- dfmq[c('PROTEIN', 'GROUP')] 
      names(dfmq2Genes)[grep('PROTEIN', names(dfmq2Genes))] <- 'Protein'
      # # Removing party sites
      # dfmq2Genes <- dfmq2Genes[grep(",", dfmq2Genes$Protein, invert = TRUE), ]
      
      dfmq2Genes <- .artmsExtractUniprotId(x = dfmq2Genes, 
                                           uniprotPtmColumn = "Protein", 
                                           newColumnName = "Protein")
      
      dfmq2Genes <- unique(dfmq2Genes)
      suppressMessages(
        dfmq2Genes <- artmsAnnotationUniprot(dfmq2Genes, 'Protein', species)
        )
      numberTotalGenes <- length(unique(dfmq2Genes$Gene))
      if(verbose) message("--- TOTAL NUMBER OF GENES/PROTEINS: ",
          numberTotalGenes, "  ")
      if (numberTotalGenes == 0) {
        stop("IDs are not recognized")
      }
      listOfGenes <- unique(dfmq2Genes$Gene)
    }
  } else{
    # No matter what list is provided, it must come with a "Gene" column
    backgroundList <- read.delim(isBackground,
                                 header = TRUE,
                                 sep = "\t",
                                 quote = "",
                                 stringsAsFactors = FALSE)
    listOfGenes <- unique(backgroundList$Gene)
  }
  
  backgroundNumber <- length(listOfGenes)
  
  
  # Process log2fc----
  
  if (any(dflog2fcraw$Protein == "")) {
    dflog2fcraw <- dflog2fcraw[-which(dflog2fcraw$Protein == ""), ]
    if(verbose) message("--- Empty ID proteins removed from log2fc data")
  }
  dflog2fcraw$Protein <- gsub("(sp\\|)(.*)(\\|.*)", "\\2", dflog2fcraw$Protein)
  dflog2fcraw$Protein <- gsub("(.*)(\\|.*)", "\\1", dflog2fcraw$Protein)
  
  dflog2fcfinites <- dflog2fcraw[is.finite(dflog2fcraw$log2FC), ]
  
  
  ## Removing log2fc outliers----
  cutofflog2fc <- 15
  if(verbose) message(
    "--- Removing log2fc outliers (",
    paste0("-", cutofflog2fc, " < log2fc < +", cutofflog2fc),
    ") "
  )
  
  filtermorethan10 <- length(dflog2fcfinites$log2FC[abs(dflog2fcfinites$log2FC) > cutofflog2fc])
  if (filtermorethan10 > 0) {
    if(verbose) message("------ (-) Removing [",
                        filtermorethan10,
                        "] protein ids with a abs(log2fc) >",
                        cutofflog2fc, 
                        " ")
    dflog2fcfinites <- dflog2fcfinites[-which(abs(dflog2fcfinites$log2FC) > cutofflog2fc), ]
  }
  
  ## IMPUTING MISSING VALUES-----
  # When a value is completely missed in one of the conditions,
  # the log2fc = Inf / -Inf. Here, we impute those values.
  # The imputation method works as follow. The assumption is that those
  # proteins are likely present as well in those conditions where are missed, 
  # but due to the
  # small sampling (usually 2 or 3 biological replicas) and other proteomics
  # related issue, those proteins didn't make it through the level of detection.
  # Therefore, a small intensity (sampled from the bottom 5%) will be assigned
  # to the protein/site in the missing condition, and the new log2fc is 
  # re-calculated out of the MSstats box. Two issues are addressed in this way
  # 1. If a protein has been consistently identified in one of the conditions, 
  # it will stay
  # 2. But if the intensity value in those conditions was too low, then the 
  # log2fc will be also low
  
  if(verbose) message(">> IMPUTING MISSING VALUES ")
  # Select infinite values (i.e., log2fc missed for that)
  dflog2fcinfinites <- dflog2fcraw[is.infinite(dflog2fcraw$log2FC), ]
  numberInfinites <- dim(dflog2fcinfinites)[1]
  
  # Control
  if (numberInfinites < 1) {
    if(verbose) message(" WARNING: O Infinite values (not very usual)")
  } else {
    if(verbose) message("--- Number of +/- INF values: ", dim(dflog2fcinfinites)[1], " ")
    
    imputedL2FCmelted <- .artms_imputeMissingValues(dflog2fcinfinites = dflog2fcinfinites, 
                                                    dfmq = dfmq)
    
    # Merge with the original log2fc values to impute...
    theImputedL2FC <- merge(dflog2fcinfinites,
                            imputedL2FCmelted,
                            by = c("Protein", "Label"),
                            all = FALSE)
    # check size of theImputedL2FC compared with dflog2fcinfinites to see if we lost rows during imputation
    # possibly we were unable to impute rows after removing outliers
    if (nrow(theImputedL2FC) != nrow (dflog2fcinfinites)){
      numMissing = nrow (dflog2fcinfinites) - nrow(theImputedL2FC)
      message(" WARNING:  Failed to impute values for ", numMissing, "rows. Possibly due to outlier removal")
    }
      
    theImputedL2FC$imputed <- "yes"
  }
  
  # Getting the data ready for merging
  dflog2fcfinites$imputed <- "no"
  dflog2fcfinites$iLog2FC <- dflog2fcfinites$log2FC
  
  ## Choose the pvalue or adjusted pvalue as the iPvalue-----
  if (choosePvalue == "pvalue") {
    dflog2fcfinites$iPvalue <- dflog2fcfinites$pvalue
  } else if (choosePvalue == "adjpvalue") {
    dflog2fcfinites$iPvalue <- dflog2fcfinites$adj.pvalue
  } else{
    stop("Only <pvalue> or <adjpvalue> for argument <choosePvalue>")
  }
  
  # Merging: NA values are thrown away at this point
  if (numberInfinites < 1) {
    dflog2fc <- dflog2fcfinites
  } else {
    dflog2fc <- rbind(dflog2fcfinites, theImputedL2FC)
  }
  
  ## Plot pvalue distributions-----
  if(plotPvaluesLog2fcDist){
    if(verbose) message("--- Plotting distributions of log2fc and pvalues ")
    
    plotDFdistColor <-
      ggplot(dflog2fc, aes(x = log2FC, fill = Label)) +
      geom_histogram(bins = 100,
                     alpha = .4,
                     col = "black", 
                     na.rm = TRUE) +
      labs(title = "Distribution log2FC", x = "log2FC")
    
    plotDFdistAll <- ggplot(dflog2fc, aes(x = log2FC)) +
      geom_histogram(bins = 100,
                     alpha = .4,
                     col = "black", 
                     na.rm = TRUE) +
      labs(title = "Distribution log2FC", x = "log2FC")
    
    plotDFdistiLog <- ggplot(dflog2fc, aes(x = iLog2FC)) +
      geom_histogram(bins = 100,
                     alpha = .4,
                     col = "black", 
                     na.rm = TRUE) +
      labs(title = "Distribution ilog2FC (imputed + nonimputed", x = "iLog2FC")
    
    plotPvalues <-
      ggplot(dflog2fc[is.finite(dflog2fc$pvalue), ], aes(x = pvalue)) +
      geom_histogram(bins = 50,
                     alpha = .4,
                     col = "black",
                     na.rm = TRUE) +
      labs(title = "Distribution p-values", x = "p-values")
    
    plotAdjustedPvalues <-
      ggplot(dflog2fc[-which(dflog2fc$adj.pvalue == 0), ], aes(x = adj.pvalue)) +
      geom_histogram(bins = 150,
                     alpha = .4,
                     col = "black",
                     na.rm = TRUE) +
      labs(title = "Distribution adj.pvalues", x = "adj.values")
    
    plotAdjustedIpvalues <- ggplot(dflog2fc, aes(x = iPvalue)) +
      geom_histogram(bins = 150,
                     alpha = .4,
                     col = "black",
                     na.rm = TRUE) +
      labs(title = "Distribution imputed p-values", x = "iPvalues")
    
    
    # DISTRIBUTION PRINT OUTS
    distributionsOut <- gsub(".txt", ".distributions.pdf", log2fc_file)
    distributionsOut <- paste0(output_dir, "/", distributionsOut)
    if(printPDF) pdf(distributionsOut)
    print(plotDFdistColor)
    print(plotDFdistAll)
    print(plotDFdistiLog)
    print(plotPvalues)
    print(plotAdjustedPvalues)
    print(plotAdjustedIpvalues)
    
    if (numberInfinites > 0) {
      hist(
        imputedL2FCmelted$iLog2FC,
        breaks = 100,
        main = paste0("Imputed Log2FC (all)\n n = ", dim(imputedL2FCmelted)[1]),
        xlab = "log2fc"
      )
      hist(
        theImputedL2FC$iLog2FC,
        breaks = 100,
        main = paste0("Imputed Log2FC merged\n n = ", dim(theImputedL2FC)[1]),
        xlab = "log2fc"
      )
    }
    
    hist(
      dflog2fcfinites$pvalue,
      breaks = 100,
      main = paste0("p-value distribution\n n = ", dim(dflog2fcfinites)[1]),
      xlab = "adj.pvalues"
    )
    hist(
      dflog2fcfinites$adj.pvalue,
      breaks = 100,
      main = paste0("Adjusted p-values distribution\n n = ", 
                    dim(dflog2fcfinites)[1]),
      xlab = "adj.pvalues"
    )
    hist(
      dflog2fcfinites$iLog2FC,
      breaks = 1000,
      main = paste0(
        "Non-imputed Log2FC distribution\n n = ",
        dim(dflog2fcfinites)[1]
      ),
      xlab = "log2FC"
    )
    hist(
      dflog2fc$iPvalue,
      breaks = 100,
      main = paste0(
        "(Imputed+NonImputed) adjusted pvalue distribution\n n = ",
        dim(dflog2fc)[1]
      ),
      xlab = "adj.pvalues"
    )
    hist(
      dflog2fc$iLog2FC,
      breaks = 1000,
      main = paste0(
        "(Imputed+NonImputed) log2fc distribution\n n = ",
        dim(dflog2fc)[1]
      ),
      xlab = "log2FC"
    )
    if(printPDF) garbage <- dev.off()
  }
  
  # Plot correlations based on log2fc------
  if(plotCorrQuant){
    if(verbose) message(">> PLOT: CORRELATION BETWEEN QUANTIFICATIONS (based on log2fc values)")
    if (length(unique(dflog2fc$Label)) > 1) {
      relaChanges <- gsub(".txt", ".correlationQuantifications.pdf", log2fc_file)
      relaChanges <- paste0("plot.", relaChanges)
      relaChanges <- paste0(output_dir, "/", relaChanges)
      if(printPDF) pdf(relaChanges)
      .artms_plotRatioLog2fc(dflog2fc, verbose = verbose)
      if(printPDF) garbage <- dev.off()
    } else{
      if(verbose) message("--- Only one Comparison is available (correlation is not possible) ")
    }
  }
  
  # ABUNDANCE-----
  ## Relationship between conditions------
  # Get the number of biological replicas based on the first condition
  theConditions <- unique(dfmq$GROUP)
  theFirstCond <- theConditions[2]
  condFirst <- dfmq[which(dfmq$GROUP == theFirstCond), ]
  theBiologicalReplicas <- unique(condFirst$SUBJECT)
  numberBioReplicas <- length(theBiologicalReplicas)

  ## ABUNDANCE PLOTS----
  
  if(plotAbundanceStats){
    if(verbose) message(">> PLOTS: ABUNDANCE PLOTS ")
    abundancesName <- gsub(".txt", ".relativeABUNDANCE.pdf", log2fc_file)
    abundancesName <- paste0("plot.", abundancesName)
    abundancesName <- paste0(output_dir, "/", abundancesName)
    if(printPDF) pdf(abundancesName)
    .artms_plotAbundanceBoxplots(df = dfmq)
    .artms_plotNumberProteinsAbundance(df = dfmq)
    if(printPDF) garbage <- dev.off()
  }
  
  ## Reproducibility plots-----
  if(plotReproAbundance){
    if(verbose) message(">> PLOTS: REPRODUCIBILITY PLOTS ")
    reproName <- gsub(".txt", ".reproducibilityAbundance.pdf", log2fc_file)
    reproName <- paste0("plot.", reproName)
    reproName <- paste0(output_dir, "/", reproName)
    if(printPDF) pdf(reproName)
    .artms_plotReproducibilityAbundance(x = dfmq, verbose = verbose)
    if(printPDF) garbage <- dev.off()
  }
  
  ## Correlation between Conditions------
  if(plotCorrConditions){
    if(verbose) message(">> PLOT: CORRELATION BETWEEN ALL COMPARISONS ")
    relaCond <- gsub(".txt", ".correlationConditions.pdf", log2fc_file)
    relaCond <- paste0("plot.", relaCond)
    relaCond <- paste0(output_dir, "/", relaCond)
    if(printPDF) pdf(relaCond)
    .artms_plotCorrelationConditions(x = dfmq, 
                                     numberBiologicalReplicas = numberBioReplicas)
    if(printPDF) garbage <- dev.off()
  }
  
  ## ABUNDANCE DATA, CREATE FILTERS-----
  abundance <- .artms_loadModelqcBasic(dfmq)
  names(abundance)[grep('Protein', names(abundance))] <- 'Prey'
  names(abundance)[grep('Condition', names(abundance))] <- 'Bait'
  # TECHNICAL REPLICAS: if there are technical replicas means that we will find
  # two values for the same protein in the same bioreplica, therefore we need to
  # aggregate first just in case:
  ##LEGACY
  # abundance <- aggregate(Abundance ~ Prey + Bait + BioReplicate,
  #                        data = abundance,
  #                        FUN = mean)
  # abundance_dcmean <- data.table::dcast(abundance,
  #                                       Prey ~ Bait,
  #                                       value.var = 'Abundance',
  #                                       fun.aggregate = mean,
  #                                       fill = 0)
  
  # Let's aggregate to get the sum of the abundance, we will use it later.
  # abundance_dcsum <- data.table::dcast(abundance,
  #                                      Prey ~ Bait,
  #                                      value.var = 'Abundance',
  #                                      fun.aggregate = sum,
  #                                      fill = 0)
  
  abundance_dcsum <- abundance %>% pivot_wider(names_from = Bait, 
                                                id_cols = Prey,
                                                values_from = Abundance, 
                                                values_fn = list(Abundance = sum))
  
  abundance_dcmean <- abundance %>% pivot_wider(names_from = Bait, 
                                                 id_cols = Prey,
                                                 values_from = Abundance, 
                                                 values_fn = list(Abundance = mean))
                                               
  # Melt again the sum and mean
  ##LEGACY
  # abundancelongsum <- data.table::melt(abundance_dcsum,
  #                                    id.vars = c('Prey'),
  #                                    value.name = 'Abundance',
  #                                    variable.name = 'Bait')
  # 
  # abundancelongmean <- data.table::melt(abundance_dcmean,
  #                                     id.vars = c('Prey'),
  #                                     value.name = 'Abundance',
  #                                     variable.name = 'Bait')
  
  abundancelongsum <- abundance_dcsum %>% pivot_longer(cols = -Prey, 
                                                        names_to = "Bait", 
                                                        values_to = "AbSum")
  
  abundancelongmean <- abundance_dcmean %>% pivot_longer(cols = -Prey, 
                                                           names_to = "Bait", 
                                                           values_to = "AbMean")
  
  # We dont need the 0 values
  abundancelongsum <- abundancelongsum[!(abundancelongsum$AbSum == 0), ]
  abundancelongmean <- abundancelongmean[!(abundancelongmean$AbMean == 0), ]
  
  abundancelongsum <- abundancelongsum[!(is.na(abundancelongsum$AbSum)), ]
  abundancelongmean <- abundancelongmean[!( is.na(abundancelongmean$AbMean) ), ]
  
  abundancelongsummean <- merge(abundancelongsum, abundancelongmean, by = c('Prey', 'Bait'))
  
  remove(abundancelongsum, abundancelongmean, abundance_dcsum, abundance_dcmean)
  
  # REPRODUCIBILITY AND SPECIFICY PARATEMERS
  
  # Get the number of bioreplicates based on abundance data
  
  ##LEGACY
  # nbr_wide <- data.table::dcast(abundance,
  #                               Prey ~ Bait,
  #                               value.var = 'Abundance',
  #                               fun.aggregate = length,
  #                               fill = 0)
  # nbr_long <- data.table::melt(nbr_wide,
  #                            id.vars = c('Prey'),
  #                            value.name = 'Abundance',
  #                            variable.name = 'Bait')
  # 
  # nbr_long <- nbr_long[!(nbr_long$BioRep == 0), ]
  # names(nbr_long)[grep('Abundance', names(nbr_long))] <- 'BioRep'
  
  nbr_wide <- abundance %>% pivot_wider(names_from = Bait, 
                                         id_cols = Prey,
                                         values_from = Abundance, 
                                         values_fn = list(Abundance = length))
  nbr_wide[(is.na(nbr_wide))] <- 0
  
  nbr_long <- nbr_wide %>% pivot_longer(cols = -Prey, 
                                        names_to = "Bait", 
                                        values_to = "BioRep")
  
  nbr_long <- nbr_long[!(nbr_long$BioRep == 0), ]
  nbr_long <- nbr_long[!( is.na(nbr_long$BioRep) ), ]
                                                       
  # Get the number of replicates in long format
  ##LEGACY
  # OUTreprod <- data.table::dcast(data = nbr_long, Prey ~ Bait, value.var = 'BioRep')
  # OUTreprod[is.na(OUTreprod)] <- 0
  
  OUTreprod <- abundance %>% pivot_wider(id_cols = Prey, 
                                        names_from = Bait, 
                                        values_from = Abundance, 
                                        values_fn = list(Abundance = length))
  
  OUTreprod[is.na(OUTreprod)] <- 0
  
  # Make a copy for conditions
  OUTreproCondition <- OUTreprod
  
  here <- dim(OUTreprod)[2]
  
  # Make a copy to use later
  bioReplicaInfo <- OUTreprod
  
  # And get the total number of biological replicates
  
  OUTreprod$BiorepCount <- rowSums(OUTreprod[, 2:here])
  
  # Get whether a protein is found in all conditions
  reprospec2merge <- subset(OUTreprod, select = c(Prey, BiorepCount))
  
  ##LEGACY
  # OUTreproCondition <- data.table::dcast(data = nbr_long, Prey ~ Bait, value.var = 'BioRep')
  # OUTreproCondition[is.na(OUTreproCondition)] <- 0
  
  thedim <- dim(OUTreproCondition)[2]
  thepreys <- subset(OUTreproCondition, select = c(Prey))
  thevalues <- subset(OUTreproCondition, select = -c(Prey))
  thevalues[thevalues > 0] <- 1
  thevalues$CondCount <- rowSums(thevalues)
  FinalReproCondition <- cbind(thepreys, thevalues)
  
  reprocondition2merge <- subset(FinalReproCondition, select = c(Prey, CondCount))
  
  # This version will be printed out below
  OUTreprodFinal <- merge(OUTreprod, reprocondition2merge, by = 'Prey')
  
  # PCA ANALYSIS-----
  # It requires a simplified version for modelqc
  if(plotPCAabundance){
    if(verbose) message(">> PRINCIPAL COMPONENT ANALYSIS BASED ON ABUNDANCE ")
    modelqcabundance <- .artms_loadModelQCstrict(df = dfmq,
                                                 species = species,
                                                 ptmis = isPtm,
                                                 verbose = verbose)
    out.pca <- gsub(".txt", "-pca", log2fc_file)
    out.pca <- paste0(output_dir, "/", out.pca)
    suppressWarnings(.artms_getPCAplots(x = modelqcabundance, 
                                        filename = out.pca, 
                                        allConditions = conditions))
  }

  # ANNOTATIONS-----
  modelqc_file_splc <- .artms_mergeAbNbr(df_input =  dfmq, 
                                         repro =  nbr_wide, 
                                         species =  species)
  
  if(verbose) message(">> ANNOTATIONS ")
  # Now get ready for annotation
  if(verbose) message("--- Abundance data ")
  if (grepl("ptm", isPtm)) {
    names(modelqc_file_splc)[grep('^Protein$', names(modelqc_file_splc))] <- 'Uniprot_PTM'
    # Take the Protein ID, but being very careful about the fluomics labeling
    
    modelqc_file_splc <- .artmsExtractUniprotId(x = modelqc_file_splc, 
                                                uniprotPtmColumn = "Uniprot_PTM", 
                                                newColumnName = "Protein")
    suppressMessages(
      modelqc_file_splc <- artmsAnnotationUniprot(modelqc_file_splc, 'Protein', species)
      )
  } else{
    suppressMessages(
      modelqc_file_splc <- artmsAnnotationUniprot(modelqc_file_splc, 'Protein', species)
    )
  }
  
  if(verbose) message("--- Relative Quantifications (Log2fc) ")
  
  if(choosePvalue == "adjpvalue"){
    choosePvalue <- "adj.pvalue"
  }
  # Prepare output of changes
  log2fc_file_splc <- .artms_mergeChangesNbr(df_input = dflog2fc, 
                                             repro =  nbr_wide, 
                                             species =  species,
                                             choosePvalue = choosePvalue)
  
  # Now get ready for annotation
  if (grepl("ptm", isPtm)) {
    names(log2fc_file_splc)[grep('^Protein$', names(log2fc_file_splc))] <- 'Uniprot_PTM'
    # Take the Protein ID, but being very careful about the fluomics labeling
    log2fc_file_splc <- .artmsExtractUniprotId(x = log2fc_file_splc, 
                                               uniprotPtmColumn = "Uniprot_PTM", 
                                               newColumnName = "Protein")
    suppressMessages(
      log2fc_file_splc <- artmsAnnotationUniprot(log2fc_file_splc, 'Protein', species)
    )
  } else{
    suppressMessages(
      log2fc_file_splc <- artmsAnnotationUniprot(log2fc_file_splc, 'Protein', species)
    )
  }
  
  if(verbose) message(">> FILTERING CHANGES BEFORE PRINTING OUT ")
  
  imputedDF <- dflog2fc[c('Protein',
                          'Label',
                          'log2FC',
                          'pvalue',
                          'adj.pvalue',
                          'imputed',
                          'iLog2FC',
                          'iPvalue'
                          )]
    
    
  if(verbose) message("--- Merging Changes with replication data ")
  imputedDF <- merge(imputedDF,
                     bioReplicaInfo,
                     by.x = 'Protein',
                     by.y = 'Prey',
                     all.x = TRUE)
  
  if(verbose) message("--- Removing NA ")
    imputedDF <- imputedDF[!is.na(imputedDF$log2FC), ]
  
  if(verbose) message("--- Add labeling of condition more abundant in the quantification ")
    imputedDF$CMA <- mapply(.artms_selectTheOneLog2fc,
                            imputedDF$iLog2FC,
                            imputedDF$Label)
  
  if(verbose) message("--- Removing proteins not found in a minimal number <",
                      mnbr,
                      "> (user selected) of biological replicates ")
  
  before <- dim(imputedDF)[1]
  imputedDF <- .artms_RemoveProtBelowThres(imputedDF, mnbr)
  after <- dim(imputedDF)[1]
  total_remove <- before - after
  
  if(verbose) message("\t", total_remove, " proteins removed (i.e., not found in more than <", mnbr, "> biological replicates in at least one condition)")
  
  if(verbose) message("--- Filtering is done! ")
  
  # PLOTS: GENERATING QC PLOTS ABOUT CHANGES-----
  if(plotFinalDistributions){
    if(verbose) message(">> GENERATING QC PLOTS ABOUT CHANGES (log2fc) ")
    if(verbose) message("--- Distribution of log2fc and pvalues ")
    distributionsFilteredOut <- gsub(".txt", ".distributionsFil.pdf", log2fc_file)
    distributionsFilteredOut <- paste0(output_dir, "/", distributionsFilteredOut)
    if(printPDF) pdf(distributionsFilteredOut)
      hist(imputedDF$iLog2FC,
           breaks = 1000,
           main = paste0("Filtered Log2FC (>2BR)\n n = ", dim(imputedDF)[1]),
           xlab = "log2fc")
      hist(imputedDF$iPvalue,
           breaks = 1000,
           main = paste0("Filtered p-values (>2BR)\n n = ", dim(imputedDF)[1]),
           xlab = "p-value")
    if(printPDF) garbage <- dev.off()
  }
  
  if(plotPropImputation){
    if(verbose) message("--- Proportion imputed values ")
    # Stats about imputed values
    yesimputed <- dim(imputedDF[which(imputedDF$imputed == 'yes'), ])[1]
    nonimputed <- dim(imputedDF[which(imputedDF$imputed == 'no'), ])[1]
    
    dat <- data.frame(count = c(yesimputed, nonimputed),
                      category = c("Imputed", "Non-Imputed"))
    # Add addition columns, needed for drawing with geom_rect.
    dat$fraction = dat$count / sum(dat$count)
    dat <- dat[order(dat$fraction),]
    dat$ymax <- cumsum(dat$fraction)
    dat$ymin <- c(0, head(dat$ymax, n = -1))
    
    p1 <- ggplot(dat, aes(fill = category,
                          ymax = ymax,
                          ymin = ymin,
                          xmax = 4,
                          xmin = 3)) +
      geom_rect(na.rm = TRUE) +
      coord_polar(theta = "y") +
      xlim(c(0, 4)) +
      labs(title = "Proportion of Imputed Intensity values")
    
    p2 <- ggplot(dat, aes(x = category, y = count, fill = category)) +
      geom_bar(stat = "identity",
               na.rm = TRUE) +
      labs(title = "Proportion of Imputed Intensity values")
    
    outImputation <- gsub(".txt", ".imputation.pdf", log2fc_file)
    outImputation <- paste0(output_dir, "/", outImputation)
    if(printPDF) pdf(outImputation)
    print(p1)
    print(p2)
    if(printPDF) garbage <- dev.off()
  }
  
  if(plotHeatmapsChanges){
    if(verbose) message(">> HEATMAPS OF CHANGES (log2fc) ")
    
    ##LEGACY
    # l2fcol <- data.table::dcast(data = imputedDF, 
    #                             Protein ~ Label, 
    #                             value.var = 'iLog2FC')
    l2fcol <- imputedDF %>% tidyr::pivot_wider(id_cols = Protein, 
                                                names_from = Label, 
                                                values_from = iLog2FC)
    
    l2fcol <- as.data.frame(l2fcol)
    rownames(l2fcol) <- l2fcol$Protein
    l2fcol <- within(l2fcol, rm(Protein))
    l2fcol[is.na(l2fcol)] <- 0
    
    if (numberConditions > 1) {
      l2fcolmatrix <- data.matrix(l2fcol)
      if(verbose) message("--- All changes ")
      
      if(printPDF){
        outHeatMapOverallL2fc <- gsub(".txt",
                                      ".clustering.log2fc.all-overview.pdf",
                                      log2fc_file)
        
        outHeatMapOverallL2fc <- paste0(output_dir, "/", outHeatMapOverallL2fc)
        plotPhLog1 <- pheatmap(
          l2fcolmatrix,
          filename = outHeatMapOverallL2fc,
          cellwidth = 20,
          main = "Clustering Log2FC",
          cluster_cols = FALSE,
          clustering_method = "average",
          fontfamily = "Helvetica",
          show_colnames = FALSE,
          fontsize = 6,
          fontsize_row = 3,
          fontsize_col = 10,
          border_color = NA
        )
        print(plotPhLog1)
      }else{
        plotPhLog2 <- pheatmap(
          l2fcolmatrix,
          cellwidth = 20,
          main = "Clustering Log2FC",
          cluster_cols = FALSE,
          clustering_method = "average",
          fontfamily = "Helvetica",
          show_colnames = FALSE,
          fontsize = 6,
          fontsize_row = 3,
          fontsize_col = 10,
          border_color = NA
        )
        print(plotPhLog2)
      }

      # Only significant pvalues
      if(verbose) message("--- Only significant changes ")
      imputedDFsig <- imputedDF[which(imputedDF$iPvalue < 0.05), ]
      ##LEGACY
      # l2fcolSignificants <- data.table::dcast(data = imputedDFsig, 
      #                                         Protein ~ Label, 
      #                                         value.var = 'iLog2FC')
      l2fcolSignificants <- imputedDFsig %>% tidyr::pivot_wider(id_cols = Protein, 
                                                 names_from = Label, 
                                                 values_from = iLog2FC)
      l2fcolSignificants <- as.data.frame(l2fcolSignificants)
      rownames(l2fcolSignificants) <- l2fcolSignificants$Protein
      l2fcolSignificants <- within(l2fcolSignificants, rm(Protein))
      l2fcolSignificants[is.na(l2fcolSignificants)] <- 0
      
      l2fcolSignificantsmatrix <- data.matrix(l2fcolSignificants)

      if(printPDF){
        outHeatMapOverallL2fc <- gsub(".txt",
                                      ".clustering.log2fcSign.all-overview.pdf",
                                      log2fc_file)
        outHeatMapOverallL2fc <- paste0(output_dir, "/", outHeatMapOverallL2fc)
        plotPh1 <- pheatmap(
          l2fcolSignificantsmatrix,
          filename = outHeatMapOverallL2fc,
          cellwidth = 20,
          main = "Clustering Log2FC (p-value < 0.05)",
          cluster_cols = FALSE,
          fontfamily = "Helvetica",
          labels_row = "",
          fontsize = 6,
          fontsize_row = 8,
          fontsize_col = 8,
          border_color = NA,
          fontfamily = "Helvetica")
        print(plotPh1)
      }else{
        plotPh2 <- pheatmap(
          l2fcolSignificantsmatrix,
          cellwidth = 20,
          main = "Clustering Log2FC (p-value < 0.05)",
          cluster_cols = FALSE,
          fontfamily = "Helvetica",
          labels_row = "",
          fontsize = 6,
          fontsize_row = 8,
          fontsize_col = 8,
          border_color = NA,
          fontfamily = "Helvetica")
        print(plotPh2)
      }
    }
  }

  # ENRICHMENT OF MOST ABUNDANT PROTEINS (from IMPUTED LOG2FC values)-----
  # Let's select first significance based on pvalue, by using the iPvalue
  # we are already including the imputed pvalues...
  
  ##LEGACY
  # l2fcol4enrichment <- data.table::dcast(data = imputedDF[which(imputedDF$iPvalue < 0.05), ], 
  #                                        Protein ~ Label, 
  #                                        value.var = 'iLog2FC')
  
  l2fcol4enrichment <- imputedDF %>% 
    dplyr::filter(iPvalue < 0.05) %>%
    tidyr::pivot_wider(id_cols = Protein, 
                       names_from = Label, 
                       values_from = iLog2FC)
  
  l2fcol4enrichment <- as.data.frame(l2fcol4enrichment)
  
  ## GPROFILER=====
  if (enrich == TRUE & dim(l2fcol4enrichment)[1] > 0) {
    if(verbose) message(">> ENRICHMENT ANALYSIS OF SELECTED CHANGES USING GPROFILER ")
    
    if (dim(l2fcol4enrichment)[1] > 0) {
      # Let's melt now for enrichment analysis
      ##LEGACY
      # l2fcol4enrichment <- data.table::melt(data = l2fcol4enrichment,
      #                                     id.vars = c('Protein'),
      #                                     variable.name = "Comparisons")
      
      l2fcol4enrichment <- l2fcol4enrichment %>%
        tidyr::pivot_longer(cols = -Protein, 
                            names_to = "Comparisons", 
                            values_to = "value")
      
      l2fcol4enrichment <- as.data.frame(l2fcol4enrichment)
      
      l2fcol4enrichment <- l2fcol4enrichment[complete.cases(l2fcol4enrichment), ]
      # Now get ready for annotation
      if (grepl("ptm", isPtm)) {
        names(l2fcol4enrichment)[grep('^Protein$', names(l2fcol4enrichment))] <- 'Uniprot_PTM'
        # Take the Protein ID, but being very careful about the fluomics labeling
        l2fcol4enrichment <- .artmsExtractUniprotId(x = l2fcol4enrichment, 
                                                    uniprotPtmColumn = "Uniprot_PTM", 
                                                    newColumnName = "Protein")
        suppressMessages(
          l2fcol4enrichment <- artmsAnnotationUniprot(l2fcol4enrichment, 
                                                      'Protein', 
                                                      species)
        )
      } else{
        suppressMessages(
          l2fcol4enrichment <- artmsAnnotationUniprot(l2fcol4enrichment, 
                                                      'Protein', 
                                                      species)
        )
      }
    }
    
    if (grepl("ptm", isPtm)) {
      # l2fcol4enrichment <- 
      # within(l2fcol4enrichment, rm(Gene,Uniprot_PTM,Protein.names))
      # # Remove parties for enrichment
      # l2fcol4enrichment <- l2fcol4enrichment[grep(",", l2fcol4enrichment$Protein, invert = TRUE),]
      # Select the Uniprot ID, but keep in mind that some of them might
      # have many _ph54_ph446 before
      # l2fcol4enrichment$Protein <- 
      # gsub("^(\\S+?)_.*", "\\1", l2fcol4enrichment$Protein, perl = TRUE)
      l2fcol4enrichment <- unique(l2fcol4enrichment[c("Protein", "Gene", "Comparisons", "value")])
    }
    
    # ALL SIGNIFICANT CHANGES log2fc
    # GPROFILER
    if(verbose) message("1) Enrichment of ALL significant Changes ")
      filallsig_log2fc_long <- l2fcol4enrichment[which(abs(l2fcol4enrichment$value) >= l2fc_thres),]
    
    if (dim(filallsig_log2fc_long)[1] > 0) {
      out.mac.allsig <- gsub(".txt", "-enrich-MAC-allsignificants.txt", log2fc_file)
      out.mac.allsig <- paste0(output_dir, "/", out.mac.allsig)
      mac.allsig <- NULL
      
      tryCatch(
        mac.allsig <- artmsEnrichLog2fc(
          dataset = filallsig_log2fc_long,
          output_name = out.mac.allsig,
          species = species,
          heatmaps = TRUE,
          background = listOfGenes,
          verbose = verbose
        ), error = function(e){
          message("\n\t(Error): Enrichment is not possible! ")
          message("\tgProfiler server night be down or your internet connection is not working")
          enrich <- FALSE
          }
      )
      
      if(!is.null(mac.allsig)){
        if (dim(mac.allsig)[1] > 0) {
          write.table(
            mac.allsig,
            out.mac.allsig,
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = TRUE
          )
        }
      }

      
      if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
      
      # CORUM
      allsigComplexEnriched <- .artms_enrichForComplexes(df = filallsig_log2fc_long,
                                                         backgroundNumber = backgroundNumber)
      
      if (dim(allsigComplexEnriched)[1] > 0) {
        out.mac.allsig.corum <-  gsub(".txt",
                                      "-enrich-MAC-allsignificants-corum.txt",
                                      log2fc_file)
        out.mac.allsig.corum <- paste0(output_dir, "/", out.mac.allsig.corum)
        
        write.table(
          allsigComplexEnriched,
          out.mac.allsig.corum,
          quote = FALSE,
          sep = "\t",
          row.names = FALSE,
          col.names = TRUE
        )
      }
      # And the heatmap
      if (dim(allsigComplexEnriched)[1] > 2) {
        out.mac.allsig.corum.pdf <- gsub(".txt",
                                         "-enrich-MAC-allsignificants-corum.pdf",
                                         log2fc_file)
          
        out.mac.allsig.corum.pdf <- paste0(output_dir, "/", out.mac.allsig.corum.pdf)
        
        .artms_plotCorumEnrichment(x =  allsigComplexEnriched, 
                                   outfile = out.mac.allsig.corum.pdf, 
                                   theTitle = "MAC ALL SIGNIFICANT Protein Complex Enrichment")
          
      } else{
        message("--- (-) Not enough negative corum complexes to plot ")
      }
    } else{
      message(" ----(-) No significant hits ")
      mac.allsig <- NULL
      allsigComplexEnriched <- NULL
    }
    
    # POSITIVE log2fc
    # GPROFILER
    if(verbose) message("2) Enrichment of selected POSITIVE significant changes ")
    
    filpos_log2fc_long <- l2fcol4enrichment[which(l2fcol4enrichment$value >= l2fc_thres),]
    
    if (dim(filpos_log2fc_long)[1] > 0) {
      out.mac.pos <- gsub(".txt", "-enrich-MAC-positives.txt", log2fc_file)
      out.mac.pos <- paste0(output_dir, "/", out.mac.pos)

      mac.pos <- NULL
      tryCatch(
          mac.pos <- artmsEnrichLog2fc(
            dataset = filpos_log2fc_long,
            species = species,
            heatmaps = TRUE,
            output_name = out.mac.pos,
            background = listOfGenes,
            verbose = verbose
          ), error = function(e){
            message("\n\t(Error): Enrichment is not possible! ")
            message("\tgProfiler server night be down or your internet connection is not working")
            enrich = FALSE
          }
      )
      
      if(!is.null(mac.pos)){
        if (dim(mac.pos)[1] > 0) {
          write.table(
            mac.pos,
            out.mac.pos,
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = TRUE
          )
        }
      }
      
      if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
      
      # CORUM
      positiveComplexEnriched <- .artms_enrichForComplexes(filpos_log2fc_long, backgroundNumber)
      
      if (dim(positiveComplexEnriched)[1] > 0) {
        out.mac.pos.corum <-
          gsub(".txt",
               "-enrich-MAC-positives-corum.txt",
               log2fc_file)
        out.mac.pos.corum <-
          paste0(output_dir, "/", out.mac.pos.corum)
        write.table(
          positiveComplexEnriched,
          out.mac.pos.corum,
          quote = FALSE,
          sep = "\t",
          row.names = FALSE,
          col.names = TRUE
        )
      }
      
      # And the heatmap
      if (dim(positiveComplexEnriched)[1] > 2) {
        out.mac.pos.corum.pdf <- gsub(".txt",
                                      "-enrich-MAC-positives-corum.pdf",
                                      log2fc_file)
          
        out.mac.pos.corum.pdf <- paste0(output_dir, "/", out.mac.pos.corum.pdf)
        # out.mac.pos.corum.pdf <- 'whatever.corum.positive.pdf'
        .artms_plotCorumEnrichment(
          positiveComplexEnriched,
          out.mac.pos.corum.pdf,
          "MAC+ Protein Complex Enrichment"
        )
      } else{
        if(verbose) message("\t----(-) Not enough positive corum complexes to plot ")
      }
    } else{
      if(verbose) message("\t ------ Nothing is significant in the selected Positive log2fc ")
      mac.pos <- NULL
      positiveComplexEnriched <- NULL
    }
    
    
    # NEGATIVE log2fc
    if(verbose) message("3) Enrichment of selected NEGATIVE significant changes ")
    
    filneg_log2fc_long <- l2fcol4enrichment[which(l2fcol4enrichment$value <= -l2fc_thres),]
    
    if (dim(filneg_log2fc_long)[1] > 0) {
      out.mac.neg <- gsub(".txt", "-enrich-MAC-negatives.txt", log2fc_file)
      out.mac.neg <- paste0(output_dir, "/", out.mac.neg)
      
      mac.neg <- NULL
      tryCatch(
        mac.neg <- artmsEnrichLog2fc(
          dataset = filneg_log2fc_long,
          output_name = out.mac.neg,
          species = species,
          heatmaps = TRUE,
          background = listOfGenes,
          verbose = verbose), 
        error = function(e){
          message("\n\t(Error): Enrichment is not possible! ")
          message("\tgProfiler server night be down or your internet connection is not working")
          enrich <- FALSE
        })
      
      if(!is.null(mac.neg)){
        if (dim(mac.neg)[1] > 0) {
          write.table(
            mac.neg,
            out.mac.neg,
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = TRUE
          )
        }
      }
      
      if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
      
      negativesComplexEnriched <- .artms_enrichForComplexes(filneg_log2fc_long, backgroundNumber)
      
      if (dim(negativesComplexEnriched)[1] > 0) {
        out.mac.neg.corum <- gsub(".txt",
                                  "-enrich-MAC-negatives-corum.txt",
                                  log2fc_file)
          
        out.mac.neg.corum <- paste0(output_dir, "/", out.mac.neg.corum)
        write.table(
          negativesComplexEnriched,
          out.mac.neg.corum,
          quote = FALSE,
          sep = "\t",
          row.names = FALSE,
          col.names = TRUE
        )
      }
      
      # And the heatmap
      if (dim(negativesComplexEnriched)[1] > 2) {
        out.mac.neg.corum.pdf <- gsub(".txt",
                                      "-enrich-MAC-negatives-corum.pdf",
                                      log2fc_file)
          
        out.mac.neg.corum.pdf <- paste0(output_dir, "/", out.mac.neg.corum.pdf)
        .artms_plotCorumEnrichment(x = negativesComplexEnriched, 
                                   outfile = out.mac.neg.corum.pdf, 
                                   theTitle = "MAC- Protein Complex Enrichment")
          
      } else{
        if(verbose) message("\t-----(-) Not enough negative corum complexes to plot ")
      }
    } else{
      if(verbose) message("\t------ Nothing is significant in the NEGATIVE site of things ")
      mac.neg <- NULL
      negativesComplexEnriched <- NULL
    }
  } else{
    if(verbose) message(">> NO ENRICHMENT of CHANGES (log2fc) SELECTED ")
    mac.allsig <- NULL
    mac.pos <- NULL
    mac.neg <- NULL
  }
  
  # Superunified data version-----
  superunified <- merge(abundancelongsummean,
                        nbr_long,
                        by = c('Bait', 'Prey'),
                        all = TRUE)
  superunified <- merge(superunified,
                        reprocondition2merge,
                        by = 'Prey',
                        all = TRUE)
  superunified <- merge(superunified,
                        reprospec2merge,
                        by = 'Prey',
                        all = TRUE)
  
  if (grepl("ptm", isPtm)) {
    names(superunified)[grep('^Prey$', names(superunified))] <- 'Uniprot_PTM'
    # Take the Protein ID, but being very careful about the fluomics labeling
    superunified <- .artmsExtractUniprotId(x = superunified, 
                                           uniprotPtmColumn = "Uniprot_PTM", 
                                           newColumnName = "Prey")
  }
  
  suppressMessages(
    superunified <- artmsAnnotationUniprot(superunified, 'Prey', species)
    )
  
  # Rename (before it was just a lazy way to use another code)
  names(superunified)[grep('Bait', names(superunified))] <- 'Condition'
  
  # ANNOTATE SPECIE
  if(verbose) message("--- Annotating species(s) in files ")
  superunified <- artmsAnnotateSpecie(superunified, pathogen, species)

  if (grepl("ptm", isPtm)) {
    if(verbose) message(">> GENERATING EXTENDED DETAILED VERSION OF PH-SITE ")
    imputedDFext <- artmsGeneratePhSiteExtended(df = imputedDF,
                                                pathogen = pathogen,
                                                species = species,
                                                ptmType = isPtm,
                                                output_name = paste0(output_dir, "/", log2fc_file))
  }
  
  # Final OUTPUT FILES------
  if(verbose) message(">> GENERATING FINAL OUTPUT FILES ")
  if (grepl("ptm", isPtm)) {
    names(imputedDF)[grep('Protein', names(imputedDF))] <- 'Uniprot_PTM'
    imputedDF$UniprotID <- imputedDF$Uniprot_PTM
    # The virus labeling has to be taken into account 
    # when getting the uniprot id:
    imputedDF <- .artmsExtractUniprotId(x = imputedDF, 
                                        uniprotPtmColumn = "Uniprot_PTM", 
                                        newColumnName = "UniprotID")
    suppressMessages(
      imputedDF <- artmsAnnotationUniprot(imputedDF, 
                                          'UniprotID', 
                                          species)
      )
                                                         
    names(imputedDF)[grep("Label", names(imputedDF))] <- 'Comparison'
    
    imputedDF <- artmsAnnotateSpecie(imputedDF, pathogen, species)
    
    # Wide version of imputedDF
    ## LEGACY NOT TESTED!
    # imputedDF_wide_log2fc <- data.table::dcast(
    #   data = imputedDF,
    #   Gene + Protein + EntrezID + Uniprot_PTM ~ Comparison,
    #   value.var = 'iLog2FC',
    #   fill = 0
    # )
    
    imputedDF_wide_log2fc <- imputedDF %>%
      tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID, Uniprot_PTM), 
                         names_from = Comparison, 
                         values_from = iLog2FC)
      
    ##LEGACY NOT TESTED!
    # imputedDF_wide_pvalue <-
    #   data.table::dcast(
    #     data = imputedDF,
    #     Gene + Protein + EntrezID + Uniprot_PTM ~ Comparison,
    #     value.var = 'iPvalue',
    #     fill = 0
    #   )
    
    imputedDF_wide_pvalue <- imputedDF %>%
      tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID, Uniprot_PTM), 
                         names_from = Comparison, 
                         values_from = iPvalue)
    
  } else if (isPtm == "global") {
    suppressMessages(
      imputedDF <- artmsAnnotationUniprot(x = imputedDF, 
                                          columnid = 'Protein', 
                                          species =  species)
      )
                                                         
    names(imputedDF)[grep("Label", names(imputedDF))] <- 'Comparison'
    
    imputedDF <- artmsAnnotateSpecie(imputedDF, pathogen, species)
    
    # Wide version of imputedDF
    ##LEGACY
    # imputedDF_wide_log2fc <- data.table::dcast(data = imputedDF,
    #                                            Gene + Protein + EntrezID ~ Comparison,
    #                                            value.var = 'iLog2FC',
    #                                            fill = 0)
    
    imputedDF_wide_log2fc <- imputedDF %>%
      tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID), 
                         names_from = Comparison, 
                         values_from = iLog2FC)
    
    # LEGACY
    # imputedDF_wide_pvalue <- data.table::dcast(data = imputedDF,
    #                                            Gene + Protein + EntrezID ~ Comparison,
    #                                            value.var = 'iPvalue',
    #                                            fill = 0)
    
    imputedDF_wide_pvalue <- imputedDF %>%
      tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID), 
                         names_from = Comparison, 
                         values_from = iPvalue)
      
      
  } else{
    stop(" WRONG isPTM SELECTED. 
         OPTIONS AVAILABLE: global, ptmph, ptmsites ")
  }

  # PLOTS: boxplot of relative quantification------
  if(plotTotalQuant){
    if(verbose) message(">> PLOT OUT: TOTAL NUMBER OF PROTEINS/SITES QUANTIFIED")
    numimputedfinal <- gsub(".txt", ".TotalQuantifications.pdf", log2fc_file)
    numimputedfinal <- paste0("plot.", numimputedfinal)
    numimputedfinal <- paste0(output_dir, "/", numimputedfinal)
    if(printPDF) pdf(numimputedfinal)
    .artms_plotNumberProteinsImputedLog2fc(imputedDF)
    if(printPDF) garbage <- dev.off()
  }
  
  # CLUSTERING analysis of quantifications-----
  if(plotClusteringAnalysis){
    if (isPtm == "global") {
      if(verbose) message(">> CLUSTERING ANALYSIS OF QUANTIFICATIONS ")
      
      # PRE-CLUSTERING
      # GET THE LIST OF SIGNIFICANTS FOR THE EXPERIMENT(S)
      list_of_significants <- unique(imputedDF$Protein[which(abs(imputedDF$iLog2FC > 1) &
                                                               imputedDF$iPvalue < 0.05)])
        
      
      # AND APPLY THE FILTER
      data.select <- imputedDF[which(imputedDF$Protein %in% list_of_significants), ]
      
      # GENE BASED -> heatmaps
      ##LEGACY
      # hasdc <- data.table::dcast(data = data.select[which(data.select$imputed == "no"), ],
      #                            Gene + Protein ~ Comparison,
      #                            value.var = "iLog2FC",
      #                            fun.aggregate = median,
      #                            fill = 0)
      # EXPERIMENT BASED -> PCA
      # hasdcexp <- data.table::dcast(data = data.select[which(data.select$imputed == "no"), ],
      #                               Comparison ~ Gene + Protein,
      #                               value.var = "iLog2FC",
      #                               fun.aggregate = median,
      #                               fill = 0)
      
     
      hasdc <- data.select %>% dplyr::filter(imputed == "no") %>%
        tidyr::pivot_wider(id_cols = c(Gene, Protein), 
                           names_from = Comparison, 
                           values_from = iLog2FC, 
                           values_fn = list(iLog2FC = median), 
                           values_fill = list(iLog2FC = 0))
      
      if(dim(hasdc)[1] > 50){
        hasdcexp <- data.select %>% dplyr::filter(imputed == "no") %>%
          dplyr::mutate(Gene_Protein = paste(Gene, Protein, sep = "_")) %>%
          tidyr::pivot_wider(id_cols = Comparison, 
                             names_from = Gene_Protein, 
                             values_from = iLog2FC, 
                             values_fn = list(iLog2FC = median),
                             values_fill = list(iLog2FC = 0))
        
        hasdc <- as.data.frame(hasdc)
        hasdcexp <- as.data.frame(hasdcexp)
        
        
        # CLUSTERING ANALYSIS
        # GENE BASED
        rownames(hasdc) <- paste0(hasdc$Gene, "_", hasdc$Protein)
        vamos <- within(hasdc, rm(Gene, Protein))
        
        # if this dataset only have one comparison,
        # this analysis does not makes sense: check it out:
        if ( dim(vamos)[2] > 2 ) {
          venga <- as.matrix(vamos)
          
          # EXPERIMENT BASED
          rownames(hasdcexp) <- hasdcexp$Comparison
          vamosexp <- within(hasdcexp, rm(Comparison))
          vengaexp <- as.matrix(vamosexp)
          
          # PCA AND CORRELATION ANALYSIS
          if(verbose) message("--- Correlation plots ")
          df.cor.matrix <- round(cor(venga, use = "pairwise.complete.obs"), 2)
          file_corr_l2fc <- gsub(".txt", ".log2fc-corr.pdf", log2fc_file)
          file_corr_l2fc <- paste0(output_dir, "/", file_corr_l2fc)
          if(printPDF) pdf(file_corr_l2fc, width = 12, height = 9)
            corrplot::corrplot(
            df.cor.matrix,
            type = "upper",
            tl.pos = "td",
            method = "circle",
            tl.cex = 0.9,
            tl.col = 'black',
            tl.srt = 45,
            # order = "hclust",
            diag = TRUE
          )
          PerformanceAnalytics::chart.Correlation(venga,
                                                  histogram = TRUE,
                                                  pch = 25,
                                                  main = "Correlation between Comparisons")
          if(printPDF) garbage <- dev.off()
          
          # BASED ON GROUPS
          pca.hasdcexp <- FactoMineR::PCA(
            hasdcexp[, -c(1)],
            scale.unit = FALSE,
            ncp = 4,
            graph = FALSE
          )
          
          pca_all <- factoextra::fviz_pca_ind(pca.hasdcexp,
                                              labelsize = 3,
                                              repel = TRUE,
                                              habillage = as.factor(hasdcexp$Comparison),
                                              addEllipses = FALSE,
                                              ellipse.level = 0.95
          )
          
          
          if(verbose) message("--- PCA, individuals plot ")

          if(printPDF){
            file_pca_l2fc <- gsub(".txt", ".log2fc-individuals-pca.pdf", log2fc_file)
            file_pca_l2fc <- paste0(output_dir, "/", file_pca_l2fc)
            pdf(file_pca_l2fc, width = 9, height = 7)
            print(pca_all)
            garbage <- dev.off()  
          } else{
            print(pca_all)
          }
          
          # Determine the OPTIMAL NUMBER OF CLUSTERS:
          
          # Elbow method
          e1 <- factoextra::fviz_nbclust(venga, kmeans, method = "wss") +
            geom_vline(xintercept = 4, linetype = 2) +
            labs(subtitle = "kmeans Elbow method")
          
          e2 <- factoextra::fviz_nbclust(venga, cluster::pam, method = "wss") +
            geom_vline(xintercept = 4, linetype = 2) +
            labs(subtitle = "PAM Elbow method")
          
          # Silhouette method
          k1 <- factoextra::fviz_nbclust(venga, kmeans, method = "silhouette") +
            labs(subtitle = "kmeans Silhouette method")
          
          k2 <- factoextra::fviz_nbclust(venga, cluster::pam, method = "silhouette") +
            labs(subtitle = "pam Silhouette method")
          
          # Create a dendrogram
          if(verbose) message("--- Dendrogram ")
          res.dist <- factoextra::get_dist(vamosexp, stand = TRUE, method = "minkowski")
          hc <- hclust(res.dist)
          file_dendro_l2fc <- gsub(".txt", ".log2fc-dendro.pdf", log2fc_file)
          file_dendro_l2fc <- paste0(output_dir, "/", file_dendro_l2fc)
          if(printPDF){
            pdf(file_dendro_l2fc, width = 9, height = 7)
            plot(hc)
            garbage <- dev.off()
          }else{
            plot(hc)
          }

          
          # COMPLEXHEATMAP Heatmap with a specified number of optimal clusters
          n = 10
          pam.res <- pam(vamos, k = n)
          
          cp1 <- factoextra::fviz_cluster(pam.res)
          cp2 <- factoextra::fviz_silhouette(pam.res, print.summary = FALSE)
          
          if(verbose) message("--- Plots to determine optimal number of clusters ")
          file_clusterplots_l2fc <- gsub(".txt", ".log2fc-clusters.pdf", log2fc_file)
          file_clusterplots_l2fc <- paste0(output_dir, "/", file_clusterplots_l2fc)
          
          if(printPDF){
            pdf(file_clusterplots_l2fc, width = 9, height = 7)
            print(e1)
            print(e2)
            print(k1)
            print(k2)
            print(cp1)
            print(cp2)
            garbage <- dev.off()
          } else{
            print(e1)
            print(e2)
            print(k1)
            print(k2)
            print(cp1)
            print(cp2)
          }
          
          if(verbose) message("--- Cluster heatmaps (10 clusters) ")
          
          hmap <- ComplexHeatmap::Heatmap(venga,
                                          name = paste0("Clusters ", "(n = ", n, ")"),
                                          col = circlize::colorRamp2(c(-3, 0, 3), c("firebrick1", "black", "olivedrab1")),
                                          heatmap_legend_param = list(color_bar = "continuous",
                                                                      legend_direction = "horizontal",
                                                                      legend_width = unit(5, "cm"),
                                                                      title_position = "topcenter",
                                                                      title_gp = gpar(fontsize = 15, fontface = "bold")),
                                          split = paste0("", pam.res$clustering),
                                          row_title = "Genes",
                                          row_title_side = "left",
                                          row_title_gp = gpar(fontsize = 15, fontface = "bold"),
                                          show_row_names = FALSE,
                                          column_title = "Relative Quantifications",
                                          column_title_side = "top",
                                          column_title_gp = gpar(fontsize = 10, fontface = "bold"),
                                          column_title_rot = 0,
                                          show_column_names = TRUE,
                                          cluster_columns = FALSE,
                                          clustering_distance_columns = function(x) as.dist(1 - cor(t(x))),
                                          clustering_method_columns = "ward.D2",
                                          clustering_distance_rows = "euclidean",
                                          clustering_method_rows = "ward.D2",
                                          row_dend_width = unit(30, "mm"),
                                          column_dend_height = unit(30, "mm"),
                                          # top_annotation=colAnn,
                                          # top_annotation_height = unit(1.75, "cm"),
                                          # bottom_annotation=sampleBoxplot,
                                          # bottom_annotation_height = unit(4, "cm"),
                                          column_names_gp = gpar(fontsize = 10)
          )
          
          file_clusterheat_l2fc <- gsub(".txt", ".log2fc-clusterheatmap.pdf", log2fc_file)
          file_clusterheat_l2fc <- paste0(output_dir, "/", file_clusterheat_l2fc)
          if(printPDF) pdf(file_clusterheat_l2fc,
              width = 12,
              height = 10)
          
          ComplexHeatmap::draw(hmap,
                               heatmap_legend_side = "top",
                               annotation_legend_side = "right")
          if(printPDF) garbage <- dev.off()
          
          # Pre enrichment of clusters
          cl_number <- pam.res$clustering
          dfclusters <- as.data.frame(cl_number)
          dfclusters$ids <- row.names(dfclusters)
          dfclusters$Gene <- gsub("(.*)(_)(.*)", "\\1", dfclusters$ids)
          dfclusters$Protein <- gsub("(.*)(_)(.*)", "\\3", dfclusters$ids)
          
          if(enrich){
            if(verbose) message("--- Enrichment analysis of the clusters ")
            # Making sure we have unique genes in each comparison 
            # (the PTM might bring redundancy)
            pretmp <- dfclusters[c('Gene', 'cl_number')]
            pretmp <- unique(pretmp)
            
            tmp <- split(pretmp$Gene, pretmp$cl_number, drop = TRUE)
            enrichgenes <- NULL
            
            if (species == "human") {
              tryCatch(enrichgenes <- artmsEnrichProfiler(tmp,
                                                          categorySource = c(
                                                            'GO:BP',
                                                            'GO:MF',
                                                            'GO:CC',
                                                            'KEGG',
                                                            'REAC',
                                                            'CORUM',
                                                            'HPA',
                                                            'OMIM'
                                                          ),
                                                          species = 'hsapiens',
                                                          listOfGenes,
                                                          verbose = verbose), 
                       error = function(e){
                         message("\n\t(Error): Enrichment is not possible! ")
                         message("\tgProfiler server night be down or your internet connection is not working")
                         enrich <- FALSE
                       }
              )
              
            } else if (species == "mouse") {
              tryCatch(enrichgenes <- artmsEnrichProfiler(tmp,
                                                          categorySource = c('GO:BP', 
                                                                             'GO:MF', 
                                                                             'GO:CC', 
                                                                             'KEGG', 
                                                                             'REAC', 
                                                                             'CORUM'),
                                                          species = 'mmusculus',
                                                          listOfGenes,
                                                          verbose = verbose),
                       error = function(e){
                         message("\n\t(Error): Enrichment is not possible! ")
                         message("\tgProfiler server night be down or your internet connection is not working")
                         enrich <- FALSE
                       }
              )
            } else{
              stop(species, " is currently not supported in the enrichment")
            }
            
            # Write the file
            if(!is.null(enrichgenes)){
              file_clusterheatenrich_l2fc <- gsub(".txt",
                                                  ".log2fc-clusterheatmap-enriched.txt",
                                                  log2fc_file)
              
              file_clusterheatenrich_l2fc <- paste0(output_dir, "/", file_clusterheatenrich_l2fc)
              write.table(enrichgenes,
                          file_clusterheatenrich_l2fc,
                          col.names = TRUE,
                          row.names = FALSE,
                          sep = "\t",
                          quote = FALSE)
            }
          } #if enrich = TRUE
          
          # Print out clusters
          file_clusterheatdata_l2fc <- gsub(".txt", ".log2fc-clusterheatmap.txt", log2fc_file)
          file_clusterheatdata_l2fc <- paste0(output_dir, "/", file_clusterheatdata_l2fc)
          write.table(
            dfclusters,
            file_clusterheatdata_l2fc,
            col.names = TRUE,
            row.names = FALSE,
            sep = "\t",
            quote = FALSE
          )
        }
        
      }else{
        message("--- (-) Not enought data available to perform clustering analysis")
      }
    } # if (isPtm == "global") {
  }
  # End of clustering analysis
  
  # END: WRITING OUTPUT FILES------
  if(verbose) message(">> WRITING THE OUTPUT FILES")
  
  # PRINT OUT IMPUTED
  outlog2fcImpute <- gsub(".txt", "-log2fc-long.txt", log2fc_file)
  outlog2fcImpute <- paste0(output_dir, "/", outlog2fcImpute)
  write.table(
    imputedDF,
    outlog2fcImpute,
    quote = FALSE,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  
  outlog2fc <- gsub(".txt", "-log2fc-wide.txt", log2fc_file)
  outlog2fc <- paste0(output_dir, "/", outlog2fc)
  write.table(
    log2fc_file_splc,
    outlog2fc,
    quote = FALSE,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  
  outexcel <- gsub(".txt", "-summary.xlsx", log2fc_file)
  outexcel <- paste0(output_dir, "/", outexcel)
  
  if(is.null(mac.allsig) | is.null(mac.pos) | is.null(mac.neg))
    enrich <- FALSE
  
  if (enrich) {
    # But now check whether is a PTM case:
    if (grepl("ptm", isPtm)) {
      list_of_datasets <- list(
        "log2fcImputed" = imputedDF,
        "log2fcImpExt" = imputedDFext,
        "wide_iLog2fc" = imputedDF_wide_log2fc,
        "wide_iPvalue" = imputedDF_wide_pvalue,
        "enrichALL" = mac.allsig,
        "enrichMACpos" = mac.pos,
        "enrichMACneg" = mac.neg,
        "enMACallCorum" = allsigComplexEnriched,
        "enMACposCorum" = positiveComplexEnriched,
        "enMACnegCorum" = negativesComplexEnriched
      )
    } else if (grepl("global", isPtm)) {
      list_of_datasets <- list(
        "log2fcImputed" = imputedDF,
        "wide_iLog2fc" = imputedDF_wide_log2fc,
        "wide_iPvalue" = imputedDF_wide_pvalue,
        "enrichALL" = mac.allsig,
        "enrich-MACpos" = mac.pos,
        "enrich-MACneg" = mac.neg,
        "enMACallCorum" = allsigComplexEnriched,
        "enMACposCorum" = positiveComplexEnriched,
        "enMACnegCorum" = negativesComplexEnriched
      )
      
    } else{
      stop("Wrong option:", isPtm)
    }
  } else if (!enrich) {
    if (grepl("ptm", isPtm)) {
      list_of_datasets <- list(
        "log2fcImputed" = imputedDF,
        "log2fcImpExt" = imputedDFext,
        "wide_iLog2fc" = imputedDF_wide_log2fc,
        "wide_iPvalue" = imputedDF_wide_pvalue
      )
    } else if (grepl("global", isPtm)) {
      list_of_datasets <- list(
        "log2fcImputed" = imputedDF,
        "wide_iLog2fc" = imputedDF_wide_log2fc,
        "wide_iPvalue" = imputedDF_wide_pvalue
      )
    } else{
      stop(isPtm, " is not a valid option")
    }
  } else{
    stop("Contact developers")
    # The script should have crashed by this point. 
    # If it gets up to here... it would be very weird
  }
  
  ## EXCEL: Defining style for the header-----
  hs <- openxlsx::createStyle(fontName = "Arial",
                              fontColour = "white",
                              fgFill = "#000000",
                              textDecoration = "Bold",
                              border = "Bottom")
  openxlsx::write.xlsx(
    list_of_datasets,
    file = outexcel,
    asTable = FALSE,
    headerStyle = hs, 
    overwrite = TRUE
  )
  
  if(verbose) message("Folder <", output_dir, "> ")
  if(verbose) message("- EXCEL: ", outexcel, " ")
  if(verbose) message("- Log2fc Wide: ", outlog2fc, " ")
  if(verbose) message("- Log2fc Impute: ", outlog2fc, " ")
  
  if (enrich == TRUE) {
    if(verbose) message("- ENRICHMENT files should also be out ")
  }
  
  if(verbose) message(">> SUPER ANALYSIS COMPLETED")
}


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Adding a column with the species name
#'
#' @description Adding the species name to every protein.
#' This makes more sense if there are more than one species in the dataset,
#' which must be specified in the `pathogen` option. Influenza is a special
#' case that it does not need to be specified, as far as the proteins were
#' originally annotated as `INFLUENZAGENE_STRAIN`
#' (strains covered `H1N1`, `H3N2`, `H5N1`), as for example, `NS1_H1N1`
#' @param df (data.frame) with a `Protein` column (of uniprot ids)
#' @param pathogen (char) Is there a pathogen in the dataset as well?
#' if it does not, then use `pathogen = nopathogen` (default). Supported`tb` 
#' (Tuberculosis),
#' `lpn` (Legionella)
#' @param species (char) Host organism (supported for now: `human` or `mouse`)
#' @return (data.frame) The same data.frame but with an extra column 
#' specifying the species
#' @keywords annotation, species
#' @examples
#' # Adding a new column with the main species of the data. Easy.
#' # But the main functionality is to add both the host-species and a pathogen,
#' # which is not illustrated in this example
#' data_with_specie <- artmsAnnotateSpecie(df = artms_data_ph_msstats_results,
#'                                          species = "human")
#' @export
artmsAnnotateSpecie <- function(df,
                                 pathogen = "nopathogen",
                                 species) {
  
  if(any(missing(df) | missing(species)))
    stop("Missed (one or many) required argument(s).
         Please, check the help of this function to find out more")
  
  if (pathogen == "nopathogen") {
    # Influenza is treated differently
    df$Species <-
      ifelse(grepl("_H1N1|_H3N2|_H5N1", df$Protein),
             "Influenza",
             species)
  } else if (pathogen == "tb") {
    pathogen.ids <- artms_data_pathogen_TB
    df$Species <-
      ifelse(df$Protein %in% pathogen.ids$Entry, pathogen, species)
  } else if (pathogen == "lpn") {
    pathogen.ids <- artms_data_pathogen_LPN
    df$Species <-
      ifelse(df$Protein %in% pathogen.ids$Entry, pathogen, species)
  }
  return(df)
}

# 
#' @title Generate ph-site specific detailed file
#'
#' @description Generate extended detailed ph-site file, where every line is a
#' ph site instead of a peptide. Therefore, if one peptide has multiple ph sites
#' it will be breaking down in each of the sites. This file will help generate
#' input files for tools as [Phosfate](http://phosfate.com/) or
#' [PHOTON](https://github.com/jdrudolph/photon)
#' @param df (data.frame) of log2fc and imputed values
#' @param pathogen (char) Is there a pathogen in the dataset as well? Available
#' pathogens are `tb` (Tuberculosis), `lpn` (Legionella). If it is not,
#' then use `nopathogen` (default).
#' @param species (char) Main organism (supported for now: `human` or `mouse`)
#' @param ptmType (char) It must be a ptm-site quantification dataset. Either:
#' yes: `ptmsites` (for site specific analysis), or
#' `ptmph` (Jeff's script output evidence file).
#' @param output_name (char) A output file name (extension `.txt` required)
#' @return (data.frame) extended version of the ph-site
#' @keywords external, tools, phosfate
#' @examples \dontrun{
#' artmsGeneratePhSiteExtended(df = dfobject, 
#'                              species = "mouse", 
#'                              ptmType = "ptmsites",
#'                              output_name = log2fc_file)
#' }
#' @export
artmsGeneratePhSiteExtended <- function(df,
                                        pathogen = "nopathogen", 
                                        species, 
                                        ptmType,
                                        output_name) {
                                         
  # #Debug
  # df = artms_data_ph_msstats_results
  # pathogen = "nopathogen"
  # species = "human"
  # ptmType = "ptmsites"
  # output_name = ""
  
  if(any(missing(df) | 
         missing(species) | 
         missing(ptmType) | 
         missing(output_name)))
    stop("Missed (one or many) required argument(s)
         Please, check the help of this function to find out more")
    
    imputedDFext <- NULL
    
    imputedDFext <- .artms_checkIfFile(input_file = df)
    
    if (ptmType == "ptmph") {
      # Change the Protein name to Uniprot_PTM (if it is not there already)
      if(!any(grepl("Uniprot_PTM", colnames(imputedDFext)))) 
        names(imputedDFext)[grep('^Protein$', names(imputedDFext))] <- 'Uniprot_PTM'
      
      # Take the Protein ID, but being very careful about the fluomics labeling
      imputedDFext <- 
        .artmsExtractUniprotId(x = imputedDFext, 
                               uniprotPtmColumn = "Uniprot_PTM", 
                               newColumnName = "Protein")
      
      # Extract sites from Uniprot_PTM
      imputedDFext$PTMsite <-
        gsub("^(\\S+?)(_ph.*)", "\\2", imputedDFext$Uniprot_PTM, perl = TRUE)
      imputedDFext$PTMsite <- gsub("^(_ph)", "", imputedDFext$PTMsite)
      imputedDFext$PTMsite <- gsub("_ph", ",", imputedDFext$PTMsite)
      
      # And create independent columns for each of them
      imputedDFext <- imputedDFext %>% mutate(PTMsite = strsplit(PTMsite, ",")) %>% tidyr::unnest(PTMsite)
      
      if( !any(grepl("Gene", colnames(imputedDFext))) ){
        suppressMessages(
          imputedDFext <- artmsAnnotationUniprot(imputedDFext, 
                                                 'Protein', 
                                                 species)
        )
      }
      
      if( any(grepl("Label", colnames(imputedDFext))) ){
        names(imputedDFext)[grep("^Label$", names(imputedDFext))] <- 'Comparison'
      }

      imputedDFext <- artmsAnnotateSpecie(imputedDFext, pathogen, species)
    } else if (ptmType == "ptmsites") {
      #1. Change the Protein name to Uniprot_PTM (if it is not there already)
      if(!any(grepl("Uniprot_PTM", colnames(imputedDFext)))) 
        names(imputedDFext)[grep('^Protein$', names(imputedDFext))] <- 
        'Uniprot_PTM'
      
      # 2. Make a copy of Uniprot_PTM to operate on it
      imputedDFext$PTMone <- imputedDFext$Uniprot_PTM
      
      # 3. Create independent columns for each of them
      imputedDFext <- imputedDFext %>% 
        dplyr::mutate(PTMone = strsplit(PTMone, ",")) %>% 
        tidyr::unnest(PTMone)
      
      # 4. And take the labels:
      imputedDFext$Protein <-
        ifelse(
          grepl("_H1N1|_H3N2|_H5N1", imputedDFext$PTMone),
          gsub("^(\\S+?_H[1,3,5]N[1,2])_.*", "\\1",
               imputedDFext$PTMone, perl = TRUE
          ) ,
          gsub("^(\\S+?)_.*", "\\1", imputedDFext$PTMone, perl = TRUE)
        )
      imputedDFext$PTMaa <-
        gsub("(\\S+)(_)([S,T,Y,K])(\\d+)", "\\3", imputedDFext$PTMone)
      imputedDFext$PTMsite <-
        gsub("(\\S+)(_)([S,T,Y,K])(\\d+)", "\\4", imputedDFext$PTMone)
      
      imputedDFext$PTMone <- NULL
      
      if( !any(grepl("Gene", colnames(imputedDFext))) ) {
        suppressMessages(
          imputedDFext <- artmsAnnotationUniprot(imputedDFext, 'Protein', species)
        )
      }
      
      if( any(grepl("Label", colnames(imputedDFext))) ) {
        names(imputedDFext)[grep("^Label$", names(imputedDFext))] <- 'Comparison'
      }
      
      # imputedDFext$Species <- ifelse(grepl("_H1N1|_H3N2|_H5N1", 
      # imputedDFext$Protein), "Influenza", species)
      # imputedDFext$Species <- ifelse(imputedDFext$Protein 
      # %in% pathogen.ids$Entry, pathogen, species)
      
      if( !any(grepl("Species", colnames(imputedDFext))) ){
        suppressMessages(
          imputedDFext <- artmsAnnotateSpecie(imputedDFext, "Protein", species)
        )
      }
    }else{
      stop( "--- (!) Only 'ptmph' or 'ptmsites' allowed for argument <ptmType> ")
    }
      
    outlog2fcImputext <- gsub(".txt", "-imputedL2fcExtended.txt", output_name)
    
    write.table(
      imputedDFext,
      outlog2fcImputext,
      quote = FALSE,
      sep = "\t",
      row.names = FALSE,
      col.names = TRUE
    )
    return(imputedDFext)
  }


# 
# @title Imputing missing values
#
# @description When a value is completely missed in one of the conditions,
# the `log2fc = Inf / -Inf`. This function imputes those values, i.e.,
# will assign 'artificial' values.
# The imputation method works as follow. The assumption is that those
# proteins are likely present as well in those conditions where are found
# missed, but due to the small sampling (usually 2 or 3 biological replicas)
# and other proteomics related issues, those proteins didn't make it through
# the level of detection.
# Therefore, a small intensity (sampled from the bottom 10 intensity values)
# will be assigned to the protein/site in the missing condition,
# and the new log2fc is re-calculated out of the MSstats box.
# Two issues are addressed as follows:
# 1. If a protein has been consistently identified in one of the conditions,
# it will stay
# 2. But if the intensity value in those conditions was too low,
# then the log2fc will be also low
# @param dflog2fcinfinites data.frame of proteins with only infinite
# values from the msstats results file
# @param dfmq Abundance data, which will be used to know the details of
# reproducibility
# @return Imputed missing values
# @keywords internal, imputation, log2fc, quantifications, missing values
.artms_imputeMissingValues <- function(dflog2fcinfinites, 
                                       dfmq) {
  
  # The comparsions
  contrast <- unique(dflog2fcinfinites$Label)
  
  # Select the IDs to impute
  ids2impute <- unique(dflog2fcinfinites$Protein)
  
  # Take the abundance values for all the proteins
  abu2imp <- .artms_loadModelqcBasic(dfmq)
  
  # Aggregate the technical replica by choosing the maximum value
  abu2imp <- aggregate(Abundance ~ Protein + Condition + BioReplicate,
                        data = abu2imp,
                        FUN = mean)
    
  
  # Check things that will be imputed
  # dfdc.ni <- data.table::dcast(data=abu2imp2, 
  # Protein~BioReplicate, value.var = "Abundance")
  
  # Two possible options here.
  # 1. Select based on the bottom x%
  # # Imputing the missing values by selecting randomly from the bottom 5%
  # theMin <- min(dfmq$ABUNDANCE)
  # # Select the 5% quartile as the maximum value to sample from
  # theMax <- quantile(dfmq$ABUNDANCE, probs = .05)
  #
  # 2. Select the bottom 20 intensities
  # Grab the bottom 30 intensities in the dataset
  dfmqOrdered <- dfmq[order(dfmq$ABUNDANCE, decreasing = FALSE), ]
  
  numberFromBottom <- 5
  abuBottom <- head(dfmqOrdered$ABUNDANCE, n = numberFromBottom)
  
  theMin <- abuBottom[1]
  theMax <- abuBottom[numberFromBottom]
  
  # Generating the numbers from which we are going to sample
  numbers2sample <- seq(from = theMin, to = theMax, by = .00001)
  
  ##LEGACY
  # # dcast on abundance and fill with random numbers between the minimum and q05
  # suppressWarnings(
  #   dfdc.im <- data.table::dcast(data = abu2imp2,
  #                                Protein ~ BioReplicate,
  #                                value.var = "Abundance",
  #                                fill = sample(numbers2sample, 
  #                                              replace = FALSE)
  #   )
  # )
  
  dfdc.im <- abu2imp %>% 
    dplyr::select(-Condition) %>%
    tidyr::pivot_wider(names_from = BioReplicate,
                       values_from = Abundance)
  
  dfdc.im <- as.data.frame(dfdc.im)

  dfdc.im[] <- Map(function(x, y) replace(x, is.na(x), y), 
                   dfdc.im, 
                   sample(numbers2sample, 
                          size = dim(dfdc.im)[2], 
                          replace = FALSE))

  # Needs to aggregate on biological replicas
  # 1. Melt on biological replicas
  # Legacy
  # dfdc.melt <- data.table::melt(dfdc.im,
  #                               id.vars = c('Protein'),
  #                               value.name = 'Abundance',
  #                               variable.name = 'BioReplicate')
  
  dfdc.melt <- dfdc.im %>% 
    tidyr::pivot_longer(cols = -c(Protein), 
                        names_to = "BioReplicate", 
                        values_to = "Abundance")

  # 2. Get the condition
  dfdc.melt$Condition <- gsub("(.*)(-)(.*)", "\\1", dfdc.melt$BioReplicate)
    
  # 3. Dcast and Aggregate on the condition, taking the mean
  ##LEGACY
  # dfdc.final <-
  #   data.table::dcast(
  #     data = dfdc.melt,
  #     Protein ~ Condition,
  #     value.var = 'Abundance',
  #     fun.aggregate = mean
  #   )
  
  dfdc.final <- dfdc.melt %>% 
    dplyr::select(-BioReplicate) %>%
    tidyr::pivot_wider(names_from = Condition, 
                values_from = Abundance,
                values_fn = list(Abundance = mean))
                                           
  # 4. Filter by proteins to impute
  dfdc.final <- dfdc.final[which(dfdc.final$Protein %in% ids2impute), ]
    
  
  for (c in contrast) {
    # message("\t",c," --> ")
    x <- gsub("(.*)(-)(.*)", "\\1", c)
    y <- gsub("(.*)(-)(.*)", "\\3", c)
    # message("log2fc(",x, " - ", y,") ")
    
    # Renaming the comparision name just for illustration purposes
    rnc <- paste0("l2fc_", c)
    
    dfdc.final[[rnc]] <- dfdc.final[[x]] - dfdc.final[[y]]
  }
  
  # Select only log2fc columns
  imputedL2FCValues <- dfdc.final[grepl("Protein|l2fc_", colnames(dfdc.final))]
  
  # Melt again
  # LEGACY
  # imputedL2FCmelted <- data.table::melt(imputedL2FCValues,
  #                                       id.vars = c('Protein'),
  #                                       variable.name = 'Label',
  #                                       value.name = 'iLog2FC')
  
  imputedL2FCmelted <- imputedL2FCValues %>%
    tidyr::pivot_longer(cols = -Protein,
                        names_to = "Label",
                        values_to = "iLog2FC")
    
    
  # Now let's get it ready for merging with the values to be 
  # imputed at dflog2fcinfinites
  imputedL2FCmelted$Label <- gsub("l2fc_", "", imputedL2FCmelted$Label)
    
  
  # And let's add p-values
  samplingPvalue <- seq(from = 0.01, to = 0.05, by = .0000001)
  # And add imputed pvalues
  imputedL2FCmelted$iPvalue <- sample(samplingPvalue,
                                      size = nrow(imputedL2FCmelted),
                                      replace = FALSE)
  
  imputedL2FCmelted <- as.data.frame(imputedL2FCmelted)
  
  return (imputedL2FCmelted)
}

# 
# @title Load limited columns from abundance (modelqc) annotated
#
# @description Load limited columns from abundance (modelqc) annotated
# @param df (data.frame) with the raw abundance data (modelqc)
# @param species (char) Species name for annotation purposes
# @param ptmis (char) Specify whether is a PTM dataset: `global`, `ptmsites`,
# `ptmph`
# @param verbose (logical) `TRUE` (default) shows function messages
# @return annotated data.frame of abundance data
# @keywords abundance, annotated
.artms_loadModelQCstrict <- function (df, 
                                      species, 
                                      ptmis,
                                      verbose = TRUE) {
  
  colnames(df) <- toupper(colnames(df))
  
  # Remove empty entries
  if (any(df$PROTEIN == "")) {
    df <- df[-which(df$PROTEIN == ""), ]
  }
  df$PROTEIN <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df$PROTEIN)
  df$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", df$PROTEIN)
  
  # Technical replicas: aggregate on the mean the technical replicas
  b <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
                 data = df,
                 FUN = mean)
  
  ##LEGACY
  # datadc <- data.table::dcast(data = b,
  #                             PROTEIN ~ GROUP,
  #                             value.var = 'ABUNDANCE',
  #                             fun.aggregate = mean)
  
  datadc <- b %>% pivot_wider(id_cols = PROTEIN, 
                              names_from = GROUP, 
                              values_from = ABUNDANCE, 
                              values_fn = list(ABUNDANCE = mean))
                               
    
  
  names(datadc)[grep('PROTEIN', names(datadc))] <- 'Protein'
  
  if (grepl("ptm", ptmis)) {
    # if is a PTM dataset we don't need the real gene names for now,
    # we need to use the Uniprot_ptm notation
    datadc$Gene <- datadc$Protein
    send_back <- datadc
  } else{
    suppressMessages(
      send_back <- artmsAnnotationUniprot(x =  datadc, 
                                          columnid = 'Protein', 
                                          species = species)
      )
  }
  return(send_back)
}


#
# @title Load the basic ModelQC file
#
# @param x (data.frame) of the ModelQC file
# @return (data.frame) of the modelqc file with the columns Protein, Abundance,
# Condition, BioReplicate
# @keywords internal, loading
.artms_loadModelqcBasic <- function(x) {
  ## Not removing protein groups for now
  # if (length(grep(";", x$PROTEIN)) > 0)
  #   x <- x[-grep(";", x$PROTEIN), ]
  
  if ("PROTEIN" %in% colnames(x)) {
    names(x)[grep("PROTEIN", names(x))] <- 'Protein'
  } else{
    stop("<PROTEIN> column not found")
  }
  if ("ABUNDANCE" %in% colnames(x)) {
    names(x)[grep("ABUNDANCE", names(x))] <- 'Abundance'
  } else{
    stop("<ABUNDANCE> protein not found!")
  }
  if ("GROUP" %in% colnames(x)) {
    names(x)[grep("GROUP", names(x))] <- 'Condition'
  } else{
    stop("<GROUP> not found")
  }
  if ("SUBJECT" %in% colnames(x)) {
    names(x)[grep("SUBJECT", names(x))] <- 'BioReplicate'
  } else{
    stop("<SUBJECT> not found")
  }
  x <- subset(x, select = c(Protein, Abundance, Condition, BioReplicate))
  return(x)
}

# 
# @title Merge abundance and number of biological replicates per condition
#
# @description Merge abundance and number of biological replicates
# per condition
# @param df_input (data.frame) Abundance input file
# @param repro (data.frame) Reproducibility data.frame
# @param species (char) Species for annotation purposes
# @return (data.frame) of abundance merged with reproducibility info
# @keywords abundance, reproducibility, merging
.artms_mergeAbNbr <- function (df_input, repro, species) {
  # Remove empty entries
  if (any(df_input$PROTEIN == "")) {
    df_input <- df_input[-which(df_input$PROTEIN == ""), ]
  }
  df_input$PROTEIN <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df_input$PROTEIN)
  df_input$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", df_input$PROTEIN)
  
  # TECHNICAL REPLICAS: if there are technical replicas, 
  # this means that we will find
  # two values for the same protein in the same bioreplica, 
  # therefore we need to aggregate first just in case:
  df_input <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
                        data = df_input,
                        FUN = mean)
    
  ##LEGACY
  # dc_input <- data.table::dcast( data = df_input[, c('PROTEIN', 'ABUNDANCE', 'GROUP')],
  #                                PROTEIN ~ GROUP,
  #                                value.var = 'ABUNDANCE',
  #                                fun.aggregate = mean,
  #                                fill = 0)
  
  dc_input <- df_input %>% pivot_wider(id_cols = PROTEIN, 
                                        names_from = GROUP, 
                                        values_from = ABUNDANCE, 
                                        values_fn = list(ABUNDANCE = mean), 
                                        values_fill = list(ABUNDANCE = 0))
   
    
  names(dc_input)[grep('PROTEIN', names(dc_input))] <- 'Protein'
  
  colnames(repro) <- paste("NumBR", colnames(repro), sep = "_")
  colnames(repro)[1] <- 'Protein'
  dc_input <- merge(dc_input, repro, by = c('Protein'))
  
  return(dc_input)
}

# 
# @title Merge changes (log2fc) and number of biological replicates per
# condition
#
# @description Merge changes, i.e., MSstats results file of quantified changes,
# with the number of biological replicates per condition
# @param df_input Changes data.frame
# @param repro Reproducibility data.frame
# @param species Species for annotation purposes
# @param choosePvalue pvalue to use when making wider the dataframe
# @return Merged data.frame of changes and reproducibility information
# @keywords changes, log2fc, reproducibility, merging
.artms_mergeChangesNbr <- function (df_input, repro, species, choosePvalue) {
  # # Remove the weird empty proteins
  # if(any(df_input$Protein == "")){ df_input <- 
  # df_input[-which(df_input$Protein == ""),]}
  # df_input$Protein <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df_input$Protein )
  # df_input$Protein <- gsub("(.*)(\\|.*)", "\\1", df_input$Protein )
  
  ##LEGACY
  # input_melt = data.table::melt(data = df_input[, c('Protein', 
  #                                                 'Label', 
  #                                                 'log2FC', 
  #                                                 'adj.pvalue'), ], 
  #                               id.vars = c('Protein', 'Label'))
  
  input_melt <- df_input %>%
    dplyr::select(c(Protein, 
                    Label, 
                    log2FC, 
                    !!!choosePvalue)) %>%
    tidyr::pivot_longer(cols = -c(Protein, Label), 
                 names_to = "variable", 
                 values_to = "value")
  ##LEGACY
  # input_dcast <- data.table::dcast(Protein ~ Label + variable,
  #                                 data = input_melt,
  #                                 value.var = c('value'))
  
  input_dcast <- input_melt %>% 
    dplyr::mutate(Label_variable = paste(Label, variable, sep = "_")) %>%
    tidyr::pivot_wider(id_cols = c(Protein),
                       names_from = Label_variable, 
                       values_from = value)
  
  colnames(repro) <- paste("NumBR", colnames(repro), sep = "_")
  colnames(repro)[1] <- 'Protein'
  input_dcast <- merge(input_dcast, repro, by = c('Protein'))
  
  # Move Gene name to the left:
  return(input_dcast)
}


# @title Filter: Remove proteins below some threshold of minimal reproducibility
#
# @description If a protein is not found in a minimal number of
# biological replicates in at least one of the conditions, it is removed
# @param dfi (data.frame) Data.frame with biological replicates information
# @param mnbr (int) minimal number of biological replicates
# @return (data.frame) a filtered `dfi`
# @keywords internal, filter, bioreplicates, reproducibility
.artms_RemoveProtBelowThres <- function(dfi, mnbr) {
  theComparisons2check <- unique(dfi$Label)
  for (onlyonecomp in (theComparisons2check)) {
    ax <- gsub("(.*)(-)(.*)", "\\1", onlyonecomp)
    ay <- gsub("(.*)(-)(.*)", "\\3", onlyonecomp)
    
    # If the condition is not met, i.e., if all the proteins are
    # found in at least X biological replicas, then
    # it would remove the whole thing.
    if (dim(dfi[which((dfi[[ax]] < mnbr) & (dfi[[ay]] < mnbr)), ])[1] > 0) {
      dfi <- dfi[-which(((dfi[[ax]] < mnbr) & (dfi[[ay]] < mnbr)) & (dfi$Label == onlyonecomp)), ]
    }
  }
  return(dfi)
}

# 
# @title Select and label the condition more abundant in a quantification
#
# @description Select and label the condition more abundant in a quantification
# - If log2fc > 0 the condition on the left ('numerator') is the most abundant
# - If log2fc < 0 the condition on the right ('denominator') 
# is the most abundant
# @param a (char) log2fc column
# @param b (char) comparison column
# @return (char) One of the conditions from the comparison
# @keywords internal, selection, labeling
.artms_selectTheOneLog2fc <- function(a, b) {
  thrs <- 0
  # a should be the log2fc column
  if (a > thrs) {
    sb <- gsub("(.*)(-)(.*)", "\\1", b)
  } else if (a < -thrs) {
    sb <- gsub("(.*)(-)(.*)", "\\3", b)
  } else {
    sb <- 'NA'
  }
  return(sb)
}



# 
# @title Extract Uniprot ID from a UNIPROT_PTM  identifier
#
# @param x The data.frame with the uniprot_ptm column
# @param uniprotPtmColumn (char) Column in the data.frame with the Uniprot_PTM
# IDs
# @param newColumnName (char) Name of the new column that will contain the
# Uniprot ids
# @return (char) One of the conditions from the comparison
# @keywords internal, selection, labeling
.artmsExtractUniprotId <- function(x, uniprotPtmColumn, newColumnName){
  if(any(missing(x) | missing(newColumnName) | missing(uniprotPtmColumn)))
    stop("Missed (one or many) required argument(s)
         Please, check the help of this function to find out more")
  
  if(!is.character(newColumnName)) 
    stop("Argument 'newColumnName' must be a character")
  if(!is.character(uniprotPtmColumn)) 
    stop("Argument 'uniprotPtmColumn' must be a character")
  
  x[[newColumnName]] <- ifelse(
    grepl("_H1N1|_H3N2|_H5N1", 
          x[[uniprotPtmColumn]]),
    gsub("^(\\S+?_H[1,3,5]N[1,2])_.*",
         "\\1",
         x[[uniprotPtmColumn]],
         perl = TRUE) ,
    gsub("^(\\S+?)_.*", 
         "\\1", 
         x[[uniprotPtmColumn]], 
         perl = TRUE) 
  )
  return(x)
}
kroganlab/artMS documentation built on July 7, 2023, 5:35 a.m.