LFQbench: Label Free Quantitation benchmark package

Documented in FSWE.generateReports

#####################################

#' FSWE.generateReports
#' 
#' It generates peptide and protein reports compatible with LFQbench, by following the data processing
#' suggested in (#Ref_to_article). 
#' 
#' @param experimentFile
#' @export
FSWE.generateReports <- function(
                            experimentFile, 
                            plotHistogram = F, 
                            plotHistNAs = F, 
                            reportSequences = F, 
                            peptide.list = NULL,
                            protein.list = NULL,
                            singleHits = F,
                            softwareSource="guess",
                            working_dir = LFQbench.Config$DataRootFolder,
                            keep_original_names = FALSE,
                            outputFileNameSuffix = NULL,
                            minimumHitsPerSample = 2,
                            top.N =3,
                            top.N.min =2,
                            integrationMethod = "topN_individual",
                            quench.minPepsToRemove = 10, 
                            quench.removeTop = 3
                            )
{


    integrationMethod.types = c("topN_individual", "topN_consensus", "median_individual")
    
    dataSets = as.list(FSWE.dataSets)
    speciesTags = FSWE.speciesTags
    sample.names = names(FSWE.dataSets)
    modificationsToUniMod = FSWE.modificationsToUniMod
        
    # TODO: check if datasets are defined
    
    topN.sort_method = "sum"  # "sum", "mean", "idrate_mean"  ## TODO: NOT YET IMPLEMENTED 
    topNindividual = T
    restrictNA = F
    #topN.allowNAs = T  ## Redundant
    #top.N = 3 
    #top.N.min = 2
    
    results_dir <- file.path( working_dir, "input" )
    supplementary_dir <- file.path( working_dir, "supplementary" )
    LFQbench.setDataRootFolder(working_dir, createSubfolders = F)
    mkdir( results_dir )
    mkdir( supplementary_dir )
    
  if(softwareSource == "guess") {
    softwareSource <- guessSoftwareSource(experimentFile, FSWE.softwareNames)
  }
  
  FSWE.switchSoftwareConfiguration(softwareSource)
  
  # expand config items as variables to the environment of current function
  nix = sapply( names(FSWE.Config), 
        function(pn) assign( pn,FSWE.Config[[pn]], envir = parent.env(environment()) ) 
  )

  ## Ops related to the quantitation variable ##
  sumquant <- paste0("sum(", quantitative.var, ")")
  medianquant <- paste0("median(", quantitative.var,")")
  
  qvalue.filtered = FALSE
  # Read file
  original.experimentFile <- experimentFile
  experimentFile <- file.path(working_dir, experimentFile)
  cat(paste0("Generating peptide report for ", experimentFile, "\n"))
  
  if(grepl(".xls", input.extension)){
    df <- read_excel(experimentFile, sheet=sheet.data, col_names=TRUE)
    names(df) <- gsub(" ", ".", names(df))
    if(!is.na(q_filter_threshold)){
      df$sequence_z <- paste(df[[sequence.mod.var]], df[[charge.var]], sep="_")
      df$sequence_z <- as.character(df$sequence_z)
      df.fdr <- read_excel(experimentFile, sheet=sheet.fdr, col_names=TRUE)
      names(df.fdr) <- gsub(" ", ".", names(df.fdr))
      df.fdr$sequence_z <- paste(df.fdr[[sequence.mod.var]], df.fdr[[charge.var]], sep="_")
      
      #remove possible duplicates
      df <- df %>% filter(!duplicated(sequence_z))
      df.fdr <- df.fdr %>% filter(!duplicated(sequence_z))
      
      tmp1 <- data.frame(sequence_z = as.character(df.fdr$sequence_z))
      tmp1$sequence_z <- as.character(tmp1$sequence_z)
      valid_peptides <- df.fdr[, grepl(fdr.var.tag, colnames(df.fdr), ignore.case=T)] <= q_filter_threshold
      tmp1 <- cbind(tmp1, valid_peptides)
      df.fdr <- tmp1
      rm(tmp1)
      df2 <- left_join(df, df.fdr, by= "sequence_z" )
      df2 <- unique(df2)
      df2.flags <- df2[,grepl(fdr.var.tag, colnames(df2), ignore.case=T)]
      df2.values <-df2[,grepl(quantitative.var.tag, colnames(df2), ignore.case=T) & 
                        !grepl(fdr.var.tag, colnames(df2), ignore.case=T)] 
      df2.values[!df2.flags] <- NA
      rm(df2); rm(df2.flags); rm(valid_peptides); rm(df.fdr)
      df <- df[ ,!grepl(quantitative.var.tag, colnames(df), ignore.case=T)]
      df <- cbind(df, df2.values)
      
      qvalue.filtered = TRUE
    }
  }else{
    #df <- read.table(experimentFile, na.strings= nastrings, header=T, 
    #                 sep=guessSep(experimentFile), stringsAsFactors =F, fill = T)

    if(guessSep(experimentFile) == ","){
        df <- read_csv(experimentFile, na = nastrings)
    }
    if(guessSep(experimentFile) == "\t"){
        df <- read_tsv(experimentFile, na = nastrings)
    }
        
  }
  
  # read_csv/read_tsv will produce spaces in column names
  # we need to replace them by . for compatibility with read.table
  names(df) <- make.names(names(df))
  
  # match column names if a column name parameter is a list
  theEnvir = environment()
  matchColumnNames <- function(paramName = "protein.var")
  {
      # find which element matches the real column name 
      # set the var to this value
      paramValues = get0(paramName)
      
      # don't do anything if the value is NULL OR NA
      if( is.null(paramValues) || is.na(paramValues) ) return(F)
      
      # for any kind of vector/array/list/etc.
      if( length(paramValues)>1 ) 
      {
          columnNames = names(df)
          matchingValues = paramValues %in% columnNames
          # the parameter must be there!!!
          if(!any(matchingValues)) 
          {
              stop( paste0("software configuration does not match the input file!\n",
                           "file: ", experimentFile, "\n",
                           paramName, ": ", paste(paramValues, collapse=", "), "\n",
                           "column names: ", paste(columnNames, collapse=", "), "\n")
              )
          }
          assign(paramName, paramValues[matchingValues][1], envir = theEnvir)
      }
  }
  
  # list all variables
  colVars = ls()
  # extract variables' names identifiying data columns
  colVars = colVars[grep("\\.var$", colVars)]
  # check if every parameter is present in the header of input file
  nix = sapply( colVars, matchColumnNames )
  
  # FOR DEBUGGING ONLY
  # sapply( colVars, function(n) cat(paste0("\t", n, " = ", get0(n), "\n")) )
  
  if(protein_input){
    # If the input is already a peptide report, we need to "fake" some columns as a temporary solution 
    # to process the files
    df <- df %>% 
      mutate_( sequence = protein.var, charge = 1)
    sequence.mod.var = "sequence"
    charge.var = "charge"
  }
  
  if(!is.na(q_filter_threshold) & !qvalue.filtered){
    df <- eval( substitute(filter(df, var < q_filter_threshold), list( var = as.name(qvalue.var)) ) )
    qvalue.filtered = TRUE
  }    
  
  # Remove rows containing decoy tags
  for(dectag in decoy.tags){
    df <- df[!grepl(dectag, df[[protein.var]], ignore.case = T),]
  }
  
  # If there is a "decoy" column with boolean values, filter to no decoys
  if(!is.na(decoy.var)){
      df[[decoy.var]] = as.logical(df[[decoy.var]])
      df <- df[df[[decoy.var]] == FALSE,]
  }
  
  # Attach species and remove peptides belonging to multiple species, and not considered species ("NA"s)
  df <- df %>% rowwise()
  df <- eval( substitute( mutate(df, "species" = guessOrganism(var, speciesTags)), list(var = as.name(protein.var) ) ) ) 
  df <- filter(df, species != "NA", species != "multiple")
  
  experiment <- NA
  if(input_format == "wide"){
    # At this point, it is better to transform the data frame into a long-format data frame, and
    # continue the analysis commonly for wide and long inputs.
    
    # find the columns containing the quantitative values (pivoted values). 
    # They will be use to gather the key-value pairs
    experiment <- which( sapply(dataSets, guessExperiment_wide, colnames(df) ) )
    
    # WARNING: the quantitative variables should be the only ones containing the injection names at this point.
    # We filter by the quantitative.var.tag in order to remove any other columns like score, ret time...
    #tmp1 <- df[, grepl(protein.var, colnames(df))]
    #tmp1 <- cbind( tmp1, df[, grepl(sequence.mod.var, colnames(df))] )
    #tmp1 <- cbind( tmp1, df[, grepl(charge.var, colnames(df))] )
    #tmp1 <- cbind( tmp1, df[, grepl("species", colnames(df))])
    tmp1 <- df[, protein.var]
    tmp1 <- cbind( tmp1, df[, sequence.mod.var] )
    tmp1 <- cbind( tmp1, df[, charge.var] )
    tmp1 <- cbind( tmp1, df[, "species"])
    
    # For the quantitative data, we filter by columns that have: the quantitative.var.tag AND any of the injection names
    quant.columns <- grepl(quantitative.var.tag, colnames(df), ignore.case=T) 
    injection.columns <- Reduce(`|`, lapply(dataSets[[experiment]], grepl, colnames(df)))
    tmp1 <- cbind( tmp1, df[, quant.columns & injection.columns])
    
    df <- tmp1
    rm(tmp1)
    
    quantvar.range <- which( grepl( quantitative.var.tag, colnames(df), ignore.case=T ) )
    quantvar.colnames <- names(df)[quantvar.range]
    if(is.na(filename.var))
        filename.var <- "filename.var"
    df <- df %>% gather_(filename.var, quantitative.var, quantvar.colnames) %>%
                 arrange_(protein.var, sequence.mod.var)
    
  }else if(input_format == "long"){
    # If filename.var contains the complete path -> remove it
    df[[filename.var]] <- basename(file_path_sans_ext(df[[filename.var]]))
    # Do a second check, in case there were extensions of compressed files like .mzxml.gz
    df[[filename.var]] <- basename(file_path_sans_ext(df[[filename.var]]))
    
    # Guess the experiment by the filename.var column
    injections <- distinct(select_(df, filename.var))  
    experiment <- which(sapply(dataSets, guessExperiment, injections))
  }
  
  # Remove any duplicate: (based on: injectionfile, ModifiedSequence, ChargeState)
  # (These cases are likely to be due to several entries in the library)
  data <- df %>% distinct_(filename.var, sequence.mod.var, charge.var, .keep_all = TRUE)  # I am not sure I removed all duplicates, or there is still one value per duplicate
  
  # Change zeroes to NA values at the quantitative variable
  data[[quantitative.var]][data[[quantitative.var]] == 0] <- NA
  
  #Remove NAs of the quant variable  
  # TODO: this does not work properly, it may remove everything if there is an empty (NA) variable
  #data <- data %>% na.omit() 
  
  # For each peptide: Sum quantitative values of charge states
  data <- data %>% ungroup() %>% group_by_(filename.var, sequence.mod.var, protein.var , "species") %>% 
    summarise_( quant_value = sumquant ) # proteinID = protein.var, species = "species" , 
  
  peptides_wide <- spread_(data, filename.var, "quant_value") 
  
  # Change variable names of injections by their right name 
  # (removing additions from software to the name, etc)
  common_names <- names(peptides_wide)[c(1:3)]
  inj_names <- names(peptides_wide)[-c(1:3)]
  inj_names <- as.character( sapply(inj_names, guessInjection, dataSets[[experiment]]) )
  names(peptides_wide) <- c( common_names, inj_names )
  
  experiment.order <- match(dataSets[[experiment]], names(peptides_wide[-c(1:3)])) + 3
  peptides_wide <- peptides_wide[, c(c(1:3), experiment.order)]
  
  #Rename the samples 
  names(peptides_wide) <- c("sequenceID", "proteinID", "species", rownames(FSWE.dataSets) )
  # names(peptides_wide) <- c("sequenceID", "proteinID", "species", inj_names )
  
  # add a sequence column (just to remove it after reporting naked sequences)
  peptides_wide$sequence <- gsub( "*\\[.*?\\]", "", peptides_wide$sequenceID )
  peptides_wide$sequence <- gsub( "*\\(.*?\\)", "", peptides_wide$sequence )
  
  
  # Convert modifications to UniMod
  for(mod in ls(modificationsToUniMod)){
      peptides_wide$sequenceID <- gsub(mod, modificationsToUniMod[[mod]], peptides_wide$sequenceID)
  }

  
  #expfile_noext <- file_path_sans_ext(basename(experimentFile))
  expfile_noext <- paste(softwareSource, names(experiment)[1], sep="_")
  if(!is.null(outputFileNameSuffix)) 
  {
    expfile_noext <- paste(expfile_noext, outputFileNameSuffix, sep="_")
  }
  peptidereportname <- paste0(expfile_noext, "_peptides.tsv")
  proteinreportname <- paste0(expfile_noext, "_proteins.tsv")
  sequencereportname <- paste0(expfile_noext, "_sequencelist.csv")
  histPepProteinreportname <- paste0(expfile_noext, "_HistogramPeptidesProtein.pdf")
  histNAsProteinsreportname <- paste0(expfile_noext, "_proteins_HistogramNAs.pdf")
  histNAsPeptidesreportname <- paste0(expfile_noext, "_peptides_HistogramNAs.pdf")
    if(keep_original_names) 
    {
        file_base_name <- original.experimentFile
        if(!is.null(outputFileNameSuffix)) 
        {
          file_base_name <- paste(file_base_name, outputFileNameSuffix, sep="_")
        }
          
        peptidereportname  <- paste(file_base_name, "peptides.tsv", sep="_") 
        proteinreportname  <- paste(file_base_name, "proteins.tsv", sep="_") 
        sequencereportname <- paste(file_base_name, "sequencelist.csv", sep="_")
        histPepProteinreportname <- paste(file_base_name, "HistogramPeptidesProtein.pdf", sep="_")
        histNAsProteinsreportname <- paste(file_base_name, "_proteins_HistogramNAs.pdf", sep="_")
        histNAsPeptidesreportname <- paste(file_base_name, "_peptides_HistogramNAs.pdf", sep="_")
    }
  
  if(reportSequences){
    sequence_list <- as.data.frame(unique(peptides_wide$sequence))
    write.table(sequence_list, 
                file=file.path(supplementary_dir, sequencereportname),
                sep=",", row.names=F, col.names=F)
  }
  
  if(!is.null(peptide.list)){
    peptides_wide <- peptides_wide %>% filter(sequenceID %in% peptide.list)
  }
  
  if(!is.null(protein.list)){
    filterProtein <- function(myprotein, listtosearch){
      which(grepl(myprotein, listtosearch))
    }
    
    matches <- unique(unlist(sapply(protein.list, filterProtein, peptides_wide$proteinID)))
    peptides_wide <- peptides_wide[matches, ] 
  }
  
  attr(peptides_wide, "vars") <- NULL
  peptides_wide <- peptides_wide %>% ungroup() %>% select(-sequence) 
  # Remove "empty" peptides (all values are NAs).
  common_names <- !sapply(peptides_wide, is.numeric)
  nums <- sapply(peptides_wide, is.numeric)
  peptides_wide <- peptides_wide %>% ungroup() %>%
    mutate(totalIntensity = rowSums(.[nums], na.rm=T)) %>%
    filter(totalIntensity > 0) %>%
    select(-totalIntensity)
  
  
  
  if(protein_input){
    # If the input was already a protein report, we can finish here
    protein_report <- peptides_wide %>% select(-sequenceID)
    write.table(protein_report, file=file.path(results_dir , proteinreportname), 
                sep="\t", row.names=F, col.names=T)
    
    cat("Protein report as input -- No peptide report generated.\n")
    
    return(NA)
    
  }
  
  
  write.table(peptides_wide, file=file.path(results_dir , peptidereportname), 
              sep="\t", row.names=F, col.names=T)
  
  
  ## PROTEIN REPORT
  cat(paste0("Generating protein report for ", experimentFile,"\n"))

  if(singleHits){
    print("Summarising protein single hits...")
    integrationMethod = ""
    library(dplyr)
    #peptides_wide <- read_tsv("/Users/napedro/tmp_data/SWATHbenchmark/output_benchmark/HYE110_TTOF6600_32fix/input/Spectronaut_HYE110_TTOF6600_32fix_peptides.tsv")
    proteins_wide <- peptides_wide %>% 
        arrange(proteinID, species) %>%
        group_by(proteinID, species) %>%
        # filter(n_distinct(sequenceID) == 1) %>%    ## n_distinct is very, very, very slow!
        filter(length(unique(sequenceID)) == 1) %>%
        select(-sequenceID) %>% 
        summarise_each(funs(single_hits(.)))

  }
  
  if(integrationMethod == "topN_individual"){
      print("Summarising proteins...")
      print("using TOP3 individual for each run")
      proteins_wide <- peptides_wide %>% 
          select(-sequenceID) %>% 
          arrange(proteinID, species) %>%
          group_by(proteinID, species) %>%  
          summarise_each(funs(avg_top_n(., top.N, top.N.min))) 
  }
  
  if(integrationMethod == "median_individual"){
      print("Summarising proteins...")
      print("using median, individual for each run")
      proteins_wide <- peptides_wide %>% 
          select(-sequenceID) %>% 
          arrange(proteinID, species) %>%
          group_by(proteinID, species) %>%  
          summarise_each(funs(median_all(., top.N.min))) 
  }
  
  if(integrationMethod == "medianQuench_individual"){
      print("Summarising proteins...")
      print(paste0("using median (quenching top", quench.removeTop, 
                   " when the protein has more than ", quench.minPepsToRemove, 
                   " peptides), individual for each run." )) 
      proteins_wide <- peptides_wide %>% 
          select(-sequenceID) %>% 
          arrange(proteinID, species) %>%
          group_by(proteinID, species) %>%  
          summarise_each(funs(median_quench(., top.N.min, minPepsToRemove = quench.minPepsToRemove, removeTop = quench.removeTop))) 
  }
    
  if(integrationMethod == "topN_consensus"){
      print("Summarising proteins...")
      print(paste0("using consensus TOP", top.N))
      nums <- sapply(peptides_wide, is.numeric)
      proteins_wide <- peptides_wide %>% ungroup() %>%
          mutate(totalIntensity = rowSums(.[nums], na.rm=T))  %>% 
          group_by(proteinID) %>%
          arrange(desc(totalIntensity)) %>%
          filter(row_number() <= top.N & n() >= top.N.min) %>%
          select(-sequenceID, -totalIntensity) %>%
          group_by(proteinID, species)
      
      #TODO: Does this make sense only to TopN_consensus ????
      if(restrictNA){
          proteins_wide <- proteins_wide %>%
              summarise_each(funs(mean))
      }else{
          proteins_wide <- proteins_wide %>%
              summarise_each(funs(avgNA))
      }
      
  }


  

    
   
  
  # Remove "empty" proteins (all values are NAs). TODO: I wish I could find a more elegant way to do it. I am tired.
  common_names <- !sapply(proteins_wide, is.numeric)
  nums <- sapply(proteins_wide, is.numeric)
  proteins_wide <- proteins_wide %>% ungroup() %>%
    mutate(totalIntensity = rowSums(.[nums], na.rm=T)) %>%
    filter(totalIntensity > 0) %>%
    select(-totalIntensity)
  
  
  
  #Remove proteins, which have less than x hits per sample (among the two samples)
  #TODO: This is still provisional since it only considers two samples A and B.
  injectionNamesA <- rownames(FSWE.dataSets)[1:(length(rownames(FSWE.dataSets))/2)]
  injectionNamesB <- rownames(FSWE.dataSets)[(1+length(rownames(FSWE.dataSets))/2):length(rownames(FSWE.dataSets))]
  cols_sampleA <- match(injectionNamesA, names(proteins_wide)) 
  cols_sampleB <- match(injectionNamesB, names(proteins_wide)) 
  
    
  proteins_wide$numHitsA <- apply(proteins_wide, 1, function(elt, cols) length(cols) - sum(is.na(elt[cols])), cols_sampleA)
  proteins_wide$numHitsB <- apply(proteins_wide, 1, function(elt, cols) length(cols) - sum(is.na(elt[cols])), cols_sampleB)
  proteins_wide <- proteins_wide %>% rowwise() %>% 
                                mutate(maxHitsPerSample = max(numHitsA, numHitsB)) %>% 
                                ungroup()
  
  proteins_wide <- proteins_wide %>% 
                        filter(maxHitsPerSample >= minimumHitsPerSample) %>%
                        select(-numHitsA, -numHitsB, -maxHitsPerSample)
  
  write.table(proteins_wide, file=file.path(results_dir , proteinreportname), 
              sep="\t", row.names=F, col.names=T)
  
  ## Histogram: Peptides per protein
  
  peptides_per_protein <- peptides_wide %>%
    group_by(proteinID, species) %>%
    summarise(pep.protein = n_distinct(sequenceID))
  
  if(plotHistogram){
    pdf(file=file.path(supplementary_dir, histPepProteinreportname), width=6, height=4)
    h <- hist(x=peptides_per_protein$pep.protein, breaks=c(0:20,30,40,50,100,1000), main=NULL, xlab="Peptides per protein",
              xlim = c(0,20), plot=T,freq=T)
    dev.off()
  }
  
  ## Histograms: missing values across samples (peptides and proteins)
  peptides_wide$numNAs <- apply(peptides_wide, 1, function(elt) sum(is.na(elt)))
  proteins_wide$numNAs <- apply(proteins_wide, 1, function(elt) sum(is.na(elt)))
  peptides_wide$numNAs <- factor(peptides_wide$numNAs, levels = seq(0, length(which(nums))))
  proteins_wide$numNAs <- factor(proteins_wide$numNAs, levels = seq(0, length(which(nums))))
  
  if(plotHistNAs){
    p <- ggplot(proteins_wide, aes(x = numNAs))
    p <- p + geom_bar() #geom_histogram()
    p <- p + aes() + ylab("# proteins")
    if(exists("histNAs.proteins.scale")){
      p <- p + scale_y_continuous(limits = histNAs.proteins.scale)
    }
    hNAprot <- p + facet_wrap( ~ species, ncol = 3)  + theme_classic() +
      scale_fill_discrete(drop=FALSE) + scale_x_discrete(drop=FALSE)
    
    p <- ggplot(peptides_wide, aes(x = numNAs))
    p <- p + geom_bar() # geom_histogram()
    p <- p + aes() + ylab("# peptides")
    if(exists("histNAs.peptides.scale")){
      p <- p + scale_y_continuous(limits = histNAs.peptides.scale)
    }
    hNApep <- p + facet_wrap( ~ species, ncol = 3)  + theme_classic() +
      scale_fill_discrete(drop=FALSE) + scale_x_discrete(drop=FALSE)
    
    ggsave(filename = file.path(supplementary_dir, histNAsProteinsreportname), plot = hNAprot , width=6, height=4)
    ggsave(filename = file.path(supplementary_dir, histNAsPeptidesreportname), plot = hNApep,  width=6, height=4)
  }
}