R/lfqbench.access.r

Defines functions makeEverythingInOneFolderDocSet makeDocSet getCombinedProteinRSDs getCombinedLogRatios LFQbench.getMetrics saveSpeciesSeparation saveIDs saveSampleCVs saveSampleMeans saveLogRatios logIdStatistics fixFileExt getFileExt LFQbench.showMetrics LFQbench.explainMetrics saveMetrics LFQbench.plotResultSet

Documented in fixFileExt getCombinedLogRatios getCombinedProteinRSDs getFileExt LFQbench.explainMetrics LFQbench.getMetrics LFQbench.plotResultSet LFQbench.showMetrics logIdStatistics makeDocSet makeEverythingInOneFolderDocSet saveIDs saveLogRatios saveMetrics saveSampleCVs saveSampleMeans saveSpeciesSeparation

#' LFQbench.plotResultSet
#' 
#' Function to generate the scatter plot
#' @param resSet a result set
#' @param showScatterPlot
#' @param showScatterAndDensityPlot
#' @param showScatterAndBoxPlot
#' @param showBoxPlot 
#' @param showDensityPlot
#' @export
LFQbench.plotResultSet = function(resSet,
                         showScatterPlot=T,
                         showScatterAndDensityPlot=T,
                         showScatterAndBoxPlot=T,
                         showBoxPlot=T,
                         showDensityPlot=T){
    
    pdfFile = resSet$docSet$pdfFile
    
    if(!is.null(pdfFile)){
      cat(paste("creating ", pdfFile, " ...", sep = ""))
      pdf(file = pdfFile, onefile = T, width = LFQbench.Config$PlotWidth, height = LFQbench.Config$PlotHeight, family = "Helvetica", pointsize = 9)      
    }
    
    # accuracy and precision
    if(showScatterPlot) sapply( resSet$result, LFQbench.showScatterPlot, showRegLines = T )
    if(showScatterAndDensityPlot) sapply( resSet$result, LFQbench.showScatterAndDensityPlot, showRegLines = T )
    if(showScatterAndBoxPlot) sapply( resSet$result, LFQbench.showScatterAndBoxPlot, showRegLines = T )
    if(showBoxPlot) sapply( resSet$result, LFQbench.showLogRatioBoxPlot )
    if(showDensityPlot) sapply( resSet$result, LFQbench.showDistributionDensityPlot )
    
    if(!is.null(pdfFile)) dev.off()
    cat("[done]\n")
}

#' saveMetrics 
#' 
#' This function save all the metrics related with the experiment 
#' @param resultSets
#' @param File 
saveMetrics = function(resultSets = ResultSets, File = paste(LFQbench.Config$LogFilesLocation, "/metrics.txt", sep = "")){
  metrics = lapply( resultSets, LFQbench.getMetrics )
  sink(file=File, split=T)
  LFQbench.explainMetrics()
  x = lapply(metrics, LFQbench.showMetrics)
  sink()
}

#' LFQbench.explainMetrics 
#' 
#' This function verbose the metrics of the experiment 
#' @export
LFQbench.explainMetrics = function(){
    cat("--------------------------------------------------------------------------------\n")
    cat("# Metrics description: \n\n")
    
    cat("Identification statistics:\n")
    cat("  the number of identifications for benchmark species\n\n")
    
    cat("Quantification statistics:\n")
    cat("   valid ratios - total number of valid log-ratios that meet following conditions:\n")
    cat("    a) peptide/protein could be quantified in both samples.\n")
    cat("    b) calulcated log-ratio falls within the expectation range\n")
    cat("       defined by the sample composition and the spread of spread of the log-ratios observed for each species.\n")
    cat(paste0(
        "    The expectation range is defined as a miximum difference of ",
        LFQbench.Config$LogRatioValidityRangeSDFactor," standard deviations from the average ratio.\n"))
    
    cat("   in plot range - numbers of valid log-ratios inside of the user defined plot range\n")
    cat("   out of plot range - numbers of valid log-ratios outside of the user defined plot range\n")
    cat("   invalid ratios - total numbers of invalid log-ratios\n")
    cat("   invalid ratios, out of validity range - proteins/peptides outside of validity range (see condition b)\n")
    cat("   invalid ratios, missing value - proteins/peptides exclusively quantified in one of the samples (see condition a)\n\n")
    
    cat("Technical variance:\n")
    cat("  the median CV for the background species among replicate runs\n\n")
    
    cat("Global accuracy:\n")
    cat("  accuracy of relative quantification,\n")
    cat("  the median deviation of log-ratios to the expected value\n\n")
    
    cat("Global precision:\n")
    cat("  precision of quantification,\n")
    cat("  the standard deviation of log-ratios\n\n")
    
    cat("Global species overlap:\n")
    cat(" species separation ability,\n")
    cat(" the area under ROC curve between a species pair.\n\n")
    
    cat("Local accuracy:\n")
    cat(" local accuracy of relative quantification,\n")
    cat(" the median deviation of log-ratios to the expected value\n")
    cat(" for partial data split in quantiles by intensity in first sample\n\n")
    
    cat("Local precision:\n")
    cat(" precision of quantification,\n")
    cat(" the standard deviation of log-ratios\n")
    cat(" for partial data split in quantiles by intensity in first sample\n\n")
    
    cat("Local species overlap:\n")
    cat(" species separation ability,\n")
    cat(" the area under ROC curve between a species pair\n")
    cat(" for partial data split in quantiles by intensity in first sample\n")
    cat("--------------------------------------------------------------------------------\n")
}

#' LFQbench.showMetrics 
#' 
#' verbose the metrics of the experiment
#' 
#' @param m the metrics object
#' @export
LFQbench.showMetrics = function(m){
  cat("--------------------------------------------\n")
  cat(paste("\n", m$name, "\n", sep=""))
  sm = function(mn){
      cat( paste(mn,": ", sep="" ))
      v = m[[mn]]
      if( is.atomic(v) ) 
      {
        cat(v)
      }
      else 
      {
        cat("\n")
        show(v)
      }
      cat("\n")
  }
  nix = sapply(names(m), sm)
  cat("\n")
}

#' getFileExt
#' 
#' generate appropriate file extension for a column separator: 
#' csv for ',' or ';'
#' tsv for '\t'
#' txt for anything else
#' 
#' @param colSep column separator character
#' @return a file extension csv/tsv/txt without '.'
#' @export
getFileExt = function(colSep)
{
    if(colSep==',' || colSep==';') return("csv")
    if(colSep=="\t") return("tsv")
    return("txt")
}

#' fixFileExt
#' 
#' fix the given file name by applying an appropriate file extension
#' that is guessed by column separator character provided
#' the new file extension will be
#' csv for ',' or ';'
#' tsv for '\t'
#' txt for anything else
#' 
#' @param file file name or path
#' @param sep column separator character
#' @return fixed file name or path
#' @export
fixFileExt=function(file, sep)
{
    # remove current extension
    fileBase = gsub("\\.(.sv|txt)$","", file, ignore.case = T)
    # find appropriate extension
    ext = getFileExt(sep)
    # apply new extension
    resFile = paste(fileBase, ext, sep=".")
    return(resFile)
}

#' logIdStatistics
#' 
#' store identification statistics from a list of result sets
#' 
#' @param resultSets the result sets
#' @param csvFile the delimiter separated output file
logIdStatistics = function( resultSets, csvFile=paste(LFQbench.Config$LogFilesLocation,"identification_statistics.csv",sep="/") ){
  cat("storing identification statistics ... ")
  idstats = t( sapply( resultSets, function(rs) rs$idstat ) )
  sums = rowSums(idstats)
  labs = sapply( resultSets, function(rs) rs$docSet$fileBase )
  write.table(
      data.frame( "name"=labs, idstats, "sum"=sums ),
      file=fixFileExt(csvFile, LFQbench.Config$CsvColumnSeparator), 
      sep=LFQbench.Config$CsvColumnSeparator,
      row.names=F, col.names=T
      )
  cat("[done]\n")
}

#' saveLogRatios
#' 
#' extract and store calculated log-ratios from a list of result sets
#' @param rs the result set
saveLogRatios = function(rs){
    listLR = function(pairName)
    { 
        pairRes = rs$result[[pairName]]
        write.table( 
          pairRes$validLogRatios,
          paste(rs$docSet$logPath,"/",rs$docSet$fileBase," LR (", pairRes$name1,"-" ,pairRes$name2, ").csv",sep = ""),
          sep = ",",dec = ".",row.names = F, col.names = T
        )
        
        write.table( 
          pairRes$validLogRatios[pairRes$validLogRatios$logratio < LFQbench.Config$LogRatioPlotRange[1] | pairRes$validLogRatios$logratio > LFQbench.Config$LogRatioPlotRange[2],],
          paste(rs$docSet$logPath,"/", rs$docSet$fileBase ," LR (", pairRes$name1 , "-" , pairRes$name2, ") out of range.csv", sep=""),
          sep = ",",dec = ".",row.names = F, col.names = T
        )    
        
        write.table( 
          pairRes$invalidLogRatios,
          paste(rs$docSet$logPath ,"/" , rs$docSet$fileBase , " invalid LR (", pairRes$name1 , "-" , pairRes$name2, ").csv",sep=""),
          sep = ",",dec = ".",row.names = F, col.names = T
        )
        
        return( pairRes$validLogRatios )
    }
    logRatios = lapply( names(rs$result), listLR )
    return(logRatios)
}

#' saveSampleMeans
#' 
#' save sample means to file
#' @param resultSet the result set
#' 
saveSampleMeans = function( resultSet )
{
    cat("storing sample means ... ")
    d = resultSet$data
    # id, species, a, b, c
    write.table( 
    data.frame(
      id=d$id,
      species=d$species,
      d$mean
    ), file=resultSet$docSet$avgFile, sep=LFQbench.Config$CsvColumnSeparator, dec=LFQbench.Config$CsvDecimalPointChar, col.names=T, row.names=F
    )
    cat("[done]\n")
}
################################################################################

#' saveSampleCVs
#' 
#' save the in sample CVs to a file
#' @param resultSet the result set
saveSampleCVs = function( resultSet)
{
  cat("storing coeffiecients of in-sample variation ... ")
  d = resultSet$data
  # id, species, a, b, c
  write.table( 
    data.frame(
      id=d$id,
      species=d$species,
      d$cv,
      row.names = NULL
    ), file=resultSet$docSet$rsdFile, sep=LFQbench.Config$CsvColumnSeparator, dec=LFQbench.Config$CsvDecimalPointChar, col.names=T, row.names=F
  )
  cat("[done]\n")
}

#' saveIDs
#' 
#' save identified proteins/peptides to a file
#' @param resultSet the result set
saveIDs = function( resultSet ){
  cat("storing identified protein names ... ")
  write.table( 
    resultSet$data$id
    , file = resultSet$docSet$idsFile, sep=LFQbench.Config$CsvColumnSeparator, dec = LFQbench.Config$CsvDecimalPointChar, col.names = F, row.names = F
  )
  cat("[done]\n")
}

#' saveSpeciesSeparation
#' 
#' This function enable to save the species 
#' @param resultSet
saveSpeciesSeparation = function( resultSet ){
  cat("storing species separation scores ... ")
  # species, a:b, a:c, ...
  d = data.frame(
    species=LFQbench.Config$AllSpeciesPairsLabels,
    sapply( resultSet$result, function(d) d$separation)
  )
  names(d) = c("species", LFQbench.Config$SamplePairsLabels)
  write.table( d, file = resultSet$docSet$rocFile, sep = LFQbench.Config$CsvColumnSeparator, dec = LFQbench.Config$CsvDecimalPointChar, col.names = T, row.names = F  )
  cat("[done]\n")
}

#' LFQbench.getMetrics 
#' 
#' calculate metrics for a result set
#' @param resultSet
#' @export
LFQbench.getMetrics = function(resultSet){
  # identification rate
  # number of identified proteins (only benchmark species)
  # PROBLEM? identification = sum( sapply(LFQbench.Config$AllSpeciesNames, function(s) sum( resultSet$data$species == s, na.rm=T) ))
  identification = sum( sapply(LFQbench.Config$AllSpeciesNames, function(s) length( which(resultSet$data$species == s) ) ) )
  # in case we want all IDs just count all
  # identificationRateMetric = length(resultSet$data$id)
  
  # replication variance
  # median CV for background-species
  replication = median( na.exclude( resultSet$data$cv[ resultSet$data$species == LFQbench.Config$BackgroundSpeciesName ] ) )
    
  # area under ROC curve in predefined quantiles of intensity
  globalSeparation = lapply(resultSet$result, function(d) d$separation )
  localSeparation = lapply(resultSet$result, function(d) d$quantileSeparation )  
  
  # median log-ratio deviation to the expectation in predefined quantiles of intensity
  globalAccuracy = lapply(resultSet$result, function(d) d$accuracy )
  localAccuracy = lapply(resultSet$result, function(d) d$quantileAccuracy )
  
  # standard deviation of log-ratios in predefined quantiles of intensity
  globalPrecision = lapply(resultSet$result, function(d) d$precision )
  localPrecision = lapply(resultSet$result, function(d) d$quantilePrecision )
  
  # count log ratio statistics
  lrs4spc = function(spc, sp)
  {
    iLR = sp$invalidLogRatios$logratio[sp$invalidLogRatios$species == spc]
    vLR = sp$validLogRatios$logratio[sp$validLogRatios$species == spc]
    
    invalid = length(iLR)
    missing = ifelse(invalid == 0, 0, length(which(is.na(iLR))) )
    
    valid = length(vLR)
    plotted = length( which( vLR >= min(LFQbench.Config$LogRatioPlotRange) & vLR <= max(LFQbench.Config$LogRatioPlotRange) ) )
    return( c(
        "invalid ratios" = invalid, 
        "invalid, out of validity range" = (invalid - missing),
        "invalid, missing value" = missing,
        "valid ratios" = valid,
        "in plot range" = plotted, 
        "out of plot range" = (valid - plotted)
    ) )
  }
  
  logRatioStats = lapply(resultSet$result, function(sp) sapply( LFQbench.Config$AllSpeciesNames, lrs4spc, sp  ))
    
  return(
    list(
      "name" = resultSet$docSet$fileBase,
      "Identification statistics" = identification,
      "Quantification statistics" = logRatioStats,
      "Technical variance" = replication,
      "Global accuracy" = globalAccuracy,
      "Global precision" = globalPrecision,
      "Global species overlap" = globalSeparation,
      "Local accuracy" = localAccuracy,
      "Local precision" = localPrecision,
      "Local species overlap" = localSeparation
    )
  )
}

#' getCombinedLogRatios
#' 
#' This function get the Combined Log Ratios 
#' 
#' @param ResultSets 
getCombinedLogRatios = function( ResultSets ){
  listLR = function(pairName, rs)
  { 
    fileName = rs$docSet$fileBase
    spcNames = names( rs$result[[pairName]]$data )
    spcLogRatios = sapply( rs$result[[pairName]]$data, function(d) d$y )
    names(spcLogRatios) = paste(fileName , "  " , pairName , "  " , spcNames, sep = "")
    return(spcLogRatios)
  }
  
  logRatios = unlist( 
    sapply( ResultSets, function(rs) lapply(names(rs$result), listLR, rs) )
    , recursive=F 
  )
  
  return(logRatios)
}

#' getCombinedProteinRSDs 
#' 
#' This fucntion return the combined protein RSDs 
#' @param ResultsSets 
getCombinedProteinRSDs = function(ResultSets){
  CVs = lapply( ResultSets, function(rs) as.vector(rs$data$cv) )
  names(CVs) = as.vector( lapply(ResultSets, function(rs) rs$docSet$fileBase ) )
  return(CVs)
}

#' makeDocSet
#' 
#' construct a document set for a file by paths from user configuration
#' @param inFile
makeDocSet = function( inFile, inputPath=LFQbench.Config$InputFilesLocation, plotPath=LFQbench.Config$PlotFilesLocation, logPath=LFQbench.Config$LogFilesLocation ){
  fileBase = sub(LFQbench.Config$InputExtensionPattern, "", basename(inFile) )
  return( 
    list(
      inputPath = inputPath, 
      plotPath = plotPath, 
      logPath  = logPath,
      fileBase = fileBase, 
      csvFile  = ifelse(file.exists(inFile), inFile, paste(inputPath,"/", inFile, sep = "")),
      pdfFile  = paste(plotPath,"/", fileBase,".pdf", sep = ""),
      logFile  = paste(logPath,"/", fileBase,".log", sep = ""),
      avgFile  = paste(logPath, "/", fileBase, "_sample_means.csv", sep = ""),
      rsdFile  = paste(logPath, "/", fileBase, "_cv.csv", sep = ""),
      l2rFile  = paste(logPath, "/", fileBase, "_log2_ratio.csv", sep = ""),
      rocFile  = paste(logPath, "/", fileBase, "_species_separation.csv", sep = ""),
      idsFile  = paste(logPath, "/", fileBase, "_ids.csv", sep = "")
    )
  )
}

#' makeEverythingInOneFolderDocSet
#' 
#' construct a doc set using the input file's path
#' @param inFile
makeEverythingInOneFolderDocSet = function( inFile ){
    tarDir = dirname(inFile)
    docSet = makeDocSet(inFile = inFile, inputPath=tarDir, plotPath=tarDir, logPath=tarDir)
    return(docSet)
}
IFIproteomics/LFQbench documentation built on March 2, 2023, 9:45 a.m.