R/checkIfFile.R
In artMS: Analytical R tools for Mass Spectrometry

Documented in artmsIsEvidenceNewVersion

# ------------------------------------------------------------------------------
# @title Check if an input is a file or a data object
# @description This function is used in order to make it so a user can submit
#  either a path to a data file or a data object in data.frame or data.table
#  form.
# @param input_file (character or data.frame/data.table) Either a single
# file path to a tab-delimited text file, or an already loaded data object.
# @param dont_check_names (logical) Only used when `input_file` is a file
# path. NOTE: despite its name, `TRUE` (default) reads the file with the
# `read.delim` default `check.names = TRUE`, i.e. column names are converted
# to syntactically valid names (e.g. "Leading proteins" becomes
# "Leading.proteins"). `FALSE` reads the file with `check.names = FALSE`,
# keeping the column names exactly as written in the file. This mapping is
# kept as is because callers in this file match the dotted names.
# @param is.evidence (logical) Whether or not the input is an evidence file.
# Currently this only triggers an informative message; the returned object
# is the same either way.
# @return An R data object (data.table if a data.table was supplied,
# data.frame otherwise)
# @keywords internal, file, evidence, input
.artms_checkIfFile <- function(input_file, 
                               dont_check_names = TRUE,
                               is.evidence = FALSE) {
  # The data.table test must come first: a data.table is also a data.frame.
  # inherits() is what is.data.table() does, without needing data.table
  # attached just to perform the check.
  if (inherits(input_file, "data.table")) {
    x <- data.table(input_file)
  } else if (is.data.frame(input_file)) {
    # data.frame() applies its default check.names = TRUE here
    x <- data.frame(input_file)
  } else if (is.character(input_file) && length(input_file) == 1) {
    # Only a single character string can be a file path. Anything else
    # (numbers, lists, longer vectors) falls through to the generic error
    # below instead of failing inside file.exists().
    if (!file.exists(input_file)) {
      stop("The file ", input_file, " does not exist! ")
    }
    # See the @param note: dont_check_names = TRUE means check.names = TRUE
    check_names <- if (dont_check_names) TRUE else FALSE
    x <- read.delim(input_file,
                    stringsAsFactors = FALSE,
                    check.names = check_names)
  } else {
    stop("The input object is not valid")
  }
  
  if (is.evidence) {
    message("--- Evidence file ready")
  }
  return(x)
}

#' @title Check if a given evidence file was generated by a new version of
#' MaxQuant (v>1)
#' @description MaxQuant introduced changes in the column names and number
#' of columns for the evidence file in version 1 (we think).
#' This function checks whether the evidence comes from the latest version
#' of MaxQuant.
#' @param evidence_file the evidence file name (or an already loaded
#' evidence data.frame / data.table)
#' @return (logical) `TRUE` if it is a newer version of MaxQuant,
#' `FALSE` otherwise
#' @keywords file, evidence, input, check version
#' @examples
#' artmsIsEvidenceNewVersion(evidence_file = artms_data_ph_evidence)
#' @export
artmsIsEvidenceNewVersion <- function(evidence_file) {
  rawEvidence <- .artms_checkIfFile(input_file = evidence_file, 
                                    is.evidence = TRUE)
  evidence_names <- names(rawEvidence)
  
  # The `.` in each pattern matches any single character, so both the
  # dotted form ("Leading.proteins") and the spaced form ("Leading proteins")
  # of the column name are accepted.
  has_leading_proteins <- any(grepl("^Leading.proteins$", evidence_names))
  has_leading_razor <- any(grepl("^Leading.razor.protein$", evidence_names))
  
  # Both operands are length-one logicals, so the scalar `&&` is the
  # appropriate operator.
  return(has_leading_proteins && has_leading_razor)
}


# ------------------------------------------------------------------------------
# @title Read the Evidence File
#
# @description Read in a MaxQuant searched Evidence file using data.table.
# The header line is read first and every column name is looked up in a
# fixed table of known column names and classes, so that fread doesn't have
# to guess the column types. Reading stops with an error if the header
# contains a column that is not in the table.
# @param evidence_file (char) The filepath to the MaxQuant searched data
# (evidence) file (txt tab delimited file).
# @param verbose (logical) `TRUE` (default) shows function messages
# @return The evidence table as returned by `fread` (a data.table by
# default), read with the predefined column classes and with the column
# names unified by `.artms_unifyColumnNames`
# @keywords internal, MaxQuant, evidence
.artms_read_evidence_file <- function(evidence_file,
                                      verbose = TRUE) {
  if (verbose) message("--- Reading in evidence file... ")
  
  # read in the first line to get the header names (in file order)
  header_line <- readLines(evidence_file, 1)
  header <- unlist(strsplit(header_line, "\t"))
  if (length(header) == 0) {
    stop("The evidence file has no header line: ", evidence_file)
  }
  
  # pre-recorded column names (vector names) and their respective classes
  # (vector values). Both letter-case variants used by different MaxQuant
  # versions are listed where applicable.
  col.classes <- c(
    "Sequence" = "character",
    "Length" = "integer",
    "Modifications" = "character",
    "Modified sequence" = "character",
    "Oxidation (M) Probabilities" = "character",
    "Oxidation (M) Score Diffs" = "character",
    "Acetyl (Protein N-term)" = "integer",
    "Oxidation (M)" = "integer",
    "Missed cleavages" = "integer",
    "Proteins" = "character",
    "Leading proteins" = "character",
    "Leading Proteins" = "character",
    "Leading razor protein" = "character",
    "Leading Razor Protein" = "character",
    "Gene names" = "character",
    "Gene Names" = "character",
    "Protein names" = "character",
    "Protein Names" = "character",
    "Type" = "character",
    "Raw file" = "character",
    "Experiment" = "character",
    "MS/MS m/z" = "numeric",
    "Charge" = "integer",
    "m/z" = "numeric",
    "Mass" = "numeric",
    "Resolution" = "numeric",
    "Uncalibrated - Calibrated m/z [ppm]" = "numeric",
    "Uncalibrated - Calibrated m/z [Da]" = "numeric",
    "Mass Error [ppm]" = "numeric",
    "Mass error [ppm]" = "numeric",
    "Mass Error [Da]" = "numeric",
    "Mass error [Da]" = "numeric",
    "Uncalibrated Mass Error [ppm]" = "numeric",
    "Uncalibrated mass error [ppm]" = "numeric",
    "Uncalibrated Mass Error [Da]" = "numeric",
    "Uncalibrated mass error [Da]" = "numeric",
    "Max intensity m/z 0" = "numeric",
    "Retention time" = "numeric",
    "Retention length" = "numeric",
    "Calibrated retention time" = "numeric",
    "Calibrated retention time start" = "numeric",
    "Calibrated retention time finish" = "numeric",
    "Retention time calibration" = "numeric",
    "Match time difference" = "numeric",
    "Match m/z difference" = "numeric",
    "Match q-value" = "numeric",
    "Match score" = "numeric",
    "Number of data points" = "integer",
    "Number of scans" = "integer",
    "Number of isotopic peaks" = "integer",
    "PIF" = "numeric",
    "Fraction of total spectrum" = "numeric",
    "Base peak fraction" = "numeric",
    "PEP" = "numeric",
    "MS/MS Count" = "integer",
    "MS/MS count" = "integer",
    "MS/MS Scan Number" = "integer",
    "MS/MS scan number" = "integer",
    "Score" = "numeric",
    "Delta score" = "numeric",
    "Combinatorics" = "integer",
    "Intensity" = "numeric",
    "Reverse" = "character",
    "Potential contaminant" = "character",
    "id" = "integer",
    "Protein group IDs" = "character",
    "Peptide ID" = "integer",
    "Mod. peptide ID" = "integer",
    "MS/MS IDs" = "character",
    "Best MS/MS" = "integer",
    "AIF MS/MS IDs" = "logical",
    "Oxidation (M) site IDs" = "character",
    "Acetyl (K) Probabilities" = "character",
    "GlyGly (K) Probabilities" = "character",
    # was "Character" (capital C), which is not a valid class name
    "Phospho (STY) Probabilities" = "character",
    "Acetyl (K) Score Diffs" = "character",
    "GlyGly (K) Score Diffs" = "character",
    "Phospho (STY) Score Diffs" = "character",
    "Acetyl (K)" = "integer",
    "GlyGly (K)" = "integer",
    "Phospho (STY)" = "integer",
    "Acetyl (K) site IDs" = "character",
    "GlyGly (K) site IDs" = "character",
    "Phospho (STY) site IDs" = "character",
    "Contaminant" = "character",
    "Fraction" = "integer"
  )
  
  # look up the class of every header column; match() keeps the file order
  # and yields NA for columns missing from the table
  header_classes <- unname(col.classes[match(header, names(col.classes))])
  
  # Stop if there is an issue
  unknown_cols <- header[is.na(header_classes)]
  if (length(unknown_cols) > 0) {
    stop(
      "The evidence file contains columns that are not recognized.\n",
      "If they are new columns, add them to 'col.classes' of ",
      "function <.artms_read_evidence_file>:\n",
      paste(unknown_cols, collapse = "\n")
    )
  }
  
  # read in the evidence file with their classes
  x <- fread(evidence_file, 
             integer64 = "double",
             colClasses = header_classes)
  # make sure all the column names are the same as supporting analytical
  # functions are looking for
  x <- .artms_unifyColumnNames(x)
  return(x)
}

# ------------------------------------------------------------------------------
# @title Unify column names
#
# @description Different MaxQuant versions write some evidence column names
# with different letter case. This helper rewrites a fixed set of those
# names to a single spelling; all other column names are left untouched.
# @param x (data.frame) Loaded evidence file.
# @return (data.frame) the same object with the affected column names
# replaced by their unified spelling
# @keywords internal, MaxQuant, evidence
.artms_unifyColumnNames <- function(x) {
  # Each rule is c(pattern, replacement). Patterns are anchored, so only a
  # column whose whole name matches is renamed. Rules are applied in order.
  rename_rules <- list(
    c("^Mass error \\[ppm\\]$", "Mass Error \\[ppm\\]"),
    c("^Mass error \\[Da\\]$", "Mass Error \\[Da\\]"),
    c("^Protein Names$", "Protein names"),
    c("^Leading Razor Protein$", "Leading razor protein"),
    c("^Leading Proteins$", "Leading proteins"),
    c("^Gene Names$", "Gene names"),
    c("^MS/MS count$", "MS/MS Count"),
    c("^MS/MS scan number$", "MS/MS Scan Number"),
    c("^Uncalibrated mass error \\[Da\\]$",
      "Uncalibrated Mass Error \\[Da\\]"),
    c("^Uncalibrated mass error \\[ppm\\]$",
      "Uncalibrated Mass Error \\[ppm\\]")
  )
  
  unified <- names(x)
  for (rule in rename_rules) {
    unified <- gsub(rule[1], rule[2], unified)
  }
  names(x) <- unified
  
  x
}