R/import.R

Defines functions import_ProteomeDiscoverer

Documented in import_ProteomeDiscoverer

#' Import Thermo ProteomeDiscoverer data
#'
#' Reads in a Thermo ProteomeDiscoverer \code{PSMs.txt} file.
#'
#' @param file Location of the \code{PSMs.txt} file.
#' @param shared Include shared peptides?
#' @param used Include only measurements marked as used by ProteomeDiscoverer?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{seaMass_sigma}.
#' @import data.table
#' @export
import_ProteomeDiscoverer <- function(
  file = NULL,
  use.shared.peptides = FALSE,
  use.not.unique = TRUE,
  use.excluded.by.method = TRUE,
  use.redundant = FALSE,
  data = NULL
) {
  if (is.null(file)) {
    if (is.null(data)) stop("One of 'data' or 'file' needs to be specified.")
    DT.raw <- setDT(data)
  } else {
    DT.raw <- fread(file = file, showProgress = T)
  }

  # filtering
  DT.raw[, Use := T]
  if (!use.shared.peptides) DT.raw[`Number of Protein Groups` > 1, Use := F]
  if (!use.not.unique) DT.raw[`Quan Info` == "NotUnique", Use := F]
  if (!use.excluded.by.method) DT.raw[`Quan Info` == "ExcludedByMethod", Use := F]
  if (!use.redundant) DT.raw[`Quan Info` == "Redundant", Use := F]

  # create wide data table
  DT <- DT.raw[ , list(
    Groups = `Master Protein Accessions`,
    GroupInfo = factor(""),
    Component = gsub(" ", "", paste0(Sequence, ",", Modifications)),
    Measurement = paste0(`Spectrum File`, ",", `First Scan`),
    Injection = `Spectrum File`,
    Use
  )]
  if ("Light" %in% colnames(DT.raw)) DT$Channel.Light <- DT.raw$Light
  if ("Medium" %in% colnames(DT.raw)) DT$Channel.Medium <- DT.raw$Medium
  if ("Heavy" %in% colnames(DT.raw)) DT$Channel.Heavy <- DT.raw$Heavy
  if ("126N" %in% colnames(DT.raw)) DT$Channel.126N <- DT.raw$`126N`
  if ("126C" %in% colnames(DT.raw)) DT$Channel.126C <- DT.raw$`126C`
  if ("126" %in% colnames(DT.raw)) DT$Channel.126 <- DT.raw$`126`
  if ("127N" %in% colnames(DT.raw)) DT$Channel.127N <- DT.raw$`127N`
  if ("127C" %in% colnames(DT.raw)) DT$Channel.127C <- DT.raw$`127C`
  if ("127" %in% colnames(DT.raw)) DT$Channel.127 <- DT.raw$`127`
  if ("128N" %in% colnames(DT.raw)) DT$Channel.128N <- DT.raw$`128N`
  if ("128C" %in% colnames(DT.raw)) DT$Channel.128C <- DT.raw$`128C`
  if ("128" %in% colnames(DT.raw)) DT$Channel.128 <- DT.raw$`128`
  if ("129N" %in% colnames(DT.raw)) DT$Channel.129N <- DT.raw$`129N`
  if ("129C" %in% colnames(DT.raw)) DT$Channel.129C <- DT.raw$`129C`
  if ("129" %in% colnames(DT.raw)) DT$Channel.129 <- DT.raw$`129`
  if ("130N" %in% colnames(DT.raw)) DT$Channel.130N <- DT.raw$`130N`
  if ("130C" %in% colnames(DT.raw)) DT$Channel.130C <- DT.raw$`130C`
  if ("130" %in% colnames(DT.raw)) DT$Channel.130 <- DT.raw$`130`
  if ("131N" %in% colnames(DT.raw)) DT$Channel.131N <- DT.raw$`131N`
  if ("131C" %in% colnames(DT.raw)) DT$Channel.131C <- DT.raw$`131C`
  if ("131" %in% colnames(DT.raw)) DT$Channel.131 <- DT.raw$`131`
  rm(DT.raw)

  # fold out shared peptides
  DT[, row := seq_len(nrow(DT))]
  DT <- merge(DT, DT[, list(Group = unlist(strsplit(Groups, ";"))), by = row], by = "row", sort = F)
  DT[, Group := trimws(Group)]
  DT[, Groups := NULL]
  DT[, row := NULL]

  # we can get a spectrum assigned to multiple peptides in a protein - if this occurs, assign the spectrum as a 'new' ambiguous peptide
  DT[, Component := paste(sort(as.character(Component)), collapse = " "), by = .(Group, Measurement, Use)]
  DT <- unique(DT)

  # melt label counts
  DT[, GroupInfo := factor(GroupInfo)]
  DT[, Group := factor(Group)]
  DT[, Component := factor(Component)]
  DT[, Measurement := factor(Measurement)]
  DT[, Run := factor("1")]
  DT[, Injection := factor(Injection)]
  DT <- melt(DT, variable.name = "Channel", value.name = "Count", measure.vars = colnames(DT)[grep("^Channel\\.", colnames(DT))])
  levels(DT$Channel) <- sub("^Channel\\.", "", levels(DT$Channel))

  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run", "Channel", "Count"))
  setorder(DT, Group, Component, Measurement, Run, Channel)

  setattr(DT, "group", c("ProteinGroup", "ProteinGroups"))
  setattr(DT, "component", c("Peptidoform", "Peptidoforms"))
  setattr(DT, "measurement", c("Spectrum", "Spectra"))

  setDF(DT)
  return(DT[])
}


#' Import SCIEX ProteinPilot data
#'
#' Reads in a SCIEX ProteinPilot \code{PeptideSummary.txt} file for processing with \link{seaMass_sigma}.
#'
#' @param file Location of the \code{PeptideSummary.txt} file.
#' @param shared Include shared peptides?
#' @param min.conf Measurements with peptide ID confidence less than \code{min.conf} (between 0 - 100) are filtered out. The default
#'   \code{"auto"} uses the ProteinPilot default threshold.
#' @param filter Other filters to use, which can include \code{"discordant peptide type"}, \code{"no iTRAQ" and
#'   \code{"weak signal"}}
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{seaMass_sigma}.
#' @import data.table
#' @export
import_ProteinPilot <- function(
  file = NULL,
  min.conf = "auto",
  use.decoys = FALSE,
  use.shared.peptides = FALSE,
  use.discordant.peptide.type = FALSE,
  use.no.itraq = FALSE,
  use.weak.signal = FALSE,
  data = NULL
) {
  if (is.null(file)) {
    if (is.null(data)) stop("One of 'data' or 'file' needs to be specified.")
    DT.raw <- setDT.raw(data)
  } else {
    DT.raw <- fread(file = file, showProgress = T)
  }

  # filtering
  DT.raw[, Use := T]
  if (min.conf == "auto") min.conf <- DT.raw[Annotation == "auto", min(Conf)]
  DT.raw[Conf < min.conf, Use := F]
  if (!use.decoys) DT.raw[grepl("^RRRRR.*", DT.raw$Accessions), Use := F]
  if (!use.shared.peptides) DT.raw[Annotation == "auto - shared MS/MS", Use := F]
  if (!use.discordant.peptide.type) DT.raw[Annotation == "auto - discordant peptide type", Use := F]
  if (!use.no.itraq) DT.raw[Annotation == "auto - no iTRAQ", Use := F]
  if (!use.weak.signal) DT.raw[Annotation == "no quant - weak signal", Use := F]

  # create wide data table
  if(!("ProteinModifications" %in% colnames(DT.raw))) DT.raw[, ProteinModifications := ""]
  DT <- DT.raw[, .(
    Group = gsub(";", "", Accessions),
    GroupInfo = paste0("[", N, "] ", Names),
    Component = gsub(" ", "", paste0(Sequence, ",", Modifications, ",", ProteinModifications, ",", Cleavages), fixed = T),
    Measurement = Spectrum,
    Injection = as.integer(matrix(unlist(strsplit(as.character(DT.raw$Spectrum), ".", fixed = T)), ncol = 5, byrow = T)[, 1]),
    Use
  )]
  if("Area 113" %in% colnames(DT.raw)) DT$Channel.113 <- DT.raw$`Area 113`
  if("Area 114" %in% colnames(DT.raw)) DT$Channel.114 <- DT.raw$`Area 114`
  if("Area 115" %in% colnames(DT.raw)) DT$Channel.115 <- DT.raw$`Area 115`
  if("Area 116" %in% colnames(DT.raw)) DT$Channel.116 <- DT.raw$`Area 116`
  if("Area 117" %in% colnames(DT.raw)) DT$Channel.117 <- DT.raw$`Area 117`
  if("Area 118" %in% colnames(DT.raw)) DT$Channel.118 <- DT.raw$`Area 118`
  if("Area 119" %in% colnames(DT.raw)) DT$Channel.119 <- DT.raw$`Area 119`
  if("Area 121" %in% colnames(DT.raw)) DT$Channel.121 <- DT.raw$`Area 121`
  rm(DT.raw)

  # we can get a spectrum assigned to multiple peptides in a protein - if this occurs, assign the spectrum as a 'new' ambiguous peptide
  DT[, Component := paste(sort(as.character(Component)), collapse = " "), by = .(Group, Measurement, Use)]
  DT <- unique(DT)

  # melt
  DT[, GroupInfo := factor(GroupInfo)]
  DT[, Group := factor(Group)]
  DT[, Component := factor(Component)]
  DT[, Measurement := factor(Measurement)]
  DT[, Run := factor("1")]
  DT[, Injection := factor(Injection)]
  DT <- melt(DT, variable.name = "Channel", value.name = "Count", measure.vars = colnames(DT)[grep("^Channel\\.", colnames(DT))])
  levels(DT$Channel) <- sub("^Channel\\.", "", levels(DT$Channel))

  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Injection", "Run", "Channel", "Count", "Use"))
  setorder(DT, Group, Component, Measurement, Injection, Run, Channel)

  setattr(DT, "group", c("ProteinGroup", "ProteinGroups"))
  setattr(DT, "component", c("Peptidoform", "Peptidoforms"))
  setattr(DT, "measurement", c("Spectrum", "Spectra"))

  setDF(DT)
  return(DT[])
}


#' Import OpenSWATH data
#'
#' Reads in the output of an OpenSWATH -> PyProphet -> TRIC pipeline.
#'
#' @param files A \code{csv} file to import.
#' @param max.m_score Include only measurements with PyProphet m_score >= than this?
#' @param data Advanced: Rather than specifying a \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{seaMass_sigma}.
#' @import data.table
#' @export
import_OpenSWATH <- function(
  file = NULL,
  max.m_score = 0.05,
  use.shared.peptides = FALSE,
  use.decoys = FALSE,
  data = NULL
) {
  if (is.null(file) && is.null(data)) stop("One of 'data' or 'file' needs to be specified.")
  if (!is.null(data)) file <- data

  DT <- rbindlist(lapply(file, function(f) {
    if (is.data.frame(f)) {
      DT <- as.data.file(data)
    } else {
      DT <- fread(file = f, showProgress = T)
    }

    # fold out shared peptides and mark if use shared
    DT[, row := seq_len(nrow(DT))]
    DT[, ProteinName := sub("^[0-9]+/", "", ProteinName)]
    DT.groups <- DT[, list(Group = unlist(strsplit(ProteinName, "/"))), by = row]
    DT.groups[, Group := trimws(Group)]
    DT.groups[, Use := T]
    if (!use.decoys) DT.groups[grep("^reverse_", Group), Use := F]
    if (!use.shared.peptides) DT.groups[Use == T, Use := sum(Use) == 1, by = row]
    DT <- merge(DT, DT.groups, by = "row", sort = F)
    DT[, row := NULL]

    # other filters
    DT[m_score > max.m_score, Use := F]
    if (!use.decoys) DT[decoy != 0, Use := F]

    # create long data table
    DT <- DT[, .(
      Group,
      GroupInfo = factor(""),
      Component = FullPeptideName,
      Measurement = gsub(";", ";bayesprot;", aggr_Fragment_Annotation),
      Run = paste(run_id, tools::file_path_sans_ext(basename(filename)), sep = ";"),
      Count = gsub(";", ";bayesprot;", aggr_Peak_Area),
      Use
    )]
    DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ";bayesprot;", fixed = T)))]
    DT[, Group := sub("^1/", "", Group)]
    DT[, Group := factor(Group, levels = unique(Group))]
    DT[, Component := factor(Component, levels = unique(Component))]
    DT[, Measurement := factor(Measurement, levels = unique(Measurement))]
    DT[, Run := factor(Run, levels = unique(Run))]
    DT[, Count := as.numeric(Count)]
    DT[, Use := as.logical(Use)]

    return(DT)
  }))


  # remove measurements that have more than one identification in any assay
  DT[, N := .N, by = .(Measurement, Run)]
  DT[N != 1, Use := F]
  DT[, N := NULL]
  DT[, GroupInfo := ""]
  DT[, Channel := factor("1")]

  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run", "Channel"))
  setorder(DT, Group, Component, Measurement, Run, Channel)

  setattr(DT, "group", c("ProteinGroup", "ProteinGroups"))
  setattr(DT, "component", c("Peptidoform", "Peptidoforms"))
  setattr(DT, "measurement", c("Transition", "Transitions"))

  setDF(DT)
  return(DT[])
}

#' Import MaxQuant LF data
#'
#' Reads in MaxQuant \code{evidence.txt} and \code{proteinGroups.txt}.
#'
#' @param proteinGroups.file Location of the \code{proteinGroups.txt} file.
#' @param evidence.file Location of the \code{evidence.txt} file.
#' @param shared Include shared peptides?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{bayesprot}.
#' @import data.table
#' @export
import_MaxQuant <- function(
  proteinGroups.file = NULL,
  evidence.file = NULL,
  use.decoys = FALSE,
  use.shared.peptides = FALSE,
  use.only.identified.by.site = FALSE,
  use.potential.contaminant = FALSE,
  proteinGroups.data = NULL,
  evidence.data = NULL
) {
  warning("import_MaxQuant currently only supports labelfree data with no fractionation")

  # load groups
  if (is.null(proteinGroups.file)) {
    if (is.null(proteinGroups.data)) stop("One of 'proteinGroups.data' or 'proteinGroups.file' needs to be specified.")
    DT.groups <- setDT(proteinGroups.data)
  } else {
    DT.groups <- fread(file = proteinGroups.file, showProgress = T)
  }

  # make sure we keep ALL groups, components and measurements for reference
  DT.groups[, Group := factor(`Protein IDs`, levels = unique(`Protein IDs`))]
  DT.groups[, Use := T]

  # filter groups
  if (!use.only.identified.by.site) DT.groups[`Only identified by site` == "+", Use := F]
  if (!use.decoys) DT.groups[`Reverse` == "+", Use := F]
  if (!use.potential.contaminant) DT.groups[`Potential contaminant` == "+", Use := F]

  DT.groups <- DT.groups[, .(
    Group,
    GroupInfo = factor(`Fasta headers`, levels = unique(`Fasta headers`)),
    GroupID = as.integer(id),
    Use
  )]

  # load wide raw
  if (is.null(evidence.file)) {
    if (is.null(evidence.data)) stop("One of 'evidence.data' or 'evidence.file' needs to be specified.")
    DT <- setDT(evidence.data)
  } else {
    DT <- fread(file = evidence.file, showProgress = T)
  }

  # make sure we keep ALL components and measurements for reference
  DT <- DT[, .(
    GroupID = as.character(`Protein group IDs`),
    MeasurementID = as.integer(id),
    Component = factor(`Modified sequence`, levels = unique(`Modified sequence`)),
    Measurement = paste0(`Modified sequence`, ",", Charge, "+"),
    Run = factor(Experiment, levels = unique(Experiment)),
    Channel = factor(1),
    Count = as.double(Intensity)
  )]
  DT[, Measurement := factor(Measurement, levels = unique(Measurement))]

  # fold out rows with shared features
  DT[, Use := ifelse(grepl(";", GroupID), use.shared.peptides, T)]
  DT = merge(DT[, !"GroupID"], DT[, {
    groupID = strsplit(GroupID, ";")
    list(MeasurementID = rep(MeasurementID, sapply(groupID, length)), GroupID = as.integer(unlist(groupID)))
  }], by = "MeasurementID", sort = F)

  # merge groups and raw
  DT <- merge(DT.groups, DT, by = "GroupID", sort = F, suffixes = c(".groups",""))
  DT[, GroupInfo := paste0("[", GroupID, "] ", GroupInfo)]
  DT[, GroupInfo := factor(GroupInfo, levels = unique(GroupInfo))]
  DT[, Use := Use & Use.groups]
  DT[, Use.groups := NULL]
  DT[, GroupID := NULL]
  DT[, MeasurementID := NULL]

  # sum multiple datapoints per measurement
  DT <- DT[, .(GroupInfo = GroupInfo[1], is.na = all(is.na(Count)), Count = sum(Count, na.rm = T)), by = .(Group, Component, Measurement, Run, Channel, Use)]
  DT[is.na == T, Count := NA_real_]
  DT[, is.na := NULL]

  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run", "Channel", "Count"))
  setorder(DT, Group, Component, Measurement, Run, Channel)

  setattr(DT, "group", c("ProteinGroup", "ProteinGroups"))
  setattr(DT, "component", c("Peptidoform", "Peptidoforms"))
  setattr(DT, "measurement", c("Feature", "Features"))

  setDF(DT)
  return(DT)
}



























#' Import DIA-NN data
#'
#' Reads in DIA-NN \code{report.tsv} for processing.
#'
#' @param proteinGroups.file Location of the \code{proteinGroups.txt} file.
#' @param evidence.file Location of the \code{evidence.txt} file.
#' @param shared Include shared peptides?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{bayesprot}.
#' @import data.table
#' @export
import_DIANN <- function(
  file = NULL,
  use.shared.peptides = FALSE,
  data = NULL
) {
  stop("todo: not implemented yet")

  # load wide raw
  if (is.null(file)) {
    if (is.null(data)) stop("One of 'data' or 'file' needs to be specified.")
    DT <- setDT(data)
  } else {
    DT <- fread(file = file, showProgress = T)
  }

  setDF(DT)
  return(DT)
}


#' Import Waters Progenesis data
#'
#' Reads in a Waters Progenesis \code{pep_ion_measurements.csv} file.
#'
#' @param file Location of the \code{pep_ion_measurements.csv} file.
#' @param used Include only measurements marked as used by Progenesis?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{seaMass_sigma}.
#' @import data.table
#' @export

import_Progenesis <- function(
  file = NULL,
  used = T,
  data = NULL
) {
  stop("todo: needs updating")
  suppressWarnings(suppressMessages(library(R.oo)))

  if (is.null(file)) {
    if (is.null(data)) stop("One of 'data' or 'file' needs to be specified.")
    DT.raw <- setDT(data)
  } else {
    DT.raw <- fread(file = file, showProgress = T)
  }

  # sort out column names
  for (j in 2:ncol(DT.raw)) {
    if (DT.raw[[j]][1] == "") DT.raw[[j]][1] <- DT.raw[[j-1]][1]
    if (DT.raw[[j]][2] == "") DT.raw[[j]][2] <- DT.raw[[j-1]][2]
  }
  colnames(DT.raw) <- apply(DT.raw[1:3,], 2, function(x) trimws(paste(x[1], x[2], x[3])))
  DT.raw <- DT.raw[4:nrow(DT.raw),]

  # column names for different file types
  #if ("Accession" %in% colnames(DT.raw)) {
    colname.group <- "Accession"
    colname.groupinfo <- "Description"
    colname.sequence <- "Sequence"
    colname.modifications <- "Modifications"

    # only use rows that Progenesis uses for quant
    if (used) DT.raw <- DT.raw[`Use in quantitation` == "True",]
  #} else {
  #  colname.group <- colnames(DT.raw)[grep("Best peptide match Group$", colnames(DT.raw))]
  #  colname.groupinfo <- colnames(DT.raw)[grep("Best peptide match Description$", colnames(DT.raw))]
  #  colname.sequence <- colnames(DT.raw)[grep("Best peptide match Sequence$", colnames(DT.raw))]
  #  colname.modifications <- colnames(DT.raw)[grep("Best peptide match Variable modifications \\(\\[position\\] description\\)$", colnames(DT.raw))]
  #}

  # remove decoys and strange missing Accession
  DT.raw <- DT.raw[get(colname.group) != "",]
  DT.raw <- DT.raw[!grepl("^#DECOY#", get(colname.group)),]

  # create wide data table
  DT <- cbind(DT.raw[, .(
    GroupInfo = get(colname.groupinfo),
    Group = get(colname.group),
    Component = gsub(" ", "", paste0(get(colname.sequence), ",", get(colname.modifications))),
    Measurement = `#`
  )], DT.raw[, .SD, .SDcols = names(DT.raw) %like% "^Raw abundance "])

  # group ambiguous PSMs so seaMass-Delta treats them as a single component per group
  DT[, Component := paste(sort(as.character(Component)), collapse = " "), by = .(Group, Measurement)]
  DT <- unique(DT)

  # melt
  DT[, GroupInfo := factor(GroupInfo)]
  DT[, Group := factor(Group)]
  DT[, Component := factor(Component)]
  DT[, Measurement := factor(Measurement)]
  DT <- melt(DT, variable.name = "Run", value.name = "Count", measure.vars = colnames(DT)[grep("^Raw abundance ", colnames(DT))])
  levels(DT$Run) <- sub("^Raw abundance ", "", levels(DT$Run))
  DT[, Channel := factor("1")]
  DT[, Injection := Run]
  DT[, Count := as.numeric(Count)]

  setDF(DT)
  return(DT[])
}




#' Import MaxQuant LF data
#'
#' Reads in MaxQuant \code{peptides.txt} and \code{proteinGroups.txt}.
#'
#' @param proteinGroups.file Location of the \code{proteinGroups.txt} file.
#' @param peptides.file Location of the \code{peptides.txt} file.
#' @param shared Include shared peptides?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{bayesprot}.
#' @import data.table
#' @export
import_MaxQuant_peptides <- function(
  proteinGroups.file = NULL,
  peptides.file = NULL,
  shared = FALSE,
  proteinGroups.data = NULL,
  peptides.data = NULL
) {
  stop("todo: needs updating")
  # load groups
  if (is.null(proteinGroups.file)) {
    if (is.null(proteinGroups.data)) stop("One of 'proteinGroups.data' or 'proteinGroups.file' needs to be specified.")
    DT.groups <- setDT(proteinGroups.data)
  } else {
    DT.groups <- fread(file = proteinGroups.file, showProgress = T)
  }

  # filter groups
  #DT.groups <- DT.groups[`Only identified by site` != "+",]
  DT.groups <- DT.groups[`Reverse` != "+",]
  DT.groups <- DT.groups[`Potential contaminant` != "+",]
  DT.groups <- DT.groups[, .(Group = `Protein IDs`, GroupInfo = `Fasta headers`, GroupID = id)]

  # load wide raw
  if (is.null(peptides.file)) {
    if (is.null(peptides.data)) stop("One of 'peptides.data' or 'peptides.file' needs to be specified.")
    DT.raw <- setDT(peptides.data)
  } else {
    DT.raw <- fread(file = peptides.file, showProgress = T)
  }

  # filter raw
  DT <- DT.raw[ , .(
    GroupID = `Protein group IDs`,
    ComponentID = id,
    #Component = paste(id, Sequence, sep = ","),
    Component = `Protein group IDs`,
    Measurement = paste(id, Sequence, sep = ",")
  )]
  DT.raw <- DT.raw[, colnames(DT.raw)[grep("^Intensity ", colnames(DT.raw))], with = F]
  colnames(DT.raw) <- sub("^Intensity ", "", colnames(DT.raw))
  DT <- cbind(DT, DT.raw)

  # remove or expand out rows with shared features
  if (shared == F) {
    DT <- DT[!grepl(";", GroupID),]
    DT[, GroupID := as.integer(GroupID)]
  } else {
    DT = merge(DT[, !"GroupID"], DT[, {
      groupID = strsplit(GroupID, ";")
      list(ComponentID = rep(ComponentID, sapply(groupID, length)), GroupID = as.integer(unlist(groupID)))
    }], by = "ComponentID")
  }

  # merge groups and raw
  DT <- merge(DT.groups, DT, by = "GroupID")
  DT[, GroupInfo := paste0("[", GroupID, "] ", GroupInfo)]
  DT[, GroupID := NULL]
  DT[, ComponentID := NULL]

  # melt to long data table and convert zeros to NA
  DT <- melt(DT, variable.name = "Run", value.name = "Count", id.vars = c("Group", "GroupInfo", "Component", "Measurement"))
  DT$Count[DT$Count == 0] <- NA
  DT[, GroupInfo := factor(GroupInfo)]
  DT[, Group := factor(Group)]
  DT[, Component := factor(Component)]
  DT[, Measurement := factor(Measurement)]
  DT[, Run := factor(Run)]
  DT[, Channel := factor("1")]
  DT[, Injection := Run]
  DT[, Count := as.double(Count)]
  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run", "Channel", "Injection"))

  setDF(DT)
  return(DT)
}


#' Import MaxQuant LF data
#'
#' Reads in MaxQuant \code{evidence.txt} and \code{proteinGroups.txt}.
#'
#' @param proteinGroups.file Location of the \code{proteinGroups.txt} file.
#' @param evidence.file Location of the \code{evidence.txt} file.
#' @param shared Include shared peptides?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{bayesprot}.
#' @import data.table
#' @export
import_MaxQuant_evidence0 <- function(
  proteinGroups.file = NULL,
  evidence.file = NULL,
  shared = F,
  rollup = "measurement",
  proteinGroups.data = NULL,
  evidence.data = NULL
) {
  stop("todo: needs updating")
  suppressWarnings(suppressMessages(library(R.oo)))

  if (is.null(evidence.file) && is.null(evidence.data)) stop("One of 'evidence.data' or 'evidence.file' needs to be specified.")
  if (!is.null(evidence.data)) evidence.file <- evidence.data

  if (is.null(proteinGroups.file) && is.null(proteinGroups.data)) stop("One of 'proteinGroups.data' or 'proteinGroups.file' needs to be specified.")
  if (!is.null(proteinGroups.data)) proteinGroups.file <- proteinGroups.data

  #Use proteinGroups.txt and evidence.txt files to match MS/MS IDs to protein groups
  if (is.null(proteinGroups.file)) {
    DT.proteins <- setDT(proteinGroups.data)
  } else {
    DT.proteins <- fread(file = proteinGroups.file, showProgress = T)
  }

  #DT.proteins <- DT.proteins[`Only identified by site` != "+",]
  DT.proteins <- DT.proteins[`Reverse` != "+",]
  DT.proteins <- DT.proteins[`Potential contaminant` != "+",]

  DT.proteins <- unique(DT.proteins[, c("Protein IDs", "Majority protein IDs", "id")])
  DT.proteins[, `Protein group IDs` := id]
  DT.proteins[, id := NULL]

  DT <- rbindlist(lapply(evidence.file, function(f) {
    if (is.data.frame(f)) {
      DT.raw <- as.data.file(evidence.data)
    } else {
      DT.raw <- fread(file = f, showProgress = T)
    }

    # remove decoys and > max.m_score
    #DT.raw <- DT.raw[decoy == 0,]
    #DT.raw <- DT.raw[m_score <= max.m_score,]

    #DT.raw <- DT.raw[`MS/MS IDs` %in% DT.msms[,`MS/MS IDs`],]
    #DT.raw <- DT.raw[`MS/MS IDs` != "",]
    #DT.raw[, `MS/MS IDs` := factor(`MS/MS IDs`)]
    #DT.raw <- merge(DT.raw, DT.msms, by = "MS/MS IDs")
    #DT.msms <- NULL

    #MaxQuant labels unmodified peptides as "Unmodified"
    DT.raw[, Modifications := gsub("Unmodified", "", Modifications)]
    DT.raw[, Measurement := gsub(" ", "", paste0(Sequence, ",", Modifications, ",", Charge))]

    DT.raw[,Shared := grepl(";", `Protein group IDs`)]
    pGroupIDs <- unique(DT.raw[, c("Protein group IDs", "Measurement")])[, .(`Protein group IDs` = tstrsplit(`Protein group IDs`, ";", fixed = T)), by = .(Measurement)]
    DT.raw[, `Protein group IDs` := NULL]
    DT.raw <- merge(DT.raw, pGroupIDs, by = "Measurement")
    DT.raw[, `Protein group IDs` := strtoi(`Protein group IDs`)]

    # only use rows that MaxQuant uses for quant
    if (!shared) DT.raw <- DT.raw[Shared == F,]
    #if (!used) DT.raw <- DT.raw[`Peptide Quan Usage` == "Use",]

    DT.raw <- merge(DT.raw, DT.proteins, by = "Protein group IDs")

    # create long data table
    DT <- DT.raw[, .(
      Group = `Protein IDs`, #From the proteinGroups file
      Component = gsub(" ", "", paste0(Sequence, ",", Modifications)),
      Measurement = Measurement,
      Run = Experiment,
      Count = Intensity
    )]
    #DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ";bayesprot;", fixed = T)))]
    DT[, Count := as.numeric(Count)]
    DT

  }))

  #Do rollup
  if (rollup == "measurement") {
    DT[, Count := sum(Count, na.rm = !all(is.na(Count))), by = .(Run, Measurement)]
    DT <- unique(DT)
  }

  assays <- unique(DT$Run)

  # create wide data table
  DT <- dcast(DT, Group + Component + Measurement ~ Run, value.var = "Count")

  # group ambiguous transitions so seaMass-Delta treats them as a single component per group ## UNNECCESARY?
  DT[, Component := paste(sort(as.character(Component)), collapse = " "), by = .(Group, Measurement)]
  DT <- unique(DT)


  # melt
  DT[, GroupInfo := factor(Group)]
  #DT[, Group := factor(sub("^1/", "", Group))]
  DT[, Group := factor(Group)]
  DT[, Component := factor(Component)]
  DT[, Measurement := factor(Measurement)]
  DT <- melt(DT, variable.name = "Run", value.name = "Count", measure.vars = assays)
  DT[, Injection := Run]
  DT[, Channel := factor("1")]
  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run", "Injection", "Channel"))

  setDF(DT)
  return(DT[])

}


#' Import MaxQuant LF data
#'
#' Reads in MaxQuant \code{evidence.txt} and \code{proteinGroups.txt}.
#'
#' @param proteinGroups.file Location of the \code{proteinGroups.txt} file.
#' @param evidence.file Location of the \code{evidence.txt} file.
#' @param shared Include shared peptides?
#' @param data Advanced: Rather than specifying \code{file}, you can enter a \link{data.frame} preloaded with
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{bayesprot}.
#' @import data.table
#' @export
import_MSqRob <- function(
  fData,
  exprs,
  evidence.file = NULL,
  is.log2 = TRUE,
  evidence.data = NULL
) {
  stop("todo: needs updating")
  if (is.null(evidence.file) & is.null(evidence.data)) {
    DT <- setDT(cbind(fData[, c("Proteins", "Sequence")], exprs))
    DT <- melt(DT, variable.name = "Run", value.name = "Count", id.vars = c("Proteins", "Sequence"))
    setnames(DT, c("Proteins", "Sequence"), c("Group", "Measurement"))
    DT[, Group := factor(Group)]
    DT[, GroupInfo := ""]
    DT[, Component := Group]
    DT[, Measurement := factor(Measurement)]
    DT[, Channel := factor("1")]
    if (is.log2) DT[, Count := 2^Count]
  } else {
    # if MaxQuant evidence file supply, convert from two to three stage hierarchy
    DT <- setDT(fData[, c("Proteins", "Sequence","Evidence.IDs")])
    setnames(DT, "Proteins", "Group")
    DT <- DT[, {
      id = strsplit(as.character(Evidence.IDs), ";")
      list(Group = rep(Group, sapply(id, length)), Sequence = rep(Sequence, sapply(id, length)), id = as.integer(unlist(id)))
    }]

    if (!is.null(evidence.file) | is.null(evidence.data)) {
      if (is.null(evidence.file)) {
        DT.evidence <- setDT(evidence.data)
      } else {
        DT.evidence <- fread(file = evidence.file, showProgress = T)
      }
    }

    DT.evidence <- DT.evidence[, .(
      id,
      GroupInfo = "",
      Component = `Modified sequence`,
      Measurement = paste0(`Modified sequence`, ",", Charge, "+"),
      Fraction,
      Run = Experiment,
      Channel = "",
      Count = Intensity
    )]

    DT <- droplevels(merge(DT, DT.evidence, by = "id")[, !"id"])

    # create wide data table (summing up multiple features per measurement)
    DT <- dcast(DT, Group + GroupInfo + Component + Measurement + Fraction ~ Run, fun.aggregate = sum, value.var = "Count")

    # melt to long data table and convert zeros to NA
    DT <- melt(DT, variable.name = "Run", value.name = "Count", id.vars = c("Group", "GroupInfo", "Component", "Measurement", "Fraction"))
    DT$Count[DT$Count == 0] <- NA
    DT[, GroupInfo := factor(GroupInfo)]
    DT[, Group := factor(Group)]
    DT[, Component := factor(Component)]
    DT[, Measurement := factor(Measurement)]
    DT[, Fraction := factor(Fraction)]
    DT[, Run := factor(Run)]
    DT[, Channel := factor("1")]
    DT[, Count := as.double(Count)]
  }

  setcolorder(DT, c("Group", "GroupInfo", "Component", "Measurement", "Run"))
  setDF(DT)
  return(DT)
}


#' Import data outputed by an MSstats import routine
#'
#' Reads in a set of \code{_with_dscore} datasets processed by OpenSWATH and PyProphet.
#'
#' @param data MSstats output
#'   \link[data.table]{fread} default parameters.
#' @return A \link{data.frame} for input into \link{seaMass_sigma}.
#' @import data.table
#' @export
import_MSstats <- function(data) {
  stop("todo: needs updating")
  DT <- data.table(
    Group = factor(data$ProteinName),
    GroupInfo = "",
    Component = factor(data$PeptideSequence),
    Measurement = factor(data$FragmentIon),
    Run = factor(data$Run, levels = unique(data$Run)),
    Injection = factor(data$Run, levels = unique(data$Run)),
    Channel = factor(data$IsotopeLabelType, levels = unique(data$IsotopeLabelType)),
    Count = as.numeric(data$Intensity)
  )

  setDF(DT)
  return(DT[])
}
biospi/deamass documentation built on May 20, 2023, 3:30 a.m.