R/FormatData.R

Defines functions read_maxquant_mod_specific_peptides count_maxquant_phospho_sites read_maxquant_phospho_sites format_spectrum_mill format_mz_tab read_qc

# READ AND FORMAT DATA  ------------------------------------------------------------------------------------------------

## yooo heres the path: my.peptide.data <- read_maxquant("~/Box/CellBio-GoldfarbLab/Users/Ria Jasuja/modificationSpecificPeptides.txt", "TMT10-K", "TMT10-Nterm", c("Acetyl (Protein N-term)"), "Phospho (STY)")

# INPUT: path to MaxQuant's evidence.txt file and modification names
#
# OUTPUT: internal peptide-level formatted data
#
# Reads MaxQuant's "Evidence.txt" file and converts it into our internal QC format.
#
# Returns a tibble with columns for the number of possible and observed TMT labels.
#
# If TMT.N.mod and TMT.K.mod are NA, then assume that this was not searched with variable TMT mods.
# The values of the internal QC columns for those label shouldalso be NA.
#
# If only one of the TMT.N and TMT.K columns are NA, then throw a warning.
# The values of the internal QC column for the NA label should also be NA.
#
# Removes Reverses and Potential contaminants
#
read_maxquant_mod_specific_peptides <- function(path,
                           TMT_K_mod = "TMT10 (K)",
                           TMT_N_mod = "TMT10 (N-term)",
                           N_term_blocking_mods = c("Acetyl (Protein N-term)"),
                           K_blocking_mods = c(),
                           phospho_mod = "Phospho (STY)")
{
  data <- read_tsv(path)

  #select the columns that contain "Experiment " (experiment and a space)
  experiments <- grep("Experiment ", colnames(data), ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE, value = TRUE)

  #organize table a bit to make it easier to work with
  filtered.data <- select(data, "Sequence","Modifications", "Missed cleavages", TMT_N_mod, TMT_K_mod, N_term_blocking_mods, phospho_mod, experiments)
  filtered.data$Total_Nterm_Mods <- rowSums(select(data, N_term_blocking_mods))
  filtered.data <- filtered.data %>% rename("TMT10-K" = `TMT_K_mod`, "TMT10-Nterm" = `TMT_N_mod`, "N-term Modifications" = `Total_Nterm_Mods`, "Phospho (STY)" = `phospho_mod`)

  #compute expected tags (amount of K and N-term) and observed tags (the number of hits in the TMT columns) for each row and add them to their own columns
  filtered.data$expected_lysine <- str_count(filtered.data$Sequence, "K")
  filtered.data$detected_lysine <- filtered.data$"TMT10-K"

  filtered.data$expected_nterm <- str_count(filtered.data$"N-term Modifications", "0")
  filtered.data$detected_nterm <- filtered.data$"TMT10-Nterm"

  # calculate total expected.tags and detected.tags for each column either this way or by adding the values of the columns calculated above
  filtered.data$expected_tags <- str_count(filtered.data$Sequence, "K") + str_count(filtered.data$"N-term Modifications", "0")
  filtered.data$detected_tags <- filtered.data$"TMT10-K" + filtered.data$"TMT10-Nterm"

  #create a column that quantifies the efficiency of the labels
  #if the expected = detected, it is fully labelled,
  #if expected > detected, its partially labelled
  #if expected = 0, its completely unlabeled

  filtered.data$labeling_efficiency <- filtered.data$expected_tags - filtered.data$detected_tags

  filtered.data$labeling_efficiency[filtered.data$expected_tags - filtered.data$detected_tags == 0 & filtered.data$expected_tags > 0] <- "Fully Labeled"
  filtered.data$labeling_efficiency[filtered.data$expected_tags - filtered.data$detected_tags > 0] <- "Partially Labeled"
  filtered.data$labeling_efficiency[filtered.data$expected_tags > 0 & filtered.data$detected_tags == 0] <- "Unlabeled"
  filtered.data$labeling_efficiency[filtered.data$expected_tags == 0] <- "No Sites Available"
  filtered.data$labeling_efficiency[filtered.data$detected_tags - filtered.data$expected_tags > 0] <- "Overlabeled"

  #if there are more detected than expected, throw a warning to user

  for (row in 1:nrow(filtered.data))
  {
    #put a warning for "overlabeling" - not sure about this
    if(filtered.data[row, "detected_tags"] - filtered.data[row, "expected_tags"] > 0)
    {
      warning('This row has more detected tags than expected tags (overlabeled)')
    }

  }
  return(filtered.data)
}

#COUNT HOW MANY PHOSPHO SITES THERE ARE PER PEPTIDE
#data <- count_maxquant_phospho_sites("~/Box/CellBio-GoldfarbLab/Users/Ria Jasuja/modificationSpecificPeptides.txt", "Phospho (STY)")
count_maxquant_phospho_sites <- function(path,
                                         phospho_mod = "Phospho (STY)")
{
  data <- read_tsv(path)
  filtered.data <- select(data, "Sequence", "Modifications", "Phospho (STY)")
}

#COUNT HOW MANY STYS THERE ARE OVER 75% CONFIDENCE
# my.phospho.data <- read_maxquant_phospho_sites("~/Box/CellBio-GoldfarbLab/Users/Ria Jasuja/Phospho (STY)Sites.txt", "Localization prob", "Amino acid")
read_maxquant_phospho_sites <- function(path,
                                        localization_prob = "Localization prob",
                                        amino_acid = "Amino acid")
{
  data <- read_tsv(path)

  fractions <- grep("Localization prob ", colnames(data), ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE, value = TRUE)

  filtered.data <- select(data,"Proteins", localization_prob, amino_acid, fractions)
  filtered.data <- filtered.data %>% rename("Localization prob" = `localization_prob`, "Amino acid" = `amino_acid`)


}
#
format_spectrum_mill <- function(path,
                                 TMT_N_mod,
                                 TMT_K_mod,
                                 N_term_blocking_mods,
                                 K_blocking_mods,
                                 phospho_mod)
{

}

#
format_mz_tab <- function(path,
                          TMT_N_mod,
                          TMT_K_mod,
                          N_term_blocking_mods,
                          K_blocking_mods,
                          phospho_mod)
{

}

#
read_qc <- function(path)
{

}
GoldfarbLab/CPTACQC documentation built on Nov. 20, 2019, 6:31 a.m.