R/annotateMSdetection.R

Defines functions annotateMSdetection

Documented in annotateMSdetection

#' @title annotateMSdetection
#'
#' @description annotate MS detection of IGSeq clones based on DIA_resultset
#'
#' @param IGSeq_resultset IgSeq resultset to be annotated with MS detection
#' @param DIA_resultset DIA MS resultset based on which to annotate MS detection of the clones in IGSeq_resultset.
#' @param write_table Whether to write .csv table of MS detection. Default: TRUE
#' @param n_prot_uniqueness_threshold Uniqueness threshold to define clones detected with sufficient specificity.
#' This may be useful to quantify clonal families with high homology with a certain degree of sharedness of the detectable peptide set.
#' Defaults to 1, i.e. only peptides mapping to only one of the clones in the context of the current .fasta will be maintained for
#'  unique analysis
#'
#' @return IGSeq_resultset with added element step7_clonesMSdetection with tables $clones_detected
#' and quantitative matrices across the MS runs in the DIA_resultset calculated from all mapped or only unique peptides
#' according to the n_prot_uniqueness threshold set as parameter:
#' $MS_abundance_allprecursors_mean, $MS_abundance_uniqueprecursors_mean.
#' In addition, the study design of the DIA_resultset is used to aggregate and report quantities and frequency of observation
#' (mean and sd of precursor intensities and n_rep) across conditions. There are calculated
#' including only *unique* precursors (note n_prot_uniqueness_threshold parameter to modify this inclusion criterion).
#' The result matrices are stored as list elements in the result object with names:
#'  MS_abundance_uniqueprecursors_mean_cond_rep, MS_abundance_uniqueprecursors_mean_cond,
#'  MS_abundance_uniqueprecursors_sd_cond, MS_detection_uniqueprecursors_nrep_cond,
#'  MS_abundance_uniqueprecursors_mean_long. Further, the study_design from the DIA_resultset is propagated to the resultset in slot
#'  $study_design
#'
#' @import data.table
#'
#' @export

annotateMSdetection = function(IGSeq_resultset = seq_r06_fasta,
                               DIA_resultset = ms_r06,

                               write_table = TRUE,
                               n_prot_uniqueness_threshold = 1){

  message("IGSeq-res: ", deparse(substitute(IGSeq_resultset)))
  message("DIA-res: ", deparse(substitute(DIA_resultset)))
  file_out_name = paste0("MSdetection_", deparse(substitute(IGSeq_resultset)), "_",
                         deparse(substitute(DIA_resultset)), "_nu", n_prot_uniqueness_threshold, ".csv")

  # Create target object name depending on input
  res_list_name = paste0("step7_MS_detection_", deparse(substitute(DIA_resultset)))
  IGSeq_resultset[[res_list_name]] = list()

  # First, subset clone table to those detected in MS at all:
  ms_detected_all = unique(DIA_resultset$data_wide$Protein.Group)
  ms_detected_all_groupsresolved = unique(unlist(lapply(ms_detected_all, function(x){strsplit(x, split = ";")})))
  IGSeq_resultset[[res_list_name]]$clones_detected = IGSeq_resultset$step6_clonestofasta[cloneIdGlobal %in% ms_detected_all_groupsresolved]
  IGSeq_resultset[[res_list_name]]$clones_detected

  # Then, for each of these clones, produce the above metrics and store in appropriate containers
  msdetected_clones = IGSeq_resultset[[res_list_name]]$clones_detected$cloneIdGlobal

  # Target containers
  IGSeq_resultset[[res_list_name]]$MS_abundance_allprecursors_mean = matrix(nrow = length(msdetected_clones), ncol = ncol(DIA_resultset$data_wide)-2)
  IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean = matrix(nrow = length(msdetected_clones), ncol = ncol(DIA_resultset$data_wide)-2)
  rownames(IGSeq_resultset[[res_list_name]]$MS_abundance_allprecursors_mean) = msdetected_clones
  rownames(IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean) = msdetected_clones
  colnames(IGSeq_resultset[[res_list_name]]$MS_abundance_allprecursors_mean) = colnames(DIA_resultset$data_wide)[3:ncol(DIA_resultset$data_wide)]
  colnames(IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean) = colnames(DIA_resultset$data_wide)[3:ncol(DIA_resultset$data_wide)]

  # Run @TODO: Parallelize/refactor to speed up
  pb <- txtProgressBar(max = length(msdetected_clones), style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)

  message("Retrieving MS detection and -quantification information for ", length(msdetected_clones),
          " clones across ", ncol(DIA_resultset$data_wide)-2, " MS runs")

  for (i in seq_along(msdetected_clones)){
    # print(i)
    setTxtProgressBar(pb, i)
    grep_pattern = paste0(msdetected_clones[i], "$|", msdetected_clones[i], ";")
    quant_wide = DIA_resultset$data_wide[grep(grep_pattern , Protein.Group)]
    quant_prec = as.matrix(quant_wide[,3:ncol(quant_wide)])
    # Protein group members and size
    n_proteins = sapply(unique(quant_wide$Protein.Group), FUN = function(x){length(unlist(strsplit(x, split = ";")))})
    proteins = unique(quant_wide$Protein.Group)

    # All precursors
    n_precursors_all = length(unique(quant_wide$Precursor.Id))
    n_peptides_all = length(unique(sapply(quant_wide$Precursor.Id, function(x){substr(x, 1, nchar(x)-1)})))
    precursors_all = paste(unique(quant_wide$Precursor.Id), collapse = ";")
    peptides_all = paste(unique(sapply(quant_wide$Precursor.Id, function(x){substr(x, 1, nchar(x)-1)})), collapse = ";")
    quant_prot_all = colSums(quant_prec, na.rm = TRUE)

    # Unique precursors only
    quant_wide_unique = quant_wide[n_proteins <= n_prot_uniqueness_threshold]
    quant_prec_unique = as.matrix(quant_wide_unique[,3:ncol(quant_wide_unique)])
    n_precursors_unique = length(unique(quant_wide_unique$Precursor.Id))
    n_peptides_unique = length(unique(sapply(quant_wide_unique$Precursor.Id, function(x){substr(x, 1, nchar(x)-1)})))
    precursors_unique = paste(unique(quant_wide_unique$Precursor.Id), collapse = ";")
    peptides_unique = paste(unique(sapply(quant_wide_unique$Precursor.Id, function(x){substr(x, 1, nchar(x)-1)})), collapse = ";")
    quant_prot_unique = colSums(quant_prec_unique, na.rm = TRUE)

    # Write global stats to clones_detected table
    IGSeq_resultset[[res_list_name]]$clones_detected$MSdetection_protein_group_sizes[i] = paste(n_proteins, collapse = ";")
    IGSeq_resultset[[res_list_name]]$clones_detected$MSdetection_protein_groups[i] = paste(proteins, collapse = ":")
    IGSeq_resultset[[res_list_name]]$clones_detected$n_precursors_all[i] = n_precursors_all
    IGSeq_resultset[[res_list_name]]$clones_detected$n_peptides_all[i] = n_peptides_all
    IGSeq_resultset[[res_list_name]]$clones_detected$precursors_all[i] = precursors_all
    IGSeq_resultset[[res_list_name]]$clones_detected$peptides_all[i] = peptides_all
    IGSeq_resultset[[res_list_name]]$clones_detected$n_precursors_unique[i] = n_precursors_unique
    IGSeq_resultset[[res_list_name]]$clones_detected$n_peptides_unique[i] = n_peptides_unique
    IGSeq_resultset[[res_list_name]]$clones_detected$precursors_unique[i] = precursors_unique
    IGSeq_resultset[[res_list_name]]$clones_detected$peptides_unique[i] = peptides_unique

    # Write quantification to quant matrices
    IGSeq_resultset[[res_list_name]]$MS_abundance_allprecursors_mean[i,] = quant_prot_all
    IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean[i,] = quant_prot_unique

  }
  close(pb)

  ## aggregate by study design
  MS_abundance_uniqueprecursors_mean_long = reshape2::melt(IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean,
                                                           value.name = "MS_abundance_uniqueprecursors_mean")
  names(MS_abundance_uniqueprecursors_mean_long)[1:2] = c("cloneIdGlobal", "filename")
  MS_abundance_uniqueprecursors_mean_long = merge(MS_abundance_uniqueprecursors_mean_long, DIA_resultset$study_design,
                                                  by = "filename")
  MS_abundance_uniqueprecursors_mean_long = as.data.table(MS_abundance_uniqueprecursors_mean_long)
  MS_abundance_uniqueprecursors_mean_long[, nrep := 0]
  MS_abundance_uniqueprecursors_mean_long[MS_abundance_uniqueprecursors_mean > 0, nrep:=length(unique(replicate)), .(cloneIdGlobal,condition)]

  # Additional matrices + table:
  # MS_abundance per condition and replicate
  MS_abundance_uniqueprecursors_mean_cond_rep = dcast(MS_abundance_uniqueprecursors_mean_long, cloneIdGlobal~condition+replicate,
                                                      value.var = "MS_abundance_uniqueprecursors_mean")
  MS_abundance_uniqueprecursors_mean_cond_rep.m = as.matrix(MS_abundance_uniqueprecursors_mean_cond_rep[, 2:ncol(MS_abundance_uniqueprecursors_mean_cond_rep)])
  row.names(MS_abundance_uniqueprecursors_mean_cond_rep.m) = MS_abundance_uniqueprecursors_mean_cond_rep$cloneIdGlobal

  # MS abundance per condition
  MS_abundance_uniqueprecursors_mean_cond = dcast(MS_abundance_uniqueprecursors_mean_long, cloneIdGlobal~condition,
                                                  value.var = "MS_abundance_uniqueprecursors_mean", fun.aggregate = mean, fill = 0)
  MS_abundance_uniqueprecursors_mean_cond.m = as.matrix(MS_abundance_uniqueprecursors_mean_cond[, 2:ncol(MS_abundance_uniqueprecursors_mean_cond)])
  row.names(MS_abundance_uniqueprecursors_mean_cond.m) = MS_abundance_uniqueprecursors_mean_cond$cloneIdGlobal

  # MS abundance standard deviation per condition
  MS_abundance_uniqueprecursors_sd_cond = dcast(MS_abundance_uniqueprecursors_mean_long, cloneIdGlobal~condition,
                                                  value.var = "MS_abundance_uniqueprecursors_mean", fun.aggregate = sd, fill = 0)
  MS_abundance_uniqueprecursors_sd_cond.m = as.matrix(MS_abundance_uniqueprecursors_sd_cond[, 2:ncol(MS_abundance_uniqueprecursors_sd_cond)])
  row.names(MS_abundance_uniqueprecursors_sd_cond.m) = MS_abundance_uniqueprecursors_sd_cond$cloneIdGlobal


  # MS detection (# of replicates) per condition
  MS_detection_uniqueprecursors_nrep_cond = dcast(MS_abundance_uniqueprecursors_mean_long, cloneIdGlobal~condition, value.var = "nrep",
                                                  fun.aggregate = unique, fill = 0)
  MS_detection_uniqueprecursors_nrep_cond.m = as.matrix(MS_detection_uniqueprecursors_nrep_cond[, 2:ncol(MS_detection_uniqueprecursors_nrep_cond)])
  row.names(MS_detection_uniqueprecursors_nrep_cond.m) = MS_detection_uniqueprecursors_nrep_cond$cloneIdGlobal

  # MS_abundance with these metrics in long format

  # write to result object:
  IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean_cond_rep = MS_abundance_uniqueprecursors_mean_cond_rep.m
  IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean_cond = MS_abundance_uniqueprecursors_mean_cond.m
  IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_sd_cond = MS_abundance_uniqueprecursors_sd_cond.m
  IGSeq_resultset[[res_list_name]]$MS_detection_uniqueprecursors_nrep_cond = MS_detection_uniqueprecursors_nrep_cond.m
  IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean_long = MS_abundance_uniqueprecursors_mean_long
  IGSeq_resultset[[res_list_name]]$study_design = DIA_resultset$study_design

  if (write_table){
    # Assemble & write combined table from MSannotation results
    table2 = IGSeq_resultset[[res_list_name]]$MS_abundance_allprecursors_mean
    colnames(table2) = paste("MS_abundance_allprecursors_mean",colnames(table2))

    table3 = IGSeq_resultset[[res_list_name]]$MS_abundance_uniqueprecursors_mean
    colnames(table3) = paste("MS_abundance_uniqueprecursors_mean", colnames(table3))

    table4 = MS_abundance_uniqueprecursors_mean_cond.m
    colnames(table4) = paste("MS_abundance_uniqueprecursors_mean_cond", colnames(table4))

    table5 = MS_abundance_uniqueprecursors_sd_cond.m
    colnames(table5) = paste("MS_abundance_uniqueprecursors_sd_cond", colnames(table5))

    table6 = MS_detection_uniqueprecursors_nrep_cond.m
    colnames(table6) = paste("MS_detection_uniqueprecursors_nrep_cond", colnames(table6))

    clone_detection_and_quant = cbind(IGSeq_resultset[[res_list_name]]$clones_detected,
                                      table2,table3,table4,table5,table6)
    fwrite(clone_detection_and_quant,
           file = file_out_name)

  }

  return(IGSeq_resultset)
}
heuselm/igseqr documentation built on March 19, 2022, 7:28 p.m.