R/pkg_peptide_prioritizer.R

Defines functions prioritize_peptides

Documented in prioritize_peptides

#' Peptide Prioritizer
#' @description Selects the top n peptides for a list of protein uniprot IDs given a set of priorities
#' @param uniprot_list a list of uniprot IDs for proteins of interest
#' @param max_n max number of peptides selected for each protein
#' @param peptidome list of all possible peptides for relevant proteome, their uniprot IDs, and the values for their non-RF priorities
#' @param prediction_model RF model used to predict scores for peptide detectability
#' @param priorities list of priority column names in peptidome
#' @param priority_thresholds list of minimum values that priorities must reach to be considered
#' @return list of n prioritized peptides for proteins of interest
#' @importFrom dplyr select arrange filter desc
#' @importFrom stats na.omit
#' @import Peptides
#'
#' @examples
#' \dontrun{
#' prioritize_peptides(uniprot_list = c("Q9NQ94", "P04217", "A8K2U0"),
#'                     peptidome = SwissProt2018_peptidome,
#'                     prediction_model = CPTAC_RFmodel,
#'                     priorities = "RF_score",
#'                     priority_thresholds = 0)
#'
#'
#' CPTAC_peptidome <- peptides_inReference(peptidome = SwissProt2018_peptidome,
#'                                         reference_name = "CPTAC",
#'                                         pep_reference = CPTAC_exp_counts,
#'                                         exp_counts_col = "n_obs_pep",
#'                                         detection_freq = TRUE)
#'
#' prioritize_peptides(uniprot_list = c("Q3T906", "P04217", "A8K2U0"),
#'                     max_n = 10, peptidome = CPTAC_peptidome,
#'                     prediction_model = RFmodel_CPTAC,
#'                     priorities = c("CPTAC_freq","RF_score"),
#'                     priority_thresholds = c(0.8, 0))
#'}
#'
#'
#' @export



prioritize_peptides <- function(uniprot_list, max_n, peptidome, prediction_model, priorities, priority_thresholds){
  # peptidome: list of all possible peptides (minimum cols: "sequences", "uniprot")
  # uniprot_list: list of uniprot IDs for proteins of interest
  # max_n: max number of peptides prioritized per protein in final output
  # prediction_model: RF model used to predict/rank peptide observability
  # priorities: list of column names in peptidome to consider ahead of RF scores during prioritization
  # priority_thresholds: list of value cutoffs or booleans for priorities


  # ----- Errors and Defaults -----

  missed_cleavages <- uniprot <- NULL


  if(missing(peptidome)){

    # default peptidome if none specified
    peptidome <- PeptideRanger::SwissProt2018_peptidome

    print('default peptidome: 2018 SwissProt Trypsin digest, 0-2 MCs, 6-25aa, unique to 1 protein')
  }



  if(missing(max_n)){

    # default max num of peptides if none specified
    max_n <- 5
    print(paste("default: max of", max_n, "peptides prioritized per protein"))
  }



  if(missing(prediction_model) & sum(grepl("RF_score", priorities)) != 0){

    # need model if "RF_score" is in priorities
    stop("Must set a prediction_model if RF_score used as a priority")

  }



  if(missing(prediction_model) & sum(grepl("RF_score", priorities)) == 0){

    # no model used if not specified and "RF_score" not in priorities
    prediction_model <- NA

  }



  if(sum(c("size", "missed_cleavages") %in% priorities) > 0){

    stop("size and missed_cleavages are not valid priorities")

  }



  if(length(priorities) != length(priority_thresholds)){

    # num of thresholds must match num of priorities
    stop("number of priorities does not match number of thresholds")

  }



  # ----- Subset Relevant Peptidome -----

  # removes duplicates from uniprot ID list
  uniprot_list <- unique(as.character(uniprot_list))

  # creates a subset of peptidome to only include peptides from proteins in the uniprot_list
  peps_of_interest <- peptidome[peptidome$uniprot %in% uniprot_list,]



  # ----- Random Forest Predictions -----

  if(isTRUE("RF_score" %in% priorities)){

    # creates a new df of the subset, only including peptide sequences and missed cleavages
    pc_descriptors <- dplyr::select(peps_of_interest, sequence, missed_cleavages)

    # calculates relevant physiochemical descriptors for model predictions
    pc_descriptors <- calculate_pc_descriptors(pc_descriptors = pc_descriptors)

    # RF uses peptide descriptors to predict detectability
    # adds predictions to column in list of peptides and corresponding parent protein IDs
    peps_of_interest$RF_score <- stats::predict(prediction_model,pc_descriptors)

  }



  # ----- Peptide Prioritization -----

  # creates empty df for final prioritized peptides
  peptide_predictions <- data.frame()

  # loop goes through each prot ID in uniprot_list
  for(i in 1:length(uniprot_list)){

    # filters for all peptides from current prot
    curr_peps <- dplyr::filter(peps_of_interest, uniprot == uniprot_list[i])
    # creates a df to add prioritized peptides to from current protein - meant for addition to peptide_predictions
    peps_toAdd <- data.frame()


    # loop through priorities
    for(k in 1:length(priorities)){

      # sets current priority
      curr_priority <- as.character(priorities[k])
      # sets corresponding threshold
      curr_threshold <- priority_thresholds[k]

      # if threshold = boolean
      if(isTRUE(curr_threshold) == TRUE | isFALSE(curr_threshold) == TRUE){

        # filters for peptides from curr_peps that have value = threshold in boolean curr_priority column
        priority_peps <- dplyr::filter(curr_peps, curr_peps[[curr_priority]] == curr_threshold)

        # adds max_n peptides from filtered list to df of peptides to add to final prioritized list
        peps_toAdd <- BiocGenerics::rbind(peps_toAdd, priority_peps[1:(max_n-nrow(peps_toAdd)),])
        # removes empty rows: appear if < max_n peptides in filtered list
        peps_toAdd <- na.omit(peps_toAdd)

      # if threshold != boolean (should be numeric)
      }else{

        # filters for peptides from curr_peps that have value >= threshold in numeric curr_priority column
        priority_peps <- dplyr::filter(curr_peps, curr_peps[[curr_priority]] >= curr_threshold)
        # arranges peptides in descending order by value in the curr_priority column
        priority_peps <- dplyr::arrange(priority_peps, dplyr::desc(priority_peps[[curr_priority]]))

        # adds peptides in the top max_n rows (minus nrow peptides_toAdd) in the ordered list
        peps_toAdd <- BiocGenerics::rbind(peps_toAdd, priority_peps[1:(max_n-nrow(peps_toAdd)),])
        # removes empty rows: appear if < max_n peptides in filtered list
        peps_toAdd <- na.omit(peps_toAdd)


      }

      # after selecting peptides for each priority, removes selected peptides from curr_peps (the current protein's pool)
      curr_peps <- curr_peps[(curr_peps$sequence %in% peps_toAdd$sequence) == FALSE,]

      # if list of pepitdes for current protein is already = max_n, break priority loop and go to next protein
      if(nrow(peps_toAdd) == max_n){

        break

      }

    } # end of priority loop

    # adds list of up to max_n prioritized peptides to peptide_predictions
    peptide_predictions = BiocGenerics::rbind(peptide_predictions, peps_toAdd)

  } # end of protein loop


  # returns peptide_predictions
  return(peptide_predictions)

}
rr-2/PeptideRanger documentation built on May 27, 2023, 4:43 p.m.