#' Peptide Reference Merger
#' @description Matches peptide reference information with peptidome sequences
#' @param peptidome list of all possible peptides for proteins of interest and their uniprot IDs
#' @param pep_reference dataframe with a "sequence" column of peptides and a column of n observations in a reference
#' @param reference_name the name of the reference, which is used to label the output columns
#' @param exp_counts_col the name of the reference column containing the n of peptide observations
#' @param detection_freq If TRUE, calculates the each peptide's frequency of appearance in the reference
#' @param detection_ratio If TRUE, calculates ratio of peptide observations to parent protein observations
#' @return dataframe of peptidome with reference information
#' @importFrom dplyr left_join summarise group_by %>%
#'
#' @examples
#' \dontrun{
#'
#' peptides_inReference(peptidome = SwissProt2018_peptidome,
#' pep_reference = tm_peptides,
#' reference_name = "transmembrane")
#'
#' peptides_inReference(peptidome = SwissProt2018_peptidome,
#' pep_reference = CPTAC_exp_counts,
#' reference_name = "CPTAC",
#' exp_counts_col = "n_obs_pep",
#' detection_freq = TRUE,
#' detection_ratio = TRUE)
#'}
#'
#'
#' @export
peptides_inReference <- function(peptidome, pep_reference, reference_name, exp_counts_col, detection_freq, detection_ratio){
# peptidome: list of all possible peptides (minimum cols: "sequences")
# pep_reference: list of reference peptides (minimum cols: "sequences", "n_obs_pep" (if non boolean))
# reference_name: name of reference (used for resulting column names)
# exp_counts_col: name of n_obs_pep column
# detection_freq: If TRUE, adds column of num experiments peptide was observed in divided by total num of experiments in reference
# detection_ratio: If TRUE adds column of num experiments peptide was observed in divded by num experiments parent protein was observed in (also adds n_obs_prot column)
uniprot <- NULL
# ----- Defaults -----
if(missing(exp_counts_col)){
# default: no column for n_obs_pep if none specified
exp_counts_col <- NA
}
if(missing(detection_ratio)){
# default: no comparison if none specified
detection_ratio <- FALSE
}
if(missing(detection_freq)){
detection_freq <- FALSE
}
# ----- Check if Peptide in Reference -----
# if n_obs_pep column not present, all peptidome peptides found in reference labeled TRUE
if(is.na(exp_counts_col)==TRUE){
# creates a col in peptidome with the reference name and labels all peptides FALSE
peptidome[[reference_name]] <- FALSE
# if peptide is found in reference, col = TRUE
peptidome[[reference_name]][peptidome$sequence %in% pep_reference$sequence] <- TRUE
}else{
# name of n_obs_pep col with reference name added
obs_pep_colname <- paste0(reference_name, "_n_obs_pep")
# if n_obs_pep col already present in peptidome for current reference, skips addition
if(obs_pep_colname %in% colnames(peptidome) == FALSE){
# updates n_obs_pep col name in reference df before joining
colnames(pep_reference)[BiocGenerics::grep(exp_counts_col, colnames(pep_reference))] <- obs_pep_colname
# adds num of peptide observations to peptidome
peptidome <- dplyr::left_join(peptidome, pep_reference, by = "sequence")
# replaces NA values in num of observations with 0
peptidome[[obs_pep_colname]][is.na(peptidome[[obs_pep_colname]])] <- 0
}
# ----- Calculate Frequency of Appearance -----
if(isTRUE(detection_freq)){
# largest n of experiments in reference that any peptide has been observed in
max_n_obs <- max(peptidome[[obs_pep_colname]])
# name of freq col for peptidome
freq_colname <- paste0(reference_name, "_freq")
# calculates proportion of experiments in reference that peptidome peptides were observed in
peptidome[,freq_colname] <- peptidome[,obs_pep_colname]/max_n_obs
}
# ----- Calculate Peptide Detectability Ratio -----
if(isTRUE(detection_ratio)){
# determines largest num of experiments in reference that each uniprot ID/protein was observed in
prot_nums <- dplyr::group_by(peptidome, uniprot) %>% dplyr::summarise(n_obs_prot = max(!!rlang::sym(obs_pep_colname)))
# name of n_obs_prot col for peptidome
obs_prot_colname <- paste0(reference_name, "_n_obs_prot")
colnames(prot_nums)[2] <- obs_prot_colname
# matches num of experiments in reference that proteins were observed in to uniprot IDs
peptidome <- dplyr::left_join(peptidome, prot_nums, by = 'uniprot')
# name of ratio col for peptidome
ratio_colname <- paste0(reference_name, "_ratio")
# calculates the ratio of experiments in reference that a peptide was observed in to the experiments that its parent protein was observed in
peptidome[[ratio_colname]] <- peptidome[[obs_pep_colname]]/peptidome[[obs_prot_colname]]
# converts NA's to 0 (result from n_obs_prot = 0)
peptidome[[ratio_colname]][is.na(peptidome[[ratio_colname]])] <- 0
}
}
return(peptidome)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.