#' Peptidome Generator
#' @description Generates a tryptic peptidome of unique peptides using a provided proteome. Requires bioconductor pacakge "cleaver".
#' @param proteome_dir directory to proteome fasta file
#' @param missed_cleavages range of missed cleavage events considered during in-silico digestion
#' @param synth_peps If TRUE, filters out peptides that are problematic for peptide synthesis. Currnetly includes peptides with: N-term Q/E residues and C-term P residues
#' @param aa_range range of peptide size by amino acid count included
#' @return Peptidome of unique peptides with parent protein uniprot IDs, sequence locations in parent proteins, and n missed cleavages
#' @examples
#' \dontrun{
#' create_peptidome(proteome_dir = "./proteome.fasta",
#' missed_cleavages = c(0,2),
#' synth_peps = FALSE,
#' aa_range = c(7-25))
#'}
#'
#' @export
create_peptidome <- function(proteome_dir, missed_cleavages, synth_peps, aa_range){
# proteome_dir: directory to fasta file of all proteins in desired proteome
# missed_cleavages: range of missed cleavages considered in in-silico digestion
# synth_peps: if TRUE, then filters out peptides starting with Q and E
# aa_range: range of number of amino acids in peptides that are filtered for
# ----- Defaults -----
size <- symbol <- uniprot <- start <- end <- NULL
if(missing(missed_cleavages)){
# default missed cleavages
missed_cleavages <- c(0,2)
print('default: 0-2 missed cleavages included in digestion')
}
if(missing(aa_range)){
# default peptide size range
aa_range <- c(6,25)
print('default: 6-25 amino acid range')
}
if(missing(synth_peps)){
# default synth_peps status
synth_peps <- FALSE
#print('default: synthetic peptide constraints not considered')
}
# ----- Digestions -----
# directly accesses fasta file using directory and loads in proteome as large AAStringSet
proteome <- Biostrings::readAAStringSet(file=proteome_dir)
# creates empty peptidome df
peptidome <- data.frame()
### CUSTOM DIGESTION CODE:
# sets cut sites (Trypsin + loss of N-term M)
# cut_sites <- c("^M|([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))", "((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))")
# digestion with custom cut sites
# curr_cleavage <- as.data.frame(cleaver::cleave(proteome, custom = cut_sites, missedCleavages = i, unique = FALSE))
# curr_cleavage <- BiocGenerics::as.data.frame(cleaver::cleave(proteome, enzym = 'trypsin' , missedCleavages = i, unique = FALSE))
cut_sites <- "^M|K|R"
# loops through missed cleavage range
for(i in missed_cleavages[1]:missed_cleavages[2]){
# performs in silico trypsin digestion of proteome yielding peptides with i missed cleavages
curr_cleavage <- BiocGenerics::as.data.frame(cleaver::cleave(proteome, custom = cut_sites, missedCleavages = i, unique = FALSE))
# provides locations of peptides within parent proteins
curr_positions <- BiocGenerics::as.data.frame(cleaver::cleavageRanges(proteome, custom = cut_sites, missedCleavages = i))
# adds peptide start and end locations
curr_cleavage$start <- curr_positions$start
curr_cleavage$end <- curr_positions$end
# adds the number of amino acids in peptides
curr_cleavage$size <- curr_positions$width
# adds the number of missed cleavages that resulted in the peptide
curr_cleavage$missed_cleavages <- i
# adds peptides from i missed cleavages and info to peptidome
peptidome <- rbind(peptidome, curr_cleavage)
}
# ----- Column Renaming -----
# extracts gene symbols from fasta headers
peptidome$symbol <- stringr::str_match(peptidome$group_name,pattern = 'GN=(.+) PE')[,2]
# extracts uniprot IDs from fasta headers
peptidome$uniprot <- stringr::str_match(peptidome$group_name,pattern = 'sp\\|(.+)\\|')[,2]
colnames(peptidome)[3] <- 'sequence'
# ----- Filtering by Sequence Constraints -----
# filters for peptides in the amino acid range specified
peptidome = dplyr::filter(peptidome, size >= aa_range[1], size <= aa_range[2])
# removes peptides with sequences containing "U"
peptidome <- dplyr::filter(peptidome, grepl("U", peptidome$sequence) != TRUE)
# if synthetic peptide parameter = TRUE
if(synth_peps == TRUE){
# remove peptides starting with Q or E
peptidome = peptidome[!grepl('^Q|^E', peptidome$sequence),]
# removes peptides ending with P
peptidome = peptidome[!grepl("P$", peptidome$sequence),]
}
# ----- Removal of Non-Unique Peptides -----
# removes duplicates, leaving only peptides unique to one protein
peptidome = peptidome[!(BiocGenerics::duplicated(peptidome$sequence) | BiocGenerics::duplicated(peptidome$sequence, fromLast = TRUE)),]
# ----- Final Output -----
# selects a subset of columns
peptidome <- dplyr::select(peptidome, symbol, uniprot, missed_cleavages, sequence, start, end, size)
return(peptidome)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.