#' RF Model Training
#' @description Trains a random forest model using a peptidome with reference information
#' @param peptidome List of all possible peptide sequences for relevant proteome (total_trainingSet = FALSE) OR a list of peptide sequences in a custom training set (total_trainingSet = TRUE). Must also include peptide n of observations (n_obs_pep) and parent protein n of observations (n_obs_prot) in a desired reference, and peptide missed cleavages.
#' @param reference_name Name of reference (same as column names)
#' @param total_trainingSet If TRUE, model will be trained on all peptides in a custom input peptidome. If FALSE, peptides will be selected from whole input peptidome to create a training set of 10k peptides (same method as models provided in package).
#' @return RF model for predicting peptide detectability
#' @importFrom randomForest randomForest
#' @importFrom stats quantile
#' @examples
#' \dontrun{
#' CPTAC_peptidome <- peptides_inReference(peptidome = SwissProt2018_peptidome,
#' reference_name = "CPTAC",
#' pep_reference = CPTAC_exp_counts,
#' exp_counts_col = "n_obs_pep",
#' detection_ratio = TRUE)
#'
#' train_RFmodel(peptidome = CPTAC_peptidome,
#' reference_name = "CPTAC")
#'}
#'
#' @export
train_RFmodel <- function(peptidome, reference_name, total_trainingSet){
# ----- Errors and Defaults -----
if(missing(total_trainingSet)){
# if total_trainingSet not stated, set to FALSE
total_trainingSet <- FALSE
}
# ----- Training Set Filtering -----
ratio_name <- paste0(reference_name, "_ratio")
# save name of ratio column for desired reference
if(total_trainingSet == FALSE){
# if total peptidome input/not a custom training set
n_obs_prot_name <- paste0(reference_name, "_n_obs_prot")
# save name of n_obs_prot column for desired reference
top10 <- quantile(peptidome[[n_obs_prot_name]], prob = 0.9)
# calculate value for 0.9 quantile of n_obs_prot
training_peptides <- peptidome[peptidome[[n_obs_prot_name]] >= top10,]
# filter peptides reaching n_obs_prot value = 0.9 quantile (peptides from frequently observed proteins)
detected_peptides <- training_peptides[training_peptides[[ratio_name]] >= quantile(training_peptides[[ratio_name]], prob = 0.9),]
# filter peptides reaching 0.9 quantile of reference ratio values (peptides with good detectability)
detected_peptides <- dplyr::sample_n(detected_peptides, 5000)
# randomly sample 5k peptides from pool of peptides with good detectability
undetected_peptides <- training_peptides[training_peptides[[ratio_name]] == 0,]
# filter for unobserved peptides (ratio = 0)
undetected_peptides <- dplyr::sample_n(training_peptides, 5000)
# randomly sample 5k peptides from pool of unobserved peptides
training_peptides <- rbind(detected_peptides, undetected_peptides)
# create training 10k training set of peptides with good detectability and no observations for frequently detected proteins
}else{
training_peptides <- peptidome
# If total_trainingSet = TRUE, training peptides = entire input peptidome
}
# ----- Descriptor Preparation -----
pc_descriptors <- training_peptides[,c(ratio_name, "sequence", "missed_cleavages")]
# create df of peptide detectability ratios, sequences, and missed cleavages
colnames(pc_descriptors)[1] <- "ratio"
# generalize ratio column name
pc_descriptors <- calculate_pc_descriptors(pc_descriptors = pc_descriptors)
# calculate pysio chemical descriptors for training peptides
pc_descriptors$sequence <- NULL
# remove sequences from df
# ----- Model Training -----
start.time<-proc.time()
# record start time
RFmodel <- randomForest::randomForest(ratio~., data = pc_descriptors, ntree = 500, importance = TRUE )
# train RF model to predict ratios
stop.time<-proc.time()
# record stop time
run.time<-stop.time-start.time
# calculate run time
print(paste0("Model training time = ", round(run.time[3]/60,2), " minutes"))
return(RFmodel)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.