```r
# Vignette setup
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
suppressPackageStartupMessages({
  library(epitopes)
  library(seqinr)
})
```
This document illustrates how the epitopes package can be used to create an organism-specific epitope predictor. We will use data related to the nematode O. volvulus as our target pathogen.
The epitopes package provides functions to download, extract, consolidate and pre-process epitope data from the Immune Epitope Database (IEDB), together with protein data (retrieved from NCBI and UniProt) and taxonomy information for each pathogen (also from NCBI). This initial part is quite time-consuming, but it only needs to be done once, at the start of a project (or possibly once every few months, to get updated versions of the database). Once these data are retrieved and saved, they can be loaded with simple readRDS() calls (a brief sketch is given after the download code below).
NOTE: Function get_LBCE() is returning odd errors on Windows machines. This is fixed in the current development version; in the meantime, the data frame obtained by running get_IEDB() followed by get_LBCE() on 28 Jan 2022 is available for download at https://github.com/fcampelo/epitopes/tree/master/data.
```r
library(epitopes)

# Download the full IEDB export into a given folder "XYZ"
epitopes::get_IEDB(save_folder = "XYZ")

# Extract only the linear B-cell epitopes from the IEDB export and save the
# result to folder "ABC"
epitopes <- epitopes::get_LBCE(data_folder = "XYZ",
                               ncpus       = parallel::detectCores() - 2,
                               save_folder = "ABC")

# Retrieve the relevant proteins and save them into the same folder "ABC"
proteins <- epitopes::get_proteins(uids        = unique(epitopes$protein_id),
                                   save_folder = "ABC")

# Retrieve a taxonomy list for all pathogens and save it into the same folder "ABC"
taxonomy <- epitopes::get_taxonomy(uids        = unique(epitopes$sourceOrg_id),
                                   save_folder = "ABC")
```
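In later sessions, the saved objects can then be reloaded without repeating the downloads. A minimal sketch, assuming the .rds file names below (these names are assumptions; check the actual contents of your save folder):

```r
# Reload previously saved data in a later session (file names are
# assumptions -- check the contents of your save folder "ABC")
epitopes <- readRDS("ABC/epitopes.rds")
proteins <- readRDS("ABC/proteins.rds")
taxonomy <- readRDS("ABC/taxonomy.rds")
```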
The first step is to enrich the epitope data with protein information. This step is also useful for performing some initial filtering of the epitopes. In this example, we assume that only peptides between 8 and 25 amino acids long should be kept (min_epit = 8, max_epit = 25), that peptides with positional mismatches against the retrieved protein sequence should be removed (pos.mismatch.rm = "all"), and that peptides with conflicting labels should be assigned their most frequent (modal) class (set.positive = "mode"):
```r
# Join proteins into epitope dataset
jdf <- epitopes::prepare_join_df(epitopes, proteins,
                                 min_epit        = 8,
                                 max_epit        = 25,
                                 only_exact      = FALSE,
                                 pos.mismatch.rm = "all",
                                 set.positive    = "mode")
```
The next step is to retain only the observations corresponding to the pathogen of interest, based on its taxonomy ID (the function retains all lower-order taxIDs, so if you pass e.g. a genus taxID it will retain all organisms from that genus). Function filter_epitopes() can also be used to filter observations by host ID, or to remove specific IDs from the list (e.g., a particular strain; see the sketch after the code block below).
```r
OrgID   <- 6282 # Pathogen: O. volvulus
hostIDs <- 9606 # Host: humans

# Filter data
jdf <- epitopes::filter_epitopes(jdf,
                                 orgIDs   = OrgID,
                                 hostIDs  = hostIDs,
                                 tax_list = taxonomy)
```
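To exclude a specific ID as mentioned above, a sketch is given below. Note that it is hypothetical: the name of the exclusion argument is an assumption, so check ?filter_epitopes for the actual interface.

```r
# Hypothetical sketch: exclude one specific strain while keeping the rest of
# the organisms under OrgID. The argument name "orgID.rm" is an assumption --
# check ?filter_epitopes for the actual interface.
jdf <- epitopes::filter_epitopes(jdf,
                                 orgIDs   = OrgID,
                                 hostIDs  = hostIDs,
                                 orgID.rm = 1234567, # hypothetical strain taxID
                                 tax_list = taxonomy)
```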
The next step in preparing the data is to project it into a windowed representation, in which each amino acid position of each labelled peptide from the epitope data is represented by a fixed-length window of its local neighbourhood. By default, the window size is set as 2 * min_epit - 1, which guarantees that the majority of positions in any window are always part of a labelled peptide.
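To make the windowed representation concrete, the sketch below extracts the window centred on a single position of a toy sequence. This is an illustration of the idea only; in the pipeline, make_window_df() builds the complete windowed dataset (see below):

```r
# Illustration: the fixed-length window centred on one sequence position
min_epit    <- 8
window_size <- 2 * min_epit - 1      # 15 positions
half        <- (window_size - 1) / 2 # 7 positions on each side
myseq <- "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ" # toy sequence
pos   <- 15                                  # centre position
substr(myseq, pos - half, pos + half)        # local neighbourhood of position 15
```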
Feature calculation is still a relatively time-consuming part of the pipeline (increasing its speed is high on our to-do list). The current version of the epitopes package calculates a set of sequence-based features for each windowed position (see the documentation of calc_features() for the full list). Future versions of epitopes are planned to include structure-based features, protein-level features and others. The package is structured in a way that makes it easy to incorporate the calculation of new features.
```r
# Prepare windowed representation
wdf <- epitopes::make_window_df(jdf, ncpus = parallel::detectCores() - 2)

# Calculate features
wdf <- epitopes::calc_features(wdf,
                               max.N = 2,
                               ncpus = parallel::detectCores() - 2)
```
Before training our organism-specific predictor, we isolate a subset of the data so that we can obtain a good estimate of the generalisation performance of the model. Although cross-validation could in principle be used, it is harder to prevent data leakage between folds, so we opted for a hold-out set approach. If BLASTp is installed locally on your machine (see the NCBI BLAST+ documentation for installation details), function split_epitope_data() can split the data at the protein level, using coverage and similarity thresholds to ensure that similar proteins are always placed in the same split, reducing the chances of leakage between the training and hold-out sets.
```r
library(seqinr)

# Write the relevant protein sequences to a FASTA file
prots <- proteins[proteins$UID %in% unique(jdf$protein_id), ]
if (!dir.exists("./BLASTp")) dir.create("./BLASTp")
fn <- "./BLASTp/prots.fasta"
seqinr::write.fasta(as.list(prots$TSeq_sequence),
                    names     = prots$UID,
                    file.out  = fn,
                    as.string = TRUE,
                    nbchar    = 100000)

# Run BLASTp to determine protein similarities
system(paste0("makeblastdb -in ", fn, " -dbtype prot"))
system(paste0("blastp -query ", fn, " -db ", fn, " -seg no ",
              "-outfmt '6 qseqid sseqid length qlen ",
              "slen nident pident qcovhsp mismatch gaps qstart ",
              "qend sstart send evalue score' > ", fn, "-BLAST"))

# Split data into training/test/final validation sets, splitting by protein ID
# and using BLASTp to minimise similarities across splits
splits <- epitopes::split_epitope_data(wdf,
                                       split_level        = "prot",
                                       split_perc         = c(75, 25),
                                       split_names        = c("01_training", "02_holdout"),
                                       save_folder        = "./splits",
                                       blast_file         = "./BLASTp/prots.fasta-BLAST",
                                       coverage_threshold = 60,
                                       identity_threshold = 60)
```
Any classification strategy can now be used to generate a predictor. The epitopes package provides a quick interface to the Random Forest implementation from ranger, which is quite versatile. Check the documentation of fit_model() to see how to pass specific hyperparameters to ranger's RF routines (a hedged sketch is given after the code block below).
```r
# Fit model using ranger's standard parameters
my.model <- epitopes::fit_model(data.train = splits[[1]]$wdf,
                                data.test  = splits[[2]]$wdf,
                                rnd.seed   = 1234,
                                ncpus      = parallel::detectCores() - 2)

# Calculate performance
my_perf <- epitopes::calc_performance(truth = splits[[2]]$wdf$Class,
                                      pred  = my.model$rf_class,
                                      prob  = my.model$rf_probs,
                                      ret.as.list = TRUE)
```
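If you want to tune the forest, the sketch below assumes that fit_model() forwards extra arguments (here, ranger's standard num.trees and mtry hyperparameters) to ranger::ranger(); this forwarding is an assumption, so confirm it in the fit_model() documentation:

```r
# Sketch, assuming fit_model() forwards extra arguments to ranger::ranger()
# (an assumption -- check ?fit_model). num.trees and mtry are standard
# ranger hyperparameters.
my.model <- epitopes::fit_model(data.train = splits[[1]]$wdf,
                                data.test  = splits[[2]]$wdf,
                                rnd.seed   = 1234,
                                num.trees  = 1000,
                                mtry       = 50,
                                ncpus      = parallel::detectCores() - 2)
```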
To check the model performance:
```r
# Plot ROC curve
plot(my_perf$fpr, my_perf$tpr,
     type = "p", pch = 20, las = 1,
     xlab = "FPR", ylab = "TPR",
     main = "ROC curve for O. volvulus predictor",
     sub  = paste("AUC = ", signif(my_perf$auc, digits = 3)))

print(unlist(my_perf[c(5:12)]))
```
```r
# These values come from the actual simulation
perf_vec <- c(0.7624003, 0.6491852, 0.7012283, 0.7167080,
              0.7305359, 0.4147487, 0.7079694, 0.7783327)
names(perf_vec) <- c("sens", "spec", "ppv", "npv",
                     "f1", "mcc", "accuracy", "auc")
print(perf_vec)
```
To apply the predictor (which is saved under my.model$rf_model) to a new protein, you just need to generate the windowed representation of the protein, calculate its features, and pass the result to ranger's predict function:

```r
# Get the first protein from the hold-out set as an example:
myprot <- proteins[proteins$UID == splits[[2]]$wdf$Info_protein_id[1], ]

# Make windowed representation:
myprot_w <- epitopes::make_window_df(myprot,
                                     window_size = nchar(splits[[2]]$wdf$Info_window_seq[1]))
myprot_w <- epitopes::calc_features(myprot_w,
                                    max.N = 2,
                                    ncpus = parallel::detectCores() - 2)

# Get predictions from the model
myprot_w <- as.data.frame(myprot_w)[, which(!grepl("Info_", names(myprot_w)))]
preds <- stats::predict(my.model$rf_model, data = myprot_w)
```
```r
# Smooth predictions (to remove very short positive sequences)
myclass <- epitopes::smooth_predictions(as.numeric(preds$predictions[, 2] > 0.5),
                                        minsize = 8)

# Plot predicted probabilities and smoothed class predictions
plot(preds$predictions[, 2],
     type = "l", lwd = 0.5, las = 1,
     main = paste("Predictions for protein", myprot$UID[1]),
     xlab = "Protein position",
     ylab = "Probability / Prediction",
     ylim = c(0, 1.05))
points(myclass, type = "p", pch = 20, col = myclass + 1)
points(myclass, type = "l", lwd = 0.3, col = "#77777777")
```
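Once the predictions are smoothed, it may be useful to extract the coordinates (and sequences) of the predicted epitopes. A minimal sketch using base R run-length encoding, assuming myclass has one entry per protein position:

```r
# Convert the smoothed 0/1 class vector into start/end coordinates of the
# predicted epitope segments
r      <- rle(as.integer(myclass))
ends   <- cumsum(r$lengths)
starts <- ends - r$lengths + 1
pred_segs <- data.frame(start = starts[r$values == 1],
                        end   = ends[r$values == 1])

# Attach the predicted peptide sequences (TSeq_sequence holds the full
# protein sequence, as used when writing the FASTA file earlier)
pred_segs$peptide <- substring(myprot$TSeq_sequence[1],
                               pred_segs$start, pred_segs$end)
pred_segs
```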