#' Build organism-specific data sets for training epitope prediction models
#'
#' This function extracts organism or taxon-specific datasets from IEDB data
#' and returns data sets for the development and assessment of epitope
#' prediction models.
#'
#' @section Data splitting and BLAST requirements:
#' If the sum of `split_perc` is less than 100 an extra split is generated
#' with the remaining observations - e.g., `split_perc = c(50, 30)` results in
#' three sets with an approximately 50/30/20% split *of the total observations.*
#' If the sum is greater than 100 the splits are linearly scaled down so that
#' the sum becomes 100. Note that the split percents correspond to the number of
#' observations, not the number of unique IDs.
#'
#' This function will attempt to approximate the desired split levels, but
#' depending on the size of the data set and the desired `split_level` it may
#' not be possible (e.g., if `split_level = "org"` and a single organism
#' corresponds to 90% of the data, one of the splits will necessarily correspond
#' to at least 90% of the data, regardless of the values informed in
#' `split_perc`).
#'
#' If `split_level == "prot` the routine will keep any pairs of proteins
#' having (coverage >= `coverage_threshold` AND
#' identity >= `identity_threshold`) under the same split. This is useful to
#' prevent accidental data leakage due to quasi-identical proteins with
#' different UIDs.
#' **NOTE**: this will require BLAST+ to be installed in your
#' local machine. For details on how to set up BLAST+ on your machine, check
#' <https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download>.
#' This function was developed for versions `blast 2.10.0` or later.
#'
#' @inheritParams prepare_join_df
#' @inheritParams filter_epitopes
#' @inheritParams split_epitope_data
#' @param taxonomy_list list containing taxonomy information
#' (generated by [get_taxonomy()])
#' @param orgIDs vector of organism/taxon IDs to retain (see
#' [filter_epitopes()]). If `NULL` then no organism ID filtering is
#' performed.
#' @param hostIDs vector of host IDs to retain (see [filter_epitopes()]).
#' If `NULL` then no host filtering is performed.
#' @param removeIDs vector of organism IDs to remove (see [filter_epitopes()]).
#' Useful for, e.g., using a Class-level `orgIDs` and removing some species
#' or genera. If `NULL` then no removal is performed.
#' @param window_size positive integer, size of window to use.
#' @param max.N maximum length of N-peptide frequency features to be calculated.
#' @param ncpus number of cores to use for data windowing and feature
#' calculation.
#'
#' @return List containing the resulting datasets.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk})
#'
#' @export
#'
make_OrgSpec_datasets <- function(epitopes, proteins, taxonomy_list,
orgIDs = NULL,
hostIDs = NULL,
removeIDs = NULL,
save_folder = "./",
min_epit = 8,
max_epit = 25,
only_exact = FALSE,
pos.mismatch.rm = "all",
set.positive = "mode",
window_size = 2 * min_epit - 1,
max.N = 2,
split_level = "prot",
split_perc = c(75, 25),
split_names = c("01_training", "02_holdout"),
coverage_threshold = 80,
identity_threshold = 80,
ncpus = 1){
# ========================================================================== #
# Sanity checks and initial definitions
id_classes <- c("NULL", "numeric", "integer", "character")
assertthat::assert_that(class(orgIDs) %in% id_classes,
class(hostIDs) %in% id_classes,
class(removeIDs) %in% id_classes,
assertthat::is.count(min_epit),
assertthat::is.count(max_epit),
min_epit <= max_epit,
pos.mismatch.rm %in% c("all", "align"),
set.positive %in% c("any", "mode", "all"),
is.logical(only_exact) & length(only_exact) == 1,
assertthat::is.count(window_size),
assertthat::is.count(max.N),
assertthat::is.count(ncpus),
is.character(save_folder),
length(save_folder) == 1,
is.character(split_level), length(split_level) == 1,
split_level %in% c("org", "prot", "epit"),
is.numeric(split_perc),
all(sapply(split_perc, assertthat::is.count)),
is.null(split_names) | is.character(split_names))
if(!dir.exists(save_folder)) dir.create(save_folder, recursive = TRUE)
# ========================================================================== #
# Join proteins onto epitope dataset
jdf <- prepare_join_df(epitopes = epitopes, proteins = proteins,
min_epit = min_epit, max_epit = max_epit,
only_exact = only_exact,
pos.mismatch.rm = pos.mismatch.rm,
set.positive = set.positive)
# Filter epitopes by organism / host IDs
jdf <- filter_epitopes(df = jdf,
orgIDs = orgIDs,
hostIDs = hostIDs,
removeIDs = removeIDs,
tax_list = taxonomy_list)
# Prepare windowed representation
wdf <- make_window_df(df = jdf,
window_size = window_size,
ncpus = ncpus)
# Calculate features
wdf <- calc_features(wdf, max.N = max.N, ncpus = ncpus)
# Select relevant proteins
prots <- proteins[which(proteins$UID %in% unique(jdf$protein_id)), ]
if (split_level == "prot") {
# Check that BLASTp is installed
a <- tryCatch(system2("blastp", args = "-version",
stdout = TRUE, stderr = TRUE),
error = function(c){"Nope!"},
warning = function(c){"Nope!"})
if (a == "Nope!"){
message("\nBLASTp not installed.
\rPlease check function documentation for instructions.
\rSplitting will be done using only protein IDs.")
} else {
blast_v <- strsplit(a[1], split = ": ", fixed = TRUE)[[1]][2]
BLAST_path <- paste0(save_folder, "/BLASTp")
if(!dir.exists(BLAST_path)) dir.create(BLAST_path)
# Run BLASTp to determine protein similarities
fn <- paste0(BLAST_path, "/prots.fasta")
seqinr::write.fasta(as.list(prots$TSeq_sequence), names = prots$UID,
file.out = fn, as.string = TRUE, nbchar = 100000)
#if(gsub("\\+", "", blast_v) == "2.10.0"){
blast_call <- paste0("blastp -query ", fn, " -db ", fn, " -seg no ",
"-outfmt '6 qseqid sseqid pident qcovhsp' > ",
fn, "-BLAST")
# } else {
# blast_call <- paste0("blastp -query ", fn, " -db ", fn, " -seg no ",
# "-outfmt '6 qseqid sseqid length qlen slen nident pident qcovhsp mismatch gaps qstart qend sstart send evalue score' > ",
# fn, "-BLAST")
#
#
# }
system(paste0("makeblastdb -in ", fn, " -dbtype prot"))
system(blast_call)
}
}
# Split data into sets
splits <- split_epitope_data(wdf,
split_level = split_level,
split_perc = split_perc,
split_names = split_names,
blast_file = unlist(ifelse(a[1] == "Nope!",
list(NULL),
paste0(fn, "-BLAST"))),
coverage_threshold = coverage_threshold,
identity_threshold = identity_threshold)
# Save results
for (i in seq_along(splits)){
saveRDS(splits[[i]]$wdf,
file = paste0(save_folder, "/", names(splits)[i], ".rds"))
}
return(splits)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.