#' Split epitope data based on epitope, protein or organism IDs.
#'
#' Takes a data.table of class *windowed_epit_dt* (returned by
#' [make_window_df()]) and splits it into mutually exclusive sets of
#' observations, based on the columns *Info_sourceOrg_id*, *Info_protein_id* or
#' *Info_epitope_id*.
#'
#' If the sum of **split_perc** is less than 100, an extra split is generated
#' with the remaining observations - e.g., `split_perc = c(50, 30)` results in
#' three sets with an approximately 50/30/20% split *of the total observations*.
#' If the sum is greater than 100, the splits are linearly scaled down so that
#' the sum becomes 100. Note that the split percentages refer to the number of
#' observations, not the number of unique IDs.
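#'
#' As an illustration (a minimal sketch; `my_wdf` stands for a hypothetical
#' object of class *windowed_epit_dt* produced by [make_window_df()]):
#'
#' ```r
#' # Two splits of roughly 70% / 30% of the observations (the default):
#' sp2 <- split_epitope_data(my_wdf, split_level = "prot",
#'                           split_perc = c(70, 30))
#'
#' # Three splits of roughly 50% / 30% / 20%: the third split is created
#' # automatically from the remaining observations.
#' sp3 <- split_epitope_data(my_wdf, split_level = "prot",
#'                           split_perc = c(50, 30))
#' ```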
#'
#' This function will attempt to approximate the desired split levels, but
#' depending on the composition of **wdf** and the desired **split_level** this
#' may not be possible (e.g., if `split_level = "org"` and a single organism
#' corresponds to 90% of the data, one of the splits will necessarily contain
#' at least 90% of the data, regardless of the values informed in
#' `split_perc`).
#'
#' If a BLASTp file is provided, the routine keeps any pair of proteins with
#' (coverage >= **coverage_threshold** AND identity >= **identity_threshold**)
#' in the same split. This is useful to prevent accidental data leakage caused
#' by quasi-identical proteins registered under different UIDs.
#' **NOTE**: this only works if `split_level == "prot"`.
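#'
#' A possible way of producing such a file (a sketch only; it assumes the NCBI
#' BLAST+ binaries are on the system path, that `proteins.fasta` contains every
#' protein sequence referenced in **wdf**, and that `my_wdf` is, as above, a
#' hypothetical *windowed_epit_dt* object) is an all-vs-all BLASTp run with a
#' four-column tabular output, followed by a call with `split_level = "prot"`:
#'
#' ```r
#' # Build a protein database and align all proteins against it
#' system("makeblastdb -in proteins.fasta -dbtype prot -out prot_db")
#' system(paste("blastp -query proteins.fasta -db prot_db",
#'              "-outfmt '6 qseqid sseqid pident qcovs'",
#'              "-out blast_results.tsv"))
#'
#' # Split by protein, keeping quasi-identical proteins in the same split
#' sp <- split_epitope_data(my_wdf, split_level = "prot",
#'                          blast_file = "blast_results.tsv",
#'                          coverage_threshold = 80,
#'                          identity_threshold = 80)
#' ```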
#'
#' @param wdf data table of class *windowed_epit_dt* (returned by
#' [make_window_df()])
#' @param split_level which level should be used for splitting? Use "org" for
#' splitting by source organism ID, "prot" by protein ID or "epit" by
#' epitope ID. When "prot" is used the routine attempts to identify
#' different protein versions and treat them as a single unit for
#' splitting purposes.
#' @param split_perc numeric vector of desired splitting percentages. See
#' Details.
#' @param split_names optional character vector with short names for each split.
#' @param blast_file path to a tab-separated file containing all-vs-all BLASTp
#' alignment results for all proteins in **wdf**, with four columns: query ID,
#' subject ID, percent identity and query coverage (in that order). See
#' Details.
#' @param coverage_threshold coverage threshold for grouping proteins by
#' similarity, see Details.
#' @param identity_threshold identity threshold for grouping proteins by
#' similarity, see Details.
#' @param save_folder path to folder for saving the results.
#'
#' @return A list with one element per split. Each element contains the split
#' level used (`id_type`), the IDs (`IDs`) and ID groups (`IDgroups`) assigned
#' to that split, the approximate percentage of observations it received
#' (`Perc`), and the corresponding subset of **wdf** (`wdf`).
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk})
#'
#' @export
#'
split_epitope_data <- function(wdf,
                               split_level = "prot",
                               split_perc = c(70, 30),
                               split_names = NULL,
                               save_folder = NULL,
                               blast_file = NULL,
                               coverage_threshold = 80,
                               identity_threshold = 80){
  # ========================================================================== #
  # Sanity checks and initial definitions
  assertthat::assert_that("windowed_epit_dt" %in% class(wdf),
                          is.character(split_level), length(split_level) == 1,
                          split_level %in% c("org", "prot", "epit"),
                          is.numeric(split_perc),
                          all(sapply(split_perc, assertthat::is.count)),
                          is.null(blast_file) | is.character(blast_file),
                          is.null(split_names) | is.character(split_names))
  # Check and adjust split sizes if necessary
  ssp <- sum(split_perc)
  if (ssp < 100){
    split_perc <- c(split_perc, 100 - ssp)
  } else if (ssp > 100){
    split_perc <- 100 * split_perc / ssp
  }
  if (ssp >= 100 && length(split_perc) == 1){
    cat("\nNo splitting performed (split_perc = 100).")
    return(invisible(wdf))
  }
  nsplits <- length(split_perc)

  # Set up split names
  if (is.null(split_names)){
    split_names <- 1:nsplits
  } else if (length(split_names) < nsplits){
    cat("\nLength of split names smaller than number of splits.",
        "\nUsing split numbers instead.")
    split_names <- 1:nsplits
  }
  # Determine splits by splitting column
  # (Note that protein IDs ignore the trailing version number)
  id_var <- switch(split_level,
                   org  = wdf$Info_sourceOrg_id,
                   prot = gsub("\\.[0-9]+$", "", wdf$Info_protein_id),
                   epit = wdf$Info_epitope_id)
  # Check similarity based on the blast file (if provided)
  if (split_level == "prot" && !is.null(blast_file)){
    blast <- utils::read.csv(blast_file, sep = "\t",
                             header = FALSE,
                             stringsAsFactors = FALSE)
    names(blast) <- c("QueryID", "SubjectID", "Perc_identity", "Query_coverage")
    blast$QueryID   <- gsub("\\.[0-9]+$", "", blast$QueryID)
    blast$SubjectID <- gsub("\\.[0-9]+$", "", blast$SubjectID)
    blast <- data.table::as.data.table(blast)

    # Initialise internal data.table variable names to prevent CRAN notes.
    Query_coverage <- Perc_identity <- QueryID <- SubjectID <- NULL

    # Keep only pairs above both thresholds, drop duplicated (unordered) pairs,
    # and collapse each remaining pair into a single "QueryID SubjectID" string
    # for the grep-based grouping below.
    blast <- blast[(Query_coverage >= coverage_threshold) &
                     (Perc_identity >= identity_threshold), ]
    blast <- blast[!duplicated(t(apply(blast[, 1:2], 1, sort))),
                   list(QueryID, SubjectID)]
    blast <- apply(blast, 1, paste, collapse = " ")
  } else {
    blast <- character()
  }

  # Get the frequencies of occurrence of each ID (as % of all observations)
  divs <- as.data.frame(sort(table(id_var), decreasing = TRUE),
                        stringsAsFactors = FALSE)
  divs$Freq <- 100 * divs$Freq / length(id_var)

  # Determine which IDs go into which splits
  split_ids <- vector("list", nsplits)
  names(split_ids) <- split_names
  for(i in 1:nsplits) split_ids[[i]] <- list(id_type = split_level,
                                             IDs = character(),
                                             IDgroups = vector("list"),
                                             Perc = 0)

  sc <- 1 # split counter
  nc <- 0 # consecutive splits that could not accommodate the current group
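  # Greedy round-robin allocation: at each pass, take the most frequent
  # remaining ID, expand it into a group of BLAST-linked IDs (if a blast file
  # was given), and place the whole group into the current split if that split
  # still has room for it; otherwise move on and try the next split. Stop once
  # every split has refused the current group.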
  while (nrow(divs) > 0){
    IDgroup <- divs$id_var[1]
    go <- TRUE
    cc <- 1
    # Transitively expand IDgroup: for each ID already in the group, pull in
    # all BLAST partners still present in 'blast', until no member adds new IDs.
    while(go){
      idx <- grep(IDgroup[cc], blast)
      if(length(idx) > 0){
        x <- blast[idx] # get entries that contain IDgroup[cc]
        blast <- blast[-idx]
        x <- gsub(paste0("\\s*", IDgroup[cc], "\\s*"), "", x)
        IDgroup <- unique(c(IDgroup, x))
      }
      cc <- cc + 1
      if (cc > length(IDgroup)) go <- FALSE
    }
    Ptot <- sum(divs$Freq[divs$id_var %in% IDgroup])
    # If the current split (sc) can accommodate all the data in IDgroup
    if(Ptot <= split_perc[sc] - split_ids[[sc]]$Perc){
      split_ids[[sc]]$IDs <- c(split_ids[[sc]]$IDs, IDgroup)
      split_ids[[sc]]$Perc <- split_ids[[sc]]$Perc + Ptot
      split_ids[[sc]]$IDgroups <- c(split_ids[[sc]]$IDgroups, list(IDgroup))
      divs <- divs[-which(divs$id_var %in% IDgroup), ]
      nc <- 0
    } else {
      nc <- nc + 1
    }
    if (nc > nsplits) break
    sc <- 1 + sc %% nsplits
  }

  # Allocate any remaining IDs to the split with the most room left
  if (nrow(divs) > 0){
    sidx <- which.max(split_perc - sapply(split_ids, function(x) x$Perc))
    split_ids[[sidx]]$IDs <- c(split_ids[[sidx]]$IDs, divs$id_var)
    split_ids[[sidx]]$Perc <- split_ids[[sidx]]$Perc + sum(divs$Freq)
  }

  # Attach the corresponding observations to each split
  for (i in seq_along(split_ids)){
    split_ids[[i]]$wdf <- wdf[which(id_var %in% split_ids[[i]]$IDs), ]
  }

  return(split_ids)
}