#' Immunogenicity score.
#'
#' \code{Immunogenicity_TrainModels} performes preprocessing, trains extremely randomized tree models, and internally predicts immunogenicity scores on the entire dataset provided.\cr
#' \code{Immunogenicity_SummarizeInternalScores} summarizes internally calculated immunogenicity scores.\cr
#' \code{Immunogenicity_Score} is a wrapper function of \code{Immunogenicity_TrainModels} and \code{Immunogenicity_SummarizeInternalScores}.\cr
#' \code{Immunogenicity_Score_Cluster} is similar to \code{Immunogenicity_Score}, but takes \code{clusterDF} to filter peptides based on their similarities.\cr
#' \code{Immunogenicity_Predict} predicts immunogenicity scores on external datasets. Models trained by \code{Immunogenicity_TrainModels} are necessary. Note that, due to Java constraints, models have to be constructed in each new R session.
#'
#' @param featureDF A dataframe of features generated by \code{Features}.
#' @param metadataDF A dataframe containing metadata, consisting of "Peptide" and "Immunogenicity" columns. Other columns if provided would be used as features.
#' @param clusterDF A dataframe containing cluster metadata, consisting of "Peptide" and "Cluster" columns.
#' @param featureSet A set of features used for model training. Set "all" to shortcut.
#' @param seedSet A set of random seeds.
#' @param coreN The number of cores to be used for parallelization. Set \code{NULL} to disable.
#' @param trainModelResults The model training result returned by \code{Immunogenicity_TrainModels}.
#' @param externalFeatureDFList The feature dataframes for which immunogenicity is to be predicted.
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_TrainModels <- function(
featureDF,
metadataDF,
featureSet="all",
seedSet=1:5,
coreN=parallel::detectCores(logical=F)
){
# Input check
if(is.list(featureSet)){
if("MinimumFeatureSet" %in% names(featureSet)){
featureSet <- featureSet$"MinimumFeatureSet"
}else{
stop("featureSet should be either \"all\" or a character vectory specifying the column names of the input featureDF.")
}
}
# A combined feature dataframe
dt <- merge(data.table::as.data.table(metadataDF), data.table::as.data.table(featureDF), by="Peptide")
if(identical(featureSet, "all")){
featureSet <- setdiff(colnames(dt), c("Peptide", "Immunogenicity"))
}else{
dt <- dt[, c("Peptide", "Immunogenicity", featureSet), with=F]
}
dt[,"Immunogenicity":=factor(Immunogenicity, levels=c("Positive", "Negative"))]
# Random splitting
trainIDList <- lapply(seedSet, function(s){set.seed(s);lapply(BBmisc::chunk(1:nrow(dt), n.chunks=5, shuffle=T), function(v){setdiff(1:nrow(dt), v)})})
# The main workflow
main <- function(dt, trainIDs){
dt_train <- dt[trainIDs,]
dt_test <- dt[-trainIDs,]
pp_train <- caret::preProcess(dt_train[, featureSet, with=F], method=c("center", "scale"))
dt_train <- predict(pp_train, dt_train)
dt_test <- predict(pp_train, dt_test)
trgt <- dt_train$"Immunogenicity"
tab <- as.numeric(table(trgt))
w <- 1/tab[trgt]
mat_train <- as.matrix(dt_train[, featureSet, with=F])
ert <- extraTrees::extraTrees(
x=mat_train, y=trgt,
weights=w,
mtry=5,
numThreads=ifelse(is.null(coreN), 1, coreN)
)
mat_test <- as.matrix(dt_test[, featureSet, with=F])
predDT <- data.table::data.table("Peptide"=dt_test$"Peptide")
predDT[,ImmunogenicityScore:=predict(ert, mat_test, probability=T)[,"Positive"]]
return(list(
"pp_train"=pp_train, "ert"=ert, "predDT"=predDT
))
}
message("Training extremely randomized tree models...")
resList <- foreach::foreach(i=1:length(seedSet))%do%{
cat("Random seed = ", seedSet[i], "\n", sep="")
set.seed(seedSet[i])
pbapply::pblapply(trainIDList[[i]], function(trainIDs){main(dt, trainIDs)})
}
return(list("TrainModelResults"=resList, "FeatureSet"=featureSet, "SeedSet"=seedSet))
}
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_SummarizeInternalScores <- function(trainModelResults){
message("Extracting internally computed scores...")
dt_score <- lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"predDT"})}) %>%
purrr::flatten() %>%
data.table::rbindlist()
dt_score <- dt_score[, list(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=Peptide]
dt_score <- dt_score[, ImmunogenicityScore:=ImmunogenicityScore.ave]
dt_score <- dt_score[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
dt_score <- dt_score[, c("Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
return(dt_score)
}
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Score <- function(
featureDF,
metadataDF,
featureSet="all",
seedSet=1:5,
coreN=parallel::detectCores(logical=F)
){
trainModelResults <- Immunogenicity_TrainModels(featureDF, metadataDF, featureSet, seedSet, coreN)
dt_score <- Immunogenicity_SummarizeInternalScores(trainModelResults)
return(dt_score)
}
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Score_Cluster <- function(
featureDF,
metadataDF,
clusterDF,
featureSet="all",
seedSet=1:5,
coreN=parallel::detectCores(logical=F)
){
dt_score <- lapply(seedSet, function(s){
set.seed(s)
peptideSet <- clusterDF[sample(1:nrow(clusterDF)),] %>% dplyr::distinct(Cluster, .keep_all=T)
peptideSet <- peptideSet[["Peptide"]]
dt_score <- Immunogenicity_Score(
featureDF=dplyr::filter(featureDF, Peptide %in% peptideSet),
metadataDF=dplyr::filter(metadataDF, Peptide %in% peptideSet),
featureSet=featureSet,
seedSet=s,
coreN=coreN
)
return(dt_score)
}) %>%
data.table::rbindlist()
dt_score <- dt_score[, list(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=Peptide]
dt_score <- dt_score[, ImmunogenicityScore:=ImmunogenicityScore.ave]
dt_score <- dt_score[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
dt_score <- dt_score[, c("Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
return(dt_score)
}
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Predict <- function(
externalFeatureDFList,
trainModelResults
){
message("Extrapolating trained models to external peptide sequences...")
featureSet <- trainModelResults$"FeatureSet"
pp_list <- purrr::flatten(
lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"pp_train"})})
)
ert_list <- purrr::flatten(
lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"ert"})})
)
datasetIDs <- 1:length(externalFeatureDFList)
modelIDs <- 1:length(ert_list)
combDT <- data.table::CJ("Data"=datasetIDs, "Model"=modelIDs)
scoreDT_list <- pbapply::pblapply(1:nrow(combDT), function(i){
datasetID <- combDT$"Data"[i]
modelID <- combDT$"Model"[i]
dt <- data.table::as.data.table(externalFeatureDFList[[datasetID]])
pept <- dt$"Peptide"
dt <- dt[, featureSet, with=F]
mat <- as.matrix(predict(pp_list[[modelID]], dt))
pred <- predict(ert_list[[modelID]], mat, probability=T)[,"Positive"]
data.table::data.table(
"Data"=datasetID,
"Model"=modelID,
"Peptide"=pept,
"ImmunogenicityScore"=pred
)
})
scoreDT <- data.table::rbindlist(scoreDT_list)
scoreDT <- scoreDT[, .(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=.(Data, Peptide)]
scoreDT[, ImmunogenicityScore:=ImmunogenicityScore.ave]
scoreDT[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
scoreDT <- scoreDT[, c("Data", "Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
data.table::setorder(scoreDT, Data, Peptide)
scoreDT_list <- split(scoreDT, by="Data", keep.by=F)
w <- floor(log10(length(externalFeatureDFList))) + 1
names(scoreDT_list) <- paste0("ScoreDT_", formatC(1:length(externalFeatureDFList), w=w, flag="0"))
## warning if overlapping sets of peptides were used for both model training and score prediction
peptideSet_train <- unlist(
lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"predDT"$"Peptide"})})
)
peptideSet_external <- scoreDT$"Peptide"
if(length(intersect(peptideSet_train, peptideSet_external))>=1){
warning("We recommend predicting scores by extrapolation only for peptides that were not used for the model training phase. For the peptides used for model training, the Immunogenicity_Score function may be used instead.")
}
return(scoreDT_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.