R/Immunogenicity.R

Defines functions Immunogenicity_Predict Immunogenicity_Score_Cluster Immunogenicity_Score Immunogenicity_SummarizeInternalScores Immunogenicity_TrainModels

Documented in Immunogenicity_Predict Immunogenicity_Score Immunogenicity_Score_Cluster Immunogenicity_SummarizeInternalScores Immunogenicity_TrainModels

#' Immunogenicity score.
#'
#' \code{Immunogenicity_TrainModels} performes preprocessing, trains extremely randomized tree models, and internally predicts immunogenicity scores on the entire dataset provided.\cr
#' \code{Immunogenicity_SummarizeInternalScores} summarizes internally calculated immunogenicity scores.\cr
#' \code{Immunogenicity_Score} is a wrapper function of \code{Immunogenicity_TrainModels} and \code{Immunogenicity_SummarizeInternalScores}.\cr
#' \code{Immunogenicity_Score_Cluster} is similar to \code{Immunogenicity_Score}, but takes \code{clusterDF} to filter peptides based on their similarities.\cr
#' \code{Immunogenicity_Predict} predicts immunogenicity scores on external datasets. Models trained by \code{Immunogenicity_TrainModels} are necessary. Note that, due to Java constraints, models have to be constructed in each new R session.
#'
#' @param featureDF A dataframe of features generated by \code{Features}.
#' @param metadataDF A dataframe containing metadata, consisting of "Peptide" and "Immunogenicity" columns. Other columns if provided would be used as features.
#' @param clusterDF A dataframe containing cluster metadata, consisting of "Peptide" and "Cluster" columns.
#' @param featureSet A set of features used for model training. Set "all" to shortcut.
#' @param seedSet A set of random seeds.
#' @param coreN The number of cores to be used for parallelization. Set \code{NULL} to disable.
#' @param trainModelResults The model training result returned by \code{Immunogenicity_TrainModels}.
#' @param externalFeatureDFList The feature dataframes for which immunogenicity is to be predicted.
#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_TrainModels <- function(
  featureDF,
  metadataDF,
  featureSet="all",
  seedSet=1:5,
  coreN=parallel::detectCores(logical=F)
){
  # Input check
  if(is.list(featureSet)){
    if("MinimumFeatureSet" %in% names(featureSet)){
      featureSet <- featureSet$"MinimumFeatureSet"
    }else{
      stop("featureSet should be either \"all\" or a character vectory specifying the column names of the input featureDF.")
    }
  }

  # A combined feature dataframe
  dt <- merge(data.table::as.data.table(metadataDF), data.table::as.data.table(featureDF), by="Peptide")
  if(identical(featureSet, "all")){
    featureSet <- setdiff(colnames(dt), c("Peptide", "Immunogenicity"))
  }else{
    dt <- dt[, c("Peptide", "Immunogenicity", featureSet), with=F]
  }
  dt[,"Immunogenicity":=factor(Immunogenicity, levels=c("Positive", "Negative"))]

  # Random splitting
  trainIDList <- lapply(seedSet, function(s){set.seed(s);lapply(BBmisc::chunk(1:nrow(dt), n.chunks=5, shuffle=T), function(v){setdiff(1:nrow(dt), v)})})

  # The main workflow
  main <- function(dt, trainIDs){
    dt_train <- dt[trainIDs,]
    dt_test <- dt[-trainIDs,]
    pp_train <- caret::preProcess(dt_train[, featureSet, with=F], method=c("center", "scale"))
    dt_train <- predict(pp_train, dt_train)
    dt_test <- predict(pp_train, dt_test)
    trgt <- dt_train$"Immunogenicity"
    tab <- as.numeric(table(trgt))
    w <- 1/tab[trgt]
    mat_train <- as.matrix(dt_train[, featureSet, with=F])
    ert <- extraTrees::extraTrees(
      x=mat_train, y=trgt,
      weights=w,
      mtry=5,
      numThreads=ifelse(is.null(coreN), 1, coreN)
    )
    mat_test <- as.matrix(dt_test[, featureSet, with=F])
    predDT <- data.table::data.table("Peptide"=dt_test$"Peptide")
    predDT[,ImmunogenicityScore:=predict(ert, mat_test, probability=T)[,"Positive"]]
    return(list(
      "pp_train"=pp_train, "ert"=ert, "predDT"=predDT
    ))
  }
  message("Training extremely randomized tree models...")
  resList <- foreach::foreach(i=1:length(seedSet))%do%{
    cat("Random seed = ", seedSet[i], "\n", sep="")
    set.seed(seedSet[i])
    pbapply::pblapply(trainIDList[[i]], function(trainIDs){main(dt, trainIDs)})
  }
  return(list("TrainModelResults"=resList, "FeatureSet"=featureSet, "SeedSet"=seedSet))
}

#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_SummarizeInternalScores <- function(trainModelResults){
  message("Extracting internally computed scores...")
  dt_score <- lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"predDT"})}) %>%
    purrr::flatten() %>%
    data.table::rbindlist()
  dt_score <- dt_score[, list(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=Peptide]
  dt_score <- dt_score[, ImmunogenicityScore:=ImmunogenicityScore.ave]
  dt_score <- dt_score[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
  dt_score <- dt_score[, c("Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
  return(dt_score)
}

#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Score <- function(
  featureDF,
  metadataDF,
  featureSet="all",
  seedSet=1:5,
  coreN=parallel::detectCores(logical=F)
){
  trainModelResults <- Immunogenicity_TrainModels(featureDF, metadataDF, featureSet, seedSet, coreN)
  dt_score <- Immunogenicity_SummarizeInternalScores(trainModelResults)
  return(dt_score)
}

#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Score_Cluster <- function(
  featureDF,
  metadataDF,
  clusterDF,
  featureSet="all",
  seedSet=1:5,
  coreN=parallel::detectCores(logical=F)
){
  dt_score <- lapply(seedSet, function(s){
    set.seed(s)
    peptideSet <- clusterDF[sample(1:nrow(clusterDF)),] %>% dplyr::distinct(Cluster, .keep_all=T)
    peptideSet <- peptideSet[["Peptide"]]
    dt_score <- Immunogenicity_Score(
      featureDF=dplyr::filter(featureDF, Peptide %in% peptideSet),
      metadataDF=dplyr::filter(metadataDF, Peptide %in% peptideSet),
      featureSet=featureSet,
      seedSet=s,
      coreN=coreN
    )
    return(dt_score)
  }) %>%
    data.table::rbindlist()
  dt_score <- dt_score[, list(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=Peptide]
  dt_score <- dt_score[, ImmunogenicityScore:=ImmunogenicityScore.ave]
  dt_score <- dt_score[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
  dt_score <- dt_score[, c("Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
  return(dt_score)
}

#' @export
#' @rdname Immunogenicity
#' @name Immunogenicity
Immunogenicity_Predict <- function(
  externalFeatureDFList,
  trainModelResults
){
  message("Extrapolating trained models to external peptide sequences...")
  featureSet <- trainModelResults$"FeatureSet"
  pp_list <- purrr::flatten(
    lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"pp_train"})})
  )
  ert_list <- purrr::flatten(
    lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"ert"})})
  )
  datasetIDs <- 1:length(externalFeatureDFList)
  modelIDs <- 1:length(ert_list)
  combDT <- data.table::CJ("Data"=datasetIDs, "Model"=modelIDs)
  scoreDT_list <- pbapply::pblapply(1:nrow(combDT), function(i){
    datasetID <- combDT$"Data"[i]
    modelID <- combDT$"Model"[i]
    dt <- data.table::as.data.table(externalFeatureDFList[[datasetID]])
    pept <- dt$"Peptide"
    dt <- dt[, featureSet, with=F]
    mat <- as.matrix(predict(pp_list[[modelID]], dt))
    pred <- predict(ert_list[[modelID]], mat, probability=T)[,"Positive"]
    data.table::data.table(
      "Data"=datasetID,
      "Model"=modelID,
      "Peptide"=pept,
      "ImmunogenicityScore"=pred
    )
  })
  scoreDT <- data.table::rbindlist(scoreDT_list)
  scoreDT <- scoreDT[, .(ImmunogenicityScore.ave=mean(ImmunogenicityScore), ImmunogenicityScore.sd=sd(ImmunogenicityScore)), by=.(Data, Peptide)]
  scoreDT[, ImmunogenicityScore:=ImmunogenicityScore.ave]
  scoreDT[, ImmunogenicityScore.cv:=ImmunogenicityScore.sd/ImmunogenicityScore.ave]
  scoreDT <- scoreDT[, c("Data", "Peptide", "ImmunogenicityScore", "ImmunogenicityScore.cv"), with=F]
  data.table::setorder(scoreDT, Data, Peptide)
  scoreDT_list <- split(scoreDT, by="Data", keep.by=F)
  w <- floor(log10(length(externalFeatureDFList))) + 1
  names(scoreDT_list) <- paste0("ScoreDT_", formatC(1:length(externalFeatureDFList), w=w, flag="0"))

  ## warning if overlapping sets of peptides were used for both model training and score prediction
  peptideSet_train <- unlist(
    lapply(trainModelResults$"TrainModelResults", function(res){lapply(res, function(r){r$"predDT"$"Peptide"})})
  )
  peptideSet_external <- scoreDT$"Peptide"
  if(length(intersect(peptideSet_train, peptideSet_external))>=1){
    warning("We recommend predicting scores by extrapolation only for peptides that were not used for the model training phase. For the peptides used for model training, the Immunogenicity_Score function may be used instead.")
  }

  return(scoreDT_list)
}
masato-ogishi/Repitope documentation built on Feb. 14, 2023, 5:47 a.m.