#' Feature preprocessing.
#'
#' \code{Features_Preprocess} does preprocessing, i.e. centering and rescaling using the internally separated training subdata.\cr
#' \code{Features_CorFilter} filters highly correlated features. Internally using \code{parCor} to do parallelized calculation of a correlational matrix.\cr
#' \code{Features_FeatureSelect} selects important features. See also \code{Features_Importance}.\cr
#' \code{inverseColOrderDFList} takes \code{preprocessedDFList} and reverses the column order of each data.table.
#'
#' @param featureDFList A list of feature dataframes generated by \code{Features}, or imported by \code{readFeatureDFList}.
#' @param metadataDF A dataframe containing metadata, consisting of "Peptide", "Immunogenicity", and "Cluster" columns.
#' @param seedSet A set of random seeds.
#' @param preprocessedDFList A list of feature dataframes generated by \code{Features_Preprocess} or \code{Features_CorFilter}.
#' @param corThreshold The threshold of correlation to eliminate features.
#' @param featureN The number of features to be retained.
#' @param coreN The number of cores to be used for parallelization. Set \code{NULL} to disable.
#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_Preprocess <- function(featureDFList, metadataDF, seedSet = 1:5) {
  # Randomly choose one epitope per cluster. The minority outcome class is
  # listed first in the factor levels so that, after shuffling, distinct()
  # retains a minority-class representative wherever a cluster has both.
  peptideClustering <- function(metadataDF, seed = 12345) {
    set.seed(seed)
    dt <- data.table::as.data.table(metadataDF)
    outcome_table <- table(dt$"Immunogenicity")
    if (outcome_table["Positive"] >= outcome_table["Negative"]) {
      outcome_minor_major <- c("Negative", "Positive")
    } else {
      outcome_minor_major <- c("Positive", "Negative")
    }
    # Shuffle rows so distinct() keeps a random representative per cluster
    dt <- dt[sample.int(nrow(dt)), ]
    dt[, Immunogenicity := factor(Immunogenicity, levels = outcome_minor_major)]
    dt <- dplyr::distinct(dt, Immunogenicity, Cluster, .keep_all = TRUE)
    dt <- dplyr::distinct(dt, Cluster, .keep_all = TRUE)
    dt <- dplyr::distinct(dt, Peptide, .keep_all = TRUE)
    data.table::setorder(dt, Peptide)
    # Restore the canonical level order for downstream modeling
    dt[, Immunogenicity := factor(Immunogenicity, levels = c("Positive", "Negative"))]
    return(dt)
  }
  # Stratified train/test split on Immunogenicity; prepends a "DataType" column.
  splitData <- function(metadata, seed = 12345, trainRatio = 0.8) {
    set.seed(seed)
    # BUGFIX: previously read the enclosing `metadataDF` instead of the
    # `metadata` argument, silently discarding the clustered metadata that
    # preprocessWrapper() passes in.
    dt <- data.table::as.data.table(metadata)
    dataTypes <- rep("Test", nrow(dt))
    dataTypes[caret::createDataPartition(dt$"Immunogenicity", p = trainRatio, list = FALSE)] <- "Train"
    dataTypes <- factor(dataTypes, levels = c("Train", "Test"))
    dt[, DataType := dataTypes]
    data.table::setcolorder(dt, unique(c("DataType", colnames(dt))))
    return(dt)
  }
  # Center/scale features using statistics computed on the training subset only,
  # then apply that transform to the full (train + test) joined table.
  # Dummy-encoded "Peptide_*" columns are excluded from rescaling.
  rescaleFeatures <- function(featureDF, metadataDF) {
    df <- dplyr::left_join(metadataDF, featureDF, by = "Peptide")
    peptideSet_train <- dplyr::filter(df, DataType == "Train")$"Peptide"
    featureSet <- setdiff(colnames(featureDF), "Peptide")
    dummyFeatureSet <- grep("Peptide_", featureSet, value = TRUE)
    pp_train <- featureDF %>%
      dplyr::filter(Peptide %in% peptideSet_train) %>%
      dplyr::select(setdiff(featureSet, dummyFeatureSet)) %>%
      as.data.frame() %>%
      caret::preProcess(method = c("center", "scale"), verbose = FALSE)
    df <- predict(pp_train, as.data.frame(df))
    dt <- data.table::as.data.table(df)
    list("dt" = dt, "pp_train" = pp_train)
  }
  # One preprocessing run: align peptides, cluster + split metadata, rescale.
  preprocessWrapper <- function(featureDF, metadataDF, seed = 12345) {
    # Keep only peptides present in both the metadata and the feature table
    dt_meta <- data.table::as.data.table(metadataDF)
    dt_feature <- data.table::as.data.table(featureDF)
    peptideSet <- intersect(dt_meta$"Peptide", dt_feature$"Peptide")
    dt_meta <- dt_meta[Peptide %in% peptideSet, ]
    dt_feature <- dt_feature[Peptide %in% peptideSet, ]
    # Metadata preprocessing
    dt_meta <- splitData(peptideClustering(dt_meta, seed = seed), seed = seed, trainRatio = 0.8)
    # Feature preprocessing
    dt_feature <- rescaleFeatures(dt_feature, dt_meta)
    pp_train <- dt_feature$"pp_train"
    dt_feature <- dt_feature$"dt"
    # Free intermediates before returning (feature tables can be large)
    rm(list = setdiff(ls(), c("dt_feature", "pp_train")))
    gc(); gc()
    list("dt" = dt_feature, "pp_train" = pp_train)
  }
  # Main workflow: one run per (feature set, seed) combination
  message("Preprocessing...")
  parameterGrid <- expand.grid(seq_along(featureDFList), seedSet)
  preprocessedDFList <- pbapply::pbapply(
    parameterGrid, 1,
    function(v) {
      preprocessWrapper(featureDFList[[v[[1]]]], metadataDF, seed = v[[2]])
    }
  )
  ## Append the random seed used to each feature-set name, e.g. "FeatureSet.12345"
  nameList <- apply(parameterGrid, 1, function(v) { paste0(names(featureDFList)[v[[1]]], ".", v[[2]]) })
  names(preprocessedDFList) <- nameList
  return(preprocessedDFList)
}
#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_CorFilter <- function(preprocessedDFList, corThreshold=0.75, coreN=parallel::detectCores(logical=F)){
  # Correlation-based feature elimination using the training subset only.
  # Returns both the filtered table and the training correlation matrix.
  corFilter <- function(df, corThreshold = 0.75, coreN = NULL) {
    dt <- data.table::as.data.table(df)
    dt_train <- data.table::copy(dt)
    dt_train <- dt_train[DataType == "Train", ]
    dt_train <- dt_train[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with = FALSE]
    message("Calculating correlation matrix...")
    corMat <- parCor(dt_train, coreN = coreN)
    ## Variables with zero standard deviation yield NA correlations; replace
    ## with zero to prevent an error in caret::findCorrelation
    corMat[which(is.na(corMat))] <- 0
    removedFeatureSet <- caret::findCorrelation(corMat, cutoff = corThreshold, verbose = FALSE, names = TRUE)
    if (length(removedFeatureSet) > 0) dt <- dt[, -removedFeatureSet, with = FALSE]
    message(length(removedFeatureSet), " features were removed based on their correlations.")
    rm(list = setdiff(ls(), c("dt", "corMat")))
    gc(); gc()
    return(list("dt" = dt, "corMat" = corMat))
  }
  # Main workflow: filter each preprocessed table in turn
  message("Correlation-based feature elimination...")
  time.start <- proc.time()
  combinedParamSet <- names(preprocessedDFList)
  # seq_along() (not 1:length()) so an empty input list iterates zero times
  preprocessedDFList <- foreach::foreach(i = seq_along(combinedParamSet)) %do% {
    cat(i, "/", length(combinedParamSet), ": ", combinedParamSet[i], "\n", sep = "")
    corFilter(preprocessedDFList[[i]][["dt"]], corThreshold = corThreshold, coreN = coreN)
  }
  names(preprocessedDFList) <- combinedParamSet
  time.end <- proc.time()
  message("Overall time required = ", format((time.end - time.start)[3], nsmall = 2), "[sec]")
  return(preprocessedDFList)
}
#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_FeatureSelect <- function(preprocessedDFList, featureN=100, coreN=parallel::detectCores(logical=F)){
  # Selection by filtering: univariate screen via caret::sbf with random forests,
  # fitted on the training subset only.
  Features_SBF_Single <- function(df, seed = 12345, coreN = NULL) {
    # Reproducible per-resample seeds for sbf: one per fold plus one final fit
    caretSeeds_SBF <- function(seed = 12345, number = 5) {
      set.seed(seed)
      seeds <- sample.int(10000, number + 1)
      return(seeds)
    }
    message("Selection by filtering.")
    dt <- data.table::as.data.table(df)
    dt <- dt[DataType == "Train", ]
    outcomes <- dt$"Immunogenicity"
    dt <- dt[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with = FALSE]
    sbfCtrl <- caret::sbfControl(
      functions = caret::rfSBF,
      method = "cv", number = 5, seeds = caretSeeds_SBF(seed = seed, number = 5),
      verbose = TRUE, allowParallel = TRUE
    )
    # NOTE(review): outcomes are passed as numeric factor codes (1/2), so rfSBF
    # runs in regression rather than classification mode -- confirm intended.
    if (!is.null(coreN)) {
      cl <- parallel::makeCluster(coreN, type = 'SOCK')
      # Release the cluster even if caret::sbf() errors out
      on.exit(parallel::stopCluster(cl), add = TRUE)
      parallel::clusterExport(
        cl,
        list("dt", "outcomes", "sbfCtrl"),
        envir = environment()
      )
      doParallel::registerDoParallel(cl)
      sbfRes <- caret::sbf(
        dt, as.numeric(outcomes), sbfControl = sbfCtrl
      )
    } else {
      sbfRes <- caret::sbf(
        dt, as.numeric(outcomes), sbfControl = sbfCtrl
      )
    }
    # Keep metadata columns plus the selected features on the FULL table
    sbfFeatureSet <- caret::predictors(sbfRes)
    sbfFeatureSet <- c("DataType", "Peptide", "Immunogenicity", "Cluster", sbfFeatureSet)
    dt <- data.table::as.data.table(df)
    message(length(setdiff(colnames(dt), sbfFeatureSet)), " features were removed.")
    dt <- dt[, sbfFeatureSet, with = FALSE]
    # Output
    rm(list = setdiff(ls(), c("dt", "sbfRes")))
    gc(); gc()
    list("dt" = dt, "sbfRes" = sbfRes)
  }
  # Recursive feature elimination with random forests (caret::rfe) down to
  # `sizes` features, fitted on the training subset only.
  Features_RFE_Single <- function(df, sizes = 100, seed = 12345, coreN = NULL) {
    # Reproducible per-resample seeds for rfe: two per fold plus one final fit
    caretSeeds_RFE <- function(seed = 12345, number = 10) {
      set.seed(seed)
      seeds <- vector(mode = "list", length = number + 1)
      for (i in 1:(number)) seeds[[i]] <- sample.int(10000, 2)
      seeds[[number + 1]] <- sample.int(10000, 1)
      return(seeds)
    }
    message("Recursive feature elimination.")
    dt <- data.table::as.data.table(df)
    dt <- dt[DataType == "Train", ]
    outcomes <- dt$"Immunogenicity"
    dt <- dt[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with = FALSE]
    rfeCtrl <- caret::rfeControl(
      functions = caret::rfFuncs, rerank = FALSE,
      method = "cv", number = 5, seeds = caretSeeds_RFE(seed = seed, number = 5),
      verbose = TRUE, allowParallel = TRUE
    )
    if (!is.null(coreN)) {
      cl <- parallel::makeCluster(coreN, type = 'SOCK')
      # Release the cluster even if caret::rfe() errors out
      on.exit(parallel::stopCluster(cl), add = TRUE)
      parallel::clusterExport(
        cl,
        list("dt", "outcomes", "rfeCtrl"),
        envir = environment()
      )
      doParallel::registerDoParallel(cl)
      rfeRes <- caret::rfe(
        dt, outcomes, sizes = sizes, metric = "Kappa", rfeControl = rfeCtrl
      )
    } else {
      rfeRes <- caret::rfe(
        dt, outcomes, sizes = sizes, metric = "Kappa", rfeControl = rfeCtrl
      )
    }
    # Keep metadata columns plus the selected features on the FULL table
    rfeFeatureSet <- caret::predictors(rfeRes)
    rfeFeatureSet <- c("DataType", "Peptide", "Immunogenicity", "Cluster", rfeFeatureSet)
    dt <- data.table::as.data.table(df)
    message(length(setdiff(colnames(dt), rfeFeatureSet)), " features were removed.")
    dt <- dt[, rfeFeatureSet, with = FALSE]
    # Output
    rm(list = setdiff(ls(), c("dt", "rfeRes")))
    gc(); gc()
    list("dt" = dt, "rfeRes" = rfeRes)
  }
  # Main workflow: SBF first, then RFE only if more than featureN features remain
  message("Feature selection...")
  time.start <- proc.time()
  combinedParamSet <- names(preprocessedDFList)
  # seq_along() (not 1:length()) so an empty input list iterates zero times
  preprocessedDFList <- foreach::foreach(i = seq_along(combinedParamSet)) %do% {
    cat(i, "/", length(combinedParamSet), ": ", combinedParamSet[i], "\n", sep = "")
    # Recover the seed appended to the name by Features_Preprocess ("Name.seed")
    seed <- as.numeric(as.character(rev(unlist(stringr::str_split(combinedParamSet[i], stringr::fixed("."))))[1]))
    res <- list(
      "dt" = preprocessedDFList[[i]][["dt"]],
      "sbfRes" = NULL,
      "rfeRes" = NULL
    )
    res <- modifyList(res, Features_SBF_Single(res$"dt", seed = seed, coreN = coreN))
    if (length(setdiff(colnames(res$"dt"), c("DataType", "Peptide", "Immunogenicity", "Cluster"))) > featureN) {
      res <- modifyList(res, Features_RFE_Single(res$"dt", sizes = featureN, seed = seed, coreN = coreN))
    }
    res
  }
  names(preprocessedDFList) <- combinedParamSet
  time.end <- proc.time()
  message("Overall time required = ", format((time.end - time.start)[3], nsmall = 2), "[sec]")
  return(preprocessedDFList)
}
#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
inverseColOrderDFList <- function(preprocessedDFList){
  # Reverse the column order of the "dt" element of a single list entry,
  # working on a copy so the caller's data.table is left untouched.
  reverseOne <- function(entry) {
    reordered <- data.table::copy(entry$"dt")
    data.table::setcolorder(reordered, rev(colnames(reordered)))
    entry$"dt" <- reordered
    entry
  }
  lapply(preprocessedDFList, reverseOne)
}