R/Features_Preprocessing.R

Defines functions inverseColOrderDFList Features_FeatureSelect Features_CorFilter Features_Preprocess

Documented in Features_CorFilter Features_FeatureSelect Features_Preprocess inverseColOrderDFList

#' Feature preprocessing.
#'
#' \code{Features_Preprocess} does preprocessing, i.e. centering and rescaling using the internally separated training subdata.\cr
#' \code{Features_CorFilter} filters highly correlated features. Internally using \code{parCor} to do parallelized calculation of a correlational matrix.\cr
#' \code{Features_FeatureSelect} selects important features. See also \code{Features_Importance}.\cr
#' \code{inverseColOrderDFList} takes \code{preprocessedDFList} and reverses the column order of the datatable.
#'
#' @param featureDFList A list of feature dataframes generated by \code{Features}, or imported by \code{readFeatureDFList}.
#' @param metadataDF A dataframe containing metadata, consisting of "Peptide", "Immunogenicity", and "Cluster" columns.
#' @param seedSet A set of random seeds.
#' @param preprocessedDFList A list of feature dataframes generated by \code{Features_Preprocess} or \code{Features_CorFilter}.
#' @param corThreshold The threshold of correlation to eliminate features.
#' @param featureN The number of features to be retained.
#' @param coreN The number of cores to be used for parallelization. Set \code{NULL} to disable.
#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_Preprocess <- function(featureDFList, metadataDF, seedSet=1:5){
  # Randomly choose one epitope per each cluster [Minor class is prioritized]
  peptideClustering <- function(metadataDF, seed=12345){
    set.seed(seed)
    dt <- data.table::as.data.table(metadataDF)
    outcome_table <- table(dt$"Immunogenicity")
    if(outcome_table["Positive"]>=outcome_table["Negative"]){
      outcome_minor_major <- c("Negative", "Positive")
    }else{
      outcome_minor_major <- c("Positive", "Negative")
    }
    dt <- dt[sample.int(nrow(dt)),]
    dt[, Immunogenicity:=factor(Immunogenicity, levels=outcome_minor_major)]
    dt <- dplyr::distinct(dt, Immunogenicity, Cluster, .keep_all=T)
    dt <- dplyr::distinct(dt, Cluster, .keep_all=T)
    dt <- dplyr::distinct(dt, Peptide, .keep_all=T)
    data.table::setorder(dt, Peptide)
    dt[, Immunogenicity:=factor(Immunogenicity, levels=c("Positive", "Negative"))]
    return(dt)
  }

  # Data splitting
  splitData <- function(metadata, seed=12345, trainRatio=0.8){
    set.seed(seed)
    dt <- data.table::as.data.table(metadataDF)
    dataTypes <- rep("Test", nrow(dt))
    dataTypes[caret::createDataPartition(dt$"Immunogenicity", p=trainRatio, list=F)] <- "Train"
    dataTypes <- factor(dataTypes, levels=c("Train", "Test"))
    dt[, DataType:=dataTypes]
    data.table::setcolorder(dt, unique(c("DataType", colnames(dt))))
    return(dt)
  }

  # Feature rescaling
  rescaleFeatures <- function(featureDF, metadataDF){
    df <- dplyr::left_join(metadataDF, featureDF, by="Peptide")
    peptideSet_train <- dplyr::filter(df, DataType=="Train")$"Peptide"
    featureSet <- setdiff(colnames(featureDF), "Peptide")
    dummyFeatureSet <- grep("Peptide_", featureSet, value=T)
    pp_train <- featureDF %>%
      dplyr::filter(Peptide %in% peptideSet_train) %>%
      dplyr::select(setdiff(featureSet, dummyFeatureSet)) %>%
      as.data.frame() %>%
      caret::preProcess(method=c("center", "scale"), verbose=F)
    df <- predict(pp_train, as.data.frame(df))
    dt <- data.table::as.data.table(df)
    list("dt"=dt, "pp_train"=pp_train)
  }

  # A wrapper function
  preprocessWrapper <- function(featureDF, metadataDF, seed=12345){
    # Input peptide check
    dt_meta <- data.table::as.data.table(metadataDF)
    dt_feature <- data.table::as.data.table(featureDF)
    peptideSet <- intersect(dt_meta$"Peptide", dt_feature$"Peptide")
    dt_meta <- dt_meta[Peptide %in% peptideSet,]
    dt_feature <- dt_feature[Peptide %in% peptideSet,]

    # Metadata preprocessing
    dt_meta <- splitData(peptideClustering(dt_meta, seed=seed), seed=seed, trainRatio=0.8)

    # Feature preprocessing
    dt_feature <- rescaleFeatures(dt_feature, dt_meta)
    pp_train <- dt_feature$"pp_train"
    dt_feature <- dt_feature$"dt"

    # Output
    rm(list=setdiff(ls(), c("dt_feature", "pp_train")))
    gc();gc()
    list("dt"=dt_feature, "pp_train"=pp_train)
  }

  # Main workflow
  message("Preprocessing...")
  parameterGrid <- expand.grid(1:length(featureDFList), seedSet)
  preprocessedDFList <- pbapply::pbapply(
    parameterGrid, 1,
    function(v){
      preprocessWrapper(featureDFList[[v[[1]]]], metadataDF, seed=v[[2]])
    }
  )
  nameList <- apply(parameterGrid, 1, function(v){paste0(names(featureDFList)[v[[1]]], ".", v[[2]])}) ## append the random seed used
  names(preprocessedDFList) <- nameList
  return(preprocessedDFList)
}

#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_CorFilter <- function(preprocessedDFList, corThreshold=0.75, coreN=parallel::detectCores(logical=F)){
  # Correlation-based feature elimination using the training subdataset
  corFilter <- function(df, corThreshold=0.75, coreN=NULL){
    dt <- data.table::as.data.table(df)
    dt_train <- data.table::copy(dt)
    dt_train <- dt_train[DataType=="Train", ]
    dt_train <- dt_train[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with=F]
    message("Calculating correlation matrix...")
    corMat <- parCor(dt_train, coreN=coreN)
    corMat[which(is.na(corMat))] <- 0 ## variables with zero standard deviation result in NA; replace NA with zero to prevent an error in caret::findCorrelation
    removedFeatureSet <- caret::findCorrelation(corMat, cutoff=corThreshold, verbose=F, names=T)
    if(length(removedFeatureSet)>0) dt <- dt[, -removedFeatureSet, with=F]
    message(length(removedFeatureSet), " features were removed based on their correlations.")
    rm(list=setdiff(ls(), c("dt", "corMat")))
    gc();gc()
    return(list("dt"=dt, "corMat"=corMat))
  }

  # Main workflow
  message("Correlation-based feature elimination...")
  time.start <- proc.time()
  conbinedParamSet <- names(preprocessedDFList)
  preprocessedDFList <- foreach::foreach(i=1:length(conbinedParamSet))%do%{
    cat(i, "/", length(conbinedParamSet), ": ", conbinedParamSet[i], "\n", sep="")
    corFilter(preprocessedDFList[[i]][["dt"]], corThreshold=corThreshold, coreN=coreN)
  }
  names(preprocessedDFList) <- conbinedParamSet
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
  return(preprocessedDFList)
}

#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
Features_FeatureSelect <- function(preprocessedDFList, featureN=100, coreN=parallel::detectCores(logical=F)){
  # Selection by filtering
  Features_SBF_Single <- function(df, seed=12345, coreN=NULL){
    caretSeeds_SBF <- function(seed=12345, number=5){
      set.seed(seed)
      seeds <- sample.int(10000, number+1)
      return(seeds)
    }
    message("Selection by filtering.")
    dt <- data.table::as.data.table(df)
    dt <- dt[DataType=="Train", ,]
    outcomes <- dt$"Immunogenicity"
    dt <- dt[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with=F]
    sbfCtrl <- caret::sbfControl(
      functions=caret::rfSBF,
      method="cv", number=5, seeds=caretSeeds_SBF(seed=seed, number=5),
      verbose=T, allowParallel=T
    )
    if(!is.null(coreN)){
      cl <- parallel::makeCluster(coreN, type='SOCK')
      parallel::clusterExport(
        cl,
        list("dt", "outcomes", "sbfCtrl"),
        envir=environment()
      )
      doParallel::registerDoParallel(cl)
      sbfRes <- caret::sbf(
        dt, as.numeric(outcomes), sbfControl=sbfCtrl
      )
      parallel::stopCluster(cl)
    }else{
      sbfRes <- caret::sbf(
        dt, as.numeric(outcomes), sbfControl=sbfCtrl
      )
    }
    sbfFeatureSet <- caret::predictors(sbfRes)
    sbfFeatureSet <- c("DataType", "Peptide", "Immunogenicity", "Cluster", sbfFeatureSet)
    dt <- data.table::as.data.table(df)
    message(length(setdiff(colnames(dt), sbfFeatureSet)), " features were removed.")
    dt <- dt[, sbfFeatureSet, with=F]

    # Output
    rm(list=setdiff(ls(), c("dt", "sbfRes")))
    gc();gc()
    list("dt"=dt, "sbfRes"=sbfRes)
  }

  # Recursive feature elimination
  Features_RFE_Single <- function(df, sizes=100, seed=12345, coreN=NULL){
    caretSeeds_RFE <- function(seed=12345, number=10){
      set.seed(seed)
      seeds <- vector(mode="list", length=number+1)
      for(i in 1:(number)) seeds[[i]] <- sample.int(10000, 2)
      seeds[[number+1]] <- sample.int(10000, 1)
      return(seeds)
    }
    message("Recursive feature elimination.")
    dt <- data.table::as.data.table(df)
    dt <- dt[DataType=="Train", ,]
    outcomes <- dt$"Immunogenicity"
    dt <- dt[, -c("DataType", "Peptide", "Immunogenicity", "Cluster"), with=F]
    rfeCtrl <- caret::rfeControl(
      functions=caret::rfFuncs, rerank=F,
      method="cv", number=5, seeds=caretSeeds_RFE(seed=seed, number=5),
      verbose=T, allowParallel=T
    )
    if(!is.null(coreN)){
      cl <- parallel::makeCluster(coreN, type='SOCK')
      parallel::clusterExport(
        cl,
        list("dt", "outcomes", "rfeCtrl"),
        envir=environment()
      )
      doParallel::registerDoParallel(cl)
      rfeRes <- caret::rfe(
        dt, outcomes, sizes=sizes, metric="Kappa", rfeControl=rfeCtrl
      )
      parallel::stopCluster(cl)
    }else{
      rfeRes <- caret::rfe(
        dt, outcomes, sizes=sizes, metric="Kappa", rfeControl=rfeCtrl
      )
    }
    rfeFeatureSet <- caret::predictors(rfeRes)
    rfeFeatureSet <- c("DataType", "Peptide", "Immunogenicity", "Cluster", rfeFeatureSet)
    dt <- data.table::as.data.table(df)
    message(length(setdiff(colnames(dt), rfeFeatureSet)), " features were removed.")
    dt <- dt[, rfeFeatureSet, with=F]

    # Output
    rm(list=setdiff(ls(), c("dt", "rfeRes")))
    gc();gc()
    list("dt"=dt, "rfeRes"=rfeRes)
  }

  # Main workflow
  message("Feature selection...")
  time.start <- proc.time()
  conbinedParamSet <- names(preprocessedDFList)
  preprocessedDFList <- foreach::foreach(i=1:length(conbinedParamSet))%do%{
    cat(i, "/", length(conbinedParamSet), ": ", conbinedParamSet[i], "\n", sep="")
    seed <- as.numeric(as.character(rev(unlist(stringr::str_split(conbinedParamSet[i], stringr::fixed("."))))[1]))
    res <- list(
      "dt"=preprocessedDFList[[i]][["dt"]],
      "sbfRes"=NULL,
      "rfeRes"=NULL
    )
    res <- modifyList(res, Features_SBF_Single(res$"dt", seed=seed, coreN=coreN))
    if(length(setdiff(colnames(res$"dt"), c("DataType", "Peptide", "Immunogenicity", "Cluster")))>featureN){
      res <- modifyList(res, Features_RFE_Single(res$"dt", sizes=featureN, seed=seed, coreN=coreN))
    }
    res
  }
  names(preprocessedDFList) <- conbinedParamSet
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
  return(preprocessedDFList)
}

#' @export
#' @rdname Features_Preprocessing
#' @name Features_Preprocessing
inverseColOrderDFList <- function(preprocessedDFList){
  lapply(preprocessedDFList, function(l){
    dt <- data.table::copy(l$"dt")
    data.table::setcolorder(dt, rev(colnames(dt)))
    l$"dt" <- dt
    return(l)
  })
}
masato-ogishi/Repitope documentation built on Feb. 14, 2023, 5:47 a.m.