R/pipeCometh.R

Defines functions pipeCometh

Documented in pipeCometh

#' Evaluate Prediction on Methylation Data
#'
#' @description Read data, based on one row of information_df, then use aclust
#'  methods to select cpgs as predictors fit \link{glmnet} elastic net,
#'  \link{caret} random forest or support vector machine model and evaluate its
#'   prediction performance.
#'
#' @param rowNum num of row in information_df
#' @param Beta_df Beta_df is a data frame that each row is a cpg probe, each col
#'   is a sample id, each cell is a Beta value, first column is the phenodata, please
#'   make sure this column is a factor with levels, or we can not ensure accuracy
#'   of the results
#' @param beta2M whether transfre beta to m value before prediction
#' @param respCol_index response variable col number in beta data frame
#' @param designInfo_df information df generate by \link{summaryInfo} function
#' @param chromeAnnot_ls list that contains 22 chrome annot probe information
#' @param alphaValue vector that storage alpha values
#' @param ncores number of cores to do parallel computing
#' @param chooseCpGs what cpg selection method to use, use full cpgs \link{fullCpGs}
#'        within cluster or PC1 score \link{getPC1} of cluster or maximum \link{maxCpGs}
#'        expression score, default set to fullcpgs, it is feasible to write new methods
#'        by adding a new function that take in train/test dataset and cpglist then
#'        return a list of train and test subset data.
#' @param predictMethod what prediction method to use
#' @param outcome_type type of outcome variable, gauusian or binomial or poisson, etc
#' @param resultPath path to storage results
#' @param save whether to save the results
#'
#' @return return a list with three elements,\cr
#' \enumerate{
#'     \item first element is the fit model results of different prediction methods
#'     \item Second item second element is the data frame that contains evalutation parameters of
#'           different prediction methods' performace:\cr
#'         for glmnet net, the data frame has row number equal to number of alpha
#'           values given in the function argument times 16 columns with different
#'           evaluation parameters including NumOfRep,NumOfCv, auc_results,
#'           Sensitivity, Specificity, etc;\cr
#'         for random forest and support vector machine, the data frame has one
#'           row times 14 columns with different evaluation parameters including
#'           NumOfRep,NumOfCv, auc_results, Sensitivity, Specificity, etc\cr
#'     \item third element is a vector that indicate number of predictors used
#'  }
#' @details
#' \describe{
#'  \item{predictMethod1: }{Elastic net from function \link[glmnet]{glmnet}
#'      to do prediction}
#'  \item{predictMethod2: }{Random Forest from function \link[caret]{train}
#'      to do prediction(requires package "randomForest" installed first)}
#'  \item{predictMethod3: }{Support Vector Machine from function
#'      \link[caret]{train} to do prediction(requires package "kernlab" installed)}
#' }
#'
#' @importFrom pathwayPCA TransposeAssay
#' @importFrom coMethDMR CoMethAllRegions
#' @importFrom parallel stopCluster makeCluster clusterEvalQ clusterExport parLapply
#'
#' @export
#'
#' @examples \dontrun{
#'  data(Example_df)
#'  data(pfcInfo_df)
#'
#'  test1 <- pipeCometh(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = fullCpGs,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#'
#'  test2 <- pipeCometh(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    chromeAnnot_ls = chrome_annot_files,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = getPC1,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#'
#'  test3 <- pipeCometh(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    chromeAnnot_ls = chrome_annot_files,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = maxCpGs,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#' }

pipeCometh <- function(rowNum, Beta_df, beta2M = TRUE, respCol_index,
                       designInfo_df, chromeAnnot_ls,
                       alphaValue = seq(0, 1, by = 0.1), ncores = 2,
                       chooseCpGs = fullCpGs,
                       predictMethod = c("glmnet", "randomForest", "svm"),
                       outcome_type = "binomial",save = FALSE,resultPath = NULL){

  ## 1. divided methylation data into train and test data  ####################
  TrainTest <- methSplit(rowNum = rowNum, Beta_df = Beta_df, designInfo_df = designInfo_df)
  BetaPhenoTrain_df <- TrainTest$Train_df
  BetaPhenoTest_df <- TrainTest$Test_df

  tBetaPhenoTrain_df <- TransposeAssay(BetaPhenoTrain_df[ , -respCol_index], omeNames = "rowNames")

  ## use comethdmr to select cpgs
  regions_char <- c("NSHORE", "NSHELF", "SSHORE", "SSHELF", "TSS1500", "TSS200",
                    "UTR5", "EXON1", "GENEBODY", "UTR3")

  threads <- ncores
  cl <- makeCluster(threads)
  clusterEvalQ(cl, library(coMethDMR))
  clusterExport(cl, varlist = c("regions_char", "tBetaPhenoTrain_df"), envir = environment())

  coMeth.list  <- parLapply(cl, 1:10, function(i){
      out_ls <- CoMethAllRegions(
        betaMatrix = tBetaPhenoTrain_df,
        regionType = regions_char[i],
        arrayType = "450k",
        returnAllCpGs = FALSE
      )
  })

  stopCluster(cl)

  coMeth.list <- unlist(coMeth.list, recursive = FALSE)

  #whether transfer to mvalue
  trans_list <- beta2M_wrapper(
    beta2M = beta2M,
    BetaPhenoTrain_df = BetaPhenoTrain_df,
    BetaPhenoTest_df = BetaPhenoTest_df,
    respCol_index = respCol_index,
    returnType = "data.frame"
  )

  Train_df <- trans_list$Train
  Test_df <- trans_list$Test

  ## 2. compute predictors based on different selection methods
  sumTnT_ls <- summarizeCpGs(
    clust_ls = coMeth.list,
    train_df = Train_df,
    test_df = Test_df,
    selectMethod = chooseCpGs
  )
  Train_df <- sumTnT_ls$train_df
  Test_df <- sumTnT_ls$test_df
  p <- sumTnT_ls$npredictors

  PhenoTrain_df <- cbind(
    BetaPhenoTrain_df[ , respCol_index],
    Train_df
  )
  PhenoTest_df <- cbind(
    BetaPhenoTest_df[ , respCol_index],
    Test_df
  )

  ## 4. fit model and do prediction  ##########################################
  seed.value <- designInfo_df$seed[rowNum]
  NumOfRep   <- designInfo_df$NumOfRep[rowNum]
  NumOfCv    <- designInfo_df$NumOfCv[rowNum]

  predict_ls <- predict_wrapper(
    predictMethod = predictMethod,
    alphaValue = alphaValue,
    PhenoTrain_df = PhenoTrain_df,
    PhenoTest_df = PhenoTest_df,
    respCol_index = respCol_index,
    outcome_type = outcome_type,
    seed_int = seed.value,
    whichRep_int = NumOfRep,
    whichCVfold_int = NumOfCv,
    ncores = ncores
  )

  out_ls <- list(
    Fit = predict_ls$Fit,
    PredPerformance = predict_ls$performance,
    numOfPredictors = p
  )


  if(save){
    saveRDS(out_ls, paste0(resultPath,"coMethDMR-", selectMethod,"-",
                           predictMethod, rowNum,".RDS"))
  }

  out_ls

}
lizhongliu1996/PredictMisc documentation built on Aug. 23, 2019, 5:55 a.m.