R/pipeAclust.R

Defines functions pipeAclust

Documented in pipeAclust

#' Evaluate Prediction on Methylation Data
#'
#' @description Read data, based on one row of information_df, then use aclust
#'  methods to select cpgs as predictors fit \link{glmnet} elastic net,
#'  \link{caret} random forest or support vector machine model and evaluate its
#'   prediction performance.
#'
#' @param rowNum num of row in information_df
#' @param Beta_df Beta_df is a data frame that each row is a cpg probe, each col
#'   is a sample id, each cell is a Beta value, first column is the phenodata, please
#'   make sure this column is a factor with levels, or we can not ensure accuracy
#'   of the results
#' @param beta2M whether transfre beta to m value before prediction
#' @param respCol_index response variable col number in beta data frame
#' @param designInfo_df information df generate by \link{summaryInfo} function
#' @param alphaValue vector that storage alpha values
#' @param ncores number of cores to do parallel computing
#' @param chromeAnnot_ls list that contains 22 chrome annot probe information
#' @param chooseCpGs what cpg selection method to use, use full cpgs \link{fullCpGs}
#'        within cluster or PC1 score \link{getPC1} of cluster or maximum \link{maxCpGs}
#'        expression score, default set to fullcpgs, it is feasible to write new methods
#'        by adding a new function that take in train/test dataset and cpglist then
#'        return a list of train and test subset data.
#' @param predictMethod what prediction method to use
#' @param outcome_type type of outcome variable, gauusian or binomial or poisson, etc
#' @param resultPath path to storage results
#' @param save whether to save the results
#'
#' @return return a list with three elements,\cr
#' \enumerate{
#'     \item first element is the fit model results of different prediction methods
#'     \item Second item second element is the data frame that contains evalutation parameters of
#'           different prediction methods' performace:\cr
#'         for glmnet net, the data frame has row number equal to number of alpha
#'           values given in the function argument times 16 columns with different
#'           evaluation parameters including NumOfRep,NumOfCv, auc_results,
#'           Sensitivity, Specificity, etc;\cr
#'         for random forest and support vector machine, the data frame has one
#'           row times 14 columns with different evaluation parameters including
#'           NumOfRep,NumOfCv, auc_results, Sensitivity, Specificity, etc\cr
#'     \item third element is a vector that indicate number of predictors used
#'  }
#' @details
#' \describe{
#'  \item{predictMethod1: }{Elastic net from function \link[glmnet]{glmnet}
#'      to do prediction}
#'  \item{predictMethod2: }{Random Forest from function \link[caret]{train}
#'      to do prediction(requires package "randomForest" installed first)}
#'  \item{predictMethod3: }{Support Vector Machine from function
#'      \link[caret]{train} to do prediction(requires package "kernlab" installed)}
#' }
#'
#' @importFrom pathwayPCA TransposeAssay
#'
#' @export
#'
#' @examples \dontrun{
#'  data(Example_df)
#'  data(pfcInfo_df)
#'  data(chrome_annot_files)
#'
#'  test1 <- pipeAclust(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    chromeAnnot_ls = chrome_annot_files,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = fullCpGs,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#'
#'  test2 <- pipeAclust(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    chromeAnnot_ls = chrome_annot_files,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = getPC1,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#'
#'  test3 <- pipeAclust(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    chromeAnnot_ls = chrome_annot_files,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    chooseCpGs = maxCpGs,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#' }

pipeAclust <- function(rowNum, Beta_df, beta2M = TRUE, respCol_index,
                       designInfo_df, chromeAnnot_ls,
                       alphaValue = seq(0, 1, by = 0.1), ncores = 2,
                       chooseCpGs = fullCpGs,
                       predictMethod = c("glmnet", "randomForest", "svm"),
                       outcome_type = "binomial", save = FALSE, resultPath = NULL){

  ## 1. divided methylation data into train and test data  ####################
  TrainTest <- methSplit(rowNum = rowNum, Beta_df = Beta_df, designInfo_df = designInfo_df)
  BetaPhenoTrain_df <- TrainTest$Train_df
  BetaPhenoTest_df <- TrainTest$Test_df

  ## generate Acluster and extract cpg names out ##
  ## first column of betapheno is phenodata, remove first column
  tBetaPhenoTrain_df <- TransposeAssay(
    BetaPhenoTrain_df[, -respCol_index],
    omeNames = "rowNames"
  )

  aclust.list <- runAcluster(
    chromeAnnot_df = chromeAnnot_ls,
    tBeta_df = tBetaPhenoTrain_df,
    minCpGs = 5,
    ncores = ncores
  )
  names(aclust.list) <- NULL

  # do log transfer after aclust because aclust require beta values
  #whether transfer to mvalue
  trans_list <- beta2M_wrapper(
    beta2M = beta2M,
    BetaPhenoTrain_df = BetaPhenoTrain_df,
    BetaPhenoTest_df = BetaPhenoTest_df,
    respCol_index = respCol_index,
    returnType = "data.frame"
  )

  Train_df <- trans_list$Train
  Test_df <- trans_list$Test

  ## 2. compute predictors based on different selection methods
  sumTnT_ls <- summarizeCpGs(
    clust_ls = aclust.list,
    train_df = Train_df,
    test_df = Test_df,
    selectMethod = chooseCpGs
  )
  Train_df <- sumTnT_ls$train_df
  Test_df <- sumTnT_ls$test_df
  p <- sumTnT_ls$npredictors

  PhenoTrain_df <- cbind(
    BetaPhenoTrain_df[respCol_index],
    Train_df
  )
  PhenoTest_df <- cbind(
    BetaPhenoTest_df[respCol_index],
    Test_df
  )

  ## 4. fit model and do prediction  ##########################################
  seed.value <- designInfo_df$seed[rowNum]
  NumOfRep   <- designInfo_df$NumOfRep[rowNum]
  NumOfCv    <- designInfo_df$NumOfCv[rowNum]

  predict_ls <- predict_wrapper(
    predictMethod = predictMethod,
    alphaValue = alphaValue,
    PhenoTrain_df = PhenoTrain_df,
    PhenoTest_df = PhenoTest_df,
    respCol_index = respCol_index,
    outcome_type = outcome_type,
    seed_int = seed.value,
    whichRep_int = NumOfRep,
    whichCVfold_int = NumOfCv,
    ncores = ncores
  )

  out_ls <- list(
    Fit = predict_ls$Fit,
    PredPerformance = predict_ls$performance,
    numOfPredictors = p
  )

  if(save){
    saveRDS(out_ls, paste0(resultPath, "aclust-", selectMethod,"-",
                           predictMethod, rowNum,".RDS"))
  }

  out_ls

}
lizhongliu1996/PredictMisc documentation built on Aug. 23, 2019, 5:55 a.m.