R/pipeMostVar.R

Defines functions pipeMostVar

Documented in pipeMostVar

#' Evaluate \code{glmnet} Prediction on Methylation Data
#'
#' @description Read data based on one row of \code{designInfo_df}, select the
#'   most variable CpGs as predictors, then fit an elastic net, random forest
#'   or support vector machine model and evaluate its prediction performance.
#'
#' @param rowNum row number of \code{designInfo_df} to use; the \code{seed},
#'   \code{NumOfRep} and \code{NumOfCv} values of that row are passed on to the
#'   model fitting step
#' @param Beta_df Beta_df is a data frame in which each row is a cpg probe,
#'   each col is a sample id, each cell is a Beta value, and the first column
#'   is the phenodata. Please make sure this column is a factor with levels,
#'   or we can not ensure accuracy of the results
#' @param beta2M whether to transform beta values to M values before prediction
#' @param respCol_index response variable col number in beta data frame
#' @param designInfo_df information df generated by the \link{summaryInfo}
#'   function
#' @param alphaValue vector that stores alpha values
#' @param ncores number of cores to do parallel computing
#' @param npredictors number of cpgs chosen to be predictors (default set to
#'   5k, suggest 5k, 10k, 20k, 50k). If more predictors are requested than are
#'   available, all available ones are used
#' @param predictMethod what prediction method to use: \code{"glmnet"},
#'   \code{"randomForest"} or \code{"svm"} (support vector machine). When left
#'   at its default, the first choice (\code{"glmnet"}) is used
#' @param outcome_type type of outcome variable, gaussian or binomial or
#'   poisson, etc
#' @param save whether to save the results
#' @param resultPath path prefix used to store results. It is prepended
#'   verbatim to the file name, so a directory should end with a path
#'   separator. \code{NULL} or \code{FALSE} means no prefix (the file is
#'   written relative to the working directory)
#'
#' @return return a list with three elements,\cr
#' \enumerate{
#'     \item \code{Fit}: the fit model results of the chosen prediction method
#'     \item \code{PredPerformance}: the data frame that contains evaluation
#'           parameters of the prediction method's performance:\cr
#'         for glmnet, the data frame has row number equal to number of alpha
#'           values given in the function argument times 16 columns with
#'           different evaluation parameters including NumOfRep, NumOfCv,
#'           auc_results, Sensitivity, Specificity, etc;\cr
#'         for random forest and support vector machine, the data frame has one
#'           row times 14 columns with different evaluation parameters
#'           including NumOfRep, NumOfCv, auc_results, Sensitivity,
#'           Specificity, etc\cr
#'     \item \code{numOfPredictors}: the number of predictors actually used
#'  }
#' @details
#' \describe{
#'  \item{predictMethod1: }{Elastic net from function \link[glmnet]{glmnet}
#'      to do prediction}
#'  \item{predictMethod2: }{Random Forest from function \link[caret]{train}
#'      to do prediction(requires package "randomForest" installed first)}
#'  \item{predictMethod3: }{Support Vector Machine from function
#'      \link[caret]{train} to do prediction(requires package "kernlab" installed)}
#' }
#'
#' When \code{save = TRUE} the result list is written with
#' \code{\link{saveRDS}} to
#' \code{<resultPath>var-<npredictors>-<predictMethod>_row<rowNum>.RDS}, where
#' \code{npredictors} is the requested (not the capped) value.
#'
#' @importFrom stats sd
#'
#' @export
#'
#' @examples \dontrun{
#'  data(Example_df)
#'  data(pfcInfo_df)
#'
#'  test <- pipeMostVar(
#'    rowNum = 10,
#'    Beta_df = Example_df,
#'    beta2M = TRUE,
#'    respCol_index = 1,
#'    designInfo_df = pfcInfo_df,
#'    alphaValue = seq(0, 1, by = 0.1),
#'    ncores = 2,
#'    npredictors = 5000,
#'    predictMethod = "glmnet",
#'    outcome_type = "binomial",
#'    save = FALSE,
#'    resultPath = NULL
#'  )
#' }
pipeMostVar <- function(rowNum, Beta_df, beta2M = TRUE, respCol_index,
                        designInfo_df, alphaValue = seq(0, 1, by = 0.1),
                        ncores = 2, npredictors = 5000,
                        predictMethod = c("glmnet", "randomForest", "svm"),
                        outcome_type = "binomial",
                        save = FALSE, resultPath = FALSE){

  # Collapse the default choice vector to a single method; otherwise the whole
  # length-3 vector would be forwarded to the fitting step and would also
  # produce three file names when saving.
  predictMethod <- match.arg(predictMethod)

  ## 1. divide methylation data into train and test data  ####################
  TrainTest <- methSplit(
    rowNum = rowNum,
    Beta_df = Beta_df,
    designInfo_df = designInfo_df
  )
  BetaPhenoTrain_df <- TrainTest$Train_df
  BetaPhenoTest_df <- TrainTest$Test_df

  # optionally transform to M values; predictors come back as matrices
  trans_list <- beta2M_wrapper(
    beta2M = beta2M,
    BetaPhenoTrain_df = BetaPhenoTrain_df,
    BetaPhenoTest_df = BetaPhenoTest_df,
    respCol_index = respCol_index,
    returnType = "matrix"
  )

  Train_mat <- trans_list$Train
  Test_mat <- trans_list$Test


  ## 2. rank columns by their standard deviation in the training data only
  stand.dev <- apply(Train_mat, 2, sd)
  # sort() drops NA values, so columns with an undefined sd are never chosen
  std.sorted <- sort(stand.dev, decreasing = TRUE)

  ## 3. choose predictors
  # Cap the request at what is available: indexing past the end would yield
  # NA names and make the column subset below fail.
  n_keep <- min(npredictors, length(std.sorted))
  predictors <- names(std.sorted)[seq_len(n_keep)]
  p <- length(predictors)

  ##  subset based on predictors ###########
  # drop = FALSE keeps a one-column matrix (and its column name) when only a
  # single predictor is selected.
  Train_df <- as.data.frame(Train_mat[, predictors, drop = FALSE])
  Test_df  <- as.data.frame(Test_mat[, predictors, drop = FALSE])

  # The response is bound as the FIRST column of the modelling frames.
  # NOTE(review): respCol_index is still forwarded to predict_wrapper() below;
  # that looks correct only when respCol_index is 1 -- confirm against
  # predict_wrapper().
  PhenoTrain_df <- cbind(
    BetaPhenoTrain_df[, respCol_index],
    Train_df
  )
  PhenoTest_df <- cbind(
    BetaPhenoTest_df[, respCol_index],
    Test_df
  )

  ## 4. fit model and do prediction  ##########################################
  seed.value <- designInfo_df$seed[rowNum]
  NumOfRep   <- designInfo_df$NumOfRep[rowNum]
  NumOfCv    <- designInfo_df$NumOfCv[rowNum]

  predict_ls <- predict_wrapper(
    predictMethod = predictMethod,
    alphaValue = alphaValue,
    PhenoTrain_df = PhenoTrain_df,
    PhenoTest_df = PhenoTest_df,
    respCol_index = respCol_index,
    outcome_type = outcome_type,
    seed_int = seed.value,
    whichRep_int = NumOfRep,
    whichCVfold_int = NumOfCv,
    ncores = ncores
  )

  out_ls <- list(
    Fit = predict_ls$Fit,
    PredPerformance = predict_ls$performance,
    numOfPredictors = p
  )

  ## 5. optionally persist the results  #######################################
  if (save) {
    # The default resultPath = FALSE (and NULL) mean "no prefix"; pasting
    # FALSE verbatim would create a file literally named "FALSEvar-...".
    path_prefix <- if (is.null(resultPath) || identical(resultPath, FALSE)) {
      ""
    } else {
      resultPath
    }
    saveRDS(
      out_ls,
      paste0(path_prefix, "var-", npredictors, "-",
             predictMethod, "_row", rowNum, ".RDS")
    )
  }

  out_ls

}
lizhongliu1996/PredictMisc documentation built on Aug. 23, 2019, 5:55 a.m.