#' Evaluate Prediction on Methylation Data
#'
#' @description Read data based on one row of designInfo_df, then use aclust
#' methods to select CpGs as predictors, fit a \link{glmnet} elastic net,
#' \link{caret} random forest or support vector machine model, and evaluate its
#' prediction performance.
#'
#' @param rowNum row number in designInfo_df to use
#' @param Beta_df Beta_df is a data frame that each row is a cpg probe, each col
#' is a sample id, each cell is a Beta value, first column is the phenodata, please
#' make sure this column is a factor with levels, or we can not ensure accuracy
#' of the results
#' @param beta2M whether to transform beta values to M values before prediction
#' @param respCol_index response variable col number in beta data frame
#' @param designInfo_df information df generate by \link{summaryInfo} function
#' @param alphaValue vector that stores alpha values
#' @param ncores number of cores to do parallel computing
#' @param chromeAnnot_ls list that contains 22 chrome annot probe information
#' @param chooseCpGs what cpg selection method to use, use full cpgs \link{fullCpGs}
#' within cluster or PC1 score \link{getPC1} of cluster or maximum \link{maxCpGs}
#' expression score, default set to fullcpgs, it is feasible to write new methods
#' by adding a new function that take in train/test dataset and cpglist then
#' return a list of train and test subset data.
#' @param predictMethod what prediction method to use
#' @param outcome_type type of outcome variable, gaussian or binomial or poisson, etc
#' @param resultPath path to store results
#' @param save whether to save the results
#'
#' @return return a list with three elements,\cr
#' \enumerate{
#' \item first element is the fit model results of different prediction methods
#' \item second element is the data frame that contains evaluation parameters of
#' different prediction methods' performance:\cr
#' for glmnet net, the data frame has row number equal to number of alpha
#' values given in the function argument times 16 columns with different
#' evaluation parameters including NumOfRep,NumOfCv, auc_results,
#' Sensitivity, Specificity, etc;\cr
#' for random forest and support vector machine, the data frame has one
#' row times 14 columns with different evaluation parameters including
#' NumOfRep,NumOfCv, auc_results, Sensitivity, Specificity, etc\cr
#' \item third element is a vector that indicates the number of predictors used
#' }
#' @details
#' \describe{
#' \item{predictMethod1: }{Elastic net from function \link[glmnet]{glmnet}
#' to do prediction}
#' \item{predictMethod2: }{Random Forest from function \link[caret]{train}
#' to do prediction(requires package "randomForest" installed first)}
#' \item{predictMethod3: }{Support Vector Machine from function
#' \link[caret]{train} to do prediction(requires package "kernlab" installed)}
#' }
#'
#' @importFrom pathwayPCA TransposeAssay
#'
#' @export
#'
#' @examples \dontrun{
#' data(Example_df)
#' data(pfcInfo_df)
#' data(chrome_annot_files)
#'
#' test1 <- pipeAclust(
#' rowNum = 10,
#' Beta_df = Example_df,
#' beta2M = TRUE,
#' respCol_index = 1,
#' designInfo_df = pfcInfo_df,
#' chromeAnnot_ls = chrome_annot_files,
#' alphaValue = seq(0, 1, by = 0.1),
#' ncores = 2,
#' chooseCpGs = fullCpGs,
#' predictMethod = "glmnet",
#' outcome_type = "binomial",
#' save = FALSE,
#' resultPath = NULL
#' )
#'
#' test2 <- pipeAclust(
#' rowNum = 10,
#' Beta_df = Example_df,
#' beta2M = TRUE,
#' respCol_index = 1,
#' designInfo_df = pfcInfo_df,
#' chromeAnnot_ls = chrome_annot_files,
#' alphaValue = seq(0, 1, by = 0.1),
#' ncores = 2,
#' chooseCpGs = getPC1,
#' predictMethod = "glmnet",
#' outcome_type = "binomial",
#' save = FALSE,
#' resultPath = NULL
#' )
#'
#' test3 <- pipeAclust(
#' rowNum = 10,
#' Beta_df = Example_df,
#' beta2M = TRUE,
#' respCol_index = 1,
#' designInfo_df = pfcInfo_df,
#' chromeAnnot_ls = chrome_annot_files,
#' alphaValue = seq(0, 1, by = 0.1),
#' ncores = 2,
#' chooseCpGs = maxCpGs,
#' predictMethod = "glmnet",
#' outcome_type = "binomial",
#' save = FALSE,
#' resultPath = NULL
#' )
#' }
pipeAclust <- function(rowNum, Beta_df, beta2M = TRUE, respCol_index,
                       designInfo_df, chromeAnnot_ls,
                       alphaValue = seq(0, 1, by = 0.1), ncores = 2,
                       chooseCpGs = fullCpGs,
                       predictMethod = c("glmnet", "randomForest", "svm"),
                       outcome_type = "binomial", save = FALSE,
                       resultPath = NULL) {
  # Pipeline: split data -> cluster CpGs on the training set -> optional
  # beta-to-M transformation -> summarise clusters into predictors -> fit and
  # evaluate the chosen model -> optionally save the result list as an RDS file.

  # Label for the CpG selection method, used only in the saved file name.
  # It is derived from the expression the caller supplied for `chooseCpGs`
  # (e.g. `fullCpGs`), with anything that is not alphanumeric, "_" or "."
  # replaced so that the label is safe to embed in a file name.
  selectMethod <- gsub(
    "[^[:alnum:]_.]+", "_",
    paste(deparse(substitute(chooseCpGs)), collapse = "")
  )

  # Resolve the default choice vector to a single method name; without this
  # the file name built in the `save` branch would have length three.
  predictMethod <- match.arg(predictMethod)

  ## 1. divide methylation data into train and test data ####################
  TrainTest <- methSplit(
    rowNum = rowNum,
    Beta_df = Beta_df,
    designInfo_df = designInfo_df
  )
  BetaPhenoTrain_df <- TrainTest$Train_df
  BetaPhenoTest_df <- TrainTest$Test_df

  ## 2. generate A-clusters from the training data ###########################
  # Drop the response column before transposing: clustering uses CpG values
  # only.
  tBetaPhenoTrain_df <- TransposeAssay(
    BetaPhenoTrain_df[, -respCol_index],
    omeNames = "rowNames"
  )
  aclust.list <- runAcluster(
    chromeAnnot_df = chromeAnnot_ls,
    tBeta_df = tBetaPhenoTrain_df,
    minCpGs = 5,
    ncores = ncores
  )
  names(aclust.list) <- NULL

  # The optional beta -> M transformation is applied after clustering because
  # aclust requires beta values.
  trans_list <- beta2M_wrapper(
    beta2M = beta2M,
    BetaPhenoTrain_df = BetaPhenoTrain_df,
    BetaPhenoTest_df = BetaPhenoTest_df,
    respCol_index = respCol_index,
    returnType = "data.frame"
  )
  Train_df <- trans_list$Train
  Test_df <- trans_list$Test

  ## 3. compute predictors based on the chosen selection method ##############
  sumTnT_ls <- summarizeCpGs(
    clust_ls = aclust.list,
    train_df = Train_df,
    test_df = Test_df,
    selectMethod = chooseCpGs
  )
  Train_df <- sumTnT_ls$train_df
  Test_df <- sumTnT_ls$test_df
  p <- sumTnT_ls$npredictors

  # Put the response column back in front of the summarised predictors.
  PhenoTrain_df <- cbind(
    BetaPhenoTrain_df[respCol_index],
    Train_df
  )
  PhenoTest_df <- cbind(
    BetaPhenoTest_df[respCol_index],
    Test_df
  )

  ## 4. fit model and do prediction ##########################################
  # Seed, repetition and CV-fold identifiers come from the selected design row.
  seed.value <- designInfo_df$seed[rowNum]
  NumOfRep <- designInfo_df$NumOfRep[rowNum]
  NumOfCv <- designInfo_df$NumOfCv[rowNum]
  predict_ls <- predict_wrapper(
    predictMethod = predictMethod,
    alphaValue = alphaValue,
    PhenoTrain_df = PhenoTrain_df,
    PhenoTest_df = PhenoTest_df,
    respCol_index = respCol_index,
    outcome_type = outcome_type,
    seed_int = seed.value,
    whichRep_int = NumOfRep,
    whichCVfold_int = NumOfCv,
    ncores = ncores
  )

  out_ls <- list(
    Fit = predict_ls$Fit,
    PredPerformance = predict_ls$performance,
    numOfPredictors = p
  )

  ## 5. optionally save the results ##########################################
  if (save) {
    # File name layout: <resultPath>aclust-<selectMethod>-<predictMethod><rowNum>.RDS
    # `resultPath` is used as a plain prefix, so it needs its own trailing
    # separator; when it is NULL the file is written relative to the working
    # directory.
    saveRDS(
      out_ls,
      paste0(
        resultPath, "aclust-", selectMethod, "-",
        predictMethod, rowNum, ".RDS"
      )
    )
  }

  out_ls
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.