R/BioMM.R

Defines functions BioMM reconByUnsupervised BioMMstage2pred reconBySupervised predByCV predByBS predByFS baseModel baseGLMnet baseSVM baseRandForest

Documented in baseGLMnet baseModel baseRandForest baseSVM BioMM BioMMstage2pred predByBS predByCV predByFS reconBySupervised reconByUnsupervised

# File : BioMM.R Author: Junfang Chen


############################################################################### 

#' Prediction by random forest 
#'
#' @description
#' Prediction by random forest with different settings: 'probability', 
#' 'classification' and 'regression'.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' The valid option: list(ntree, nthreads). 'ntree' is the number of trees 
#' used. The defaul is 2000. 'nthreads' is the number of threads used for 
#' computation. The default is 20.
#' @return The predicted output for the test data.
#' @import ranger
#' @export 
#' @author Junfang Chen
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## test a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' trainIndex <- sample(nrow(methylSub), 12)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(ranger)
#' predY <- baseRandForest(trainData, testData, 
#'                         predMode='classification', 
#'                         paramlist=list(ntree=300, nthreads=20)) 
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy) 


baseRandForest <- function(trainData, testData, predMode = c("classification", 
    "probability", "regression"), paramlist = list(ntree = 2000, nthreads = 20)) {
    
    predMode <- match.arg(predMode)
    ## input parameters
    if (!is.element("ntree", names(paramlist))) {
        stop(" 'ntree' is missing in the 'paramlist'!")
    }
    if (!is.element("nthreads", names(paramlist))) {
        stop(" 'nthreads' is missing in the 'paramlist'!")
    }
    ntree <- paramlist$ntree
    nthreads <- paramlist$nthreads
    if (predMode == "probability") {
        trainData$label <- as.factor(trainData$label)
        model <- ranger(label ~ ., data = trainData, num.trees = ntree,
            num.threads = nthreads, write.forest = TRUE, probability = TRUE)
        yhatClassProb <- predictions(predict(model, testData))
        predTest <- round(yhatClassProb[, 2], 3)
    } else if (predMode == "classification") {
        trainData$label <- as.factor(trainData$label)
        model <- ranger(label ~ ., data = trainData, num.trees = ntree,
            num.threads = nthreads, write.forest = TRUE, classification = TRUE)
        predTest <- predictions(predict(model, testData))
        predTest <- as.numeric(predTest) - 1
    } else if (predMode == "regression") {
        model <- ranger(label ~ ., data = trainData, num.trees = ntree, 
            num.threads = nthreads)
        predTest <- predictions(predict(model, testData))
        predTest <- round(predTest, 3)
    }

    return(predTest)
}


############################################################################### 

#' Prediction by SVM
#'
#' @description
#' Prediction by support vector machine (SVM) with two different settings: 
#' 'classification' and 'regression'.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param predMode The prediction mode. Available options are 
#' c('classification', 'probability', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' The valid option: list(kernel, gamma, cost, tuneP).  
#' \enumerate{
#'   \item 'tuneP':  a logical value indicating if hyperparameter tuning 
#'                   should be conducted or not. The default is FALSE.
#'   \item 'kernel': options are c('linear', 'polynomial', 'radial', 
#'                   'sigmoid'). The defaut is 'radial'.
#'   \item 'gamma': the parameter needed for all kernels except 'linear'. 
#'         If tuneP is TRUE, more than one value is suggested.  
#'   \item 'cost': is the cost of constraints violation.
#'         If tuneP is TRUE, more than one value is suggested. 
#' } 
#' @return The predicted output for the test data.
#' @details Hyperparameter tuning is recommended in many biological data 
#' mining applications. The best parameters can be determined via an internal 
#' cross validation.
#' @import e1071
#' @export 
#' @author Junfang Chen  
##' @seealso \code{\link[e1071]{svm}}
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' trainIndex <- sample(nrow(methylSub), 12)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(e1071)
#' predY <- baseSVM(trainData, testData, 
#'                  predMode='classification', 
#'                  paramlist=list(tuneP=FALSE, kernel='radial', 
#'                                 gamma=10^(-3:-1), cost=10^(-3:1))) 
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy) 


baseSVM <- function(trainData, testData, 
    predMode = c("classification", "probability", "regression"), 
    paramlist = list(tuneP = TRUE, kernel = "radial", gamma = 10^(-3:-1), 
    cost = 10^(-2:2))) {
    
    predMode <- match.arg(predMode)
    ## input parameters
    if (!is.element("tuneP", names(paramlist))) {
        stop(" 'tuneP' is missing in the 'paramlist'!")
    }
    if (!is.element("kernel", names(paramlist))) {
        stop(" 'kernel' is missing in the 'paramlist'!")
    }
    if (!is.element("gamma", names(paramlist))) {
        stop(" 'gamma' is missing in the 'paramlist'!")
    }
    if (!is.element("cost", names(paramlist))) {
        stop(" 'cost' is missing in the 'paramlist'!")
    }
    tuneP <- paramlist$tuneP
    kernel <- paramlist$kernel
    gamma <- paramlist$gamma
    cost <- paramlist$cost
    if (tuneP == TRUE) {
        kernel <- paramlist$kernel
    }
    if (tuneP == TRUE) {
        gamma <- paramlist$gamma
    }
    if (tuneP == TRUE) {
        cost <- paramlist$cost
    }
    
    if (predMode == "classification" || predMode == "probability") {
        type <- "C-classification"  ## 
        trainData$label <- as.factor(trainData$label)
    } else if (predMode == "regression") {
        type <- "eps-regression"
        if (is.factor(trainData$label)) {
            trainData$label <- as.numeric(trainData$label) - 1
        }
    }
    
    if (predMode == "probability") {
        if (tuneP == TRUE) {
            ## use internal CV
            tuneOut <- tune.svm(label ~ ., data = trainData, type, kernel, 
                gamma, cost, probability = TRUE)
            model <- tuneOut$best.model
        } else {
            ## no internal CV required
            model <- svm(label ~ ., data = trainData, probability = TRUE)
        }
        predTest2 <- predict(model, testData, probability = TRUE)
        predTest <- round(attr(predTest2, "probabilities")[, 2], 3)
        
    } else {
        if (tuneP == TRUE) {
            ## use internal CV
            tuneOut <- tune.svm(label ~ ., data = trainData, type, kernel, 
                gamma, cost)
            model <- tuneOut$best.model
        } else {
            ## no CV required
            model <- svm(label ~ ., data = trainData)
        }
        predTest <- predict(model, testData)
        if (predMode == "classification") {
            predTest <- as.numeric(predTest) - 1
        }
    }

    return(predTest)

}


############################################################################### 

#' Prediction by generalized linear regression models
#'
#' @description
#' Prediction by generalized regression models with lasso or elastic net 
#' regularization.

#' @param trainData The input training dataset. The first column is named 
#' the 'label'.
#' @param testData The input test dataset. The first column is named 
#' the 'label'.
#' @param predMode The prediction mode. Available options are 
#' c('classification', 'probability', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' The valid option: list(family, alpha, typeMeasure, typePred).  
#' \enumerate{
#'   \item 'family': Response type: 'gaussian','binomial','poisson',
#'                'multinomial','cox','mgaussian'. (Default: 'binomial')
#'   \item 'alpha': The elastic net mixing parameter, with 0<=alpha<= 1. 
#'   \item 'typeMeasure': error metrics for internal cross-validation.
#'                        'mse' uses squared loss; 
#'                        'deviance' uses actual deviance; 
#'                        'mae' uses mean absolute error; 
#'                        'class' gives misclassification error;
#'                        'auc' (for two-class logistic regression ONLY) 
#'                        gives area under the ROC curve.
#'   \item 'typePred': The type of prediction: 'response' and 'class'. 
#'                     (Default: 'class' for binary classification)
#' } 
#' @return The predicted output for the test data.
#' @export 
#' @import glmnet
#' @author Junfang Chen 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' trainIndex <- sample(nrow(methylSub), 16)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(glmnet)
#' ## classification
#' predY <- baseGLMnet(trainData, testData, 
#'                     predMode='classification', 
#'                     paramlist=list(family='binomial', alpha=0.5, 
#'                                    typeMeasure='mse', typePred='class')) 
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy)  


baseGLMnet <- function(trainData, testData, 
    predMode = c("classification", "probability", "regression"), 
    paramlist = list(family = "binomial", alpha = 0.5, typeMeasure = "mse", 
    typePred = "class")) {
    
    predMode <- match.arg(predMode)
    ## input parameters
    if (!is.element("family", names(paramlist))) {
        stop(" 'family' is missing in the 'paramlist'!")
    }
    if (!is.element("alpha", names(paramlist))) {
        stop(" 'alpha' is missing in the 'paramlist'!")
    }
    if (!is.element("typeMeasure", names(paramlist))) {
        stop(" 'typeMeasure' is missing in the 'paramlist'!")
    }
    if (!is.element("typePred", names(paramlist))) {
        stop(" 'typePred' is missing in the 'paramlist'!")
    }
    family <- paramlist$family
    alpha <- paramlist$alpha
    typeMeasure <- paramlist$typeMeasure
    typePred <- paramlist$typePred
    trainDataY <- trainData$label
    ## To avoid 'X should be a matrix with 2 or more cols'
    if (ncol(trainData) == 2) {
        trainData$one <- rep(1, nrow(trainData))
        testData$one <- rep(1, nrow(testData))
        trainDataX <- trainData[, -1]
        testDataX <- testData[, -1]
    } else {
        trainDataX <- trainData[, -1]
        testDataX <- testData[, -1]
    }
    cvfit <- cv.glmnet(as.matrix(trainDataX), trainDataY, family = family, 
        type.measure = typeMeasure, nfolds = 10)
    model <- glmnet(as.matrix(trainDataX), trainDataY, family = family, 
        alpha = alpha, lambda = cvfit$lambda.min)
    yhat <- predict(model, as.matrix(testDataX), type = typePred)
    
    if (predMode == "classification") {
        ## set type='class'
        predTest = as.numeric(yhat)
    } else if (predMode == "probability" || predMode == "regression") {
        predTest = round(yhat[, 1], 3)
    }

    return(predTest)
}


############################################################################### 

#' Base supervised machine learning models for prediction
#'
#' @description
#' Prediction using different supervised machine learning models.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param classifier Machine learning classifiers. 
#' Available options are c('randForest', 'SVM', 'glmnet').
#' @param predMode The prediction mode. Available options are
#' c('classification', 'probability', 'regression'). 
#' 'probability' is currently only for 'randForest'. 
#' @param paramlist A set of model parameters defined in an R list object. 
#' See more details for each individual model. 

#' @return Based on a given machine learning, the predicted score/output will be 
#' estimated for the test data.
#' @export 

#' @author Junfang Chen 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' trainIndex <- sample(nrow(methylSub), 16)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(ranger) 
#' set.seed(123)
#' predY <- baseModel(trainData, testData, 
#'                    classifier='randForest',  
#'                    predMode='classification', 
#'                    paramlist=list(ntree=300, nthreads=20)) 
#' print(table(predY)) 
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy)  

baseModel <- function(trainData, testData, 
    classifier = c("randForest", "SVM", "glmnet"), 
    predMode = c("classification", "probability", "regression"), paramlist) {
    
    if (colnames(trainData)[1] != "label") {
        stop("The first column of the 'trainData' must be the 'label'!")
    }
    if (colnames(testData)[1] != "label") {
        stop("The first column of the 'testData' must be the 'label'!")
    }
    classifier <- match.arg(classifier)
    predMode <- match.arg(predMode)
    if (classifier == "randForest") {
        predTest <- baseRandForest(trainData, testData, predMode, paramlist)
    } else if (classifier == "SVM") {
        predTest <- baseSVM(trainData, testData, predMode, paramlist)
    } else if (classifier == "glmnet") {
        predTest <- baseGLMnet(trainData, testData, predMode, paramlist)
    }

    return(predTest)
}


############################################################################### 

#' Prediction by supervised machine learning along with feature selection
#'
#' @description
#' Prediction by supervised machine learning along with feature selection.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param FSmethod Feature selection methods. Available options are 
#' c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 'posWilcox', 
#' or 'top10pCor').
#' @param cutP The cutoff used for p value thresholding.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, etc.). The default is 0.05.
#' @param fdr Multiple testing correction method. Available options are 
#' c(NULL, 'fdr', 'BH', 'holm', etc.). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' @param FScore The number of cores used for feature selection.  
#' @param classifier Machine learning classifiers. 
#' Available options are c('randForest', 'SVM', 'glmnet').
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression'). 
#' @param paramlist A set of model parameters defined in an R list object. 

#' @return The predicted output for the test data.
#' @details If no feature selected or just one selected feature, 
#' then top 10% outcome-associated features will be selected by default.
#' @export 

#' @author Junfang Chen 
#' @seealso \code{\link{getDataByFilter}} 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:501)])  
#' trainIndex <- sample(nrow(methylSub), 16)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(ranger) 
#' library(BiocParallel)
#' param <- MulticoreParam(workers = 10)
#' predY <- predByFS(trainData, testData, 
#'                   FSmethod='cor.test', cutP=0.1, 
#'                   fdr=NULL, FScore=param, 
#'                   classifier='randForest',
#'                   predMode='classification', 
#'                   paramlist=list(ntree=300, nthreads=20))  
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy)  


predByFS <- function(trainData, testData, FSmethod, cutP, fdr, 
    FScore = MulticoreParam(), classifier, predMode, paramlist) {
    
    datalist <- getDataByFilter(trainData, testData, FSmethod, cutP, fdr, FScore)
    ## include the label
    trainDataSub <- datalist[[1]]
    testDataSub <- datalist[[2]]
    
    ## If no selected features or just one selected feature.
    if (is.null(trainDataSub) || ncol(trainDataSub) == 2) {
        datalist <- getDataByFilter(trainData, testData, FSmethod = "top10pCor", 
            cutP, fdr, FScore)
        trainDataSub <- datalist[[1]]
        testDataSub <- datalist[[2]]
        predTest <- baseModel(trainData = trainDataSub, testData = testDataSub, 
            classifier, predMode, paramlist)
        
    } else if (!is.null(trainDataSub)) {
        predTest <- baseModel(trainData = trainDataSub, testData = testDataSub, 
            classifier, predMode, paramlist)
        
    } else {
        message("Error: Input data is wrong!")
    }
    
    if (is.factor(predTest)) {
        predTest <- as.numeric(predTest) - 1
    }

    return(predTest)
}



############################################################################### 

#' Bootstrap resampling prediction via supervised machine learning with 
#' feature selection
#'
#' @description
#' Prediction via supervised machine learning using bootstrap resampling 
#' along with feature selection methods.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param dataMode The input training data mode for model training.
#' It is used only if 'testData' is present. It can be a subset of 
#' the whole training data or the entire training data. 'subTrain' 
#' is the given for subsetting and 'allTrain' for the entire training
#' dataset. 
#' @param repeats The number of repeats used for boostrapping.
#' @param FSmethod Feature selection methods. Available options are 
#' c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 'posWilcox', 
#' or 'top10pCor').
#' @param cutP The cutoff used for p value thresholding.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). The default is 0.05.
#' @param fdr Multiple testing correction method. Available options are 
#' c(NULL, 'fdr', 'BH', 'holm', etc). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' @param FScore The number of cores used for feature selection 
#' if parallel computing needed.
#' @param classifier Machine learning classifiers. 
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' @param innerCore The number of cores used for computation.

#' @return The predicted output for the test data. 
#' @export  
#' @import BiocParallel 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' trainIndex <- sample(nrow(methylSub), 16)
#' trainData = methylSub[trainIndex,]
#' testData = methylSub[-trainIndex,]
#' library(ranger) 
#' library(BiocParallel)
#' param1 <- MulticoreParam(workers = 1)
#' param2 <- MulticoreParam(workers = 20)
#' predY <- predByBS(trainData, testData, 
#'                   dataMode='allTrain', repeats=50,
#'                   FSmethod=NULL, cutP=0.1, 
#'                   fdr=NULL, FScore=param1, 
#'                   classifier='randForest',
#'                   predMode='classification', 
#'                   paramlist=list(ntree=300, nthreads=10),
#'                   innerCore=param2)  
#' testY <- testData[,1]
#' accuracy <- classifiACC(dataY=testY, predY=predY)
#' print(accuracy)  


predByBS <- function(trainData, testData, dataMode, repeats, FSmethod, cutP, 
    fdr, FScore = MulticoreParam(), classifier, predMode, paramlist, 
    innerCore = MulticoreParam()) {
    
    if (is.null(testData)) {
        ## BS only applied for training data
        data <- trainData
        predTmp <- rep(NA, nrow(data))
        replist <- seq_len(repeats)
        predTmpList <- bplapply(replist, function(reps) {
            trainIndex <- unique(sample(nrow(data), replace = TRUE))
            testIndex <- setdiff(seq_len(nrow(data)), trainIndex)
            trainData <- data[trainIndex, ]
            testData <- data[testIndex, ]
            ## OOB predicted scores
            predTmp[testIndex] <- predByFS(trainData, testData, FSmethod, cutP, 
                fdr, FScore, classifier, predMode, paramlist)
            predTmp
        }, BPPARAM = innerCore)
        predTest <- round(rowMeans(do.call(cbind, predTmpList), na.rm = TRUE), 
            3)
        naCount <- sum(is.na(predTest))
        ## If not enough bootstrapping repeats
        if (naCount > 0) {
            message(paste0("Warning: ", naCount, " NA(s) in predicted testY!"))
        }
        
    } else if (!is.null(testData)) {
        ## repeated prediction on test data
        if (dataMode == "subTrain") {
            data <- trainData
            predTmp <- rep(NA, nrow(data))
            replist <- seq_len(repeats)
            predTmpList <- bplapply(replist, function(reps) {
                trainIndex <- unique(sample(nrow(data), replace = TRUE))
                trainData <- data[trainIndex, ]
                predTmp <- predByFS(trainData, testData, FSmethod, cutP, fdr, 
                  FScore, classifier, predMode, paramlist)
                predTmp
            }, BPPARAM = innerCore)
            predTest <- round(rowMeans(do.call(cbind, predTmpList)), 3)
        } else if (dataMode == "allTrain") {
            predTmp <- rep(NA, nrow(testData))
            replist <- seq_len(repeats)
            predTmpList <- bplapply(replist, function(reps) {
                predTmp <- predByFS(trainData, testData, FSmethod, cutP, fdr, 
                  FScore, classifier, predMode, paramlist)
                predTmp
            }, BPPARAM = innerCore)
            predTest <- round(rowMeans(do.call(cbind, predTmpList)), 3)
        }
    } else {
        message("Input data is wrong!")
    }
    
    if (predMode == "classification") {
        predTest <- ifelse(predTest >= 0.5, 1, 0)
    }
    return(predTest)
}


############################################################################### 

#' Cross validation prediction by supervised machine learning and feature 
#' selection

#' @description
#' Prediction by supervised machine learning models using cross validation 
#' along with feature selection methods.

#' @param data The input dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member. 
#' @param repeats The number of repeats used for cross validation.
#' Repeated cross validation is performed if N >=2.
#' @param nfolds The number of folds is defined for cross validation.
#' @param FSmethod Feature selection methods. Available options are 
#' c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 'posWilcox', 
#' or 'top10pCor').
#' @param cutP The cutoff used for p value thresholding.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). The default is 0.05.
#' @param fdr Multiple testing correction method. Available options are 
#' c(NULL, 'fdr', 'BH', 'holm', etc). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' @param FScore The number of cores used for feature selection if parallel 
#' computing needed. 
#' @param classifier Machine learning classifiers. 
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' @param innerCore The number of cores used for computation.

#' @return The predicted cross validation output. 
#' @export 
#' @import BiocParallel 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)   
#' dataY <- methylData[,1]
#' ## select a subset of genome-wide methylation data at random
#' methylSub <- data.frame(label=dataY, methylData[,c(2:2001)])  
#' library(ranger) 
#' library(BiocParallel)
#' param1 <- MulticoreParam(workers = 1)
#' param2 <- MulticoreParam(workers = 20)
#' predY <- predByCV(methylSub, repeats=1, nfolds=10,   
#'                   FSmethod=NULL, cutP=0.1, 
#'                   fdr=NULL, FScore=param1, 
#'                   classifier='randForest',
#'                   predMode='classification', 
#'                   paramlist=list(ntree=300, nthreads=1),
#'                   innerCore=param2)
#' dataY <- methylData[,1]
#' accuracy <- classifiACC(dataY=dataY, predY=predY)
#' print(accuracy)  


predByCV <- function(data, repeats, nfolds, FSmethod, cutP, fdr, FScore = MulticoreParam(), 
    classifier, predMode, paramlist, innerCore = MulticoreParam()) {
    
    cvMat <- c()
    replist <- seq_len(repeats)
    for (reps in replist) {
        foldlists <- split(sample(nrow(data)), rep(seq_len(nfolds), 
            length = nrow(data)))
        cvY <- rep(NA, nrow(data))
        cvEstimate <- bplapply(foldlists, function(fold) {
            trainData <- data[-fold, ]
            testData <- data[fold, ]
            predTest <- predByFS(trainData, testData, FSmethod, cutP, fdr, 
                FScore, classifier, predMode, paramlist)
        }, BPPARAM = innerCore)
        cvY[unlist(foldlists)] <- unlist(cvEstimate)
        cvMat <- cbind(cvMat, cvY)
    }
    
    if (repeats == 1) {
        predTest <- cvMat[, 1]
    } else {
        ## average over the repeats
        predTest <- round(rowMeans(cvMat, na.rm = TRUE), 3)
    }
    
    if (predMode == "classification") {
        predTest <- ifelse(predTest >= 0.5, 1, 0)
    }

    return(predTest)
}


############################################################################### 

#' Reconstruct stage-2 data by supervised machine learning prediction

#' @description
#' Reconstruct stage-2 data by supervised machine learning prediction.

#' @param trainDataList The input training data list containing ordered 
#' collections of matrices.
#' @param testDataList The input test data list containing ordered collections 
#' of matrices. 
#' @param resample The resampling methods. Valid options are 'CV' and 'BS'. 
#' 'CV' for cross validation and 'BS' for bootstrapping resampling.
#' The default is 'BS'.
#' @param dataMode The mode of data used. 'subTrain' or 'allTrain'. 
#' @param repeatA The number of repeats N is used during resampling procedure.
#' Repeated cross validation or multiple boostrapping is performed if N >=2. 
#' One can choose 10 repeats for 'CV' and 100 repeats for 'BS'.
#' @param repeatB The number of repeats N is used for generating test data 
#' prediction scores. 
#' @param nfolds The number of folds is defined for cross validation.
#' @param FSmethod Feature selection methods. Available options are 
#' c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 'posWilcox', 
#' or 'top10pCor').
#' @param cutP The cutoff used for p value thresholding.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). The default is 0.05.
#' @param fdr Multiple testing correction method. Available options are 
#' c(NULL, 'fdr', 'BH', 'holm', etc). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' @param FScore The number of cores used for feature selection, if parallel 
#' computing needed. 
#' @param classifier Machine learning classifiers. 
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' @param innerCore The number of cores used for computation.
#' @param outFileA The file name of stage-2 training data with the '.rds' 
#' file extension. If it's provided, then the result will be saved in this 
#' file. The default is NULL.
#' @param outFileB The file name of stage-2 training data with the '.rds' 
#' file extension. If it's provided, then the result will be saved in this 
#' file. The default is NULL.

#' @return The predicted stage-2 training data and also stage-2 test data, if 
#' 'testDataList' provided. If outFileA and outFileB are provided, 
#' then the results will be stored in the files.
#' @details Stage-2 training data can be learned either using bootstrapping 
#' or cross validation resampling methods. Stage-2 test data is learned via 
#' independent test set prediction.
#' @export  
#' @import stats
#' @import BiocParallel 
#' @author Junfang Chen 
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)  
#' ## Annotation file
#' probeAnnoFile <- system.file('extdata', 'cpgAnno.rds', package='BioMM')  
#' featureAnno <- readRDS(file=probeAnnoFile)  
#' ## Mapping CpGs into Pathways
#' featureAnno <- readRDS(system.file("extdata", "cpgAnno.rds", package="BioMM")) 
#' pathlistDB <- readRDS(system.file("extdata", "goDB.rds", package="BioMM")) 
#' head(featureAnno)   
#' dataList <- omics2pathlist(data=methylData, pathlistDB, featureAnno, 
#'                            restrictUp=100, restrictDown=10, minPathSize=10) 
#' length(dataList)
#' library(ranger) 
#' library(BiocParallel)
#' param1 <- MulticoreParam(workers = 1)
#' param2 <- MulticoreParam(workers = 20)
#' ## Not Run, this will take a bit long
#' ## stage2data <- reconBySupervised(trainDataList=dataList, testDataList=NULL, 
#' ##                             resample='CV', dataMode='allTrain', 
#' ##                             repeatA=50, repeatB=20, nfolds=10, 
#' ##                             FSmethod=NULL, cutP=0.1, 
#' ##                             fdr=NULL, FScore=param1, 
#' ##                             classifier='randForest',
#' ##                             predMode='classification', 
#' ##                             paramlist=list(ntree=500, nthreads=20),
#' ##                             innerCore=param2, outFileA=NULL, outFileB=NULL) 
#' ## print(dim(stage2data))
#' ## print(head(stage2data[,1:5]))


reconBySupervised <- function(trainDataList, testDataList, resample = "BS", 
    dataMode, repeatA, repeatB, nfolds, FSmethod, cutP, fdr, FScore = MulticoreParam(), classifier, 
    predMode, paramlist, innerCore = MulticoreParam(), outFileA = NULL, outFileB = NULL) {
    
    reconDataAx <- c()
    reconDataBx <- c()
    for (i in seq_along(trainDataList)) {
        ## for each block
        trainData = trainDataList[[i]]
        dataAy <- trainData[, 1] 
        message(paste0('Block:', i))  
        message(paste0('Block_numFeature: ', c(ncol(trainData)-1))) 
        if (resample == "CV") {
            predA <- predByCV(data = trainData, repeats = repeatA, nfolds, 
                FSmethod, cutP, fdr, FScore, classifier, predMode, paramlist, 
                innerCore)
        } else if (resample == "BS") {
            predA <- predByBS(trainData, testData = NULL, dataMode, 
                repeats = repeatA, FSmethod, cutP, fdr, FScore, classifier, 
                predMode, paramlist, innerCore)
        }
        reconDataAx <- cbind(reconDataAx, predA)
        
        if (!is.null(testDataList)) {
            ## ind. prediction
            testData = testDataList[[i]]
            dataBy <- testData[, 1]
            predB <- predByBS(trainData, testData, dataMode, repeats = repeatB, 
                FSmethod, cutP, fdr, FScore, classifier, predMode, paramlist, 
                innerCore)
            reconDataBx <- cbind(reconDataBx, predB)
        }
    }
    
    row.names(reconDataAx) <- row.names(trainData)  ## 
    reconDataA <- data.frame(label = dataAy, reconDataAx)
    colnames(reconDataA) <- c("label", names(trainDataList))
    ## Provide fileName
    if (!is.null(outFileA)) {
        saveRDS(reconDataA, file = outFileA)
    }
    if (!is.null(testDataList)) {
        ## ind. prediction
        row.names(reconDataBx) <- row.names(testData)  ## 
        reconDataB <- data.frame(label = dataBy, reconDataBx)
        colnames(reconDataB) <- c("label", names(testDataList))
        if (!is.null(outFileB)) {
            saveRDS(reconDataB, file = outFileB)
        }
        result <- list(reconDataA, reconDataB)
    } else {
        result <- reconDataA
    }

    return(result)

}



############################################################################### 


#' Prediction performance for stage-2 data using supervised machine learning

#' @description
#' Prediction performance for reconstructed stage-2 data using supervised 
#' machine learning with feature selection methods.

#' @param trainData The input training dataset (stage-2 data). The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset (stage-2 data). The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param resample The resampling methods. Valid options are 'CV' and 'BS'. 
#' 'CV' for cross validation and 'BS' for bootstrapping resampling.
#' The default is 'CV'.
#' @param dataMode The mode of data used. 'subTrain' or 'allTrain'.
#' @param repeatA The number of repeats N is used during resampling prediction. 
#' The default is 1.
#' @param repeatB The number of repeats N is used for test data prediction. 
#' The default is 1. 
#' @param nfolds The number of folds is defined for cross validation.
#' @param FSmethod Feature selection methods. Available options are 
#' c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 'posWilcox', 
#' or 'top10pCor').
#' @param cutP The cutoff used for p value thresholding.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). The default is 0.05.
#' @param fdr Multiple testing correction method. Available options are 
#' c(NULL, 'fdr', 'BH', 'holm', etc). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' @param FScore The number of cores used for feature selection if parallel 
#' computing needed. 
#' @param classifier Machine learning classifiers. 
#' @param predMode The prediction mode. Available options are 
#' c('probability', 'classification', 'regression').
#' @param paramlist A set of model parameters defined in an R list object. 
#' @param innerCore The number of cores used for computation. 

#' @return The CV or BS predicted score for stage-2 training data and 
#' test set predicted score for stage-2 test data if the test set is 
#' given. 

#' @details Stage-2 prediction is performed typically using positively  
#' correlated features. Since negative associations likely reflect random 
#' effects in the underlying data
#' @export  
#' @import stats
#' @import utils  
#' @references Perlich, C., & Swirszcz, G. (2011). On cross-validation and  
#' stacking: Building seemingly predictive models on random data. ACM SIGKDD    
#' Explorations Newsletter, 12(2), 11-15. 
#' @author Junfang Chen 


BioMMstage2pred <- function(trainData, testData, resample = "CV", dataMode, 
    repeatA = 1, repeatB = 1, nfolds, FSmethod, cutP, fdr, FScore = MulticoreParam(), classifier, 
    predMode, paramlist, innerCore = MulticoreParam() ) {
    
    resample <- match.arg(resample)
    if (!is.null(resample)) {
        if (is.factor(trainData$label)) {
            trainData$label <- as.numeric(trainData$label) - 1
        }
        trainDataY <- trainData$label
        if (resample == "CV") {
            predY <- predByCV(data = trainData, repeats = repeatA, nfolds, 
                FSmethod, cutP, fdr, FScore, classifier, predMode, paramlist, 
                innerCore)
            message("CrossValidation >>> ")
        } else if (resample == "BS") {
            predY <- predByBS(trainData, testData = NULL, dataMode, 
                repeats = repeatA, FSmethod, cutP, fdr, FScore, classifier, 
                predMode, paramlist, innerCore)
            message("Bootstrapping >>> ")
        }
        
        cvYscore <- predY
        # if (predMode == "probability") {
        #     predY <- ifelse(predY >= 0.5, 1, 0)
        #     metricCV <- getMetrics(dataY = trainDataY, predY)
        # } else if (predMode == "classification") {
        #     metricCV <- getMetrics(dataY = trainDataY, predY)
        # } else if (predMode == "regression") {
        #     metricCV <- cor(trainDataY, predY) 
        # }
        # print(metricCV)

    }
    
    if (!is.null(testData)) {
        ## ind. prediction
        if (is.factor(testData$label)) {
            testData$label <- as.numeric(testData$label) - 1
        }
        testY <- testData$label
        predY <- predByBS(trainData, testData, dataMode, repeats = repeatB, 
            FSmethod, cutP, fdr, FScore, classifier, predMode, paramlist, 
            innerCore)
        testYscore <- predY
        ## Prediction performance for the ind. test performance
        # message(paste0("Test set performance: "))
        # if (predMode == "probability") {
        #     predY <- ifelse(predY >= 0.5, 1, 0)
        #     metricTest <- getMetrics(dataY = testY, predY)
        # } else if (predMode == "classification") {
        #     metricTest <- getMetrics(dataY = testY, predY)
        # } else if (predMode == "regression") {
        #     metricTest <- cor(testY, predY)
        # }        
        # print(metricTest)
        # result <- list(metricCV, metricTest)
        result <- list(cvYscore, testYscore)
    } else {
        # result <- metricCV
        result <- cvYscore
    }

    return(result)
}




############################################################################### 

#' Reconstruct stage-2 data by PCA 

#' @description
#' Stage-2 data reconstruction by regular or sparse constrained principal 
#' component analysis (PCA).  

#' @param trainDataList The input training data list containing ordered 
#' collections of matrices.
#' @param testDataList The input test data list containing ordered collections 
#' of matrices.
#' @param typeMode The type of PCA prediction mode. Available options are 
#' c('regular', 'sparse'). 
#' (Default: regular)
#' @param topPC The number of top PCs selected. The default is 1, 
#' i.e. the first PC. 
#' @param innerCore The number of cores used for computation. 
#' @param outFileA The file name of stage-2 training data with the '.rds' file 
#' extension.
#' If it's provided, then the result will be saved in this file. 
#' The default is NULL.
#' @param outFileB The file name of stage-2 training data with the '.rds' file 
#' extension. If it's provided, then the result will be saved in this file. 
#' The default is NULL.

#' @return The predicted stage-2 training data and also stage-2 test data if 
#' 'testDataList' provided. If outFileA and outFileB are provided then the 
#' results will be stored in the files.

#' @export 
#' @import nsprcomp
#' @import stats
#' @import BiocParallel

#' @author Junfang Chen   
#' @examples  
#' ## Load data  
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)    
#' ## Annotation file   
#' probeAnnoFile <- system.file('extdata', 'cpgAnno.rds', package='BioMM')  
#' ## Mapping CpGs into Pathways
#' featureAnno <- readRDS(file=probeAnnoFile)  
#' ## Mapping CpGs into Pathways
#' featureAnno <- readRDS(system.file("extdata", "cpgAnno.rds", package="BioMM")) 
#' pathlistDB <- readRDS(system.file("extdata", "goDB.rds", package="BioMM")) 
#' head(featureAnno)   
#' dataList <- omics2pathlist(data=methylData, pathlistDB, featureAnno, 
#'                            restrictUp=100, restrictDown=10, minPathSize=10) 
#' length(dataList) 
#' library(BiocParallel)
#' param <- MulticoreParam(workers = 10) 
#' stage2data <- reconByUnsupervised(trainDataList=dataList, testDataList=NULL, 
#'                              typeMode='regular', topPC=1,  
#'                              innerCore=param, outFileA=NULL, outFileB=NULL) 
#' print(dim(stage2data))
#' print(head(stage2data[,1:5]))


reconByUnsupervised <- function(trainDataList, testDataList, typeMode = "regular", 
    topPC = 1, innerCore = MulticoreParam(), outFileA = NULL, outFileB = NULL) {
    
    reconDataAx <- c()  ## for stage-2 training data   
    trainData <- trainDataList[[1]]
    trainDataY <- trainData[, 1]
    if (!is.null(testDataList)) {
        reconDataBx <- c()
        testData <- testDataList[[1]]
        testDataY <- testData[, 1]
    }
    
    numlist <- seq_along(trainDataList)
    predMat <- bplapply(numlist, function(i) {
        trainData <- trainDataList[[i]]
        trainDataX = trainData[, -1]
        if (!is.null(testDataList)) {
            testData <- testDataList[[i]]
            testDataX = testData[, -1]
        }
        ## remove zero variance columns from trainData
        whNon0var <- apply(trainDataX, 2, var) != 0
        if (sum(whNon0var) == 0) {
            ## if all features constant..
            predA <- trainDataX[, 1]
            if (!is.null(testDataList)) {
                predB <- testDataX[, 1]
            }
        } else {
            trainDataX2 <- trainDataX[, whNon0var]
            if (!is.null(testDataList)) {
                testX2 <- as.matrix(testDataX[, whNon0var])
            }
            if (typeMode == "regular") {
                pca = prcomp(trainDataX2, center = TRUE, scale = TRUE)
                predA <- pca$x[, seq_len(min(ncol(pca$x), topPC))]
                if (!is.null(testDataList)) {
                  predB <- predict(pca, testX2)[, seq_len(min(ncol(pca$x), topPC))]
                }
            } else if (typeMode == "sparse") {
                nspc <- nsprcomp(trainDataX2, ncomp = topPC)
                predA <- nspc$x
                if (!is.null(testDataList)) {
                  predB <- predict(nspc, testX2)[, topPC]
                }
            }
        }
        predA <- round(predA, 3)
        pred <- predA
        if (!is.null(testDataList)) {
            predB <- round(predB, 3)
            pred <- list(predA, predB)
        }
        pred
    }, BPPARAM = innerCore)
    
    if (!is.null(testDataList)) {
        ## PC1
        predMatA <- lapply(predMat, function(data) {
            data[[1]]
        })
        reconDataAx <- matrix(unlist(predMatA), ncol = length(predMatA))
        row.names(reconDataAx) <- row.names(trainData)  ## 
        colnames(reconDataAx) <- names(trainDataList)
        reconDataA = data.frame(label = trainDataY, reconDataAx)
        if (!is.null(outFileA)) {
            saveRDS(reconDataA, file = outFileA)
        }
        ## dataB
        predMatB <- lapply(predMat, function(data) {
            data[[2]]
        })
        reconDataBx <- matrix(unlist(predMatB), ncol = length(predMatB))
        row.names(reconDataBx) <- row.names(testData)  ## 
        colnames(reconDataBx) <- names(trainDataList)
        reconDataB = data.frame(label = testDataY, reconDataBx)
        if (!is.null(outFileB)) {
            saveRDS(reconDataB, file = outFileB)
        }
        result <- list(reconDataA, reconDataB)
    } else {
        ## Note 'predMat' is different from above
        reconDataAx <- matrix(unlist(predMat), ncol = length(predMat))
        row.names(reconDataAx) <- row.names(trainData)  ## 
        colnames(reconDataAx) <- names(trainDataList)
        reconDataA = data.frame(label = trainDataY, reconDataAx)
        if (!is.null(outFileA)) {
            saveRDS(reconDataA, file = outFileA)
        }
        result <- reconDataA
    }

    return(result)
}



############################################################################### 

#' BioMM end-to-end prediction 

#' @description
#' The BioMM framework uses two-stage machine learning models that can allow us 
#' to integrate prior biological knowledge for end-to-end phenotype prediction.

#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes, 
#' 0 and 1 are used to indicate the class member. 
#' @param pathlistDB A list of pathways with pathway IDs and their 
#' corresponding genes ('entrezID' is used). This is only used for 
#' pathway-based stratification (only \code{stratify} is 'pathway'). 
#' @param featureAnno The annotation data stored in a data.frame for probe 
#' mapping. It must have at least two columns named 'ID' and 'entrezID'.  
#' If it's NULL, then the input probe is from the transcriptomic data. 
#' (Default: NULL)
#' @param restrictUp The upper-bound of the number of probes or genes in each  
#' biological stratified block. 
#' @param restrictDown The lower-bound of the number of probes or genes in 
#' each biological stratified block.
#' @param minPathSize The minimal defined pathway size after mapping your 
#' own data to GO database. This is only used for 
#' pathway-based stratification (only \code{stratify} is 'pathway'). 
#' @param supervisedStage1 A logical value. If TRUE, then supervised learning 
#' models are applied; if FALSE, unsupervised learning.
#' @param typePCA the type of PCA. Available options are c('regular', 
#' 'sparse').
#' @param resample1 The resampling methods at stage-1. Valid options are 
#' 'CV' and 'BS'. 'CV' for cross validation and 'BS' for bootstrapping 
#' resampling. The default is 'BS'.
#' @param resample2 The resampling methods at stage-2. Valid options are 'CV' 
#' and 'BS'. 'CV' for cross validation and 'BS' for bootstrapping resampling.
#' The default is 'CV'. 
#' @param dataMode The input training data mode for model training.
#' It is used only if 'testData' is present. It can be a subset of 
#' the whole training data or the entire training data. 'subTrain' 
#' is the given for subsetting and 'allTrain' for the entire training
#' dataset. 
#' @param repeatA1 The number of repeats N is used during resampling procedure.
#' Repeated cross validation or multiple boostrapping is performed if N >=2. 
#' One can choose 10 repeats for 'CV' and 100 repeats for 'BS'.
#' @param repeatA2 The number of repeats N is used during resampling 
#' prediction. The default is 1 for 'CV'.  
#' @param repeatB1 The number of repeats N is used for generating stage-2 test 
#' data prediction scores. The default is 20.
#' @param repeatB2 The number of repeats N is used for test data prediction. 
#' The default is 1. 
#' @param nfolds The number of folds is defined for cross validation. 
#' The default is 10.
#' @param FSmethod1 Feature selection methods at stage-1. Available options 
#' are c(NULL, 'positive', 'wilcox.test', 'cor.test', 'chisq.test', 
#' 'posWilcox'). 
#' @param FSmethod2 Feature selection methods at stage-2. Features that are 
#' positively associated with the outcome will be used.
#' @param cutP1 The cutoff used for p value thresholding at stage-1.  
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). If "FSmethod1" is NULL,
#' Then no cutoff is applied.
#' @param cutP2 The cutoff used for p value thresholding at stage-2. 
#' Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc). If "FSmethod2" is NULL,
#' Then no cutoff is applied.
#' @param fdr2 Multiple testing correction method at stage-2. 
#' Available options are c(NULL, 'fdr', 'BH', 'holm', etc). 
#' See also \code{\link[stats]{p.adjust}}. The default is NULL.
#' This option is useful particularly when large sets of pathways are investigated.
#' @param FScore The number of cores used for feature selection.
#' @param classifier Machine learning classifiers at both stages.  
#' Available options are c('randForest', 'SVM', 'glmnet').
#' @param predMode The prediction mode at both stages. Available options are 
#' c('probability', 'classification', 'regression'). 
#' @param paramlist A list of model parameters at both stages. The set of parameters 
#' are different for each classifier. Please see the detailed parameters are 
#' implemented for each individual classifier, e.g., 'baseRandForest()', 'baseSVM()',
#' and 'baseGLMnet()'. 
#' @param innerCore The number of cores used for computation. It needs to be reconciled
#' with "FScore" depending on the number of cores available.

#' @return The CV or BS predicted score for the training data and 
#' test set predicted score if \code{testData} is given.
#' @details Stage-2 training data can be learned either using bootstrapping 
#' or cross validation resampling methods in the supervised learning settting.
#' Stage-2 test data is learned via independent test set prediction.
#' @export  

#' @references Chen, J., & Schwarz, E. (2017). BioMM: Biologically-informed 
#' Multi-stage Machine learning for identification of epigenetic fingerprints. 
#' arXiv preprint arXiv:1712.00336.
#' @references Perlich, C., & Swirszcz, G. (2011). On cross-validation and  
#' stacking: Building seemingly predictive models on random data. ACM SIGKDD    
#' Explorations Newsletter, 12(2), 11-15. 

#' @seealso \code{\link{reconBySupervised}}; \code{\link{reconByUnsupervised}}; 
#' \code{\link{BioMMstage2pred}}

#' @examples  
#' ## Load data    
#' methylfile <- system.file('extdata', 'methylData.rds', package='BioMM')  
#' methylData <- readRDS(methylfile)    
#' testData <- NULL
#' ## Annotation file
#' probeAnnoFile <- system.file('extdata', 'cpgAnno.rds', package='BioMM')  
#' probeAnno <- readRDS(file=probeAnnoFile)     
#' golist <- readRDS(system.file("extdata", "goDB.rds", package="BioMM")) 
#' pathlistDB <- golist[1:100]
#' supervisedStage1=TRUE
#' classifier <- 'randForest'
#' predMode <- 'classification'
#' paramlist <- list(ntree=300, nthreads=30)   
#' library(BiocParallel)
#' library(ranger)
#' param1 <- MulticoreParam(workers = 2)
#' param2 <- MulticoreParam(workers = 20)
#' ## Not Run 
#' ## result <- BioMM(trainData=methylData, testData=NULL,
#' ##                 pathlistDB, featureAnno=probeAnno, 
#' ##                 restrictUp=200, restrictDown=10, minPathSize=10, 
#' ##                 supervisedStage1, typePCA='regular', 
#' ##                 resample1='BS', resample2='CV', dataMode="allTrain",
#' ##                 repeatA1=20, repeatA2=1, repeatB1=20, repeatB2=1, 
#' ##                 nfolds=10, FSmethod1=NULL, FSmethod2=NULL, 
#' ##                 cutP1=0.1, cutP2=0.1, fdr2=NULL, FScore=param1, 
#' ##                 classifier, predMode, paramlist, innerCore=param2)

#' ## if (is.null(testData)) {
#' ##     predY <- result 
#' ##     trainDataY <- methylData[,1]
#' ##     metricCV <- getMetrics(dataY = trainDataY, predY)
#' ##     message("Cross-validation prediction performance:")
#' ##     print(metricCV)
#' ## } else if (!is.null(testData)){
#' ##     trainDataY <- methylData[,1]
#' ##     testDataY <- testData[,1]
#' ##     cvYscore <- result[[1]]
#' ##     testYscore <- result[[2]] 
#' ##     metricCV <- getMetrics(dataY = trainDataY, cvYscore)
#' ##     metricTest <- getMetrics(dataY = testDataY, testYscore)
#' ##     message("Cross-validation performance:")
#' ##     print(metricCV)
#' ##     message("Test set prediction performance:")
#' ##     print(metricTest)
#' ## }



BioMM <- function(trainData, testData, pathlistDB, featureAnno, 
    restrictUp, restrictDown, minPathSize, supervisedStage1 = TRUE, 
    typePCA, resample1 = "BS", resample2 = "CV", dataMode = "allTrain", 
    repeatA1 = 100, repeatA2 = 1, repeatB1 = 20, repeatB2 = 1, 
    nfolds = 10, FSmethod1, FSmethod2, 
    cutP1, cutP2, fdr2, FScore = MulticoreParam(), classifier, 
    predMode, paramlist, innerCore = MulticoreParam()) {

    trainDataList <- omics2pathlist(data=trainData, pathlistDB, 
                                    featureAnno, restrictUp, 
                                    restrictDown, minPathSize)  
    if (!is.null(testData)){ 
        testDataList <- omics2pathlist(data=testData, pathlistDB, 
                                       featureAnno, restrictUp, 
                                       restrictDown, minPathSize) 
    } else {
        testDataList <- NULL
    }
    
    ## generation of stage-2 data
    if (supervisedStage1 == TRUE) {
        stage2data <- reconBySupervised(trainDataList = trainDataList, 
            testDataList = testDataList,  resample = resample1, dataMode, 
            repeatA = repeatA1, repeatB = repeatB1, nfolds, 
            FSmethod = FSmethod1, cutP = cutP1, fdr = NULL, FScore = FScore, 
            classifier = classifier, predMode = predMode, 
            paramlist = paramlist, innerCore = innerCore, 
            outFileA = NULL, outFileB = NULL)
    } else {
        stage2data <- reconByUnsupervised(trainDataList = trainDataList, 
            testDataList = testDataList, typeMode = typePCA, topPC = 1, 
            innerCore = innerCore, outFileA = NULL, outFileB = NULL)
    }
    
    if (is.null(testDataList)) {
        trainData2 <- stage2data
        testData2 <- NULL
        message("Stage-2: >>> ")
        message(paste0("Number of blocks: ", ncol(trainData2) - 1))
        trainPos2 <- getDataByFilter(trainData = trainData2, testData = NULL, 
            FSmethod = "positive", cutP = 0.1, fdr = NULL, FScore = FScore)
        message(paste0("Number of positive blocks: ", ncol(trainPos2) - 1)) 
    } else {
        ## if testData provided
        trainData2 <- stage2data[[1]]
        testData2 <- stage2data[[2]]
        datalist <- getDataByFilter(trainData2, testData2, FSmethod = "positive", 
            cutP = 0.1, fdr = NULL, FScore = FScore)
        ## include the label
        trainPos2 <- datalist[[1]] 
        message(paste0("Number of positive blocks: ", ncol(trainPos2) - 1))
    }
    
    ## If no positive features
    if (is.null(trainPos2)) {
        message("Warning: No positive features!!")
        result <- data.frame(AUC = 0.5, ACC = 0.5, R2 = 0)
    } else if (!is.null(trainPos2)){
        ## make prediction 
        result <- BioMMstage2pred(trainData = trainData2, testData = testData2, 
            resample = resample2, dataMode, repeatA = repeatA2, 
            repeatB = repeatB2, nfolds, FSmethod = FSmethod2, cutP = cutP2, 
            fdr = fdr2, FScore = FScore, classifier = classifier, 
            predMode = predMode, paramlist = paramlist, innerCore = innerCore)
    }

    return(result)
}

Try the BioMM package in your browser

Any scripts or data that you put into this service are public.

BioMM documentation built on Nov. 8, 2020, 11:04 p.m.