#' @importFrom doMC registerDoMC
#' @importFrom foreach foreach
#' @importFrom foreach %dopar%
#' @importFrom parallel detectCores
#' @importFrom caret confusionMatrix
utils::globalVariables(c("i"))
#' @title rf_clf.pairwise
#' @description Perform pairwise rf classification for a data matrix between all pairs of group levels.
#' @param df A data matrix or data.frame.
#' @param f A factor with more than two levels.
#' @param nfolds The number of folds in the cross validation.
#' @param ntree The number of trees.
#' @param verbose The boolean value indicating if the computation status and estimated runtime shown.
#' @return A summary table containing the performance of Random Forest classification models,
#' one row per pair of levels of \code{f}.
#' @seealso ranger
#' @examples
#' df <- t(rmultinom(16,160,c(.001,.6,.2,.3,.299))) + 0.65
#' f<-factor(c(rep("A", 4), rep("B", 4), rep("C", 4), rep("D", 4)))
#' f0<-factor(c(rep("A", 8), rep("B", 8)))
#' rf_clf.pairwise(df, f)
#' @author Shi Huang
#' @export
rf_clf.pairwise <- function (df, f, nfolds=3, ntree=5000, verbose=FALSE) {
  ## TODO: check the class of inputs
  # Fit and evaluate one RF classifier on the samples belonging to levels i and j of f.
  # Returns a named numeric vector: AUROC, AUPRC, accuracy, kappa, plus caret's byClass stats.
  rf_compare_levels <- function(df, f, i=1, j=2, nfolds=1, ntree=500, verbose=FALSE) {
    # keep only rows whose factor-level index is i or j; drop unused levels from f
    df_ij <- df[which(as.integer(f) == i | as.integer(f) == j), ]
    f_ij <- factor(f[which(as.integer(f) == i | as.integer(f) == j)])
    # nfolds==1 means out-of-bag estimation; otherwise k-fold cross validation
    if(nfolds==1){
      oob<-rf.out.of.bag(df_ij, f_ij, verbose=verbose, ntree=ntree)
    }else{
      oob<-rf.cross.validation(df_ij, f_ij, nfolds=nfolds, verbose=verbose, ntree=ntree)
    }
    # level i of the full factor plays the role of the positive class
    positive_class=levels(f)[i]
    cat("\nTraining dataset: ", levels(f)[i], "-", levels(f)[j] ,"\n\n")
    conf<-caret::confusionMatrix(data=oob$predicted, f_ij, positive=positive_class)
    acc<-conf$overall[1]
    kappa_oob<-conf$overall[2]
    cat("Accuracy in the cross-validation: ", acc ,"\n")
    # AUROC/AUPRC are only defined for the binary case; NA otherwise
    if(nlevels(f_ij)==2){rf_AUROC<-get.auroc(oob$probabilities[, positive_class], f_ij, positive_class)}else{rf_AUROC<-NA}
    if(nlevels(f_ij)==2){rf_AUPRC<-get.auprc(oob$probabilities[, positive_class], f_ij, positive_class)}else{rf_AUPRC<-NA}
    cat("AUROC in the cross-validation: ", rf_AUROC ,"\n") # NOTE(review): a weird value of "1" kept showing
    cat("AUPRC in the cross-validation: ", rf_AUPRC ,"\n") # NOTE(review): a weird value of "1" kept showing
    c("AUROC"=rf_AUROC, "AUPRC"=rf_AUPRC, acc, kappa_oob, conf$byClass)
  }
  if(nlevels(f)==2){
    # only one possible pair: return its performance vector directly
    out_summ<-rf_compare_levels(df, f, i=1, j=2, nfolds, ntree, verbose)
  }else{
    level_names<-levels(f)
    ix <- stats::setNames(seq_along(level_names), level_names)
    # outer() over all (row level, column level) index pairs; only the lower
    # triangle (i > j) is actually computed, the other cells hold NA placeholders.
    # sapply() over mixed vector/NA results yields a list, so out_list is a
    # list-valued matrix whose dimnames carry the level names.
    out_list<-outer(ix[-1L], ix[-length(ix)],
                    function(ivec, jvec) sapply(seq_along(ivec),
                                                function(k) {
                                                  i <- ivec[k]
                                                  j <- jvec[k]
                                                  if (i > j) {
                                                    rf_compare_levels(df, f, i, j, nfolds, ntree, verbose)
                                                  }else{NA}
                                                }))
    # build "levelA__levelB" row labels matching the computed lower-triangle cells
    dataset_name<-outer(dimnames(out_list)[[1]],dimnames(out_list)[[2]], paste, sep="__")
    out_rownames<-dataset_name[lower.tri(dataset_name, diag = T)]
    # stack the per-pair performance vectors into the summary table
    out_summ<-do.call(rbind, out_list[lower.tri(out_list, diag = T)]); rownames(out_summ)<-out_rownames
  }
  class(out_summ)<-"rf_clf.pairwise"
  out_summ
}
#' @title rf_clf.by_datasets
#' @description It runs standard random forests with oob estimation for classification of
#' c_category in each of the sub-datasets split by the s_category,
#' and applies the model to all the other datasets. The output includes
#' accuracy, auc and Kappa statistics.
#' @param df Training data: a data.frame.
#' @param metadata Sample metadata with at least two columns.
#' @param s_category A string indicating the category in the sample metadata: a 'factor' defining the sample grouping for data spliting.
#' @param c_category A string indicating the category in the sample metadata used as a responsive vector: if a 'factor', rf classification is performed in each of the split datasets.
#' @param positive_class A string indicating one class in the 'c_category' column of metadata.
#' @param nfolds The number of folds in the cross validation.
#' @param clr_transform A boolean value indicating if the clr-transformation applied.
#' @param rf_imp_pvalues A boolean value indicating if compute both importance score and pvalue for each feature.
#' @param verbose A boolean value indicating if show computation status and estimated runtime.
#' @param ntree The number of trees.
#' @param p.adj.method The p-value correction method, default is "BH".
#' @param q_cutoff The cutoff of q values for features, the default value is 0.05.
#' @return A "rf_clf.by_datasets" object: the per-dataset inputs, sample sizes,
#' fitted rf models, AUROC/AUPRC values and feature-importance tables.
#' @seealso ranger
#' @examples
#' df <- data.frame(rbind(t(rmultinom(14, 14*5, c(.21,.6,.12,.38,.099))),
#' t(rmultinom(16, 16*5, c(.001,.6,.42,.58,.299))),
#' t(rmultinom(30, 30*5, c(.011,.6,.22,.28,.289))),
#' t(rmultinom(30, 30*5, c(.091,.6,.32,.18,.209))),
#' t(rmultinom(30, 30*5, c(.001,.6,.42,.58,.299)))))
#' df0 <- data.frame(t(rmultinom(120, 600,c(.001,.6,.2,.3,.299))))
#' metadata<-data.frame(f_s=factor(c(rep("A", 30), rep("B", 30), rep("C", 30), rep("D", 30))),
#' f_c=factor(c(rep("C", 14), rep("H", 16), rep("C", 14), rep("H", 16),
#' rep("C", 14), rep("H", 16), rep("C", 14), rep("H", 16))),
#' f_d=factor(rep(c(rep("a", 10), rep("b", 10), rep("c", 10)), 4)))
#' system.time(rf_clf.by_datasets(df, metadata, s_category='f_s',
#' c_category='f_c', positive_class="C"))
#' rf_clf.by_datasets(df, metadata, s_category='f_s', c_category='f_c',
#' positive_class="C", rf_imp_pvalues=TRUE)
#' rf_clf.by_datasets(df, metadata, s_category='f_s', c_category='f_d')
#' rf_clf.by_datasets(df, metadata, s_category='f_s', c_category='f_d', rf_imp_pvalues=TRUE)
#' @author Shi Huang
#' @export
rf_clf.by_datasets<-function(df, metadata, s_category, c_category, positive_class=NA,
                             rf_imp_pvalues=FALSE, clr_transform=TRUE, nfolds=1, verbose=FALSE, ntree=500,
                             p.adj.method = "BH", q_cutoff=0.05){
  ## TODO: check the class of inputs
  # split both response and data by the grouping factor
  y_list<-split(factor(metadata[, c_category]), factor(metadata[, s_category]))
  x_list<-split(df, factor(metadata[, s_category]))
  datasets<-levels(factor(metadata[, s_category]))
  L<-length(y_list)
  # default positive class: first level of the response in the first sub-dataset
  positive_class<-ifelse(is.na(positive_class), levels(factor(y_list[[1]]))[1], positive_class)
  # 1. sample size of all datasets
  sample_size<-as.numeric(table(factor(metadata[, s_category])))
  # leave two cores free, but never register fewer than one worker
  # (plain `detectCores()-2` registers 0 workers on a 2-core machine)
  nCores <- max(1L, parallel::detectCores() - 2L)
  doMC::registerDoMC(nCores)
  # comb function for parallelization using foreach:
  # element-wise concatenation of the per-iteration result lists
  comb <- function(x, ...) {
    lapply(seq_along(x),
           function(i) c(x[[i]], lapply(list(...), function(y) y[[i]])))
  }
  oper<-foreach(i=1:L, .combine='comb', .multicombine=TRUE, .init=list(list(), list(), list(), list())) %dopar% {
    x<-x_list[[i]]
    y<-factor(y_list[[i]])
    # 2. AUC of random forest model (out-of-bag when nfolds==1, else k-fold CV)
    if(nfolds==1){
      oob<-rf.out.of.bag(x, y, verbose=verbose, ntree=ntree, imp_pvalues = rf_imp_pvalues)
      rf_imps=oob$importances
    }else{
      oob<-rf.cross.validation(x, y, nfolds=nfolds, verbose=verbose, ntree=ntree, imp_pvalues = rf_imp_pvalues)
      # CV returns one importance column per fold; average over folds
      rf_imps=rowMeans(oob$importances)
    }
    rf_AUROC<-get.auroc(oob$probabilities[, positive_class], y, positive_class)
    rf_AUPRC<-get.auprc(oob$probabilities[, positive_class], y, positive_class)
    # 3. # of significantly differential abundant features between health and disease
    out<-BetweenGroup.test(x, y, clr_transform=clr_transform, positive_class=positive_class, p.adj.method = p.adj.method, q_cutoff=q_cutoff)
    feature_imps<-data.frame(feature=rownames(out), dataset=rep(datasets[i], ncol(x)),
                             rf_imps=rf_imps, out)
    list(oob=oob, rf_AUROC=rf_AUROC, rf_AUPRC=rf_AUPRC, feature_imps=feature_imps)
  }
  # oper[[k]] collects the k-th element of each iteration's result list
  result<-list()
  result$x_list<-x_list
  result$y_list<-y_list
  result$datasets<-datasets
  result$sample_size<-sample_size
  result$rf_model_list<-oper[[1]]
  result$rf_AUROC<-unlist(oper[[2]])
  result$rf_AUPRC<-unlist(oper[[3]])
  result$feature_imps_list<-oper[[4]]
  class(result)<-"rf_clf.by_datasets"
  return(result)
}
#' @title rf_reg.by_datasets
#' @description It runs standard random forests with oob estimation for regression of
#' \code{c_category} in each of the sub-datasets split by the \code{s_category},
#' and applies the model to all the other datasets. The output includes
#' MSE, MAE, MAPE, Spearman's rho and R-squared statistics.
#' @param df Training data: a data.frame.
#' @param metadata Sample metadata with at least two columns.
#' @param s_category A string indicating the category in the sample metadata: a 'factor' defining the sample grouping for data spliting.
#' @param c_category A string indicating the category in the sample metadata: a numeric response used for rf regression in each of the split datasets.
#' @param rf_imp_pvalues A boolean value indicating if compute both importance score and pvalue for each feature.
#' @param ntree The number of trees.
#' @param nfolds The number of folds in the cross validation.
#' @param verbose Show computation status and estimated runtime.
#' @return A "rf_reg.by_datasets" object: per-dataset inputs, sample sizes,
#' fitted rf models, predictions, feature importances and regression metrics.
#' @seealso ranger
#' @examples
#' df <- data.frame(rbind(t(rmultinom(14, 14*5, c(.21,.6,.12,.38,.099))),
#' t(rmultinom(16, 16*5, c(.001,.6,.42,.58,.299))),
#' t(rmultinom(30, 30*5, c(.011,.6,.22,.28,.289))),
#' t(rmultinom(30, 30*5, c(.091,.6,.32,.18,.209))),
#' t(rmultinom(30, 30*5, c(.001,.6,.42,.58,.299)))))
#' df0 <- data.frame(t(rmultinom(120, 600,c(.001,.6,.2,.3,.299))))
#' metadata<-data.frame(f_s=factor(c(rep("A", 60), rep("B", 60))),
#' f_s1=factor(c(rep(TRUE, 60), rep(FALSE, 60))),
#' f_c=factor(c(rep("C", 30), rep("H", 30), rep("D", 30), rep("P", 30))),
#' age=c(1:60, 2:61)
#' )
#'
#' reg_res<-rf_reg.by_datasets(df, metadata, nfolds=5, s_category='f_s', c_category='age')
#' reg_res
#' @author Shi Huang
#' @export
rf_reg.by_datasets<-function(df, metadata, s_category, c_category, nfolds=5,
                             rf_imp_pvalues=FALSE, verbose=FALSE, ntree=500){
  ## TODO: check the class of inputs
  y<-metadata[, c_category]
  # a factor response is converted via its labels, not its integer codes
  if(is.factor(metadata[, c_category])) y<-as.numeric(as.character(y))
  y_list<-split(y, factor(metadata[, s_category]))
  x_list<-split(df, factor(metadata[, s_category]))
  # data check (fatal: the original wrapped this stop() in try(), which only
  # printed the error and carried on with inconsistent data)
  if(!all(unlist(lapply(y_list, length)) == unlist(lapply(x_list, nrow))))
    stop("# of samples in x and length of y should match to each other within all datasets!")
  datasets<-levels(factor(metadata[, s_category]))
  L<-length(y_list)
  # sample size of all datasets
  sample_size<-as.numeric(table(metadata[, s_category]))
  # leave two cores free, but never register fewer than one worker
  # (plain `detectCores()-2` registers 0 workers on a 2-core machine)
  nCores <- max(1L, parallel::detectCores() - 2L)
  doMC::registerDoMC(nCores)
  # comb function for parallelization using foreach:
  # element-wise concatenation of the per-iteration result lists
  comb <- function(x, ...) {
    lapply(seq_along(x),
           function(i) c(x[[i]], lapply(list(...), function(y) y[[i]])))
  }
  # rf.cross.validation() results carry one extra element ("nfolds")
  # compared to rf.out.of.bag()
  if(nfolds==1){
    oper_names<-c("rf.model", "y", "predicted", "MSE", "RMSE", "nRMSE",
                  "MAE", "MAPE", "MASE", "Spearman_rho", "R_squared", "Adj_R_squared",
                  "importances", "params", "error.type")
  }else{
    oper_names<-c("rf.model", "y", "predicted", "MSE", "RMSE", "nRMSE",
                  "MAE", "MAPE", "MASE", "Spearman_rho", "R_squared", "Adj_R_squared",
                  "importances", "params", "error.type", "nfolds")
  }
  out_list<-rep(list(list()), length(oper_names))
  # NOTE: `require("foreach")` removed; foreach is imported via @importFrom above
  oper<-foreach(i=1:L, .combine='comb', .multicombine=TRUE,
                .init=out_list) %dopar% {
    if(nfolds==1){
      out<-rf.out.of.bag(x_list[[i]], y_list[[i]],
                         verbose=verbose, ntree=ntree,
                         imp_pvalues = rf_imp_pvalues)
    }else{
      out<-rf.cross.validation(x_list[[i]], y_list[[i]], nfolds=nfolds,
                               verbose=verbose, ntree=ntree,
                               imp_pvalues = rf_imp_pvalues)
    }
    out
  }
  names(oper)<-oper_names
  result<-list()
  result$x_list<-x_list
  result$y_list<-y_list
  result$datasets<-datasets
  result$sample_size<-sample_size
  result$rf_model_list<-oper$rf.model; names(result$rf_model_list)<-result$datasets
  result$rf_predicted<-oper$predicted; names(result$rf_predicted)<-result$datasets
  # Out-of-bag (nfolds==1) yields one importance vector per dataset; cross
  # validation yields a features-x-folds matrix that is averaged per feature.
  # BUGFIX: the original tested `nfolds==3`, mishandling every other fold number.
  if(nfolds==1){
    result$feature_imps_list<-oper$importances
  }else{
    result$feature_imps_list<-lapply(oper$importances, rowMeans)
  }
  names(result$feature_imps_list)<-result$datasets
  result$rf_MSE<-unlist(oper$MSE)
  result$rf_RMSE<-unlist(oper$RMSE)
  result$rf_nRMSE<-unlist(oper$nRMSE)
  result$rf_MAE<-unlist(oper$MAE)
  result$rf_MAPE<-unlist(oper$MAPE)
  result$rf_MASE<-unlist(oper$MASE)
  result$rf_Spearman_rho<-unlist(oper$Spearman_rho)
  result$rf_R_squared<-unlist(oper$R_squared)
  result$rf_Adj_R_squared<-unlist(oper$Adj_R_squared)
  class(result)<-"rf_reg.by_datasets"
  return(result)
}
#' @title rf_clf.cross_appl
#' @description Based on pre-computed rf models classifying 'c_category' in each of the sub-datasets split by the 's_category',
#' perform cross-datasets application of the rf models. The inputs are precalculated
#' rf models, and the outputs include accuracy, auc and Kappa statistics.
#' @param rf_model_list A list of rf.model objects from \code{rf.out.of.bag}.
#' @param x_list A list of training datasets usually in the format of data.frame.
#' @param y_list A list of responsive vectors for classification in the training datasets.
#' @param positive_class A string indicating one common class in each of the elements in the y_list.
#' @return A object of class rf_clf.cross_appl including a list of performance summary and predicted values of all predictions
#' @seealso ranger
#' @examples
#' df <- data.frame(rbind(t(rmultinom(14, 14*5, c(.21,.6,.12,.38,.099))),
#' t(rmultinom(16, 16*5, c(.001,.6,.42,.58,.299))),
#' t(rmultinom(30, 30*5, c(.011,.6,.22,.28,.289))),
#' t(rmultinom(30, 30*5, c(.091,.6,.32,.18,.209))),
#' t(rmultinom(30, 30*5, c(.001,.6,.42,.58,.299)))))
#' df0 <- data.frame(t(rmultinom(120, 600,c(.001,.6,.2,.3,.299))))
#' metadata<-data.frame(f_s=factor(c(rep("A", 30), rep("B", 30), rep("C", 30), rep("D", 30))),
#' f_c=factor(c(rep("C", 14), rep("H", 16), rep("C", 14), rep("H", 16),
#' rep("C", 14), rep("H", 16), rep("C", 14), rep("H", 16))),
#' f_d=factor(rep(c(rep("a", 10), rep("b", 10), rep("c", 10)), 4)))
#' res_list<-rf_clf.by_datasets(df, metadata, s_category='f_s', nfolds=5,
#' c_category='f_c', positive_class="C")
#' rf_model_list<-res_list$rf_model_list
#'
#' rf_clf.cross_appl(rf_model_list, res_list$x_list, res_list$y_list, positive_class="C")
#' #--------------------
#' comp_group="A"
#' comps_res<-rf_clf.comps(df, f=metadata[, 'f_s'], comp_group, verbose=FALSE,
#' ntree=500, p.adj.method = "bonferroni", q_cutoff=0.05)
#' comps_res
#' rf_clf.cross_appl(comps_res$rf_model_list,
#' x_list=comps_res$x_list,
#' y_list=comps_res$y_list,
#' positive_class=comp_group)
#' @author Shi Huang
#' @export
rf_clf.cross_appl<-function(rf_model_list, x_list, y_list, positive_class=NA){
  ## TODO: check the class of inputs
  L<-length(rf_model_list)
  y_list<-lapply(y_list, factor)
  # default positive class: first level of the response in the first sub-dataset
  positive_class<-ifelse(is.na(positive_class), levels(factor(y_list[[1]]))[1], positive_class)
  # BUGFIX: identical() interprets a third argument as its `num.eq` flag, so the
  # original `identical(L, length(x_list), length(y_list))` never compared
  # y_list's length; also the stop() was wrapped in try(), making it non-fatal.
  if(L != length(x_list) || L != length(y_list))
    stop("The length of x list, y list and rf model list should be identical.")
  # one row per (train, test) dataset pair, incl. self-validation
  perf_summ<-data.frame(matrix(NA, ncol=18, nrow=L*L))
  colnames(perf_summ)<-c("Train_data", "Test_data", "Validation_type", "Accuracy", "AUROC", "AUPRC", "Kappa",
                         "Sensitivity", "Specificity", "Pos_Pred_Value","Neg_Pred_Value", "Precision", "Recall",
                         "F1", "Prevalence", "Detection_Rate", "Detection_Prevalence", "Balanced_Accuracy")
  predicted<-matrix(list(), ncol=2, nrow=L*L)
  colnames(predicted)<-c("predicted", "probabilities")
  for(i in 1:L){
    y<-y_list[[i]]
    x<-x_list[[i]]
    if(nlevels(y)==1) stop("Less than one level in the subgroup for classification")
    oob<-rf_model_list[[i]]
    #---
    # RF Training accuracy
    #---
    cat("\nTraining dataset: ", names(x_list)[i] ,"\n\n")
    conf<-caret::confusionMatrix(data=oob$predicted, oob$y, positive=positive_class)
    acc<-conf$overall[1]
    kappa_oob<-conf$overall[2]
    cat("Accuracy in the self-validation: ", acc ,"\n")
    #---
    # AUC computation
    #---
    auroc<-get.auroc(oob$probabilities[, positive_class], oob$y, positive_class)
    auprc<-get.auprc(oob$probabilities[, positive_class], oob$y, positive_class)
    cat("AUROC in the self-validation: ", auroc ,"\n")
    cat("AUPRC in the self-validation: ", auprc ,"\n")
    # indices of the other datasets this model is applied to
    # (was stored in `T`, which shadowed the built-in TRUE)
    other_idx<-setdiff(1:L, i)
    a<-1+(i-1)*L  # row offset of training dataset i in perf_summ
    perf_summ[a, 1:3]<-c(names(x_list)[i], names(x_list)[i], "self_validation")
    perf_summ[a, 4:18]<-c(acc, auroc, auprc, kappa_oob, conf$byClass)
    predicted[a, 1][[1]]<-data.frame(test_y=y, pred_y=oob$predicted)
    predicted[a, 2][[1]]<-oob$probabilities
    loop_num<-1
    for(j in other_idx){
      if(nrow(x_list[[j]])>0){
        newx<-x_list[[j]]
        newy<-y_list[[j]]
        if(inherits(oob, "rf.out.of.bag")){
          oob_rf.model<-oob$rf.model
        }else{
          oob_rf.model<-oob$rf.model[[1]] # pick one out of K rf models in the "rf.cross.validation"
        }
        pred_prob <- get.predict.probability.from.forest(oob_rf.model, newx) # ranger only
        pred_prob<-pred_prob[,order(colnames(pred_prob))] # to avoid unanticipated order of numeric levels of factor y
        pred_newy<-factor(predict(oob_rf.model, newx, type="response")$predictions) # ranger only
        if(identical(levels(newy), levels(oob$y))){
          colnames(pred_prob)<- levels(newy)
          levels(pred_newy)<- levels(newy)
          #---Accuracy
          cat("Test dataset: ", names(x_list)[j] ,"\n")
          test_conf<-caret::confusionMatrix(data=pred_newy, newy, positive=positive_class)
          test_acc<-test_conf$overall[1]
          test_kappa<-test_conf$overall[2]
          cat("Accuracy in the cross-applications: ", test_acc ,"\n")
          #---AUC
          test_auroc<-get.auroc(pred_prob[, positive_class], newy, positive_class)
          test_auprc<-get.auprc(pred_prob[, positive_class], newy, positive_class)
          cat("AUROC in the cross-applications: ", test_auroc ,"\n")
          cat("AUPRC in the cross-applications: ", test_auprc ,"\n")
          perf_summ[a+loop_num, 4:18]<-c(test_acc, test_auroc, test_auprc, test_kappa, test_conf$byClass)
        }else{
          # level sets differ between train and test: metrics are undefined
          colnames(pred_prob)<- levels(oob$y)
          levels(pred_newy)<- levels(oob$y)
          perf_summ[a+loop_num, 4:18]<-rep(NA, 15)
        }
        perf_summ[a+loop_num, 1:3]<-c(names(x_list)[i], names(x_list)[j], "cross_application")
        predicted[a+loop_num, 1][[1]]<-data.frame(test_y=newy, pred_y=pred_newy)
        predicted[a+loop_num, 2][[1]]<-pred_prob
        loop_num<-loop_num+1
      }
    }
  }
  res<-list()
  res$perf_summ<-perf_summ
  res$predicted<-predicted
  class(res)<-"rf_clf.cross_appl"
  res
}
#' @title rf_reg.cross_appl
#' @description Based on pre-computed rf models regressing \code{c_category} in each of the sub-datasets split by the \code{s_category},
#' perform cross-datasets application of the rf models. The inputs are precalculated
#' rf regression models, x_list and y_list.
#' @param rf_list A "rf_reg.by_datasets" object holding the fitted rf models and predictions.
#' @param x_list A list of training datasets usually in the format of data.frame.
#' @param y_list A list of numeric responsive vectors for regression in the training datasets.
#' @return A "rf_reg.cross_appl" object with a performance summary table and the predicted values.
#'
#' @seealso ranger
#' @examples
#' df <- data.frame(rbind(t(rmultinom(14, 14*5, c(.21,.6,.12,.38,.099))),
#' t(rmultinom(16, 16*5, c(.001,.6,.42,.58,.299))),
#' t(rmultinom(30, 30*5, c(.011,.6,.22,.28,.289))),
#' t(rmultinom(30, 30*5, c(.091,.6,.32,.18,.209))),
#' t(rmultinom(30, 30*5, c(.001,.6,.42,.58,.299)))))
#' df0 <- data.frame(t(rmultinom(120, 600,c(.001,.6,.2,.3,.299))))
#' metadata<-data.frame(f_s=factor(c(rep("A", 60), rep("B", 60))),
#' f_s1=factor(c(rep(TRUE, 60), rep(FALSE, 60))),
#' f_c=factor(c(rep("C", 30), rep("H", 30), rep("D", 30), rep("P", 30))),
#' age=c(1:60, 2:61)
#' )
#'
#' table(metadata[, 'f_c'])
#' reg_res<-rf_reg.by_datasets(df, metadata, s_category='f_c', c_category='age')
#' rf_reg.cross_appl(reg_res, x_list=reg_res$x_list, y_list=reg_res$y_list)
#' reg_res<-rf_reg.by_datasets(df, metadata, nfolds=5, s_category='f_c', c_category='age')
#' rf_reg.cross_appl(reg_res, x_list=reg_res$x_list, y_list=reg_res$y_list)
#' @author Shi Huang
#' @export
rf_reg.cross_appl<-function(rf_list, x_list, y_list){
  ## TODO: check the class of inputs
  L<-length(rf_list$rf_model_list)
  # BUGFIX: identical() interprets a third argument as its `num.eq` flag, so the
  # original `identical(L, length(x_list), length(y_list))` never compared
  # y_list's length; also the stop() calls were wrapped in try(), making them
  # non-fatal.
  if(L != length(x_list) || L != length(y_list))
    stop("The length of x list, y list and rf model list should be identical.")
  # BUGFIX: must be any() — the original all() only failed when EVERY element
  # was non-numeric, letting mixed lists through.
  if(any(unlist(lapply(y_list, mode))!="numeric"))
    stop("All elements in the y list should be numeric for regression.")
  # regression metrics, defined once instead of on every inner-loop iteration
  MSE<-function(y, pred_y){ mean((y-pred_y)^2) }
  RMSE<-function(y, pred_y){ sqrt(mean((y-pred_y)^2)) }
  MAE<-function(y, pred_y){ mean(abs(y-pred_y)) }
  MAPE<-function(y, pred_y){ mean(abs(y-pred_y)/y) }
  R2<-function(y, pred_y){ 1-(sum((y-pred_y)^2) / sum((y-mean(y))^2)) }
  Spearman_rho<-function(y, pred_y){ stats::cor.test(y, pred_y, method = "spearman")$estimate }
  # one row per (train, test) dataset pair, incl. self-validation
  # (column names kept as-is, including the historical "acutal" spelling,
  # because downstream code may reference them)
  perf_summ<-data.frame(matrix(NA, ncol=14, nrow=L*L))
  colnames(perf_summ)<-c("Train_data", "Test_data", "Validation_type", "Sample_size", "Min_acutal_value", "Max_acutal_value", "Min_predicted_value", "Max_predicted_value",
                         "MSE", "RMSE", "MAE", "MAPE", "Spearman_rho", "R_squared")
  predicted<-list()
  for(i in 1:L){
    y<-y_list[[i]]
    x<-x_list[[i]]
    # (the classification-era `nlevels(y)==1` check was dropped: nlevels() of a
    # numeric vector is always 0, so it could never trigger)
    rf_model<-rf_list$rf_model_list[[i]]
    #--- RF Training performance: MSE, MAE and R_squared
    cat("\nTraining dataset: ", names(x_list)[i] ,"\n\n")
    train_sample_size<-length(y)
    train_y_min<-range(y)[1]
    train_y_max<-range(y)[2]
    train_pred_y_min<-range(rf_list$rf_predicted[[i]])[1]
    train_pred_y_max<-range(rf_list$rf_predicted[[i]])[2]
    train_perf<-get.reg.performance(rf_list$rf_predicted[[i]], y_list[[i]])
    cat("MSE in the self-validation: ", train_perf[["MSE"]] ,"\n")
    cat("RMSE in the self-validation: ", train_perf[["RMSE"]] ,"\n")
    cat("MAE in the self-validation: ", train_perf[["MAE"]] ,"\n")
    cat("MAE percentage in the self-validation: ", train_perf[["MAPE"]] ,"\n")
    cat("R squared in the self-validation: ", train_perf[["R_squared"]] ,"\n")
    cat("Spearman_rho in the self-validation: ", train_perf[["Spearman_rho"]] ,"\n")
    # indices of the other datasets this model is applied to
    # (was stored in `T`, which shadowed the built-in TRUE)
    other_idx<-setdiff(1:L, i)
    a<-1+(i-1)*L  # row offset of training dataset i in perf_summ
    perf_summ[a, 1:3]<-c(names(x_list)[i], names(x_list)[i], "self_validation")
    perf_summ[a, 4:14]<-c(train_sample_size, train_y_min, train_y_max, train_pred_y_min, train_pred_y_max,
                          train_MSE=train_perf["MSE"],
                          train_RMSE=train_perf["RMSE"],
                          train_MAE=train_perf["MAE"],
                          train_MAPE=train_perf["MAPE"],
                          train_R_squared=train_perf["R_squared"],
                          train_Spearman_rho=train_perf["Spearman_rho"])
    predicted[[a]]<-data.frame(test_y=y, pred_y=rf_list$rf_predicted[[i]])
    names(predicted)[a]<-paste(names(x_list)[i], names(x_list)[i], sep="__VS__")
    loop_num<-1
    for(j in other_idx){
      if(nrow(x_list[[j]])>0){
        newx<-x_list[[j]]
        newy<-y_list[[j]]
        # an unclassed list holds the K models from rf.cross.validation; pick one
        if(inherits(rf_model, "list")) rf_model <- rf_model[[1]]
        pred_newy<-predict(rf_model, newx, type="response")$predictions # ranger only
        #--- RF test performance: MSE, MAE and R_squared
        cat("Test dataset: ", names(x_list)[j] ,"\n")
        test_sample_size<-length(newy)
        test_y_min<-range(newy)[1]
        test_y_max<-range(newy)[2]
        test_pred_y_min<-range(pred_newy)[1]
        test_pred_y_max<-range(pred_newy)[2]
        test_MSE<-MSE(newy, pred_newy)
        test_RMSE<-RMSE(newy, pred_newy)
        test_MAE<-MAE(newy, pred_newy)
        test_MAPE<-MAPE(newy, pred_newy)
        test_R_squared<-R2(newy, pred_newy)
        test_Spearman_rho<-Spearman_rho(newy, pred_newy)
        cat("MSE in the cross-applications: ", test_MSE ,"\n")
        cat("RMSE in the cross-applications: ", test_RMSE ,"\n")
        cat("MAE in the cross-applications: ", test_MAE ,"\n")
        cat("MAE percentage in the cross-applications: ", test_MAPE ,"\n")
        cat("R squared in the cross-application: ", test_R_squared ,"\n")
        # BUGFIX: this line used to be labeled "Adjusted R squared" while
        # printing the Spearman correlation
        cat("Spearman_rho in the cross-application: ", test_Spearman_rho ,"\n")
        perf_summ[a+loop_num, 1:3]<-c(names(x_list)[i], names(x_list)[j], "cross_application")
        perf_summ[a+loop_num, 4:14]<-c(test_sample_size, test_y_min, test_y_max, test_pred_y_min, test_pred_y_max,
                                       test_MSE, test_RMSE, test_MAE, test_MAPE, test_R_squared, test_Spearman_rho)
        predicted[[a+loop_num]]<-data.frame(test_y=newy, pred_y=pred_newy)
        names(predicted)[a+loop_num]<-paste(names(x_list)[i], names(x_list)[j], sep="__VS__")
        loop_num<-loop_num+1
      }
    }
  }
  res<-list()
  res$perf_summ<-perf_summ
  res$predicted<-predicted
  class(res)<-"rf_reg.cross_appl"
  res
}
#' @title generate.comps_datalist
#' @description Generate sub-datasets and sub-grouping factors by pairing
#' one specified level with each of the other levels of a factor.
#' @param df Training data: a data.frame.
#' @param f A factor in the metadata with at least two levels (groups).
#' @param comp_group A string indicating the reference group in \code{f}.
#' @return A list with two elements: \code{f_list}, a named list of two-level
#' factors, and \code{df_list}, a named list of the matching data subsets;
#' names follow the pattern "comp_group_VS_other".
#' @author Shi Huang
#'
generate.comps_datalist<-function(df, f, comp_group){
  other_groups <- setdiff(levels(f), comp_group)
  pair_names <- paste(comp_group, other_groups, sep="_VS_")
  # one logical mask per comparison: comp_group samples plus one other group
  keep_masks <- lapply(other_groups, function(g) f==comp_group | f==g)
  f_list <- lapply(keep_masks, function(keep) factor(f[which(keep)]))
  df_list <- lapply(keep_masks, function(keep) df[which(keep), ])
  names(f_list) <- names(df_list) <- pair_names
  list(f_list=f_list, df_list=df_list)
}
#' @title rf_clf.comps
#' @description It runs standard random forests with oob estimation for classification of
#' one level VS each of the other levels of one category in the datasets.
#' The output includes a list of rf models for the sub datasets
#' and all important statistics for each of the features.
#'
#' @param df Training data: a data.frame.
#' @param f A factor in the metadata with at least two levels (groups).
#' @param comp_group A string indicating the reference group in \code{f}.
#' @param clr_transform A boolean value indicating if the clr-transformation applied.
#' @param rf_imp_values A boolean value indicating if compute both importance score and pvalue for each feature.
#' @param verbose A boolean value indicating if show computation status and estimated runtime.
#' @param ntree The number of trees.
#' @param p.adj.method The p-value correction method, default is "bonferroni".
#' @param q_cutoff The cutoff of q values for features, the default value is 0.05.
#' @return A "rf_clf.comps" object: the pairwise sub-datasets, sample sizes,
#' fitted rf models, AUROC/AUPRC values and feature-importance tables.
#' @seealso ranger
#' @examples
#' df0 <- data.frame(t(rmultinom(60, 300,c(.001,.6,.2,.3,.299))))
#' df <- data.frame(rbind(t(rmultinom(7, 75, c(.21,.6,.12,.38,.099))),
#' t(rmultinom(8, 75, c(.001,.6,.42,.58,.299))),
#' t(rmultinom(15, 75, c(.011,.6,.22,.28,.289))),
#' t(rmultinom(15, 75, c(.091,.6,.32,.18,.209))),
#' t(rmultinom(15, 75, c(.001,.6,.42,.58,.299)))))
#' f=factor(c(rep("A", 15), rep("B", 15), rep("C", 15), rep("D", 15)))
#' comp_group="A"
#' comps_res<-rf_clf.comps(df, f, comp_group, verbose=FALSE, ntree=500,
#' p.adj.method = "bonferroni", q_cutoff=0.05)
#' comps_res
#' @author Shi Huang
#' @export
rf_clf.comps<-function(df, f, comp_group, verbose=FALSE, clr_transform=TRUE,
                       rf_imp_values=FALSE,ntree=500, p.adj.method = "bonferroni",
                       q_cutoff=0.05){
  f<-factor(f)
  all_other_groups<-levels(f)[which(levels(f)!=comp_group)]
  L<-length(all_other_groups)
  # leave two cores free, but never register fewer than one worker
  # (plain `detectCores()-2` registers 0 workers on a 2-core machine)
  nCores <- max(1L, parallel::detectCores() - 2L)
  doMC::registerDoMC(nCores)
  # comb function for parallelization using foreach:
  # element-wise concatenation of the per-iteration result lists
  comb <- function(x, ...) {
    lapply(seq_along(x),
           function(i) c(x[[i]], lapply(list(...), function(y) y[[i]])))
  }
  oper<-foreach(i=1:L, .combine='comb', .multicombine=TRUE,
                .init=list(list(), list(), list(), list(), list(), list(), list(), list())) %dopar% {
    # pairwise sub-dataset: comp_group samples plus one other group
    sub_f<-factor(f[which(f==comp_group | f==all_other_groups[i])])
    sub_df<-df[which(f==comp_group | f==all_other_groups[i]), ]
    print(levels(factor(sub_f)))
    # 1. sample size of this pairwise dataset
    sample_size<-length(sub_f)
    dataset<-paste(comp_group, all_other_groups[i], sep="_VS_")
    # 2. AUC of random forest model
    oob <- rf.out.of.bag(sub_df, factor(sub_f), verbose=verbose, ntree=ntree, imp_pvalues = rf_imp_values)
    if(nlevels(factor(sub_f))==2){
      rf_AUROC <- get.auroc(oob$probabilities[, comp_group], factor(sub_f), comp_group)
      rf_AUPRC <- get.auprc(oob$probabilities[, comp_group], factor(sub_f), comp_group)
    }else{
      # BUGFIX: the original assigned `rf_AUC` here, leaving rf_AUROC and
      # rf_AUPRC undefined and crashing the result assembly below
      rf_AUROC <- NA
      rf_AUPRC <- NA
    }
    # 3. # of significantly differential abundant features between health and disease
    out<-BetweenGroup.test(sub_df, factor(sub_f), clr_transform=clr_transform, q_cutoff=q_cutoff,
                           positive_class=comp_group, p.adj.method = p.adj.method)
    stats_out<-data.frame(feature=rownames(out),
                          dataset=rep(dataset, ncol(sub_df)), rf_imps=oob$importances,
                          out)
    list(x=sub_df, y=sub_f, sample_size=sample_size, datasets=dataset,
         oob=oob, rf_AUROC=rf_AUROC, rf_AUPRC=rf_AUPRC, stats_out=stats_out)
  }
  # name the per-dataset lists by the "comp_group_VS_other" labels
  names(oper[[1]])<-names(oper[[2]])<-names(oper[[5]])<-unlist(oper[[4]])
  result<-list()
  result$x_list<-oper[[1]]
  result$y_list<-oper[[2]]
  result$sample_size<-unlist(oper[[3]])
  result$datasets<-unlist(oper[[4]])
  result$rf_model_list<-oper[[5]]
  result$rf_AUROC<-unlist(oper[[6]])
  result$rf_AUPRC<-unlist(oper[[7]])
  result$feature_imps_list<-oper[[8]]
  class(result)<-"rf_clf.comps"
  return(result)
}
# (removed scraped website-embed boilerplate ("Add the following code to your
# website..."): it was not R code and broke parsing of this file)