R/supporting_functions.R

Defines functions RF_Pred_Multiclass RF_Pred_Regression RF_Pred Plot_PCoA PERMANOVA_R2

Documented in PERMANOVA_R2 Plot_PCoA RF_Pred RF_Pred_Multiclass RF_Pred_Regression

########## subsequent analyses ##########


#' PERMANOVA R2 of batch and variable of interest
#'
#' @param TAX The taxa read count table, samples (row) by taxa (col).
#' @param batchid The batch indicator, must be a factor.
#' @param covariates The data.frame contains the key variable of interest and other covariates.
#' @param key_index An integer, location of the variable of interest in \code{covariates}.
#'
#' @details Three PERMANOVA R2 will be computed: (1) the standard one (adnois), (2) on euclidified dissimilarities (adonis2, sqrt.dist=T), and (3) with a constant added to the non-diagonal dissimilarities such that all eigenvalues are non-negative in the underlying PCoA (adonis2, add=T).
#'
#' @return A list
#' \itemize{
#'   \item tab_count - A table summarizing PERMANOVA R2 computed on the original taxa read count table in Bray-Curtis dissimilarity.
#'   \item tab_rel - A table summarizing PERMANOVA R2 computed on the corresponding relative abundance table in Euclidean dissimilarity (Aitchison dissimilarity).
#'}
#'
#' @references
#' \itemize{
#'   \item Anderson, M. J. (2014). Permutational multivariate analysis of variance (PERMANOVA). Wiley statsref: statistics reference online, 1-15.
#' }
#'
#' @export

PERMANOVA_R2 <- function(TAX, batchid, covariates, key_index){

  tab_count = tab_rel = matrix(ncol=3, nrow=2)
  colnames(tab_count) = colnames(tab_rel) = c("standard", "sqrt.dist=T", "add=T")
  rownames(tab_count) = rownames(tab_rel) = c("batch", "key")

  # bray-curtis
  tab_count[1,1] = adonis(formula = TAX ~ batchid)$aov.tab[1, 5]
  tab_count[1,2] = adonis2(formula = TAX ~ batchid, sqrt.dist=TRUE)$R2[1]
  tab_count[1,3] = adonis2(formula = TAX ~ batchid, add=TRUE)$R2[1]

  tab_count[2,1] = adonis(formula = TAX ~ ., data=data.frame(covariates[, key_index]))$aov.tab[1, 5]
  tab_count[2,2] = adonis2(formula = TAX ~ ., data=data.frame(covariates[, key_index]), sqrt.dist=TRUE)$R2[1]
  tab_count[2,3] = adonis2(formula = TAX ~ ., data=data.frame(covariates[, key_index]), add=TRUE)$R2[1]

  # aitchison
  Z = coda.base::dist(TAX+0.5, method="aitchison")

  tab_rel[1,1] = adonis(formula = Z  ~ batchid, method="euclidean")$aov.tab[1, 5]
  tab_rel[1,2] = adonis2(formula = Z  ~ batchid, method="euclidean", sqrt.dist=TRUE)$R2[1]
  tab_rel[1,3] = adonis2(formula = Z  ~ batchid, method="euclidean", add=TRUE)$R2[1]

  tab_rel[2,1] = adonis(formula = Z  ~ ., data=data.frame(covariates[, key_index]), method="euclidean")$aov.tab[1, 5]
  tab_rel[2,2] = adonis2(formula = Z  ~ ., data=data.frame(covariates[, key_index]), method="euclidean", sqrt.dist=TRUE)$R2[1]
  tab_rel[2,3] = adonis2(formula = Z  ~ ., data=data.frame(covariates[, key_index]), method="euclidean", add=TRUE)$R2[1]

  return(list(tab_count=tab_count, tab_rel=tab_rel))

}



### PCoA plots: Bray-Curtis, GUniFrac (unweighted, weighted, generalized), Aitchinson

#' Stratified PCoA plots
#'
#' @param TAX The taxa read count table, samples (row) by taxa (col).
#' @param factor The variable for stratification, e.g., batchid or the variable of interest, must be a factor.
#' @param sub_index A vector of sample indices, to restrict the analysis to a subgroup of samples, e.g., c(1:5, 15:20); default is NULL.
#' @param dissimilarity The dissimilarity type, ``Bray'' for Bray-Curtis dissimilarity, ``Aitch'' for Aitchison dissimilarity, ``GUniFrac'' for generalized UniFrac dissimilarity; default is ``Bray''.
#' @param GUniFrac_type The generalized UniFrac type, ``d_1'' for weighted UniFrac, ``d_UW'' for unweighted UniFrac, ``d_VAW'' for variance adjusted weighted UniFrac, ``d_0'' for generalized UniFrac with alpha 0, ``d_0.5'' for generalized UniFrac with alpha 0.5; default is ``d_0.5''.
#' @param tree The rooted phylogenetic tree of R class ``phylo'', must be provided when \code{dissimilarity}=``GUniFrac''; default is NULL.
#' @param main The title of plot; default is NULL.
#' @param aa A real number, the character size for the title.
#'
#' @return Print a PCoA plot.
#'
#' @references
#' \itemize{
#'   \item Chen, J., & Chen, M. J. (2018). Package ‘GUniFrac’. The Comprehensive R Archive Network (CRAN).
#' }
#'
#' @export

Plot_PCoA <- function(TAX, factor, sub_index=NULL, dissimilarity="Bray", GUniFrac_type="d_0.5", tree=NULL, main=NULL, aa=1.5){

  nfactor = length(table(factor))
  if (is.null(sub_index)){
    sub_index = seq(ncol(TAX))
  }

  if (dissimilarity == "Bray"){

    index = which( apply(TAX[, sub_index], 1, sum) > 0 )
    bc =  vegdist(TAX[index, sub_index])
    MDS = cmdscale(bc, k=4)
    s.class(MDS, fac = as.factor(factor[index]), col = 1:nfactor, grid = F, sub = main, csub = aa)

  } else if (dissimilarity == "Aitch"){

    Z = as.matrix(clr(as.matrix(TAX[, sub_index])+0.5))
    MDS = cmdscale(vegdist(Z, method = "euclidean"), k=4)
    s.class(MDS, fac = as.factor(factor), col = 1:nfactor, grid = F, sub = main, csub = aa)

  } else if (dissimilarity == "GUniFrac"){

    index = which( apply(TAX[, sub_index], 1, sum) > 0 )
    unifracs = GUniFrac(TAX[index, sub_index], tree, alpha=c(0, 0.5, 1))$unifracs

    if (!GUniFrac_type %in% c("d_1", "d_UW", "d_VAW", "d_0", "d_0.5")) stop("Please use one of d_1, d_UW, d_VAW, d_0, d_0.5 for GUniFrac dissimilarity.")

    d = unifracs[, , GUniFrac_type]
    MDS = cmdscale(d, k=4)
    s.class(MDS, fac = as.factor(factor[index]), col = 1:nfactor, grid = F, sub = main, csub = aa)

  } else{
    stop("Please use one of Bray, Aitch or GUniFrac as the dissimilarity.")
  }

}


### RF - predict key variable

# Predict binary key variable using random forest, k-fold cross-validation (out-of-bag is not stable), AUC/ROC of the data accumulated from k fold

#' Predict binary variables based on a taxa read count table by random forest
#'
#' @param TAX The taxa read count table, samples (row) by taxa (col).
#' @param factor The binary variable to predict, e.g., the key variable, case/control, must be a factor.
#' @param fold The number of folds; default is 5.
#' @param seed The seed to generate fold indices for samples; default is 2020.
#'
#' @return A list
#' \itemize{
#'   \item pred - A table summarizing the predicted probabilities and true labels for all samples.
#'   \item auc_across_fold - AUC of the ROC curves across folds.
#'   \item auc_on_all - AUC of the ROC curve on all samples.
#'}
#'
#' @export

RF_Pred <- function(TAX, factor, fold=5, seed=2020){

  set.seed(seed)
  ss = sample(1:fold,size=nrow(TAX),replace=T,prob=rep(1/fold, fold)) # assign fold for samples

  temp = as.matrix(TAX)
  rownames(temp) = NULL
  colnames(temp) = NULL

  # do prediction across folds
  record_tab = NULL
  record_auc = NULL
  for (ii in 1:fold){
    trainy = factor[ss!=ii]
    testy = factor[ss==ii]

    trainx = temp[ss!=ii, ]
    testx = temp[ss==ii, ]

    rf_classifier = randomForest(trainy ~ ., data=trainx, importance=TRUE)

    prediction_for_roc_curve = predict(rf_classifier,testx,type="prob")
    pred = prediction(prediction_for_roc_curve[,2],testy)
    record_tab = rbind(record_tab, data.frame(prob=prediction_for_roc_curve[,2],testy))

    auc.perf = performance(pred, measure = "auc")
    record_auc[ii] = auc.perf@y.values[[1]]
  }

  # ROC on the accumulated prediction, e.g., on all samples
  pred = prediction(record_tab[,1],record_tab[,2])
  perf = performance(pred, "tpr", "fpr")

  auc.perf = performance(pred, measure = "auc")
  auc_value = auc.perf@y.values[[1]]

  return(list(pred=record_tab, auc_across_fold=record_auc, auc_on_all=auc_value))

}


# Predict continuous key variable using random forest, k-fold cross-validation (out-of-bag is not stable), rmse from k fold

#' Predict continuous variables based on a taxa read count table by random forest
#'
#' @param TAX The taxa read count table, samples (row) by taxa (col).
#' @param variable The continuous variable to predict.
#' @param fold The number of folds; default is 5.
#' @param seed The seed to generate fold indices for samples; default is 2020.
#'
#' @return A list
#' \itemize{
#'   \item pred - A table summarizing the predicted and true values for all samples.
#'   \item rmse_across_fold - RMSEs across folds.
#'}
#'
#' @export

RF_Pred_Regression <- function(TAX, variable, fold=5, seed=2020){

  set.seed(seed)
  ss = sample(1:fold,size=nrow(TAX),replace=T,prob=rep(1/fold, fold)) # assign fold for samples

  temp = as.matrix(TAX)
  rownames(temp) = NULL
  colnames(temp) = NULL

  # do prediction across folds
  record_tab = NULL
  record_rmse = NULL
  for (ii in 1:fold){
    trainy = variable[ss!=ii]
    testy = variable[ss==ii]

    trainx = temp[ss!=ii, ]
    testx = temp[ss==ii, ]

    rf_regression = randomForest(trainy ~ ., data=trainx, importance=TRUE)

    prediction_for_rmse <- predict(rf_regression,testx)
    record_tab = rbind(record_tab, data.frame(pred=prediction_for_rmse,testy))

    record_rmse[ii] = sqrt(mean( (prediction_for_rmse - testy)^2 ))
  }

  return(list(pred=record_tab, rmse_across_fold=record_rmse))

}


# Predict multiclass key variable using random forest, k-fold cross-validation (out-of-bag is not stable), cross entropy from k fold

#' Predict multiclass variables based on a taxa read count table by random forest
#'
#' @param TAX The taxa read count table, samples (row) by taxa (col).
#' @param factor The multiclass variable to predict, e.g., the key variable, never smoker/former smoker/current smoker, must be a factor.
#' @param fold The number of folds; default is 5.
#' @param seed The seed to generate fold indices for samples; default is 2020.
#'
#' @return A list
#' \itemize{
#'   \item pred - A table summarizing the predicted probabilities and true labels for all samples.
#'   \item cross_entropy_across_fold - mean cross-entropy across folds.
#'}
#'
#' @export

RF_Pred_Multiclass <- function(TAX, factor, fold=5, seed=2020){

  set.seed(seed)
  ss = sample(1:fold,size=nrow(TAX),replace=T,prob=rep(1/fold, fold)) # assign fold for samples

  temp = as.matrix(TAX)
  rownames(temp) = NULL
  colnames(temp) = NULL

  # do prediction across folds
  record_tab = NULL
  record_entropy = NULL
  for (ii in 1:fold){
    trainy = factor[ss!=ii]
    testy = factor[ss==ii]

    trainx = temp[ss!=ii, ]
    testx = temp[ss==ii, ]

    rf_classifier = randomForest(trainy ~ ., data=trainx, importance=TRUE)

    prediction_for_roc_curve = predict(rf_classifier,testx,type="prob")
    testy = dummy_cols(testy)[, -1]

    cross_entropy = NULL
    for (jj in 1:nrow(testy)){
      cross_entropy[jj] = - sum( testy[jj, ] * log(prediction_for_roc_curve[jj, ])    )
    }

    record_tab = rbind(record_tab, data.frame(prediction_for_roc_curve, testy))
    record_entropy[[ii]] = mean( cross_entropy )
  }

  return(list(pred=record_tab, cross_entropy_across_fold=record_entropy))

}
wdl2459/ConQuR documentation built on Aug. 28, 2022, 6:08 a.m.