R/BDMMA.R

Defines functions trace_plot fdr_cut VBatch BDMMA

Documented in BDMMA fdr_cut trace_plot VBatch

#' Bayesian Dirichlet--Multinomial approach for meta-analysis of metagenomic read counts
#' @import Rcpp RcppArmadillo RcppEigen
#' @param Microbiome_dat A SummarizedExperiment object that includes the taxonomy read counts, phenotypes and batch labels.
#' @param abundance_threshold The minimum abundance level for the taxa to be included (default value = 5e-05).
#' @param burn_in The length of burn in period before sampling the parameters (default value = 5,000).
#' @param sample_period The length of sampling period for estimating parameters' distribution (default value = 5,000)
#' @param bFDR The false discovery rate level to control (default value = 0.1).
#' @param PIPcut The threshold to cut the posterior inclusion probabilities (PIPs). By default, PIP is thresholding at 0.5.
#' @return A list contains the selected taxa and summary of parameters included in the model.
#' \item{selected.taxa}{A list includes the selected taxa fesatures that are significantly associated
#' with the main effect variable.}
#' \item{parameter_summary}{A data.frame contains the mean and quantiles of the parameters included
#' in the model. Each row includes a parameter's distribution summary and the parameter name is
#' labeled in the first row. alpha_g: the baseline intercept of g-th taxon; betaj_g: the association strength between
#' the g-th taxon and j-th input variables; deltai_g: the batch effect parameter of batch i, taxon g;
#' L_g: the posterior selection probability of g-th taxon; p: the proportion of significantly associated
#' taxa; eta: the standard deviation of the spike distribution (in the spike-and-slab prior).}
#' \item{PIP}{A vector contains the PIPs of selected microbial taxa.}
#' \item{bFDR}{The corresponding bFDR under the selected microbial taxa.}
#' @docType data
#' @examples
#' require(SummarizedExperiment)
#' data(Microbiome_dat)
#' ## (not run)
#' ## output <- BDMMA(Microbiome_dat, burn_in = 3000, sample_period = 3000)
#' @references Dai, Zhenwei, et al. "Batch Effects Correction for Microbiome Data with Dirichlet-multinomial Regression." Bioinformatics 1 (2018): 8.
#' @export

BDMMA=function(Microbiome_dat, abundance_threshold = 0.00005, burn_in = 5000,
               sample_period = 5000, bFDR = 0.1, PIPcut = 0.5){

  col_data = colData(Microbiome_dat)
  X = data.frame(col_data$main, col_data$confounder)
  Y = t(assay(Microbiome_dat))
  batch = as.numeric(factor(col_data[,3]))
  continuous = mcols(col_data)[1:2,]

  # check input data
  if (length(unique(X[,1]))>2|length(unique(X[,1]))<2){
    stop("The main effect variable is not binary")
  }

  if (length(unique(batch))<2){
    stop("Batch number is less than two: not a well defined batch effect indicator")
  }

  # filter low abundance bacteria
  abundance_m = sweep(Y, 1, rowSums(Y), "/")
  taxa_abundance = apply(abundance_m, 2, mean)
  YY = Y[,(taxa_abundance>abundance_threshold)]
  s_prop = taxa_abundance[taxa_abundance>abundance_threshold]
  if (length(s_prop)<ncol(Y)){
    YY = cbind(YY, (rowSums(Y)-rowSums(YY)))
    s_prop = c(s_prop,(1-sum(s_prop)))
  }


  # data characters
  n_dim = ncol(YY)
  n_size = rowSums(Y)
  n_var = ncol(X)
  n_batch = length(unique(batch))
  taxa = names(YY)

  # Normalize clinical metadata
  X = sweep(X, 2, colMeans(X), "-")
  sd_X=apply(X, 2, sd)
  if (sum(continuous != 0) > 0){
    X[,(continuous != 0)] = as.matrix(X[,(continuous != 0)])
    X[,(continuous != 0)] = sweep(X[,(continuous != 0)], 2, sd_X[(continuous != 0)], "/")
  }

  # Initialize the parameter
  a = b = 1
  c = 0.1
  sigma1 = sigma2 = sigma3 = sqrt(10)
  eta = 0.1
  L = alpha = rep(0, n_dim)

  lambda = rexp(c, n = 1)
  p = rbeta(1, shape1 = 1, shape2 = 1)

  for (i in seq_len(n_dim)){
    alpha[i] = rexp(1/(lambda*s_prop[i]), n = 1)
    L[i] = rbinom(size = 1, n = 1, prob = p)
  }

  alpha_m = matrix(alpha, nrow = nrow(YY), ncol = n_dim, byrow = TRUE)
  beta = matrix(0, nrow = n_var, ncol = n_dim)
  delta = matrix(0, nrow = n_batch, ncol = n_dim)
  delta_m = matrix(0, nrow = nrow(YY), ncol = n_dim)

  # Estimate parameters with MCMC
  iter = burn_in + sample_period

  X = as.matrix(X)
  YY = as.matrix(YY)

  batch_count = table(batch)

  cat("#################### Start MCMC ####################\n\n")

  output = .Call('_BDMMAcorrect_MCMC', PACKAGE = 'BDMMAcorrect', alpha = alpha, alpha_m = alpha_m, x = X, y = YY,
                 beta = beta, delta = delta, delta_m = delta_m, e_delta = t(delta),
                 T = n_dim, N = nrow(YY), K = n_var, I = n_batch, lambda = lambda,
                 prop = s_prop, L = L, sigma1 = sigma1, sigma2 = sigma2, sigma3 = sigma3,
                 iter = iter, eta = eta, a = a, b = b, p = p, batch = batch, weight = batch_count)

  ## calculate the posterior mean of parameters
  alpha_mean = apply(output[(iter-sample_period):iter, seq_len(n_dim)], 2, mean)
  beta1_mean = apply(output[(iter-sample_period):iter, (n_dim+1):(2*n_dim)], 2, mean)
  L_mean = apply(output[(iter-sample_period):iter, (2*n_dim+1):(3*n_dim)], 2, mean)
  eta_mean = mean(output[(iter-sample_period):iter, (3*n_dim+1)])
  p_mean = mean(output[(iter-sample_period):iter, (3*n_dim+2)])

  if (n_var >= 2){
    beta2_mean = apply(output[(iter-sample_period):iter, (3 * n_dim + 3):((2 + n_var) * n_dim + 2)], 2, mean)
  }else{beta2_mean = NULL}

  delta_mean = apply(output[(iter-sample_period):iter, ((2 + n_var) * n_dim + 3):((2 + n_var + n_batch) * n_dim + 2)], 2, mean)

  quantile_2 = function(x){
    return(quantile(x,probs = c(0.025,0.25,0.5,0.75,0.975)))
  }

  ## Generate the summary table for the parameters
  parameter_summary = t(apply(output[(iter-sample_period):iter,], 2, quantile_2))
  parameter_summary = data.frame(mean = c(alpha_mean, beta1_mean, L_mean, eta_mean, p_mean,
                                          beta2_mean, delta_mean), parameter_summary)

  name1 = name2 = name3 = rep(0, n_dim)
  name4 = matrix(0, nrow = n_var - 1, ncol = n_dim)
  name5 = matrix(0, nrow = n_batch, ncol = n_dim)

  for (ii in seq_len(n_dim)){
    name1[ii] = paste("alpha", ii, sep = "_")
    name2[ii] = paste("beta1", ii, sep = "_")
    name3[ii] = paste("L", ii, sep = "_")
    if (n_var>=2){
      for (jj in 2:n_var){
        name4[jj-1,ii] = paste(paste("beta", jj, sep=""), ii, sep = "_")
      }
    }
    else name4 = NULL
    for (kk in seq_len(n_batch)){
      name5[kk,ii] = paste(paste("delta", kk, sep=""), ii, sep = "_")
    }
  }
  name4 = as.vector(name4)
  name5 = as.vector(name5)
  row.names(parameter_summary) = c(name1, name2, name3, "eta", "p", name4, name5)

  ## Record the trace of the parameters
  trace = as.data.frame(output)
  names(trace) = c(name1, name2, name3, "eta", "p", name4, name5)

  ## Select the significantly associated taxa
  prediction_1 = (L_mean > PIPcut) * 1
  cutoff = fdr_cut(L_mean, alpha = bFDR)
  prediction_2 = (L_mean >= cutoff) * 1
  selected.taxa = list()
  selected.taxa$MIM = taxa[prediction_1 > 0]
  selected.taxa$bFDR = taxa[prediction_2 > 0]

  output=list()

  output$trace = trace
  output$parameter_summary = parameter_summary
  output$selected.taxa = selected.taxa
  output$PIP = L_mean[L_mean > PIPcut]
  output$bFDR = 1 - mean(L_mean[L_mean >= cutoff])


  return(output)
}


#' Visualize batch effect with principal coordinate analysis
#' @param Microbiome_dat A SummarizedExperiment object that includes the taxonomy read counts, phenotypes and batch labels.
#' @param main_variable Optional. A vector containing the main effect variable. Only for
#' categorical main effect variable.The function will generate a figure for each catagory.
#' @param method A string indicating which method should be used to calculate the distance matrix for
#' principal coordinate analysis.
#' @return The function returns a list containing plot objects of principal coordinate analysis figures.
#' @examples
#' data(Microbiome_dat)
#' figure <- VBatch(Microbiome_dat, method = "bray")
#' print(figure)
#' @export
VBatch = function(Microbiome_dat, main_variable = NULL, method = "bray"){
  col_data=colData(Microbiome_dat)
  Y = t(assay(Microbiome_dat))
  batch = as.factor(col_data[,3])
  nsize = rowSums(Y)
  abundance = sweep(Y, 1, nsize, "/")
  distance = vegan::vegdist(x = abundance, method = method)
  mds = ape::pcoa(distance)
  mds1 = mds$vectors[,1]
  mds2 = mds$vectors[,2]

  figure = list()
  #### With main_variable input
  if (!is.null(main_variable)){
    main_variable = as.factor(main_variable)
    point =data.frame(PC1 = mds1, PC2 = mds2, batch, main = main_variable)
    k = 1
    for (i in levels(point$main)){
      curve = data.frame()
      point_sub=point[point$main == i,]
      for (g in levels(point_sub$batch)){
        df = with(point_sub[point_sub$batch==g,], ellipse::ellipse(x = cor(PC1, PC2), scale = c(sd(PC1), sd(PC2)),
                                                                   centre = c(mean(PC1), mean(PC2)),level = 0.8))
        df = data.frame(df,group = g)
        curve = rbind(curve,df)
      }
      g = ggplot(aes_string(x = 'PC1', y = 'PC2', color = 'batch'),data = point_sub) +
        geom_point(size = 2) + theme(legend.position = "right") +
        ggtitle(paste("main_variable =", i)) +
        theme(plot.title = element_text(hjust = 0.5), title = element_text(size = 12)) +
        theme(panel.background = element_blank(), axis.line = element_line(colour = "black")) +
        theme(legend.text = element_text(size = 13)) +
        theme(legend.title = element_text(size = 13)) +
        theme(axis.title = element_text(size = 13)) +
        theme(axis.text = element_text(size = 12)) +
        geom_path(data = curve, aes_string(x = 'x', y = 'y', colour = 'group'), size = 1, linetype = 1)
      figure[[k]] = g
      k = k + 1
    }
  }

  #### Without main_variable input
  if (is.null(main_variable)){
    point = data.frame(PC1 = mds1, PC2 = mds2, batch)
    curve = data.frame()
    for (g in levels(point$batch)){
      df = with(point[point$batch==g,], ellipse::ellipse(x = cor(PC1, PC2), scale = c(sd(PC1), sd(PC2)),
                                                         centre = c(mean(PC1), mean(PC2)),level = 0.8))
      df = data.frame(df,group=g)
      curve = rbind(curve,df)
    }
    g = ggplot(aes_string(x = 'PC1', y = 'PC2', color = 'batch'), data = point) +
      geom_point(size = 2) + theme(legend.position = "right") +
      theme(plot.title = element_text(hjust = 0.5),title = element_text(size = 12)) +
      theme(panel.background = element_blank(), axis.line = element_line(colour = "black")) +
      theme(legend.text = element_text(size = 13)) +
      theme(legend.title = element_text(size = 13)) +
      theme(axis.title = element_text(size = 13)) +
      theme(axis.text = element_text(size = 12)) +
      geom_path(data = curve, aes_string(x = 'x', y = 'y', colour = 'group'), size = 1, linetype = 1)
    figure = g
  }
  return(figure)
}




#' Threshold the posterior inclusion probability (PIP) through control Bayesian false discovery
#' rate (bFDR).
#' @param PIP_vec A vector contains the PIPs of parameters
#' @param alpha The level of the bFDR to need to control (default = 0.1)
#' @return The cutoff for PIPs to control the bFDR with the user defined value, alpha.
#' @examples
#' data(L_mean)
#' cutoff <- fdr_cut(L_mean, alpha = 0.1)
#' @export
fdr_cut = function(PIP_vec, alpha = 0.1){
  p = sort(1 - PIP_vec)
  thres = cumsum(p)/seq_len(length(p))
  k = which.max(thres >= alpha)
  cutoff = 1 - p[k-1]
  return(cutoff)
}





#' Trace plot of BDMMA output
#' @param trace A data.frame named "trace" contained in the output of function BDMMA
#' @param param A character vector including the parameters' name for trace_plot
#' @param col A string defining the color of trace plot (default color is black)
#' @return The function returns a list containing plot objects of parameters' trace plot.
#' @examples
#' require(SummarizedExperiment)
#' data(Microbiome_dat)
#' ## (not run)
#' ## output <- BDMMA(Microbiome_dat, burn_in = 3000, sample_period = 3000)
#' ## figure <- trace_plot(output$trace, param = c("alpha_1", "beta1_10"))
#' ## print(figure)
#' @export
#'
trace_plot = function(trace, param, col = "black"){
  for (i in param){
    if (!(i %in% names(trace))){
      stop(paste("The parameter", i, "is not in the list!"))
    }
  }
  param_trace = list()
  k = 1

  for (j in param){
    trace_j = trace[, names(trace) == j]
    trace_j = data.frame(iter = seq_len(nrow(trace)), value = trace_j)
    g = ggplot(aes_string(x = 'iter', y = 'value'), data = trace_j) +
      geom_line(colour = col) + ggtitle(paste("The trace plot of parameter", j)) +
      theme(plot.title = element_text(hjust = 0.5),title = element_text(size = 12)) +
      theme(legend.text = element_text(size = 13)) +
      theme(legend.title = element_text(size = 13)) +
      theme(axis.title = element_text(size = 13)) +
      theme(axis.text = element_text(size = 12))
    param_trace[[k]] = g
    k = k + 1
  }
  return(param_trace)
}

Try the BDMMAcorrect package in your browser

Any scripts or data that you put into this service are public.

BDMMAcorrect documentation built on Nov. 8, 2020, 5:50 p.m.