Nothing
winsor.fun <- function(Y, quan, feature.dat.type) {
if (feature.dat.type == 'count') {
N <- colSums(Y)
P <- t(t(Y) / N)
cut <- apply(P, 1, quantile, quan)
Cut <- matrix(rep(cut, ncol(Y)), nrow(Y))
ind <- P > Cut
P[ind] <- Cut[ind]
Y <- round(t(t(P) * N))
}
if (feature.dat.type == 'proportion') {
cut <- apply(Y, 1, quantile, quan)
Cut <- matrix(rep(cut, ncol(Y)), nrow(Y))
ind <- Y > Cut
Y[ind] <- Cut[ind]
}
return(Y)
}
#' Linear (Lin) model for differential abundance (DA) analysis
#'
#' The function implements a simple, robust and highly scalable approach to tackle
#' the compositional effects in differential abundance analysis. It fits linear regression models
#' on the centered log2-ratio transformed data, identifies a bias term due to the transformation
#' and compositional effect, and corrects the bias using the mode of the regression coefficients.
#' It could fit mixed-effect models.
#'
#' @param otu.tab data frame or matrix representing observed OTU table. Row: taxa; column: samples.
#' NAs are not expected in OTU tables so are not allowed in function \code{linda}.
#' @param meta data frame of covariates. The rows of \code{meta} correspond to the columns of \code{otu.tab}.
#' NAs are allowed. If there are NAs, the corresponding samples will be removed in the analysis.
#' @param formula character. For example: \code{formula = '~x1*x2+x3+(1|id)'}. At least one fixed effect is required.
#' @param adaptive TRUE or FALSE. Default is TRUE. If TRUE, the parameter \code{imputation} will be treated as FALSE no matter
#' what it is actually set to be. Then the significant correlations between the sequencing depth and explanatory variables will be tested
#' via the linear regression between the log of the sequencing depths and \code{formula}. If any p-value is smaller than or equal to
#' \code{corr.cut}, the imputation approach will be used; otherwise, the pseudo-count approach will be used.
#' The information of whether the imputation or pseudo-count approach is used will be printed.
#' @param imputation TRUE or FALSE. Default is FALSE. If TRUE, then we use the imputation approach, i.e., zeros in \code{otu.tab} will be
#' imputed using the formula in the referenced paper.
#' @param pseudo.cnt a positive real value. Default is 0.5. If \code{adaptive} and \code{imputation} are both FALSE,
#' then we use the pseudo-count approach, i.e., we add \code{pseudo.cnt} to each value in \code{otu.tab}.
#' @param corr.cut a real value between 0 and 1; significance level of correlations between the sequencing depth and
#' explanatory variables. Default is 0.1.
#' @param p.adj.method character; p-value adjusting approach. See R function \code{p.adjust}. Default is 'BH'.
#' @param alpha a real value between 0 and 1; significance level of differential abundance. Default is 0.05.
#' @param prev.cut a real value between 0 and 1; taxa with prevalence (percentage of nonzeros)
#' less than prev.cut are excluded. Default is 0 (no taxa will be excluded).
#' @param lib.cut a non-negative real value; samples with less than \code{lib.cut} read counts are excluded.
#' Default is 1 (no samples will be excluded).
#' @param winsor.quan a real value between 0 and 1; winsorization cutoff (quantile) for \code{otu.tab}, e.g., 0.97. Default is NULL.
#' If NULL, winsorization process will not be conducted.
#' @param n.cores a positive integer. If \code{n.cores > 1} and formula is in a form of mixed-effect model,
#' \code{n.cores} parallels will be conducted. Default is 1.
#'
#' @return A list with the elements
#' \item{variables}{A vector of variable names of all fixed effects in \code{formula}. For example: \code{formula = '~x1*x2+x3+(1|id)'}.
#' Suppose \code{x1} and \code{x2} are numerical, and \code{x3} is a categorical variable of three levels: a, b and c.
#' Then the elements of \code{variables} would be \code{('x1', 'x2', 'x3b', 'x3c', 'x1:x2')}.}
#' \item{bias}{numeric vector; each element corresponds to one variable in \code{variables};
#' the estimated bias of the regression coefficients due to the compositional effect.}
#' \item{output}{a list of data frames with columns 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj', 'reject',
#' 'df'; \code{names(output)} is equal to \code{variables}; the rows of the data frame corresponds to taxa.
#' Note: if there are taxa being excluded due to \code{prev.cut}, the number of the rows of the output data frame
#' will be not equal to the number of the rows of \code{otu.tab}. Taxa are identified by the rownames.
#' If the rownames of \code{otu.tab} are NULL, then \code{1 : nrow(otu.tab)} is set as the rownames of \code{otu.tab}.
#' \itemize{
#' \item{baseMean:}{ 2 to the power of the intercept coefficients (normalized by one million)}
#' \item{log2FoldChange:}{ bias-corrected coefficients}
#' \item{lfcSE:}{ standard errors of the coefficients}
#' \item{stat:}{ \code{log2FoldChange / lfcSE}}
#' \item{pvalue:}{ \code{2 * pt(-abs(stat), df)}}
#' \item{padj:}{ \code{p.adjust(pvalue, method = p.adj.method)}}
#' \item{reject:}{ \code{padj <= alpha}}
#' \item{df:}{ degrees of freedom. The number of samples minus the number of explanatory variables (intercept included) for
#' fixed-effect models; estimates from R package \code{lmerTest} with Satterthwaite method of approximation for mixed-effect models.}
#' }}
#' \item{covariance}{a list of data frames; the data frame records the covariances between a regression coefficient with other coefficients;
#' \code{names(covariance)} is equal to \code{variables}; the rows of the data frame corresponds to taxa. If the length of \code{variables}
#' is equal to 1, then the \code{covariance} is NULL.}
#' \item{otu.tab.use}{the OTU table used in the abundance analysis (the \code{otu.tab} after the preprocessing:
#' samples that have NAs in the variables in \code{formula} or have less than \code{lib.cut} read counts are removed;
#' taxa with prevalence less than \code{prev.cut} are removed and data is winsorized if \code{!is.null(winsor.quan)};
#' and zeros are treated, i.e., imputed or pseudo-count added).}
#' \item{meta.use}{the meta data used in the abundance analysis (only variables in \code{formula} are stored; samples that have NAs
#' or have less than \code{lib.cut} read counts are removed; numerical variables are scaled).}
#' \item{wald}{a list for use in Wald test. If the fitting model is a linear model, then it includes
#' \itemize{
#' \item{beta:}{ a matrix of the biased regression coefficients including the intercept and all fixed effects; the columns correspond to taxa}
#' \item{sig:}{ the standard errors; the elements corresponding to taxa}
#' \item{X:}{ the design matrix of the fitting model}
#' \item{bias:}{ the estimated biases of the regression coefficients including intercept and all fixed effects}
#' }
#' If the fitting model is a linear mixed-effect model, then it includes
#' \itemize{
#' \item{beta:}{ a matrix of the biased regression coefficients including intercept and all fixed effects; the culumns correspond to taxa}
#' \item{beta.cov:}{ a list of covariance matrices of \code{beta}; the elements corresponding to taxa}
#' \item{rand.cov:}{ a list with covariance matrices of variance parameters of random effects; the elements corresponding to taxa; see more details in the paper of 'lmerTest'}
#' \item{Joc.beta.cov.rand:} { a list of a list of Jacobian matrices of \code{beta.cov} with respect to the variance parameters; the elements corresponding to taxa}
#' \item{bias:}{ the estimated biases of the regression coefficients including intercept and all fixed effects}
#' }}
#'
#' @author Huijuan Zhou \email{huijuanzhou2019@gmail.com}
#' Jun Chen \email{Chen.Jun2@mayo.edu}
#' Maintainer: Huijuan Zhou, Jun Chen
#' @references Huijuan Zhou, Kejun He, Jun Chen, and Xianyang Zhang. LinDA: Linear Models for Differential Abundance
#' Analysis of Microbiome Compositional Data.
#' @importFrom modeest mlv
#' @importFrom lmerTest lmer
#' @importFrom lmerTest as_lmerModLmerTest
#' @import foreach
#' @import parallel
#' @examples
#'
#' #install package "phyloseq" for importing "smokers" dataset
#' ind <- smokers$meta$AIRWAYSITE == 'Throat'
#' otu.tab <- as.data.frame(smokers$otu[, ind])
#' meta <- cbind.data.frame(Smoke = factor(smokers$meta$SMOKER[ind]),
#' Sex = factor(smokers$meta$SEX[ind]),
#' Site = factor(smokers$meta$SIDEOFBODY[ind]),
#' SubjectID = factor(smokers$meta$HOST_SUBJECT_ID[ind]))
#' ind1 <- which(meta$Site == 'Left')
#' res.left <- linda(otu.tab[, ind1], meta[ind1, ], formula = '~Smoke+Sex', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' ind2 <- which(meta$Site == 'Right')
#' res.right <- linda(otu.tab[, ind2], meta[ind2, ], formula = '~Smoke+Sex', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' rownames(res.left$output[[1]])[which(res.left$output[[1]]$reject)]
#' rownames(res.right$output[[1]])[which(res.right$output[[1]]$reject)]
#'
#' linda.obj <- linda(otu.tab, meta, formula = '~Smoke+Sex+(1|SubjectID)', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' linda.plot(linda.obj, c('Smokey', 'Sexmale'),
#' titles = c('Smoke: n v.s. y', 'Sex: female v.s. male'), alpha = 0.1, lfc.cut = 1,
#' legend = TRUE, directory = NULL, width = 11, height = 8)
#'
#' @export
linda <- function(feature.dat, meta.dat, formula, feature.dat.type = c('count', 'proportion'),
prev.filter = 0, mean.abund.filter = 0, max.abund.filter = 0,
is.winsor = TRUE, outlier.pct = 0.03,
adaptive = TRUE, zero.handling = c('pseudo-count', 'imputation'),
pseudo.cnt = 0.5, corr.cut = 0.1,
p.adj.method = 'BH', alpha = 0.05,
n.cores = 1, verbose = TRUE) {
feature.dat.type <- match.arg(feature.dat.type)
# if (!is.null(phyloseq.obj)) {
# feature.dat.type <- 'count'
#
# feature.dat <- otu_table(phyloseq.obj)@.Data
# meta.dat <- data.frame(sample_data(phyloseq.obj))
# }
if(any(is.na(feature.dat))) {
stop('The feature table contains NAs! Please remove!\n')
}
allvars <- all.vars(as.formula(formula))
Z <- as.data.frame(meta.dat[, allvars])
###############################################################################
# Filter sample
keep.sam <- which(rowSums(is.na(Z)) == 0)
Y <- feature.dat[, keep.sam]
Z <- as.data.frame(Z[keep.sam, ])
names(Z) <- allvars
# Filter features
temp <- t(t(Y) / colSums(Y))
keep.tax <- rowMeans(temp != 0) >= prev.filter & rowMeans(temp) >= mean.abund.filter & rowMaxs(temp) >= max.abund.filter
names(keep.tax) <- rownames(Y)
rm(temp)
if (verbose) cat(sum(!keep.tax), ' features are filtered!\n')
Y <- Y[keep.tax, ]
n <- ncol(Y)
m <- nrow(Y)
## some samples may have zero total counts after screening taxa
if(any(colSums(Y) == 0)) {
ind <- which(colSums(Y) > 0)
Y <- Y[, ind]
Z <- as.data.frame(Z[ind, ])
names(Z) <- allvars
keep.sam <- keep.sam[ind]
n <- ncol(Y)
}
if (verbose) cat('The filtered data has ', n, ' samples and ', m, ' features will be tested!\n' )
if (sum(rowSums(Y != 0) <= 2) != 0) {
warning('Some features have less than 3 nonzero values!
They have virtually no statistical power. You may consider filtering them in the analysis!\n')
}
###############################################################################
## scaling numerical variables
ind <- sapply(1 : ncol(Z), function(i) is.numeric(Z[, i]))
Z[, ind] <- scale(Z[, ind])
## winsorization
if(is.winsor) {
Y <- winsor.fun(Y, 1 - outlier.pct, feature.dat.type)
}
##
if(grepl('\\(', formula)) {
random.effect <- TRUE
} else {
random.effect <- FALSE
}
if(is.null(rownames(feature.dat))) {
taxa.name <- (1 : nrow(feature.dat))[keep.tax]
} else {
taxa.name <- rownames(feature.dat)[keep.tax]
}
if(is.null(rownames(meta.dat))) {
samp.name <- (1 : nrow(meta.dat))[keep.sam]
} else {
samp.name <- rownames(meta.dat)[keep.sam]
}
## handling zeros
if (feature.dat.type == 'count') {
if(any(Y == 0)) {
N <- colSums(Y)
if(adaptive) {
logN <- log(N)
if(random.effect) {
tmp <- lmer(as.formula(paste0('logN', formula)), Z)
} else {
tmp <- lm(as.formula(paste0('logN', formula)), Z)
}
corr.pval <- coef(summary(tmp))[-1, "Pr(>|t|)"]
if(any(corr.pval <= corr.cut)) {
if (verbose) cat('Imputation approach is used.\n')
zero.handling <- "Imputation"
} else {
if (verbose) cat('Pseudo-count approach is used.\n')
zero.handling <- "Pseudo-count"
}
}
if(zero.handling == 'imputation') {
N.mat <- matrix(rep(N, m), nrow = m, byrow = TRUE)
N.mat[Y > 0] <- 0
tmp <- N[max.col(N.mat)]
Y <- Y + N.mat / tmp
} else {
Y <- Y + pseudo.cnt
}
}
}
if (feature.dat.type == 'proportion') {
if(any(Y == 0)) {
# Half minimum approach
Y <- t(apply(Y, 1, function (x) {
x[x == 0] <- 0.5 * min(x[x != 0])
return(x)
}))
colnames(Y) <- samp.name
rownames(Y) <- taxa.name
}
}
## CLR transformation
logY <- log2(Y)
W <- t(logY) - colMeans(logY)
## linear regression
# oldw <- getOption('warn')
# options(warn = -1)
if(!random.effect) {
if (verbose) cat("Fit linear models ...\n")
suppressMessages(fit <- lm(as.formula(paste0('W', formula)), Z))
res <- do.call(rbind, coef(summary(fit)))
d <- ncol(model.matrix(fit))
df <- rep(n - d, m)
tmp <- vcov(fit)
res.cov <- foreach(i = 1 : m) %do% {tmp[((i-1)*d+1) : (i*d), ((i-1)*d+1) : (i*d)]}
wald <- list(beta = coef(fit), sig = sigma(fit), X = model.matrix(fit))
res.cov <- do.call(rbind, res.cov)
rownames(res.cov) <- rownames(res)
colnames(res.cov) <- rownames(res)[1 : d]
} else {
if (verbose) cat("Fit linear mixed effects models ...\n")
fun <- function(i) {
w <- W[, i]
suppressMessages(fit <- lmer(as.formula(paste0('w', formula)), Z))
a <- as_lmerModLmerTest(fit)
rand.cov <- a@vcov_varpar
Jac.beta.cov.rand <- a@Jac_list
list(coef(summary(fit)), vcov(fit), rand.cov, Jac.beta.cov.rand)
}
if(n.cores > 1) {
tmp <- mclapply(c(1 : m), function(i) fun(i), mc.cores = n.cores)
} else {
suppressMessages(tmp <- foreach(i = 1 : m) %do% fun(i))
}
res <- do.call(rbind, lapply(tmp, `[[`, 1))
res.cov <- do.call(rbind, lapply(tmp, `[[`, 2))
wald <- list(beta = do.call(cbind, lapply(lapply(tmp, `[[`, 1), function(x)x[,1])),
beta.cov = lapply(tmp, `[[`, 2),
rand.cov = lapply(tmp, `[[`, 3),
Jac.beta.cov.rand = lapply(tmp, `[[`, 4))
}
# options(warn = oldw)
res.intc <- res[which(rownames(res) == '(Intercept)'), ]
rownames(res.intc) <- NULL
# options(warn = -1)
suppressMessages(bias.intc <- mlv(sqrt(n) * res.intc[, 1],
method = 'meanshift', kernel = 'gaussian') / sqrt(n))
# options(warn = oldw)
baseMean <- 2 ^ (res.intc[, 1] - bias.intc)
baseMean <- baseMean / sum(baseMean) * 1e6
output.fun <- function(x) {
res.voi <- res[which(rownames(res) == x), ]
rownames(res.voi) <- NULL
if(random.effect) {
df <- res.voi[, 3]
}
log2FoldChange <- res.voi[, 1]
lfcSE <- res.voi[, 2]
# oldw <- getOption('warn')
# options(warn = -1)
suppressMessages(bias <- mlv(sqrt(n) * log2FoldChange,
method = 'meanshift', kernel = 'gaussian') / sqrt(n))
# options(warn = oldw)
log2FoldChange <- log2FoldChange - bias
stat <- log2FoldChange / lfcSE
pvalue <- 2 * pt(-abs(stat), df)
padj <- p.adjust(pvalue, method = p.adj.method)
reject <- padj <= alpha
output <- cbind.data.frame(baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, reject, df)
rownames(output) <- taxa.name
return(list(bias = bias, output = output))
}
cov.fun <- function(x) {
tmp <- (1 : ncol(res.cov))[-c(1, which(colnames(res.cov) == x))]
covariance <- as.data.frame(as.matrix(res.cov[which(rownames(res.cov) == x), tmp]))
rownames(covariance) <- taxa.name
colnames(covariance) <- colnames(res.cov)[tmp]
return(covariance)
}
variables <- unique(rownames(res))[-1]
variables.n <- length(variables)
bias <- rep(NA, variables.n)
output <- list()
if(variables.n == 1) {
covariance <- NULL
} else {
covariance <- list()
}
for(i in 1 : variables.n) {
tmp <- output.fun(variables[i])
output[[i]] <- tmp[[2]]
bias[i] <- tmp[[1]]
if(variables.n > 1) {
covariance[[i]] <- cov.fun(variables[i])
}
}
names(output) <- variables
if(variables.n > 1) {
names(covariance) <- variables
}
rownames(Y) <- taxa.name
colnames(Y) <- samp.name
rownames(Z) <- samp.name
wald[['bias']] <- c(bias.intc, bias)
if (verbose) cat("Completed.\n")
return(list(variables = variables, bias = bias, output = output, covariance = covariance, feature.dat.use = Y, meta.dat.use = Z, wald = wald))
}
#' Wald test for bias-corrected regression coefficient
#'
#' The function implements Wald test for bias-corrected regression coefficient learned from the \code{linda} function.
#' One can utilize the function to perform ANOVA-type analyses.
#'
#' @param linda.obj return from the \code{linda} function.
#' @param L A matrix for testing \code{Lb = 0}, where \code{b} includes the intercept and all fixed effects. Thus the number of columns of
#' L must be equal to \code{length(variables)+1}, where \code{variables} is from \code{linda.obj}, which does not include the intercept.
#' @param model \code{'LM'} or \code{'LMM'} indicating the model fitted in \{linda\} is linear model or linear mixed-effect model.
#' @param alpha significance level for testing \code{Lb = 0}.
#' @param p.adj.method P-value adjusting approach. See R function \code{p.adjust}. The default is 'BH'.
#'
#' @return A data frame with columns
#' \item{Fstat}{Wald statistics for each taxon}
#' \item{df1}{The numerator degrees of freedom}
#' \item{df2}{The denominator degrees of freedom}
#' \item{pvalue}{ \code{1 - pf(Fstat, df1, df2)}}
#' \item{padj}{ \code{p.adjust(pvalue, method = p.adj.method)}}
#' \item{reject}{ \code{padj <= alpha}}
#'
#' @author Huijuan Zhou \email{huijuanzhou2019@gmail.com}
#' Jun Chen \email{Chen.Jun2@mayo.edu}
#' Xianyang Zhang \email{zhangxiany@stat.tamu.edu}
#' @references Huijuan Zhou, Kejun He, Jun Chen, and Xianyang Zhang. LinDA: Linear Models for Differential Abundance
#' Analysis of Microbiome Compositional Data.
#' @examples
#'
#' #install package "phyloseq" for importing "smokers" dataset
#' ind <- smokers$meta$AIRWAYSITE == 'Throat'
#' otu.tab <- as.data.frame(smokers$otu[, ind])
#' meta <- cbind.data.frame(Smoke = factor(smokers$meta$SMOKER[ind]),
#' Sex = factor(smokers$meta$SEX[ind]),
#' Site = factor(smokers$meta$SIDEOFBODY[ind]),
#' SubjectID = factor(smokers$meta$HOST_SUBJECT_ID[ind]))
#' linda.obj <- linda(otu.tab, meta, formula = '~Smoke+Sex+(1|SubjectID)', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' L <- matrix(c(0, 1, 0, 0, 0, 1), nrow = 2, byrow = TRUE)
#' result <- linda.wald.test(linda.obj, L, 'LMM', alpha = 0.1)
#'
#' @export
linda.wald.test <- function(linda.obj, L, model = c('LM', 'LMM'), alpha = 0.05, p.adj.method = 'BH') {
r <- qr(L)$rank
wald <- linda.obj$wald
bias <- wald$bias
beta <- wald$beta - bias
n <- ncol(linda.obj$feature.dat.use)
m <- nrow(linda.obj$feature.dat.use)
if(model == 'LM') {
p <- nrow(beta)
X <- wald$X
Lbeta.cov.inv <- solve(L %*% solve(t(X)%*% X) %*% t(L))
Lbeta <- L %*% beta
Fstat <- sapply(1 : m, function(i) {
lbeta <- Lbeta[, i]
t(lbeta) %*% Lbeta.cov.inv %*% lbeta / (wald$sig[i] ^ 2 * r)
})
df2 <- rep(n - p, m)
} else if(model == 'LMM') {
beta.cov <- wald$beta.cov
rand.cov <- wald$rand.cov
Jac.beta.cov.rand <- wald$Jac.beta.cov.rand
Lbeta <- L %*% beta
fun <- function(i) {
beta.cov.i <- beta.cov[[i]]
rand.cov.i <- rand.cov[[i]]
Jac.beta.cov.rand.i <- Jac.beta.cov.rand[[i]]
Lbeta.cov.i <- L %*% beta.cov.i %*% t(L)
if(nrow(L) == 1) {
d <- as.vector(Lbeta.cov.i)
g <- sapply(Jac.beta.cov.rand.i, function(x) L %*% x %*% t(L))
nu <- 2 * d ^ 2 / as.vector(t(g) %*% rand.cov.i %*% g)
Lbeta.cov.inv.i <- 1 / d
} else {
Lbeta.cov.i.eig <- eigen(Lbeta.cov.i)
P <- Lbeta.cov.i.eig$vectors
D <- Lbeta.cov.i.eig$values
PtL <- t(P) %*% L
nu <- rep(NA, r)
for(j in 1 : r) {
d <- D[j]
l <- PtL[j, ]
g <- sapply(Jac.beta.cov.rand.i, function(x) t(l) %*% x %*% l)
nu[j] <- 2 * d ^ 2 / as.vector(t(g) %*% rand.cov.i %*% g)
}
Lbeta.cov.inv.i <- P %*% diag(D ^ (-1)) %*% t(P)
}
tmp <- sum(nu / (nu - 2))
nu <- 2 * tmp / (tmp - r)
lbeta <- Lbeta[, i]
Fstat <- as.vector(t(lbeta) %*% Lbeta.cov.inv.i %*% lbeta) / r
df2 <- nu
return(c(Fstat, df2))
}
Fstat <- df2 <- rep(NA, m)
for (i in 1 : m) {
res <- fun(i)
Fstat[i] <- res[1]
df2[i] <- res[2]
}
}
df1 <- rep(r, m)
pvalue <- 1 - pf(Fstat, df1, df2)
padj <- p.adjust(pvalue, method = p.adj.method)
reject <- padj <= alpha
res <- cbind.data.frame(Fstat, df1, df2, pvalue, padj, reject)
rownames(res) <- rownames(linda.obj$otu.tab.use)
return(res)
}
#' Plot linda results
#'
#' The function plots the effect size plot and volcano plot based on the output from \code{linda}.
#'
#' @param linda.obj return from function \code{linda}.
#' @param variables.plot vector; variables whose results are to be plotted. For example, suppose the return
#' value \code{variables} is equal to \code{('x1', 'x2', 'x3b', 'x3c', 'x1:x2')}, then one could set \code{variables.plot = c('x3b', 'x1:x2')}.
#' @param titles vector; titles of the effect size plot and volcano plot for each variable in \code{variables.plot}.
#' Default is NULL. If NULL, the titles will be set as \code{variables.plot}.
#' @param alpha a real value between 0 and 1; cutoff for \code{padj}.
#' @param lfc.cut a positive value; cutoff for \code{log2FoldChange}.
#' @param legend TRUE or FALSE; whether to show the legends of the effect size plot and volcano plot.
#' @param directory character; the directory to save the figures, e.g., \code{getwd()}. Default is NULL. If NULL, figures will not be saved.
#' @param width the width of the graphics region in inches. See R function \code{pdf}.
#' @param height the height of the graphics region in inches. See R function \code{pdf}.
#'
#' @return A list of \code{ggplot2} objects.
#' \item{plot.lfc}{a list of effect size plots. Each plot corresponds to one variable in \code{variables.plot}.}
#' \item{plot.volcano}{a list of volcano plots. Each plot corresponds to one variable in \code{variables.plot}.}
#'
#' @author Huijuan Zhou \email{huijuanzhou2019@gmail.com}
#' Jun Chen \email{Chen.Jun2@mayo.edu}
#' Maintainer: Huijuan Zhou
#' @references Huijuan Zhou, Kejun He, Jun Chen, and Xianyang Zhang. LinDA: Linear Models for Differential Abundance
#' Analysis of Microbiome Compositional Data.
#' @import ggplot2
#' @import ggrepel
#' @examples
#'
#' #install package "phyloseq" for importing "smokers" dataset
#' ind <- smokers$meta$AIRWAYSITE == 'Throat'
#' otu.tab <- as.data.frame(smokers$otu[, ind])
#' meta <- cbind.data.frame(Smoke = factor(smokers$meta$SMOKER[ind]),
#' Sex = factor(smokers$meta$SEX[ind]),
#' Site = factor(smokers$meta$SIDEOFBODY[ind]),
#' SubjectID = factor(smokers$meta$HOST_SUBJECT_ID[ind]))
#' ind1 <- which(meta$Site == 'Left')
#' res.left <- linda(otu.tab[, ind1], meta[ind1, ], formula = '~Smoke+Sex', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' ind2 <- which(meta$Site == 'Right')
#' res.right <- linda(otu.tab[, ind2], meta[ind2, ], formula = '~Smoke+Sex', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' rownames(res.left$output[[1]])[which(res.left$output[[1]]$reject)]
#' rownames(res.right$output[[1]])[which(res.right$output[[1]]$reject)]
#'
#' linda.obj <- linda(otu.tab, meta, formula = '~Smoke+Sex+(1|SubjectID)', alpha = 0.1,
#' prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
#' linda.plot(linda.obj, c('Smokey', 'Sexmale'),
#' titles = c('Smoke: n v.s. y', 'Sex: female v.s. male'), alpha = 0.1, lfc.cut = 1,
#' legend = TRUE, directory = NULL, width = 11, height = 8)
#'
#' @export
linda.plot <- function(linda.obj, variables.plot, titles = NULL, alpha = 0.05, lfc.cut = 1,
legend = FALSE, directory = NULL, width = 11, height = 8) {
bias <- linda.obj$bias
output <- linda.obj$output
otu.tab <- linda.obj$feature.dat.use
meta <- linda.obj$meta.dat.use
variables <- linda.obj$variables
if(is.null(titles)) titles <- variables.plot
taxa <- rownames(otu.tab)
m <- length(taxa)
tmp <- match(variables, variables.plot)
voi.ind <- order(tmp)[1 : sum(!is.na(tmp))]
padj.mat <- foreach(i = voi.ind, .combine = 'cbind') %do% {
output[[i]]$padj
}
## effect size plot
if(is.matrix(padj.mat)) {
ind <- which(colSums(padj.mat <= alpha) > 0)
} else if(is.vector(padj.mat)) {
tmp <- which(padj.mat <= alpha)
if(length(tmp) > 0){
ind <- 1
} else {
ind <- integer(0)
}
}
if(length(ind) == 0) {
plot.lfc <- NULL
} else {
if(!is.null(directory)) pdf(paste0(directory, '/plot_lfc.pdf'),
width = width, height = height)
plot.lfc <- list()
j <- 1
Taxa <- Log2FoldChange <- Log10Padj <- NULL
for(i in ind) {
output.i <- output[[voi.ind[i]]]
bias.i <- bias[voi.ind[i]]
lfc <- output.i$log2FoldChange
lfcSE <- output.i$lfcSE
padj <- output.i$padj
ind.rej <- which(padj <= alpha)
n.rej <- length(ind.rej)
taxa.rej <- taxa[ind.rej]
taxa.rej <- factor(taxa.rej, levels = taxa.rej)
data.plot.lfc <- cbind.data.frame(Taxa = rep(taxa.rej, 2),
Log2FoldChange = c(lfc[ind.rej], lfc[ind.rej] + bias.i),
lfcSE = c(lfcSE[ind.rej], rep(NA, n.rej)),
bias = rep(c('Debiased', 'Non-debiased'), each = n.rej))
plot.lfc.i <- ggplot(data.plot.lfc, aes(x = Log2FoldChange, y = Taxa)) +
geom_point(aes(color = bias, shape = bias), size = 3) +
geom_errorbar(aes(xmin = Log2FoldChange - 1.96 * lfcSE,
xmax = Log2FoldChange + 1.96 * lfcSE), width = .2) +
geom_vline(xintercept = 0, color = 'gray', linetype = 'dashed') +
ggtitle(titles[i]) +
theme_bw(base_size = 18)
if(legend) {
plot.lfc.i <- plot.lfc.i +
theme(legend.title = element_blank(),
legend.key.width = unit(1, 'cm'), plot.margin = unit(c(1, 1, 1, 1.5), 'cm'))
} else {
plot.lfc.i <- plot.lfc.i +
theme(legend.position = 'none', plot.margin = unit(c(1, 1, 1, 1.5), 'cm'))
}
plot.lfc[[j]] <- plot.lfc.i
j <- j + 1
if(!is.null(directory)) print(plot.lfc.i)
}
if(!is.null(directory)) dev.off()
}
## volcano plot
plot.volcano <- list()
if(!is.null(directory)) pdf(paste0(directory, '/plot_volcano.pdf'),
width = width, height = height)
leg1 <- paste0('padj>', alpha, ' & ', 'lfc<=', lfc.cut)
leg2 <- paste0('padj>', alpha, ' & ', 'lfc>', lfc.cut)
leg3 <- paste0('padj<=', alpha, ' & ', 'lfc<=', lfc.cut)
leg4 <- paste0('padj<=', alpha, ' & ', 'lfc>', lfc.cut)
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1 : n]
}
color <- gg_color_hue(3)
for(i in 1 : length(voi.ind)) {
output.i <- output[[voi.ind[i]]]
bias.i <- bias[voi.ind[i]]
lfc <- output.i$log2FoldChange
padj <- output.i$padj
ind1 <- padj > alpha & abs(lfc) <= lfc.cut
ind2 <- padj > alpha & abs(lfc) > lfc.cut
ind3 <- padj <= alpha & abs(lfc) <= lfc.cut
ind4 <- padj <= alpha & abs(lfc) > lfc.cut
leg <- rep(NA, m)
leg[ind1] = leg1
leg[ind2] = leg2
leg[ind3] = leg3
leg[ind4] = leg4
leg <- factor(leg, levels = c(leg1, leg2, leg3, leg4))
taxa.sig <- rep('', m)
taxa.sig[ind3 | ind4] <- taxa[ind3 | ind4]
data.volcano <- cbind.data.frame(taxa = taxa.sig, Log2FoldChange = lfc,
Log10Padj = -log10(padj), leg = leg)
plot.volcano.i <- ggplot(data.volcano, aes(x = Log2FoldChange, y = Log10Padj)) +
geom_point(aes(color = leg), size = 2) +
geom_text_repel(aes(label = taxa), max.overlaps = Inf) +
scale_colour_manual(values = c('darkgray', color[c(2, 3, 1)])) +
geom_hline(aes(yintercept = -log10(alpha)), color = 'gray', linetype = 'dashed') +
geom_vline(aes(xintercept = -lfc.cut), color = 'gray', linetype = 'dashed') +
geom_vline(aes(xintercept = lfc.cut), color = 'gray', linetype = 'dashed') +
ylab('-Log10Padj') +
ggtitle(titles[i]) +
theme_bw(base_size = 18)
if(legend) {
plot.volcano.i <- plot.volcano.i +
theme(legend.title = element_blank(),
legend.key.width = unit(1, 'cm'), plot.margin = unit(c(1, 1, 1, 1.5), 'cm'))
} else {
plot.volcano.i <- plot.volcano.i +
theme(legend.position = 'none', plot.margin = unit(c(1, 1, 1, 1.5), 'cm'))
}
plot.volcano[[i]] <- plot.volcano.i
if(!is.null(directory)) print(plot.volcano.i)
}
if(!is.null(directory)) dev.off()
return(list(plot.lfc = plot.lfc, plot.volcano = plot.volcano))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.