#' @title Esophageal microbiome data
#'
#' @description A list with 5 objects, including \code{gamma}, the parameter vector for a
#' Dirichlet distribution. Estimated from the esophageal data set described in
#' Pei Z et al.: Bacterial biota in the human distal esophagus. Proc Natl Acad Sci U S A 2004,
#' 101:4250-4255. Estimated using package \code{dirmult}.
#'
#' @details
#' \itemize{
#' \item gamma. Parameter vector for Dirichlet distribution
#' \item pi. Mean vector for Dirichlet distribution
#' \item loglik. Final log-likelihood value
#' \item ite. Number of iterations used
#' \item theta. Estimated theta-value
#' }
#' @docType data
#' @name dm.fit.eso
#' @format A list with five objects
NULL
#' @title Lung microbiome data
#'
#' @description A list with 5 objects, including \code{gamma}, the parameter vector for a
#' Dirichlet distribution. Estimated from the (non-smoker) lung data set described in
#' Charlson ES et al.: Topographical continuity of bacterial populations in the healthy
#' human respiratory tract. Am J Respir Crit Care Med 2011, 184:957-963.
#' Estimated using package \code{dirmult}.
#'
#' @details
#' \itemize{
#' \item gamma. Parameter vector for Dirichlet distribution
#' \item pi. Mean vector for Dirichlet distribution
#' \item loglik. Final log-likelihood value
#' \item ite. Number of iterations used
#' \item theta. Estimated theta-value
#' }
#' @docType data
#' @name dm.fit.lung
#' @format A list with five objects
NULL
#' @title Dirichlet-multinomial case/control counts simulation
#'
#' @description Simulates counts according to a Dirichlet-multinomial model, according
#' to model parameters given by object dm.fit.
#'
#' @details Simulates two sets of counts (case/control), where one set is generated
#' according to the parameters in object \code{dm.fit}, and and the second set is
#' generated such that \code{Nshuffle} number of high abundance features are
#' shuffled with \code{Nshuffle} number of low abundance features, generating a final
#' data set with a large number of differences between case and control samples for
#' large enough \code{Nshuffle}. Requires package \code{dirmult} to generate samples.
#'
#' @param dm.fit \code{dirmult} object. Alternatively, a list object with at least \code{gamma} object,
#' corresponding to parameters of a Dirichlet distribution
#' @param params list of parameters describing the number the mean and standard deviation of
#' library sizes for case and control samples, as well as the number of samples of each
#' @param Nshuffle number of features to shuffle
#' @return List with objects \code{counts}, \code{i.spike}, \code{groups}: a counts table, feature
#' names which have been shuffled, and group membership (case/control), respectively
#' @seealso \code{\link{sim.counts}} for simulation of a single group of samples
#' @examples
#' library(HMP)
#' library(dirmult)
#' S <- sim.groups(dm.fit.eso)
#' S$eb <- eb.pseudo(S$counts)
#' S$gt <- gt.pseudo(S$counts)
#' round(head(S$eb),4)
#'
#' S <- sim.groups(dm.fit.lung, Nshuffle = 50,
#' params = list(mu1 = 2000, sd1 = 200, N1 = 20,
#' mu2 = 6000, sd2 = 600, N2 = 20))
#' S$eb <- eb.pseudo(S$counts)
#' S$gt <- gt.pseudo(S$counts)
#' round(head(S$gt),4)
#' @export
sim.groups <- function(dm.fit, Nshuffle = 15,
params = list(mu1 = 300, sd1 = 50, N1 = 20,
mu2 = 300, sd2 = 50, N2 = 20)) {
if (Nshuffle > length(dm.fit$gamma) / 2) {
stop("Trying to shuffle more than half of all features! Reduce Nshuffle.")
}
# get library sizes for each sample
ls1 <- round(rnorm(n = params$N1, mean = params$mu1, sd = params$sd1))
ls2 <- round(rnorm(n = params$N2, mean = params$mu2, sd = params$sd2))
# generate control samples
S.control <- sim.counts(dm.fit, ls = ls1)
# if case samples are generated by shuffling features
# shuffle high abundance features with low abundance features
rs <- rowSums(S.control$counts)
rn <- names(sort(rs))
k <- length(rn) - Nshuffle * 2
i <- rn[sample(1:k, Nshuffle)]
j <- rn[sample((k + 1):length(rn), Nshuffle)]
S.case <- sim.counts(dm.fit, ls = ls2)
temp <- S.case$counts[i, ]
S.case$counts[i, ] <- S.case$counts[j, ]
S.case$counts[j, ] <- temp
i.spike <- c(i, j)
# combine case and control counts
counts <- cbind(S.control$counts, S.case$counts)
# remove empty rows
k <- which(rowSums(counts) == 0)
if (length(k) > 0) {
counts <- counts[-k, ]
i.spike <- i.spike[which(i.spike %in% rownames(counts))]
}
colnames(counts) <- paste('Sample', 1:ncol(counts), sep='_')
S <- list(counts = counts, i.spike = i.spike,
groups = factor(rep(1:2, c(ncol(S.control$counts),
ncol(S.case$counts)))))
S
}
#' @title Dirichlet-multinomial counts simulation
#'
#' @description Simulates counts according to a Dirichlet-multinomial model, according
#' to model parameters given by object dm.fit.
#'
#' @details Simulates a single set of counts, generated according to the
#' parameters in object \code{dm.fit}. Requires package \code{dirmult} to generate samples.
#'
#' @param dm.fit \code{dirmult} object. Alternatively, a list object with at least \code{gamma} object,
#' corresponding to parameters of a Dirichlet distribution
#' @param ls vector of library sizes of samples to be generated
#' @param remove.zero.rows whether or not to remove rows of all zeroes
#' @export
sim.counts <- function(dm.fit, ls, remove.zero.rows = FALSE) {
counts <- t(HMP::Dirichlet.multinomial(ls, dm.fit$gamma))
if (remove.zero.rows == TRUE) {
k <- which(rowSums(counts) == 0)
if (length(k) > 0) {counts <- counts[-k, ]}
}
rownames(counts) <- paste("OTU", 1:nrow(counts), sep = "_")
colnames(counts) <- paste("Sample", 1:ncol(counts), sep = "_")
list(counts = counts, dm.fit = dm.fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.