R/sim.R

#' @title Esophageal microbiome data
#'
#' @description A list with 5 objects, including \code{gamma}, the parameter vector for a
#' Dirichlet distribution. Estimated from the esophageal data set described in
#' Pei Z et al.: Bacterial biota in the human distal esophagus. Proc Natl Acad Sci U S A 2004,
#' 101:4250-4255. Estimated using package \code{dirmult}.
#'
#' @details
#' \itemize{
#'    \item gamma. Parameter vector for Dirichlet distribution
#'    \item pi. Mean vector for Dirichlet distribution
#'    \item loglik. Final log-likelihood value
#'    \item ite. Number of iterations used
#'    \item theta. Estimated theta-value
#' }
#' @docType data
#' @name dm.fit.eso
#' @format A list with five objects
NULL

#' @title Lung microbiome data
#'
#' @description A list with 5 objects, including \code{gamma}, the parameter vector for a
#' Dirichlet distribution. Estimated from the (non-smoker) lung data set described in
#' Charlson ES et al.: Topographical continuity of bacterial populations in the healthy
#' human respiratory tract. Am J Respir Crit Care Med 2011, 184:957-963.
#' Estimated using package \code{dirmult}.
#'
#' @details
#' \itemize{
#'    \item gamma. Parameter vector for Dirichlet distribution
#'    \item pi. Mean vector for Dirichlet distribution
#'    \item loglik. Final log-likelihood value
#'    \item ite. Number of iterations used
#'    \item theta. Estimated theta-value
#' }
#' @docType data
#' @name dm.fit.lung
#' @format A list with five objects
NULL

#' @title Dirichlet-multinomial case/control counts simulation
#'
#' @description Simulates counts according to a Dirichlet-multinomial model, according
#' to model parameters given by object dm.fit.
#'
#' @details Simulates two sets of counts (case/control), where one set is generated
#' according to the parameters in object \code{dm.fit}, and and the second set is
#' generated such that \code{Nshuffle} number of high abundance features are
#' shuffled with \code{Nshuffle} number of low abundance features, generating a final
#' data set with a large number of differences between case and control samples for
#' large enough \code{Nshuffle}. Requires package \code{dirmult} to generate samples.
#'
#' @param dm.fit \code{dirmult} object. Alternatively, a list object with at least \code{gamma} object,
#' corresponding to parameters of a Dirichlet distribution
#' @param params list of parameters describing the number the mean and standard deviation of
#'    library sizes for case and control samples, as well as the number of samples of each
#' @param Nshuffle number of features to shuffle
#' @return List with objects \code{counts}, \code{i.spike}, \code{groups}: a counts table, feature
#'    names which have been shuffled, and group membership (case/control), respectively
#' @seealso \code{\link{sim.counts}} for simulation of a single group of samples
#' @examples
#'    library(HMP)
#'    library(dirmult)
#'    S <- sim.groups(dm.fit.eso)
#'    S$eb <- eb.pseudo(S$counts)
#'    S$gt <- gt.pseudo(S$counts)
#'    round(head(S$eb),4)
#'
#'    S <- sim.groups(dm.fit.lung, Nshuffle = 50,
#'         params = list(mu1 = 2000, sd1 = 200, N1 = 20,
#'                       mu2 = 6000, sd2 = 600, N2 = 20))
#'    S$eb <- eb.pseudo(S$counts)
#'    S$gt <- gt.pseudo(S$counts)
#'    round(head(S$gt),4)
#' @export
sim.groups <- function(dm.fit, Nshuffle = 15,
                       params = list(mu1 = 300, sd1 = 50, N1 = 20,
                                     mu2 = 300, sd2 = 50, N2 = 20)) {

  if (Nshuffle > length(dm.fit$gamma) / 2) {
    stop("Trying to shuffle more than half of all features! Reduce Nshuffle.")
  }

  # get library sizes for each sample
  ls1 <- round(rnorm(n = params$N1, mean = params$mu1, sd = params$sd1))
  ls2 <- round(rnorm(n = params$N2, mean = params$mu2, sd = params$sd2))

  # generate control samples
  S.control <- sim.counts(dm.fit, ls = ls1)

  # if case samples are generated by shuffling features
  # shuffle high abundance features with low abundance features
  rs <- rowSums(S.control$counts)
  rn <- names(sort(rs))
  k <- length(rn) - Nshuffle * 2
  i <- rn[sample(1:k, Nshuffle)]
  j <- rn[sample((k + 1):length(rn), Nshuffle)]

  S.case <- sim.counts(dm.fit, ls = ls2)
  temp <- S.case$counts[i, ]
  S.case$counts[i, ] <- S.case$counts[j, ]
  S.case$counts[j, ] <- temp
  i.spike <- c(i, j)

  # combine case and control counts
  counts <- cbind(S.control$counts, S.case$counts)

  # remove empty rows
  k <- which(rowSums(counts) == 0)
  if (length(k) > 0) {
    counts <- counts[-k, ]
    i.spike <- i.spike[which(i.spike %in% rownames(counts))]
  }

  colnames(counts) <- paste('Sample', 1:ncol(counts), sep='_')

  S <- list(counts = counts, i.spike = i.spike,
            groups = factor(rep(1:2, c(ncol(S.control$counts),
                                       ncol(S.case$counts)))))
  S
}

#' @title Dirichlet-multinomial counts simulation
#'
#' @description Simulates counts according to a Dirichlet-multinomial model, according
#' to model parameters given by object dm.fit.
#'
#' @details Simulates a single set of counts, generated according to the
#' parameters in object \code{dm.fit}. Requires package \code{dirmult} to generate samples.
#'
#' @param dm.fit \code{dirmult} object. Alternatively, a list object with at least \code{gamma} object,
#' corresponding to parameters of a Dirichlet distribution
#' @param ls vector of library sizes of samples to be generated
#' @param remove.zero.rows whether or not to remove rows of all zeroes
#' @export
sim.counts <- function(dm.fit, ls, remove.zero.rows = FALSE) {
  counts <- t(HMP::Dirichlet.multinomial(ls, dm.fit$gamma))
  if (remove.zero.rows == TRUE) {
    k <- which(rowSums(counts) == 0)
    if (length(k) > 0) {counts <- counts[-k, ]}
  }

  rownames(counts) <- paste("OTU", 1:nrow(counts), sep = "_")
  colnames(counts) <- paste("Sample", 1:ncol(counts), sep = "_")
  list(counts = counts, dm.fit = dm.fit)
}
ldiao/Gimp documentation built on May 20, 2019, 11:29 p.m.