R/simulateCounts.R

Defines functions simulateCounts

Documented in simulateCounts

#' @rdname simulateCounts
#' @title Simulate read counts of methylated and unmethylated cytosines
#' @description Auxiliary function to simulate read counts of methylated and
#'     unmethylated cytosines
#'@details Methylation coverages (minimum 10) are generated from a Negative
#'     Binomial distribution with function \code{\link[MASS]{rnegbin}} from R
#'     package MASS. This function uses the representation of the Negative
#'     Binomial distribution as a continuous mixture of Poisson distributions
#'     with Gamma distributed means. Prior methylation levels are randomly
#'     generated with beta distribution using \code{\link[stats]{Beta}}
#'     function from R package “stats” and posterior methylation levels are
#'     generated according Bayes' theorem. The read of methylation counts are
#'     obtained as the product of coverage by the posterior methylation level.
#'     
#'     Counts on regions are generated from a Negative Binomial distribution
#'     with function \code{\link[MASS]{rnegbin}} with mean mu and variance:
#'      mu + mu^2/theta. 
#' @param num.samples Number of samples to generate.
#' @param sites Number of cytosine sites for each sample.
#' @param alpha Alpha parameter of beta distribution. Parameter shape1 from
#'     \code{\link[stats]{Beta}} function.
#' @param beta Beta parameter of beta distribution. Parameter shape2 from
#'     \code{\link[stats]{Beta}} function.
#' @param size number of trials (11 or more). Expected cytosine coverage.
#' @param theta Parameter theta from \code{\link[MASS]{rnegbin}}
#'     (overdispersion parameter).
#' @param sample.ids Names for the samples.
#' @param chromosome A character string naming the chromosome. Default "1".
#' @param start An nteger vector with the start positions for each cytosine 
#'     site. Default start = NULL.
#' @param end An integer vector with the end position for each cytosine site.
#'     Default end = start. Notice that if end > start, each counts will cover
#'     a region.
#' @param strand One of the characters '*', '+', or '-' denoting the DNA strand.
#'     Default is '*'.
#' @param type One of the string 'on_sites' or 'on_regions'. Default is 
#'     'on_sites'. If type == 'on_sites', then the counts are intended on 
#'     single bases, otherwise the counts are covering regions.
#' @param regions,min_width,max_width,minCountPerIndv,minCountPerIndv,mu 
#'     Arguments to provide when type == 'on_regions':
#'     
#'     \describe{
#'         \item{regions}{Number of regions carrying counts}
#'         \item{min_width}{Minimum size for a region}
#'         \item{max_width}{Maximum size for a region}
#'         \item{minCountPerIndv}{Each region must have more than 
#'             'minCountPerIndv' counts (on average) per individual
#'             (if 'mu' is not provided)}
#'         \item{maxCountPerIndv}{Each region must have less than 
#'             'maxCountPerIndv' counts (on average) per individual (if 'mu' is
#'             not provided)}
#'         \item{mu}{The expected value of the data generated with Negative
#'             Binomial distribution. This a vector of means. Short vectors are
#'             recycled. Default is NULL and, in this case, simulation is
#'             performed with mu = seq(minCountPerIndv, maxCountPerIndv).}
#'     }
#' @param noise A single number from the interval [0, 1] or a numeric vector of
#'     lengh(sites) with all its elements from the interval [0, 1]. Adds noise
#'     to the read counts. The noise is added to the methylation levels 'p',
#'     which are used to compute the coverage.  If for some site 'p' + noise >
#'     1', then the noise is not added to the site. Default is zero.
#' @param seed seed a single value, interpreted as an integer, or NULL, to
#'     set a seed for Random Number Generation. Default is seed = 123.    
#' @importFrom stats rbeta rbinom
#' @importFrom MASS rnegbin
#' @importFrom MethylIT sortBySeqnameAndStart
#' @return A list of GRanges objects with the methylated and unmethylated counts
#'     in its metacolumn.
#' @export
#' @author Robersy Sanchez (\url{https://github.com/genomaths}).
#' @examples
#' ## *** Simulate samples with expected average of difference of methylation
#' ## levels equal to 0.0427.
#' ## === Expected mean of methylation levels ===
#' bmean <- function(alpha, beta) alpha/(alpha + beta)
#' bmean(0.03, 0.5) - bmean(0.007, 0.5) #' Expected difference = 0.04279707
#'
#' ## === The number of cytosine sitesto generate ===
#' sites = 5000
#'
#' ## === Simulate samples ===
#' ref = simulateCounts(num.samples = 1, sites = sites, alpha = 0.007,
#'                     beta = 0.5, size = 50, theta = 4.5, sample.ids = "C1")
#' treat = simulateCounts(num.samples = 2, sites = sites, alpha = 0.03,
#'                     beta = 0.5, size = 50, theta = 4.5,
#'                     sample.ids = c("T1", "T2"))
#' ### === Simulate counts on regions ====
#' simulateCounts(num.samples = 7, 
#'                sample.ids = c(paste0("C",1:4), paste0("T", 1:3)),
#'                type = "on_regions", theta = 2.5, regions = 10)
#'                
simulateCounts <- function(num.samples, sites = NULL, alpha = NULL, 
                           beta = NULL, size = NULL, theta,
                           sample.ids = NULL, chromosome = "1",
                           start = NULL, end = NULL, strand = "*", 
                           type = c("on_sites", "on_regions"),
                           regions =  10, min_width = 1000,
                           max_width = 5000, minCountPerIndv = 8,
                           maxCountPerIndv = 300, mu = NULL, 
                           noise = 0, seed = 123){
   
   type <- match.arg(type)
   missed <- sapply(list(sites, alpha, beta, size), is.null)
   if (type == "on_sites" && any(missed)) 
       stop(paste0("\n *** Agument ", 
                   c("sites ", "alpha ", "beta ", "size ")[missed],
                   "must be provided"))
   
   if (length(sample.ids) != num.samples) 
       stop("*** The number of samples must be equal to length(sample.ids)")
   
   set.seed(seed)
   
   if (type == "on_sites") {
      coverage <- rnegbin(n = sites, mu = 100, theta = theta)
      chromosome <- chromosome[1] # only one chromosome per simulation
      if (is.null(start)) {start <- seq_len(sites); end <- start}
      if (is.null(end)) end <- start
      # fix sites w/o reads
      coverage <- ifelse(coverage < 10, 10, coverage)
      LR = list()
      for(k in seq_len(num.samples)) {
           p = rbeta(n = sites, shape1 = alpha, shape2 = beta)
           shape1 <- theta * p
           shape2 <- theta * (1 - p)
           p0 <- rbinom(n = sites, size=size,
                       prob = rbeta(n=sites, shape1=shape1, shape2=shape2))/size
           p <- (p0 + noise)
           idx <- which(p >  1)
           p[ idx ] <- p0[ idx ]; rm(p0, idx)
           mC = ceiling(coverage * p)
           uC = coverage - mC
           LR[[k]] <- makeGRangesFromDataFrame(data.frame(chr = chromosome, 
                                                       start = start,
                                                       end = end,
                                                       strand = strand, 
                                                       mC = mC, uC = uC), 
                                               keep.extra.columns = TRUE)
      }
      if (!is.null(sample.ids)) names(LR) <- sample.ids
   } else {
      if (is.null(start) || is.null(end)) {
         widths <- round(seq(from = min_width, to = max_width,
                          length.out = regions))
         LR <- GRanges(seqnames = chromosome, 
                       IRanges(start = seq_len(length(widths)) * widths,
                               width = widths),
                       strand = strand)
      } else LR <- makeGRangesFromDataFrame(data.frame(chr = chromosome,
                                                       start = start,
                                                       end = end, 
                                                       strand = strand))
      if (is.null(mu)) mu <- seq(minCountPerIndv, maxCountPerIndv)
      counts <- t(sapply(seq_len(length(widths)), 
                           function(...) rnegbin(n = num.samples, 
                                               mu = sample(mu, 1),
                                               theta = theta)))
      colnames(counts) <- sample.ids
      mcols(LR) <- counts
      LR <- sortBySeqnameAndStart(LR)
   }
   return(LR)
}
genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.