R/sampling-utilities.R

# Copyright (C) 2015 Peter Hickey
#
# This file is part of methsim.
#
# methsim is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# methsim is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with methsim  If not, see <http://www.gnu.org/licenses/>.

### =========================================================================
### Sampling utility functions. These functions are not exported.
### -------------------------------------------------------------------------

#' Sample MethLevelDT
#'
#' Sample with replacement from meth_level_dt, stratified by region_type, such
#' that each region is assigned an (average) methylation level.
#' @keywords internal
.sampleMethLevelDT <- function(meth_level_dt, region_type) {
  region_type_levels <- levels(region_type)
  meth_level <- rep(NA_real_, length(region_type))
  for (level in region_type_levels) {
    mli <- which(meth_level_dt[, type] == level)
    ml <- meth_level_dt[mli, ]
    rti <- which(region_type == level)
    n <- length(rti)
    meth_level[rti] <- ml[sample(nrow(ml),
                                   n,
                                   replace = TRUE,
                                   prob = ml[, N]), beta]
  }
  meth_level
}

#' Sample from co-methylation distribution.
#'
#' Sample with replacement from cometh_dt, stratified by region_type, such that
#' methylation loci 2-tuple is assigned a level of within-fragment
#' co-methylation (on the log odds-ratio scale i.e., lor). If there are fewer
#' than \code{min_n} observations for that IPD-type combination then the lor is
#' simulated from a Gaussian distribution, with mean given by \code{mean_fun}
#' and standard deviation given by \code{sd_fun} (see Note).
#' @param two_tuples a
#' \code{MethylationTuples::\link[MethylationTuples]{Mtuples}} containing all
#' two-tuples of methylation loci in the genome. Typically generated by
#' \code{MethylationTuples::\link[MethylationTuples]{findMTuples}}.
#' @param partitioned_methylome a \code{\link{PartitionedMethylome}} object.
#' @param min_n the minimum number of observations on a IPD-type combination in
#' order to sample from the empirical distribution as opposed to sampling from
#' the Gaussian(mean_fun(ipd, type), sd_fun(ipd, type)) distribution.
#' @param mean_fun a function specifying the mean LOR for a given IPD-type
#' combination.
#' @param sd_fun a function specifying the standard deviation of the LOR for a
#' given IPD-type combination.
#'
#' @note Good values of \code{mean_fun} and \code{sd_fun} are difficult to
#' estimate because we have little-to-no data on within-fragment co-methylation
#' for large IPDs (approximately $IPD > 200$). Fortunately, at least in the
#' human genome, more than 90\% of adjacent CpG two-tuples have an $IPD < 200$,
#' therefore this choice only affects a minority of loci. The default
#' \code{mean_fun} effectively means that the methylation state of CpGs with a
#' larger IPD are "independent on average"; the default \code{sd_fun} is based
#' on the median standard deviation of the LOR-by-IPD distribution (these
#' distributions are remarkably stable across IPD and region-type, with a value
#' close to 2).
#' Clearly it should be possible to obtain better estimates of these
#' parameters, which is the subject of ongoing research (ideas welcome!).
#'
#' @return A vector of log odds-ratios (LOR), one for each 2-tuple.
#' @keywords internal
.sampleComethDT <- function(two_tuples, cometh_dt, partitioned_methylome,
                            min_n = 100L, mean_fun = function(ipd, type) 0,
                            sd_fun = function(ipd, type) 2) {

  # If a 2-tuple isn't within a region, then the region_type is (somewhat
  # arbitrarily) defined by the region for the first methylation loci in the
  # tuple.
  within_ol <- findOverlaps(two_tuples, partitioned_methylome, type = "within")
  not_within <- which(!seq_len(queryLength(within_ol)) %in%
                        queryHits(within_ol))
  end_ol <- findOverlaps(two_tuples[not_within], partitioned_methylome,
                         type = "any", select = "first")
  mcols(two_tuples)$regionType <-
    regionType(partitioned_methylome)[c(subjectHits(within_ol), end_ol)]
  region_type <- regionType(partitioned_methylome)
  region_type_levels <- levels(region_type)

  two_tuples_dt <- data.table(IPD = as.vector(IPD(two_tuples)),
                              type = regionType(partitioned_methylome)[
                                c(subjectHits(within_ol), end_ol)])
  # Tabulate the number of occurences of each IPD-type combination (n).
  two_tuples_reduced_dt <- two_tuples_dt[, list(n = .N), by = list(IPD, type)]
  setkey(two_tuples_reduced_dt, IPD, type)
  setkey(cometh_dt, IPD, type)

  lor_list <- lapply(seq_len(nrow(two_tuples_reduced_dt)),
                     function(i, two_tuples_reduced_dt, cometh_dt, min_n) {
    # Create a table with all the co-methylation data for that IPD-type
    # combination
    x <- cometh_dt[two_tuples_reduced_dt[i, ]]
    # If there are a sufficient number of observations then sample from the
    # empirical distribution, otherwise sample from the loess-smoothed
    # Gaussian model.
    if (isTRUE(x[, sum(N)] > min_n)) {
      x[, sample(x = statistic, size = n, replace = TRUE, prob = N)]
    } else {
      rnorm(n = two_tuples_reduced_dt[i, n],
            mean = mean_fun(two_tuples_reduced_dt[i, IPD],
                            two_tuples_reduced_dt[i, type]),
            sd = sd_fun(two_tuples_reduced_dt[i, IPD],
                        two_tuples_reduced_dt[i, type]))
    }
  }, two_tuples_reduced_dt, cometh_dt, min_n)

  # Extract values from lor_list and insert them appropriately for the
  # two_tuples based on IPD-type combination.
  two_tuples_dt[, idx := .I]
  setkey(two_tuples_dt, IPD, type)
  two_tuples_dt[, lor := unlist(x = lor_list, recursive = TRUE,
                                use.names = FALSE)]
  setkey(two_tuples_dt, idx)
  two_tuples_dt[, lor]
}

#' Sample PatternFreqsDT
#'
#' Sample with replacement from pattern_freqs_dt, stratified by region_type,
#' such that each region is assigned a vector of haplotype frequencies.
#' @keywords internal
.samplePatternFreqsDT <- function(pattern_freqs_dt, region_type) {
  W_names <- grep("^w", colnames(pattern_freqs_dt), value = TRUE)
  region_type_levels <- levels(region_type)
  W <- matrix(NA_real_, ncol = length(W_names), nrow = length(region_type))
  for (level in region_type_levels) {
    pfi <- which(pattern_freqs_dt[, type] == level)
    pf <- pattern_freqs_dt[pfi, ]
    rti <- which(region_type == level)
    n <- length(rti)
    W[rti, ] <- as.matrix(
      pf[sample(nrow(pf),
                n,
                replace = TRUE,
                prob = pf[, N])][, W_names, with = FALSE])
  }
  W
}

#' Sample read start positions.
#'
#' @param n the number of reads to simulate
#' @param seqlength the length of the chromosome
#'
#' @return A sorted integer vector of read start positions.
#'
#' @note This function can return a read start positions that results in a read
#' running off the end of the chromosome, e.g., if the read start position is
#' 99, the read_length is 10 but the chromosome length is only 100.
#'
#' @keywords internal
.sampleReadStart <- function(n, seqlength) {
  # TODO: Should sampling be with or without replacement?
  sample(x = seqlength, size = n, replace = TRUE)
}
PeteHaitch/methsim documentation built on May 8, 2019, 1:32 a.m.