R/pairwiseSim.R

Defines functions profileSimParametric markerSimParametric

Documented in markerSimParametric profileSimParametric

#' Simulate marker data given IBD coefficients
#'
#' This function simulates genotypes for two individuals given their IBD
#' distribution, for N identical markers.
#'
#' Exactly one of `kappa`, `delta` and `states` must be given; the other two
#' should remain NULL.
#'
#' If `states` is given, it explicitly determines the condensed identity state
#' at each marker. The states are described by integers 1-9, using the tradition
#' order introduced by Jacquard.
#'
#' If `kappa` is given, the states are generated by the command `states =
#' sample(9:7, size = N, replace = TRUE, prob = kappa)`. (Note that identity
#' states 9, 8, 7 correspond to IBD status 0, 1, 2, respectively.)
#'
#' If `delta` is given, the states are generated by the command `states =
#' sample(1:9, size = N, replace = TRUE, prob = delta)`.
#'
#' @param kappa A probability vector of length 3, giving a set of realised kappa
#'   coefficients (between two noninbred individuals).
#' @param delta A probability vector of length 9, giving a set of condensed
#'   identity coefficients (Jacquard coefficients).
#' @param states An integer vector of length `N`, with entries in 1-9. Each
#'   entry gives the identity state of the corresponding marker. (See details.)
#' @param N A positive integer: the number of independent markers to be
#'   simulated.
#' @param alleles A vector with allele labels. If NULL, the following are tried
#'   in order:
#'
#'   * `names(afreq)`
#'
#'   * `seq_along(afreq)'
#'
#'   * `1:2` (fallback if both `alleles` and `afreq` are NULL)
#'
#' @param afreq A numeric vector with allele frequencies, possibly named with
#'   allele labels.
#' @param seed An integer seed for the random number generator (optional).
#' @param returnValue Either "singleton" (default) or "alleles". (see Value).
#'
#'
#' @return The output depends on the value of the `returnValue` parameter:
#'
#'   * "singletons": a list of two singletons with the simulated marker data
#'   attached.
#'
#'   * "alleles": a list of four vectors of length `N`, named `a`, `b`, `c` and
#'   `d`. These contain the simulated alleles, where a/b and c/d are the
#'   genotypes of the to individuals.
#'
#'   * "genotypes": a list of two vectors of length `N`, containing the
#'   simulated genotypes. Identical to `paste(a, b, sep = "/")` and `paste(c, d,
#'   sep = "/")`, where `a`, `b`, `c`, `d` are the vectors returned when
#'   `returnValue == "alleles"`.
#'
#'   * "internal": similar to "alleles", but using the index integer of each
#'   allele. (This option is mostly for internal use.)
#'
#'
#'
#' @examples
#' # MZ twins
#' markerSimParametric(kappa = c(0,0,1), N = 5, alleles = 1:10)
#'
#' # Equal distribution of states 1 and 2
#' markerSimParametric(delta = c(.5,.5,0,0,0,0,0,0,0), N = 5, alleles = 1:10)
#'
#' # Force a specific sequence of states
#' markerSimParametric(states = c(1,2,7,8,9), N = 5, alleles = 1:10)
#'
#' @export
markerSimParametric = function(kappa = NULL, delta = NULL, states = NULL,
                               N = 1, alleles = NULL, afreq = NULL, seed = NULL,
                               returnValue = c("singletons", "alleles", "genotypes", "internal")) {
  if(!is.null(seed))
    set.seed(seed)

  afreq = fixAllelesAndFreqs(alleles, afreq)
  alleles = names(afreq)

  # Sample random alleles, to be modified below. Genotypes are a/b and c/d.
  .a = sample(alleles, size = N, replace = TRUE, prob = afreq)
  .b = sample(alleles, size = N, replace = TRUE, prob = afreq)
  .c = sample(alleles, size = N, replace = TRUE, prob = afreq)
  .d = sample(alleles, size = N, replace = TRUE, prob = afreq)

  # Jacquard states
  useKappa = !is.null(kappa)
  useDelta = !is.null(delta)
  useStates = !is.null(states)
  if(useKappa + useDelta + useStates != 1)
    stop2("Exactly one of `kappa`, `delta`, `states` must be different from NULL")

  if(useKappa)
    states = sample(9:7, size = N, replace = TRUE, prob = kappa)
  else if(useDelta)
    states = sample(1:9, size = N, replace = TRUE, prob = delta)
  else if(length(states) == 1)
    states = rep_len(states, N)
  else if(length(states) != N)
    stop2("`states` must have length 1 or ", N)

  # Modify b, c, d such that each marker satisfy its assigned state
  S1 = states == 1 # All equal
  .b[S1] = .c[S1] = .d[S1] = .a[S1]

  S2 = states == 2 # a=b, c=d
  .b[S2] = .a[S2]
  .d[S2] = .c[S2]

  S3 = states == 3 # a=b=c
  .b[S3] = .c[S3] = .a[S3]

  S4 = states == 4 # a=b
  .b[S4] = .a[S4]

  S5 = states == 5 # a=c=d
  .c[S5] = .d[S5] = .a[S5]

  S6 = states == 6 # c=d
  .d[S6] = .c[S6]

  S7 = states == 7 # a=c, b=d
  .c[S7] = .a[S7]
  .d[S7] = .b[S7]

  S8 = states == 8 # a=c
  .c[S8] = .a[S8]


  # Return
  switch(match.arg(returnValue),
    singletons = setMarkers(x = singletons(1:2),
                            alleleMatrix = matrix(rbind(.a, .c, .b, .d), nrow = 2,
                                                  dimnames = list(1:2, NULL)),
                            locusAttributes = list(afreq = afreq)),
    genotypes = list(paste(.a, .b, sep="/"), paste(.c, .d, sep="/")),
    alleles = list(a = .a, b = .b, c = .c, d = .d),
    internal = list(a = match(.a, alleles), b = match(.b, alleles),
                    c = match(.c, alleles), d = match(.d, alleles)))
}



#' Simulate complete DNA profiles given IBD coefficients
#'
#' This function generalises [markerSimParametric()] in the same way that
#' [profileSim()] generalises [markerSim()].
#'
#' @inheritParams markerSimParametric
#' @param N A positive integer: the number of complete profiles to be simulated
#' @param freqList A list of numeric vectors. Each vector is the allele
#'   frequencies of a marker.
#'
#' @return A list of length `N`, whose entries are determined by `returnValue`,
#'   as explained in [markerSimParametric()].
#'
#' @examples
#' # A single profile with 9 markers, each with forced identity state
#' profileSimParametric(states = 1:9, freqList = NorwegianFrequencies[1:9])
#'
#' @export
profileSimParametric = function(kappa = NULL, delta = NULL, states = NULL,
                                N = 1, freqList = NULL, seed = NULL,
                                returnValue = c("singletons", "alleles", "genotypes", "internal")) {
  if(!is.null(seed))
    set.seed(seed)

  returnValue = match.arg(returnValue)

  # Return value in the marker-wise step
  retval0 = if(returnValue %in% c("singletons", "genotypes")) "alleles" else returnValue

  # Iterate over loci, make N simulations of each. For each locus,
  # store sims as matrix (4 * Nsim) for simplicity later
  sims_markerwise = lapply(seq_along(freqList), function(i) {
    markeri = markerSimParametric(kappa = kappa, delta = delta, states = states[i],
                                N = N, afreq = freqList[[i]], returnValue = retval0)
    do.call(rbind, markeri)
  })

  # For each sim, extract corresponding column from each locus.
  # Output each sim as list of 4 vectors (as markerSimParametric)
  sims = lapply(1:N, function(i)
    vapply(sims_markerwise, function(locsim) locsim[, i],
           FUN.VALUE = if(returnValue == "internal") integer(4) else character(4)))

  # Return
  switch(returnValue,
    singletons = lapply(sims, function(s)
        setMarkers(singletons(1:2),
                   alleleMatrix = matrix(s[c(1,3,2,4), ], nrow = 2, dimnames = list(1:2, NULL)),
                   locusAttributes = freqList)),
    genotypes = lapply(sims, function(s) list(paste(s[1,], s[2,], sep = "/"),
                                              paste(s[3,], s[4,], sep = "/"))),
    alleles =,
    internal = lapply(sims, function(s) list(a = s[1,], b = s[2,], c = s[3,], d = s[4,])))
}

Try the forrel package in your browser

Any scripts or data that you put into this service are public.

forrel documentation built on Sept. 11, 2024, 9:15 p.m.