R/haplotype-simulation.R

sampleHaplotypeFromSFS <- Vectorize(function(freqs) rbinom(1, 1, freqs), vectorize.args='freqs')

randomGametes <- function(freqs, F=0) {
  inbred <- sample(0:1, 1, prob=c(1-F, F))
  h1 <- sampleHaplotypeFromSFS(freqs)
  if (inbred) {
    return(cbind(h1, h1))
  }
  h2 <- sampleHaplotypeFromSFS(freqs)
  cbind(h1, h2)
}

createIndividuals <- function(n, freqs, F=0) {
  replicate(n, randomGametes(freqs, F),
            simplify=FALSE)
}

haplotypes2NoisyGenotypes <- function(x, ehet, ehom, na_rate=0) {
  d <- do.call(cbind, lapply(x, function(y) addGenotypeError(rowSums(y), ehet, ehom)))
  if (na_rate == 0)
    return(d)
  d[as.logical(rbinom(length(d), 1, na_rate))] <- NA_integer_
  d
}

#' Simulate a half-sib family, fixed number of sites
#'
#' @param n size of halfsib family
#' @param nsites number of sites
#' @param rates selfing, halfsib, fullsib proportions
#' @param nfullsib_fathers number of fathers contributing to fullsib families
#' @param father_F proportion of fathers that are inbred
#' @param mom_inbred whether mom is inbred
#' @param ehet heterozygous error rate
#' @param ehom homozygous error rate
#'
#' @return a list full of simulation data
#' @export
sibFamily <- function(nprogeny, nsites,
                      rates=c(selfing=0.5, halfsib=0.2, fullsib=0.3),
                      nfullsib_fathers=2, mom_inbred=FALSE,
                      father_F=0.1, ehet=0.8, ehom=0.1,
                      na_rate=0) {

  # n is number of parents, an overestimate. Assuming no selfing or fullsibs,
  # this is at most 2*nprogeny (each kid has separate parents)
  n <- 2*nprogeny
  freqs <- sampleSiteFreqs(n, nsites)
  types <- factor(c("selfed", "halfsib", "fullsib"))
  ind_type <- sample(types, nprogeny, prob=rates, replace=TRUE)

  # generate mother (who is a father in selfs)
  mom <- randomGametes(freqs, F=as.numeric(mom_inbred))

  # generate fathers, fullsib fathers + number of halfsib families
  nhalfsib_families <- sum(ind_type == "halfsib")
  nfullsib_families <- sum(ind_type == "fullsib")
  parents <- createIndividuals(nfullsib_fathers+nhalfsib_families, freqs, father_F)
  parents <- c(list(mom), parents) # mom is potential father (selfing), kept in pos 1

  # assign labels to fathers, used to sample subsets for particular types
  # of fathers
  father_type <- rep(c("self", "halfsib", "fullsib"),
                     c(1, nhalfsib_families, nfullsib_fathers))
  fathers <- integer(nprogeny)
  fathers[ind_type == "selfed"] <- 1L # mother is going to be set to father 1
  fathers[ind_type == "halfsib"] <- sample(which(father_type == "halfsib"),
                                           nhalfsib_families)
  fathers[ind_type == "fullsib"] <- sample(which(father_type == "fullsib"),
                                           nfullsib_families, replace=TRUE)
  stopifnot(!any(is.na(fathers)))

  # generate which father gamete we will receive.
  father_gamete_i <- sample(1:2, nprogeny, replace=TRUE)
  mother_gamete_i <- sample(1:2, nprogeny, replace=TRUE)

  # metadata dataframe to keep track of everything
  d <- data.frame(type=ind_type, mother=1L, father=fathers,
                  mgamete=mother_gamete_i, fgamete=father_gamete_i)
  stopifnot(all(fathers[d$type == "selfed"] == 1))
  progeny <- mapply(function(mg, fg, which_father) {
                      father_gamete <- parents[[which_father]][, fg]
                      mother_gamete <- mom[, mg]
                      cbind(mother_gamete, father_gamete)
                    }, mother_gamete_i, father_gamete_i, fathers,
                    SIMPLIFY=FALSE)
  genos <- haplotypes2NoisyGenotypes(progeny, ehet, ehom, na_rate)

  # parent genotypes
  parent_genos <- haplotypes2NoisyGenotypes(parents, ehet, ehom, na_rate)
  list(progeny=genos, metadata=d, freqs=freqs,
       haplos=parents, genos=parent_genos)
}
vsbuffalo/ProgenyArray documentation built on May 3, 2019, 6:41 p.m.