R/sample_many_pedigree_genotypes.R

Defines functions sample_many_pedigree_genotypes

Documented in sample_many_pedigree_genotypes

#' @title Sample (many) genotypes for pedigree members according to allele frequencies by allele dropping and possibly taking linkage into account by simulating recombination.
#'
#' @param pedigree [ped][pedtools::ped] object
#' @param freqs Allele frequencies (see \link{read_allele_freqs})
#' @param loci Character vector of locus names (defaults to \code{names} attribute of \code{freqs})
#' @param unrelated_names Character vector with names of any additional unrelated persons. Defaults to length zero.
#' @param linkage_map A linkage map specifying cM distances between loci. If missing, loci are assumed to be independent.
#' @param number_of_replicates An integer specifying the number of replicate genotype samples to generate. Defaults to 1.
#' @param sex_locus_name Character vector, defaults to "AMEL"
#' @param return_transmission_vectors Should transmission vectors be returned as an attribute? These are usually not of interest, so the default is `FALSE`.
#'
#' @seealso \link{read_allele_freqs}
#' @examples
#' # load allele frequencies
#' freqs <- read_allele_freqs(system.file("extdata",
#'                             "FBI_extended_Cauc_022024.csv",
#'                             package = "simDNAmixtures"))
#'
#' # define a pedigree with two full siblings: S1 and S2
#' ped_fs <- pedtools::nuclearPed(children = c("S1", "S2"))
#'
#' # define two linked loci
#' linkage_map <- data.frame(chromosome = c(12, 12),
#'                           locus = c("vWA", "D12S391"),
#'                           position = c(15.15, 26.63))
#'
#'
#' # sample genotypes ignoring linkage
#' sample_many_pedigree_genotypes(pedigree = ped_fs, freqs = freqs,
#'                              loci = c("vWA", "D12S391"),
#'                             number_of_replicates = 10)
#'
#' # sample genotypes taking linkage into acconut
#' sample_many_pedigree_genotypes(pedigree = ped_fs, freqs = freqs,
#'                             loci = c("vWA", "D12S391"),
#'                             linkage_map = linkage_map,
#'                             number_of_replicates = 10)
#' @export
sample_many_pedigree_genotypes <- function(pedigree, freqs, loci = names(freqs),
                                           unrelated_names = character(),
                                           linkage_map,
                                           number_of_replicates = 1L,
                                           sex_locus_name = "AMEL",
                                           return_transmission_vectors = FALSE){

  if (missing(pedigree)){
    pedigree <- dummy_pedigree
  }

  .validate_pedigree(pedigree, disallow_U_names = TRUE)
  .validate_freqs(freqs, loci)

  locus_idx_by_name <- stats::setNames(seq_along(loci), loci)

  if (missing(linkage_map)) linkage_map <- NULL
  if (!is.null(linkage_map)) .validate_linkage_map(linkage_map)

  # add missing loci to linkage map (but not AMEL)
  missing_loci <- loci[!loci %in% c(linkage_map$locus, sex_locus_name)]
  linkage_map_used <- rbind(linkage_map[linkage_map$locus %in% loci,],
                            data.frame(chromosome = rep("missing", length(missing_loci)),
                                       locus = missing_loci,
                                       position = rep(NA, length(missing_loci))))

  # sample founders
  x <- .sample_many_founders(pedigree, number_of_replicates = number_of_replicates,
                             unrelated_names = unrelated_names,
                             freqs = freqs, loci = loci,
                             sex_locus_name = sex_locus_name,
                             return_integer = TRUE)

  # if there are no non-founders, then we are done here!

  # prepare indices for dropping alleles
  ped_row_is_founder <- pedigree$FIDX == 0 & pedigree$MIDX == 0
  ped_row_is_non_founder <- !ped_row_is_founder

  if (!any(ped_row_is_non_founder)){
    return(.many_genotypes_int_to_labels(x, freqs = freqs,
            sex_locus_name = sex_locus_name, loci = loci))
  }

  if (return_transmission_vectors){
    transmission_vectors_by_locus <- list()
  }

  ped_non_founder_row_idx <- which(ped_row_is_non_founder)
  ped_non_founder_fidx <- pedigree$FIDX[ped_non_founder_row_idx]
  ped_non_founder_midx <- pedigree$MIDX[ped_non_founder_row_idx]

  transmissions <- data.frame(non_founder_row = rep(ped_non_founder_row_idx, each = 2),
                              fidx = rep(ped_non_founder_fidx, each = 2),
                              midx = rep(ped_non_founder_midx, each = 2),
                              allele = if (length(ped_non_founder_row_idx)>0) c(1, 2) else integer())

  number_of_persons <- length(pedigree$ID) + length(unrelated_names)

  replicate_row_offsets <- rep(seq(from = 0, to = number_of_replicates - 1),
                               each = nrow(transmissions)) * number_of_persons

  # for every locus we will adjust column 2 here
  to_idx <- cbind(rep(transmissions$non_founder_row, times = number_of_replicates) + replicate_row_offsets,
                  NA)
  # for every locus we adjust column 2 here based on the transmission_vectors and the locus idx
  from_idx <- cbind(rep(ifelse(transmissions$allele == 1L,
                               transmissions$midx,
                               transmissions$fidx),
                        times = number_of_replicates) +
                      replicate_row_offsets,
                    NA)

  # split the linkage map by chromosome
  linkage_map_by_chromosome <- split(linkage_map_used, linkage_map_used$chromosome)
  chromosomes <- naturalsort::naturalsort(names(linkage_map_by_chromosome))

  chromosome = chromosomes[1]
  # start sampling data by chromosome
  for (chromosome in chromosomes){

    # prepare linkage map for this chromosome
    linkage_map_chromosome <- linkage_map_by_chromosome[[chromosome]]
    linkage_map_chromosome$recombination_rate <- NA_real_

    for (i_row in seq_len(nrow(linkage_map_chromosome))[-1]){
      delta_position <- linkage_map_chromosome$position[i_row] - linkage_map_chromosome$position[i_row - 1]

      linkage_map_chromosome$recombination_rate[i_row] <- if (is.na(delta_position))
        0.5 else pedprobr::haldane(cM = delta_position)
    }

    # sample locus-by-locus
    chromosome_i_locus <- 1L
    for (chromosome_i_locus in seq_len(nrow(linkage_map_chromosome))){
      # sample a starting transmission vector for each replicate
      if (chromosome_i_locus == 1){
        transmission_vectors <- matrix(sample.int(n = 2,
                                                  size = nrow(transmissions) * number_of_replicates,
                                                  replace = TRUE),
                                       nrow = nrow(transmissions))
      }else{
        recombination_rate <- linkage_map_chromosome$recombination_rate[chromosome_i_locus]

        swapped <- c(2,1)[transmission_vectors]
        swap <- sample(c(TRUE, FALSE),
                       size = length(transmission_vectors), replace = TRUE,
                       prob = c(recombination_rate, 1.0 - recombination_rate))

        transmission_vectors <- matrix(ifelse(swap, yes = swapped, no = transmission_vectors),
                                       nrow = nrow(transmissions))
      }

      # determine the index of the loci in the output
      locus_name <- linkage_map_chromosome$locus[chromosome_i_locus]
      locus_idx <- as.integer(locus_idx_by_name[locus_name])

      if (return_transmission_vectors){
        transmission_vectors_by_locus[[locus_name]] <- transmission_vectors
      }

      # drop alleles down the pedigree for this locus
      from_idx[, 2] <- as.vector(transmission_vectors) + (2 * (locus_idx - 1))
      to_idx[, 2] <- rep(transmissions$allele, times = number_of_replicates) + (2 * (locus_idx - 1))

      for (i in seq_len(nrow(to_idx))){
        x[to_idx[i, , drop = FALSE]] <- x[from_idx[i, , drop=FALSE]]
      }
    }
  }

  result <- .many_genotypes_int_to_labels(x, freqs = freqs,
                                         sex_locus_name = sex_locus_name, loci = loci)

  if (return_transmission_vectors){
    attr(result, "transmissions") <- transmissions
    attr(result, "transmission_vectors_by_locus") <- transmission_vectors_by_locus
  }

  return(result)
}

Try the simDNAmixtures package in your browser

Any scripts or data that you put into this service are public.

simDNAmixtures documentation built on April 15, 2025, 1:11 a.m.