R/combine_loci.R
In clipp: Calculating Likelihoods by Pedigree Paring

Documented in combine_loci

#' Combine the genotype frequencies and transition matrices of two genetic loci
#'
#' A function to calculate the genotype frequencies and the transition matrices
#' for the joint genotypes of two unlinked genetic loci in linkage
#' equilibrium, given the corresponding objects for the separate loci.
#' The results from this function can be used as inputs to
#' \code{\link{pedigree_loglikelihood}} or
#' \code{\link{genotype_probabilities}}
#' to model the combined effect of the two loci on phenotypes.
#'
#' @param geno_freq1  A vector of strictly positive numbers that sum to `1`,
#' with `geno_freq1[i]` interpreted as the population genotype frequency of the
#' `i`th possible genotype at a genetic locus (locus 1).
#' When `annotate` is `TRUE`, the names of the genotypes at locus 1 will be
#' taken to be `names(geno_freq1)` or, if `names(geno_freq1)` is `NULL`, to be
#' `1:length(geno_freq1)`.
#'
#' @param geno_freq2  Similar to `geno_freq1` (above) but interpreted as the
#' population genotype frequencies for a different genetic locus (locus 2).
#'
#' @param trans1  An `ngeno1^2` by `ngeno1` matrix of non-negative numbers
#' whose rows sum to `1`, where `ngeno1 = length(geno_freq1)`.
#' This matrix is usually generated by \code{\link{trans_monogenic}} or a similar
#' helper function, and its elements are interpreted as genetic
#' transmission probabilities for locus 1 (see \code{\link{trans_monogenic}}
#' for more details).  If `trans1` has `ngeno1 + 2` instead of `ngeno1` columns,
#' as could occur if it was generated by
#' \code{\link{trans_monogenic}} with `annotate = TRUE`, then the first two
#' columns will be deleted and `trans1` will be converted to a matrix.
#'
#' @param trans2  Similar to `trans1` (above) but interpreted as the
#' genetic transmission probabilities for locus 2.
#'
#' @param annotate A logical flag. When `FALSE` (the default), the function
#' returns objects that can be used as the `geno_freq` and `trans` arguments of
#' \code{\link{pedigree_loglikelihood}}. When `TRUE`, the function annotates
#' these objects (and converts `trans` to a data frame) to make the output more
#' easily understood by humans.
#'
#' @details This function combines the genotype frequencies and transition
#' probabilities of two unlinked genetic loci that are in linkage equilibrium
#' in a given population.  Because
#' the loci are unlinked, any person's genotypes at the two loci are
#' conditionally independent given his or her parental genotypes,
#' and because the loci are in linkage equilibrium, the genotypes at the two
#' loci for a random person from the population are independent.
#' This function uses these assumptions to calculate the population frequencies
#' and transition probabilities for the joint genotypes of the two loci, where a
#' joint genotype is just a pair consisting of a genotype at locus 1
#' and a genotype at locus 2.  If the `annotate` option is set to `FALSE` then
#' these frequencies and probabilities can be used in
#' \code{\link{pedigree_loglikelihood}} to model the combined effect of the two
#' loci on the phenotypes.  By a repeated application of this function, more
#' than two genetic loci can be included in the genetic model.
#'
#' @return A list with the following components:
#' \item{geno_freq}{A vector of strictly positive numbers (the joint genotype
#' frequencies) that sum to `1`, with genotype names added when `annotate` is `TRUE`}
#' \item{trans}{Either a matrix of genetic transmission probabilities suitable to be
#' used as the `trans` argument of \code{\link{pedigree_loglikelihood}}
#' (if `annotate` is `FALSE`), or a data frame that is an annotated version of
#' this matrix (if `annotate` is `TRUE`).}
#' \item{genotype_decoder}{A data frame giving the locus 1 and locus 2 genotypes
#' that correspond to each joint genotype.  In some cases, this could aid the user's
#' calculation of the `penet` argument of \code{\link{pedigree_loglikelihood}}.}
#'
#' @examples
#' pa1 <- c(0.9, 0.1); names(pa1) <- c("-","+")
#' pa2 <- c(0.5, 0.5); names(pa2) <- c("A","a")
#' (geno_freq1 <- geno_freq_monogenic(pa1, TRUE))
#' (geno_freq2 <- geno_freq_monogenic(pa2, TRUE))
#' (trans1 <- trans_monogenic(2, TRUE))
#' (trans2 <- trans_monogenic(2))
#' (cl <- combine_loci(geno_freq1, geno_freq2, trans1, trans2, TRUE))
#' sum(cl$geno_freq)
#' apply(cl$trans[,-(1:2)], 1, sum)
#'
#' @export
#'
combine_loci <- function(geno_freq1, geno_freq2, trans1, trans2, annotate = FALSE) {

  # Check that the dimensions are right
  # Delete the first 2 columns of trans1 and trans2 if need be (e.g. if annotated)
  if (ncol(trans1) == length(geno_freq1) + 2)  {trans1 <- as.matrix(trans1[,-(1:2)])}
  if (ncol(trans2) == length(geno_freq2) + 2)  {trans2 <- as.matrix(trans2[,-(1:2)])}
  ng1 <- length(geno_freq1)
  ng2 <- length(geno_freq2)
  if (nrow(trans1) != ng1^2) {
    stop("trans1 should have length(geno_freq1)^2 rows", call. = FALSE)
  }
  if (ncol(trans1) != ng1) {
    stop("trans1 should have length(geno_freq1) columns", call. = FALSE)
  }
  if (nrow(trans2) != ng2^2) {
    stop("trans2 should have length(geno_freq2)^2 rows", call. = FALSE)
  }
  if (ncol(trans2) != ng2) {
    stop("trans2 should have length(geno_freq2) columns", call. = FALSE)
  }

  # List the possible joint genotypes geno.list
  join.char <- "_"
  if (is.null(names(geno_freq1))) {names1 <- 1:ng1} else {names1 <- names(geno_freq1)}
  if (is.null(names(geno_freq2))) {names2 <- 1:ng2} else {names2 <- names(geno_freq2)}
  eg <- cbind(rep(1:ng1, each = ng2),
              rep(1:ng2, times = ng1))
  f <- function(x) {  return(paste0(names1[x[1]], join.char, names2[x[2]]))  }
  geno.list <- apply(eg, 1, f)
  ng <- length(geno.list)
  geno.list
  ng - ng1*ng2

  # Calculate the transmission matrix
  trans <- matrix(0, ng^2, ng)
  gm.list <- rep("", ng^2)
  gf.list <- rep("", ng^2)
  for (gm in 1:ng) {
    for (gf in 1:ng) {
      for (go in 1:ng) {
        # gm <- 1; gf <- 3
        gm1 <- eg[gm,1]
        gm2 <- eg[gm,2]
        gf1 <- eg[gf,1]
        gf2 <- eg[gf,2]
        go1 <- eg[go,1]
        go2 <- eg[go,2]
        trans[ng * gm + gf - ng, go] <- trans1[ng1 * gm1 + gf1 - ng1, go1] *
                                        trans2[ng2 * gm2 + gf2 - ng2, go2]
        gm.list[ng * gm + gf - ng] <- geno.list[gm]
        gf.list[ng * gm + gf - ng] <- geno.list[gf]
      }
    }
  }
  if (annotate) {
    trans <- data.frame(trans)
    trans <- data.frame(gm = gm.list, gf = gf.list, trans)
    names(trans)[3:ncol(trans)] <- geno.list
  }

  # Calculate the genotype probabilities
  pg <- numeric(ng)
  for (j in 1:ng) {
    j1 <- eg[j,1]
    j2 <- eg[j,2]
    pg[j] <- geno_freq1[j1] * geno_freq2[j2]
  }
  if (annotate) {names(pg) <- geno.list}

  # Output the genotypes at the two loci corresponding to each joint genotype
  # This will (often) help when calculating the penetrance matrix
  genotype_decoder <- data.frame(joint.genotype = 1:ng,
                                 genotype1 = eg[,1],
                                 genotype2 = eg[,2])
  if (annotate) {
    genotype_decoder <- data.frame(joint.genotype = geno.list,
                                   genotype1 = names1[eg[,1]],
                                   genotype2 = names2[eg[,2]])
  }

  return(list(geno_freq = pg, trans = trans, genotype_decoder = genotype_decoder))
}