R/HWadmiX.R

#' Carries out likelihood ratio test and chi-squared test for Hardy-Weinberg equilibrium for 
#' X chromosomal markers, accounting for the possibility of sex-biased admixture
#'
#' @param X a named vector containing the genotype counts (names should be AA, AB, BB, A, B)
#'
#' @return list with the following elements:
#' 
#'   lrt_pval: the p-value of the LRT test
#'   
#'   prev_p_f, prev_p_m: the ML estimates of the previous generation allele frequencies in females and males
#'   
#'   F_ic: the ML estimate of the inbreeding coefficient
#'   
#'   current_p_f, current_p_m: estimates of the current generation allele frequencies in females and males, 
#'   based on the ML estimates of the previous generation allele frequencies and of the inbreeding coefficient
#'   
#'   current_p_AA, current_p_AB, current_p_BB: estimates of the current generation genotype frequencies in females, 
#'   based on the ML estimates of the previous generation allele frequencies and of the inbreeding coefficient
#'   
#'   cs_ml_pval: the p-value of the chi-squared (maximum likelihood) test
#'   
#'   you_lrt0_unconstrained_pval: the p-value of the LRT0 test from 
#'        X.-P. You, Q.-L. Zou, J.-L. Li and J.-Y. Zhou, Likelihood ratio test for 
#'        excess homozygosity at marker loci on X chromosome. LRT0 test, PLoS One 2015; 10(12): e0145032,
#'        where the inbreeding coefficient is not constrained to be in [0,1]
#'
#' @examples
#' snp <- c(A = 122, B = 278, AA = 88, AB = 242, BB = 70)
#' HWadmiX(snp)
#'
#' @export
HWadmiX <- 
function(X) {
  
  if (!any(c("integer", "numeric") %in% class(X))) {
    stop("X must be a numeric or integer vector")
  }
  if (!all(c("AA", "AB", "BB", "A", "B") %in% names(X))) {
    stop("X must have entries AA, AB, BB, A and B")
  }
  
  grid_base <- expand.grid(p_m = seq(0, 1, by = 0.01), 
                      p_f = seq(0, 1, by = 0.01), 
                      F_ic = seq(-1, 1, by = 0.01),
                      LL = NA, 
                      index = NA)
  grid <- grid_base
  grid[, "index"] <- 1:nrow(grid)
  
  grid[, "LL"] <- LogLikelihood(grid, X)
  simple_grid <- grid[grid[, "F_ic"] == 0, ]
  simple_which_max <- which.max(simple_grid[, "LL"])
   
  simple_model_logLL <- simple_grid[simple_which_max, "LL"]

  complex_which_max <- which.max(grid[, "LL"])
  
  complex_model_logLL <- grid[complex_which_max, "LL"]
  
  estimates <- as.list(grid[complex_which_max, c("p_m", "p_f", "F_ic")])
  statistic <- -2 * (simple_model_logLL - complex_model_logLL)
  pval <- pchisq(statistic, 1, lower.tail = F)
  prev_p_f <- estimates[['p_f']]
  prev_p_m <- estimates[['p_m']]
  
  chi_squared_mm_pval <- ChiSquaredTest(dat = X)
  chi_squared_ml_pval <- ChiSquaredTest(dat = X, 
                                        p_Afprev = simple_grid[simple_which_max, "p_f"], 
                                        p_Amprev = simple_grid[simple_which_max, "p_m"])
  
  F_ic <- estimates[['F_ic']]
  current_p_f <- (prev_p_f + prev_p_m) / 2
  current_p_m <- prev_p_f
  Q <- F_ic * current_p_f * (1 - current_p_f)
  current_p_AA <- current_p_f ^ 2 + Q
  current_p_BB <- (1 - current_p_f) ^ 2 + Q
  current_p_AB <- 1 - current_p_AA - current_p_BB
  
  grid_you <- grid_base
  grid_you[, "LL"] <- LogLikelihoodLRT0(grid_you, X)
  
  complex_you_which_max <- which.max(grid_you[, "LL"])
  complex_you_logLL <- grid_you[complex_you_which_max, "LL"]
  
  simple_you_logLL <- LogLikelihoodLRT0Null(X)
  you_statistic <- -2 * (simple_you_logLL - complex_you_logLL)
  pval_LRT0 <- pchisq(you_statistic, 2, lower.tail = F)
  
  return(list(lrt_pval = pval,
              prev_p_f = prev_p_f,
              prev_p_m = prev_p_m,
              F_ic = F_ic, 
              current_p_f = current_p_f,
              current_p_m = current_p_m,
              current_p_AA = current_p_AA, 
              current_p_AB = current_p_AB, 
              current_p_BB = current_p_BB, 
              cs_ml_pval = chi_squared_ml_pval,
              you_lrt0_unconstrained_pval = pval_LRT0))
}

CalcLL <- function(n_AA, n_AB, n_BB, n_A, n_B, 
                   p_AA, p_AB, p_BB, p_A, p_B) {
  
  df <- list(n = c(n_AA = n_AA, n_AB = n_AB, n_BB = n_BB, n_A = n_A, n_B = n_B), 
             p = list(p_AA = p_AA, p_AB = p_AB, p_BB = p_BB, p_A = p_A, p_B = p_B))
  return(CalcLLDf(df))
}

SafeLog <- function(x) {
  negs <- x < 0
  non_negs <- x >= 0
  logx <- rep(NA, length(x))
  logx[non_negs] <- log(x[non_negs])
  return(logx)
}

CalcLLDf <- function(df) {
  
  terms_to_keep <- names(df$n)[which(df$n > 0)]
  result <- 0
  for (n in terms_to_keep) {
    result <- result + df$n[n] * SafeLog(df$p[[gsub("n_", "p_", n)]])
  }
  return(result)
}

LogLikelihood <- function(grid, dat) {
  # dat is a vector with entries AA, AB, BB, A and B
  # grid is a matrix with columns p_f, p_m and F_ic
  
  # returns numeric vector of log-likelihoods of the data, one for each row of the matrix grid
  
  n_AA <- unname(dat['AA'])
  n_AB <- unname(dat['AB'])
  n_BB <- unname(dat['BB'])
  n_A <- unname(dat['A'])
  n_B <- unname(dat['B'])
  
  p_f <- grid[, "p_f"]
  p_m <- grid[, "p_m"]
  F_ic <- grid[, "F_ic"]
  
  Q <- p_m * (1 - p_f) + p_f * (1 - p_m)
  
  p_AA <- p_m * p_f + F_ic / 2 * Q
  p_AB <- (1 - F_ic) * Q
  p_BB <- (1 - p_m) * (1 - p_f) + F_ic / 2 * Q
  p_A <- p_f
  p_B <- 1 - p_f
  LL <- CalcLL(n_AA = n_AA, n_AB = n_AB, n_BB = n_BB, n_A = n_A, n_B = n_B, 
               p_AA = p_AA, p_AB = p_AB, p_BB = p_BB, p_A = p_A, p_B = p_B)
  LL[p_AA < 0 | p_BB < 0] <- -Inf
  return(LL)
}

LogLikelihoodLRT0 <- function(grid, dat) {
  # dat is a vector with entries AA, AB, BB, A and B
  # grid is a matrix with columns p_f, p_m and F_ic
  
  # returns numeric vector of log-likelihoods of the data, one for each row of the matrix grid
  
  n_AA <- unname(dat['AA'])
  n_AB <- unname(dat['AB'])
  n_BB <- unname(dat['BB'])
  n_A <- unname(dat['A'])
  n_B <- unname(dat['B'])
  
  p_f <- grid[, "p_f"]
  p_m <- grid[, "p_m"]
  F_ic <- grid[, "F_ic"]
  
  Q <- p_f * (1 - p_f)
  
  p_AA <- p_f * p_f + F_ic * Q
  p_AB <- 2 * (1 - F_ic) * Q
  p_BB <- (1 - p_f) ^ 2 + F_ic * Q
  p_A <- p_m
  p_B <- 1 - p_m
  LL <- CalcLL(n_AA = n_AA, n_AB = n_AB, n_BB = n_BB, n_A = n_A, n_B = n_B, 
               p_AA = p_AA, p_AB = p_AB, p_BB = p_BB, p_A = p_A, p_B = p_B)
  LL[p_AA < 0 | p_BB < 0] <- -Inf
  return(LL)
}

CalculateChiSquared <- function(o, e) {
  # calculates chi-squared statistic
  # o: vector of observed counts
  # e: vector of expected counts
  
  exclude <- o == 0 & e == 0
  o_include <- o[!exclude]
  e_include <- e[!exclude]
  
  return(sum((o_include - e_include) ^ 2 / e_include))
}

ChiSquaredTest <- function(dat, p_Afprev = NULL, p_Amprev = NULL) {
  n_AA <- unname(dat['AA'])
  n_AB <- unname(dat['AB'])
  n_BB <- unname(dat['BB'])
  n_A <- unname(dat['A'])
  n_B <- unname(dat['B'])
  
  n_m <- n_A + n_B
  n_f <- n_AA + n_AB + n_BB
  n <- n_m + n_f
  p_Af <- (n_AA + 0.5 * n_AB) / n_f
  p_Am <- n_A / n_m
  if (is.null(p_Amprev)) {
    p_Amprev <- 2 * p_Af - p_Am     # allele frequency among males in previous generation
  }
  if (is.null(p_Afprev)) {
    p_Afprev <- p_Am     # allele frequency among females in previous generation
  }
  
  e_AA <- n_f * p_Afprev * p_Amprev
  e_AB <- n_f * ((1 - p_Afprev) * p_Amprev + p_Afprev * (1 - p_Amprev))
  e_BB <- n_f * (1 - p_Afprev) * (1 - p_Amprev)
  
  observed <- c(n_AA, n_AB, n_BB, n_A, n_B)
  expected = c(e_AA, e_AB, e_BB, n_m * p_Afprev, n_m * (1 - p_Afprev))
  
  statistic <- CalculateChiSquared(o = observed, 
                                   e = expected)
  
  pval <- pchisq(statistic, 1, lower.tail = F)
  if (any(expected == 0 & observed > 0)) {
    pval <- 0
  } 
  if (is.na(pval)) browser()
  return(pval)
}

LogLikelihoodLRT0Null <- function(dat) {
  # dat is a vector with entries AA, AB, BB, A and B
  
  # returns log-likelihood of the data under You et al. LRT0 test null hypothesis
  
  n_AA <- unname(dat['AA'])
  n_AB <- unname(dat['AB'])
  n_BB <- unname(dat['BB'])
  n_A <- unname(dat['A'])
  n_B <- unname(dat['B'])
  
  tot_A <- n_A + 2 * n_AA + n_AB
  tot_B <- n_B + 2 * n_BB + n_AB
  tot <- tot_A + tot_B
  p_A <- tot_A / tot
  p_AA <- p_A ^ 2
  p_AB <- 2 * p_A * (1 - p_A)
  p_BB <- (1 - p_A) ^ 2
  p_B <- 1 - p_A
  LL <- CalcLL(n_AA = n_AA, n_AB = n_AB, n_BB = n_BB, n_A = n_A, n_B = n_B, 
               p_AA = p_AA, p_AB = p_AB, p_BB = p_BB, p_A = p_A, p_B = p_B)
  return(LL)
}
dbackenroth/HWadmiX documentation built on May 16, 2019, 9:14 a.m.