R/call_genotypes.R

Defines functions get_GP_bayesian get_AF eBeta fit_beta_em call_genotypes

Documented in call_genotypes eBeta fit_beta_em get_AF get_GP_bayesian

#' Call genotypes based on EM algorithm
#' 
#' The Expectation–maximization (EM) algorithm is used to fit a mixture of three beta distributions representing the three genotypes (AA, AB, and BB) and one uniform distribution representing the outliers (adapted from ewastools). Probe-specific weights were used in the EM algorithm.
#'
#' @param RAI A matrix of RAI (Ratio of Alternative allele Intensity) for probes. Provide probes as rows and samples as columns.
#' @param pop Population to be used to extract AFs. One of EAS, AMR, AFR, EUR, SAS, and ALL.
#' @param type One of snp_probe, typeI_probe, and typeII_probe.
#' @param maxiter Maximal number of iterations for the EM algorithm.
#' @param bayesian Use the Bayesian approach to calculate posterior genotype probabilities.
#' @param platform EPIC or 450K.
#' @param verbose Verbose mode: 0/1/2.
#' @return  A list containing
#' \item{RAI}{Ratio of Alternative allele Intensity}
#' \item{shapes}{Shapes of the mixed beta distributions}
#' \item{weights}{Prior probabilities that the RAI values belong to one of the three genotypes}
#' \item{U}{Overall probability of RAI values being outlier}
#' \item{outliers}{Probability of each RAI value being outlier}
#' \item{logLik}{Log-likelihood}
#' \item{GP}{Genotype probabilities of the three genotypes}
#' @export
call_genotypes <- function(RAI, pop, type, maxiter=50, bayesian=FALSE, platform="EPIC", verbose=1){
  # Fit mixed beta distribution based on EM
  print(paste(Sys.time(), "Running EM to fit beta distributions for RAI values."))
  finalClusters <- fit_beta_em(RAI, maxiter=maxiter, verbose=verbose)
  GP <- finalClusters$GP

  # Get posterior genotype probabilities using a Bayesian approach
  if(bayesian){
    print(paste(Sys.time(), "Calculating genotype probabilities using a Bayesian approach."))
    probe2af <- get_AF(pop=pop, type=type, platform=platform)
    AF <- matrix(rep(probe2af[rownames(RAI)], ncol(RAI)), ncol=ncol(RAI), dimnames=list(rownames(RAI), colnames(RAI)))
    GP <- get_GP_bayesian(GP$pAA, GP$pAB, GP$pBB, AF)
  }
  
  # return
  list(RAI=RAI, 
       shapes = finalClusters$shapes, 
       weights = finalClusters$weights, 
       U = finalClusters$U, 
       outliers = finalClusters$outliers,
       logLiks = finalClusters$logLiks,
       GP = GP)
}

#' Estimate mixed beta distribution parameters based on EM algorithm
#' 
#' The Expectation–maximization (EM) algorithm is used to fit a mixture of three beta distributions representing the three genotypes (AA, AB, and BB) and one uniform distribution representing the outliers (adapted from ewastools).
#'
#' @param RAI A matrix of RAI (Ratio of Alternative allele Intensity) for probes. Provide probes as rows and samples as columns.
#' @param maxiter Maximal number of iterations for the EM algorithm.
#' @param verbose Verbose mode: 0/1/2.
#' @return  A list containing
#' \item{shapes}{Shapes of the mixed beta distributions}
#' \item{weights}{Prior probabilities that the RAI values belong to one of the three genotypes}
#' \item{U}{Overall probability of RAI values being outlier}
#' \item{outliers}{Probability of each RAI value being outlier}
#' \item{logLik}{Log-likelihood}
#' \item{GP}{Genotype probabilities of the three genotypes}
#' @export
fit_beta_em <- function(RAI, maxiter=50, verbose=1){
  # Initialize
  RAI_flat <- as.vector(RAI) # 1-dimension
  m <- nrow(RAI) # number of probes
  n <- ncol(RAI) # number of samples
  N <- length(RAI_flat)
  
  # EM
  phi <- rep(0.2, m) # AF
  weights <- sapply(0:2, function(k) choose(2,k) * phi^k * (1-phi)^(2-k))
  shapes <- as.data.frame(matrix(
    c(5, 30, 60, 60, 30, 5),
    nrow=3, dimnames=list(paste0("Cluster", 0:2), c("shape1", "shape2"))
  ))
  U <- 0.01
  outliers <- rep(U, N)
  GP <- NA
  
  e_step <- function(i){
    GP <- (1 - U) * cbind(
      as.vector(t(sapply(1:m, function(x) weights[x,1] * dbeta(RAI[x,], shapes[1,1], shapes[1,2])))),
      as.vector(t(sapply(1:m, function(x) weights[x,2] * dbeta(RAI[x,], shapes[2,1], shapes[2,2])))),
      as.vector(t(sapply(1:m, function(x) weights[x,3] * dbeta(RAI[x,], shapes[3,1], shapes[3,2]))))
    )
    tmp <- rowSums(GP, na.rm=T)
    GP <<- GP / tmp # If RAI=0,the "dbeta" function above will produce three zeros, and leads to NA here.
    outliers <<- U / (U + tmp)
    logLik <- sum(log10(U + tmp))
    return(logLik)
  }
  
  m_step <- function(){
    DS <- matrix(GP[,2] + 2 * GP[,3], nrow=m, ncol=n)
    phi <<- rowMeans(DS, na.rm=TRUE) / 2
    weights <<- sapply(0:2, function(k) choose(2,k) * phi^k * (1-phi)^(2-k))
    GP <- GP * (1 - outliers)
    U <<- sum(outliers) / N
    # Moments estimator
    s1 <- eBeta(RAI_flat, GP[,1])
    s2 <- eBeta(RAI_flat, GP[,2])
    s3 <- eBeta(RAI_flat, GP[,3])
    shapes[1,1] <<- s1$shape1; shapes[2,1] <<- s2$shape1; shapes[3,1] <<- s3$shape1
    shapes[1,2] <<- s1$shape2; shapes[2,2] <<- s2$shape2; shapes[3,2] <<- s3$shape2
    return(NA)
  }
  
  logLiks <- c()
  gain <- Inf
  i <- 1
  while(i <= maxiter & gain > 1e-4){
    logLik <- e_step(i)
    m_step()
    logLiks <- c(logLiks, logLik)
    if(verbose>=1){
      print(paste0("EM: Iteration ", i, ", log-likelihood: ", round(logLik, 6), ", Outlier probability: ", round(U, 6)))
    }
    if(i>1){
      gain <- logLik - logLiks[i-1]
    }
    i=i+1
  }
  
  names(logLiks) <- paste0("Iter", 1:length(logLiks))
  names(phi) <- rownames(RAI)
  rownames(weights) <- rownames(RAI)
  colnames(weights) <- paste0("Cluster", 0:2)
  outliers <- matrix(outliers, nrow=m, ncol=n, dimnames=list(rownames(RAI), colnames(RAI)))
  GP_AA <- matrix(GP[,1], nrow=m, ncol=n, dimnames=list(rownames(RAI), colnames(RAI)))
  GP_AB <- matrix(GP[,2], nrow=m, ncol=n, dimnames=list(rownames(RAI), colnames(RAI)))
  GP_BB <- matrix(GP[,3], nrow=m, ncol=n, dimnames=list(rownames(RAI), colnames(RAI)))
  iteration <- list(
    phi = phi,
    weights = weights,
    shapes = shapes,
    U = U,
    outliers = outliers,
    logLiks = logLiks,
    GP = list(pAA = GP_AA, pAB = GP_AB, pBB = GP_BB)
  )
  iteration
}

#' Moments estimator for beta distribution (adapted from ewastools)
#' 
#' @param x A vector of RAI values.
#' @param w Weights.
#' @return A list of beta distribution shapes.
#' @export
eBeta = function(x,w){
  n = length(w)
  w = n*w/sum(w)
  sample.mean =  mean(w*x)
  sample.var  = (mean(w*x^2)-sample.mean^2) * n/(n-1)
  v = sample.mean * (1-sample.mean)
  if (is.na(sample.var) | is.na(v)) {
    shape1 = Inf
    shape2 = Inf
  } else if (sample.var < v){
    shape1 = sample.mean * (v/sample.var - 1)
    shape2 = (1 - sample.mean) * (v/sample.var - 1)
  } else {
    shape2 = sample.mean * (v/sample.var - 1)
    shape1 = (1 - sample.mean) * (v/sample.var - 1)
  }
  list(shape1 = shape1, shape2 = shape2)
}

#' Extract AFs from matching population in the 1000 Genomes Project (1KGP)
#'
#' @param platform EPIC or 450K.
#' @param pop Population to be used to extract AFs. One of EAS, AMR, AFR, EUR, SAS, and ALL.
#' @param type One of snp_probe, typeI_probe, and typeII_probe.
#' @param platform EPIC or 450K.
#' @return A vector of AFs
#' @export
get_AF <- function(pop="EAS", type, platform="EPIC"){
  if(!(pop %in% c("EAS", "AMR", "AFR", "EUR", "SAS", "ALL"))){
    print("ERROR: get_AF: Wrong pop value supplied!"); return(NA)
  }
  if(type=="snp_probe"){
    if(platform=="EPIC"){
      data(probeInfo_snp)
    }else{
      data(probeInfo_snp_450K); probeInfo_snp <- probeInfo_snp_450K
    }
    probe2af <- probeInfo_snp[, paste0(pop, "_AF")]
    names(probe2af) <- probeInfo_snp$CpG
  }else if(type=="typeI_probe"){
    if(platform=="EPIC"){
      data(probeInfo_typeI)
    }else{
      data(probeInfo_typeI_450K); probeInfo_typeI <- probeInfo_typeI_450K
    }
    probe2af <- probeInfo_typeI[, paste0(pop, "_AF")]
    names(probe2af) <- probeInfo_typeI$CpG
  }else if(type=="typeII_probe"){
    if(platform=="EPIC"){
      data(probeInfo_typeII)
    }else{
      data(probeInfo_typeII_450K); probeInfo_typeII <- probeInfo_typeII_450K
    }
    probe2af <- probeInfo_typeII[, paste0(pop, "_AF")]
    names(probe2af) <- probeInfo_typeII$CpG
  }else{
    print("ERROR: get_AF: Wrong type value supplied!"); return(NA)
  }
  probe2af
}

#' Infer posterior genotype probabilities based on the Bayesian approach
#'
#' Prior genotype probabilities were inferred from AFs. The AFs can be in population level or individual-specific level. For population level AFs, they can be extracted from the matched population in the 1000 Genomes Project (1KGP). For individual-specific AFs, they can be calculated according to the top four PCs.
#'
#' @param pD_AA A MxN matrix of AA genotype probabilities. Provide SNPs as rows and samples as columns.
#' @param pD_AB A MxN matrix of AB genotype probabilities. Provide SNPs as rows and samples as columns.
#' @param pD_BB A MxN matrix of BB genotype probabilities. Provide SNPs as rows and samples as columns.
#' @param AF A MxN matrix of AFs. Provide SNPs as rows and samples as columns.
#' @return  A list containing
#' \item{pAA}{Posterior genotype probability of AA}
#' \item{pAB}{Posterior genotype probability of AB}
#' \item{pBB}{Posterior genotype probability of BB}
#' @export
get_GP_bayesian <- function(pD_AA, pD_AB, pD_BB, AF){
  probes <- intersect(rownames(pD_AA), rownames(AF))
  samples <- intersect(colnames(pD_AA), colnames(AF))
  pD_AA <- pD_AA[probes, samples]
  pD_AB <- pD_AB[probes, samples]
  pD_BB <- pD_BB[probes, samples]
  AF <- AF[probes, samples]
  pAA_prior <- (1 - AF) ^2 # Prior genotype probability of AA
  pAB_prior <- 2 * AF * (1 - AF)
  pBB_prior <- AF ^2
  pD <- pD_AA * pAA_prior + pD_AB * pAB_prior + pD_BB * pBB_prior
  pAA <- pD_AA * pAA_prior / pD
  pAB <- pD_AB * pAB_prior / pD
  pBB <- pD_BB * pBB_prior / pD
  return(list(pAA=pAA, pAB=pAB, pBB=pBB))
}
Yi-Jiang/MethyGenotyper documentation built on Sept. 4, 2024, 12:47 p.m.