R/Likelihood_functions.R

Defines functions getMaxLikelihoodMatrix getCpGMatrix getCpGs .getScaledLikelihoods .genotype_likelihood estimateErrorRate

Documented in estimateErrorRate getCpGMatrix getCpGs getMaxLikelihoodMatrix

#Likelihood functions for calling methylation states from bsseq objects generated from read.bedMethyl

#estimateErrorRate
estimateErrorRate <- function(
    BSseq, 
    minCov = 10, 
    maxCov = 100, 
    minRatio = 0.8, 
    plotErrorProfile = FALSE) {
  
  # Input validation of BSseq
  if (!inherits(BSseq, "BSseq")) {
    stop("Error: The input object must be of class 'BSseq'.")
  }
  if (ncol(BSseq) != 1) {
    stop("Error: The input 'BSseq' must have exactly one column.")
  }
  if (!all(c("D", "Cov") %in% names(assays(BSseq)))) {
    stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.")
  }
  
  # Input validation for thresholds
  if (minCov < 0 || maxCov < 0) {
    stop("Error: Both 'minCov' and 'maxCov' must be non-negative.")
  }
  if (maxCov <= minCov) {
    stop("Error: 'maxCov' must be greater than 'minCov'.")
  }
  if (minRatio < 0 || minRatio > 1) {
    stop("Error: 'minRatio' must be between 0 and 1.")
  }
  
  # Access assay data safely
  cov <- assays(BSseq)[["Cov"]]
  d <- assays(BSseq)[["D"]]
  
  # Calculate CpG-to-non-CpG ratio and filter sites
  ratio <- cov / (cov + d)
  sites <- getCoverage(BSseq) >= minCov & 
    getCoverage(BSseq) <= maxCov & 
    ratio >= minRatio
  
  # Plot the error profile if requested
  if (plotErrorProfile) {
    if (any(sites)) {
      p_all <- hist(ratio, breaks = 100, plot = FALSE)
      p_used <- hist(ratio[sites], breaks = (1 - minRatio) * 100, plot = FALSE)
      plot(p_all, xlim = c(0, 1), xlab = "CpG to non-CpG ratio", main = "Error Profile")
      plot(p_used, col = "black", xlim = c(0, 1), add = TRUE)
    } else {
      warning("No sites satisfy the filtering criteria. No error profile will be plotted.")
    }
  }
  
  # Calculate and return the error rate
  error_rate <- sum(d[sites]) / (sum(d[sites]) + sum(cov[sites]))
  return(error_rate)
}

#genotype_likelihood
#internal function to calculate the likelihood of a genotype given the observed data
.genotype_likelihood <- function(n = 2, g, e, cov, D) {
  if (g == 0) {
    p_mismatch <- n * e
    p_match <- n * (1 - e)
  } else if (g == 1) {
    p_mismatch <- n / 2
    p_match <- n / 2
  } else if (g == 2) {
    p_mismatch <- n * (1 - e)
    p_match <- n * e
  } else {
    stop("Invalid genotype value. Must be 0, 1, or 2.")
  }
  likelihood <- (p_mismatch^D) * (p_match^cov)
  return(likelihood)
}

#.getScaledLikelihoods
#internal function to calculate the scaled likelihoods for each genotype
.getScaledLikelihoods <- function(
    BSseq, 
    e = NULL) {
  # Input validation of BSseq
  if (!inherits(BSseq, "BSseq")) {
    stop("Error: The input object must be of class 'BSseq'.")
  }
  if (ncol(BSseq) != 1) {
    stop("Error: The input 'BSseq' must have exactly one column.")
  }
  if (!all(c("D", "Cov") %in% names(assays(BSseq)))) {
    stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.")
  }
  # Input validation of e
  if (!is.null(e) && (e < 0 || e > 1)) {
    stop("Error: 'e' must be between 0 and 1.")
  }
  if (is.null(e)) {
    e <- estimateErrorRate(BSseq)
  }
  cov <- assays(BSseq)[["Cov"]]
  d <- assays(BSseq)[["D"]]
  
  l <- matrix(0, nrow = length(cov), ncol = 3)
  l[, 1] <- .genotype_likelihood(g = 0, e = e, cov = cov, D = d)
  l[, 2] <- .genotype_likelihood(g = 1, e = e, cov = cov, D = d)
  l[, 3] <- .genotype_likelihood(g = 2, e = e, cov = cov, D = d)
  
  SL <- l / rowSums(l)
  return(SL)
}

getCpGs <- function(
    BSseq, 
    type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), 
    threshold = 0.99, 
    e = NULL) {
  # Input validation of BSseq
  if (!inherits(BSseq, "BSseq")) {
    stop("Error: The input object must be of class 'BSseq'.")
  }
  if (ncol(BSseq) != 1) {
    stop("Error: The input 'BSseq' must have exactly one column.")
  }
  if (!all(c("D", "Cov") %in% names(assays(BSseq)))) {
    stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.")
  }
  # Input validation of type
  if (!type %in% c("homozygous", "heterozygous", "allCpG", "nonCpG")) {
    stop("Error: 'type' must be one of 'homozygous', 'heterozygous', 'allCpG', or 'nonCpG'.")
  }
  # Input validation of threshold
  if (threshold < 0 || threshold > 1) {
    stop("Error: 'threshold' must be between 0 and 1.")
  }
  # Input validation of e
  if (!is.null(e) && (e < 0 || e > 1)) {
    stop("Error: 'e' must be between 0 and 1.")
  }
  # Get scaled likelihoods
  SL <- .getScaledLikelihoods(BSseq, e)
  
  # Determine the CpG sites based on the type
  loci <- switch(type,
                   "homozygous" = which(SL[, 1] > threshold),
                   "heterozygous" = which(SL[, 2] > threshold),
                   "allCpG" = which(rowSums(SL[, 1:2]) > threshold),
                   "nonCpG" = which(SL[, 3] > threshold))
  
  return(loci)
}

getCpGMatrix <- function(
    BSseq, 
    e = NULL, 
    allCpG = FALSE) {
  
  # Input validation of BSseq
  if (!inherits(BSseq, "BSseq")) {
    stop("Error: The input object must be of class 'BSseq'.")
  }
  if (!all(c("D", "Cov") %in% names(assays(BSseq)))) {
    stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.")
  }
  
  # Input validation of e
  if (!is.null(e)) {
    if (!is.numeric(e) || (length(e) != ncol(BSseq))) {
      stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'BSseq'.")
    }
    if (any(e < 0 | e > 1)) {
      stop("Error: All elements of 'e' must be between 0 and 1.")
    }
  }
  
  # Initialize CpG matrix
  G <- matrix(nrow = nrow(BSseq), ncol = ncol(BSseq), byrow = FALSE)
  
  for (i in 1:ncol(BSseq)) {
    # Use the provided error rate or estimate it
    e_i <- if (is.null(e)) estimateErrorRate(BSseq[, i]) else e[i]
    
    # Get scaled likelihoods
    SL_i <- .getScaledLikelihoods(BSseq[, i], e_i)
    
    # Calculate CpG matrix
    if (allCpG) {
      G[, i] <- ifelse(SL_i[, 3] >= 1/3, 2, 0)
    } else {
      G[, i] <- max.col(SL_i, ties.method = "last") - 1
    }
  }
  
  return(G)
}

getMaxLikelihoodMatrix <- function(
    BSseq, 
    e = NULL, 
    allCpG = FALSE) {
  
  # Input validation of BSseq
  if (!inherits(BSseq, "BSseq")) {
    stop("Error: The input object must be of class 'BSseq'.")
  }
  if (!all(c("D", "Cov") %in% names(assays(BSseq)))) {
    stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.")
  }
  
  # Input validation of e
  if (!is.null(e)) {
    if (!is.numeric(e) || (length(e) != ncol(BSseq))) {
      stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'BSseq'.")
    }
    if (any(e < 0 | e > 1)) {
      stop("Error: All elements of 'e' must be between 0 and 1.")
    }
  }
  
  # Initialize quality matrix
  Q <- matrix(nrow = nrow(BSseq), ncol = ncol(BSseq), byrow = FALSE)
  
  for (i in 1:ncol(BSseq)) {
    # Use the provided error rate or estimate it
    e_i <- if (is.null(e)) estimateErrorRate(BSseq[, i]) else e[i]
    
    # Get scaled likelihoods
    SL_i <- .getScaledLikelihoods(BSseq[, i], e_i)
    
    # Calculate quality matrix
    if (allCpG) {
      Q[, i] <- ifelse(SL_i[, 3] >= 1/3, SL_i[, 3], (1 - SL_i[, 3]))
    } else {
      Q[, i] <- DelayedMatrixStats::rowMaxs(SL_i)
    }
  }
  
  return(Q)
}
hansenlab/bsseq documentation built on June 12, 2025, 7:42 p.m.