R/betabin.R

Defines functions plotQC plotBBCellFrac .back_sel .logL_MM3 MM3 .logL_MM2 MM2 .logL_MM MM .logL_BB BB .pbb_midp ldbb dbb .check_model betaBinomXI

Documented in betaBinomXI plotBBCellFrac plotQC

#' Fit mixture model
#'
#' Fit a mixture model to estimate mosaicism and XCI-escape.
#'
#' @param genic_dt A \code{data.table}. The table as outputted by \code{getGenicDP}.
#' @param model A \code{character} indicating which model to use to estimate
#'  the mosaicism. Valid choices are "AUTO", "BB", "MM", "MM2", "MM3". See details.
#' @param plot A \code{logical}. If set to TRUE, information about the training
#'  set and the skewing estimate will be plotted.
#' @param hist A \code{logical}. If set to TRUE, an histogram of the skewing
#'  estimates will be displayed.
#' @param flag A \code{numeric}. Specify how to handle convergence issues. See
#'  details.
#' @param xciGenes A \code{character} or NULL. To be passed to \code{readXCI} to
#'  select the training set of inactivated genes.
#' @param a0 A \code{numeric} or NULL. Starting values for the optimization. This
#'  should not be used with more than one model as different models have
#'  different parameters. Leave NULL unless you know what you're doing.
#' @param optimizer A \code{character}. The optimization function to use for minimization
#'  of the log-likelihood. Should be one of "nlminb" or "optim".
#' @param method A \code{character}. The method to be passed to \code{optim}
#'  when it is the selected \code{optimizer}.
#' @param limits A \code{logical}. If set to TRUE, the optimization will be
#'  constrained. Using upper bounds on the probability of sequencing error and
#'  escape in the training set ensures that the dominant mixture represents the
#'  skewing for inactivated genes.
#' @param debug A \code{logical}. If set to TRUE, information about each iteration
#'  will be printed (Useful to identify problematic samples).
#'
#'
#' @details
#' The model determines the number of components used in the mixture model. By
#' default, "AUTO" tries all combinations of mixtures and the best estimate is
#' kept using backward selection based on AIC.
#' BB is a simple beta-binomial. MM adds a binomial component to model the
#' sequencing errors. MM2 jointly models the probability of misclasification
#' in the training set. MM3 include all 3 components.
#'
#' Flags in the output reports issues in convergence. If \code{flag} is set to 0,
#' nothing is done. If set to 1, the model selection will avoid flagged models
#' (will favor parcimonious models).
#' If set to 2, calls for which the best selected model had convergence issue
#' will be removed.
#'
#' @return A \code{data.table} with an entry per sample and per gene.
#'
#' @example inst/examples/betaBinomXI.R
#'
#' @seealso getGenicDP readXCI
#' @export
betaBinomXI <- function(genic_dt,  model = "AUTO", plot = FALSE, hist = FALSE,
                        flag = 0, xciGenes = NULL, a0 = NULL,
                        optimizer = c("nlminb", "optim"), method = NULL, limits = TRUE,
                        debug = FALSE){
  dt <- copy(genic_dt)
  dt[, dp1 := pmin(AD_hap1, AD_hap2)]
  dt[, tot := AD_hap1 + AD_hap2]

  # Use XCI to estimate parameters
  #if(xci > 0){
  xcig <- readXCI(xciGenes)
  inactivated_genes <- xcig
  dt_xci <- dt[GENE %in% inactivated_genes]
  if(nrow(dt_xci) == 0){
    warning("No known silenced gene found in data.")
  }

  optimizer <- match.arg(optimizer)
  model <- .check_model(model)
  modl <- vector("list", length(model))
  for(i in seq_along(modl)){
    modi <- model[i]
    if(modi == "BB"){
      dt <- BB(dt_xci, dt, a0, optimizer, method, limits, debug)
    } else if(modi == "MM"){
      dt <- MM(dt_xci, dt, a0, optimizer, method, limits, debug)
    } else if(modi == "MM2"){
      dt <- MM2(dt_xci, dt, a0, optimizer, method, limits, debug)
    } else if(modi == "MM3"){
      dt <- MM3(dt_xci, dt, a0, optimizer, method, limits, debug)
    }
    modl[[i]] <- dt
  }
  if(length(modl) > 1){
    dt <- .back_sel(modl, flag = flag)
  } else{
    dt <- modl[[1]]
  }

  dt[, f := a_est/(a_est + b_est)]
  dt[, fg := dp1/tot]

  # Use the estimated a and b to calculate var_fg and the test statistic
  # Under H0, f_g ~ BB
  dt[, var_fg := (tot * a_est * b_est * (a_est + b_est + tot))]
  dt[, var_fg := var_fg/((a_est + b_est)^2 * (a_est + b_est + 1))]
  dt[, var_fg := var_fg/tot^2] # Because we want the variance for the fraction, not the variance for the counts
  dt[, t := (fg-f)/sqrt(var_fg)] #Test statistic
  # dt[, pbb := pbb(dp1, tot, a_est, b_est, type = "midp"), by = c("GENE", "sample")]
  dt[, pbb := .pbb_midp(dp1, tot, a_est, b_est), by = c("GENE", "sample")]
  dt[, p_value := pnorm(t, lower.tail = FALSE)]

  #tau is the Xi expression
  dt[, tau := (f-fg)/(2*f-1)]
  # tau is a fraction (0<tau<1) but estimate can be < 0 if fg < f
  # Which could only happens if a gene is tightly inactivated
  dt[, tau := ifelse(tau < 0, 0, tau)]
  dt[, var_tau := (2*f-1)^-2 * var_fg]
  dt[, ivw_tau := tau/sqrt(var_tau)]

  if(hist){
    hist(unique(dt[, f]), breaks = seq(0, .5, .01), main = "Cell fraction", xlab="Cell fraction", ylab = "samples")
  }
  if(plot){
    plotBBCellFrac(xci_dt = dt[GENE %in% xcig])
  }
  return(dt[])
}

.check_model <- function(model){
  model <- unique(toupper(model))
  validmodels <- c("BB", "MM", "MM2", "MM3")
  if("AUTO" %in% model){
    model <- validmodels
  }
  if(any(!model %in% validmodels)){
     err <- paste("Invalid models:", paste(model[!model %in% validmodels], collapse=", "),
                  "should be one of", paste(validmodels, collapse=", "))
     stop(err)
  }
  return(model)
}

################################################################################
dbb <- function(x, n, a, b){#Beta binomial
    #y <- choose(n, x) * beta(x+a, n-x+b)/beta(a,b)
    c <- choose(n, x)
    bn <- beta(x+a, n-x+b)
    bd <- beta(a, b)
    y <- c * bn / bd
    ninf <- sum(is.infinite(y))
    #if(ninf > 0){
    #  warning(paste("dbb returns",ninf, "infinite values"))
    #}
    y <- y[is.finite(y)]
    return(y);
}

# Calculate log likelihood directly to avoid computation overflow
ldbb <- function(x, n, a, b){
  lc <- lchoose(n, x)
  lbn <- lbeta(x+a, n-x+b)
  lbd <- lbeta(a, b)
  ly <- lc + lbn - lbd
  ninf <- sum(is.infinite(ly))
  if(ninf > 0){
    warning(paste("ldbb returns",ninf, "infinite values"))
  }
  ly <- ly[is.finite(ly)]
  return(ly)
}

## P-values for exact inference
#pbb <- function(x, n, a, b, type = "midp"){ #p-value for exact inference
#  if(is.na(x) | is.na(n)){
#    return(as.numeric(NA))
#  }
#  if(x >= n)
#    return(0)
#  if(type == "gt"){
#    from <- x+1
#    sump <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
#  } else if(type == "geq"){
#    from <- x
#    sump <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
#  } else if(type == "midp"){
#    from <- x+1
#    t0 <- dbb(x, n, a, b)
#    t1 <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
#    sump <- .5*t0 + t1
#  } else{
#    stop("Type should be one of 'gt', 'geq', 'midp'")
#  }
#  return(sump)
#}

.pbb_midp <- function(x, n, a, b){
  if(is.na(x) | is.na(n)){
    return(as.numeric(NA))
  }
  if(x >= n)
    return(0)
  from <- x+1
  t0 <- exp(ldbb(x, n, a, b))
  t1 <- sum(exp(ldbb(from:n, n, a, b)), na.rm = TRUE)
  sump <- .5*t0 + t1
  return(sump)
}


################################################################################
# Beta-binomial model:

BB <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
               limits = FALSE, debug = FALSE){
  dt <- copy(full_dt)
  samples <- unique(dt_xci$sample)
  if(is.null(a0)){
    a0  <- c(1,1)
  }
  for(sample_i in samples){
    if(debug)
      print(sample_i)
    dp1 <- dt_xci[sample == sample_i, dp1]
    dp  <- dt_xci[sample == sample_i, tot]
    Nxcig <- dt_xci[sample == sample_i, .N]
    if(limits){
      lowb <- c(0, 0)
      upb <- c(Inf, Inf)
      upb <- c(699, 700)
      method <- "L-BFGS-B"
    } else{
      lowb <- -Inf
      upb <- Inf
    }
    if(optimizer == "nlminb"){
      res_optim <- nlminb(a0, .logL_BB, dp1 = dp1, dp = dp,
                          lower = lowb, upper = upb)
      negLogLname <- "objective"
    } else if(optimizer == "optim"){
      res_optim <- optim(par = a0, fn = .logL_BB, method =  method,
                         dp1 = dp1, dp = dp,
                         lower = lowb, upper = upb)
      negLogLname <- "value"
    } else{
      stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
    }
    message <- res_optim$message
    message <- ifelse(is.null(message), "", message)
    dt[sample == sample_i, a_est := exp(res_optim$par[1]) + 1]
    dt[sample == sample_i, b_est := exp(res_optim$par[2]) + 1]
    dt[sample == sample_i, model := "BB"]
    dt[sample == sample_i, k := length(res_optim$par)]
    dt[sample == sample_i, logL := res_optim[[negLogLname]]] #-logL
    dt[sample == sample_i, convergence := res_optim$convergence]
    dt[sample == sample_i, flag := message]
    dt[sample == sample_i, AIC := 2*k + 2*logL]
    dt[sample == sample_i, Ntrain := Nxcig]
    dt[sample == sample_i, BIC := log(Ntrain)*k + 2*logL]
  }
  return(dt)
}
.logL_BB <- function(a, dp1, dp){
  a1 <- exp(a[1]) + 1
  b1 <- exp(a[2]) + 1
  logLbb <- sum(ldbb(dp1,dp,a1,b1)*(-1), na.rm = TRUE)
  return(logLbb)
}
################################################################################
# Mixture models:

# 1 Binom for sequencing errors and 1 BB for inactivated heterozygous SNP
MM <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
               limits = FALSE, debug = FALSE){
  dt <- copy(full_dt)
  samples <- unique(dt_xci$sample)
  if(is.null(a0)){
    a0  <- c(1, 1, log(0.98/0.02), log(0.02/0.98))
  }
  for(sample_i in samples){
    if(debug)
      print(sample_i)
    dp1 <- dt_xci[sample == sample_i, dp1]
    dp  <- dt_xci[sample == sample_i, tot]
    Nxcig <- dt_xci[sample == sample_i, .N]
    if(limits){
      lowb <- c(0, 0, 0, -Inf)
      upb <- c(Inf, Inf, Inf, log(0.2/0.8))
      method <- "L-BFGS-B"
    } else{
      lowb <- -Inf
      upb <- Inf
    }
    if(optimizer == "nlminb"){
      res_optim <- nlminb(a0, .logL_MM, dp1 = dp1, dp = dp,
                          lower = lowb, upper = upb)
      negLogLname <- "objective"
    } else if(optimizer == "optim"){
      res_optim <- optim(par = a0, fn = .logL_MM, method = method,
                         dp1 = dp1, dp = dp,
                         lower = lowb, upper = upb)
      negLogLname <- "value"
    } else{
      stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
    }
    message <- res_optim$message
    message <- ifelse(is.null(message), "", message)

    dt[sample == sample_i, a_est := exp(res_optim$par[1]) + 1]
    dt[sample == sample_i, b_est := exp(res_optim$par[2]) + 1]
    dt[sample == sample_i, p_het := exp(res_optim$par[3])/(1 + exp(res_optim$par[3]))]
    dt[sample == sample_i, pi_err := exp(res_optim$par[4])/(1 + exp(res_optim$par[4]))]
    dt[sample == sample_i, model := "MM"]
    dt[sample == sample_i, k := length(res_optim$par)]
    dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
    dt[sample == sample_i, convergence := res_optim$convergence]
    dt[sample == sample_i, flag := message]
    dt[sample == sample_i, AIC := 2*k + 2*logL]
    dt[sample == sample_i, Ntrain := Nxcig]
  }
  return(dt)
}

.logL_MM <- function(a, dp1, dp) {
  a1 <- exp(a[1]) +1
  b1 <- exp(a[2]) +1
  # Proportions have to be kept as an exponent ratio to avoid issues with boundaries
  p_het <- exp(a[3])/(1+exp(a[3])); # Proba that the SNP is indeed bi-allelic (initial proba = .98)
  pi_err <- 0.001+0.999*exp(a[4])/(1+exp(a[4])); # (initial proba = .02)
  pb <- dbinom(dp1, dp, pi_err)

  # Operations on the log scale to avoid infinite values
  lpbb <- ldbb(dp1, dp, a1, b1)
  p_tot <- p_het * exp(lpbb) + (1-p_het)*pb

  logL  <- -sum(log(p_tot))
  return(logL);
}



# 1 BB for inactivated SNP and 1 BB for escaped SNP
MM2 <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
                limits = FALSE, roundmax = TRUE, debug = FALSE){
  dt <- copy(full_dt)
  samples <- unique(dt_xci$sample)
  if(is.null(a0)){
    a0 <- c(1, 1, #Beta for the inactivated genes mean = a/(a+b) = .5
            2, 2, #Beta for the escape  #2 has more density around 0.5
            log(0.9/0.1)) #Initial proba that a training gene is indeed inactivated in this sample
  }
  for(sample_i in samples){
    if(debug)
      print(sample_i)
    dp1 <- dt_xci[sample == sample_i, dp1]
    dp  <- dt_xci[sample == sample_i, tot]
    Nxcig <- dt_xci[sample == sample_i, .N]
    err <- try({
    if(limits){
      # ai,bi,ae,be > .5 to have a mode
      # p_inac > log(0.75/0.25) 3/4 of the training genes should be inactivated
      # res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp,
      #                     lower = c(0, 0, -Inf, -Inf, log(0.75/0.25)), # 0 mins min(p_inac) > .5
      #                     upper = c(Inf, Inf, Inf, Inf, Inf))
      lowb <- c(0, 0, -Inf, -Inf, log(0.75/0.25))
      upb <- c(Inf, Inf, Inf, Inf, Inf)
      method <- "L-BFGS-B"
    } else{
      # res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp)
      lowb <- -Inf
      upb <- Inf
    }
    if(optimizer == "nlminb"){
      res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp,
                          lower = lowb, upper = upb)
      negLogLname <- "objective"
    } else if(optimizer == "optim"){
      res_optim <- optim(par = a0, fn = .logL_MM2, method = method,
                         dp1 = dp1, dp = dp,
                         lower = lowb, upper = upb)
      negLogLname <- "value"
    } else{
      stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
    }
    message <- res_optim$message
    message <- ifelse(is.null(message), "", message)

    #p_inac_i <- exp(res_optim$par[5])/(1 + exp(res_optim$par[5]))
    expar5 <- exp(res_optim$par[5])
    if(is.finite(expar5)){
      p_inac_i <- expar5/(1 + expar5)
    } else if(expar5 == Inf){
      p_inac_i <- 1
    } else{
      stop("Something went wrong with the estimation of the inactivated mixture proportion")
    }
    # Selecting the right component to get alpha/beta corresponding to the inactivated group
    a_est_i <- exp(res_optim$par[1]) +1
    b_est_i <- exp(res_optim$par[2]) +1
    a_est_e <- exp(res_optim$par[3]) +1
    b_est_e <- exp(res_optim$par[4]) +1
    flagi <- message
    fi <- a_est_i/(a_est_i + b_est_i)
    fe <- a_est_e/(a_est_e + b_est_e)
    # Exponentiated differences

    if(roundmax){ #If we hit maximum numeric
      if(is.infinite(a_est_e))
        a_est_e <- .Machine$double.xmax
      if(is.infinite(b_est_e))
        b_est_e <- .Machine$double.xmax
      if(is.infinite(fe))
        fe <- 0.5
    }
    if(!is.finite(fi) | !is.finite(fe)){
      stop(paste("For sample", sample_i, "fi is", fi, "and fe is", fe))
    } else if(fi > fe){
      flagi <- "MM2: fi>fe"
    }
    })
    if(inherits(err, "try-error")){
      dt[sample == sample_i, a_est := NA]
      dt[sample == sample_i, b_est := NA]
      dt[sample == sample_i, p_inac := NA]

      dt[sample == sample_i, pi_escape := NA]
      dt[sample == sample_i, model := "MM2"]
      dt[sample == sample_i, k := NA]
      dt[sample == sample_i, logL := NA] #Actually -logL
      dt[sample == sample_i, convergence := NA]
      dt[sample == sample_i, flag := "MM2: Error"]
      dt[sample == sample_i, AIC := NA]
      dt[sample == sample_i, Ntrain := Nxcig]
    } else{
      dt[sample == sample_i, a_est := a_est_i]
      dt[sample == sample_i, b_est := b_est_i]
      dt[sample == sample_i, p_inac := p_inac_i]

      dt[sample == sample_i, pi_escape := a_est_e/(a_est_e + b_est_e)]
      dt[sample == sample_i, model := "MM2"]
      dt[sample == sample_i, k := length(res_optim$par)]
      dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
      dt[sample == sample_i, convergence := res_optim$convergence]
      dt[sample == sample_i, flag := flagi]
      dt[sample == sample_i, AIC := 2*k + 2*logL]
      dt[sample == sample_i, Ntrain := Nxcig]
    }
  }
  return(dt)
}
.logL_MM2 <- function(a, dp1, dp) {
  # Inactivated gene from XCI
  ai <- exp(a[1]) + 1
  bi <- exp(a[2]) + 1
  # Escape gene in XCI list
  ae <- exp(a[3]) + 1
  be <- exp(a[4]) + 1

  p_inac <- exp(a[5])/(1+exp(a[5])) #Proba that the gene is indeed inactivated (0.9)
  lpbbi <- ldbb(dp1, dp, ai, bi)
  lpbbe <- ldbb(dp1, dp, ae, be)
  p_tot <- p_inac * exp(lpbbi) + (1-p_inac) * exp(lpbbe)
  logL  <- -sum(log(p_tot))
  return(logL);
}

# 1 BB for inac SNPs 1 BB for escaped SNP and 1 Binom for sequencing err
MM3 <- function(dt_xci, full_dt, a0 = NULL, optimizer ="nlminb", method = NULL,
                limits = FALSE, debug = FALSE){
  dt <- copy(full_dt)
  samples <- unique(dt_xci$sample)
  if(is.null(a0)){
    a0 <- c(1, 1, #Beta for the inactivated genes mean = a/(a+b) = .5
            2, 2, #Beta for the escape  #2 has more density around 0.5
            log(0.9/0.1), #Initial proba that a training gene is indeed inactivated in this sample
            log(0.98/0.02), #Proba of SNP being actually het
            log(0.02/0.98))
  }
  for(sample_i in samples){
    if(debug)
      print(sample_i)
    dp1 <- dt_xci[sample == sample_i, dp1]
    dp  <- dt_xci[sample == sample_i, tot]
    Nxcig <- dt_xci[sample == sample_i, .N]
    err <- try({
      if(limits){
        # res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp,
                            # lower = c(0, 0, -Inf, -Inf, 0, 0, -Inf),
                            # upper = c(Inf, Inf, Inf, Inf, Inf, Inf, log(0.2/0.8)))
        lowb <- c(0, 0, -Inf, -Inf, 0, 0, -Inf)
        upb <- c(Inf, Inf, Inf, Inf, Inf, Inf, log(0.2/0.8))
        method <- "L-BFGS-B"
      } else{
        lowb <- -Inf
        upb <- Inf
        # res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp)
      }
      if(optimizer == "nlminb"){
        res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp,
                            lower = lowb, upper = upb)
        negLogLname <- "objective"
      } else if(optimizer == "optim"){
        res_optim <- optim(par = a0, fn = .logL_MM3, method = method,
                           dp1 = dp1, dp = dp,
                           lower = lowb, upper = upb)
        negLogLname <- "value"
      } else{
        stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
      }
      message <- res_optim$message
      message <- ifelse(is.null(message), "", message)

      expar5 <- exp(res_optim$par[5])
      if(is.finite(expar5)){
        p_inac_i <- expar5/(1 + expar5)
      } else if(expar5 == Inf){
        p_inac_i <- 1
      } else{
        stop("Something went wrong with the estimation of the inactivated mixture")
      }
      # if(p_inac_i >= .5){ # We assume that the list should always have a majority of inactivated genes
      #   a_est_i <- exp(res_optim$par[1])
      #   b_est_i <- exp(res_optim$par[2])
      #   a_est_e <- exp(res_optim$par[3])
      #   b_est_e <- exp(res_optim$par[4])
      # } else{
      #   p_inac_i <- 1-p_inac_i
      #   a_est_i <- exp(res_optim$par[3])
      #   b_est_i <- exp(res_optim$par[4])
      #   a_est_e <- exp(res_optim$par[1])
      #   b_est_e <- exp(res_optim$par[2])
      # }
      a_est_i <- exp(res_optim$par[1]) + 1
      b_est_i <- exp(res_optim$par[2]) + 1
      a_est_e <- exp(res_optim$par[3]) + 1
      b_est_e <- exp(res_optim$par[4]) + 1
      flagi <- message
      fi <- a_est_i/(a_est_i + b_est_i)
      fe <- a_est_e/(a_est_e + b_est_e)
      ferr_i <- exp(res_optim$par[7])/(1 + exp(res_optim$par[7]))
      if(fi > fe){
        flagi <- "MM3: fi>fe"
      } else if(fi < ferr_i){
        flagi <- "MM3: ferr>fi"
      }
    })
    if(inherits(err, "try-error")){
      dt[sample == sample_i, a_est := NA]
      dt[sample == sample_i, b_est := NA]
      dt[sample == sample_i, p_inac := NA]
      dt[sample == sample_i, p_het := NA]

      dt[sample == sample_i, pi_escape := NA]
      dt[sample == sample_i, model := "MM3"]
      dt[sample == sample_i, k := NA]
      dt[sample == sample_i, logL := NA] #Actually -logL
      dt[sample == sample_i, convergence := NA]
      dt[sample == sample_i, flag := "MM3: Error"]
      dt[sample == sample_i, AIC := NA]
      dt[sample == sample_i, Ntrain := Nxcig]
    } else{
      dt[sample == sample_i, a_est := a_est_i]
      dt[sample == sample_i, b_est := b_est_i]
      dt[sample == sample_i, p_inac := p_inac_i]
      dt[sample == sample_i, p_het := exp(res_optim$par[6])/(1+exp(res_optim$par[6]))]

      dt[sample == sample_i, pi_escape := a_est_e/(a_est_e + b_est_e)]
      dt[sample == sample_i, pi_err := ferr_i]
      dt[sample == sample_i, model := "MM3"]
      dt[sample == sample_i, k := length(res_optim$par)]
      # dt[sample == sample_i, logL := res_optim$objective] #Actually -logL
      dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
      dt[sample == sample_i, convergence := res_optim$convergence]
      dt[sample == sample_i, flag := flagi]
      dt[sample == sample_i, AIC := 2*k + 2*logL]
      dt[sample == sample_i, Ntrain := Nxcig]
    }
  }
  return(dt)
}


.logL_MM3 <- function(a, dp1, dp){
  # Inactivated gene from XCI
  ai <- exp(a[1]) + 1
  bi <- exp(a[2]) + 1
  # Escape gene in XCI list
  ae <- exp(a[3]) + 1
  be <- exp(a[4]) + 1

  if(is.finite(exp(a[5]))){
    p_inac <- exp(a[5])/(1+exp(a[5])) #Proba that the gene is not a training set error
  } else if(exp(a[5]) == Inf){
    p_inac <- 1 #Tends to 1
  } else{
    stop("Something is wrong with the a[5]")
  }
  if(is.finite(exp(a[6]))){
    p_het <- exp(a[6])/(1+exp(a[6])) #Proba that the gene is indeed heterozygous
  } else if(exp(a[6]) == Inf){
    p_het <- 1 #Tends to 1
  } else{
    stop("Something is wrong with the a[6]")
  }

  pi_err  <- .01 + .99*exp(a[7])/(1+exp(a[7]))

  lpbb_i <- ldbb(dp1, dp, ai, bi)
  lpbb_e <- ldbb(dp1, dp, ae, be)
  pb_err <- dbinom(dp1, dp, pi_err)
  ## TODO: Handle cases where exp lpbb_i/lpbb_e > 0
  ## lpbb should be between 0 and 1
  p_tot <- p_het * (p_inac * exp(lpbb_i) + # Reads from component1
                   (1-p_inac) * exp(lpbb_e)) + # Reads from component2
           (1-p_het) * pb_err # Reads from sequencing error

  logL <- -sum(log(p_tot))
  return(logL)
}
################################################################################
# Model selection
.back_sel <- function(modl, criterion = "AIC", flag = 0){
  cols <- Reduce(intersect, lapply(modl, names))
  modl <- lapply(modl, function(XX){XX[, cols, with = FALSE]})
  aics <- rbindlist(modl)
  if(flag == 1){# Models where the sample had errors are discarded
    aics <- aics[!grep("^MM", flag)]
  }
  aics <- aics[aics[, .I[which.min(AIC)], by = "sample,GENE"]$V1] #Model that minimizes AIC
  if(flag == 2){# Remove samples where the selected model had errors
    aics <- aics[!grep("^MM", flag)]
  }
  return(aics)
}
################################################################################

# Can also be used to visualize the cell frac from the calls
#' Plot cell fraction estimates
#'
#' Plot cell fraction estimates from list of known XCI genes
#'
#' @param xci_dt A \code{data.table}. The data to be used for the estimate
#' of skewing (i.e: limited to XCI genes).
#' @param xcig A \code{logical}. If \code{xci_dt} was not subset for training
#' genes only, setting xcig to TRUE will filter the data.
#' @param gene_names A \code{character}. If left blank, only genes that are
#' further than 20% away from the estimated skewing will be annotated, if set
#' to "all", all genes will be named. Set to "none" to remove all annotations.
#' Alternately, a \code{character} vector can be passed to annotate specific
#' genes of interest.
#' @param color_col A \code{character}. One of the columns of \code{xci_dt} can
#' be used to color genes.
#' @param xist A \code{logical}. Set to TRUE to display XIST in addition to
#' the training genes.
#'
#' @return NULL
#'
#' @details
#' This function is mostly used in \code{betaBinomXI} to ensure that the cell
#' fraction is estimated properly. However, it can be used from the output
#' of \code{betaBinomXI} to troubleshoot estimation issues.
#'
#' @return The plot object in class \code{ggplot}.
#'
#' @importFrom ggplot2 element_blank scale_colour_manual facet_wrap
#' @importFrom ggplot2 geom_point geom_text geom_hline theme_bw
plotBBCellFrac <- function(xci_dt, xcig = NULL, gene_names = "",
                           color_col = NULL, xist = TRUE){
  plotfrac <- xci_dt[order(sample)]
  if(!is.null(xcig)){
    xcig <- readXCI(xcig)
    plotfrac <- plotfrac[GENE %in% xcig]
  }
  Nt <- plotfrac[, .N, by = "sample"]
  plotfrac <- merge(plotfrac, Nt, by = "sample")
  plotfrac[, index := unlist(sapply(Nt[, N], function(x){seq_len(x)}))]
  plotfrac[, label := ""]
  if(length(gene_names) > 1){
    plotfrac[GENE %in% gene_names, label := GENE]
  } else if(gene_names == "all"){
    plotfrac[, label := GENE]
  } else if(gene_names == "none"){
    plotfrac[, label := ""]
  } else if(gene_names == ""){
    plotfrac[abs(fg - f) > .2, label := GENE]
  } else{
    plotfrac[GENE %in% gene_names, label := GENE]
  }
  if("model" %in% colnames(plotfrac)){
    gp <- geom_point(aes(shape = model))
  } else{
    gp <- geom_point()
  }

  if(is.null(color_col)){
    p <- ggplot(plotfrac, aes(x = index, y = fg))
  } else{
    p <- ggplot(plotfrac, aes(x = index, y = fg, color = plotfrac[[color_col]]))
  }
  ## TODO: Add the other components when available (also add option to return components from all models to betaBinomXI)

  p <- p + geom_hline(aes(yintercept = f))
  p <- p + gp + geom_text(aes(label = label))
  #p <- p + geom_text(aes(x = N - 5, y= max(fg) + .01, label = paste("N =", N)))
  p <- p + geom_text(aes(x = 0, y= max(fg) + .01, label = paste("N =", N)), hjust = "left")
  p <- p + facet_wrap(~sample, scales = "free_x")
  p <- p + theme_bw() + theme(axis.text.x = element_blank(),
                              axis.ticks.x = element_blank(),
                              axis.title.x = element_blank())
  if(xist){
    xists <- xci_dt[GENE == "XIST"]
    p <- p + geom_point(data = xists, aes(x = Ntrain/2, y = fg), color = "red", shape = 4, size = 4)
  }
  print(p)
  return(invisible(p))
}

#' Plot QC
#'
#' This plot shows QC for skewing estimates
#'
#' @param xci_table A \code{data.table}. Data to plot. Should be the results of
#'  \code{betaBinomXI}, \code{getGenicDP} or one of the annotation functions.
#' @param xcig A \code{character} vector. The names of the genes in the
#'  inactivated training set.
#' @param gene_names A \code{character}. If left blank, only genes that are
#' further than 20% away from the estimated skewing will be annotated, if set
#' to "all", all genes will be named. Set to "none" to remove all annotations.
#' Alternately, a \code{character} vector can be passed to annotate specific
#' genes of interest.
#'
#' @return An invisible plot object.
#'
#' @importFrom ggplot2 ggplot aes theme theme_bw element_blank facet_wrap
#'  geom_point geom_hline geom_text
#'
#' @example inst/examples/betaBinomXI.R
#'
#' @export
plotQC <- function(xci_table, xcig = NULL, gene_names = ""){
  sd_f <- f_sd_up <- f_sd_lo <- meanf <- NULL
  # TODO: Order the genes along X OR by their allelic ratio
  # TODO: Add confint based on the distribution: https://stats.stackexchange.com/questions/82475/calculate-the-confidence-interval-for-the-mean-of-a-beta-distribution
  # TODO: Add the gene names
  plottable <- xci_table[order(sample)]
  plottable[, index := rowid(plottable$sample)]
  plottable[, set := "Test"]
  plottable[GENE %in% xcig, set := "Training"]
  plottable[GENE %in% xcig, meanf := mean(fg), by = "sample"]
  # Add standard deviation of estimates
  plottable[, sd_f := sqrt((a_est + b_est)/((a_est + b_est)^2 * (a_est + b_est + 1)))]
  plottable[, f_sd_up := pmin(0.5, f + sd_f)]
  plottable[, f_sd_lo := pmax(0, f - sd_f)]
  xists <- plottable[GENE == "XIST"]
  # Compute index
  p <- ggplot(plottable, aes(x = index, y = fg))
  # Plot the skewing
  p <- p + geom_hline(aes(yintercept = f))
  p <- p + geom_hline(aes(yintercept = f_sd_up), lty = "dashed")
  p <- p + geom_hline(aes(yintercept = f_sd_lo), lty = "dashed")
  # Plot basic mean for comparison
  p <- p + geom_hline(aes(yintercept = meanf), color = "blue", lty = "dotted")
  p <- p + geom_text(aes(0,meanf,label = "mean", hjust = -1), colour = "blue")
  # Plot the allelic ratios
  p <- p + geom_point(aes(shape = model, color = set))
  # Handle gene names
  # Color by readcount
  # Add theme
  theme_QC <- theme_bw() + theme(axis.text.x = element_blank(),
                                 axis.ticks.x = element_blank(),
                                 axis.title.x = element_blank())
  p <- p + facet_wrap(~sample, scales = "free_x") + theme_QC
  # Plot XIST
  p <- p + geom_point(data = xists, aes(x = Ntrain/2, y = fg), color = "red", shape = 4, size = 4)
  # Add training gene count
  p <- p + geom_text(aes(x = 0, y= max(fg) + .01, label = paste("N =", Ntrain)), hjust = "left")

  suppressWarnings(print(p))
  return(invisible(p))
}

#TODO: A QC plot based on the annotated dataframe instead of the gene summary.
# It requires keeping track of all SNPs (instead of the sums or just top SNPs)

Try the XCIR package in your browser

Any scripts or data that you put into this service are public.

XCIR documentation built on Nov. 8, 2020, 7:41 p.m.