scLearn: Regulome Prediction based on scRNA-seq

#' Poisson log likelihood
loglik <- function(mu, y)
  y * log(mu) - mu - lgamma(y + 1)

#' expit function
expit <- function(x)
  1 / (1 + exp(-x))

#' Estimate Bias and Dropout functions for a single cell
#' @export
#' @importFrom scam scam
norm_model <-
  function(bulk,
           sc,
           mu0_cons = 0.5,
           update_mu = F,
           max_iter = 100,
           tol = 1e-20,
           verbose = F,
           init_lo,
           init_hi) {
    fit_df <- data.frame(eta = bulk, y = sc)
    # initial values
    g <- rep(sum(sc < init_lo) / length(sc), nrow(fit_df))
    l_h <- exp(loglik(init_hi, fit_df$y) + log(1 - g))
    l_mu0 <- exp(loglik(mu0_cons, fit_df$y) + log(g))
    p_a <- l_h / (l_mu0 + l_h)
    # numerical fixes
    # when count is too big
    p_a[l_mu0 == 0 & l_h == 0] <- 1 - .Machine$double.neg.eps
    p_a[p_a == 0]  <- .Machine$double.xmin

    # browser()
    converge_flag <- F
    mu0 <- mu0_cons
    for (iter in 1:max_iter) {
      # fit bias model
      h_model <-
        scam(
          y ~ s(eta, k = 5, m = 2, bs = "mpi"),
          family = poisson(link = "log"),
          data = fit_df,
          weights = p_a
        )
      # fit dropout model
      g_model <-
        scam((1 - p_a) ~ s(eta, k = 5, m = 2, bs = "mpd"),
             family = binomial(link = "logit"),
             data = fit_df
        )
      # optional mu0 model
      if (update_mu) {
        mu0_model <-
          glm(
            y ~ 1,
            family = poisson(link = "log"),
            data = fit_df,
            weights = 1 - p_a
          )
        # mu0 <- min(fitted(mu0_model)[1], mu0_cons)
        mu0 = fitted(mu0_model)[1]
      }

      g <- fitted(g_model)
      l_h <- exp(loglik(fitted(h_model), fit_df$y) + log(1 - g))
      l_mu0 <- exp(loglik(mu0, fit_df$y) + log(g))
      p_a_new <- l_h / (l_mu0 + l_h)
      p_a_new[(l_mu0 == 0 & l_h == 0)] <- 1 - .Machine$double.neg.eps
      p_a_new[p_a_new == 0]  <- .Machine$double.xmin
      if (mean(abs(p_a_new - p_a)) < tol) {
        converge_flag <- T
        if (verbose) {
          message("Converged")
        }
        break
      }
      if (verbose) {
        #TODO: check overall loglik here
        message(paste0("Iter: ", iter,
                       ", max change in prob: ",
                       max(abs(p_a_new - p_a)), "."))
        # print(p_a_new)
      }
      p_a <- p_a_new
    }
    model <-
      list(
        h_model = h_model,
        g_model = g_model,
        mu0 = mu0,
        p_a = p_a,
        converged = converge_flag
      )
    model
  }

#' Evaluate posterior likelihood given a vector of priors
#' @importFrom scam predict.scam
model_loglik_gene <- function(model, eta, y) {
  g <- expit(predict(model$g_model, data.frame(eta = eta)))
  h_mu <- exp(predict(model$h_model, data.frame(eta = eta)))
  l_high <- exp(loglik(h_mu, y))
  l_low <- exp(loglik(model$mu0, y))
  l <- g * l_low + (1 - g) * l_high
  l
}

#' Nearest neighbor imputation of missing
#' @importFrom RANN nn2
#' @export
nn_impute <- function(bulk_mat, sc_mat, k) {
  out <- sc_mat
  for (cell_id in 1:ncol(sc_mat)) {
    # missing in cell
    m_idx <- which(is.na(sc_mat[, cell_id]))
    if (length(m_idx) == 0) {
      next
    }
    nn_out <-
      nn2(t(bulk_mat[-m_idx,]), t(sc_mat[-m_idx, cell_id, drop = F]), k = k)
    out[m_idx, cell_id] <-
      rowMeans(bulk_mat[m_idx, nn_out$nn.idx[1,]])
  }
  out
}

#' Make grid sequence of h functions from normalization model
#' @export
#' @importFrom scam predict.scam
make_hseq <- function(norm_models,
                      seq_len = 20,
                      seq_max = 12) {
  x_seq <- seq(0, seq_max, length = seq_len)
  y_seq_mat <- matrix(, length(norm_models), seq_len)
  for (j in 1:length(norm_models)) {
    y_seq_mat[j,] <-
      predict(norm_models[[j]]$h_model, data.frame(eta = x_seq))
  }
  y_seq_mat
}

#' Make grid sequence of b functions from normalization model
#' @export
#' @importFrom scam predict.scam
make_gseq <- function(norm_models,
                      seq_len = 20,
                      seq_max = 12) {
  x_seq <- seq(0, seq_max, length = seq_len)
  y_seq_mat <- matrix(, length(norm_models), seq_len)
  for (j in 1:length(norm_models)) {
    y_seq_mat[j,] <-
      predict(norm_models[[j]]$g_model, data.frame(eta = x_seq))
  }
  y_seq_mat
}