R/nebula.R

#' Association analysis of a multi-subject single-cell data set using a fast negative binomial mixed model
#' 
#' @param count A raw count matrix of the single-cell data. The rows are the genes, and the columns are the cells. The matrix can be a matrix object or a sparse dgCMatrix object.
#' @param id A vector of subject IDs. The length should be the same as the number of columns of the count matrix.
#' @param pred A design matrix of the predictors. The rows are the cells and the columns are the predictors. If not specified, an intercept column will be generated by default.
#' @param offset A vector of the scaling factor. The values must be strictly positive. If not specified, a vector of all ones will be generated by default. 
#' @param model 'NBGMM', 'PMM' or 'NBLMM'. 'NBGMM' is for fitting a negative binomial gamma mixed model. 'PMM' is for fitting a Poisson gamma mixed model. 'NGLMM' is for fitting a negative binomial lognormal mixed model (the same model as that in the lme4 package). The default is 'NBGMM'.
#' @param method 'LN' or 'HL'. 'LN' is to use NEBULA-LN and 'HL' is to use NEBULA-HL. The default is 'LN'.
#' @param min Minimum values for the overdispersions parameters \eqn{\sigma^2} and \eqn{\phi}. Must be positive. The default is c(1e-4,1e-4).
#' @param max Maximum values for the overdispersions parameters \eqn{\sigma^2} and \eqn{\phi}. Must be positive. The default is c(10,1000).
#' @param kappa Please see the vignettes for more details. The default is 800.
#' @param cpc A non-negative threshold for filtering low-expression genes. Genes with counts per cell smaller than the specified value will not be analyzed. 
#' @param mincp A positive integer threshold for filtering low-expression genes. A gene will not be analyzed if its number of cells that have a non-zero count is smaller than the specified value . 
#' @param cutoff_cell The data will be refit using NEBULA-HL to estimate both overdispersions if the product of the cells per subject and the estimated cell-level overdispersion parameter \eqn{\phi} is smaller than cutoff_cell. The default is 20.
#' @param opt 'lbfgs' or 'trust'. Specifying the optimization algorithm used in NEBULA-LN. The default is 'lbfgs'. If it is 'trust', a trust region algorithm based on the Hessian matrix will be used for optimization.
#' @param verbose An optional logical scalar indicating whether to print additional messages. Default is FALSE.
#' @param covariance If TRUE, nebula will output the covariance matrix for the estimated log(FC), which can be used for testing contrasts.
#' @param output_re If TRUE, nebula will output the subject-level random effects. Only effective for model='NBGMM' or 'NBLMM'.
#' @param reml Either 0 (default) or 1. If it is one, REML will be used to estimate the overdispersions.
#' @param ncore The number of cores used for parallel computing.
#' @param fmaxsize The maximum allowed total size (in bytes) of global variables (future.globals.maxSize) when using parallel computing.
#' @return summary: The estimated coefficient, standard error and p-value for each predictor.
#' @return overdispersion: The estimated cell-level and subject-level overdispersions \eqn{\sigma^2} and \eqn{\phi^{-1}}.
#' @return convergence: More information about the convergence of the algorithm for each gene. A value of -20 or lower indicates a potential failure of the convergence. A value of one indicates that the convergence is reached due to a sufficiently small improvement of the function value. A value of -10 indicates that the convergence is reached because the gradients are close to zero (i.e., the critical point) and no improvement of the function value can be found.    
#' @return algorithm: The algorithm used for analyzing the gene. More information can be found in the vignettes.
#' @return covariance: The covariance matrix for the estimated log(FC).
#' @return random_effect: The subject-level random effects.
#' @export
#' @examples
#' library(nebula)
#' data(sample_data)
#' pred = model.matrix(~X1+X2+cc,data=sample_data$pred)
#' re = nebula(count=sample_data$count,id=sample_data$sid,pred=pred)
#' 


nebula = function (count, id, pred = NULL, offset = NULL,min = c(1e-4,1e-4), max = c(10,1000), 
                   model = 'NBGMM', method = "LN", cutoff_cell = 20, kappa=800, opt='lbfgs',
                   verbose = TRUE, cpc = 0.005, mincp = 5, covariance = FALSE, output_re = FALSE, reml = 0, ncore = 2, fmaxsize = Inf)
{
  reml <- check_reml(reml,model)
  
  maxcore <- max(c(1, length(parallelly::availableWorkers()) - 1))
  
  if(ncore>maxcore)
  {
    if (verbose == TRUE) {
      cat("The specified ncore is larger than the number of available cores. The detected number of cores is used instead.\n")
    }
    ncore <- maxcore
  }
  options(future.globals.maxSize=fmaxsize)
  
  eps = 1e-06
  
  if(cpc<0)
  {cpc = 0}
  
  if(!(method %in% c('LN','HL')))
  {stop("The method argument should be \'LN\' or \'HL\'.")}
  
  if (is.vector(count)) {
    count = matrix(count, nrow = 1)
  }
  count = as(count, "dgCMatrix")
  nind = ncol(count)
  if(nind<2)
  {stop("There is no more than one cell in the count matrix.")}
  gname = rownames(count)
  predn = sds = NULL
  intcol = NULL
  if (is.null(pred)) {
    pred = matrix(1, ncol = 1, nrow = nind)
    sds = 1
    intcol = 1
  }else {
    predn = colnames(pred)
    if((!is.null(predn))&length(unique(predn))<ncol(pred))
    {stop('All columns of the design matrix should have a unique name.')}
    pred = center_m(as.matrix(pred))
    sds = pred$sds
    if((sum(sds==0)>1)|(sum(sds<0)>0))
    {stop("Some predictors have zero variation or a zero vector.")}
    if(sum(sds==0)==0)
    {stop("The design matrix must include an intercept term.")}
    intcol = which(sds==0)
    pred = as.matrix(pred$pred)
    if(Matrix::rankMatrix(pred)<ncol(pred))
    {warning("Some predictors in the design matrix are collinear or linearly dependent. The effects of these predictors are not identifiable.")}
    if(nrow(pred)!=nind)
    {stop('The number of rows of the design matrix should be equal to the number of columns of the count matrix.')}
  }
  
  nb = ncol(pred)
  ngene = nrow(count)
  
  if (is.null(offset)) {
    of_re = cv_offset(0,0,nind)
  }else{
    if(length(offset)!=nind)
    {
      stop('The length of offset should be equal to the number of columns of the count matrix.')
    }
    of_re = cv_offset(as.double(offset),1,nind)
  }
  offset = of_re$offset;
  # moffset = of_re$moffset;
  moffset = log(of_re$mexpoffset)
  cv = of_re$cv
  cv2 = cv*cv
  
  ntau = switch(model,'NBGMM'=2,'PMM'=1,'NBLMM'=2,
                stop("The argument model should be NBGMM, PMM or NBLMM."))
  if(model=='PMM')
  {output_re <- FALSE}
  
  npar = nb + ntau
  lower = c(rep(-100, nb), min[1:ntau])
  upper = c(rep(100, nb), max[1:ntau])
  
  if(length(id)!=nind)
  {stop('The length of subject IDs should be equal to the number of columns of the count matrix.')}
  
  id = as.character(id)
  levels = unique(id)
  id = as.numeric(factor(id,levels=levels))
  if(is.unsorted(id))
  {stop("The cells of the same subject have to be grouped.")}
  k = length(levels)
  fid = which(c(1, diff(id)) == 1)
  fid = as.integer(c(fid, nind + 1))
  
  cell_ind = get_cell(pred,fid-1,nb,k)
  cell_ind = which(cell_ind>0)
  ncell = length(cell_ind)
  
  mfs = nind/k
  if(mfs<30)
  {
    method = 'HL'
    if (verbose == TRUE) {
      cat("The average number of cells per subject (", round(mfs,digits=2), ") is less than 30. The 'method' is set as 'HL'.\n")
    }
  }
  
  cumsumy = call_cumsumy(count, fid - 1, k, ngene)
  if (ngene == 1) {
    cumsumy = matrix(cumsumy, ncol = k)
  }
  posind = lapply(1:ngene, function(x) which(cumsumy[x, ] > 0))
  gid2 = which(tabulate(count@i + 1L, ngene) >= mincp)
  count = Matrix::t(count)
  gid = which((rowSums(cumsumy)/nind) > cpc)
  gid = intersect(gid,gid2)
  lgid = length(gid)
  if (verbose == TRUE) {
    cat("Remove ", ngene-lgid, " genes having low expression.\n")
  }
  if (lgid == 0) {
    stop("No gene passed the filtering.")
  }
  if (verbose == TRUE) {
    cat("Analyzing ", lgid, " genes with ",
        k, " subjects and ", nind, " cells.\n")
  }
  
  cell_init = 1
  
  registerDoFuture()
  if(ncore==1)
  {
    plan(sequential)
  }else{
    cls <- parallelly::makeClusterPSOCK(ncore, autoStop = TRUE)
    plan(cluster, workers = cls)
  }
  
  if (model %in% c('NBGMM','NBLMM')) {

    re <- foreach(i = gid, .combine = 'cbind') %dorng% {
      posv = call_posindy(count, i - 1, nind)
      
      if((posv$mct*mfs)<3)
      {ord = 3}else{ord = 1}
      
      # para = c(log(posv$mct) - moffset, rep(0, nb - 1))
      lmct = log(posv$mct)
      para = rep(0, nb)
      para[intcol] = lmct - moffset
      re_t = tryCatch({
        ref = switch(opt,
                     'lbfgs'=lbfgs(c(para,1,cell_init), ptmg_ll_der, posindy = posv$posindy, X = pred, offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                                   ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,nind = nind, lower = lower, upper = upper,control = list(ftol_abs = eps)),
                     'trust'=trust(objfun=ptmg_ll_der_hes3, c(para,0,0), 2, 100,posindy=posv$posindy,X=pred,offset=offset,Y=posv$Y,n_one=posv$n_onetwo,ytwo=posv$ytwo,fid=fid,cumsumy=cumsumy[i,],posind=posind[[i]],nb=nb,k=k,nind=nind),
                     stop("The argument opt should be lbfgs, or trust.")
        )
        if(opt=='lbfgs')
        {ref = ref$par}else{
          ref = ref$argument
          ref[nb+1] = median(c(exp(ref[nb+1]),min[1],max[1]))
          ref[nb+2] = median(c(exp(ref[nb+2]),min[2],max[2]))
        }
        c(ref,1)
      }, error = function(e) {
        ref = nlminb(start=c(para,1,cell_init), objective=ptmg_ll,gradient=ptmg_der,posindy = posv$posindy, X = pred, offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                     ytwo = posv$ytwo, fam = id, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                     nind = nind, lower = lower, upper = upper)
        c(ref$par,0)
      })
      # betae = re_t$par[1:nb]
      conv = re_t[length(re_t)]
      fit = 1
      betae = rep(0, nb)
      betae[intcol] = lmct - moffset
      vare = re_t[(nb + 1):(nb + 2)]
      para = vare
      
      cv2p <- ifelse(ncell>0,get_cv(offset,pred,re_t[1:nb],cell_ind,ncell,nind),cv2)
      
      gni = mfs * vare[2]
      
      if(method == "LN")
      {
        if(model=="NBGMM")
        {
          if (gni < cutoff_cell) {
            re_t = tryCatch({bobyqa(para, pql_ll, reml = reml, eps = eps, ord=ord,
                          betas = betae, intcol = intcol, posindy = posv$posindy, X = pred,offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                          ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                          nind = nind, lower = min, upper = max)},
                          error = function(e){
                            bobyqa(para, pql_ll, reml = reml, eps = eps, ord=1,
                                   betas = betae, intcol = intcol, posindy = posv$posindy, X = pred,offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                                   ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                                   nind = nind, lower = min, upper = max)
                          })
            
            vare = re_t$par[1:2]
            fit = 2
           
          }else{
            kappa_obs = gni/(1+cv2p)
            # if ((gni/(1+cv2)) < kappa)
            if((kappa_obs<20)|((kappa_obs<kappa)&(vare[1]<(8/kappa_obs))))
            {
              varsig = tryCatch({nlminb(para[1], pql_gamma_ll, gamma = para[2],betas = betae, intcol = intcol, reml = reml, eps = eps,ord=ord,
                              posindy = posv$posindy, X = pred, offset = offset,Y = posv$Y, n_one = posv$n_onetwo, ytwo = posv$ytwo,
                              fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                              nind = nind, lower = min[1], upper = max[1])},
                              error = function(e){
                                nlminb(para[1], pql_gamma_ll, gamma = para[2], betas = betae, intcol = intcol, reml = reml, eps = eps,ord=1,
                                       posindy = posv$posindy, X = pred, offset = offset,Y = posv$Y, n_one = posv$n_onetwo, ytwo = posv$ytwo,
                                       fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                                       nind = nind, lower = min[1], upper = max[1])
                              })
              vare[1] = varsig$par[1]
              fit = 3
            }
          }
        }else{
          if (gni < cutoff_cell) {
            re_t = bobyqa(para, pql_nbm_ll, reml = reml,
                          eps = eps, ord=1, betas = betae, intcol = intcol, posindy = posv$posindy,
                          X = pred, offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                          ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                          nind = nind, lower = min, upper = max)
            vare = re_t$par[1:2]
            fit = 4
          }else{
            varsig = nlminb(para[1], pql_nbm_gamma_ll, gamma = para[2],betas = betae, intcol = intcol, reml = reml, eps = eps,ord=1,
                          posindy = posv$posindy, X = pred, offset = offset,Y = posv$Y, n_one = posv$n_onetwo, ytwo = posv$ytwo,
                          fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                          nind = nind, lower = min[1], upper = max[1])
            vare[1] = varsig$par[1]
            fit = 6
          }
        }
      }else {
        
        if (model == "NBGMM") {
          re_t = tryCatch({bobyqa(para, pql_ll, reml = reml, eps = eps, ord=ord,
                        betas = betae, intcol = intcol, posindy = posv$posindy, X = pred,
                        offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                        ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                        nind = nind, lower = min, upper = max)},
                        error = function(e){
                          bobyqa(para, pql_ll, reml = reml, eps = eps, ord=1,
                                 betas = betae, intcol = intcol, posindy = posv$posindy, X = pred,
                                 offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                                 ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                                 nind = nind, lower = min, upper = max)
                        })
          fit = 2
        }else {
          re_t = bobyqa(para, pql_nbm_ll, reml = reml,
                        eps = eps, ord=1, betas = betae, intcol = intcol, posindy = posv$posindy,
                        X = pred, offset = offset, Y = posv$Y, n_one = posv$n_onetwo,
                        ytwo = posv$ytwo, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,
                        nind = nind, lower = min, upper = max)
          fit = 4
        }
        vare = re_t$par[1:2]
      }
      
      betae[intcol] = betae[intcol] - vare[1]/2
      if (model == "NBGMM") {
        repml = opt_pml(pred, offset, posv$Y, fid-1, cumsumy[i, ], posind[[i]]-1, posv$posindy,nb, nind, k, betae, vare, reml, 1e-06,1)
      }else {
        repml = opt_pml_nbm(pred, offset, posv$Y, fid-1, cumsumy[i, ], posind[[i]]-1,posv$posindy, nb, nind, k, betae,vare, reml, 1e-06,1)
      }
      
      conv = check_conv(repml, conv, nb, vare, min, max)
      fccov = matrix(NA,nb,nb)
      if(conv != -25)
      {
        fccov = Rfast::spdinv(repml$var)
      }
      
      restemp = c(repml$beta, vare[1],1/vare[2], diag(fccov),conv,fit)
      if(covariance==TRUE)
      {
        restemp = c(restemp,fccov[lower.tri(fccov, diag = TRUE)])
      }
      if(output_re==TRUE)
      {
        restemp = c(restemp,repml$logw)
      }
      restemp
    
    }
  }else {

      re <- foreach(i = gid, .combine = 'cbind') %dorng% {
        posv = call_posindy(count, i - 1, nind)
        
        para = c(rep(0,nb),1)
        para[intcol] = log(posv$mct) - moffset
        re_t = tryCatch({
          nlminb(para, pmg_ll, pmg_der,posindy = posv$posindy, X = pred, offset = offset,
                 Y = posv$Y, fid = fid, cumsumy = cumsumy[i,], posind = posind[[i]], nb = nb, k = k,nind = nind, lower = lower, upper = upper)
        }, error = function(e) {
          bobyqa(para, pmg_ll, posindy = posv$posindy,X = pred, offset = offset, Y = posv$Y, 
                 fid = fid, cumsumy = cumsumy[i, ], posind = posind[[i]],
                 nb = nb, k = k, nind = nind, lower = lower,upper = upper)
        })
        
        hes = pmg_hes(re_t$par,posindy = posv$posindy,X = pred, offset = offset, Y = posv$Y,fid = fid, cumsumy = cumsumy[i, ], posind = posind[[i]],nb = nb, k = k, nind = nind)
        
        conv = 1
        if(is.null(re_t$objective))
        {
          if(re_t$convergence<0)
          {conv = -50}
        }else{
          if(re_t$convergence!=0)
          {conv = -50}
        }
        var_re = rep(NA, npar)
        nnaind = (!is.nan(diag(hes))) & (re_t$par != lower) & (re_t$par != upper)
        issingular = 0
        lnnaind = length(nnaind)
        if(lnnaind>1)
        {
          if(min(eigen(-hes[nnaind, nnaind])$values)<1e-8)
          {issingular = 1}
        }
        
        if(issingular == 0)
        {
          covfix = solve(-hes[nnaind, nnaind])
          var_re[nnaind] = diag(covfix)
        }else{
          conv = -25
        }
        
        restemp = c(re_t$par, var_re[1:nb],conv,5)
        if(covariance==TRUE)
        {
          tempv = matrix(NA,npar,npar)
          if(conv != -25)
          {tempv[nnaind, nnaind] = covfix}
          tempv = tempv[1:nb,1:nb]
          restemp = c(restemp, tempv[lower.tri(tempv, diag = TRUE)])
        }
        restemp
      }
  }
  
  if(ncore>1)
  {
    rm(list = "cls")
    gc()
  }
  
  colnames(re) <- NULL
  re_all = as.data.frame(t(re))
  p = sapply(1:nb, function(x) pchisq(re_all[, x]^2/re_all[,x + npar], 1, lower.tail = FALSE))
  for (i in (npar + 1):(npar + nb)) {
    re_all[, i] = sqrt(re_all[, i])
  }
  
  sds[intcol] = 1
  
  re_all[, c(1:nb,npar+(1:nb))] = t(t(re_all[, c(1:nb,npar+(1:nb))])/c(sds,sds))
  
  if(covariance==TRUE)
  {
    sdssc = sds%*%t(sds)
    sdssc = sdssc[lower.tri(sdssc, diag = TRUE)]
    covlen = length(sdssc)
    colind = (npar+nb+3):(npar+nb+2+covlen)
    re_all[, colind] = t(t(re_all[, colind])/sdssc)
  }
  
  rl = c("Subject", "Cell")
  if ((length(predn) == 0)|is.null(predn)) {
    indp = 1:nb
  }else {
    indp = predn
  }
  
  curind = 2*nb+ntau+2
  colnames(re_all)[1:curind] = c(paste0("logFC_",indp), rl[1:ntau], paste0("se_",indp), "convergence", "algorithm")
  ncov = 0
  if(covariance==TRUE)
  {
    ncov = (nb+1)*nb/2
    colnames(re_all)[(curind+1):(curind+ncov)] = paste0("cov_", 1:ncov)
    curind = curind + ncov
  }
  if(output_re==TRUE)
  {
    colnames(re_all)[(curind+1):(curind+k)] = paste0("random_", 1:k)
    curind = curind + k
  }
  
  re_all = cbind(re_all, matrix(p, ncol = nb))
  colnames(re_all)[(curind+1):(curind+nb)] = paste0("p_", indp)
  curind = curind + nb
  re_all$gene_id = gid
  re_all$gene = gname[gid]
  
  algorithms = c('NBGMM (LN)','NBGMM (HL)','NBGMM (LN+HL)','NBLMM (HL)','PGMM','NBLMM (LN+HL)')
  list(summary = re_all[, c(paste0("logFC_", indp), paste0("se_",indp), paste0("p_", indp), colnames(re_all)[(curind+1):ncol(re_all)])],
       overdispersion = re_all[, c(rl[1:ntau])],
       convergence = re_all[,'convergence'],
       algorithm = algorithms[re_all[,'algorithm']],
       covariance = if(covariance==FALSE){NULL}else{re_all[, paste0("cov_", 1:ncov)]},
       random_effect = if(output_re==FALSE){NULL}else{re_all[, paste0("random_", 1:k)]})
      
}

Try the nebula package in your browser

Any scripts or data that you put into this service are public.

nebula documentation built on May 29, 2024, 8:56 a.m.