R/differential_expression_internal.R

Defines functions MinMax NBModelComparison RegularizedTheta RandomOOB DifferentialAUC AUCMarkerTest bb_model_test TobitFitter DifferentialTobit TobitDiffExpTest bimodLikData DifferentialLRT

Documented in MinMax

# internal function to run mcdavid et al. DE test
#
#' @importFrom stats pchisq
#
DifferentialLRT <- function(x, y, xmin = 0) {
  lrtX <- bimodLikData(x = x)
  lrtY <- bimodLikData(x = y)
  lrtZ <- bimodLikData(x = c(x, y))
  lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
  return(pchisq(q = lrt_diff, df = 3, lower.tail = F))
}

#internal function to run mcdavid et al. DE test
#
#' @importFrom stats sd dnorm
#
bimodLikData <- function(x, xmin = 0) {
  x1 <- x[x <= xmin]
  x2 <- x[x > xmin]
  xal <- MinMax(
    data = length(x = x2) / length(x = x),
    min = 1e-2,
    max = (1 - 1e-2)
  )
  likA <- length(x = x1) * log(x = 1 - xal)
  if (length(x = x2) < 2) {
    mysd <- 1
  } else {
    mysd <- sd(x = x2)
  }
  likB <- length(x = x2) *
    log(x = xal) +
    sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
  return(likA + likB)
}

#internal function to run Tobit DE test
TobitDiffExpTest <- function(data1, data2, mygenes, print.bar, NT = 1) {

  if(print.bar) {
    p_val <- unlist(x = pblapply(
      X = mygenes,
      FUN = function(x) {
        return(DifferentialTobit(
          x1 = as.numeric(x = data1[x, ])[!is.na(as.numeric(x = data1[x, ]))],
          x2 = as.numeric(x = data2[x, ])[!is.na(as.numeric(x = data2[x, ]))]
        ))},
      cl = NT
    ))
  } else {
    p_val <- unlist(x = mclapply(
      X = mygenes,
      FUN = function(x) {
        return(DifferentialTobit(
          x1 = as.numeric(x = data1[x, ])[!is.na(as.numeric(x = data1[x, ]))],
          x2 = as.numeric(x = data2[x, ])[!is.na(as.numeric(x = data2[x, ]))]
        ))},
      mc.cores = NT
    ))
  }

  p_val[is.na(x = p_val)] <- 1
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  toRet <- data.frame(p_val, row.names = mygenes)
  return(toRet)
}

#internal function to run Tobit DE test
#
#' @importFrom stats pchisq logLik
#
DifferentialTobit <- function(x1, x2, lower = 0, upper = 1) {
  my.df <- data.frame(
    c(x1, x2),
    c(rep(x = 0, length(x = x1)), rep(x = 1, length(x = x2)))
  )
  colnames(x = my.df) <- c("Expression", "Stat")
  #model.v1=vgam(Expression~1,family = tobit(Lower = lower,Upper = upper),data = my.df)
  model.v1 <- TobitFitter(
    x = my.df,
    modelFormulaStr = "Expression~1",
    lower = lower,
    upper = upper
  )
  #model.v2=vgam(Expression~Stat+1,family = tobit(Lower = lower,Upper = upper),data = my.df)
  model.v2 <- TobitFitter(
    x = my.df,
    modelFormulaStr = "Expression~Stat+1",
    lower = lower,
    upper = upper
  )
  # if (is.null(x = model.v1) == FALSE && is.null(x = model.v2) == FALSE) {
  if (! is.null(x = model.v1) && ! is.null(x = model.v2)) {
    p <- pchisq(
      q = 2 * (logLik(object = model.v2) - logLik(object = model.v1)),
      df = 1,
      lower.tail = FALSE
    )
  } else {
    p <- 0
  }
  return(p)
}

#internal function to run Tobit DE test
#credit to Cole Trapnell for this
#
#' @importFrom stats as.formula
#' @importFrom VGAM vgam
#' @importFrom VGAM tobit
#
TobitFitter <- function(x, modelFormulaStr, lower = 1, upper = Inf) {
  tryCatch(
    expr = return(suppressWarnings(expr = VGAM::vgam(
      formula = as.formula(object = modelFormulaStr),
      family = VGAM::tobit(Lower = lower, Upper = upper),
      data = x
    ))),
    #warning = function(w) { FM_fit },
    error = function(e) { NULL }
  )
}


# internal function to bbTest
#' @importFrom VGAM vglm
#' @importFrom VGAM fitted

bb_model_test <- function(object, sjgr, sj, cells.1, cells.2, maxit = 10, confounder = NULL) {

  if(is.null(sjgr)) {
    sjgr <- as(row.names(object), "GRanges")
  }

  if(!is.null(confounder)) {
    if(!is.element(confounder, colnames(colData(object))))
      stop("confounder must be one of colnames(colData(object))")
  }

  if(sum(with(as.data.frame(sjgr), paste0(seqnames, ":", start)) == with(as.data.frame(as(sj, "GRanges")), paste0(seqnames, ":", start))) > 1) {
    clu <- as.character(sjgr[with(as.data.frame(sjgr), paste0(seqnames, ":", start)) == with(as.data.frame(as(sj, "GRanges")), paste0(seqnames, ":", start))])

    bdata1 <- data.frame(N = colSums(CountSJ(object = object, SJ = clu, cells = cells.1)),
                         SJ = as.numeric(CountSJ(object = object, SJ = sj, cells = cells.1)),
                         Group = "A",
                         Batch = tryCatch(FetchMeta(object = object, variable = confounder, cells = cells.1), error = function(e) 1))
    bdata2 <- data.frame(N = colSums(CountSJ(object = object, SJ = clu, cells = cells.2)),
                         SJ = as.numeric(CountSJ(object = object, SJ = sj, cells = cells.2)),
                         Group = "B",
                         Batch = tryCatch(FetchMeta(object = object, variable = confounder, cells = cells.2), error = function(e) 1))
    bdata <- rbind(bdata1, bdata2)

    if(length(unique(bdata$Batch)) == 1) {
      fit0 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ 1, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
      fit1 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Group, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
    } else {
      fit0 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Batch + 1, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
      fit1 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Batch + Group, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
    }

    pvalue <- tryCatch(1-pchisq(2*(fit1@criterion$loglikelihood-fit0@criterion$loglikelihood),abs(fit0@df.residual-fit1@df.residual)), error=function(e) NA)
    beta.model <- tryCatch(VGAM::fitted(fit1), error=function(e) NULL)
    if(!is.null(beta.model)) {
      bdata$Fitted <- beta.model[, 1]
      setDT(bdata)
      EffectSize <- bdata[, mean(Fitted), by = Group][, abs(diff(V1))]
      res1 <- data.frame(EffectSize = EffectSize, p_val = pvalue)
      if(anyNA(res1)) res1 <- NULL
    } else {
      res1 <- NULL
    }
  } else {
    res1 <- NULL
  }

  if(sum(with(as.data.frame(sjgr), paste0(seqnames, ":", end)) == with(as.data.frame(as(sj, "GRanges")), paste0(seqnames, ":", end))) > 1) {
    clu <- as.character(sjgr[with(as.data.frame(sjgr), paste0(seqnames, ":", end)) == with(as.data.frame(as(sj, "GRanges")), paste0(seqnames, ":", end))])

    bdata1 <- data.frame(N = colSums(CountSJ(object = object, SJ = clu, cells = cells.1)),
                         SJ = as.numeric(CountSJ(object = object, SJ = sj, cells = cells.1)),
                         Group = "A",
                         Batch = tryCatch(FetchMeta(object = object, variable = confounder, cells = cells.1), error = function(e) 1))
    bdata2 <- data.frame(N = colSums(CountSJ(object = object, SJ = clu, cells = cells.2)),
                         SJ = as.numeric(CountSJ(object = object, SJ = sj, cells = cells.2)),
                         Group = "B",
                         Batch = tryCatch(FetchMeta(object = object, variable = confounder, cells = cells.2), error = function(e) 1))
    bdata <- rbind(bdata1, bdata2)

    if(length(unique(bdata$Batch)) == 1) {
      fit0 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ 1, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
      fit1 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Group, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
    } else {
      fit0 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Batch + 1, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
      fit1 <- tryCatch(VGAM::vglm(cbind(SJ, N - SJ) ~ Batch + Group, "betabinomial", data = bdata, trace = FALSE, maxit = maxit), error = function(e) NA)
    }

    pvalue <- tryCatch(1-pchisq(2*(fit1@criterion$loglikelihood-fit0@criterion$loglikelihood),abs(fit0@df.residual-fit1@df.residual)), error=function(e) NA)
    beta.model <- tryCatch(VGAM::fitted(fit1), error=function(e) NULL)
    if(!is.null(beta.model)) {
      bdata$Fitted <- beta.model[, 1]
      setDT(bdata)
      EffectSize <- bdata[, mean(Fitted), by = Group][, abs(diff(V1))]
      res2 <- data.frame(EffectSize = EffectSize, p_val = pvalue)
      if(anyNA(res2)) res2 <- NULL
    } else {
      res2 <- NULL
    }
  } else {
    res2 <- NULL
  }

  if(!is.null(res1) & !is.null(res2)) {
    if(res1$p_val == res2$p_val) {
      if(res1$EffectSize >= res2$EffectSize) {
        res <- res1
      } else {
        res <- res2
      }
    } else {
      if(res1$p_val < res2$p_val) {
        res <- res1
      } else {
        res <- res2
      }
    }
  } else {
    if(is.null(res1) & is.null(res2)) {
      res <- NA
    } else {
      if(is.null(res1)) {
        res <- res2
      } else {
        res <- res1
      }
    }
  }
  return(res)
}


# internal function to calculate AUC values
AUCMarkerTest <- function(data1, data2, mygenes, print.bar = TRUE) {
  myAUC <- unlist(x = lapply(
    X = mygenes,
    FUN = function(x) {
      tryCatch(DifferentialAUC(
        x = as.numeric(x = data1[x, ])[!is.na(as.numeric(x = data1[x, ]))],
        y = as.numeric(x = data2[x, ])[!is.na(as.numeric(x = data2[x, ]))]
      ), error = function(e) NA)
    }
  ))
  myAUC[is.na(x = myAUC)] <- 0
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  avg_diff <- unlist(x = iterate.fxn(
    X = mygenes,
    FUN = function(x) {
      return(
        mean(x = as.numeric(x = data1[x, ]), na.rm = TRUE) - mean(x = as.numeric(x = data2[x, ]), na.rm = TRUE)
      )
    }
  ))
  toRet <- data.frame(cbind(myAUC, avg_diff), row.names = mygenes)
  toRet <- toRet[rev(x = order(toRet$myAUC)), ]
  return(toRet)
}

# internal function to calculate AUC values
#' @importFrom ROCR prediction performance
DifferentialAUC <- function(x, y) {
  prediction.use <- prediction(
    predictions = c(x, y),
    labels = c(rep(x = 1, length(x = x)), rep(x = 0, length(x = y))),
    label.ordering = 0:1
  )
  perf.use <- performance(prediction.obj = prediction.use, measure = "auc")
  auc.use <- round(x = perf.use@y.values[[1]], digits = 3)
  return(auc.use)
}

# internal function for variable selection from random forests using OOB error
#' @importFrom varSelRF varSelRF
RandomOOB <- function(x, y) {
  tryCatch(as.numeric(varSelRF::varSelRF(xdata = matrix(c(x, y)),
                                         Class = factor(rep(c("x", "y"), c(length(x), length(y)))),
                                         ntree = 100, ntreeIterat = 100)$initialImportances), error = function(e) NA)

}


# given a UMI count matrix, estimate NB theta parameter for each gene
# and use fit of relationship with mean to assign regularized theta to each gene
#
#' @importFrom stats glm loess poisson
#' @importFrom utils txtProgressBar setTxtProgressBar
#
RegularizedTheta <- function(cm, latent.data, min.theta = 0.01, bin.size = 128) {
  genes.regress <- rownames(x = cm)
  bin.ind <- ceiling(x = 1:length(x = genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  message('Running Poisson regression (to get initial mean), and theta estimation per gene')
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
  theta.estimate <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    bin.theta.estimate <- unlist(
      x = parallel::mclapply(
        X = genes.bin.regress,
        FUN = function(j) {
          return(as.numeric(x = MASS::theta.ml(
            y = cm[j, ],
            mu = glm(
              formula = cm[j, ] ~ .,
              data = latent.data,
              family = poisson
            )$fitted
          )))
        }
      ),
      use.names = FALSE
    )
    theta.estimate <- c(theta.estimate, bin.theta.estimate)
    setTxtProgressBar(pb = pb, value = i)
  }
  close(con = pb)
  UMI.mean <- apply(X = cm, MARGIN = 1, FUN = mean)
  var.estimate <- UMI.mean + (UMI.mean ^ 2) / theta.estimate
  for (span in c(1/3, 1/2, 3/4, 1)) {
    fit <- loess(
      formula = log10(x = var.estimate) ~ log10(x = UMI.mean),
      span = span
    )
    if (! any(is.na(x = fit$fitted))) {
      message(sprintf(
        'Used loess with span %1.2f to fit mean-variance relationship\n',
        span
      ))
      break
    }
  }
  if (any(is.na(x = fit$fitted))) {
    stop('Problem when fitting NB gene variance in RegularizedTheta - NA values were fitted.')
  }
  theta.fit <- (UMI.mean ^ 2) / ((10 ^ fit$fitted) - UMI.mean)
  names(x = theta.fit) <- genes.regress
  to.fix <- theta.fit <= min.theta | is.infinite(x = theta.fit)
  if (any(to.fix)) {
    message(
      'Fitted theta below ',
      min.theta,
      ' for ',
      sum(to.fix),
      ' genes, setting them to ',
      min.theta
    )
    theta.fit[to.fix] <- min.theta
  }
  return(theta.fit)
}

# compare two negative binomial regression models
# model one uses only common factors (com.fac)
# model two additionally uses group factor (grp.fac)
#
#' @importFrom stats glm anova coef
#
NBModelComparison <- function(y, theta, latent.data, com.fac, grp.fac) {
  tab <- as.matrix(x = table(y > 0, latent.data[, grp.fac]))
  freqs <- tab['TRUE', ] / apply(X = tab, MARGIN = 2, FUN = sum)
  fit2 <- 0
  fit4 <- 0
  try(
    expr = fit2 <- glm(
      formula = y ~ .,
      data = latent.data[, com.fac, drop = FALSE],
      family = MASS::negative.binomial(theta = theta)
    ),
    silent=TRUE
  )
  try(
    fit4 <- glm(
      formula = y ~ .,
      data = latent.data[, c(com.fac, grp.fac)],
      family = MASS::negative.binomial(theta = theta)
    ),
    silent = TRUE
  )
  if (class(x = fit2)[1] == 'numeric' | class(x = fit4)[1] == 'numeric') {
    message('One of the glm.nb calls failed')
    return(c(rep(x = NA, 5), freqs))
  }
  pval <- anova(fit2, fit4, test = 'Chisq')$'Pr(>Chi)'[2]
  foi <- 2 + length(x = com.fac)
  log2.fc <- log2(x = 1 / exp(x = coef(object = fit4)[foi]))
  ret <- c(
    fit2$deviance,
    fit4$deviance,
    pval,
    coef(object = fit4)[foi],
    log2.fc,
    freqs
  )
  names(x = ret) <- c(
    'dev1',
    'dev2',
    'pval',
    'coef',
    'log2.fc',
    'freq1',
    'freq2'
  )
  return(ret)
}

#' Apply a ceiling and floor to all values in a matrix
#'
#' @param data Matrix or data frame
#' @param min all values below this min value will be replaced with min
#' @param max all values above this max value will be replaced with max
#' @return Returns matrix after performing these floor and ceil operations
#' @export
#'
#' @examples
#' mat <- matrix(data = rbinom(n = 25, size = 20, prob = 0.2 ), nrow = 5)
#' mat
#' MinMax(data = mat, min = 4, max = 5)
#'
MinMax <- function(data, min, max) {
  data2 <- data
  data2[data2 > max] <- max
  data2[data2 < min] <- min
  return(data2)
}
tangchao7498/ICAS documentation built on Jan. 28, 2021, 3:56 p.m.