In lulizou/spASE: Detecting Allele-Specific Expression in Spatial Transcriptomics

# confidence interval coverage simulation for binomial, beta-binomial
# with varying p, var(p), total UMI (sum(y)) = m, total observations n
library(dplyr)
library(ggplot2)
library(reshape2)
library(aod)
library(RCurl)

# get realistic distributions for size per observation from slide-seq and smart-seq3
# slide-seq
load('pucks.3-4.spatial.RData')
slideseq <- bind_rows(puck3.spatial, puck4.spatial)
slideseq <- slideseq %>% 
  mutate(total = CAST+X129)
plp1 <- slideseq %>% filter(gene=='Plp1') # example of highly expressed slideseq gene
xist <- slideseq %>% filter(gene=='Xist') # example of lowly expressed slideseq gene
# example of smartseq3 gene
c57.ss2 <- as.matrix(read.csv(text=getURL('https://raw.githubusercontent.com/sandberg-lab/aRME_and_bursting/master/data/nature_paper/SS3_c57_UMIs_concat.csv'), row.names = 1))
cast.ss2 <- as.matrix(read.csv(text=getURL('https://raw.githubusercontent.com/sandberg-lab/aRME_and_bursting/master/data/nature_paper/SS3_cast_UMIs_concat.csv'), row.names = 1))
idx <- which(rownames(c57.ss2)=='Ywhae')
ywhae <- data.frame(c57 = c57.ss2[idx,], cast = cast.ss2[idx,])
ywhae$total <- ywhae$c57 + ywhae$cast  # example of smartseq3 gene

hist(plp1$total, bins=50)

test.ci.bb <- function(p.mean, bb.summary.object) {
  logit.p <- bb.summary.object@Coef$Estimate
  se <- bb.summary.object@Coef$`Std. Error`
  low <- exp(logit.p-2*se)/(1+exp(logit.p-2*se))
  high <- exp(logit.p+2*se)/(1+exp(logit.p+2*se))
  return(between(p.mean, low, high))
}

test.ci.glm <- function(p.mean, glm.summary.object) {
  logit.p <- glm.summary.object$coefficients[1]
  se <- glm.summary.object$coefficients[2]
  low <- exp(logit.p-2*se)/(1+exp(logit.p-2*se))
  high <- exp(logit.p+2*se)/(1+exp(logit.p+2*se))
  return(between(p.mean, low, high))
}

test.ci.ct <- function(p.mean, bb.summary.object, n, N) {
  phi <- bb.summary.object@Phi$Estimate
  low <- qbeta(0.025, shape1=n/(1+phi), shape2=((N-n)/(1+phi))+1)
  high <- qbeta(0.975, shape1=(n/(1+phi))+1, shape2=(N-n)/(1+phi))
  return(between(p.mean, low, high))
}

sim.data <- function(n, p) {
  n_slideseq_low <- sample(xist$total, size=n, replace=T)
  n_slideseq_high <- sample(plp1$total, size=n, replace=T)
  n_smartseq <- sample(ywhae$total, size=n, replace=T)
  y_slideseq_low <- rbinom(n, size=n_slideseq_low, prob=p)
  y_slideseq_high <- rbinom(n, size=n_slideseq_high, prob=p)
  y_smartseq <- rbinom(n, size=n_smartseq, prob=p)
  fitdf <- data.frame(y_smart = y_smartseq, y_ss_low = y_slideseq_low,
                      y_ss_high = y_slideseq_high, n_smart=n_smartseq,
                      n_ss_low = n_slideseq_low, n_ss_high=n_slideseq_high)
  return(fitdf)
}

get.model.summary <- function(df, p.mean, type='glm') {
  smartseq.warn <- slideseq.low.warn <- slideseq.high.warn <- FALSE # keep track
  # of which ones get warnings/errors so don't include those
  tryCatch(
    {if (type == 'glm') {
      fit.smartseq  <- glm(cbind(y_smart, n_smart-y_smart)~1, family=binomial, data=df)
    } else if (type == 'quasi') {
      fit.smartseq  <- glm(cbind(y_smart, n_smart-y_smart)~1, family=quasibinomial, data=df)
    } else if (type == 'bb' | type == 'ct') {
      fit.smartseq <- betabin(cbind(y_smart, n_smart-y_smart)~1, ~1, data=df)
    } else {
      stop('CI type not recognized')
    }
    summary.smartseq <- summary(fit.smartseq)},
    warning = function(w) {smartseq.warn <<- TRUE},
    error = function(e) {smartseq.warn <<- TRUE}
  )
  tryCatch(
    {if (type == 'glm') {
      fit.slideseq.low <- glm(cbind(y_ss_low, n_ss_low-y_ss_low)~1, family=binomial, data=df)
    } else if (type == 'quasi') {
      fit.slideseq.low  <- glm(cbind(y_ss_low, n_ss_low-y_ss_low)~1, family=quasibinomial, data=df)
    } else if (type == 'bb' | type == 'ct') {
      fit.slideseq.low<- betabin(cbind(y_ss_low, n_ss_low-y_ss_low)~1, ~1, data=df)
    } else {
      stop('CI type not recognized')
    }
    summary.slideseq.low <- summary(fit.slideseq.low)},
    warning = function(w) {slideseq.low.warn <<- TRUE},
    error = function(e) {slideseq.low.warn <<- TRUE}
  )
  tryCatch(
    {if (type == 'glm') {
      fit.slideseq.high <- glm(cbind(y_ss_high, n_ss_high-y_ss_high)~1, family=binomial, data=df)
    } else if (type == 'quasi') {
      fit.slideseq.high  <- glm(cbind(y_ss_high, n_ss_high-y_ss_high)~1, family=quasibinomial, data=df)
    } else if (type == 'bb' | type == 'ct') {
      fit.slideseq.high <- betabin(cbind(y_ss_high, n_ss_high-y_ss_high)~1, ~1, data=df)
    } else {
      stop('CI type not recognized')
    }
    summary.slideseq.high <- summary(fit.slideseq.high)},
    warning = function(w) {slideseq.high.warn <<- TRUE},
    error = function(e) {slideseq.high.warn <<- TRUE}
  )
  if (!smartseq.warn) {
    if (type == 'glm' | type == 'quasi') {
      smartseq.result <- ifelse(test.ci.glm(p.mean,summary.smartseq), 1, 0)
    } else if (type == 'bb') {
      smartseq.result <- ifelse(test.ci.bb(p.mean,summary.smartseq), 1, 0)
    } else if (type == 'ct')  {
      smartseq.result <- ifelse(test.ci.ct(p.mean,summary.smartseq,n=sum(df$y_smart),N=sum(df$n_smart)), 1, 0)
    }
  } else {
    smartseq.result <- NA
  }
  if (!slideseq.low.warn) {
    if (type == 'glm' | type == 'quasi') {
      slideseq.low.result <- ifelse(test.ci.glm(p.mean,summary.slideseq.low), 1, 0)
    } else if (type == 'bb') {
      slideseq.low.result <- ifelse(test.ci.bb(p.mean,summary.slideseq.low), 1, 0)
    } else if (type == 'ct')  {
      slideseq.low.result <- ifelse(test.ci.ct(p.mean,summary.slideseq.low,n=sum(df$y_ss_low),N=sum(df$n_ss_low)), 1, 0)
    }
  } else {
    slideseq.low.result <- NA
  }
  if (!slideseq.high.warn) {
    if (type == 'glm' | type == 'quasi') {
      slideseq.high.result <- ifelse(test.ci.glm(p.mean,summary.slideseq.high), 1, 0)
    } else if (type == 'bb') {
      slideseq.high.result <- ifelse(test.ci.bb(p.mean,summary.slideseq.high), 1, 0)
    } else if (type == 'ct')  {
      slideseq.high.result <- ifelse(test.ci.ct(p.mean,summary.slideseq.high,n=sum(df$y_ss_high),N=sum(df$n_ss_high)), 1, 0)
    }
  } else {
    slideseq.high.result <- NA
  }
  return(list(smartseq.result=smartseq.result, 
              slideseq.low.result=slideseq.low.result, 
              slideseq.high.result=slideseq.high.result))
}

plot.ci.coverage <- function(df, title='') {
  print (
    df %>%
      group_by(n) %>%
      summarise(bb = sum(bb)/sum(bb>=0), ct = sum(ct)/sum(ct>=0), glm = sum(glm)/sum(glm>=0)) %>%
      melt(id.vars = 'n') %>%
      ggplot(aes(x = n, y = value)) +
      geom_line(aes(color = variable)) +
      theme_bw() +
      ggtitle(title)
  )
}

ci.sim <- function(p.mean, p.phi, iter=1000) {
  # n is total beads/cells, m is total number of umis
  nobs <- c(10, 17, 25, 32, 50, 75, 100)
  results.slideseq.low <- data.frame(bb = numeric(iter*length(nobs)),
                                     ct = numeric(iter*length(nobs)),
                                     glm = numeric(iter*length(nobs)),
                                     quasi = numeric(iter*length(nobs)),
                                     n = numeric(iter*length(nobs)))
  results.slideseq.high <- data.frame(bb = numeric(iter*length(nobs)),
                                      ct = numeric(iter*length(nobs)),
                                      glm = numeric(iter*length(nobs)),
                                      quasi = numeric(iter*length(nobs)),
                                      n = numeric(iter*length(nobs)))
  results.smartseq <- data.frame(bb = numeric(iter*length(nobs)),
                                 ct = numeric(iter*length(nobs)),
                                 glm = numeric(iter*length(nobs)),
                                 quasi = numeric(iter*length(nobs)),
                                 n = numeric(iter*length(nobs)))
  for (i in 1:iter) {
    for (j in 1:length(nobs)) {
      if (p.phi == 0) {
        p <- rep(p.mean, nobs[j])
      } else {
        a1 <- p.mean*(1-p.phi)/p.phi
        a2 <- (1-p.mean)*(1-p.phi)/p.phi
        p <- rbeta(nobs[j], shape1=a1, shape2=a2)
      }
      idx <- (j-1)*iter+i
      ##
      fitdf <- sim.data(nobs[j], p)
      res <- get.model.summary(fitdf, p.mean, type = 'bb')
      results.smartseq$bb[idx] <- res$smartseq.result
      results.slideseq.low$bb[idx] <- res$slideseq.low.result
      results.slideseq.high$bb[idx] <- res$slideseq.high.result
      if (p.phi > 0) {
        p <- rbeta(nobs[j], shape1=a1, shape2=a2)
      }
      fitdf <- sim.data(nobs[j], p)
      res <- get.model.summary(fitdf, p.mean, type = 'glm')
      results.smartseq$glm[idx] <- res$smartseq.result
      results.slideseq.low$glm[idx] <- res$slideseq.low.result
      results.slideseq.high$glm[idx] <- res$slideseq.high.result
      ###
      if (p.phi > 0) {
        p <- rbeta(nobs[j], shape1=a1, shape2=a2)
      }
      fitdf <- sim.data(nobs[j], p)
      res <- get.model.summary(fitdf, p.mean, type = 'quasi')
      results.smartseq$quasi[idx] <- res$smartseq.result
      results.slideseq.low$quasi[idx] <- res$slideseq.low.result
      results.slideseq.high$quasi[idx] <- res$slideseq.high.result
      ###
      if (p.phi > 0) {
        p <- rbeta(nobs[j], shape1=a1, shape2=a2)
      }
      fitdf <- sim.data(nobs[j], p)
      res <- get.model.summary(fitdf, p.mean, type = 'ct')
      results.smartseq$ct[idx] <- res$smartseq.result
      results.slideseq.low$ct[idx] <- res$slideseq.low.result
      results.slideseq.high$ct[idx] <- res$slideseq.high.result
      ##
      results.slideseq.low$n[idx] <- nobs[j]
      results.slideseq.high$n[idx] <- nobs[j]
      results.smartseq$n[idx] <- nobs[j]
    }
    if (i%%100==0) {
      print(i)
    }
  }
  return(list(results.smartseq=results.smartseq, results.slideseq.low=results.slideseq.low,
              results.slideseq.high=results.slideseq.high))
}

res <- ci.sim(p.mean = as.numeric(params$pmean), p.phi=as.numeric(params$pphi), iter=as.numeric(params$iter))
saveRDS(res, file = paste0('ci_lowrange2_mean',params$pmean,'_phi',params$pphi,'_iter',params$iter,'.rds'))