# confidence interval coverage simulation for binomial, beta-binomial # with varying p, var(p), total UMI (sum(y)) = m, total observations n library(dplyr) library(ggplot2) library(reshape2) library(aod) library(RCurl)
# get realistic distributions for size per observation from slide-seq and smart-seq3 # slide-seq load('pucks.3-4.spatial.RData') slideseq <- bind_rows(puck3.spatial, puck4.spatial) slideseq <- slideseq %>% mutate(total = CAST+X129) plp1 <- slideseq %>% filter(gene=='Plp1') # example of highly expressed slideseq gene xist <- slideseq %>% filter(gene=='Xist') # example of lowly expressed slideseq gene # example of smartseq3 gene c57.ss2 <- as.matrix(read.csv(text=getURL('https://raw.githubusercontent.com/sandberg-lab/aRME_and_bursting/master/data/nature_paper/SS3_c57_UMIs_concat.csv'), row.names = 1)) cast.ss2 <- as.matrix(read.csv(text=getURL('https://raw.githubusercontent.com/sandberg-lab/aRME_and_bursting/master/data/nature_paper/SS3_cast_UMIs_concat.csv'), row.names = 1)) idx <- which(rownames(c57.ss2)=='Ywhae') ywhae <- data.frame(c57 = c57.ss2[idx,], cast = cast.ss2[idx,]) ywhae$total <- ywhae$c57 + ywhae$cast # example of smartseq3 gene
hist(plp1$total, bins=50)
test.ci.bb <- function(p.mean, bb.summary.object) { logit.p <- bb.summary.object@Coef$Estimate se <- bb.summary.object@Coef$`Std. Error` low <- exp(logit.p-2*se)/(1+exp(logit.p-2*se)) high <- exp(logit.p+2*se)/(1+exp(logit.p+2*se)) return(between(p.mean, low, high)) } test.ci.glm <- function(p.mean, glm.summary.object) { logit.p <- glm.summary.object$coefficients[1] se <- glm.summary.object$coefficients[2] low <- exp(logit.p-2*se)/(1+exp(logit.p-2*se)) high <- exp(logit.p+2*se)/(1+exp(logit.p+2*se)) return(between(p.mean, low, high)) } test.ci.ct <- function(p.mean, bb.summary.object, n, N) { phi <- bb.summary.object@Phi$Estimate low <- qbeta(0.025, shape1=n/(1+phi), shape2=((N-n)/(1+phi))+1) high <- qbeta(0.975, shape1=(n/(1+phi))+1, shape2=(N-n)/(1+phi)) return(between(p.mean, low, high)) } sim.data <- function(n, p) { n_slideseq_low <- sample(xist$total, size=n, replace=T) n_slideseq_high <- sample(plp1$total, size=n, replace=T) n_smartseq <- sample(ywhae$total, size=n, replace=T) y_slideseq_low <- rbinom(n, size=n_slideseq_low, prob=p) y_slideseq_high <- rbinom(n, size=n_slideseq_high, prob=p) y_smartseq <- rbinom(n, size=n_smartseq, prob=p) fitdf <- data.frame(y_smart = y_smartseq, y_ss_low = y_slideseq_low, y_ss_high = y_slideseq_high, n_smart=n_smartseq, n_ss_low = n_slideseq_low, n_ss_high=n_slideseq_high) return(fitdf) } get.model.summary <- function(df, p.mean, type='glm') { smartseq.warn <- slideseq.low.warn <- slideseq.high.warn <- FALSE # keep track # of which ones get warnings/errors so don't include those tryCatch( {if (type == 'glm') { fit.smartseq <- glm(cbind(y_smart, n_smart-y_smart)~1, family=binomial, data=df) } else if (type == 'quasi') { fit.smartseq <- glm(cbind(y_smart, n_smart-y_smart)~1, family=quasibinomial, data=df) } else if (type == 'bb' | type == 'ct') { fit.smartseq <- betabin(cbind(y_smart, n_smart-y_smart)~1, ~1, data=df) } else { stop('CI type not recognized') } summary.smartseq <- summary(fit.smartseq)}, warning = function(w) {smartseq.warn <<- TRUE}, error = function(e) {smartseq.warn <<- TRUE} ) tryCatch( {if (type == 'glm') { fit.slideseq.low <- glm(cbind(y_ss_low, n_ss_low-y_ss_low)~1, family=binomial, data=df) } else if (type == 'quasi') { fit.slideseq.low <- glm(cbind(y_ss_low, n_ss_low-y_ss_low)~1, family=quasibinomial, data=df) } else if (type == 'bb' | type == 'ct') { fit.slideseq.low<- betabin(cbind(y_ss_low, n_ss_low-y_ss_low)~1, ~1, data=df) } else { stop('CI type not recognized') } summary.slideseq.low <- summary(fit.slideseq.low)}, warning = function(w) {slideseq.low.warn <<- TRUE}, error = function(e) {slideseq.low.warn <<- TRUE} ) tryCatch( {if (type == 'glm') { fit.slideseq.high <- glm(cbind(y_ss_high, n_ss_high-y_ss_high)~1, family=binomial, data=df) } else if (type == 'quasi') { fit.slideseq.high <- glm(cbind(y_ss_high, n_ss_high-y_ss_high)~1, family=quasibinomial, data=df) } else if (type == 'bb' | type == 'ct') { fit.slideseq.high <- betabin(cbind(y_ss_high, n_ss_high-y_ss_high)~1, ~1, data=df) } else { stop('CI type not recognized') } summary.slideseq.high <- summary(fit.slideseq.high)}, warning = function(w) {slideseq.high.warn <<- TRUE}, error = function(e) {slideseq.high.warn <<- TRUE} ) if (!smartseq.warn) { if (type == 'glm' | type == 'quasi') { smartseq.result <- ifelse(test.ci.glm(p.mean,summary.smartseq), 1, 0) } else if (type == 'bb') { smartseq.result <- ifelse(test.ci.bb(p.mean,summary.smartseq), 1, 0) } else if (type == 'ct') { smartseq.result <- ifelse(test.ci.ct(p.mean,summary.smartseq,n=sum(df$y_smart),N=sum(df$n_smart)), 1, 0) } } else { smartseq.result <- NA } if (!slideseq.low.warn) { if (type == 'glm' | type == 'quasi') { slideseq.low.result <- ifelse(test.ci.glm(p.mean,summary.slideseq.low), 1, 0) } else if (type == 'bb') { slideseq.low.result <- ifelse(test.ci.bb(p.mean,summary.slideseq.low), 1, 0) } else if (type == 'ct') { slideseq.low.result <- ifelse(test.ci.ct(p.mean,summary.slideseq.low,n=sum(df$y_ss_low),N=sum(df$n_ss_low)), 1, 0) } } else { slideseq.low.result <- NA } if (!slideseq.high.warn) { if (type == 'glm' | type == 'quasi') { slideseq.high.result <- ifelse(test.ci.glm(p.mean,summary.slideseq.high), 1, 0) } else if (type == 'bb') { slideseq.high.result <- ifelse(test.ci.bb(p.mean,summary.slideseq.high), 1, 0) } else if (type == 'ct') { slideseq.high.result <- ifelse(test.ci.ct(p.mean,summary.slideseq.high,n=sum(df$y_ss_high),N=sum(df$n_ss_high)), 1, 0) } } else { slideseq.high.result <- NA } return(list(smartseq.result=smartseq.result, slideseq.low.result=slideseq.low.result, slideseq.high.result=slideseq.high.result)) } plot.ci.coverage <- function(df, title='') { print ( df %>% group_by(n) %>% summarise(bb = sum(bb)/sum(bb>=0), ct = sum(ct)/sum(ct>=0), glm = sum(glm)/sum(glm>=0)) %>% melt(id.vars = 'n') %>% ggplot(aes(x = n, y = value)) + geom_line(aes(color = variable)) + theme_bw() + ggtitle(title) ) } ci.sim <- function(p.mean, p.phi, iter=1000) { # n is total beads/cells, m is total number of umis nobs <- c(10, 17, 25, 32, 50, 75, 100) results.slideseq.low <- data.frame(bb = numeric(iter*length(nobs)), ct = numeric(iter*length(nobs)), glm = numeric(iter*length(nobs)), quasi = numeric(iter*length(nobs)), n = numeric(iter*length(nobs))) results.slideseq.high <- data.frame(bb = numeric(iter*length(nobs)), ct = numeric(iter*length(nobs)), glm = numeric(iter*length(nobs)), quasi = numeric(iter*length(nobs)), n = numeric(iter*length(nobs))) results.smartseq <- data.frame(bb = numeric(iter*length(nobs)), ct = numeric(iter*length(nobs)), glm = numeric(iter*length(nobs)), quasi = numeric(iter*length(nobs)), n = numeric(iter*length(nobs))) for (i in 1:iter) { for (j in 1:length(nobs)) { if (p.phi == 0) { p <- rep(p.mean, nobs[j]) } else { a1 <- p.mean*(1-p.phi)/p.phi a2 <- (1-p.mean)*(1-p.phi)/p.phi p <- rbeta(nobs[j], shape1=a1, shape2=a2) } idx <- (j-1)*iter+i ## fitdf <- sim.data(nobs[j], p) res <- get.model.summary(fitdf, p.mean, type = 'bb') results.smartseq$bb[idx] <- res$smartseq.result results.slideseq.low$bb[idx] <- res$slideseq.low.result results.slideseq.high$bb[idx] <- res$slideseq.high.result if (p.phi > 0) { p <- rbeta(nobs[j], shape1=a1, shape2=a2) } fitdf <- sim.data(nobs[j], p) res <- get.model.summary(fitdf, p.mean, type = 'glm') results.smartseq$glm[idx] <- res$smartseq.result results.slideseq.low$glm[idx] <- res$slideseq.low.result results.slideseq.high$glm[idx] <- res$slideseq.high.result ### if (p.phi > 0) { p <- rbeta(nobs[j], shape1=a1, shape2=a2) } fitdf <- sim.data(nobs[j], p) res <- get.model.summary(fitdf, p.mean, type = 'quasi') results.smartseq$quasi[idx] <- res$smartseq.result results.slideseq.low$quasi[idx] <- res$slideseq.low.result results.slideseq.high$quasi[idx] <- res$slideseq.high.result ### if (p.phi > 0) { p <- rbeta(nobs[j], shape1=a1, shape2=a2) } fitdf <- sim.data(nobs[j], p) res <- get.model.summary(fitdf, p.mean, type = 'ct') results.smartseq$ct[idx] <- res$smartseq.result results.slideseq.low$ct[idx] <- res$slideseq.low.result results.slideseq.high$ct[idx] <- res$slideseq.high.result ## results.slideseq.low$n[idx] <- nobs[j] results.slideseq.high$n[idx] <- nobs[j] results.smartseq$n[idx] <- nobs[j] } if (i%%100==0) { print(i) } } return(list(results.smartseq=results.smartseq, results.slideseq.low=results.slideseq.low, results.slideseq.high=results.slideseq.high)) }
res <- ci.sim(p.mean = as.numeric(params$pmean), p.phi=as.numeric(params$pphi), iter=as.numeric(params$iter)) saveRDS(res, file = paste0('ci_lowrange2_mean',params$pmean,'_phi',params$pphi,'_iter',params$iter,'.rds'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.